Initial Commit
This commit is contained in:
192
unshackle/core/utils/webvtt.py
Normal file
192
unshackle/core/utils/webvtt.py
Normal file
@@ -0,0 +1,192 @@
|
||||
import re
|
||||
import sys
|
||||
import typing
|
||||
from typing import Optional
|
||||
|
||||
from pycaption import Caption, CaptionList, CaptionNode, CaptionReadError, WebVTTReader, WebVTTWriter
|
||||
|
||||
|
||||
class CaptionListExt(CaptionList):
|
||||
@typing.no_type_check
|
||||
def __init__(self, iterable=None, layout_info=None):
|
||||
self.first_segment_mpegts = 0
|
||||
super().__init__(iterable, layout_info)
|
||||
|
||||
|
||||
class CaptionExt(Caption):
|
||||
@typing.no_type_check
|
||||
def __init__(self, start, end, nodes, style=None, layout_info=None, segment_index=0, mpegts=0, cue_time=0.0):
|
||||
style = style or {}
|
||||
self.segment_index: int = segment_index
|
||||
self.mpegts: float = mpegts
|
||||
self.cue_time: float = cue_time
|
||||
super().__init__(start, end, nodes, style, layout_info)
|
||||
|
||||
|
||||
class WebVTTReaderExt(WebVTTReader):
|
||||
# HLS extension support <https://datatracker.ietf.org/doc/html/rfc8216#section-3.5>
|
||||
RE_TIMESTAMP_MAP = re.compile(r"X-TIMESTAMP-MAP.*")
|
||||
RE_MPEGTS = re.compile(r"MPEGTS:(\d+)")
|
||||
RE_LOCAL = re.compile(r"LOCAL:((?:(\d{1,}):)?(\d{2}):(\d{2})\.(\d{3}))")
|
||||
|
||||
def _parse(self, lines: list[str]) -> CaptionList:
|
||||
captions = CaptionListExt()
|
||||
start = None
|
||||
end = None
|
||||
nodes: list[CaptionNode] = []
|
||||
layout_info = None
|
||||
found_timing = False
|
||||
segment_index = -1
|
||||
mpegts = 0
|
||||
cue_time = 0.0
|
||||
|
||||
# The first segment MPEGTS is needed to calculate the rest. It is possible that
|
||||
# the first segment contains no cue and is ignored by pycaption, this acts as a fallback.
|
||||
captions.first_segment_mpegts = 0
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
if "-->" in line:
|
||||
found_timing = True
|
||||
timing_line = i
|
||||
last_start_time = captions[-1].start if captions else 0
|
||||
try:
|
||||
start, end, layout_info = self._parse_timing_line(line, last_start_time)
|
||||
except CaptionReadError as e:
|
||||
new_msg = f"{e.args[0]} (line {timing_line})"
|
||||
tb = sys.exc_info()[2]
|
||||
raise type(e)(new_msg).with_traceback(tb) from None
|
||||
|
||||
elif "" == line:
|
||||
if found_timing and nodes:
|
||||
found_timing = False
|
||||
caption = CaptionExt(
|
||||
start,
|
||||
end,
|
||||
nodes,
|
||||
layout_info=layout_info,
|
||||
segment_index=segment_index,
|
||||
mpegts=mpegts,
|
||||
cue_time=cue_time,
|
||||
)
|
||||
captions.append(caption)
|
||||
nodes = []
|
||||
|
||||
elif "WEBVTT" in line:
|
||||
# Merged segmented VTT doesn't have index information, track manually.
|
||||
segment_index += 1
|
||||
mpegts = 0
|
||||
cue_time = 0.0
|
||||
elif m := self.RE_TIMESTAMP_MAP.match(line):
|
||||
if r := self.RE_MPEGTS.search(m.group()):
|
||||
mpegts = int(r.group(1))
|
||||
|
||||
cue_time = self._parse_local(m.group())
|
||||
|
||||
# Early assignment in case the first segment contains no cue.
|
||||
if segment_index == 0:
|
||||
captions.first_segment_mpegts = mpegts
|
||||
|
||||
else:
|
||||
if found_timing:
|
||||
if nodes:
|
||||
nodes.append(CaptionNode.create_break())
|
||||
nodes.append(CaptionNode.create_text(self._decode(line)))
|
||||
else:
|
||||
# it's a comment or some metadata; ignore it
|
||||
pass
|
||||
|
||||
# Add a last caption if there are remaining nodes
|
||||
if nodes:
|
||||
caption = CaptionExt(start, end, nodes, layout_info=layout_info, segment_index=segment_index, mpegts=mpegts)
|
||||
captions.append(caption)
|
||||
|
||||
return captions
|
||||
|
||||
@staticmethod
|
||||
def _parse_local(string: str) -> float:
|
||||
"""
|
||||
Parse WebVTT LOCAL time and convert it to seconds.
|
||||
"""
|
||||
m = WebVTTReaderExt.RE_LOCAL.search(string)
|
||||
if not m:
|
||||
return 0
|
||||
|
||||
parsed = m.groups()
|
||||
if not parsed:
|
||||
return 0
|
||||
hours = int(parsed[1])
|
||||
minutes = int(parsed[2])
|
||||
seconds = int(parsed[3])
|
||||
milliseconds = int(parsed[4])
|
||||
return (milliseconds / 1000) + seconds + (minutes * 60) + (hours * 3600)
|
||||
|
||||
|
||||
def merge_segmented_webvtt(vtt_raw: str, segment_durations: Optional[list[int]] = None, timescale: int = 1) -> str:
|
||||
"""
|
||||
Merge Segmented WebVTT data.
|
||||
|
||||
Parameters:
|
||||
vtt_raw: The concatenated WebVTT files to merge. All WebVTT headers must be
|
||||
appropriately spaced apart, or it may produce unwanted effects like
|
||||
considering headers as captions, timestamp lines, etc.
|
||||
segment_durations: A list of each segment's duration. If not provided it will try
|
||||
to get it from the X-TIMESTAMP-MAP headers, specifically the MPEGTS number.
|
||||
timescale: The number of time units per second.
|
||||
|
||||
This parses the X-TIMESTAMP-MAP data to compute new absolute timestamps, replacing
|
||||
the old start and end timestamp values. All X-TIMESTAMP-MAP header information will
|
||||
be removed from the output as they are no longer of concern. Consider this function
|
||||
the opposite of a WebVTT Segmenter, a WebVTT Joiner of sorts.
|
||||
|
||||
Algorithm borrowed from N_m3u8DL-RE and shaka-player.
|
||||
"""
|
||||
MPEG_TIMESCALE = 90_000
|
||||
|
||||
vtt = WebVTTReaderExt().read(vtt_raw)
|
||||
for lang in vtt.get_languages():
|
||||
prev_caption = None
|
||||
duplicate_index: list[int] = []
|
||||
captions = vtt.get_captions(lang)
|
||||
|
||||
if captions[0].segment_index == 0:
|
||||
first_segment_mpegts = captions[0].mpegts
|
||||
else:
|
||||
first_segment_mpegts = segment_durations[0] if segment_durations else captions.first_segment_mpegts
|
||||
|
||||
caption: CaptionExt
|
||||
for i, caption in enumerate(captions):
|
||||
# DASH WebVTT doesn't have MPEGTS timestamp like HLS. Instead,
|
||||
# calculate the timestamp from SegmentTemplate/SegmentList duration.
|
||||
likely_dash = first_segment_mpegts == 0 and caption.mpegts == 0
|
||||
if likely_dash and segment_durations:
|
||||
duration = segment_durations[caption.segment_index]
|
||||
caption.mpegts = MPEG_TIMESCALE * (duration / timescale)
|
||||
|
||||
if caption.mpegts == 0:
|
||||
continue
|
||||
|
||||
# Commeted to fix DSNP subs being out of sync and mistimed.
|
||||
# seconds = (caption.mpegts - first_segment_mpegts) / MPEG_TIMESCALE - caption.cue_time
|
||||
# offset = seconds * 1_000_000 # pycaption use microseconds
|
||||
|
||||
# if caption.start < offset:
|
||||
# caption.start += offset
|
||||
# caption.end += offset
|
||||
|
||||
# If the difference between current and previous captions is <=1ms
|
||||
# and the payload is equal then splice.
|
||||
if (
|
||||
prev_caption
|
||||
and not caption.is_empty()
|
||||
and (caption.start - prev_caption.end) <= 1000 # 1ms in microseconds
|
||||
and caption.get_text() == prev_caption.get_text()
|
||||
):
|
||||
prev_caption.end = caption.end
|
||||
duplicate_index.append(i)
|
||||
|
||||
prev_caption = caption
|
||||
|
||||
# Remove duplicate
|
||||
captions[:] = [c for c_index, c in enumerate(captions) if c_index not in set(duplicate_index)]
|
||||
|
||||
return WebVTTWriter().write(vtt)
|
||||
Reference in New Issue
Block a user