"""Helpers for parsing and merging segmented WebVTT subtitle data."""
import re
|
|
import sys
|
|
import typing
|
|
from typing import Optional
|
|
|
|
import pysubs2
|
|
from pycaption import Caption, CaptionList, CaptionNode, CaptionReadError, WebVTTReader, WebVTTWriter
|
|
|
|
from unshackle.core.config import config
|
|
|
|
|
|
class CaptionListExt(CaptionList):
    """A ``CaptionList`` that remembers the first segment's MPEGTS value.

    ``first_segment_mpegts`` acts as a fallback when the first WebVTT segment
    contains no cues and is therefore dropped by pycaption during parsing.
    """

    @typing.no_type_check
    def __init__(self, iterable=None, layout_info=None):
        # MPEGTS of the very first segment; stays 0 until a parser fills it in.
        self.first_segment_mpegts = 0
        super().__init__(iterable=iterable, layout_info=layout_info)
|
|
|
|
|
|
class CaptionExt(Caption):
    """A ``Caption`` carrying extra segmented-WebVTT metadata.

    Stores which merged segment the cue came from (``segment_index``), the
    segment's X-TIMESTAMP-MAP MPEGTS value (``mpegts``), and the parsed LOCAL
    cue time in seconds (``cue_time``).
    """

    @typing.no_type_check
    def __init__(self, start, end, nodes, style=None, layout_info=None, segment_index=0, mpegts=0, cue_time=0.0):
        self.segment_index: int = segment_index
        self.mpegts: float = mpegts
        self.cue_time: float = cue_time
        # Normalize a falsy style to an empty dict, as the base class expects.
        super().__init__(start, end, nodes, style if style else {}, layout_info)
|
|
|
|
|
|
class WebVTTReaderExt(WebVTTReader):
    """WebVTT reader with HLS segmented-subtitle extension support.

    Parses the per-segment X-TIMESTAMP-MAP header
    (<https://datatracker.ietf.org/doc/html/rfc8216#section-3.5>) and attaches
    the MPEGTS/LOCAL values plus a manually-tracked segment index to each cue
    via ``CaptionExt`` / ``CaptionListExt``.
    """

    # HLS extension support <https://datatracker.ietf.org/doc/html/rfc8216#section-3.5>
    RE_TIMESTAMP_MAP = re.compile(r"X-TIMESTAMP-MAP.*")
    RE_MPEGTS = re.compile(r"MPEGTS:(\d+)")
    RE_LOCAL = re.compile(r"LOCAL:((?:(\d{1,}):)?(\d{2}):(\d{2})\.(\d{3}))")

    def _parse(self, lines: list[str]) -> CaptionList:
        """Parse concatenated WebVTT lines into a ``CaptionListExt``.

        Each "WEBVTT" header starts a new segment; the current segment index,
        MPEGTS and LOCAL cue time are stamped onto every ``CaptionExt`` built
        within that segment.

        Raises:
            CaptionReadError: re-raised from timing-line parsing, with the
                offending line number appended to the message.
        """
        captions = CaptionListExt()
        start = None
        end = None
        nodes: list[CaptionNode] = []
        layout_info = None
        found_timing = False
        segment_index = -1  # incremented to 0 by the first "WEBVTT" header
        mpegts = 0
        cue_time = 0.0

        # The first segment MPEGTS is needed to calculate the rest. It is possible that
        # the first segment contains no cue and is ignored by pycaption, this acts as a fallback.
        captions.first_segment_mpegts = 0

        for i, line in enumerate(lines):
            if "-->" in line:
                found_timing = True
                timing_line = i
                last_start_time = captions[-1].start if captions else 0
                try:
                    start, end, layout_info = self._parse_timing_line(line, last_start_time)
                except CaptionReadError as e:
                    # Re-raise the same exception type with the line number
                    # appended, preserving the original traceback.
                    new_msg = f"{e.args[0]} (line {timing_line})"
                    tb = sys.exc_info()[2]
                    raise type(e)(new_msg).with_traceback(tb) from None

            elif "" == line:
                # Blank line terminates a cue: flush accumulated nodes.
                if found_timing and nodes:
                    found_timing = False
                    caption = CaptionExt(
                        start,
                        end,
                        nodes,
                        layout_info=layout_info,
                        segment_index=segment_index,
                        mpegts=mpegts,
                        cue_time=cue_time,
                    )
                    captions.append(caption)
                    nodes = []

            elif "WEBVTT" in line:
                # Merged segmented VTT doesn't have index information, track manually.
                segment_index += 1
                mpegts = 0
                cue_time = 0.0
            elif m := self.RE_TIMESTAMP_MAP.match(line):
                if r := self.RE_MPEGTS.search(m.group()):
                    mpegts = int(r.group(1))

                cue_time = self._parse_local(m.group())

                # Early assignment in case the first segment contains no cue.
                if segment_index == 0:
                    captions.first_segment_mpegts = mpegts

            else:
                if found_timing:
                    if nodes:
                        nodes.append(CaptionNode.create_break())
                    nodes.append(CaptionNode.create_text(self._decode(line)))
                else:
                    # it's a comment or some metadata; ignore it
                    pass

        # Add a last caption if there are remaining nodes
        if nodes:
            caption = CaptionExt(start, end, nodes, layout_info=layout_info, segment_index=segment_index, mpegts=mpegts)
            captions.append(caption)

        return captions

    @staticmethod
    def _parse_local(string: str) -> float:
        """
        Parse WebVTT LOCAL time and convert it to seconds.

        Returns 0 when no LOCAL timestamp is present in ``string``.
        """
        m = WebVTTReaderExt.RE_LOCAL.search(string)
        if not m:
            return 0

        parsed = m.groups()
        if not parsed:
            return 0
        # The hours component of the LOCAL timestamp is optional
        # ("(?:(\d{1,}):)?" in RE_LOCAL), so the group may be None for
        # "LOCAL:MM:SS.mmm" inputs; treat it as 0 instead of crashing
        # on int(None).
        hours = int(parsed[1] or 0)
        minutes = int(parsed[2])
        seconds = int(parsed[3])
        milliseconds = int(parsed[4])
        return (milliseconds / 1000) + seconds + (minutes * 60) + (hours * 3600)
|
|
|
|
|
|
def merge_segmented_webvtt(vtt_raw: str, segment_durations: Optional[list[int]] = None, timescale: int = 1) -> str:
    """
    Merge Segmented WebVTT data.

    Parameters:
        vtt_raw: The concatenated WebVTT files to merge. All WebVTT headers must be
            appropriately spaced apart, or it may produce unwanted effects like
            considering headers as captions, timestamp lines, etc.
        segment_durations: A list of each segment's duration. If not provided it will try
            to get it from the X-TIMESTAMP-MAP headers, specifically the MPEGTS number.
        timescale: The number of time units per second.

    This parses the X-TIMESTAMP-MAP data to compute new absolute timestamps, replacing
    the old start and end timestamp values. All X-TIMESTAMP-MAP header information will
    be removed from the output as they are no longer of concern. Consider this function
    the opposite of a WebVTT Segmenter, a WebVTT Joiner of sorts.

    Algorithm borrowed from N_m3u8DL-RE and shaka-player.
    """
    MPEG_TIMESCALE = 90_000

    # Check config for conversion method preference
    conversion_method = config.subtitle.get("conversion_method", "auto")
    use_pysubs2 = conversion_method in ("pysubs2", "auto")

    if use_pysubs2:
        # Try using pysubs2 first for more lenient parsing
        try:
            # Use pysubs2 to parse and normalize the VTT
            subs = pysubs2.SSAFile.from_string(vtt_raw)
            # Convert back to WebVTT string for pycaption processing
            normalized_vtt = subs.to_string("vtt")
            vtt = WebVTTReaderExt().read(normalized_vtt)
        except Exception:
            # Fall back to direct pycaption parsing
            vtt = WebVTTReaderExt().read(vtt_raw)
    else:
        # Use pycaption directly
        vtt = WebVTTReaderExt().read(vtt_raw)

    for lang in vtt.get_languages():
        prev_caption = None
        # Indices of captions spliced into their predecessor; set for O(1) lookup.
        duplicate_index: set[int] = set()
        captions = vtt.get_captions(lang)
        if not captions:
            # Nothing to merge for this language; guard the captions[0] access below.
            continue

        if captions[0].segment_index == 0:
            first_segment_mpegts = captions[0].mpegts
        else:
            # First segment had no cue; fall back to the duration list or the
            # value captured early by WebVTTReaderExt._parse.
            first_segment_mpegts = segment_durations[0] if segment_durations else captions.first_segment_mpegts

        caption: CaptionExt
        for i, caption in enumerate(captions):
            # DASH WebVTT doesn't have MPEGTS timestamp like HLS. Instead,
            # calculate the timestamp from SegmentTemplate/SegmentList duration.
            likely_dash = first_segment_mpegts == 0 and caption.mpegts == 0
            if likely_dash and segment_durations:
                duration = segment_durations[caption.segment_index]
                caption.mpegts = MPEG_TIMESCALE * (duration / timescale)

            if caption.mpegts == 0:
                continue

            # Commented to fix DSNP subs being out of sync and mistimed.
            # seconds = (caption.mpegts - first_segment_mpegts) / MPEG_TIMESCALE - caption.cue_time
            # offset = seconds * 1_000_000  # pycaption use microseconds

            # if caption.start < offset:
            #     caption.start += offset
            #     caption.end += offset

            # If the difference between current and previous captions is <=1ms
            # and the payload is equal then splice.
            if (
                prev_caption
                and not caption.is_empty()
                and (caption.start - prev_caption.end) <= 1000  # 1ms in microseconds
                and caption.get_text() == prev_caption.get_text()
            ):
                prev_caption.end = caption.end
                duplicate_index.add(i)

            prev_caption = caption

        # Remove duplicates in a single pass. (The original rebuilt
        # set(duplicate_index) per element inside the comprehension,
        # making this step quadratic.)
        captions[:] = [c for c_index, c in enumerate(captions) if c_index not in duplicate_index]

    return WebVTTWriter().write(vtt)
|