feat(dl): extract closed captions from HLS manifests and improve CC extraction

- Parse CLOSED-CAPTIONS entries from HLS manifests and attach CC metadata (language, name, instream_id) to video tracks
- Move CC extraction to run after decryption instead of before, fixing extraction failures on encrypted streams
- Extract CCs even when other subtitle tracks exist, using manifest CC language info instead of guessing
- Try ccextractor on the original file before repacking to preserve container-level CC data (e.g. c608 boxes) that ffmpeg remux strips
- Display deduplicated closed captions in --list output and download progress, positioned after subtitles
- Add closed_captions field to Video track class

This commit is contained in:
@@ -103,53 +103,78 @@ class Tracks:
|
||||
tree = Tree("", hide_root=True)
|
||||
for track_type in self.TRACK_ORDER_MAP:
|
||||
tracks = list(x for x in all_tracks if isinstance(x, track_type))
|
||||
if not tracks:
|
||||
continue
|
||||
num_tracks = len(tracks)
|
||||
track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "")
|
||||
tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}")
|
||||
for track in tracks:
|
||||
if add_progress and track_type not in (Chapter, Attachment):
|
||||
progress = Progress(
|
||||
SpinnerColumn(finished_text=""),
|
||||
BarColumn(),
|
||||
"•",
|
||||
TimeRemainingColumn(compact=True, elapsed_when_finished=True),
|
||||
"•",
|
||||
TextColumn("[progress.data.speed]{task.fields[downloaded]}"),
|
||||
console=console,
|
||||
speed_estimate_period=10,
|
||||
if tracks:
|
||||
num_tracks = len(tracks)
|
||||
track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "")
|
||||
tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}")
|
||||
for track in tracks:
|
||||
if add_progress and track_type not in (Chapter, Attachment):
|
||||
progress = Progress(
|
||||
SpinnerColumn(finished_text=""),
|
||||
BarColumn(),
|
||||
"•",
|
||||
TimeRemainingColumn(compact=True, elapsed_when_finished=True),
|
||||
"•",
|
||||
TextColumn("[progress.data.speed]{task.fields[downloaded]}"),
|
||||
console=console,
|
||||
speed_estimate_period=10,
|
||||
)
|
||||
task = progress.add_task("", downloaded="-")
|
||||
state = {"total": 100.0}
|
||||
|
||||
def update_track_progress(
|
||||
task_id: int = task,
|
||||
_state: dict[str, float] = state,
|
||||
_progress: Progress = progress,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""
|
||||
Ensure terminal status states render as a fully completed bar.
|
||||
|
||||
Some downloaders can report completed slightly below total
|
||||
before emitting the final "Downloaded" state.
|
||||
"""
|
||||
if "total" in kwargs and kwargs["total"] is not None:
|
||||
_state["total"] = kwargs["total"]
|
||||
|
||||
downloaded_state = kwargs.get("downloaded")
|
||||
if downloaded_state in {"Downloaded", "Decrypted", "[yellow]SKIPPED"}:
|
||||
kwargs["completed"] = _state["total"]
|
||||
_progress.update(task_id=task_id, **kwargs)
|
||||
|
||||
progress_callables.append(update_track_progress)
|
||||
track_table = Table.grid()
|
||||
track_table.add_row(str(track)[6:], style="text2")
|
||||
track_table.add_row(progress)
|
||||
tracks_tree.add(track_table)
|
||||
else:
|
||||
tracks_tree.add(str(track)[6:], style="text2")
|
||||
|
||||
# Show Closed Captions right after Subtitles (even if no subtitle tracks exist)
|
||||
if track_type is Subtitle:
|
||||
seen_cc: set[str] = set()
|
||||
unique_cc: list[str] = []
|
||||
for video in (x for x in all_tracks if isinstance(x, Video)):
|
||||
for cc in getattr(video, "closed_captions", []):
|
||||
lang = cc.get("language", "und")
|
||||
name = cc.get("name", "")
|
||||
instream_id = cc.get("instream_id", "")
|
||||
key = f"{lang}|{instream_id}"
|
||||
if key in seen_cc:
|
||||
continue
|
||||
seen_cc.add(key)
|
||||
parts = [f"[CC] | {lang}"]
|
||||
if name:
|
||||
parts.append(name)
|
||||
if instream_id:
|
||||
parts.append(instream_id)
|
||||
unique_cc.append(" | ".join(parts))
|
||||
if unique_cc:
|
||||
cc_tree = tree.add(
|
||||
f"[repr.number]{len(unique_cc)}[/] Closed Caption{'s' if len(unique_cc) != 1 else ''}"
|
||||
)
|
||||
task = progress.add_task("", downloaded="-")
|
||||
state = {"total": 100.0}
|
||||
|
||||
def update_track_progress(
|
||||
task_id: int = task,
|
||||
_state: dict[str, float] = state,
|
||||
_progress: Progress = progress,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""
|
||||
Ensure terminal status states render as a fully completed bar.
|
||||
|
||||
Some downloaders can report completed slightly below total
|
||||
before emitting the final "Downloaded" state.
|
||||
"""
|
||||
if "total" in kwargs and kwargs["total"] is not None:
|
||||
_state["total"] = kwargs["total"]
|
||||
|
||||
downloaded_state = kwargs.get("downloaded")
|
||||
if downloaded_state in {"Downloaded", "Decrypted", "[yellow]SKIPPED"}:
|
||||
kwargs["completed"] = _state["total"]
|
||||
_progress.update(task_id=task_id, **kwargs)
|
||||
|
||||
progress_callables.append(update_track_progress)
|
||||
track_table = Table.grid()
|
||||
track_table.add_row(str(track)[6:], style="text2")
|
||||
track_table.add_row(progress)
|
||||
tracks_tree.add(track_table)
|
||||
else:
|
||||
tracks_tree.add(str(track)[6:], style="text2")
|
||||
for cc_str in unique_cc:
|
||||
cc_tree.add(cc_str, style="text2")
|
||||
|
||||
return tree, progress_callables
|
||||
|
||||
|
||||
@@ -200,6 +200,7 @@ class Video(Track):
|
||||
height: Optional[int] = None,
|
||||
fps: Optional[Union[str, int, float]] = None,
|
||||
scan_type: Optional[Video.ScanType] = None,
|
||||
closed_captions: Optional[list[dict[str, Any]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""
|
||||
@@ -264,6 +265,7 @@ class Video(Track):
|
||||
raise ValueError("Expected fps to be a number, float, or a string as numerator/denominator form, " + str(e))
|
||||
|
||||
self.scan_type = scan_type
|
||||
self.closed_captions: list[dict[str, Any]] = closed_captions or []
|
||||
self.needs_duration_fix = False
|
||||
|
||||
def __str__(self) -> str:
|
||||
@@ -346,22 +348,27 @@ class Video(Track):
|
||||
if not binaries.CCExtractor:
|
||||
raise EnvironmentError("ccextractor executable was not found.")
|
||||
|
||||
# ccextractor often fails in weird ways unless we repack
|
||||
self.repackage()
|
||||
|
||||
out_path = Path(out_path)
|
||||
|
||||
try:
|
||||
subprocess.run(
|
||||
[binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path],
|
||||
check=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
except subprocess.CalledProcessError as e:
|
||||
out_path.unlink(missing_ok=True)
|
||||
if not e.returncode == 10: # No captions found
|
||||
raise
|
||||
def _run_ccextractor() -> bool:
|
||||
try:
|
||||
subprocess.run(
|
||||
[binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path],
|
||||
check=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
except subprocess.CalledProcessError as e:
|
||||
out_path.unlink(missing_ok=True)
|
||||
if e.returncode != 10: # 10 = No captions found
|
||||
raise
|
||||
return out_path.exists()
|
||||
|
||||
# Try on the original file first (preserves container-level CC data like c608 boxes),
|
||||
# then fall back to repacked file (ccextractor can fail on some container formats).
|
||||
if not _run_ccextractor():
|
||||
self.repackage()
|
||||
_run_ccextractor()
|
||||
|
||||
if out_path.exists():
|
||||
cc_track = Subtitle(
|
||||
|
||||
Reference in New Issue
Block a user