feat(dl): extract closed captions from HLS manifests and improve CC extraction

- Parse CLOSED-CAPTIONS entries from HLS manifests and attach CC metadata (language, name, instream_id, characteristics) to video tracks
- Move CC extraction to run after decryption instead of before, fixing extraction failures on encrypted streams
- Extract CCs even when other subtitle tracks exist, using manifest CC language info instead of guessing
- Try ccextractor on the original file before repacking to preserve container-level CC data (e.g. c608 boxes) that ffmpeg remux strips
- Display deduplicated closed captions in --list output and download progress, positioned after subtitles
- Add closed_captions field to Video track class
2026-03-05 15:57:29 -07:00
parent 7dd6323be5
commit 15acaea208
4 changed files with 156 additions and 103 deletions
--- a/unshackle/core/manifests/hls.py
+++ b/unshackle/core/manifests/hls.py
@@ -112,6 +112,15 @@ class HLS:
        session_drm = HLS.get_all_drm(session_keys)

        audio_codecs_by_group_id: dict[str, Audio.Codec] = {}
+        cc_by_group_id: dict[str, list[dict[str, Any]]] = {}
+        for media in self.manifest.media:
+            if media.type == "CLOSED-CAPTIONS":
+                cc_by_group_id.setdefault(media.group_id, []).append({
+                    "language": media.language,
+                    "name": media.name,
+                    "instream_id": media.instream_id,
+                    "characteristics": media.characteristics,
+                })
        tracks = Tracks()

        for playlist in self.manifest.playlists:
@@ -161,6 +170,9 @@ class HLS:
                            width=playlist.stream_info.resolution[0] if playlist.stream_info.resolution else None,
                            height=playlist.stream_info.resolution[1] if playlist.stream_info.resolution else None,
                            fps=playlist.stream_info.frame_rate,
+                            closed_captions=cc_by_group_id.get(
+                                (playlist.stream_info.closed_captions or "").strip('"'), []
+                            ),
                        )
                        if primary_track_type is Video
                        else {}
--- a/unshackle/core/tracks/tracks.py
+++ b/unshackle/core/tracks/tracks.py
@@ -103,53 +103,78 @@ class Tracks:
        tree = Tree("", hide_root=True)
        for track_type in self.TRACK_ORDER_MAP:
            tracks = list(x for x in all_tracks if isinstance(x, track_type))
-            if not tracks:
-                continue
-            num_tracks = len(tracks)
-            track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "")
-            tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}")
-            for track in tracks:
-                if add_progress and track_type not in (Chapter, Attachment):
-                    progress = Progress(
-                        SpinnerColumn(finished_text=""),
-                        BarColumn(),
-                        "•",
-                        TimeRemainingColumn(compact=True, elapsed_when_finished=True),
-                        "•",
-                        TextColumn("[progress.data.speed]{task.fields[downloaded]}"),
-                        console=console,
-                        speed_estimate_period=10,
+            if tracks:
+                num_tracks = len(tracks)
+                track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "")
+                tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}")
+                for track in tracks:
+                    if add_progress and track_type not in (Chapter, Attachment):
+                        progress = Progress(
+                            SpinnerColumn(finished_text=""),
+                            BarColumn(),
+                            "•",
+                            TimeRemainingColumn(compact=True, elapsed_when_finished=True),
+                            "•",
+                            TextColumn("[progress.data.speed]{task.fields[downloaded]}"),
+                            console=console,
+                            speed_estimate_period=10,
+                        )
+                        task = progress.add_task("", downloaded="-")
+                        state = {"total": 100.0}
+
+                        def update_track_progress(
+                            task_id: int = task,
+                            _state: dict[str, float] = state,
+                            _progress: Progress = progress,
+                            **kwargs,
+                        ) -> None:
+                            """
+                            Ensure terminal status states render as a fully completed bar.
+
+                            Some downloaders can report completed slightly below total
+                            before emitting the final "Downloaded" state.
+                            """
+                            if "total" in kwargs and kwargs["total"] is not None:
+                                _state["total"] = kwargs["total"]
+
+                            downloaded_state = kwargs.get("downloaded")
+                            if downloaded_state in {"Downloaded", "Decrypted", "[yellow]SKIPPED"}:
+                                kwargs["completed"] = _state["total"]
+                            _progress.update(task_id=task_id, **kwargs)
+
+                        progress_callables.append(update_track_progress)
+                        track_table = Table.grid()
+                        track_table.add_row(str(track)[6:], style="text2")
+                        track_table.add_row(progress)
+                        tracks_tree.add(track_table)
+                    else:
+                        tracks_tree.add(str(track)[6:], style="text2")
+
+            # Show Closed Captions right after Subtitles (even if no subtitle tracks exist)
+            if track_type is Subtitle:
+                seen_cc: set[str] = set()
+                unique_cc: list[str] = []
+                for video in (x for x in all_tracks if isinstance(x, Video)):
+                    for cc in getattr(video, "closed_captions", []):
+                        lang = cc.get("language", "und")
+                        name = cc.get("name", "")
+                        instream_id = cc.get("instream_id", "")
+                        key = f"{lang}|{instream_id}"
+                        if key in seen_cc:
+                            continue
+                        seen_cc.add(key)
+                        parts = [f"[CC] | {lang}"]
+                        if name:
+                            parts.append(name)
+                        if instream_id:
+                            parts.append(instream_id)
+                        unique_cc.append(" | ".join(parts))
+                if unique_cc:
+                    cc_tree = tree.add(
+                        f"[repr.number]{len(unique_cc)}[/] Closed Caption{'s' if len(unique_cc) != 1 else ''}"
                    )
-                    task = progress.add_task("", downloaded="-")
-                    state = {"total": 100.0}
-
-                    def update_track_progress(
-                        task_id: int = task,
-                        _state: dict[str, float] = state,
-                        _progress: Progress = progress,
-                        **kwargs,
-                    ) -> None:
-                        """
-                        Ensure terminal status states render as a fully completed bar.
-
-                        Some downloaders can report completed slightly below total
-                        before emitting the final "Downloaded" state.
-                        """
-                        if "total" in kwargs and kwargs["total"] is not None:
-                            _state["total"] = kwargs["total"]
-
-                        downloaded_state = kwargs.get("downloaded")
-                        if downloaded_state in {"Downloaded", "Decrypted", "[yellow]SKIPPED"}:
-                            kwargs["completed"] = _state["total"]
-                        _progress.update(task_id=task_id, **kwargs)
-
-                    progress_callables.append(update_track_progress)
-                    track_table = Table.grid()
-                    track_table.add_row(str(track)[6:], style="text2")
-                    track_table.add_row(progress)
-                    tracks_tree.add(track_table)
-                else:
-                    tracks_tree.add(str(track)[6:], style="text2")
+                    for cc_str in unique_cc:
+                        cc_tree.add(cc_str, style="text2")

        return tree, progress_callables

--- a/unshackle/core/tracks/video.py
+++ b/unshackle/core/tracks/video.py
@@ -200,6 +200,7 @@ class Video(Track):
        height: Optional[int] = None,
        fps: Optional[Union[str, int, float]] = None,
        scan_type: Optional[Video.ScanType] = None,
+        closed_captions: Optional[list[dict[str, Any]]] = None,
        **kwargs: Any,
    ) -> None:
        """
@@ -264,6 +265,7 @@ class Video(Track):
            raise ValueError("Expected fps to be a number, float, or a string as numerator/denominator form, " + str(e))

        self.scan_type = scan_type
+        self.closed_captions: list[dict[str, Any]] = closed_captions or []
        self.needs_duration_fix = False

    def __str__(self) -> str:
@@ -346,22 +348,27 @@ class Video(Track):
        if not binaries.CCExtractor:
            raise EnvironmentError("ccextractor executable was not found.")

-        # ccextractor often fails in weird ways unless we repack
-        self.repackage()
-
        out_path = Path(out_path)

-        try:
-            subprocess.run(
-                [binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path],
-                check=True,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-            )
-        except subprocess.CalledProcessError as e:
-            out_path.unlink(missing_ok=True)
-            if not e.returncode == 10:  # No captions found
-                raise
+        def _run_ccextractor() -> bool:
+            try:
+                subprocess.run(
+                    [binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path],
+                    check=True,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                )
+            except subprocess.CalledProcessError as e:
+                out_path.unlink(missing_ok=True)
+                if e.returncode != 10:  # 10 = No captions found
+                    raise
+            return out_path.exists()
+
+        # Try on the original file first (preserves container-level CC data like c608 boxes),
+        # then fall back to repacked file (ccextractor can fail on some container formats).
+        if not _run_ccextractor():
+            self.repackage()
+            _run_ccextractor()

        if out_path.exists():
            cc_track = Subtitle(