feat(dl): extract closed captions from HLS manifests and improve CC extraction

- Parse CLOSED-CAPTIONS entries from HLS manifests and attach CC metadata (language, name, instream_id) to video tracks
- Move CC extraction to run after decryption instead of before, fixing extraction failures on encrypted streams
- Extract CCs even when other subtitle tracks exist, using manifest CC language info instead of guessing
- Try ccextractor on the original file before repacking to preserve container-level CC data (e.g. c608 boxes) that ffmpeg remux strips
- Display deduplicated closed captions in --list output and download progress, positioned after subtitles
- Add closed_captions field to Video track class
2026-03-05 15:57:29 -07:00
parent 7dd6323be5
commit 15acaea208
4 changed files with 156 additions and 103 deletions
--- a/unshackle/core/manifests/hls.py
+++ b/unshackle/core/manifests/hls.py
@@ -112,6 +112,15 @@ class HLS:
        session_drm = HLS.get_all_drm(session_keys)

        audio_codecs_by_group_id: dict[str, Audio.Codec] = {}
+        cc_by_group_id: dict[str, list[dict[str, Any]]] = {}
+        for media in self.manifest.media:
+            if media.type == "CLOSED-CAPTIONS":
+                cc_by_group_id.setdefault(media.group_id, []).append({
+                    "language": media.language,
+                    "name": media.name,
+                    "instream_id": media.instream_id,
+                    "characteristics": media.characteristics,
+                })
        tracks = Tracks()

        for playlist in self.manifest.playlists:
@@ -161,6 +170,9 @@ class HLS:
                            width=playlist.stream_info.resolution[0] if playlist.stream_info.resolution else None,
                            height=playlist.stream_info.resolution[1] if playlist.stream_info.resolution else None,
                            fps=playlist.stream_info.frame_rate,
+                            closed_captions=cc_by_group_id.get(
+                                (playlist.stream_info.closed_captions or "").strip('"'), []
+                            ),
                        )
                        if primary_track_type is Video
                        else {}