feat(dl): parse closed-caption metadata from HLS manifests and improve CC extraction

- Parse CLOSED-CAPTIONS entries from HLS manifests and attach CC metadata (language, name, instream_id) to video tracks
- Move CC extraction to run after decryption instead of before, fixing extraction failures on encrypted streams
- Extract CCs even when other subtitle tracks exist, using manifest CC language info instead of guessing
- Try ccextractor on the original file before repacking to preserve container-level CC data (e.g. c608 boxes) that ffmpeg remux strips
- Display deduplicated closed captions in --list output and download progress, positioned after subtitles
- Add closed_captions field to Video track class
This commit is contained in:
Andy
2026-03-05 15:57:29 -07:00
parent 7dd6323be5
commit 15acaea208
4 changed files with 156 additions and 103 deletions

View File

@@ -112,6 +112,15 @@ class HLS:
session_drm = HLS.get_all_drm(session_keys)
audio_codecs_by_group_id: dict[str, Audio.Codec] = {}
cc_by_group_id: dict[str, list[dict[str, Any]]] = {}
for media in self.manifest.media:
if media.type == "CLOSED-CAPTIONS":
cc_by_group_id.setdefault(media.group_id, []).append({
"language": media.language,
"name": media.name,
"instream_id": media.instream_id,
"characteristics": media.characteristics,
})
tracks = Tracks()
for playlist in self.manifest.playlists:
@@ -161,6 +170,9 @@ class HLS:
width=playlist.stream_info.resolution[0] if playlist.stream_info.resolution else None,
height=playlist.stream_info.resolution[1] if playlist.stream_info.resolution else None,
fps=playlist.stream_info.frame_rate,
closed_captions=cc_by_group_id.get(
(playlist.stream_info.closed_captions or "").strip('"'), []
),
)
if primary_track_type is Video
else {}