import re

from langcodes import Language, tag_is_valid


def first_valid_bcp47(parts):
    """Return the first token in *parts* that is a valid BCP 47 tag.

    Each token is stripped of surrounding whitespace before it is checked.
    Tokens fully wrapped in square brackets (markers such as ``[Original]``)
    are skipped. Returns ``None`` when no token validates.
    """
    for part in parts:
        token = part.strip()
        # Bracketed markers like [Original] are annotations, not language tags.
        if token.startswith("[") and token.endswith("]"):
            continue
        # No case normalisation is done here; the token is validated as-is.
        if tag_is_valid(token):
            return token
    return None


def extract_langs(text):
    """Collect audio and subtitle language tags from a track listing.

    Every line of *text* is searched for ``[CODEC] | <lang>`` where CODEC is
    one of the known audio or subtitle codec labels and ``<lang>`` is a
    two-letter lowercase code with an optional ``-XX`` uppercase suffix.

    Returns:
        A tuple ``(audio, subs)`` of two lists holding the tags in first-seen
        order, without duplicates.
    """
    # NOTE(review): the tag pattern has no right-hand boundary, so a
    # three-letter code would be captured as its first two letters - confirm
    # the log never emits those.
    lang_pattern = r'([a-z]{2}(?:-[A-Z]{2})?)'
    # Compile once up front instead of rebuilding the patterns for every line.
    audio_re = re.compile(
        rf'\[(AAC|DD\+?|AC-4|OPUS|VORB|DTS|ALAC|FLAC)\]\s*\|\s*{lang_pattern}'
    )
    sub_re = re.compile(
        rf'\[(SRT|SSA|ASS|VTT|TTML|SMI|SUB|MPL2|TMP|STPP|WVTT)\]\s*\|\s*{lang_pattern}'
    )

    audio = []
    subs = []
    for line in text.splitlines():
        # Group 1 is the codec label, group 2 the language tag.
        for pattern, bucket in ((audio_re, audio), (sub_re, subs)):
            match = pattern.search(line)
            if match and match.group(2) not in bucket:
                bucket.append(match.group(2))
    return audio, subs


def _language_report(configured, found_tags):
    """Summarise configured vs. found languages for one track type."""
    # Reduce every found tag to its primary language subtag (e.g. en-US -> en).
    found_langs = {Language.get(tag).language for tag in found_tags}
    return {
        "configured": configured,
        "found_tags": found_tags,
        "found_langs": sorted(found_langs),
        # True when every configured language is present; an empty
        # configuration is trivially satisfied.
        "exists_all": all(
            Language.get(cfg).language in found_langs for cfg in configured
        ),
    }


def check_langs_with_langcodes(stderr_text: str, audio_lang_cfg: list[str], sub_lang_cfg: list[str]):
    """Check that the configured languages appear in the track listing.

    Tags are extracted from *stderr_text* with ``extract_langs`` and compared
    on their primary language subtag only, so a configured ``en`` is
    satisfied by a found ``en-US``.

    Returns:
        A dict with the keys ``"audio"`` and ``"subtitle"``. Each value is a
        dict with ``"configured"`` (the list passed in), ``"found_tags"``
        (tags as found), ``"found_langs"`` (sorted primary subtags) and
        ``"exists_all"`` (bool).
    """
    audio_tags, sub_tags = extract_langs(stderr_text)
    return {
        "audio": _language_report(audio_lang_cfg, audio_tags),
        "subtitle": _language_report(sub_lang_cfg, sub_tags),
    }


def video_details(stderr_text: str):
    """Parse the video track lines of the 'All Tracks' section.

    Only the text before the first occurrence of 'Selected Tracks' is
    scanned; when that marker is absent the whole text is scanned.

    Returns:
        A list with one dict per matched ``VID | [...]`` line:

        - ``codec``: first entry inside the brackets
        - ``range``: second entry inside the brackets, or ``None``
        - ``resolution``: ``[width, height]`` as matched (strings)
        - ``bitrate``: int, kb/s with thousands separators removed
        - ``framerate``: float, or ``None`` for 'Unknown FPS'
        - ``size``: e.g. ``'376.04 MiB'``, or ``None`` when not reported
    """
    # Anchored to 'VID | [ ... ]' so it never reads the log-level marker [I].
    vid_re = re.compile(
        r"""
        VID\s*\|\s*
        \[\s*(?P<codec>[^,\]]+)\s*(?:,\s*(?P<range>[^\]]+))?\]\s*\|\s*
        (?P<width>\d{3,4})x(?P<height>\d{3,4})\s*@\s*(?P<kbps>[\d,]+)\s*kb/s
        (?:\s*\((?P<size>[^()]*?(?:MiB|GiB)[^()]*)\))?\s*,\s*
        (?P<fps>\d+(?:\.\d+)?|Unknown)\s*FPS
        """,
        re.VERBOSE,
    )

    # Everything before the marker; the full text when the marker is missing.
    all_section = stderr_text.partition("Selected Tracks")[0]

    results = []
    for match in vid_re.finditer(all_section):
        try:
            framerate = float(match.group("fps"))
        except ValueError:
            # 'Unknown FPS' - the track is still reported, without a rate.
            framerate = None

        dyn_range = match.group("range")
        size = match.group("size")
        results.append({
            "codec": match.group("codec").strip(),
            "range": dyn_range.strip() if dyn_range else None,
            "resolution": [match.group("width"), match.group("height")],
            "bitrate": int(match.group("kbps").replace(",", "")),
            "framerate": framerate,
            "size": size.strip() if size else None,
        })
    return results


def extract_chapters(stderr_text: str): """ Parse chapter lines from vinetrimmer-like logs. Returns: list of dicts: {'index': '01', 'time': '00:04:21.762', 'name': 'intro'} Stops parsing at 'Selected Tracks' to prefer the 'All Tracks' inventory if present. """ # Matches: "CHP | [01] | 00:04:21.762 | intro" CHAPTER_RE = re.compile( r""" ^.*?\bCHP\b\s*\|\s*\[(?P\d{1,3})\]\s*\|\s* (?P