refactor(providers): extract metadata providers into modular system
- Create `unshackle/core/providers/` package with abstract base class, IMDBApi (free, no key), SIMKL, and TMDB provider implementations
- Add consensus-based ID enrichment: cross-references IMDB IDs with TMDB and SIMKL, drops all data from providers that disagree on tmdb_id (likely resolved to wrong title)
- Cache enriched IDs alongside raw provider data so they survive cache round-trips
- Genericize TitleCacher with `cache_provider()`/`get_cached_provider()` replacing provider-specific methods; respect `--no-cache` flag
- Add `--imdb` CLI flag to dl command for direct IMDB ID lookup
This commit is contained in:
428
unshackle/core/providers/__init__.py
Normal file
428
unshackle/core/providers/__init__.py
Normal file
@@ -0,0 +1,428 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from unshackle.core.providers._base import ExternalIds, MetadataProvider, MetadataResult, fuzzy_match, log
|
||||
from unshackle.core.providers.imdbapi import IMDBApiProvider
|
||||
from unshackle.core.providers.simkl import SimklProvider
|
||||
from unshackle.core.providers.tmdb import TMDBProvider
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from unshackle.core.title_cacher import TitleCacher
|
||||
|
||||
# Provider classes ordered by priority: IMDBApi (free), SIMKL, TMDB.
# search_metadata() walks this list front to back (first for cached data, then
# for live searches) and returns the first result whose title fuzzy-matches.
ALL_PROVIDERS: list[type[MetadataProvider]] = [IMDBApiProvider, SimklProvider, TMDBProvider]
|
||||
|
||||
|
||||
def get_available_providers() -> list[MetadataProvider]:
    """Return one instance of each provider in ``ALL_PROVIDERS`` that is available.

    Every provider class is instantiated exactly once, and the instance that
    answered ``is_available()`` is the one returned (the class is not
    constructed a second time just to run the availability check).

    Returns:
        Provider instances, in ``ALL_PROVIDERS`` order, whose
        ``is_available()`` returned a truthy value.
    """
    candidates = (provider_cls() for provider_cls in ALL_PROVIDERS)
    return [provider for provider in candidates if provider.is_available()]
|
||||
|
||||
|
||||
def get_provider(name: str) -> Optional[MetadataProvider]:
    """Look up a provider by its ``NAME`` and return a usable instance.

    Returns:
        An instance of the first class in ``ALL_PROVIDERS`` whose ``NAME``
        equals ``name``, or ``None`` when no class matches or the matching
        provider reports itself unavailable.
    """
    provider_cls = next((candidate for candidate in ALL_PROVIDERS if candidate.NAME == name), None)
    if provider_cls is None:
        return None

    instance = provider_cls()
    if instance.is_available():
        return instance
    return None
|
||||
|
||||
|
||||
# -- Public API (replaces tags.py functions) --
|
||||
|
||||
|
||||
def search_metadata(
    title: str,
    year: Optional[int],
    kind: str,
    title_cacher: Optional[TitleCacher] = None,
    cache_title_id: Optional[str] = None,
    cache_region: Optional[str] = None,
    cache_account_hash: Optional[str] = None,
) -> Optional[MetadataResult]:
    """Find metadata for ``title`` using the providers in ``ALL_PROVIDERS``.

    Two passes are made over the providers, both in priority order:

    1. When a cacher and a cache title ID are supplied, each available
       provider's cached entry is rebuilt into a result; the first one whose
       title fuzzy-matches ``title`` is returned.
    2. Otherwise each available provider is searched live. The first result
       whose title fuzzy-matches is enriched via ``enrich_ids()``, cached
       (best effort) and returned.

    Returns:
        The first matching ``MetadataResult``, or ``None`` if nothing matched.
    """
    # Pass 1: try to satisfy the request from cached provider data.
    if title_cacher and cache_title_id:
        for provider_cls in ALL_PROVIDERS:
            provider = provider_cls()
            if not provider.is_available():
                continue
            entry = title_cacher.get_cached_provider(
                provider.NAME, cache_title_id, kind, cache_region, cache_account_hash
            )
            if not entry:
                continue
            restored = _cached_to_result(entry, provider.NAME, kind)
            if restored and restored.title and fuzzy_match(restored.title, title):
                log.debug("Using cached %s data for %r", provider.NAME, title)
                return restored

    # Pass 2: live search, highest-priority provider first.
    for provider_cls in ALL_PROVIDERS:
        provider = provider_cls()
        if not provider.is_available():
            continue

        try:
            found = provider.search(title, year, kind)
        except (requests.RequestException, ValueError, KeyError) as exc:
            log.debug("%s search failed: %s", provider.NAME, exc)
            continue

        if not (found and found.title and fuzzy_match(found.title, title)):
            continue

        # Fill in missing TMDB/TVDB IDs by cross-referencing the IMDB ID.
        enrich_ids(found)

        if title_cacher and cache_title_id and found.raw:
            # Caching is best effort: a failure here must not lose the result.
            try:
                if found.external_ids.tmdb_id or found.external_ids.tvdb_id:
                    # Store the enriched IDs next to the raw payload so they
                    # can be restored when the entry is read back.
                    payload = {
                        **found.raw,
                        "_enriched_ids": _external_ids_to_dict(found.external_ids),
                    }
                else:
                    payload = found.raw
                title_cacher.cache_provider(
                    provider.NAME, cache_title_id, payload, kind, cache_region, cache_account_hash
                )
            except Exception as exc:
                log.debug("Failed to cache %s data: %s", provider.NAME, exc)

        return found

    return None
|
||||
|
||||
|
||||
def get_title_by_id(
    tmdb_id: int,
    kind: str,
    title_cacher: Optional[TitleCacher] = None,
    cache_title_id: Optional[str] = None,
    cache_region: Optional[str] = None,
    cache_account_hash: Optional[str] = None,
) -> Optional[str]:
    """Get title name by TMDB ID.

    The cached "tmdb" entry is consulted first. The cache lookup does not
    include ``tmdb_id``, so the cached detail's own ``id`` is compared with the
    requested one; an entry recorded for a different TMDB ID is ignored rather
    than returned as the wrong title. Entries without an ``id`` are trusted.

    On a cache miss the TMDB provider is queried and, when a cacher is
    supplied, the detail plus external IDs are cached on a best-effort basis.

    Returns:
        The title (``title`` or ``name`` of the TMDB detail), or ``None`` when
        the TMDB provider is unavailable or returns nothing.

    Raises:
        Whatever ``get_by_id`` of the TMDB provider raises; that call is not
        guarded here.
    """
    # Check cache first
    if title_cacher and cache_title_id:
        cached = title_cacher.get_cached_provider("tmdb", cache_title_id, kind, cache_region, cache_account_hash)
        detail = cached.get("detail") if cached else None
        if detail:
            cached_id = detail.get("id")
            # Compare as strings so an int/str mismatch does not defeat the cache.
            if cached_id is None or str(cached_id) == str(tmdb_id):
                tmdb_title = detail.get("title") or detail.get("name")
                if tmdb_title:
                    log.debug("Using cached TMDB title: %r", tmdb_title)
                    return tmdb_title
            else:
                log.debug("Ignoring cached TMDB data for id %s (requested %s)", cached_id, tmdb_id)

    tmdb = get_provider("tmdb")
    if not tmdb:
        return None
    result = tmdb.get_by_id(tmdb_id, kind)
    if not result:
        return None

    # Cache if possible (best effort: failures are only logged)
    if title_cacher and cache_title_id and result.raw:
        try:
            ext_ids = tmdb.get_external_ids(tmdb_id, kind)
            title_cacher.cache_provider(
                "tmdb",
                cache_title_id,
                {"detail": result.raw, "external_ids": _external_ids_to_dict(ext_ids)},
                kind,
                cache_region,
                cache_account_hash,
            )
        except Exception as exc:
            log.debug("Failed to cache TMDB data: %s", exc)

    return result.title
|
||||
|
||||
|
||||
def get_year_by_id(
    tmdb_id: int,
    kind: str,
    title_cacher: Optional[TitleCacher] = None,
    cache_title_id: Optional[str] = None,
    cache_region: Optional[str] = None,
    cache_account_hash: Optional[str] = None,
) -> Optional[int]:
    """Get release year by TMDB ID.

    The cached "tmdb" entry is consulted first. The cache lookup does not
    include ``tmdb_id``, so the cached detail's own ``id`` is compared with the
    requested one; an entry recorded for a different TMDB ID is ignored rather
    than yielding another title's year. Entries without an ``id`` are trusted.

    On a cache miss the TMDB provider is queried and, when a cacher is
    supplied, the detail plus external IDs are cached on a best-effort basis.

    Returns:
        The year parsed from the first four characters of ``release_date`` or
        ``first_air_date`` (cache hit), the provider result's ``year`` (cache
        miss), or ``None`` when the TMDB provider is unavailable or returns
        nothing.

    Raises:
        Whatever ``get_by_id`` of the TMDB provider raises; that call is not
        guarded here.
    """
    # Check cache first
    if title_cacher and cache_title_id:
        cached = title_cacher.get_cached_provider("tmdb", cache_title_id, kind, cache_region, cache_account_hash)
        detail = cached.get("detail") if cached else None
        if detail:
            cached_id = detail.get("id")
            # Compare as strings so an int/str mismatch does not defeat the cache.
            if cached_id is None or str(cached_id) == str(tmdb_id):
                date = detail.get("release_date") or detail.get("first_air_date")
                if date and len(date) >= 4 and date[:4].isdigit():
                    year = int(date[:4])
                    log.debug("Using cached TMDB year: %d", year)
                    return year
            else:
                log.debug("Ignoring cached TMDB data for id %s (requested %s)", cached_id, tmdb_id)

    tmdb = get_provider("tmdb")
    if not tmdb:
        return None
    result = tmdb.get_by_id(tmdb_id, kind)
    if not result:
        return None

    # Cache if possible (best effort: failures are only logged)
    if title_cacher and cache_title_id and result.raw:
        try:
            ext_ids = tmdb.get_external_ids(tmdb_id, kind)
            title_cacher.cache_provider(
                "tmdb",
                cache_title_id,
                {"detail": result.raw, "external_ids": _external_ids_to_dict(ext_ids)},
                kind,
                cache_region,
                cache_account_hash,
            )
        except Exception as exc:
            log.debug("Failed to cache TMDB data: %s", exc)

    return result.year
|
||||
|
||||
|
||||
def fetch_external_ids(
    tmdb_id: int,
    kind: str,
    title_cacher: Optional[TitleCacher] = None,
    cache_title_id: Optional[str] = None,
    cache_region: Optional[str] = None,
    cache_account_hash: Optional[str] = None,
) -> ExternalIds:
    """Get external IDs by TMDB ID.

    The cached "tmdb" entry is consulted first. The cache lookup does not
    include ``tmdb_id``, so the TMDB ID recorded in the entry (the cached
    external IDs' ``tmdb_id``, else the cached detail's ``id``) is compared
    with the requested one. A mismatching entry is ignored; otherwise the
    requested ``tmdb_id`` would be paired with IMDB/TVDB IDs that belong to a
    different title. Entries that record no TMDB ID are trusted.

    On a cache miss the TMDB provider is queried. When a cacher is supplied
    the detail is fetched too so that a complete entry can be cached; this is
    best effort and failures are only logged.

    Returns:
        The external IDs, or an empty ``ExternalIds()`` when the TMDB provider
        is unavailable.

    Raises:
        Whatever ``get_external_ids`` of the TMDB provider raises; that call
        is not guarded here.
    """
    # Check cache first
    if title_cacher and cache_title_id:
        cached = title_cacher.get_cached_provider("tmdb", cache_title_id, kind, cache_region, cache_account_hash)
        raw = cached.get("external_ids") if cached else None
        if raw:
            cached_id = raw.get("tmdb_id")
            if cached_id is None:
                cached_id = (cached.get("detail") or {}).get("id")
            # Compare as strings so an int/str mismatch does not defeat the cache.
            if cached_id is None or str(cached_id) == str(tmdb_id):
                log.debug("Using cached TMDB external IDs")
                return ExternalIds(
                    imdb_id=raw.get("imdb_id"),
                    tmdb_id=tmdb_id,
                    tmdb_kind=kind,
                    tvdb_id=raw.get("tvdb_id"),
                )
            log.debug("Ignoring cached TMDB external IDs for id %s (requested %s)", cached_id, tmdb_id)

    tmdb = get_provider("tmdb")
    if not tmdb:
        return ExternalIds()
    ext = tmdb.get_external_ids(tmdb_id, kind)

    # Cache if possible (best effort: failures are only logged)
    if title_cacher and cache_title_id:
        try:
            # The cache entry needs the detail payload as well; skip caching
            # when it cannot be obtained.
            result = tmdb.get_by_id(tmdb_id, kind)
            detail = result.raw if result and result.raw else None
            if detail:
                title_cacher.cache_provider(
                    "tmdb",
                    cache_title_id,
                    {"detail": detail, "external_ids": _external_ids_to_dict(ext)},
                    kind,
                    cache_region,
                    cache_account_hash,
                )
        except Exception as exc:
            log.debug("Failed to cache TMDB data: %s", exc)

    return ext
|
||||
|
||||
|
||||
# -- Internal helpers --
|
||||
|
||||
|
||||
# Provider authority ranking for tie-breaking (lower index = more authoritative).
# enrich_ids() queries these providers, in this order, by IMDB ID;
# _validate_enrichments() uses the index below to pick the tmdb_id to trust
# when the providers disagree and no value has a majority.
_ENRICHMENT_PROVIDERS = ("tmdb", "simkl")
# Maps provider name -> rank (its position in _ENRICHMENT_PROVIDERS).
_ENRICHMENT_AUTHORITY: dict[str, int] = {name: i for i, name in enumerate(_ENRICHMENT_PROVIDERS)}
|
||||
|
||||
|
||||
def enrich_ids(result: MetadataResult) -> None:
    """Fill in missing TMDB/TVDB IDs on ``result`` using its IMDB ID.

    Nothing happens when the result has no IMDB ID or already carries both a
    TMDB and a TVDB ID. Otherwise every provider in ``_ENRICHMENT_PROVIDERS``
    that is available is asked to resolve the IMDB ID; the answers are passed
    through ``_validate_enrichments()`` (which drops providers whose tmdb_id
    conflicts with the authoritative one) and the surviving IDs are merged
    into ``result.external_ids`` in place, only filling gaps.
    """
    external = result.external_ids
    if not external.imdb_id or (external.tmdb_id and external.tvdb_id):
        return  # nothing to anchor on, or already have everything

    media_kind = result.kind or "movie"

    # Step 1: gather what each available provider knows about this IMDB ID.
    collected: list[tuple[str, ExternalIds]] = []
    for name in _ENRICHMENT_PROVIDERS:
        provider = get_provider(name)
        if provider:
            try:
                found = provider.find_by_imdb_id(external.imdb_id, media_kind)  # type: ignore[union-attr]
            except Exception as exc:
                log.debug("Enrichment via %s failed: %s", name, exc)
            else:
                if found:
                    collected.append((name, found))

    if not collected:
        return

    # Step 2 + 3: discard conflicting providers, then merge without overwriting.
    for _name, candidate in _validate_enrichments(collected):
        if not external.tmdb_id and candidate.tmdb_id:
            external.tmdb_id = candidate.tmdb_id
            external.tmdb_kind = candidate.tmdb_kind or media_kind
        if not external.tvdb_id and candidate.tvdb_id:
            external.tvdb_id = candidate.tvdb_id
|
||||
|
||||
|
||||
def _validate_enrichments(
    enrichments: list[tuple[str, ExternalIds]],
) -> list[tuple[str, ExternalIds]]:
    """Filter out enrichment results whose tmdb_id contradicts the trusted one.

    Each provider that reported a tmdb_id casts one vote. If at most one
    distinct value was reported, the input list is returned unchanged.
    Otherwise the trusted tmdb_id is the value reported by more than one
    provider, or, failing that, the value from the provider with the lowest
    rank in ``_ENRICHMENT_AUTHORITY``. Entries reporting a different tmdb_id
    are dropped entirely; entries with no tmdb_id are kept.
    """
    from collections import Counter

    # provider name -> the tmdb_id it reported (providers without one abstain)
    votes: dict[str, int] = {name: ext.tmdb_id for name, ext in enrichments if ext.tmdb_id is not None}

    tally = Counter(votes.values())
    if len(tally) <= 1:
        return enrichments  # all agree or only one voted — no conflict

    leading_value, leading_count = tally.most_common(1)[0]
    if leading_count > 1:
        anchor = leading_value
    else:
        # No majority — fall back to the most authoritative provider's value.
        anchor = votes[min(votes, key=lambda name: _ENRICHMENT_AUTHORITY.get(name, 99))]

    kept: list[tuple[str, ExternalIds]] = []
    for name, ext in enrichments:
        if ext.tmdb_id is None or ext.tmdb_id == anchor:
            kept.append((name, ext))
            continue
        log.debug(
            "Dropping %s enrichment data: tmdb_id %s conflicts with "
            "authoritative value %s (likely resolved to wrong title)",
            name,
            ext.tmdb_id,
            anchor,
        )

    return kept
|
||||
|
||||
|
||||
def _external_ids_to_dict(ext: ExternalIds) -> dict:
    """Serialise the populated fields of ``ext`` into a plain dict for caching.

    Only truthy values are included. Keys appear in the order ``imdb_id``,
    ``tmdb_id``, ``tmdb_kind``, ``tvdb_id``.
    """
    field_names = ("imdb_id", "tmdb_id", "tmdb_kind", "tvdb_id")
    pairs = ((field, getattr(ext, field)) for field in field_names)
    return {field: value for field, value in pairs if value}
|
||||
|
||||
|
||||
def _cached_to_result(cached: dict, provider_name: str, kind: str) -> Optional[MetadataResult]:
    """Convert a cached provider dict back to a MetadataResult.

    ``search_metadata()`` stores cross-referenced IDs under the
    ``"_enriched_ids"`` key for any provider. They are restored here for every
    provider (not only ``imdbapi``) so enrichment survives a cache round-trip:
    the provider's own IDs take precedence and enriched IDs only fill gaps.

    Args:
        cached: The dict previously stored for ``provider_name``.
        provider_name: ``"tmdb"``, ``"simkl"`` or ``"imdbapi"``.
        kind: Media kind, copied onto the result.

    Returns:
        The rebuilt result, or ``None`` for an unknown provider or a SIMKL
        payload that is neither an ``episode`` with a ``show`` nor a ``movie``
        with a ``movie``.
    """
    # `or {}` also covers a key that is present but stored as None.
    enriched = cached.get("_enriched_ids") or {}

    if provider_name == "tmdb":
        detail = cached.get("detail") or {}
        ext_raw = cached.get("external_ids") or {}
        title = detail.get("title") or detail.get("name")
        date = detail.get("release_date") or detail.get("first_air_date")
        year = int(date[:4]) if date and len(date) >= 4 and date[:4].isdigit() else None
        tmdb_id = detail.get("id")
        tmdb_kind = kind
        if not tmdb_id and enriched.get("tmdb_id"):
            tmdb_id = enriched["tmdb_id"]
            tmdb_kind = enriched.get("tmdb_kind") or kind
        return MetadataResult(
            title=title,
            year=year,
            kind=kind,
            external_ids=ExternalIds(
                imdb_id=ext_raw.get("imdb_id") or enriched.get("imdb_id"),
                tmdb_id=tmdb_id,
                tmdb_kind=tmdb_kind,
                tvdb_id=ext_raw.get("tvdb_id") or enriched.get("tvdb_id"),
            ),
            source="tmdb",
            raw=cached,
        )

    if provider_name == "simkl":
        response = cached.get("response", cached)
        if response.get("type") == "episode" and "show" in response:
            info = response["show"]
        elif response.get("type") == "movie" and "movie" in response:
            info = response["movie"]
        else:
            return None
        ids = info.get("ids") or {}
        native_tmdb = ids.get("tmdbtv") or ids.get("tmdb") or ids.get("moviedb")
        try:
            tmdb_id = int(native_tmdb) if native_tmdb else None
        except (TypeError, ValueError):
            # A malformed cached ID should not abort the lookup.
            tmdb_id = None
        tmdb_kind = kind
        if not tmdb_id and enriched.get("tmdb_id"):
            tmdb_id = enriched["tmdb_id"]
            tmdb_kind = enriched.get("tmdb_kind") or kind
        return MetadataResult(
            title=info.get("title"),
            year=info.get("year"),
            kind=kind,
            external_ids=ExternalIds(
                imdb_id=ids.get("imdb") or enriched.get("imdb_id"),
                tmdb_id=tmdb_id,
                tmdb_kind=tmdb_kind,
                tvdb_id=ids.get("tvdb") or enriched.get("tvdb_id"),
            ),
            source="simkl",
            raw=cached,
        )

    if provider_name == "imdbapi":
        # The raw IMDBApi payload only supplies the IMDB ID here; TMDB/TVDB
        # IDs come solely from the enriched IDs saved alongside it.
        return MetadataResult(
            title=cached.get("primaryTitle") or cached.get("originalTitle"),
            year=cached.get("startYear"),
            kind=kind,
            external_ids=ExternalIds(
                imdb_id=cached.get("id"),
                tmdb_id=enriched.get("tmdb_id"),
                tmdb_kind=enriched.get("tmdb_kind"),
                tvdb_id=enriched.get("tvdb_id"),
            ),
            source="imdbapi",
            raw=cached,
        )

    return None
|
||||
|
||||
|
||||
# Public names of this package. Underscore-prefixed helpers in this module are
# deliberately left out.
__all__ = [
    "ALL_PROVIDERS",
    "ExternalIds",
    "MetadataProvider",
    "MetadataResult",
    "enrich_ids",
    "fetch_external_ids",
    "fuzzy_match",
    "get_available_providers",
    "get_provider",
    "get_title_by_id",
    "get_year_by_id",
    "search_metadata",
]
|
||||
Reference in New Issue
Block a user