refactor(providers): extract metadata providers into modular system

- Create `unshackle/core/providers/` package with abstract base class, IMDBApi (free, no key), SIMKL, and TMDB provider implementations
- Add consensus-based ID enrichment: cross-references IMDB IDs with TMDB and SIMKL, drops all data from providers that disagree on tmdb_id (likely resolved to wrong title)
- Cache enriched IDs alongside raw provider data so they survive cache round-trips
- Genericize TitleCacher with `cache_provider()`/`get_cached_provider()` replacing provider-specific methods; respect `--no-cache` flag
- Add `--imdb` CLI flag to dl command for direct IMDB ID lookup
This commit is contained in:
Andy
2026-02-25 19:02:18 -07:00
parent 42d6ef5765
commit 820db5f179
10 changed files with 1207 additions and 749 deletions

View File

@@ -0,0 +1,428 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Optional
import requests
from unshackle.core.providers._base import ExternalIds, MetadataProvider, MetadataResult, fuzzy_match, log
from unshackle.core.providers.imdbapi import IMDBApiProvider
from unshackle.core.providers.simkl import SimklProvider
from unshackle.core.providers.tmdb import TMDBProvider
if TYPE_CHECKING:
from unshackle.core.title_cacher import TitleCacher
# Ordered by priority: IMDBApi (free), SIMKL, TMDB
# Registry of every provider class; search and cache lookups iterate in this order.
ALL_PROVIDERS: list[type[MetadataProvider]] = [IMDBApiProvider, SimklProvider, TMDBProvider]
def get_available_providers() -> list[MetadataProvider]:
    """Return instantiated providers that have valid credentials.

    Each provider class is instantiated exactly once (the original code
    called ``cls()`` twice per class: once for the availability check and
    again for the returned instance). Priority order of ``ALL_PROVIDERS``
    is preserved.

    :return: available provider instances, in priority order.
    """
    providers: list[MetadataProvider] = []
    for cls in ALL_PROVIDERS:
        provider = cls()  # instantiate once; reuse for both check and result
        if provider.is_available():
            providers.append(provider)
    return providers
def get_provider(name: str) -> Optional[MetadataProvider]:
    """Get a specific provider by name.

    :param name: the provider's ``NAME`` attribute (e.g. "tmdb").
    :return: an available instance, or None if unknown or unconfigured.
    """
    matching = next((cls for cls in ALL_PROVIDERS if cls.NAME == name), None)
    if matching is None:
        return None
    provider = matching()
    if not provider.is_available():
        return None
    return provider
# -- Public API (replaces tags.py functions) --
def search_metadata(
    title: str,
    year: Optional[int],
    kind: str,
    title_cacher: Optional[TitleCacher] = None,
    cache_title_id: Optional[str] = None,
    cache_region: Optional[str] = None,
    cache_account_hash: Optional[str] = None,
) -> Optional[MetadataResult]:
    """Search all available providers for metadata. Returns best match.

    Providers are consulted in ``ALL_PROVIDERS`` priority order and the
    first result whose title fuzzy-matches ``title`` wins. When a
    ``title_cacher`` plus ``cache_title_id`` are given, cached provider
    payloads are checked first, and fresh results are written back to the
    cache with any cross-referenced IDs embedded so they survive the
    round-trip.

    :param title: title name to search for.
    :param year: optional release year used to narrow provider searches.
    :param kind: "movie" or "tv".
    :param title_cacher: optional cache backend for provider payloads.
    :param cache_title_id: service-level title ID used as the cache key.
    :param cache_region: optional region component of the cache key.
    :param cache_account_hash: optional account component of the cache key.
    :return: the first fuzzy-matching MetadataResult, or None.
    """
    # Check cache first — a cached payload from any provider short-circuits
    # the network search, but only if its title still fuzzy-matches.
    if title_cacher and cache_title_id:
        for cls in ALL_PROVIDERS:
            p = cls()
            if not p.is_available():
                continue
            cached = title_cacher.get_cached_provider(p.NAME, cache_title_id, kind, cache_region, cache_account_hash)
            if cached:
                result = _cached_to_result(cached, p.NAME, kind)
                if result and result.title and fuzzy_match(result.title, title):
                    log.debug("Using cached %s data for %r", p.NAME, title)
                    return result
    # Search providers in priority order
    for cls in ALL_PROVIDERS:
        p = cls()
        if not p.is_available():
            continue
        try:
            result = p.search(title, year, kind)
        except (requests.RequestException, ValueError, KeyError) as exc:
            # A failing provider is skipped, not fatal — fall through to the next.
            log.debug("%s search failed: %s", p.NAME, exc)
            continue
        if result and result.title and fuzzy_match(result.title, title):
            # Enrich with cross-referenced IDs if we have IMDB but missing TMDB/TVDB
            enrich_ids(result)
            # Cache the result (include enriched IDs so they survive round-trip)
            if title_cacher and cache_title_id and result.raw:
                try:
                    cache_data = result.raw
                    if result.external_ids.tmdb_id or result.external_ids.tvdb_id:
                        # Embed the IDs under a private key so _cached_to_result
                        # can restore them on a later cache hit.
                        cache_data = {
                            **result.raw,
                            "_enriched_ids": _external_ids_to_dict(result.external_ids),
                        }
                    title_cacher.cache_provider(
                        p.NAME, cache_title_id, cache_data, kind, cache_region, cache_account_hash
                    )
                except Exception as exc:
                    # Caching is best-effort; never fail the search over it.
                    log.debug("Failed to cache %s data: %s", p.NAME, exc)
            return result
    return None
def get_title_by_id(
    tmdb_id: int,
    kind: str,
    title_cacher: Optional[TitleCacher] = None,
    cache_title_id: Optional[str] = None,
    cache_region: Optional[str] = None,
    cache_account_hash: Optional[str] = None,
) -> Optional[str]:
    """Get title name by TMDB ID.

    Checks the cache first; on a miss, queries TMDB and writes the detail
    payload plus external IDs back to the cache (best-effort).

    :param tmdb_id: TMDB numeric ID.
    :param kind: "movie" or "tv".
    :param title_cacher: optional cache backend for provider payloads.
    :param cache_title_id: service-level title ID used as the cache key.
    :return: the title name, or None if TMDB is unavailable or has no match.
    """
    # Check cache first
    if title_cacher and cache_title_id:
        cached = title_cacher.get_cached_provider("tmdb", cache_title_id, kind, cache_region, cache_account_hash)
        if cached and cached.get("detail"):
            detail = cached["detail"]
            # TMDB movies carry "title"; TV shows carry "name".
            tmdb_title = detail.get("title") or detail.get("name")
            if tmdb_title:
                log.debug("Using cached TMDB title: %r", tmdb_title)
                return tmdb_title
    tmdb = get_provider("tmdb")
    if not tmdb:
        return None
    result = tmdb.get_by_id(tmdb_id, kind)
    if not result:
        return None
    # Cache if possible
    if title_cacher and cache_title_id and result.raw:
        try:
            ext_ids = tmdb.get_external_ids(tmdb_id, kind)
            title_cacher.cache_provider(
                "tmdb",
                cache_title_id,
                {"detail": result.raw, "external_ids": _external_ids_to_dict(ext_ids)},
                kind,
                cache_region,
                cache_account_hash,
            )
        except Exception as exc:
            # Best-effort: a cache failure never breaks the lookup.
            log.debug("Failed to cache TMDB data: %s", exc)
    return result.title
def get_year_by_id(
    tmdb_id: int,
    kind: str,
    title_cacher: Optional[TitleCacher] = None,
    cache_title_id: Optional[str] = None,
    cache_region: Optional[str] = None,
    cache_account_hash: Optional[str] = None,
) -> Optional[int]:
    """Get release year by TMDB ID.

    Checks the cache first; on a miss, queries TMDB and writes the detail
    payload plus external IDs back to the cache (best-effort).

    :param tmdb_id: TMDB numeric ID.
    :param kind: "movie" or "tv".
    :param title_cacher: optional cache backend for provider payloads.
    :param cache_title_id: service-level title ID used as the cache key.
    :return: the release/first-air year, or None if unavailable.
    """
    # Check cache first
    if title_cacher and cache_title_id:
        cached = title_cacher.get_cached_provider("tmdb", cache_title_id, kind, cache_region, cache_account_hash)
        if cached and cached.get("detail"):
            detail = cached["detail"]
            # Movies use "release_date"; TV shows use "first_air_date".
            date = detail.get("release_date") or detail.get("first_air_date")
            # Year is the leading YYYY of the ISO date string.
            if date and len(date) >= 4 and date[:4].isdigit():
                year = int(date[:4])
                log.debug("Using cached TMDB year: %d", year)
                return year
    tmdb = get_provider("tmdb")
    if not tmdb:
        return None
    result = tmdb.get_by_id(tmdb_id, kind)
    if not result:
        return None
    # Cache if possible
    if title_cacher and cache_title_id and result.raw:
        try:
            ext_ids = tmdb.get_external_ids(tmdb_id, kind)
            title_cacher.cache_provider(
                "tmdb",
                cache_title_id,
                {"detail": result.raw, "external_ids": _external_ids_to_dict(ext_ids)},
                kind,
                cache_region,
                cache_account_hash,
            )
        except Exception as exc:
            # Best-effort: a cache failure never breaks the lookup.
            log.debug("Failed to cache TMDB data: %s", exc)
    return result.year
def fetch_external_ids(
    tmdb_id: int,
    kind: str,
    title_cacher: Optional[TitleCacher] = None,
    cache_title_id: Optional[str] = None,
    cache_region: Optional[str] = None,
    cache_account_hash: Optional[str] = None,
) -> ExternalIds:
    """Get external IDs by TMDB ID.

    Checks the cache first; on a miss, queries TMDB. Unlike the other
    lookups this never returns None — an empty ``ExternalIds`` is the
    "unavailable" signal.

    :param tmdb_id: TMDB numeric ID.
    :param kind: "movie" or "tv".
    :param title_cacher: optional cache backend for provider payloads.
    :param cache_title_id: service-level title ID used as the cache key.
    :return: an ExternalIds (possibly empty if TMDB is unavailable).
    """
    # Check cache first
    if title_cacher and cache_title_id:
        cached = title_cacher.get_cached_provider("tmdb", cache_title_id, kind, cache_region, cache_account_hash)
        if cached and cached.get("external_ids"):
            log.debug("Using cached TMDB external IDs")
            raw = cached["external_ids"]
            return ExternalIds(
                imdb_id=raw.get("imdb_id"),
                tmdb_id=tmdb_id,
                tmdb_kind=kind,
                tvdb_id=raw.get("tvdb_id"),
            )
    tmdb = get_provider("tmdb")
    if not tmdb:
        return ExternalIds()
    ext = tmdb.get_external_ids(tmdb_id, kind)
    # Cache if possible
    if title_cacher and cache_title_id:
        try:
            # The cache entry also needs the detail payload, so fetch it
            # before writing; skip caching if the detail lookup fails.
            detail = None
            result = tmdb.get_by_id(tmdb_id, kind)
            if result and result.raw:
                detail = result.raw
            if detail:
                title_cacher.cache_provider(
                    "tmdb",
                    cache_title_id,
                    {"detail": detail, "external_ids": _external_ids_to_dict(ext)},
                    kind,
                    cache_region,
                    cache_account_hash,
                )
        except Exception as exc:
            # Best-effort: a cache failure never breaks the lookup.
            log.debug("Failed to cache TMDB data: %s", exc)
    return ext
# -- Internal helpers --
# Provider authority ranking for tie-breaking (lower index = more authoritative)
_ENRICHMENT_PROVIDERS = ("tmdb", "simkl")
# Maps provider name -> authority rank; consulted by _validate_enrichments
# when enrichment providers disagree on tmdb_id without a majority.
_ENRICHMENT_AUTHORITY: dict[str, int] = {name: i for i, name in enumerate(_ENRICHMENT_PROVIDERS)}
def enrich_ids(result: MetadataResult) -> None:
    """Enrich a MetadataResult by cross-referencing IMDB ID with available providers.

    Queries all available providers, cross-validates tmdb_id as anchor.
    If a provider returns a different tmdb_id than the authoritative source,
    ALL of that provider's data is dropped (likely resolved to wrong title).
    Mutates ``result.external_ids`` in place; never overwrites an ID that
    is already set.
    """
    ids = result.external_ids
    if not ids.imdb_id:
        return  # nothing to cross-reference with
    if ids.tmdb_id and ids.tvdb_id:
        return  # already have everything
    kind = result.kind or "movie"

    # Gather candidate ID sets from each enrichment-capable provider.
    candidates: list[tuple[str, ExternalIds]] = []
    for name in _ENRICHMENT_PROVIDERS:
        provider = get_provider(name)
        if provider is None:
            continue
        try:
            found = provider.find_by_imdb_id(ids.imdb_id, kind)  # type: ignore[union-attr]
        except Exception as exc:
            log.debug("Enrichment via %s failed: %s", name, exc)
            continue
        if found:
            candidates.append((name, found))
    if not candidates:
        return

    # Keep only providers that agree on the tmdb_id anchor, then fill in
    # whichever IDs are still missing (first validated provider wins).
    for _name, ext in _validate_enrichments(candidates):
        if ext.tmdb_id and not ids.tmdb_id:
            ids.tmdb_id = ext.tmdb_id
            ids.tmdb_kind = ext.tmdb_kind or kind
        if ext.tvdb_id and not ids.tvdb_id:
            ids.tvdb_id = ext.tvdb_id
def _validate_enrichments(
enrichments: list[tuple[str, ExternalIds]],
) -> list[tuple[str, ExternalIds]]:
"""Drop providers whose tmdb_id conflicts with the authoritative value.
If providers disagree on tmdb_id, the more authoritative source wins
and ALL data from disagreeing providers is discarded (different tmdb_id
means the provider likely resolved to a different title entirely).
"""
from collections import Counter
# Collect tmdb_id votes
tmdb_votes: dict[str, int] = {}
for provider_name, ext in enrichments:
if ext.tmdb_id is not None:
tmdb_votes[provider_name] = ext.tmdb_id
if len(set(tmdb_votes.values())) <= 1:
return enrichments # all agree or only one voted — no conflict
# Find the authoritative tmdb_id
value_counts = Counter(tmdb_votes.values())
most_common_val, most_common_count = value_counts.most_common(1)[0]
if most_common_count > 1:
anchor_tmdb_id = most_common_val
else:
# No majority — pick the most authoritative provider
best_provider = min(
tmdb_votes.keys(),
key=lambda name: _ENRICHMENT_AUTHORITY.get(name, 99),
)
anchor_tmdb_id = tmdb_votes[best_provider]
# Drop any provider that disagrees
validated: list[tuple[str, ExternalIds]] = []
for provider_name, ext in enrichments:
if ext.tmdb_id is not None and ext.tmdb_id != anchor_tmdb_id:
log.debug(
"Dropping %s enrichment data: tmdb_id %s conflicts with "
"authoritative value %s (likely resolved to wrong title)",
provider_name,
ext.tmdb_id,
anchor_tmdb_id,
)
continue
validated.append((provider_name, ext))
return validated
def _external_ids_to_dict(ext: ExternalIds) -> dict:
"""Convert ExternalIds to a dict for caching."""
result: dict = {}
if ext.imdb_id:
result["imdb_id"] = ext.imdb_id
if ext.tmdb_id:
result["tmdb_id"] = ext.tmdb_id
if ext.tmdb_kind:
result["tmdb_kind"] = ext.tmdb_kind
if ext.tvdb_id:
result["tvdb_id"] = ext.tvdb_id
return result
def _cached_to_result(cached: dict, provider_name: str, kind: str) -> Optional[MetadataResult]:
    """Convert a cached provider dict back to a MetadataResult.

    Each provider caches a differently-shaped payload, so reconstruction
    is provider-specific:

    * ``tmdb``: ``{"detail": {...}, "external_ids": {...}}``
    * ``simkl``: the raw search response (bare, or wrapped under "response")
    * ``imdbapi``: the raw title dict, optionally carrying "_enriched_ids"

    :param cached: the payload previously stored via ``cache_provider``.
    :param provider_name: which provider produced the payload.
    :param kind: "movie" or "tv".
    :return: the reconstructed result, or None for unknown providers or
        unparseable payloads.
    """
    if provider_name == "tmdb":
        detail = cached.get("detail", {})
        ext_raw = cached.get("external_ids", {})
        # Movies use "title"/"release_date"; TV uses "name"/"first_air_date".
        title = detail.get("title") or detail.get("name")
        date = detail.get("release_date") or detail.get("first_air_date")
        # Year is the leading YYYY of the ISO date string.
        year = int(date[:4]) if date and len(date) >= 4 and date[:4].isdigit() else None
        tmdb_id = detail.get("id")
        return MetadataResult(
            title=title,
            year=year,
            kind=kind,
            external_ids=ExternalIds(
                imdb_id=ext_raw.get("imdb_id"),
                tmdb_id=tmdb_id,
                tmdb_kind=kind,
                tvdb_id=ext_raw.get("tvdb_id"),
            ),
            source="tmdb",
            raw=cached,
        )
    elif provider_name == "simkl":
        # The payload may be stored bare or wrapped under "response" — handle both.
        response = cached.get("response", cached)
        if response.get("type") == "episode" and "show" in response:
            info = response["show"]
        elif response.get("type") == "movie" and "movie" in response:
            info = response["movie"]
        else:
            return None
        ids = info.get("ids", {})
        # SIMKL keys the TMDB ID differently per content type; try all known keys.
        tmdb_id = ids.get("tmdbtv") or ids.get("tmdb") or ids.get("moviedb")
        if tmdb_id:
            # IDs may be cached as strings; normalize to int.
            tmdb_id = int(tmdb_id)
        return MetadataResult(
            title=info.get("title"),
            year=info.get("year"),
            kind=kind,
            external_ids=ExternalIds(
                imdb_id=ids.get("imdb"),
                tmdb_id=tmdb_id,
                tmdb_kind=kind,
                tvdb_id=ids.get("tvdb"),
            ),
            source="simkl",
            raw=cached,
        )
    elif provider_name == "imdbapi":
        title = cached.get("primaryTitle") or cached.get("originalTitle")
        year = cached.get("startYear")
        imdb_id = cached.get("id")
        # Restore enriched IDs that were saved alongside the raw data
        # (written by search_metadata under the "_enriched_ids" key).
        enriched = cached.get("_enriched_ids", {})
        return MetadataResult(
            title=title,
            year=year,
            kind=kind,
            external_ids=ExternalIds(
                imdb_id=imdb_id,
                tmdb_id=enriched.get("tmdb_id"),
                tmdb_kind=enriched.get("tmdb_kind"),
                tvdb_id=enriched.get("tvdb_id"),
            ),
            source="imdbapi",
            raw=cached,
        )
    # Unknown provider name — nothing we can reconstruct.
    return None
# Public API of the providers package.
__all__ = [
    "ALL_PROVIDERS",
    "ExternalIds",
    "MetadataProvider",
    "MetadataResult",
    "enrich_ids",
    "fetch_external_ids",
    "fuzzy_match",
    "get_available_providers",
    "get_provider",
    "get_title_by_id",
    "get_year_by_id",
    "search_metadata",
]

View File

@@ -0,0 +1,97 @@
from __future__ import annotations
import logging
import re
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from typing import Optional, Union
import requests
from requests.adapters import HTTPAdapter, Retry
log = logging.getLogger("METADATA")

# Default HTTP headers sent by every provider session.
HEADERS = {"User-Agent": "unshackle-tags/1.0"}
# Strips everything except ASCII letters/digits (used for fuzzy comparison).
STRIP_RE = re.compile(r"[^a-z0-9]+", re.I)
# Matches a trailing 4-digit year, optionally parenthesised, e.g. " (2021)".
YEAR_RE = re.compile(r"\s*\(?[12][0-9]{3}\)?$")
@dataclass
class ExternalIds:
    """Normalized external IDs across providers.

    All fields default to None; an empty instance means "nothing known".
    """

    imdb_id: Optional[str] = None  # e.g. "tt1375666"
    tmdb_id: Optional[int] = None
    tmdb_kind: Optional[str] = None  # "movie" or "tv"
    tvdb_id: Optional[int] = None
@dataclass
class MetadataResult:
    """Unified metadata result from any provider."""

    title: Optional[str] = None
    year: Optional[int] = None
    kind: Optional[str] = None  # "movie" or "tv"
    external_ids: ExternalIds = field(default_factory=ExternalIds)
    source: str = ""  # provider name, e.g. "tmdb", "simkl", "imdbapi"
    raw: Optional[dict] = None  # original API response for caching
class MetadataProvider(metaclass=ABCMeta):
    """Abstract base for metadata providers.

    Subclasses set ``NAME``/``REQUIRES_KEY`` and implement the four
    abstract methods; a retry-enabled requests session is available
    lazily via :attr:`session`.
    """

    NAME: str = ""  # provider identifier, e.g. "tmdb"
    REQUIRES_KEY: bool = True  # whether an API key/credential is needed

    def __init__(self) -> None:
        self.log = logging.getLogger(f"METADATA.{self.NAME.upper()}")
        # Session is created lazily on first access of the property below.
        self._session: Optional[requests.Session] = None

    @property
    def session(self) -> requests.Session:
        """Lazily-built requests session with retries for transient failures."""
        if self._session is None:
            self._session = requests.Session()
            self._session.headers.update(HEADERS)
            # Retry rate limits (429) and 5xx responses with exponential
            # backoff, on both GET and POST requests.
            retry = Retry(
                total=3,
                backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504],
                allowed_methods=["GET", "POST"],
            )
            adapter = HTTPAdapter(max_retries=retry)
            self._session.mount("https://", adapter)
            self._session.mount("http://", adapter)
        return self._session

    @abstractmethod
    def is_available(self) -> bool:
        """Return True if this provider has the credentials/keys it needs."""

    @abstractmethod
    def search(self, title: str, year: Optional[int], kind: str) -> Optional[MetadataResult]:
        """Search for a title and return metadata, or None on failure/no match."""

    @abstractmethod
    def get_by_id(self, provider_id: Union[int, str], kind: str) -> Optional[MetadataResult]:
        """Fetch metadata by this provider's native ID."""

    @abstractmethod
    def get_external_ids(self, provider_id: Union[int, str], kind: str) -> ExternalIds:
        """Fetch external IDs for a title by this provider's native ID."""
def _clean(s: str) -> str:
return STRIP_RE.sub("", s).lower()
def _strip_year(s: str) -> str:
return YEAR_RE.sub("", s).strip()
def fuzzy_match(a: str, b: str, threshold: float = 0.8) -> bool:
    """Return True if ``a`` and ``b`` are a close match.

    Both strings are normalized via ``_clean`` before comparing their
    SequenceMatcher similarity ratio against ``threshold``.
    """
    return SequenceMatcher(None, _clean(a), _clean(b)).ratio() >= threshold

View File

@@ -0,0 +1,123 @@
from __future__ import annotations
from difflib import SequenceMatcher
from typing import Optional, Union
import requests
from unshackle.core.providers._base import ExternalIds, MetadataProvider, MetadataResult, _clean, fuzzy_match
# Mapping from our kind ("movie"/"tv") to imdbapi.dev title types
# (miniseries are matched under the "tv" kind as well).
KIND_TO_TYPES: dict[str, list[str]] = {
    "movie": ["movie"],
    "tv": ["tvSeries", "tvMiniSeries"],
}
class IMDBApiProvider(MetadataProvider):
    """IMDb metadata provider using imdbapi.dev (free, no API key)."""

    NAME = "imdbapi"
    REQUIRES_KEY = False
    BASE_URL = "https://api.imdbapi.dev"

    def is_available(self) -> bool:
        """Always available — imdbapi.dev is keyless."""
        return True  # no key needed

    def search(self, title: str, year: Optional[int], kind: str) -> Optional[MetadataResult]:
        """Search imdbapi.dev for *title* and return the best fuzzy match.

        :param title: title name to search for.
        :param year: optional release year; candidates whose startYear is
            more than 1 year off are skipped.
        :param kind: "movie" or "tv" (mapped to imdbapi title types).
        :return: a MetadataResult carrying only the IMDB ID, or None.
        """
        self.log.debug("Searching IMDBApi for %r (%s, %s)", title, kind, year)
        try:
            params: dict[str, str | int] = {"query": title, "limit": 20}
            r = self.session.get(
                f"{self.BASE_URL}/search/titles",
                params=params,
                timeout=30,
            )
            r.raise_for_status()
            data = r.json()
        except (requests.RequestException, ValueError) as exc:
            self.log.debug("IMDBApi search failed: %s", exc)
            return None
        # NOTE(review): "titles" is treated as the primary payload key with
        # "results" as a fallback — confirm against the current API schema.
        results = data.get("titles") or data.get("results") or []
        if not results:
            self.log.debug("IMDBApi returned no results for %r", title)
            return None
        # Filter by type if possible; fall back to all results if the
        # filter leaves nothing.
        type_filter = KIND_TO_TYPES.get(kind, [])
        filtered = [r for r in results if r.get("type") in type_filter] if type_filter else results
        candidates = filtered if filtered else results
        # Find best fuzzy match, optionally filtered by year
        best_match: Optional[dict] = None
        best_ratio = 0.0
        for candidate in candidates:
            primary = candidate.get("primaryTitle") or ""
            original = candidate.get("originalTitle") or ""
            # Compare against both the primary and the original title.
            for name in [primary, original]:
                if not name:
                    continue
                ratio = SequenceMatcher(None, _clean(title), _clean(name)).ratio()
                if ratio > best_ratio:
                    # If year provided, prefer matches within 1 year
                    candidate_year = candidate.get("startYear")
                    if year and candidate_year and abs(year - candidate_year) > 1:
                        continue
                    best_ratio = ratio
                    best_match = candidate
        if not best_match:
            self.log.debug("No matching result found in IMDBApi for %r", title)
            return None
        result_title = best_match.get("primaryTitle") or best_match.get("originalTitle")
        # Final guard: reject the winner if it still isn't a close title match.
        if not result_title or not fuzzy_match(result_title, title):
            self.log.debug("IMDBApi title mismatch: searched %r, got %r", title, result_title)
            return None
        imdb_id = best_match.get("id")
        result_year = best_match.get("startYear")
        self.log.debug("IMDBApi -> %s (ID %s)", result_title, imdb_id)
        return MetadataResult(
            title=result_title,
            year=result_year,
            kind=kind,
            external_ids=ExternalIds(imdb_id=imdb_id),
            source="imdbapi",
            raw=best_match,
        )

    def get_by_id(self, provider_id: Union[int, str], kind: str) -> Optional[MetadataResult]:
        """Fetch metadata by IMDB ID (e.g. 'tt1375666').

        :return: a MetadataResult built from the title detail, or None on
            request/parse failure.
        """
        imdb_id = str(provider_id)
        self.log.debug("Fetching IMDBApi title %s", imdb_id)
        try:
            r = self.session.get(f"{self.BASE_URL}/titles/{imdb_id}", timeout=30)
            r.raise_for_status()
            data = r.json()
        except (requests.RequestException, ValueError) as exc:
            self.log.debug("IMDBApi get_by_id failed: %s", exc)
            return None
        title = data.get("primaryTitle") or data.get("originalTitle")
        result_year = data.get("startYear")
        return MetadataResult(
            title=title,
            year=result_year,
            kind=kind,
            external_ids=ExternalIds(imdb_id=data.get("id")),
            source="imdbapi",
            raw=data,
        )

    def get_external_ids(self, provider_id: Union[int, str], kind: str) -> ExternalIds:
        """Return external IDs. For IMDB, the provider_id IS the IMDB ID."""
        return ExternalIds(imdb_id=str(provider_id))

View File

@@ -0,0 +1,172 @@
from __future__ import annotations
from typing import Optional, Union
import requests
from unshackle.core.config import config
from unshackle.core.providers._base import ExternalIds, MetadataProvider, MetadataResult, fuzzy_match
class SimklProvider(MetadataProvider):
    """SIMKL metadata provider (filename-based search)."""

    NAME = "simkl"
    REQUIRES_KEY = True
    BASE_URL = "https://api.simkl.com"

    def is_available(self) -> bool:
        """Available only when a SIMKL client ID is configured."""
        return bool(config.simkl_client_id)

    def search(self, title: str, year: Optional[int], kind: str) -> Optional[MetadataResult]:
        """Search SIMKL by synthesizing a release-style filename.

        SIMKL's ``/search/file`` endpoint parses filenames, so a typical
        release name is fabricated (S01E01 for TV, 2160p for movies).

        :return: a parsed MetadataResult, or None on failure/mismatch.
        """
        self.log.debug("Searching Simkl for %r (%s, %s)", title, kind, year)
        # Construct appropriate filename based on type
        filename = f"{title}"
        if year:
            filename = f"{title} {year}"
        if kind == "tv":
            filename += " S01E01.mkv"
        else:
            filename += " 2160p.mkv"
        try:
            headers = {"simkl-api-key": config.simkl_client_id}
            resp = self.session.post(
                f"{self.BASE_URL}/search/file", json={"file": filename}, headers=headers, timeout=30
            )
            resp.raise_for_status()
            data = resp.json()
            self.log.debug("Simkl API response received")
        except (requests.RequestException, ValueError) as exc:
            self.log.debug("Simkl search failed: %s", exc)
            return None
        # Handle case where SIMKL returns empty list (no results)
        if isinstance(data, list):
            self.log.debug("Simkl returned list (no matches) for %r", filename)
            return None
        return self._parse_response(data, title, year, kind)

    def get_by_id(self, provider_id: Union[int, str], kind: str) -> Optional[MetadataResult]:
        # Direct ID lookup is intentionally unsupported for SIMKL.
        return None  # SIMKL has no direct ID lookup used here

    def get_external_ids(self, provider_id: Union[int, str], kind: str) -> ExternalIds:
        # External IDs are only extracted from search()/find_by_imdb_id().
        return ExternalIds()  # IDs come from search() response

    def find_by_imdb_id(self, imdb_id: str, kind: str) -> Optional[ExternalIds]:
        """Look up TMDB/TVDB IDs from an IMDB ID using SIMKL's /search/id and detail endpoints.

        Two-step lookup: ``/search/id`` resolves the IMDB ID to a SIMKL ID,
        then the tv/movies detail endpoint supplies cross-referenced IDs.

        :return: an ExternalIds with whatever SIMKL knows, or None.
        """
        self.log.debug("Looking up IMDB ID %s on SIMKL", imdb_id)
        headers = {"simkl-api-key": config.simkl_client_id}
        try:
            r = self.session.get(f"{self.BASE_URL}/search/id", params={"imdb": imdb_id}, headers=headers, timeout=30)
            r.raise_for_status()
            data = r.json()
        except (requests.RequestException, ValueError) as exc:
            self.log.debug("SIMKL search/id failed: %s", exc)
            return None
        if not isinstance(data, list) or not data:
            self.log.debug("No SIMKL results for IMDB ID %s", imdb_id)
            return None
        # Only the first (best) match is considered.
        entry = data[0]
        simkl_id = entry.get("ids", {}).get("simkl")
        if not simkl_id:
            return None
        # Map SIMKL type to endpoint
        simkl_type = entry.get("type", "")
        endpoint = "tv" if simkl_type in ("tv", "anime") else "movies"
        # Fetch full details to get cross-referenced IDs
        try:
            r2 = self.session.get(
                f"{self.BASE_URL}/{endpoint}/{simkl_id}",
                params={"extended": "full"},
                headers=headers,
                timeout=30,
            )
            r2.raise_for_status()
            detail = r2.json()
        except (requests.RequestException, ValueError) as exc:
            self.log.debug("SIMKL detail fetch failed: %s", exc)
            return None
        ids = detail.get("ids", {})
        # IDs may come back as strings; normalize to int.
        tmdb_id: Optional[int] = None
        raw_tmdb = ids.get("tmdb")
        if raw_tmdb:
            tmdb_id = int(raw_tmdb)
        tvdb_id: Optional[int] = None
        raw_tvdb = ids.get("tvdb")
        if raw_tvdb:
            tvdb_id = int(raw_tvdb)
        self.log.debug("SIMKL find -> TMDB %s, TVDB %s for IMDB %s", tmdb_id, tvdb_id, imdb_id)
        return ExternalIds(
            imdb_id=imdb_id,
            tmdb_id=tmdb_id,
            tmdb_kind=kind,
            tvdb_id=tvdb_id,
        )

    def _parse_response(
        self, data: dict, search_title: str, search_year: Optional[int], kind: str
    ) -> Optional[MetadataResult]:
        """Parse a SIMKL response into a MetadataResult.

        Rejects responses whose title fails the fuzzy match or whose year
        differs from ``search_year`` by more than one.
        """
        # The payload nests the title info under "show" or "movie"
        # depending on the detected content type.
        if data.get("type") == "episode" and "show" in data:
            info = data["show"]
            content_type = "tv"
        elif data.get("type") == "movie" and "movie" in data:
            info = data["movie"]
            content_type = "movie"
        else:
            return None
        result_title = info.get("title")
        result_year = info.get("year")
        # Verify title matches
        if not result_title or not fuzzy_match(result_title, search_title):
            self.log.debug("Simkl title mismatch: searched %r, got %r", search_title, result_title)
            return None
        # Verify year if provided (allow 1 year difference)
        if search_year and result_year and abs(search_year - result_year) > 1:
            self.log.debug("Simkl year mismatch: searched %d, got %d", search_year, result_year)
            return None
        ids = info.get("ids", {})
        tmdb_id: Optional[int] = None
        # TV results key their TMDB ID as "tmdbtv"; movies use "tmdb"/"moviedb".
        if content_type == "tv":
            raw_tmdb = ids.get("tmdbtv")
        else:
            raw_tmdb = ids.get("tmdb") or ids.get("moviedb")
        if raw_tmdb:
            # Normalize possibly-string IDs to int.
            tmdb_id = int(raw_tmdb)
        tvdb_id: Optional[int] = None
        raw_tvdb = ids.get("tvdb")
        if raw_tvdb:
            tvdb_id = int(raw_tvdb)
        self.log.debug("Simkl -> %s (TMDB ID %s)", result_title, tmdb_id)
        return MetadataResult(
            title=result_title,
            year=result_year,
            kind=kind,
            external_ids=ExternalIds(
                imdb_id=ids.get("imdb"),
                tmdb_id=tmdb_id,
                tmdb_kind=kind,
                tvdb_id=tvdb_id,
            ),
            source="simkl",
            raw=data,
        )

View File

@@ -0,0 +1,199 @@
from __future__ import annotations
from difflib import SequenceMatcher
from typing import Optional, Union
import requests
from unshackle.core.config import config
from unshackle.core.providers._base import ExternalIds, MetadataProvider, MetadataResult, _clean, _strip_year
class TMDBProvider(MetadataProvider):
    """TMDB (The Movie Database) metadata provider."""

    NAME = "tmdb"
    REQUIRES_KEY = True
    BASE_URL = "https://api.themoviedb.org/3"

    def is_available(self) -> bool:
        """Available only when a TMDB API key is configured."""
        return bool(config.tmdb_api_key)

    @property
    def _api_key(self) -> str:
        # Read live from config so key changes take effect without restart.
        return config.tmdb_api_key

    def search(self, title: str, year: Optional[int], kind: str) -> Optional[MetadataResult]:
        """Search TMDB for *title* and return the best fuzzy match.

        :param title: title name (any trailing year is stripped first).
        :param year: optional release year to constrain the TMDB search.
        :param kind: "movie" or "tv" (selects the TMDB search endpoint).
        :return: a MetadataResult with full external IDs, or None.
        """
        search_title = _strip_year(title)
        self.log.debug("Searching TMDB for %r (%s, %s)", search_title, kind, year)
        params: dict[str, str | int] = {"api_key": self._api_key, "query": search_title}
        if year is not None:
            # Movies and TV use different year parameter names on TMDB.
            params["year" if kind == "movie" else "first_air_date_year"] = year
        try:
            r = self.session.get(f"{self.BASE_URL}/search/{kind}", params=params, timeout=30)
            r.raise_for_status()
            results = r.json().get("results") or []
            self.log.debug("TMDB returned %d results", len(results))
            if not results:
                return None
        except requests.RequestException as exc:
            self.log.warning("Failed to search TMDB for %s: %s", title, exc)
            return None
        # Pick the candidate with the highest similarity ratio across all
        # localized/original title fields.
        best_ratio = 0.0
        best_id: Optional[int] = None
        best_title: Optional[str] = None
        for result in results:
            candidates = [
                result.get("title"),
                result.get("name"),
                result.get("original_title"),
                result.get("original_name"),
            ]
            candidates = [c for c in candidates if c]
            for candidate in candidates:
                ratio = SequenceMatcher(None, _clean(search_title), _clean(candidate)).ratio()
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_id = result.get("id")
                    best_title = candidate
        self.log.debug("Best candidate ratio %.2f for %r (ID %s)", best_ratio, best_title, best_id)
        if best_id is None:
            # No candidate scored above 0.0 — fall back to TMDB's top result.
            first = results[0]
            best_id = first.get("id")
            best_title = first.get("title") or first.get("name")
        if best_id is None:
            return None
        # Fetch full detail for caching
        detail = self._fetch_detail(best_id, kind)
        ext_raw = self._fetch_external_ids_raw(best_id, kind)
        date = (detail or {}).get("release_date") or (detail or {}).get("first_air_date")
        # Year is the leading YYYY of the ISO date string.
        result_year = int(date[:4]) if date and len(date) >= 4 and date[:4].isdigit() else None
        ext = ExternalIds(
            imdb_id=ext_raw.get("imdb_id") if ext_raw else None,
            tmdb_id=best_id,
            tmdb_kind=kind,
            tvdb_id=ext_raw.get("tvdb_id") if ext_raw else None,
        )
        return MetadataResult(
            title=best_title,
            year=result_year,
            kind=kind,
            external_ids=ext,
            source="tmdb",
            raw={"detail": detail or {}, "external_ids": ext_raw or {}},
        )

    def get_by_id(self, provider_id: Union[int, str], kind: str) -> Optional[MetadataResult]:
        """Fetch metadata by TMDB numeric ID (external IDs not included)."""
        detail = self._fetch_detail(int(provider_id), kind)
        if not detail:
            return None
        title = detail.get("title") or detail.get("name")
        date = detail.get("release_date") or detail.get("first_air_date")
        year = int(date[:4]) if date and len(date) >= 4 and date[:4].isdigit() else None
        return MetadataResult(
            title=title,
            year=year,
            kind=kind,
            external_ids=ExternalIds(tmdb_id=int(provider_id), tmdb_kind=kind),
            source="tmdb",
            raw=detail,
        )

    def get_external_ids(self, provider_id: Union[int, str], kind: str) -> ExternalIds:
        """Fetch IMDB/TVDB IDs for a TMDB title; never returns None."""
        raw = self._fetch_external_ids_raw(int(provider_id), kind)
        if not raw:
            # Request failed: still return the TMDB ID we were given.
            return ExternalIds(tmdb_id=int(provider_id), tmdb_kind=kind)
        return ExternalIds(
            imdb_id=raw.get("imdb_id"),
            tmdb_id=int(provider_id),
            tmdb_kind=kind,
            tvdb_id=raw.get("tvdb_id"),
        )

    def find_by_imdb_id(self, imdb_id: str, kind: str) -> Optional[ExternalIds]:
        """Look up TMDB/TVDB IDs from an IMDB ID using TMDB's /find endpoint.

        Falls back to the other content type if the requested one yields
        no results (``kind`` is then reassigned to the matching type).

        :return: an ExternalIds including a TVDB ID when known, or None.
        """
        self.log.debug("Looking up IMDB ID %s on TMDB", imdb_id)
        try:
            r = self.session.get(
                f"{self.BASE_URL}/find/{imdb_id}",
                params={"api_key": self._api_key, "external_source": "imdb_id"},
                timeout=30,
            )
            r.raise_for_status()
            data = r.json()
        except requests.RequestException as exc:
            self.log.debug("TMDB find by IMDB ID failed: %s", exc)
            return None
        # Check movie_results or tv_results based on kind
        if kind == "movie":
            results = data.get("movie_results") or []
        else:
            results = data.get("tv_results") or []
        if not results:
            # Try the other type as fallback
            fallback_key = "tv_results" if kind == "movie" else "movie_results"
            results = data.get(fallback_key) or []
            if results:
                # Remember which type actually matched for the kind we report.
                kind = "tv" if kind == "movie" else "movie"
        if not results:
            self.log.debug("No TMDB results found for IMDB ID %s", imdb_id)
            return None
        match = results[0]
        tmdb_id = match.get("id")
        if not tmdb_id:
            return None
        self.log.debug("TMDB find -> ID %s (%s) for IMDB %s", tmdb_id, kind, imdb_id)
        # Now fetch the full external IDs from TMDB to get TVDB etc.
        ext_raw = self._fetch_external_ids_raw(tmdb_id, kind)
        return ExternalIds(
            imdb_id=imdb_id,
            tmdb_id=tmdb_id,
            tmdb_kind=kind,
            tvdb_id=ext_raw.get("tvdb_id") if ext_raw else None,
        )

    def _fetch_detail(self, tmdb_id: int, kind: str) -> Optional[dict]:
        """GET /{kind}/{id}; returns the detail payload or None on failure."""
        try:
            r = self.session.get(
                f"{self.BASE_URL}/{kind}/{tmdb_id}",
                params={"api_key": self._api_key},
                timeout=30,
            )
            r.raise_for_status()
            return r.json()
        except requests.RequestException as exc:
            self.log.debug("Failed to fetch TMDB detail: %s", exc)
            return None

    def _fetch_external_ids_raw(self, tmdb_id: int, kind: str) -> Optional[dict]:
        """GET /{kind}/{id}/external_ids; returns the raw payload or None."""
        try:
            r = self.session.get(
                f"{self.BASE_URL}/{kind}/{tmdb_id}/external_ids",
                params={"api_key": self._api_key},
                timeout=30,
            )
            r.raise_for_status()
            return r.json()
        except requests.RequestException as exc:
            self.log.debug("Failed to fetch TMDB external IDs: %s", exc)
            return None