Initial Commit
unshackle/core/downloaders/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
from .aria2c import aria2c
from .curl_impersonate import curl_impersonate
from .n_m3u8dl_re import n_m3u8dl_re
from .requests import requests

__all__ = ("aria2c", "curl_impersonate", "requests", "n_m3u8dl_re")
unshackle/core/downloaders/aria2c.py (new file, 331 lines)
@@ -0,0 +1,331 @@
import os
import subprocess
import textwrap
import time
from functools import partial
from http.cookiejar import CookieJar
from pathlib import Path
from typing import Any, Callable, Generator, MutableMapping, Optional, Union
from urllib.parse import urlparse

import requests
from Crypto.Random import get_random_bytes
from requests import Session
from requests.cookies import cookiejar_from_dict, get_cookie_header
from rich import filesize
from rich.text import Text

from unshackle.core import binaries
from unshackle.core.config import config
from unshackle.core.console import console
from unshackle.core.constants import DOWNLOAD_CANCELLED
from unshackle.core.utilities import get_extension, get_free_port


def rpc(caller: Callable, secret: str, method: str, params: Optional[list[Any]] = None) -> Any:
    """Make a call to Aria2's JSON-RPC API."""
    try:
        rpc_res = caller(
            json={
                "jsonrpc": "2.0",
                "id": get_random_bytes(16).hex(),
                "method": method,
                "params": [f"token:{secret}", *(params or [])],
            }
        ).json()
        if rpc_res.get("code"):
            # wrap to console width - padding - '[Aria2c]: '
            error_pretty = "\n ".join(
                textwrap.wrap(
                    f"RPC Error: {rpc_res['message']} ({rpc_res['code']})".strip(),
                    width=console.width - 20,
                    initial_indent="",
                )
            )
            console.log(Text.from_ansi("\n[Aria2c]: " + error_pretty))
        return rpc_res["result"]
    except requests.exceptions.ConnectionError:
        # absorb, process likely ended as it was calling RPC
        return


def download(
    urls: Union[str, list[str], dict[str, Any], list[dict[str, Any]]],
    output_dir: Path,
    filename: str,
    headers: Optional[MutableMapping[str, Union[str, bytes]]] = None,
    cookies: Optional[Union[MutableMapping[str, str], CookieJar]] = None,
    proxy: Optional[str] = None,
    max_workers: Optional[int] = None,
) -> Generator[dict[str, Any], None, None]:
    if not urls:
        raise ValueError("urls must be provided and not empty")
    elif not isinstance(urls, (str, dict, list)):
        raise TypeError(f"Expected urls to be {str} or {dict} or a list of one of them, not {type(urls)}")

    if not output_dir:
        raise ValueError("output_dir must be provided")
    elif not isinstance(output_dir, Path):
        raise TypeError(f"Expected output_dir to be {Path}, not {type(output_dir)}")

    if not filename:
        raise ValueError("filename must be provided")
    elif not isinstance(filename, str):
        raise TypeError(f"Expected filename to be {str}, not {type(filename)}")

    if not isinstance(headers, (MutableMapping, type(None))):
        raise TypeError(f"Expected headers to be {MutableMapping}, not {type(headers)}")

    if not isinstance(cookies, (MutableMapping, CookieJar, type(None))):
        raise TypeError(f"Expected cookies to be {MutableMapping} or {CookieJar}, not {type(cookies)}")

    if not isinstance(proxy, (str, type(None))):
        raise TypeError(f"Expected proxy to be {str}, not {type(proxy)}")

    if not max_workers:
        max_workers = min(32, (os.cpu_count() or 1) + 4)
    elif not isinstance(max_workers, int):
        raise TypeError(f"Expected max_workers to be {int}, not {type(max_workers)}")

    if not isinstance(urls, list):
        urls = [urls]

    if not binaries.Aria2:
        raise EnvironmentError("Aria2c executable not found...")

    if proxy and not proxy.lower().startswith("http://"):
        raise ValueError("Only HTTP proxies are supported by aria2(c)")

    if cookies and not isinstance(cookies, CookieJar):
        cookies = cookiejar_from_dict(cookies)

    url_files = []
    for i, url in enumerate(urls):
        if isinstance(url, str):
            url_data = {"url": url}
        else:
            url_data: dict[str, Any] = url
        url_filename = filename.format(i=i, ext=get_extension(url_data["url"]))
        url_text = url_data["url"]
        url_text += f"\n\tdir={output_dir}"
        url_text += f"\n\tout={url_filename}"
        if cookies:
            mock_request = requests.Request(url=url_data["url"])
            cookie_header = get_cookie_header(cookies, mock_request)
            if cookie_header:
                url_text += f"\n\theader=Cookie: {cookie_header}"
        for key, value in url_data.items():
            if key == "url":
                continue
            if key == "headers":
                for header_name, header_value in value.items():
                    url_text += f"\n\theader={header_name}: {header_value}"
            else:
                url_text += f"\n\t{key}={value}"
        url_files.append(url_text)
    url_file = "\n".join(url_files)

    rpc_port = get_free_port()
    rpc_secret = get_random_bytes(16).hex()
    rpc_uri = f"http://127.0.0.1:{rpc_port}/jsonrpc"
    rpc_session = Session()

    max_concurrent_downloads = int(config.aria2c.get("max_concurrent_downloads", max_workers))
    max_connection_per_server = int(config.aria2c.get("max_connection_per_server", 1))
    split = int(config.aria2c.get("split", 5))
    file_allocation = config.aria2c.get("file_allocation", "prealloc")
    if len(urls) > 1:
        split = 1
        file_allocation = "none"

    arguments = [
        # [Basic Options]
        "--input-file",
        "-",
        "--all-proxy",
        proxy or "",
        "--continue=true",
        # [Connection Options]
        f"--max-concurrent-downloads={max_concurrent_downloads}",
        f"--max-connection-per-server={max_connection_per_server}",
        f"--split={split}",  # each split uses their own connection
        "--max-file-not-found=5",  # counted towards --max-tries
        "--max-tries=5",
        "--retry-wait=2",
        # [Advanced Options]
        "--allow-overwrite=true",
        "--auto-file-renaming=false",
        "--console-log-level=warn",
        "--download-result=default",
        f"--file-allocation={file_allocation}",
        "--summary-interval=0",
        # [RPC Options]
        "--enable-rpc=true",
        f"--rpc-listen-port={rpc_port}",
        f"--rpc-secret={rpc_secret}",
    ]

    for header, value in (headers or {}).items():
        if header.lower() == "cookie":
            raise ValueError("You cannot set Cookies as a header manually, please use the `cookies` param.")
        if header.lower() == "accept-encoding":
            # we cannot set an allowed encoding, or it will return compressed
            # and the code is not set up to uncompress the data
            continue
        if header.lower() == "referer":
            arguments.extend(["--referer", value])
            continue
        if header.lower() == "user-agent":
            arguments.extend(["--user-agent", value])
            continue
        arguments.extend(["--header", f"{header}: {value}"])

    yield dict(total=len(urls))

    try:
        p = subprocess.Popen([binaries.Aria2, *arguments], stdin=subprocess.PIPE, stdout=subprocess.DEVNULL)

        p.stdin.write(url_file.encode())
        p.stdin.close()

        while p.poll() is None:
            global_stats: dict[str, Any] = (
                rpc(caller=partial(rpc_session.post, url=rpc_uri), secret=rpc_secret, method="aria2.getGlobalStat")
                or {}
            )

            number_stopped = int(global_stats.get("numStoppedTotal", 0))
            download_speed = int(global_stats.get("downloadSpeed", -1))

            if number_stopped:
                yield dict(completed=number_stopped)
            if download_speed != -1:
                yield dict(downloaded=f"{filesize.decimal(download_speed)}/s")

            stopped_downloads: list[dict[str, Any]] = (
                rpc(
                    caller=partial(rpc_session.post, url=rpc_uri),
                    secret=rpc_secret,
                    method="aria2.tellStopped",
                    params=[0, 999999],
                )
                or []
            )

            for dl in stopped_downloads:
                if dl["status"] == "error":
                    used_uri = next(
                        uri["uri"]
                        for file in dl["files"]
                        if file["selected"] == "true"
                        for uri in file["uris"]
                        if uri["status"] == "used"
                    )
                    error = f"Download Error (#{dl['gid']}): {dl['errorMessage']} ({dl['errorCode']}), {used_uri}"
                    error_pretty = "\n ".join(
                        textwrap.wrap(error, width=console.width - 20, initial_indent="")
                    )
                    console.log(Text.from_ansi("\n[Aria2c]: " + error_pretty))
                    raise ValueError(error)

            if number_stopped == len(urls):
                rpc(caller=partial(rpc_session.post, url=rpc_uri), secret=rpc_secret, method="aria2.shutdown")
                break

            time.sleep(1)

        p.wait()

        if p.returncode != 0:
            raise subprocess.CalledProcessError(p.returncode, arguments)
    except ConnectionResetError:
        # interrupted while passing URI to download
        raise KeyboardInterrupt()
    except subprocess.CalledProcessError as e:
        if e.returncode in (7, 0xC000013A):
            # 7 is when Aria2(c) handled the CTRL+C
            # 0xC000013A is when it never got the chance to
            raise KeyboardInterrupt()
        raise
    except KeyboardInterrupt:
        DOWNLOAD_CANCELLED.set()  # skip pending track downloads
        yield dict(downloaded="[yellow]CANCELLED")
        raise
    except Exception:
        DOWNLOAD_CANCELLED.set()  # skip pending track downloads
        yield dict(downloaded="[red]FAILED")
        raise
    finally:
        rpc(caller=partial(rpc_session.post, url=rpc_uri), secret=rpc_secret, method="aria2.shutdown")


def aria2c(
    urls: Union[str, list[str], dict[str, Any], list[dict[str, Any]]],
    output_dir: Path,
    filename: str,
    headers: Optional[MutableMapping[str, Union[str, bytes]]] = None,
    cookies: Optional[Union[MutableMapping[str, str], CookieJar]] = None,
    proxy: Optional[str] = None,
    max_workers: Optional[int] = None,
) -> Generator[dict[str, Any], None, None]:
    """
    Download files using Aria2(c).
    https://aria2.github.io

    Yields the following download status updates while chunks are downloading:

    - {total: 100} (100% download total)
    - {completed: 1} (1% download progress out of 100%)
    - {downloaded: "10.1 MB/s"} (currently downloading at a rate of 10.1 MB/s)

    The data is in the same format accepted by rich's progress.update() function.

    Parameters:
        urls: Web URL(s) to file(s) to download. You can use a dictionary with the key
            "url" for the URI, and other keys for extra arguments to use per-URL.
        output_dir: The folder to save the file into. If the save path's directory does
            not exist then it will be made automatically.
        filename: The filename or filename template to use for each file. The variables
            you can use are `i` for the URL index and `ext` for the URL extension.
        headers: A mapping of HTTP Header Key/Values to use for all downloads.
        cookies: A mapping of Cookie Key/Values or a Cookie Jar to use for all downloads.
        proxy: An optional proxy URI to route connections through for all downloads.
        max_workers: The maximum amount of threads to use for downloads. Defaults to
            min(32,(cpu_count+4)). Used for the --max-concurrent-downloads option.
    """
    if proxy and not proxy.lower().startswith("http://"):
        # Only HTTP proxies are supported by aria2(c)
        proxy = urlparse(proxy)

        port = get_free_port()
        username, password = get_random_bytes(8).hex(), get_random_bytes(8).hex()
        local_proxy = f"http://{username}:{password}@localhost:{port}"

        scheme = {"https": "http+ssl", "socks5h": "socks"}.get(proxy.scheme, proxy.scheme)

        remote_server = f"{scheme}://{proxy.hostname}"
        if proxy.port:
            remote_server += f":{proxy.port}"
        if proxy.username or proxy.password:
            remote_server += "#"
        if proxy.username:
            remote_server += proxy.username
        if proxy.password:
            remote_server += f":{proxy.password}"

        p = subprocess.Popen(
            ["pproxy", "-l", f"http://:{port}#{username}:{password}", "-r", remote_server],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        try:
            yield from download(urls, output_dir, filename, headers, cookies, local_proxy, max_workers)
        finally:
            p.kill()
            p.wait()
        return

    yield from download(urls, output_dir, filename, headers, cookies, proxy, max_workers)


__all__ = ("aria2c",)
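Illustrative usage sketch (not part of the committed file): the status dictionaries yielded by aria2c() map directly onto rich's progress.update() keyword arguments, with "downloaded" as a custom speed-text key. The URL, output directory, and filename template below are placeholder assumptions.

    from pathlib import Path

    from rich.progress import Progress

    from unshackle.core.downloaders import aria2c

    # hypothetical inputs for illustration only
    urls = ["https://example.com/video.mp4"]
    output_dir = Path("downloads")

    with Progress() as progress:
        task = progress.add_task("Downloading", total=None)
        # the `{ext}` template variable is also available, per the docstring
        for update in aria2c(urls, output_dir, filename="file_{i}.mp4"):
            # "downloaded" carries speed text; the remaining keys are
            # rich-compatible (total, completed)
            speed = update.pop("downloaded", None)
            if speed:
                progress.console.log(speed)
            if update:
                progress.update(task, **update)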
unshackle/core/downloaders/curl_impersonate.py (new file, 259 lines)
@@ -0,0 +1,259 @@
import math
import time
from concurrent import futures
from concurrent.futures.thread import ThreadPoolExecutor
from http.cookiejar import CookieJar
from pathlib import Path
from typing import Any, Generator, MutableMapping, Optional, Union

from curl_cffi.requests import Session
from rich import filesize

from unshackle.core.config import config
from unshackle.core.constants import DOWNLOAD_CANCELLED
from unshackle.core.utilities import get_extension

MAX_ATTEMPTS = 5
RETRY_WAIT = 2
CHUNK_SIZE = 1024
PROGRESS_WINDOW = 5
BROWSER = config.curl_impersonate.get("browser", "chrome124")


def download(url: str, save_path: Path, session: Session, **kwargs: Any) -> Generator[dict[str, Any], None, None]:
    """
    Download files using Curl Impersonate.
    https://github.com/lwthiker/curl-impersonate

    Yields the following download status updates while chunks are downloading:

    - {total: 123} (there are 123 chunks to download)
    - {total: None} (there are an unknown number of chunks to download)
    - {advance: 1} (one chunk was downloaded)
    - {downloaded: "10.1 MB/s"} (currently downloading at a rate of 10.1 MB/s)
    - {file_downloaded: Path(...), written: 1024} (download finished, has the save path and size)

    The data is in the same format accepted by rich's progress.update() function. The
    `downloaded` key is custom and is not natively accepted by all rich progress bars.

    Parameters:
        url: Web URL of a file to download.
        save_path: The path to save the file to. If the save path's directory does not
            exist then it will be made automatically.
        session: The Requests or Curl-Impersonate Session to make HTTP requests with.
            Useful to set Header, Cookie, and Proxy data. Connections are saved and
            re-used with the session so long as the server keeps the connection alive.
        kwargs: Any extra keyword arguments to pass to the session.get() call. Use this
            for one-time request changes like a header, cookie, or proxy. For example,
            to request Byte-ranges use e.g., `headers={"Range": "bytes=0-128"}`.
    """
    save_dir = save_path.parent
    control_file = save_path.with_name(f"{save_path.name}.!dev")

    save_dir.mkdir(parents=True, exist_ok=True)

    if control_file.exists():
        # consider the file corrupt if the control file exists
        save_path.unlink(missing_ok=True)
        control_file.unlink()
    elif save_path.exists():
        # if it exists, and no control file, then it should be safe
        yield dict(file_downloaded=save_path, written=save_path.stat().st_size)

    # TODO: Design a control file format so we know how much of the file is missing
    control_file.write_bytes(b"")

    attempts = 1
    try:
        while True:
            written = 0
            download_sizes = []
            last_speed_refresh = time.time()

            try:
                stream = session.get(url, stream=True, **kwargs)
                stream.raise_for_status()

                try:
                    content_length = int(stream.headers.get("Content-Length", "0"))
                except ValueError:
                    content_length = 0

                if content_length > 0:
                    yield dict(total=math.ceil(content_length / CHUNK_SIZE))
                else:
                    # we have no data to calculate total chunks
                    yield dict(total=None)  # indeterminate mode

                with open(save_path, "wb") as f:
                    for chunk in stream.iter_content(chunk_size=CHUNK_SIZE):
                        download_size = len(chunk)
                        f.write(chunk)
                        written += download_size

                        yield dict(advance=1)

                        now = time.time()
                        time_since = now - last_speed_refresh

                        download_sizes.append(download_size)
                        if time_since > PROGRESS_WINDOW or download_size < CHUNK_SIZE:
                            data_size = sum(download_sizes)
                            download_speed = math.ceil(data_size / (time_since or 1))
                            yield dict(downloaded=f"{filesize.decimal(download_speed)}/s")
                            last_speed_refresh = now
                            download_sizes.clear()

                if content_length and written < content_length:
                    raise IOError(f"Failed to read {content_length} bytes from the track URI.")

                yield dict(file_downloaded=save_path, written=written)
                break
            except Exception as e:
                save_path.unlink(missing_ok=True)
                if DOWNLOAD_CANCELLED.is_set() or attempts == MAX_ATTEMPTS:
                    raise e
                time.sleep(RETRY_WAIT)
                attempts += 1
    finally:
        control_file.unlink()


def curl_impersonate(
    urls: Union[str, list[str], dict[str, Any], list[dict[str, Any]]],
    output_dir: Path,
    filename: str,
    headers: Optional[MutableMapping[str, Union[str, bytes]]] = None,
    cookies: Optional[Union[MutableMapping[str, str], CookieJar]] = None,
    proxy: Optional[str] = None,
    max_workers: Optional[int] = None,
) -> Generator[dict[str, Any], None, None]:
    """
    Download files using Curl Impersonate.
    https://github.com/lwthiker/curl-impersonate

    Yields the following download status updates while chunks are downloading:

    - {total: 123} (there are 123 chunks to download)
    - {total: None} (there are an unknown number of chunks to download)
    - {advance: 1} (one chunk was downloaded)
    - {downloaded: "10.1 MB/s"} (currently downloading at a rate of 10.1 MB/s)
    - {file_downloaded: Path(...), written: 1024} (download finished, has the save path and size)

    The data is in the same format accepted by rich's progress.update() function.
    However, the `downloaded`, `file_downloaded` and `written` keys are custom and not
    natively accepted by rich progress bars.

    Parameters:
        urls: Web URL(s) to file(s) to download. You can use a dictionary with the key
            "url" for the URI, and other keys for extra arguments to use per-URL.
        output_dir: The folder to save the file into. If the save path's directory does
            not exist then it will be made automatically.
        filename: The filename or filename template to use for each file. The variables
            you can use are `i` for the URL index and `ext` for the URL extension.
        headers: A mapping of HTTP Header Key/Values to use for all downloads.
        cookies: A mapping of Cookie Key/Values or a Cookie Jar to use for all downloads.
        proxy: An optional proxy URI to route connections through for all downloads.
        max_workers: The maximum amount of threads to use for downloads. Defaults to
            min(32,(cpu_count+4)).
    """
    if not urls:
        raise ValueError("urls must be provided and not empty")
    elif not isinstance(urls, (str, dict, list)):
        raise TypeError(f"Expected urls to be {str} or {dict} or a list of one of them, not {type(urls)}")

    if not output_dir:
        raise ValueError("output_dir must be provided")
    elif not isinstance(output_dir, Path):
        raise TypeError(f"Expected output_dir to be {Path}, not {type(output_dir)}")

    if not filename:
        raise ValueError("filename must be provided")
    elif not isinstance(filename, str):
        raise TypeError(f"Expected filename to be {str}, not {type(filename)}")

    if not isinstance(headers, (MutableMapping, type(None))):
        raise TypeError(f"Expected headers to be {MutableMapping}, not {type(headers)}")

    if not isinstance(cookies, (MutableMapping, CookieJar, type(None))):
        raise TypeError(f"Expected cookies to be {MutableMapping} or {CookieJar}, not {type(cookies)}")

    if not isinstance(proxy, (str, type(None))):
        raise TypeError(f"Expected proxy to be {str}, not {type(proxy)}")

    if not isinstance(max_workers, (int, type(None))):
        raise TypeError(f"Expected max_workers to be {int}, not {type(max_workers)}")

    if not isinstance(urls, list):
        urls = [urls]

    urls = [
        dict(save_path=save_path, **url) if isinstance(url, dict) else dict(url=url, save_path=save_path)
        for i, url in enumerate(urls)
        for save_path in [
            output_dir / filename.format(i=i, ext=get_extension(url["url"] if isinstance(url, dict) else url))
        ]
    ]

    session = Session(impersonate=BROWSER)
    if headers:
        headers = {k: v for k, v in headers.items() if k.lower() != "accept-encoding"}
        session.headers.update(headers)
    if cookies:
        session.cookies.update(cookies)
    if proxy:
        session.proxies.update({"all": proxy})

    yield dict(total=len(urls))

    download_sizes = []
    last_speed_refresh = time.time()

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for i, future in enumerate(
            futures.as_completed((pool.submit(download, session=session, **url) for url in urls))
        ):
            file_path, download_size = None, None
            try:
                for status_update in future.result():
                    if status_update.get("file_downloaded") and status_update.get("written"):
                        file_path = status_update["file_downloaded"]
                        download_size = status_update["written"]
                    elif len(urls) == 1:
                        # these are per-chunk updates, only useful if it's one big file
                        yield status_update
            except KeyboardInterrupt:
                DOWNLOAD_CANCELLED.set()  # skip pending track downloads
                yield dict(downloaded="[yellow]CANCELLING")
                pool.shutdown(wait=True, cancel_futures=True)
                yield dict(downloaded="[yellow]CANCELLED")
                # tell dl that it was cancelled
                # the pool is already shut down, so exiting loop is fine
                raise
            except Exception:
                DOWNLOAD_CANCELLED.set()  # skip pending track downloads
                yield dict(downloaded="[red]FAILING")
                pool.shutdown(wait=True, cancel_futures=True)
                yield dict(downloaded="[red]FAILED")
                # tell dl that it failed
                # the pool is already shut down, so exiting loop is fine
                raise
            else:
                yield dict(file_downloaded=file_path)
                yield dict(advance=1)

                now = time.time()
                time_since = now - last_speed_refresh

                if download_size:  # no size == skipped dl
                    download_sizes.append(download_size)

                if download_sizes and (time_since > PROGRESS_WINDOW or i == len(urls)):
                    data_size = sum(download_sizes)
                    download_speed = math.ceil(data_size / (time_since or 1))
                    yield dict(downloaded=f"{filesize.decimal(download_speed)}/s")
                    last_speed_refresh = now
                    download_sizes.clear()


__all__ = ("curl_impersonate",)
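Illustrative usage sketch (not part of the committed file): the per-URL dictionary form documented above forwards any key other than "url" to the worker's session.get() call, so a one-off Range header can be attached to a single URL. The URLs, directory, and filename template are placeholder assumptions.

    from pathlib import Path

    from unshackle.core.downloaders import curl_impersonate

    # placeholder inputs; the second entry attaches a one-off Range header
    urls = [
        "https://example.com/segment-1.mp4",
        {"url": "https://example.com/segment-2.mp4", "headers": {"Range": "bytes=0-1023"}},
    ]

    for update in curl_impersonate(urls, Path("downloads"), filename="segment_{i}.mp4"):
        # completed files are reported with the "file_downloaded" key
        if "file_downloaded" in update:
            print("finished:", update["file_downloaded"])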
unshackle/core/downloaders/n_m3u8dl_re.py (new file, 299 lines)
@@ -0,0 +1,299 @@
import logging
import os
import re
import subprocess
import warnings
from http.cookiejar import CookieJar
from itertools import chain
from pathlib import Path
from typing import Any, Generator, MutableMapping, Optional, Union

import requests
from requests.cookies import cookiejar_from_dict, get_cookie_header

from unshackle.core import binaries
from unshackle.core.config import config
from unshackle.core.console import console
from unshackle.core.constants import DOWNLOAD_CANCELLED

# Ignore FutureWarnings
warnings.simplefilter(action="ignore", category=FutureWarning)

AUDIO_CODEC_MAP = {"AAC": "mp4a", "AC3": "ac-3", "EC3": "ec-3"}
VIDEO_CODEC_MAP = {"AVC": "avc", "HEVC": "hvc", "DV": "dvh", "HLG": "hev"}


def track_selection(track: object) -> list[str]:
    """Return the N_m3u8DL-RE stream selection arguments for a track."""

    if "dash" in track.data:
        adaptation_set = track.data["dash"]["adaptation_set"]
        representation = track.data["dash"]["representation"]

        track_type = track.__class__.__name__
        codec = track.codec.name
        bitrate = track.bitrate // 1000
        language = track.language
        width = track.width if track_type == "Video" else None
        height = track.height if track_type == "Video" else None
        range = track.range.name if track_type == "Video" else None

    elif "ism" in track.data:
        stream_index = track.data["ism"]["stream_index"]
        quality_level = track.data["ism"]["quality_level"]

        track_type = track.__class__.__name__
        codec = track.codec.name
        bitrate = track.bitrate // 1000
        language = track.language
        width = track.width if track_type == "Video" else None
        height = track.height if track_type == "Video" else None
        range = track.range.name if track_type == "Video" else None
        adaptation_set = stream_index
        representation = quality_level

    else:
        return []

    if track_type == "Audio":
        codecs = AUDIO_CODEC_MAP.get(codec)
        langs = adaptation_set.findall("lang") + representation.findall("lang")
        track_ids = list(
            set(
                v
                for x in chain(adaptation_set, representation)
                for v in (x.get("audioTrackId"), x.get("id"))
                if v is not None
            )
        )
        roles = adaptation_set.findall("Role") + representation.findall("Role")
        role = ":role=main" if next((i for i in roles if i.get("value").lower() == "main"), None) else ""
        bandwidth = f"bwMin={bitrate}:bwMax={bitrate + 5}"

        if langs:
            track_selection = ["-sa", f"lang={language}:codecs={codecs}:{bandwidth}{role}"]
        elif len(track_ids) == 1:
            track_selection = ["-sa", f"id={track_ids[0]}"]
        else:
            track_selection = ["-sa", f"for=best{role}"]
        return track_selection

    if track_type == "Video":
        # adjust codec based on range
        codec_adjustments = {("HEVC", "DV"): "DV", ("HEVC", "HLG"): "HLG"}
        codec = codec_adjustments.get((codec, range), codec)
        codecs = VIDEO_CODEC_MAP.get(codec)

        bandwidth = f"bwMin={bitrate}:bwMax={bitrate + 5}"
        if width and height:
            resolution = f"{width}x{height}"
        elif width:
            resolution = f"{width}*"
        else:
            resolution = "for=best"
        if resolution.startswith("for="):
            track_selection = ["-sv", resolution]
            track_selection.append(f"codecs={codecs}:{bandwidth}")
        else:
            track_selection = ["-sv", f"res={resolution}:codecs={codecs}:{bandwidth}"]
        return track_selection

    # other track types have no explicit selection arguments
    return []


def download(
    urls: Union[str, dict[str, Any], list[str], list[dict[str, Any]]],
    track: object,
    output_dir: Path,
    filename: str,
    headers: Optional[MutableMapping[str, Union[str, bytes]]] = None,
    cookies: Optional[Union[MutableMapping[str, str], CookieJar]] = None,
    proxy: Optional[str] = None,
    max_workers: Optional[int] = None,
    content_keys: Optional[dict[str, Any]] = None,
) -> Generator[dict[str, Any], None, None]:
    if not urls:
        raise ValueError("urls must be provided and not empty")
    elif not isinstance(urls, (str, dict, list)):
        raise TypeError(f"Expected urls to be {str} or {dict} or a list of one of them, not {type(urls)}")

    if not output_dir:
        raise ValueError("output_dir must be provided")
    elif not isinstance(output_dir, Path):
        raise TypeError(f"Expected output_dir to be {Path}, not {type(output_dir)}")

    if not filename:
        raise ValueError("filename must be provided")
    elif not isinstance(filename, str):
        raise TypeError(f"Expected filename to be {str}, not {type(filename)}")

    if not isinstance(headers, (MutableMapping, type(None))):
        raise TypeError(f"Expected headers to be {MutableMapping}, not {type(headers)}")

    if not isinstance(cookies, (MutableMapping, CookieJar, type(None))):
        raise TypeError(f"Expected cookies to be {MutableMapping} or {CookieJar}, not {type(cookies)}")

    if not isinstance(proxy, (str, type(None))):
        raise TypeError(f"Expected proxy to be {str}, not {type(proxy)}")

    if not max_workers:
        max_workers = min(32, (os.cpu_count() or 1) + 4)
    elif not isinstance(max_workers, int):
        raise TypeError(f"Expected max_workers to be {int}, not {type(max_workers)}")

    if not isinstance(urls, list):
        urls = [urls]

    if not binaries.N_m3u8DL_RE:
        raise EnvironmentError("N_m3u8DL-RE executable not found...")

    if cookies and not isinstance(cookies, CookieJar):
        cookies = cookiejar_from_dict(cookies)

    track_type = track.__class__.__name__
    thread_count = str(config.n_m3u8dl_re.get("thread_count", max_workers))
    ad_keyword = config.n_m3u8dl_re.get("ad_keyword")

    arguments = [
        track.url,
        "--save-dir",
        output_dir,
        "--tmp-dir",
        output_dir,
        "--thread-count",
        thread_count,
        "--no-log",
        "--write-meta-json",
        "false",
    ]

    for header, value in (headers or {}).items():
        if header.lower() in ("accept-encoding", "cookie"):
            continue
        arguments.extend(["--header", f"{header}: {value}"])

    if cookies:
        cookie_header = get_cookie_header(cookies, requests.Request(url=track.url))
        if cookie_header:
            arguments.extend(["--header", f"Cookie: {cookie_header}"])

    if proxy:
        arguments.extend(["--custom-proxy", proxy])

    if content_keys:
        for kid, key in content_keys.items():
            keys = f"{kid.hex}:{key.lower()}"
            arguments.extend(["--key", keys])
        arguments.extend(["--use-shaka-packager"])

    if ad_keyword:
        arguments.extend(["--ad-keyword", ad_keyword])

    if track.descriptor.name == "URL":
        error = f"[N_m3u8DL-RE]: {track.descriptor} is currently not supported"
        raise ValueError(error)
    elif track.descriptor.name == "DASH":
        arguments.extend(track_selection(track))

    # TODO: improve this nonsense
    percent_re = re.compile(r"(\d+\.\d+%)")
    speed_re = re.compile(r"(?<!/)(\d+\.\d+MB)(?!.*\/)")
    warn = re.compile(r"(WARN : Response.*)")
    error = re.compile(r"(ERROR.*)")
    size_patterns = [
        re.compile(r"(\d+\.\d+MB/\d+\.\d+GB)"),
        re.compile(r"(\d+\.\d+GB/\d+\.\d+GB)"),
        re.compile(r"(\d+\.\d+MB/\d+\.\d+MB)"),
    ]

    yield dict(total=100)

    try:
        with subprocess.Popen(
            [binaries.N_m3u8DL_RE, *arguments], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
        ) as p:
            for line in p.stdout:
                output = line.strip()
                if output:
                    percent = percent_re.search(output)
                    speed = speed_re.search(output)
                    size = next(
                        (pattern.search(output).group(1) for pattern in size_patterns if pattern.search(output)), ""
                    )

                    if speed:
                        yield dict(downloaded=f"{speed.group(1)}ps {size}")
                    if percent:
                        progress = int(percent.group(1).split(".")[0])
                        yield dict(completed=progress) if progress < 100 else dict(downloaded="Merging")

                    if warn.search(output):
                        console.log(f"{track_type} " + warn.search(output).group(1))

            p.wait()

            if p.returncode != 0:
                if error.search(output):
                    raise ValueError(f"[N_m3u8DL-RE]: {error.search(output).group(1)}")
                raise subprocess.CalledProcessError(p.returncode, arguments)

    except ConnectionResetError:
        # interrupted while passing URI to download
        raise KeyboardInterrupt()
    except KeyboardInterrupt:
        DOWNLOAD_CANCELLED.set()  # skip pending track downloads
        yield dict(downloaded="[yellow]CANCELLED")
        raise
    except Exception:
        DOWNLOAD_CANCELLED.set()  # skip pending track downloads
        yield dict(downloaded="[red]FAILED")
        raise


def n_m3u8dl_re(
    urls: Union[str, list[str], dict[str, Any], list[dict[str, Any]]],
    track: object,
    output_dir: Path,
    filename: str,
    headers: Optional[MutableMapping[str, Union[str, bytes]]] = None,
    cookies: Optional[Union[MutableMapping[str, str], CookieJar]] = None,
    proxy: Optional[str] = None,
    max_workers: Optional[int] = None,
    content_keys: Optional[dict[str, Any]] = None,
) -> Generator[dict[str, Any], None, None]:
    """
    Download files using N_m3u8DL-RE.
    https://github.com/nilaoda/N_m3u8DL-RE

    Yields the following download status updates while chunks are downloading:

    - {total: 100} (100% download total)
    - {completed: 1} (1% download progress out of 100%)
    - {downloaded: "10.1 MB/s"} (currently downloading at a rate of 10.1 MB/s)

    The data is in the same format accepted by rich's progress.update() function.

    Parameters:
        urls: Web URL(s) to file(s) to download. You can use a dictionary with the key
            "url" for the URI, and other keys for extra arguments to use per-URL.
        track: The track to download. Used to get track attributes for the selection
            process. Note that Track.Descriptor.URL is not supported by N_m3u8DL-RE.
        output_dir: The folder to save the file into. If the save path's directory does
            not exist then it will be made automatically.
        filename: The filename or filename template to use for each file. The variables
            you can use are `i` for the URL index and `ext` for the URL extension.
        headers: A mapping of HTTP Header Key/Values to use for the download.
        cookies: A mapping of Cookie Key/Values or a Cookie Jar to use for the download.
        proxy: An optional proxy URI to route connections through for the download.
        max_workers: The maximum amount of threads to use for downloads. Defaults to
            min(32,(cpu_count+4)). Can be set in the config via the thread_count option.
        content_keys: The content keys to use for decryption.
    """
    track_type = track.__class__.__name__

    log = logging.getLogger("N_m3u8DL-RE")
    if proxy and not config.n_m3u8dl_re.get("use_proxy", True):
        log.warning(f"{track_type}: Ignoring proxy as N_m3u8DL-RE is set to use_proxy=False")
        proxy = None

    yield from download(urls, track, output_dir, filename, headers, cookies, proxy, max_workers, content_keys)


__all__ = ("n_m3u8dl_re",)
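Illustrative sketch (not part of the committed file) of how the content_keys mapping is turned into decryption arguments: each pair is rendered as "KID:key" via the kid's .hex attribute, so a uuid.UUID KID is assumed here; the KID/key values are placeholders and the loop simply mirrors the one in download() above.

    from uuid import UUID

    # placeholder KID/key pair for illustration only
    content_keys = {
        UUID("00000000-0000-0000-0000-000000000000"): "0123456789ABCDEF0123456789ABCDEF",
    }

    arguments = []
    for kid, key in content_keys.items():
        # same formatting as download(): "<kid hex>:<lower-case key>"
        arguments.extend(["--key", f"{kid.hex}:{key.lower()}"])
    arguments.append("--use-shaka-packager")
    print(arguments)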
unshackle/core/downloaders/requests.py (new file, 266 lines)
@@ -0,0 +1,266 @@
import math
import os
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
from http.cookiejar import CookieJar
from pathlib import Path
from typing import Any, Generator, MutableMapping, Optional, Union

from requests import Session
from requests.adapters import HTTPAdapter
from rich import filesize

from unshackle.core.constants import DOWNLOAD_CANCELLED
from unshackle.core.utilities import get_extension

MAX_ATTEMPTS = 5
RETRY_WAIT = 2
CHUNK_SIZE = 1024
PROGRESS_WINDOW = 5

DOWNLOAD_SIZES = []
LAST_SPEED_REFRESH = time.time()


def download(
    url: str, save_path: Path, session: Optional[Session] = None, segmented: bool = False, **kwargs: Any
) -> Generator[dict[str, Any], None, None]:
    """
    Download a file using Python Requests.
    https://requests.readthedocs.io

    Yields the following download status updates while chunks are downloading:

    - {total: 123} (there are 123 chunks to download)
    - {total: None} (there are an unknown number of chunks to download)
    - {advance: 1} (one chunk was downloaded)
    - {downloaded: "10.1 MB/s"} (currently downloading at a rate of 10.1 MB/s)
    - {file_downloaded: Path(...), written: 1024} (download finished, has the save path and size)

    The data is in the same format accepted by rich's progress.update() function. The
    `downloaded` key is custom and is not natively accepted by all rich progress bars.

    Parameters:
        url: Web URL of a file to download.
        save_path: The path to save the file to. If the save path's directory does not
            exist then it will be made automatically.
        session: The Requests Session to make HTTP requests with. Useful to set Header,
            Cookie, and Proxy data. Connections are saved and re-used with the session
            so long as the server keeps the connection alive.
        segmented: If downloads are segments or parts of one bigger file.
        kwargs: Any extra keyword arguments to pass to the session.get() call. Use this
            for one-time request changes like a header, cookie, or proxy. For example,
            to request Byte-ranges use e.g., `headers={"Range": "bytes=0-128"}`.
    """
    global LAST_SPEED_REFRESH

    session = session or Session()

    save_dir = save_path.parent
    control_file = save_path.with_name(f"{save_path.name}.!dev")

    save_dir.mkdir(parents=True, exist_ok=True)

    if control_file.exists():
        # consider the file corrupt if the control file exists
        save_path.unlink(missing_ok=True)
        control_file.unlink()
    elif save_path.exists():
        # if it exists, and no control file, then it should be safe
        yield dict(file_downloaded=save_path, written=save_path.stat().st_size)
        # TODO: This should return, potential recovery bug

    # TODO: Design a control file format so we know how much of the file is missing
    control_file.write_bytes(b"")

    attempts = 1
    try:
        while True:
            written = 0

            # these are for single-url speed calcs only
            download_sizes = []
            last_speed_refresh = time.time()

            try:
                stream = session.get(url, stream=True, **kwargs)
                stream.raise_for_status()

                # parse Content-Length up front so the size check below also
                # works when downloading segments
                try:
                    content_length = int(stream.headers.get("Content-Length", "0"))
                except ValueError:
                    content_length = 0

                if not segmented:
                    if content_length > 0:
                        yield dict(total=math.ceil(content_length / CHUNK_SIZE))
                    else:
                        # we have no data to calculate total chunks
                        yield dict(total=None)  # indeterminate mode

                with open(save_path, "wb") as f:
                    for chunk in stream.iter_content(chunk_size=CHUNK_SIZE):
                        download_size = len(chunk)
                        f.write(chunk)
                        written += download_size

                        if not segmented:
                            yield dict(advance=1)
                            now = time.time()
                            time_since = now - last_speed_refresh
                            download_sizes.append(download_size)
                            if time_since > PROGRESS_WINDOW or download_size < CHUNK_SIZE:
                                data_size = sum(download_sizes)
                                download_speed = math.ceil(data_size / (time_since or 1))
                                yield dict(downloaded=f"{filesize.decimal(download_speed)}/s")
                                last_speed_refresh = now
                                download_sizes.clear()

                if content_length and written < content_length:
                    raise IOError(f"Failed to read {content_length} bytes from the track URI.")

                yield dict(file_downloaded=save_path, written=written)

                if segmented:
                    yield dict(advance=1)
                    now = time.time()
                    time_since = now - LAST_SPEED_REFRESH
                    if written:  # no size == skipped dl
                        DOWNLOAD_SIZES.append(written)
                    if DOWNLOAD_SIZES and time_since > PROGRESS_WINDOW:
                        data_size = sum(DOWNLOAD_SIZES)
                        download_speed = math.ceil(data_size / (time_since or 1))
                        yield dict(downloaded=f"{filesize.decimal(download_speed)}/s")
                        LAST_SPEED_REFRESH = now
                        DOWNLOAD_SIZES.clear()
                break
            except Exception as e:
                save_path.unlink(missing_ok=True)
                if DOWNLOAD_CANCELLED.is_set() or attempts == MAX_ATTEMPTS:
                    raise e
                time.sleep(RETRY_WAIT)
                attempts += 1
    finally:
        control_file.unlink()


def requests(
    urls: Union[str, list[str], dict[str, Any], list[dict[str, Any]]],
    output_dir: Path,
    filename: str,
    headers: Optional[MutableMapping[str, Union[str, bytes]]] = None,
    cookies: Optional[Union[MutableMapping[str, str], CookieJar]] = None,
    proxy: Optional[str] = None,
    max_workers: Optional[int] = None,
) -> Generator[dict[str, Any], None, None]:
    """
    Download a file using Python Requests.
    https://requests.readthedocs.io

    Yields the following download status updates while chunks are downloading:

    - {total: 123} (there are 123 chunks to download)
    - {total: None} (there are an unknown number of chunks to download)
    - {advance: 1} (one chunk was downloaded)
    - {downloaded: "10.1 MB/s"} (currently downloading at a rate of 10.1 MB/s)
    - {file_downloaded: Path(...), written: 1024} (download finished, has the save path and size)

    The data is in the same format accepted by rich's progress.update() function.
    However, the `downloaded`, `file_downloaded` and `written` keys are custom and not
    natively accepted by rich progress bars.

    Parameters:
        urls: Web URL(s) to file(s) to download. You can use a dictionary with the key
            "url" for the URI, and other keys for extra arguments to use per-URL.
        output_dir: The folder to save the file into. If the save path's directory does
            not exist then it will be made automatically.
        filename: The filename or filename template to use for each file. The variables
            you can use are `i` for the URL index and `ext` for the URL extension.
        headers: A mapping of HTTP Header Key/Values to use for all downloads.
        cookies: A mapping of Cookie Key/Values or a Cookie Jar to use for all downloads.
        proxy: An optional proxy URI to route connections through for all downloads.
        max_workers: The maximum amount of threads to use for downloads. Defaults to
            min(32,(cpu_count+4)).
    """
    if not urls:
        raise ValueError("urls must be provided and not empty")
    elif not isinstance(urls, (str, dict, list)):
        raise TypeError(f"Expected urls to be {str} or {dict} or a list of one of them, not {type(urls)}")

    if not output_dir:
        raise ValueError("output_dir must be provided")
    elif not isinstance(output_dir, Path):
        raise TypeError(f"Expected output_dir to be {Path}, not {type(output_dir)}")

    if not filename:
        raise ValueError("filename must be provided")
    elif not isinstance(filename, str):
        raise TypeError(f"Expected filename to be {str}, not {type(filename)}")

    if not isinstance(headers, (MutableMapping, type(None))):
        raise TypeError(f"Expected headers to be {MutableMapping}, not {type(headers)}")

    if not isinstance(cookies, (MutableMapping, CookieJar, type(None))):
        raise TypeError(f"Expected cookies to be {MutableMapping} or {CookieJar}, not {type(cookies)}")

    if not isinstance(proxy, (str, type(None))):
        raise TypeError(f"Expected proxy to be {str}, not {type(proxy)}")

    if not isinstance(max_workers, (int, type(None))):
        raise TypeError(f"Expected max_workers to be {int}, not {type(max_workers)}")

    if not isinstance(urls, list):
        urls = [urls]

    if not max_workers:
        max_workers = min(32, (os.cpu_count() or 1) + 4)

    urls = [
        dict(save_path=save_path, **url) if isinstance(url, dict) else dict(url=url, save_path=save_path)
        for i, url in enumerate(urls)
        for save_path in [
            output_dir / filename.format(i=i, ext=get_extension(url["url"] if isinstance(url, dict) else url))
        ]
    ]

    session = Session()
    session.mount("https://", HTTPAdapter(pool_connections=max_workers, pool_maxsize=max_workers, pool_block=True))
    session.mount("http://", session.adapters["https://"])

    if headers:
        headers = {k: v for k, v in headers.items() if k.lower() != "accept-encoding"}
        session.headers.update(headers)
    if cookies:
        session.cookies.update(cookies)
    if proxy:
        session.proxies.update({"all": proxy})

    yield dict(total=len(urls))

    try:
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            for future in as_completed(pool.submit(download, session=session, segmented=False, **url) for url in urls):
                try:
                    yield from future.result()
                except KeyboardInterrupt:
                    DOWNLOAD_CANCELLED.set()  # skip pending track downloads
                    yield dict(downloaded="[yellow]CANCELLING")
                    pool.shutdown(wait=True, cancel_futures=True)
                    yield dict(downloaded="[yellow]CANCELLED")
                    # tell dl that it was cancelled
                    # the pool is already shut down, so exiting loop is fine
                    raise
                except Exception:
                    DOWNLOAD_CANCELLED.set()  # skip pending track downloads
                    yield dict(downloaded="[red]FAILING")
                    pool.shutdown(wait=True, cancel_futures=True)
                    yield dict(downloaded="[red]FAILED")
                    # tell dl that it failed
                    # the pool is already shut down, so exiting loop is fine
                    raise
    finally:
        DOWNLOAD_SIZES.clear()


__all__ = ("requests",)
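Illustrative sketch (not part of the committed file) of calling the lower-level download() generator documented above for a single file; the URL and save path are placeholders, and the extra keyword argument is forwarded to session.get() as a one-time header, as the docstring describes.

    from pathlib import Path

    from unshackle.core.downloaders.requests import download

    # placeholder URL and save path for illustration only
    for update in download(
        url="https://example.com/file.bin",
        save_path=Path("downloads/file.bin"),
        headers={"Range": "bytes=0-1023"},  # one-time header passed through **kwargs
    ):
        if "file_downloaded" in update:
            print("saved", update["file_downloaded"], "bytes:", update["written"])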