# Mirror of https://github.com/UrloMythus/UnHided.git (synced 2026-04-11)
from abc import ABC, abstractmethod
import asyncio
import logging
from typing import Dict, Optional, Any

import httpx

from mediaflow_proxy.configs import settings
from mediaflow_proxy.utils.http_utils import create_httpx_client, DownloadError

logger = logging.getLogger(__name__)


class ExtractorError(Exception):
    """Root of the extractor exception hierarchy.

    Raised when an extractor cannot complete a request or produce a result;
    all extractor-specific failures should derive from this class.
    """
|
|
|
|
|
|
class BaseExtractor(ABC):
    """Base class for all URL extractors.

    Features:
    - Built-in retry/backoff for transient network errors
    - Configurable timeouts and per-request overrides
    - Logging of non-2xx responses with a body preview for debugging
    """

    def __init__(self, request_headers: dict):
        """Initialize default headers, merged with the incoming request headers.

        Parameters
        ----------
        request_headers : dict
            Headers forwarded from the incoming client request
            (e.g. Accept-Language / Referer). They override the defaults on
            key collision; ``None`` is tolerated and treated as empty.
        """
        # Default headers applied to every outgoing request.
        self.base_headers = {
            "user-agent": settings.user_agent,
        }
        # Endpoint name used downstream to route the extracted stream.
        self.mediaflow_endpoint = "proxy_stream_endpoint"
        # Merge incoming headers (e.g. Accept-Language / Referer) with defaults.
        self.base_headers.update(request_headers or {})

    async def _make_request(
        self,
        url: str,
        method: str = "GET",
        headers: Optional[Dict] = None,
        timeout: Optional[float] = None,
        retries: int = 3,
        backoff_factor: float = 0.5,
        raise_on_status: bool = True,
        **kwargs,
    ) -> httpx.Response:
        """
        Make an HTTP request with retry and timeout support.

        Parameters
        ----------
        url : str
            Target URL.
        method : str
            HTTP method (default ``"GET"``).
        headers : dict | None
            Per-request headers merged over :attr:`base_headers`.
        timeout : float | None
            Seconds to wait for the request (applied to ``httpx.Timeout``).
            Defaults to 15s; an explicit ``0`` is honoured as-is.
        retries : int
            Number of attempts for transient errors. Values <= 0 are
            clamped to a single attempt.
        backoff_factor : float
            Base for exponential backoff between retries.
        raise_on_status : bool
            If True, HTTP non-2xx raises DownloadError (preserves status code).

        Returns
        -------
        httpx.Response
            The (possibly non-2xx, when ``raise_on_status`` is False) response.

        Raises
        ------
        DownloadError
            On non-2xx responses when ``raise_on_status`` is True. Never retried.
        ExtractorError
            When all retries are exhausted or an unexpected error occurs.
        """
        # Build request headers merging base and per-request overrides.
        request_headers = self.base_headers.copy()
        if headers:
            request_headers.update(headers)

        # NOTE: `timeout or 15.0` would silently replace an explicit 0 with
        # the default; compare against None so timeout=0 is honoured.
        timeout_cfg = httpx.Timeout(15.0 if timeout is None else timeout)

        # Guarantee at least one attempt even if a caller passes retries <= 0
        # (otherwise the loop body never runs and we'd raise with last_exc=None).
        max_attempts = max(retries, 1)
        last_exc: Optional[Exception] = None

        for attempt in range(1, max_attempts + 1):
            try:
                async with create_httpx_client(timeout=timeout_cfg) as client:
                    response = await client.request(
                        method,
                        url,
                        headers=request_headers,
                        **kwargs,
                    )

                if raise_on_status:
                    try:
                        response.raise_for_status()
                    except httpx.HTTPStatusError as e:
                        # Provide a short body preview for debugging.
                        try:
                            body_preview = e.response.text[:500]
                        except Exception:
                            body_preview = "<unreadable body>"
                        logger.debug(
                            "HTTPStatusError for %s (status=%s) -- body preview: %s",
                            url,
                            e.response.status_code,
                            body_preview,
                        )
                        # Chain the original error so tracebacks keep context.
                        raise DownloadError(
                            e.response.status_code,
                            f"HTTP error {e.response.status_code} while requesting {url}",
                        ) from e
                return response

            except DownloadError:
                # Do not retry on explicit HTTP status errors (they are intentional).
                raise
            except httpx.TransportError as e:
                # TransportError is the common base of ReadTimeout, ConnectTimeout,
                # NetworkError, etc. — all transient; retry with exponential backoff.
                last_exc = e
                if attempt < max_attempts:
                    sleep_for = backoff_factor * (2 ** (attempt - 1))
                    logger.warning(
                        "Transient network error (attempt %s/%s) for %s: %s — retrying in %.1fs",
                        attempt,
                        max_attempts,
                        url,
                        e,
                        sleep_for,
                    )
                    # Only sleep when another attempt follows — no pointless
                    # backoff delay after the final failure.
                    await asyncio.sleep(sleep_for)
            except Exception as e:
                # Unexpected exception — wrap as ExtractorError to keep the
                # interface consistent, chaining the cause.
                logger.exception("Unhandled exception while requesting %s: %s", url, e)
                raise ExtractorError(f"Request failed for URL {url}: {str(e)}") from e

        logger.error("All retries failed for %s: %s", url, last_exc)
        raise ExtractorError(f"Request failed for URL {url}: {str(last_exc)}") from last_exc

    @abstractmethod
    async def extract(self, url: str, **kwargs) -> Dict[str, Any]:
        """Extract final URL and required headers."""
|