mirror of
https://github.com/UrloMythus/UnHided.git
synced 2026-04-11 11:50:51 +00:00
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, Optional, Any
from urllib.parse import urlparse

import asyncio
import aiohttp
import json
import logging

from mediaflow_proxy.configs import settings
from mediaflow_proxy.utils.http_client import create_aiohttp_session
from mediaflow_proxy.utils.http_utils import DownloadError

logger = logging.getLogger(__name__)

class ExtractorError(Exception):
    """Base exception for all extractors."""

    pass


@dataclass
class HttpResponse:
    """
    Simple response container for extractor HTTP requests.

    Uses aiohttp-style naming conventions:
    - status (not status_code)
    - text (pre-loaded content as string)
    - content (pre-loaded content as bytes)
    """

    status: int
    headers: Dict[str, str]
    text: str
    content: bytes
    url: str

    def json(self) -> Any:
        """Parse response content as JSON."""
        return json.loads(self.text)

    def get_origin(self) -> str:
        """Get the origin (scheme + host) from the response URL."""
        parsed = urlparse(self.url)
        return f"{parsed.scheme}://{parsed.netloc}"

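
# Illustrative sketch (not part of the original file): one way extractor code might
# consume an HttpResponse returned by BaseExtractor._make_request (defined below).
# It assumes it runs inside a BaseExtractor subclass; variable names are placeholders.
#
#     resp = await self._make_request("https://host.example/api/source")
#     data = resp.json()                    # parse a JSON body
#     origin = resp.get_origin()            # e.g. "https://host.example"
#     page = resp.text                      # decoded HTML/JS for regex parsing
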
class BaseExtractor(ABC):
    """Base class for all URL extractors.

    Improvements:
    - Built-in retry/backoff for transient network errors
    - Configurable timeouts and per-request overrides
    - Better logging of non-200 responses and body previews for debugging
    """

    def __init__(self, request_headers: dict):
        self.base_headers = {
            "user-agent": settings.user_agent,
        }
        self.mediaflow_endpoint = "proxy_stream_endpoint"
        # merge incoming headers (e.g. Accept-Language / Referer) with default base headers
        self.base_headers.update(request_headers or {})
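
    # Illustrative note (assumed values, not from the original file): because this is a
    # plain dict update, incoming headers override the defaults on exact key match. For
    # a concrete subclass, e.g.
    #
    #     ExampleHostExtractor({"referer": "https://host.example/", "user-agent": "Custom/1.0"})
    #
    # base_headers would end up with the custom user-agent plus the added referer.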

    async def _make_request(
        self,
        url: str,
        method: str = "GET",
        headers: Optional[Dict] = None,
        timeout: Optional[float] = None,
        retries: int = 3,
        backoff_factor: float = 0.5,
        raise_on_status: bool = True,
        **kwargs,
    ) -> HttpResponse:
        """
        Make HTTP request with retry and timeout support using aiohttp.

        Parameters
        ----------
        url : str
            The URL to request.
        method : str
            HTTP method (GET, POST, etc.). Defaults to GET.
        headers : dict | None
            Additional headers to merge with base headers.
        timeout : float | None
            Seconds to wait for the request. Defaults to 15s.
        retries : int
            Number of attempts for transient errors.
        backoff_factor : float
            Base for exponential backoff between retries.
        raise_on_status : bool
            If True, an HTTP status >= 400 raises DownloadError.
        **kwargs
            Additional arguments passed to aiohttp request (e.g., data, json).

        Returns
        -------
        HttpResponse
            Response object with pre-loaded content.
        """
        attempt = 0
        last_exc = None

        # Build request headers merging base and per-request
        request_headers = self.base_headers.copy()
        if headers:
            request_headers.update(headers)

        timeout_val = timeout or 15.0

        while attempt < retries:
            try:
                async with create_aiohttp_session(url, timeout=timeout_val) as (session, proxy_url):
                    async with session.request(
                        method,
                        url,
                        headers=request_headers,
                        proxy=proxy_url,
                        **kwargs,
                    ) as response:
                        # Read content while session is still open
                        content = await response.read()
                        text = content.decode("utf-8", errors="replace")
                        final_url = str(response.url)
                        status = response.status
                        resp_headers = dict(response.headers)

                        if raise_on_status and status >= 400:
                            body_preview = text[:500]
                            logger.debug(
                                "HTTP error for %s (status=%s) -- body preview: %s",
                                url,
                                status,
                                body_preview,
                            )
                            raise DownloadError(status, f"HTTP error {status} while requesting {url}")

                        return HttpResponse(
                            status=status,
                            headers=resp_headers,
                            text=text,
                            content=content,
                            url=final_url,
                        )

            except DownloadError:
                # Do not retry on explicit HTTP status errors (they are intentional)
                raise
            except (asyncio.TimeoutError, aiohttp.ClientError) as e:
                # Transient network error - retry with backoff
                last_exc = e
                attempt += 1
                sleep_for = backoff_factor * (2 ** (attempt - 1))
                logger.warning(
                    "Transient network error (attempt %s/%s) for %s: %s — retrying in %.1fs",
                    attempt,
                    retries,
                    url,
                    e,
                    sleep_for,
                )
                await asyncio.sleep(sleep_for)
                continue
            except Exception as e:
                # Unexpected exception - wrap as ExtractorError to keep interface consistent
                logger.exception("Unhandled exception while requesting %s: %s", url, e)
                raise ExtractorError(f"Request failed for URL {url}: {str(e)}")

        logger.error("All retries failed for %s: %s", url, last_exc)
        raise ExtractorError(f"Request failed for URL {url}: {str(last_exc)}")
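
    # Illustrative sketch (assumed call site and values, not from the original file):
    # a subclass's extract() might drive _make_request like this.
    #
    #     resp = await self._make_request(
    #         "https://host.example/embed/abc123",
    #         headers={"referer": "https://host.example/"},
    #         timeout=10.0,
    #         retries=2,
    #     )
    #     if resp.status == 200:
    #         payload = resp.json()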

    @abstractmethod
    async def extract(self, url: str, **kwargs) -> Dict[str, Any]:
        """Extract final URL and required headers."""
        pass
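

# Illustrative sketch (not part of the original file): a minimal concrete extractor built
# on BaseExtractor. The host URL, the regex, and the returned dictionary keys
# ("destination_url", "request_headers", "mediaflow_endpoint") are assumptions made for
# the example; the concrete extractors in this package define the actual return shape.
#
#     import re
#
#     class ExampleHostExtractor(BaseExtractor):
#         async def extract(self, url: str, **kwargs) -> Dict[str, Any]:
#             resp = await self._make_request(url)
#             match = re.search(r'file:\s*"(?P<url>[^"]+)"', resp.text)
#             if not match:
#                 raise ExtractorError("No stream URL found in page")
#             return {
#                 "destination_url": match.group("url"),
#                 "request_headers": {**self.base_headers, "referer": resp.get_origin()},
#                 "mediaflow_endpoint": self.mediaflow_endpoint,
#             }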