update

2026-06-10 09:10:23 +00:00 · 2026-02-19 20:15:03 +01:00
parent 7785e8c604
commit cfc6bbabc9
181 changed files with 32141 additions and 4629 deletions
@@ -1,3 +1,4 @@
+import copy
 import logging
 from typing import Annotated

@@ -7,7 +8,10 @@ from fastapi.responses import RedirectResponse
 from mediaflow_proxy.extractors.base import ExtractorError
 from mediaflow_proxy.extractors.factory import ExtractorFactory
 from mediaflow_proxy.schemas import ExtractorURLParams
-from mediaflow_proxy.utils.cache_utils import get_cached_extractor_result, set_cache_extractor_result
+from mediaflow_proxy.utils.cache_utils import (
+    get_cached_extractor_result,
+    set_cache_extractor_result,
+)
 from mediaflow_proxy.utils.http_utils import (
    DownloadError,
    encode_mediaflow_proxy_url,
@@ -16,11 +20,28 @@ from mediaflow_proxy.utils.http_utils import (
    get_proxy_headers,
 )
 from mediaflow_proxy.utils.base64_utils import process_potential_base64_url
+from mediaflow_proxy.utils import redis_utils

 extractor_router = APIRouter()
 logger = logging.getLogger(__name__)

-async def refresh_extractor_cache(cache_key: str, extractor_params: ExtractorURLParams, proxy_headers: ProxyRequestHeaders):
+# Cooldown between background cache refreshes of the same cache key, in seconds (120 s = 2 minutes)
+_REFRESH_COOLDOWN = 120
+
+# Hosts where background refresh is DISABLED; matched exactly (case-sensitive) against extractor_params.host.
+# These hosts generate unique CDN URLs per extraction - refreshing invalidates existing streams!
+# When a new URL is extracted, the old URL becomes invalid and causes 509 errors.
+_NO_BACKGROUND_REFRESH_HOSTS = frozenset(
+    {
+        "Vidoza",
+        # Add other hosts here that generate unique per-extraction URLs
+    }
+)
+
+
+async def refresh_extractor_cache(
+    cache_key: str, extractor_params: ExtractorURLParams, proxy_headers: ProxyRequestHeaders
+):
    """Asynchronously refreshes the extractor cache in the background."""
    try:
        logger.info(f"Background cache refresh started for key: {cache_key}")
@@ -32,32 +53,114 @@ async def refresh_extractor_cache(cache_key: str, extractor_params: ExtractorURL
        logger.error(f"Background cache refresh failed for key {cache_key}: {e}")


-@extractor_router.head("/video")
-@extractor_router.get("/video")
-async def extract_url(
-    extractor_params: Annotated[ExtractorURLParams, Query()],
+# Extension (lowercase, no leading dot) to content-type mapping for player compatibility.
+# A player requesting /extractor/video.m3u8 can detect HLS from the URL; /video.{ext} rejects unlisted extensions.
+EXTRACTOR_EXT_CONTENT_TYPES = {
+    "m3u8": "application/vnd.apple.mpegurl",
+    "m3u": "application/vnd.apple.mpegurl",
+    "mp4": "video/mp4",
+    "mkv": "video/x-matroska",
+    "ts": "video/mp2t",
+    "avi": "video/x-msvideo",
+    "webm": "video/webm",
+}
+
+
+async def _extract_url_impl(
+    extractor_params: ExtractorURLParams,
    request: Request,
    background_tasks: BackgroundTasks,
-    proxy_headers: Annotated[ProxyRequestHeaders, Depends(get_proxy_headers)],
+    proxy_headers: ProxyRequestHeaders,
+    ext: str | None = None,
 ):
-    """Extract clean links from various video hosting services."""
+    """
+    Core extraction logic shared by all extractor endpoints.
+
+    Args:
+        extractor_params: Extraction parameters from query string
+        request: FastAPI request object
+        background_tasks: Background task manager
+        proxy_headers: Proxy headers from request
+        ext: Optional file extension hint for player compatibility (e.g., "m3u8", "mp4")
+    """
    try:
        # Process potential base64 encoded destination URL
        processed_destination = process_potential_base64_url(extractor_params.destination)
        extractor_params.destination = processed_destination
-        
+
        cache_key = f"{extractor_params.host}_{extractor_params.model_dump_json()}"
-        response = await get_cached_extractor_result(cache_key)
-        
+
+        # Extractor results are resolved via the pod's outgoing IP and may not
+        # be valid when served from a different pod.  Namespace the cache and
+        # all associated coordination keys so each pod operates on its own
+        # partition of the shared Redis.  On single-instance deployments (no
+        # CACHE_NAMESPACE env var) make_instance_key() is a no-op.
+        instance_cache_key = redis_utils.make_instance_key(cache_key)
+
+        response = await get_cached_extractor_result(instance_cache_key)
+
        if response:
-            logger.info(f"Serving from cache for key: {cache_key}")
-            # Schedule a background task to refresh the cache without blocking the user
-            background_tasks.add_task(refresh_extractor_cache, cache_key, extractor_params, proxy_headers)
+            logger.info(f"Serving from cache for key: {instance_cache_key}")
+            # Schedule a background refresh, but only if:
+            # 1. The host is NOT in the no-refresh list (hosts with unique per-extraction URLs)
+            # 2. The cooldown has elapsed (prevents flooding upstream)
+            #
+            # WARNING: For hosts like Vidoza, background refresh is DANGEROUS!
+            # Each extraction generates a unique CDN URL. Refreshing invalidates the
+            # old URL, causing 509 errors for clients still using it.
+            if extractor_params.host not in _NO_BACKGROUND_REFRESH_HOSTS:
+                cooldown_key = f"extractor_refresh:{instance_cache_key}"
+                if await redis_utils.check_and_set_cooldown(cooldown_key, _REFRESH_COOLDOWN):
+                    background_tasks.add_task(
+                        refresh_extractor_cache, instance_cache_key, extractor_params, proxy_headers
+                    )
+            else:
+                logger.debug(f"Skipping background refresh for {extractor_params.host} (unique CDN URLs)")
        else:
-            logger.info(f"Cache miss for key: {cache_key}. Fetching fresh data.")
-            extractor = ExtractorFactory.get_extractor(extractor_params.host, proxy_headers.request)
-            response = await extractor.extract(extractor_params.destination, **extractor_params.extra_params)
-            await set_cache_extractor_result(cache_key, response)
+            # Use Redis-based in-flight tracking for cross-worker deduplication.
+            # If another worker is already extracting, wait for them to finish.
+            inflight_key = f"extractor:{instance_cache_key}"
+
+            if not await redis_utils.mark_inflight(inflight_key, ttl=60):
+                # Another worker is extracting - wait for them to finish and check cache
+                logger.info(f"Waiting for in-flight extraction (cross-worker) for key: {instance_cache_key}")
+                if await redis_utils.wait_for_completion(inflight_key, timeout=30.0):
+                    # Extraction completed, check cache
+                    response = await get_cached_extractor_result(instance_cache_key)
+                    if response:
+                        logger.info(f"Serving from cache (after wait) for key: {instance_cache_key}")
+
+            if response is None:
+                # We either marked it as in-flight (first) or waited and still no cache hit.
+                # Use Redis lock to ensure only one worker extracts at a time.
+                if await redis_utils.acquire_lock(f"extractor_lock:{instance_cache_key}", ttl=30, timeout=30.0):
+                    try:
+                        # Re-check cache after acquiring lock - another worker may have populated it
+                        response = await get_cached_extractor_result(instance_cache_key)
+                        if response:
+                            logger.info(f"Serving from cache (after lock) for key: {instance_cache_key}")
+                        else:
+                            logger.info(f"Cache miss for key: {instance_cache_key}. Fetching fresh data.")
+                            try:
+                                extractor = ExtractorFactory.get_extractor(extractor_params.host, proxy_headers.request)
+                                response = await extractor.extract(
+                                    extractor_params.destination, **extractor_params.extra_params
+                                )
+                                await set_cache_extractor_result(instance_cache_key, response)
+                            except Exception:
+                                raise
+                    finally:
+                        await redis_utils.release_lock(f"extractor_lock:{instance_cache_key}")
+                        await redis_utils.clear_inflight(inflight_key)
+                else:
+                    # Lock timeout - try to serve from cache anyway
+                    response = await get_cached_extractor_result(instance_cache_key)
+                    if not response:
+                        raise HTTPException(status_code=503, detail="Extraction in progress, please retry")
+
+        # Deep copy so each concurrent request gets its own dict to mutate
+        # (pop mediaflow_endpoint, update request_headers, etc.)
+        response = copy.deepcopy(response)

        # Ensure the latest request headers are used, even with cached data
        if "request_headers" not in response:
@@ -94,3 +197,62 @@ async def extract_url(
    except Exception as e:
        logger.exception(f"Extraction failed: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}")
+
+
+@extractor_router.head("/video")
+@extractor_router.get("/video")
+async def extract_url(
+    extractor_params: Annotated[ExtractorURLParams, Query()],
+    request: Request,
+    background_tasks: BackgroundTasks,
+    proxy_headers: Annotated[ProxyRequestHeaders, Depends(get_proxy_headers)],
+):
+    """
+    Extract clean links from video hosting services (extension-less endpoint).
+
+    Delegates to the shared extraction implementation with no extension hint.
+    Players that infer the format from the URL (especially ExoPlayer) should
+    call /extractor/video.m3u8 (HLS) or /extractor/video.mp4 (MP4) instead.
+    """
+    result = await _extract_url_impl(extractor_params, request, background_tasks, proxy_headers, ext=None)
+    return result
+
+
+@extractor_router.head("/video.{ext}")
+@extractor_router.get("/video.{ext}")
+async def extract_url_with_extension(
+    ext: str,
+    extractor_params: Annotated[ExtractorURLParams, Query()],
+    request: Request,
+    background_tasks: BackgroundTasks,
+    proxy_headers: Annotated[ProxyRequestHeaders, Depends(get_proxy_headers)],
+):
+    """
+    Extract clean links via a URL that carries a file-extension hint.
+
+    Players such as ExoPlayer choose their media source from the URL alone,
+    without following redirects or inspecting headers; HLS in particular needs
+    .m3u8 in the URL so that HlsMediaSource is used rather than
+    ProgressiveMediaSource.
+
+    The extension is matched case-insensitively against
+    EXTRACTOR_EXT_CONTENT_TYPES:
+    - .m3u8, .m3u - HLS playlists (application/vnd.apple.mpegurl)
+    - .mp4 - MP4 video (video/mp4)
+    - .mkv - Matroska video (video/x-matroska)
+    - .ts - MPEG-TS (video/mp2t)
+    - .avi - AVI video (video/x-msvideo)
+    - .webm - WebM video (video/webm)
+
+    Raises:
+        HTTPException: 400 if the extension is not in the supported list.
+
+    Example:
+        /extractor/video.m3u8?host=TurboVidPlay&d=...&redirect_stream=true
+    """
+    normalized_ext = ext.lower()
+    if normalized_ext in EXTRACTOR_EXT_CONTENT_TYPES:
+        return await _extract_url_impl(extractor_params, request, background_tasks, proxy_headers, ext=normalized_ext)
+
+    supported = ", ".join(f".{known}" for known in EXTRACTOR_EXT_CONTENT_TYPES)
+    raise HTTPException(status_code=400, detail=f"Unsupported extension: .{ext}. Supported: {supported}")