new version

2026-06-10 09:10:23 +00:00 · 2026-05-19 20:28:26 +02:00
parent fbee2c1855
commit bd208c63ff
99 changed files with 1287 additions and 225 deletions
@@ -1,11 +1,13 @@
 # https://github.com/Gujal00/ResolveURL/blob/55c7f66524ebd65bc1f88650614e627b00167fa0/script.module.resolveurl/lib/resolveurl/plugins/f16px.py
-
 import base64
 import json
 import re
+import time
+import hmac
+import hashlib
+import os
 from typing import Dict, Any
 from urllib.parse import urlparse
-
 from mediaflow_proxy.extractors.base import BaseExtractor, ExtractorError
 from mediaflow_proxy.utils import python_aesgcm

@@ -17,16 +19,91 @@ class F16PxExtractor(BaseExtractor):

    @staticmethod
    def _b64url_decode(value: str) -> bytes:
-        # base64url -> base64
        value = value.replace("-", "+").replace("_", "/")
        padding = (-len(value)) % 4
        if padding:
            value += "=" * padding
        return base64.b64decode(value)

+    @staticmethod
+    def _b64url_encode(data: bytes) -> str:
+        """Encode *data* as base64url text with the trailing ``=`` padding removed."""
+        encoded = base64.urlsafe_b64encode(data).decode()
+        return encoded.rstrip("=")
+
    def _join_key_parts(self, parts) -> bytes:
        return b"".join(self._b64url_decode(p) for p in parts)

+    @staticmethod
+    def _pick_best(sources: list) -> str:
+        """Return the URL of the highest-quality source.
+
+        Quality is the first run of digits found in each source's ``label``
+        (``1080``, ``"720"``, ``"1080p"``); labels without digits rank as 0.
+        Entries that are not dicts or carry no ``url`` are ignored, and ties
+        keep the original order.
+
+        Raises:
+            ExtractorError: if no source carries a usable URL.
+        """
+
+        def label_key(s):
+            # Labels may carry a suffix ("1080p"), which a plain int() rejects.
+            m = re.search(r"\d+", str(s.get("label") or ""))
+            return int(m.group()) if m else 0
+
+        candidates = [s for s in (sources or []) if isinstance(s, dict) and s.get("url")]
+        if not candidates:
+            raise ExtractorError("F16PX: Empty source URL")
+        # max() returns the first maximal element, like a stable descending sort.
+        return max(candidates, key=label_key)["url"]
+
+    def _make_fingerprint(self) -> dict:
+        """Build the ``fingerprint`` JSON body sent with the playback request.
+
+        Generates random viewer/device ids and a token of the form
+        ``<base64url(json claims)>.<base64url(HMAC-SHA256)>`` whose claims
+        expire 600 seconds after they are issued.
+        """
+        confidence = 0.93
+        ids = {name: self._b64url_encode(os.urandom(16)) for name in ("viewer_id", "device_id")}
+        issued_at = int(time.time())
+
+        claims = {**ids, "confidence": confidence, "iat": issued_at, "exp": issued_at + 600}
+        body = self._b64url_encode(json.dumps(claims, separators=(",", ":")).encode())
+        # The signature is computed with an empty key.
+        mac = hmac.new(b"", body.encode(), hashlib.sha256).digest()
+
+        fingerprint = {"token": f"{body}.{self._b64url_encode(mac)}", **ids, "confidence": confidence}
+        return {"fingerprint": fingerprint}
+
+    def _decrypt_playback(self, pb: dict) -> list:
+        """Decrypt the playback sources, trying the primary payload first.
+
+        The primary ciphertext is ``payload`` with nonce ``iv`` and a key
+        formed by joining ``key_parts``. If that yields no sources, each key
+        in ``decrypt_keys`` is tried against ``payload2``/``iv2``.
+
+        Returns:
+            The decrypted ``sources`` list, or ``[]`` when nothing decrypts.
+
+        Raises:
+            Exception: the primary attempt's error, re-raised only when the
+                fallback did not produce sources either.
+        """
+
+        def open_sources(key: bytes, nonce: bytes, blob: bytes) -> list:
+            # cipher.open() yields None when GCM authentication fails.
+            plain = python_aesgcm.new(key).open(nonce, blob)
+            if not plain:
+                return []
+            return json.loads(plain.decode("utf-8", "ignore")).get("sources") or []
+
+        primary_error = None
+        try:
+            sources = open_sources(
+                self._join_key_parts(pb["key_parts"]),
+                self._b64url_decode(pb["iv"]),
+                self._b64url_decode(pb["payload"]),
+            )
+            if sources:
+                return sources
+        except Exception as exc:
+            # A missing or malformed primary payload must not block the fallback.
+            primary_error = exc
+
+        # Fallback: payload2 + decrypt_keys
+        decrypt_keys = pb.get("decrypt_keys") or {}
+        iv2 = pb.get("iv2")
+        pay2 = pb.get("payload2")
+        if iv2 and pay2 and decrypt_keys:
+            iv2 = self._b64url_decode(iv2)
+            pay2 = self._b64url_decode(pay2)
+            for key_b64 in decrypt_keys.values():
+                try:
+                    sources = open_sources(self._b64url_decode(key_b64), iv2, pay2)
+                except Exception:
+                    # Wrong key or undecodable plaintext: try the next candidate.
+                    continue
+                if sources:
+                    return sources
+
+        if primary_error is not None:
+            # Keep the original failure visible to the caller's error message.
+            raise primary_error
+        return []
+
    async def extract(self, url: str) -> Dict[str, Any]:
        parsed = urlparse(url)
        host = parsed.netloc
@@ -35,26 +112,32 @@ class F16PxExtractor(BaseExtractor):
        match = re.search(r"/e/([A-Za-z0-9]+)", parsed.path or "")
        if not match:
            raise ExtractorError("F16PX: Invalid embed URL")
-
        media_id = match.group(1)
+
        api_url = f"https://{host}/api/videos/{media_id}/embed/playback"

        headers = self.base_headers.copy()
-        headers["referer"] = f"https://{host}/"
+        headers["referer"] = f"https://{host}/e/{media_id}"
+        headers["origin"] = origin
+        headers["content-type"] = "application/json"
+
+        resp = await self._make_request(
+            api_url,
+            headers=headers,
+            method="POST",
+            json=self._make_fingerprint(),
+        )

-        resp = await self._make_request(api_url, headers=headers)
        try:
            data = resp.json()
        except Exception:
            raise ExtractorError("F16PX: Invalid JSON response")

        # Case 1: plain sources
-        if "sources" in data and data["sources"]:
-            src = data["sources"][0].get("url")
-            if not src:
-                raise ExtractorError("F16PX: Empty source URL")
+        if data.get("sources"):
+            best = self._pick_best(data["sources"])
            return {
-                "destination_url": src,
+                "destination_url": best,
                "request_headers": headers,
                "mediaflow_endpoint": self.mediaflow_endpoint,
            }
@@ -65,40 +148,24 @@ class F16PxExtractor(BaseExtractor):
            raise ExtractorError("F16PX: No playback data")

        try:
-            iv = self._b64url_decode(pb["iv"])  # nonce
-            key = self._join_key_parts(pb["key_parts"])  # AES key
-            payload = self._b64url_decode(pb["payload"])  # ciphertext + tag
-
-            cipher = python_aesgcm.new(key)
-            decrypted = cipher.open(iv, payload)  # AAD = '' like ResolveURL
-
-            if decrypted is None:
-                raise ExtractorError("F16PX: GCM authentication failed")
-
-            decrypted_json = json.loads(decrypted.decode("utf-8", "ignore"))
-
-        except ExtractorError:
-            raise
+            sources = self._decrypt_playback(pb)
        except Exception as e:
            raise ExtractorError(f"F16PX: Decryption failed ({e})")

-        sources = decrypted_json.get("sources") or []
        if not sources:
            raise ExtractorError("F16PX: No sources after decryption")

-        best = sources[0].get("url")
-        if not best:
-            raise ExtractorError("F16PX: Empty source URL after decryption")
-
-        self.base_headers.clear()
-        self.base_headers["referer"] = f"{origin}/"
-        self.base_headers["origin"] = origin
-        self.base_headers["Accept-Language"] = "en-US,en;q=0.5"
-        self.base_headers["Accept"] = "*/*"
-        self.base_headers["user-agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:138.0) Gecko/20100101 Firefox/138.0"
+        best = self._pick_best(sources)

+        out_headers = {
+            "referer": f"{origin}/",
+            "origin": origin,
+            "Accept-Language": "en-US,en;q=0.5",
+            "Accept": "*/*",
+            "user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:138.0) Gecko/20100101 Firefox/138.0",
+        }
        return {
            "destination_url": best,
-            "request_headers": self.base_headers,
+            "request_headers": out_headers,
            "mediaflow_endpoint": self.mediaflow_endpoint,
        }
@@ -152,6 +152,25 @@ class DoodStreamExtractor(BaseExtractor):
        html = r.text
        base_url = f"https://{urlparse(final_url).netloc}"

+        # Some pages embed a JS redirect instead of a real HTTP redirect.
+        js_redirect = re.search(r'(?:window\.location|location\.href)\s*=\s*[\'"]https?://([^/\'"]+)', html)
+        if js_redirect:
+            redirected_host = js_redirect.group(1)
+            redirect_url = f"https://{redirected_host}/e/{video_id}"
+            logger.debug("JS redirect detected → %s", redirect_url)
+            async with AsyncSession() as s2:
+                r2 = await s2.get(
+                    redirect_url,
+                    impersonate="chrome",
+                    headers={"Referer": f"https://{redirected_host}/"},
+                    timeout=30,
+                    allow_redirects=True,
+                    **({"proxy": proxy} if proxy else {}),
+                )
+            final_url = str(r2.url)
+            html = r2.text
+            base_url = f"https://{urlparse(final_url).netloc}"
+
        if "pass_md5" not in html:
            if "turnstile" in html.lower() or "captcha_l" in html:
                raise ExtractorError(
@@ -197,6 +216,14 @@ class DoodStreamExtractor(BaseExtractor):
                "Ensure BYPARR_URL is set for reliable extraction."
            )

+        # CloudFlare R2 storage URLs are self-contained — no salt/token needed.
+        if "cloudflarestorage." in base_stream.lower():
+            return {
+                "destination_url": base_stream,
+                "request_headers": headers,
+                "mediaflow_endpoint": "proxy_stream_endpoint",
+            }
+
        token_match = re.search(r"token=([^&\s'\"]+)", html)
        if not token_match:
            raise ExtractorError("DoodStream: token not found in embed HTML")
@@ -25,6 +25,7 @@ from mediaflow_proxy.extractors.vixcloud import VixCloudExtractor
 from mediaflow_proxy.extractors.fastream import FastreamExtractor
 from mediaflow_proxy.extractors.voe import VoeExtractor
 from mediaflow_proxy.extractors.vidfast import VidFastExtractor
+from mediaflow_proxy.extractors.streamhg import StreamHGExtractor


 class ExtractorFactory:
@@ -55,6 +56,7 @@ class ExtractorFactory:
        "Voe": VoeExtractor,
        "Sportsonline": SportsonlineExtractor,
        "VidFast": VidFastExtractor,
+        "StreamHG": StreamHGExtractor,
    }

    @classmethod
@@ -1,9 +1,45 @@
-import re
+import base64
+import json
 from typing import Dict, Any
-from urllib.parse import urlparse, urljoin
+from urllib.parse import urlparse
+
+from cryptography.hazmat.primitives.ciphers.aead import AESGCM

 from mediaflow_proxy.extractors.base import BaseExtractor, ExtractorError
-from mediaflow_proxy.utils.packed import eval_solver
+
+
+def _base64url_decode(input_str: str) -> bytes:
+    """Return the bytes encoded by a base64url string, padded or not."""
+    standard = input_str.translate(str.maketrans("-_", "+/"))
+    missing = -len(standard) % 4
+    return base64.b64decode(standard + "=" * missing)
+
+
+def _combine_key_parts(key_parts: list) -> bytes:
+    """Decode every base64url key fragment and concatenate the results in order."""
+    return b"".join(map(_base64url_decode, key_parts))
+
+
+def _decrypt_playback(playback: dict) -> dict:
+    """Decrypt the AES-GCM encrypted playback payload and parse it as JSON.
+
+    Args:
+        playback: API ``playback`` object with base64url ``key_parts``,
+            ``iv`` and ``payload`` fields.
+
+    Returns:
+        The decoded JSON document.
+
+    Raises:
+        ExtractorError: if a field is missing or malformed, decryption or
+            authentication fails, or the plaintext is not valid JSON.
+    """
+    try:
+        key = _combine_key_parts(playback["key_parts"])
+        iv = _base64url_decode(playback["iv"])
+        payload = _base64url_decode(playback["payload"])
+    except (KeyError, TypeError, AttributeError, ValueError) as e:
+        raise ExtractorError(f"Invalid playback data: {e!r}") from e
+
+    # ``payload`` is already ciphertext followed by the 16-byte GCM tag, which
+    # is the layout AESGCM.decrypt() expects, so it is passed through whole.
+    try:
+        plaintext = AESGCM(key).decrypt(iv, payload, None)
+    except Exception as e:
+        raise ExtractorError(f"Decryption failed: {e}") from e
+
+    try:
+        return json.loads(plaintext.decode("utf-8"))
+    except ValueError as e:
+        # Covers both UnicodeDecodeError and json.JSONDecodeError.
+        raise ExtractorError(f"Decrypted playback is not valid JSON: {e}") from e


 class FileMoonExtractor(BaseExtractor):
@@ -12,41 +48,49 @@ class FileMoonExtractor(BaseExtractor):
        self.mediaflow_endpoint = "hls_manifest_proxy"

    async def extract(self, url: str, **kwargs) -> Dict[str, Any]:
-        response = await self._make_request(url)
+        # URL format: https://filemoon.sx/e/{code} or https://filemoon.sx/d/{code}
+        parsed = urlparse(url)
+        path = parsed.path.rstrip("/")
+        code = path.split("/")[-1] if path else None

-        pattern = r'iframe.*?src=["\'](.*?)["\']'
-        match = re.search(pattern, response.text, re.DOTALL)
-        if not match:
-            raise ExtractorError("Failed to extract iframe URL")
+        if not code or code in ("e", "d"):
+            raise ExtractorError(f"Could not extract video code from URL: {url}")

-        iframe_url = match.group(1)
-
-        parsed = urlparse(str(response.url))
-        base_url = f"{parsed.scheme}://{parsed.netloc}"
-
-        if iframe_url.startswith("//"):
-            iframe_url = f"{parsed.scheme}:{iframe_url}"
-        elif not urlparse(iframe_url).scheme:
-            iframe_url = urljoin(base_url, iframe_url)
+        api_url = f"{parsed.scheme}://{parsed.netloc}/api/videos/{code}"

        headers = {"Referer": url}
-        patterns = [r'file:"(.*?)"']
+        response = await self._make_request(api_url, headers=headers)

-        final_url = await eval_solver(
-            self,
-            iframe_url,
-            headers,
-            patterns,
-        )
+        try:
+            data = response.json()
+        except Exception as e:
+            raise ExtractorError(f"Failed to parse API response: {e}")

-        test_resp = await self._make_request(final_url, headers=headers)
-        if test_resp.status == 404:
-            raise ExtractorError("Stream not found (404)")
+        if "error" in data:
+            raise ExtractorError(f"FileMoon API error: {data['error']}")
+
+        playback = data.get("playback")
+        if not playback or not playback.get("key_parts") or not playback.get("payload"):
+            raise ExtractorError("No playback data available")
+
+        decrypted = _decrypt_playback(playback)
+
+        sources = decrypted.get("sources", [])
+        hls_source = None
+        for source in sources:
+            if source.get("mime_type") == "application/vnd.apple.mpegurl":
+                hls_source = source
+                break
+
+        if not hls_source:
+            raise ExtractorError("No HLS source found in decrypted playback")
+
+        destination_url = hls_source["url"]

        self.base_headers["referer"] = url

        return {
-            "destination_url": final_url,
+            "destination_url": destination_url,
            "request_headers": self.base_headers,
            "mediaflow_endpoint": self.mediaflow_endpoint,
        }
@@ -1,67 +1,651 @@
+"""Maxstream URL extractor — full uprot bypass pipeline.
+
+Solves the problem of `uprot.net` redirects on `/msf/`, `/msfi/` and
+`/msfld/` paths used by Italian aggregators (CB01, EuroStreaming, etc).
+
+Key features:
+  1. TLS-fingerprint-resistant fetch via curl_cffi (chrome131 impersonation)
+  2. 4-digit captcha solver with multi-engine OCR ensemble:
+       ddddocr (primary) → tesseract (fallback) → CF Workers AI (3rd, opt-in)
+  3. Honeypot URL filtering on the post-captcha page
+  4. uprots/uprotem → maxstream redirect chain follow with cookie continuity
+  5. /msfld/ folder picker (season + episode kwargs from MFP route)
+  6. Optional persistent URL cache (when paired with services/uprot_warmer.py)
+
+All advanced features are guarded by lazy imports — if `curl_cffi`,
+`pytesseract`, `Pillow` or `ddddocr` are not installed the extractor
+falls back to the previous behaviour for `/msf/` URLs and skips
+`/msfld/` cleanly.
+
+Activation:
+  CF_WORKER_OCR_URL    e.g. https://easyproxy-ocr.user.workers.dev
+  CF_WORKER_OCR_AUTH   Worker AUTH_TOKEN
+
+Credits: pipeline ported from NelloStream
+(https://github.com/vitouchiha/nello-stream) — `workers/cfworker.js`
+functions `_uprotBypassWithCookies`, `_extractMaxstreamVideo`,
+`_aiOcrDigits`, `_handleScheduledUprotRefresh`. All credit to Nello.
+"""
+
+import asyncio
+import logging
+import os
 import re
-from typing import Dict, Any
+from typing import Any, Dict, Optional
+from urllib.parse import urljoin, urlparse, urlencode

 from bs4 import BeautifulSoup

 from mediaflow_proxy.extractors.base import BaseExtractor, ExtractorError

+logger = logging.getLogger(__name__)
+

 class MaxstreamExtractor(BaseExtractor):
-    """Maxstream URL extractor."""
+    """Maxstream URL extractor with full uprot bypass pipeline."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.mediaflow_endpoint = "hls_manifest_proxy"
+        # Persistent cookie jar across the uprot → maxstream redirect chain.
+        # PHPSESSID + captcha hash + uprot_session must travel together for
+        # the post-captcha redirect to be honoured by the maxstream WAF.
+        self.cookies: Dict[str, str] = {}
+        self._last_solve_text: Optional[str] = None

-    async def get_uprot(self, link: str):
-        """Extract MaxStream URL."""
-        if "msf" in link:
-            link = link.replace("msf", "mse")
-        response = await self._make_request(link)
-        soup = BeautifulSoup(response.text, "lxml")
-        maxstream_url = soup.find("a")
-        maxstream_url = maxstream_url.get("href")
-        return maxstream_url
+    # ───────────────────────── HTTP layer ──────────────────────────────
+
+    async def _curl_cffi_fetch(
+        self,
+        url: str,
+        method: str = "GET",
+        data: Optional[Any] = None,
+        headers: Optional[Dict[str, str]] = None,
+        allow_redirects: bool = True,
+        timeout: int = 30,
+    ) -> Optional[Dict[str, Any]]:
+        """Browser-impersonated fetch via curl_cffi.
+
+        uprot.net inspects TLS fingerprints; aiohttp's JA3 is recognised as
+        a bot within a few requests and served captcha pages or 503 even
+        from clean residential IPs. curl_cffi with `impersonate="chrome131"`
+        replays a real Chrome JA3 + ALPN order, so uprot serves the real
+        redirect link or the (legitimately-protected) captcha page.
+
+        Returns None if curl_cffi is not installed (caller falls back to
+        BaseExtractor._make_request for the simpler legacy /msf/ path).
+
+        Args:
+            url: Target URL.
+            method: HTTP method passed to curl_cffi.
+            data: Request body; a str/bytes body on POST gets a form-urlencoded
+                content-type unless one is already set.
+            headers: Extra headers layered over ``self.base_headers``.
+            allow_redirects: Forwarded to curl_cffi.
+            timeout: Forwarded to curl_cffi.
+
+        Returns:
+            A dict with ``ok`` (status < 400), ``status``, ``text``,
+            ``content``, ``url``, ``headers`` and ``cookies``. When the
+            request itself raises, ``ok`` is False, ``status`` is 0 and an
+            ``error`` string is added instead of propagating the exception.
+            Response cookies are merged into ``self.cookies``.
+        """
+        try:
+            from curl_cffi import requests as cffi_requests
+        except ImportError:
+            logger.debug("curl_cffi not installed — uprot bypass disabled")
+            return None
+
+        # Per-call headers override the extractor defaults.
+        merged_headers = dict(self.base_headers)
+        if headers:
+            merged_headers.update(headers)
+        if method.upper() == "POST" and isinstance(data, (str, bytes)):
+            merged_headers.setdefault("content-type", "application/x-www-form-urlencoded")
+
+        # NOTE(review): _get_proxy is defined outside this view; presumably it
+        # returns a proxy URL or a falsy value for this target — confirm.
+        proxy = self._get_proxy(url)
+        proxies_arg = {"http": proxy, "https": proxy} if proxy else None
+
+        loop = asyncio.get_running_loop()
+
+        def _do_request():
+            # Synchronous request body; run in an executor below so the
+            # event loop is not blocked.
+            try:
+                # Send a copy of the cookies collected so far.
+                req_cookies = dict(self.cookies) if self.cookies else None
+                r = cffi_requests.request(
+                    method,
+                    url,
+                    headers=merged_headers,
+                    data=data,
+                    cookies=req_cookies,
+                    proxies=proxies_arg,
+                    impersonate="chrome131",
+                    timeout=timeout,
+                    allow_redirects=allow_redirects,
+                )
+                cookies = {}
+                try:
+                    cookies = {c.name: c.value for c in r.cookies.jar}
+                except Exception:
+                    # Fall back to a plain dict conversion if the jar is not iterable this way.
+                    cookies = dict(r.cookies) if r.cookies else {}
+                return {
+                    "ok": r.status_code < 400,
+                    "status": r.status_code,
+                    "text": r.text,
+                    "content": r.content,
+                    "url": str(r.url),
+                    "headers": dict(r.headers),
+                    "cookies": cookies,
+                }
+            except Exception as e:
+                # Report transport failures as a result dict rather than raising.
+                return {
+                    "ok": False,
+                    "status": 0,
+                    "text": "",
+                    "content": b"",
+                    "url": url,
+                    "headers": {},
+                    "cookies": {},
+                    "error": str(e),
+                }
+
+        result = await loop.run_in_executor(None, _do_request)
+        # Persist any cookies from this response for subsequent requests.
+        if result.get("cookies"):
+            self.cookies.update(result["cookies"])
+        return result
+
+    # ─────────────────────── Honeypot filter ───────────────────────────
+
+    @staticmethod
+    def _strip_uprot_honeypots(html: str) -> str:
+        """Return *html* with uprot's decoy markup removed.
+
+        Two kinds of content are deleted, in this order:
+          1. HTML comments (<!-- ... -->)
+          2. <div> elements whose inline style contains ``display:none``,
+             up to the first closing </div>
+
+        Running this before URL extraction keeps later regex/BS4 passes
+        from picking a decoy link hidden in either place.
+        """
+        stages = (
+            (r"<!--[\s\S]*?-->", 0),
+            (
+                r"<div[^>]*style=[\"'][^\"']*display\s*:\s*none[^\"']*[\"'][^>]*>[\s\S]*?</div>",
+                re.IGNORECASE,
+            ),
+        )
+        cleaned = html
+        for pattern, flags in stages:
+            cleaned = re.sub(pattern, "", cleaned, flags=flags)
+        return cleaned
+
+    # ─────────────────────── Redirect parser ───────────────────────────
+
+    def _parse_uprot_html(self, text: str) -> Optional[str]:
+        """Parse a uprot success page and return the next-hop URL.
+
+        The page is first cleaned with `_strip_uprot_honeypots` and escaped
+        slashes are unescaped. The strategies below are then tried in
+        order; the first one that yields an accepted candidate wins:
+          1. <a href> wrapping a <button id="buttok"> whose text spells
+             CONTINUE (case and inter-letter spacing tolerant)
+          2. <a href> wrapping any <button> whose text spells Continue
+          3. The first-seen `/uprots/` or `/uprotem/` href that appears
+             exactly once in the cleaned HTML (only that one is checked)
+          4. The first stayonline.pro / maxstream.video URL anywhere in the
+             page, unless it contains the placeholder
+             `/uprots/123456789012`
+          5. A window.location assignment, then a zero-delay meta refresh
+          6. BS4 scan of <a>/<button> elements whose text contains
+             "continue", "continua" or "vai al", skipping uprot.net hrefs
+
+        A candidate is rejected when its host contains `maxstream.video`
+        and its path starts with `/cdn-cgi/`. Returns None when nothing
+        matches.
+        """
+        cleaned = self._strip_uprot_honeypots(text).replace("\\/", "/")
+
+        def _valid(c):
+            # Returns the candidate when acceptable, otherwise None.
+            if not c:
+                return None
+            try:
+                p = urlparse(c)
+                if p.netloc and "maxstream.video" in p.netloc and p.path.startswith("/cdn-cgi/"):
+                    return None
+            except Exception:
+                # An unparsable candidate is passed through unchanged.
+                pass
+            return c
+
+        # 1. id="buttok" CONTINUE button
+        m = re.search(
+            r'href=["\'](https?://[^"\']+)["\'][^>]*>\s*<button[^>]*id=["\']buttok["\'][^>]*>\s*C\s*O\s*N\s*T\s*I\s*N\s*U\s*E',
+            cleaned,
+            re.IGNORECASE,
+        )
+        if m and _valid(m.group(1)):
+            return m.group(1)
+
+        # 2. Generic <a><button>Continue</button></a>
+        m = re.search(
+            r'href=["\'](https?://[^"\']+)["\'][^>]*>\s*<button[^>]*>\s*[Cc]\s*[Oo]\s*[Nn]\s*[Tt]\s*[Ii]\s*[Nn]\s*[Uu]\s*[Ee]',
+            cleaned,
+        )
+        if m and _valid(m.group(1)):
+            return m.group(1)
+
+        # 3. Unique uprots/uprotem URL
+        all_uprots = re.findall(
+            r'href=["\'](https?://[^"\']*uprot(?:s|em)/[^"\']+)["\']',
+            cleaned,
+            re.IGNORECASE,
+        )
+        if all_uprots:
+            # Count occurrences; dict order keeps first-seen order.
+            counts: Dict[str, int] = {}
+            for u in all_uprots:
+                counts[u] = counts.get(u, 0) + 1
+            unique = [u for u, c in counts.items() if c == 1]
+            if unique and _valid(unique[0]):
+                return unique[0]
+
+        # 4. Generic stayonline / maxstream regex
+        m = re.search(
+            r'https?://(?:www\.)?(?:stayonline\.pro|maxstream\.video)[^"\'\s<>\\ ]+',
+            cleaned,
+        )
+        if m and "/uprots/123456789012" not in m.group(0) and _valid(m.group(0)):
+            return m.group(0)
+
+        # 5. window.location / meta refresh
+        m = re.search(r'window\.location(?:\.href)?\s*=\s*["\']([^"\']+)["\']', cleaned)
+        if m and _valid(m.group(1)):
+            return m.group(1)
+        m = re.search(r'content=["\']0;\s*url=([^"\']+)["\']', cleaned, re.I)
+        if m and _valid(m.group(1)):
+            return m.group(1)
+
+        # 6. BS4 buttons / forms (rare paths)
+        soup = BeautifulSoup(cleaned, "lxml")
+        for btn in soup.find_all(["a", "button"]):
+            t = btn.get_text().strip().lower()
+            if "continue" in t or "continua" in t or "vai al" in t:
+                href = btn.get("href")
+                # A <button> has no href of its own; use the wrapping <a>.
+                if not href and btn.parent and btn.parent.name == "a":
+                    href = btn.parent.get("href")
+                if href and "uprot.net" not in href and _valid(href):
+                    return href
+        return None
+
+    def _parse_uprot_folder(self, text: str, season, episode) -> Optional[str]:
+        """Parse a /msfld/ folder HTML and return the /msfi/ link for one episode.
+
+        Episode markers are tried in this order, each over the whole page:
+        ``SxxEyy`` (any zero padding), ``NxM``, ``N&#215;M`` and ``N×M``.
+        The link returned is the first ``/msfi/`` href found within 500
+        characters after the marker.
+
+        Args:
+            text: Folder page HTML.
+            season: Season number (int or numeric string).
+            episode: Episode number (int or numeric string).
+
+        Returns:
+            The matching href, or None if season/episode are not integers
+            or no marker with a nearby link is found.
+        """
+        try:
+            s_int = int(season)
+            e_int = int(episode)
+        except (TypeError, ValueError):
+            return None
+        patterns = [
+            # 0* accepts any padding (S1E1, S01E01, S01E001); the lookahead
+            # stops episode 10 from matching the prefix of S01E100.
+            rf"S0*{s_int}E0*{e_int}(?!\d)",
+            rf"\b0*{s_int}x0*{e_int}\b",
+            rf"\b0*{s_int}&#215;0*{e_int}\b",
+            rf"\b0*{s_int}×0*{e_int}\b",
+        ]
+        for pat in patterns:
+            m = re.search(
+                rf"{pat}[\s\S]{{0,500}}?href=['\"]([^'\"]+/msfi/[^'\"]+)['\"]",
+                text,
+                re.I,
+            )
+            if m:
+                return m.group(1)
+        return None
+
+    # ─────────────────────── OCR backends ──────────────────────────────
+
+    @staticmethod
+    def _preprocess_captcha_png(img_bytes: bytes) -> bytes:
+        """Return a thresholded, filtered PNG copy of the captcha image.
+
+        The input bytes are returned unchanged if Pillow is unavailable or
+        any processing step fails.
+        """
+        try:
+            from PIL import Image, ImageFilter
+            import io
+
+            gray = Image.open(io.BytesIO(img_bytes)).convert("L")
+            # Hard threshold at 140, then a 3x3 max filter followed by a 3x3 min filter.
+            binary = gray.point(lambda px: 0 if px < 140 else 255, mode="L")
+            cleaned = binary.filter(ImageFilter.MaxFilter(3)).filter(ImageFilter.MinFilter(3))
+            buffer = io.BytesIO()
+            cleaned.save(buffer, format="PNG")
+            return buffer.getvalue()
+        except Exception:
+            return img_bytes
+
+    @staticmethod
+    def _tesseract_classify(img_bytes: bytes) -> str:
+        """OCR the captcha with Tesseract, whitelisting the digits 0-9.
+
+        The image is thresholded and filtered the same way as in
+        `_preprocess_captcha_png` before recognition. Returns the stripped
+        OCR text, or "" if pytesseract/Pillow is missing or anything fails.
+        """
+        tess_config = "--psm 7 -c tessedit_char_whitelist=0123456789"
+        try:
+            import pytesseract
+            from PIL import Image, ImageFilter
+            import io
+
+            picture = Image.open(io.BytesIO(img_bytes)).convert("L")
+            picture = picture.point(lambda px: 0 if px < 140 else 255, mode="L")
+            for smoothing in (ImageFilter.MaxFilter(3), ImageFilter.MinFilter(3)):
+                picture = picture.filter(smoothing)
+            recognised = pytesseract.image_to_string(picture, config=tess_config)
+            return recognised.strip()
+        except Exception:
+            return ""
+
+    @staticmethod
+    async def _cf_worker_ocr(img_bytes: bytes, expected_digits: int = 4) -> str:
+        """Optional OCR backend: POST the captcha PNG to a remote worker.
+
+        The worker base URL is read from CF_WORKER_OCR_URL; when it is
+        unset or blank this backend is disabled and "" is returned without
+        any network access. If CF_WORKER_OCR_AUTH is set, its value is sent
+        in the `x-worker-auth` header.
+
+        The image is posted as `image/png` to `<base>/?ocr=1&digits=<n>`
+        and the `digits` field of the JSON reply is returned, stripped.
+        Returns "" on a non-200 status or any failure, so the caller can
+        fall through gracefully.
+        """
+        endpoint = (os.getenv("CF_WORKER_OCR_URL") or "").strip().rstrip("/")
+        if not endpoint:
+            return ""
+        token = (os.getenv("CF_WORKER_OCR_AUTH") or "").strip()
+        try:
+            import aiohttp
+
+            request_headers = {"content-type": "image/png"}
+            if token:
+                request_headers["x-worker-auth"] = token
+            target = f"{endpoint}/?ocr=1&digits={expected_digits}"
+            limit = aiohttp.ClientTimeout(total=20)
+            async with aiohttp.ClientSession(timeout=limit) as session:
+                async with session.post(target, data=img_bytes, headers=request_headers) as reply:
+                    if reply.status == 200:
+                        body = await reply.json()
+                        return (body.get("digits") or "").strip()
+                    return ""
+        except Exception as e:
+            logger.debug(f"CF Worker OCR failed: {e}")
+            return ""
+
+    # ─────────────────── Captcha solver loop ───────────────────────────
+
+    async def _solve_uprot_captcha_once(self, text: str, original_url: str, preprocess: bool = False) -> Optional[str]:
+        """Make one attempt at the captcha: OCR the image and submit the form.
+
+        Args:
+            text: HTML of the captcha page.
+            original_url: URL the page came from; used to resolve relative
+                URLs, as the default form target and as the Referer.
+            preprocess: When true, the image goes through
+                `_preprocess_captcha_png` before OCR.
+
+        Returns:
+            The next-hop URL parsed from the response page, or None when
+            ddddocr is missing, no captcha image is found or fetched, no OCR
+            engine yields 3-4 digits, or the response contains no link.
+
+        Side effects:
+            Stores the response HTML in `self._last_solve_text` once the
+            form has been submitted.
+        """
+        try:
+            import ddddocr
+        except ImportError:
+            logger.debug("ddddocr not installed — skipping captcha solve")
+            return None
+
+        soup = BeautifulSoup(text, "lxml")
+        img_tag = soup.find("img", src=re.compile(r"data:image/|/captcha|/image/|captcha\.php"))
+        img_url = img_tag.get("src") if img_tag else None
+        if not img_url:
+            m = re.search(
+                r'<img[^>]+src=["\']([^"\']*(?:data:image/|captcha|image)[^"\']*)["\']',
+                text,
+            )
+            img_url = m.group(1) if m else None
+        if not img_url:
+            return None
+
+        form = soup.find("form")
+        form_action = form.get("action") if form else ""
+        if not form_action or form_action == "#":
+            form_action = original_url
+        else:
+            # urljoin resolves root-relative, page-relative and scheme-relative
+            # actions alike and leaves absolute URLs untouched.
+            form_action = urljoin(original_url, form_action)
+
+        # Download captcha image
+        if img_url.startswith("data:"):
+            try:
+                import base64
+
+                _, b64 = img_url.split(",", 1)
+                img_data = base64.b64decode(b64)
+            except Exception:
+                return None
+        else:
+            # Relative srcs such as "captcha.php?x=1" need the page URL as base.
+            res = await self._curl_cffi_fetch(urljoin(original_url, img_url))
+            if not res or not res.get("ok"):
+                return None
+            img_data = res.get("content") or b""
+
+        ocr_input = self._preprocess_captcha_png(img_data) if preprocess else img_data
+
+        # Build the ddddocr engine once per extractor instance.
+        if not hasattr(self, "_ocr_engine"):
+            self._ocr_engine = ddddocr.DdddOcr(show_ad=False)
+        res_str = self._ocr_engine.classification(ocr_input)
+        res_digits = "".join(c for c in str(res_str) if c.isdigit())
+
+        # Accept 3-or-4 digit answers (uprot uses 4 today; legacy 3 still seen)
+        def _ok(n):
+            return 3 <= n <= 4
+
+        # Fallback chain: ddddocr -> tesseract -> CF worker.
+        if not _ok(len(res_digits)):
+            tess = self._tesseract_classify(ocr_input)
+            tess_digits = "".join(c for c in str(tess) if c.isdigit())
+            if _ok(len(tess_digits)):
+                res_digits = tess_digits
+            else:
+                cf = await self._cf_worker_ocr(ocr_input, expected_digits=4)
+                cf_digits = "".join(c for c in str(cf) if c.isdigit())
+                if _ok(len(cf_digits)):
+                    res_digits = cf_digits
+                else:
+                    return None
+
+        # Prepare POST data
+        captcha_input = soup.find("input", {"name": re.compile(r"captcha|code|val", re.I)})
+        if captcha_input and captcha_input.get("name"):
+            field_name = captcha_input["name"]
+        else:
+            # NOTE(review): group(1) is only the matched prefix, not the full
+            # attribute value (e.g. "captcha" for name="captcha_code") — confirm intended.
+            m = re.search(r'name=["\'](captcha|code|val|captch5)[^"\']*["\']', text, re.I)
+            field_name = m.group(1) if m else "captcha"
+
+        post_data = {field_name: res_digits}
+        if form:
+            # Carry over the remaining named form fields (hidden tokens etc.).
+            for inp in form.find_all(["input", "button", "select"]):
+                n = inp.get("name")
+                v = inp.get("value", "")
+                if n and n not in post_data:
+                    post_data[n] = v
+
+        headers = {**self.base_headers, "referer": original_url}
+        result = await self._curl_cffi_fetch(form_action, method="POST", data=urlencode(post_data), headers=headers)
+        if not result:
+            return None
+        solved_text = result.get("text") or ""
+        self._last_solve_text = solved_text if isinstance(solved_text, str) else None
+        return self._parse_uprot_html(solved_text)
+
+    async def _solve_uprot_captcha(self, text: str, original_url: str, max_attempts: int = 4) -> Optional[str]:
+        """Solve the uprot captcha, retrying on the page returned by the last submit.
+
+        Each wrong submit triggers uprot to serve a brand-new captcha
+        image; that fresh page is fed into the next attempt instead of
+        OCRing the same image again. Image preprocessing is switched on
+        for every even-numbered attempt.
+
+        Args:
+            text: HTML of the captcha page to start from.
+            original_url: URL of the uprot page, forwarded to every attempt.
+            max_attempts: Maximum number of solve attempts.
+
+        Returns:
+            The result of the first successful attempt, or ``None`` when
+            every attempt fails.
+        """
+        current = text
+        for attempt in range(1, max_attempts + 1):
+            # Forget the page remembered from a previous attempt (or from an
+            # earlier call on this instance). An attempt that gives up before
+            # submitting never refreshes the attribute, and retrying against
+            # a stale page would OCR a captcha that belongs to another link.
+            self._last_solve_text = None
+            preprocess = attempt % 2 == 0
+            result = await self._solve_uprot_captcha_once(current, original_url, preprocess=preprocess)
+            if result:
+                return result
+            new_text = self._last_solve_text
+            if new_text and new_text != current:
+                current = new_text
+        return None
+
+    # ──────────────────── Redirect chain ───────────────────────────────
+
+    async def _follow_uprots_chain(self, url: str, max_hops: int = 10) -> str:
+        """Follow a /uprots/ or /uprotem/ link through its redirects, one hop per request.
+
+        URLs containing neither marker are returned untouched. Otherwise
+        each hop is requested with redirects disabled and a uprot.net
+        referer, and the ``Location`` header is resolved against the
+        current URL. Walking stops when a request fails, no redirect is
+        offered, the URL no longer contains a uprots/uprotem marker, or
+        ``max_hops`` is used up.
+
+        Per the original author's notes, maxstream's WAF only honours the
+        token when it is reached through this chain (a direct GET yields
+        Error 131).
+
+        A final ``.../watchfree/<a>/<b>/...`` URL is rewritten to
+        ``https://maxstream.video/emvvv/<b>`` so the existing packer
+        extraction works.
+
+        Returns:
+            The last URL reached, after the optional watchfree rewrite.
+        """
+        chain_markers = ("/uprots/", "/uprotem/")
+        if not any(marker in url for marker in chain_markers):
+            return url
+
+        current = url
+        for _ in range(max_hops):
+            hop_headers = {**self.base_headers, "referer": "https://uprot.net/"}
+            res = await self._curl_cffi_fetch(current, headers=hop_headers, allow_redirects=False, timeout=15)
+            if not res:
+                break
+            # The header mapping may expose the key in either case.
+            loc = (res.get("headers") or {}).get("location") or (res.get("headers") or {}).get("Location")
+            if not loc:
+                current = res.get("url") or current
+                break
+            current = urljoin(current, loc)
+            if not any(marker in current for marker in chain_markers):
+                break
+
+        if "watchfree/" in current:
+            # Everything after the first "watchfree/"; the second non-empty
+            # path segment becomes the emvvv id.
+            tail = current.partition("watchfree/")[2]
+            segments = [piece for piece in tail.split("/") if piece]
+            if len(segments) >= 2:
+                current = f"https://maxstream.video/emvvv/{segments[1]}"
+
+        return current
+
+    # ─────────────────────── Public flow ───────────────────────────────
+
+    async def get_uprot(self, link: str, season=None, episode=None) -> str:
+        """Resolve a uprot URL to its maxstream destination.
+
+        Supports:
+          - /msf/{id}    single movie (legacy alias /mse/)
+          - /msfi/{id}   single episode
+          - /msfld/{id}  folder of episodes (requires season + episode)
+
+        Raises:
+            ExtractorError: when a folder URL is given without season and
+                episode, when the episode is not listed in the folder, or
+                when neither direct parsing nor the captcha solver yields
+                a link.
+        """
+
+        async def _fetch_page(page_url: str) -> str:
+            # Try curl_cffi first; fall back to BaseExtractor._make_request
+            # whenever it does not deliver an "ok" result (e.g. curl_cffi
+            # isn't installed; the legacy /msf/ path may still work).
+            fetched = await self._curl_cffi_fetch(page_url)
+            if fetched and fetched.get("ok"):
+                return fetched["text"]
+            fallback = await self._make_request(page_url)
+            return fallback.text
+
+        # Map only the modern /msf/ single-video path to its legacy /mse/
+        # alias. A naive str.replace("msf", "mse") corrupts /msfld/ into
+        # /mseld/ (404) and /msfi/ into /msei/ (deprecated 500 on new IDs).
+        link = re.sub(r"/msf/", "/mse/", link)
+        text = await _fetch_page(link)
+
+        if "/msfld/" in link:
+            if season is None or episode is None:
+                raise ExtractorError("msfld folder URL requires 'season' and 'episode' parameters")
+            episode_link = self._parse_uprot_folder(text, season, episode)
+            if not episode_link:
+                raise ExtractorError(f"Episode S{season}E{episode} not found in msfld folder")
+            # Continue with the episode's own page from here on.
+            link = episode_link
+            text = await _fetch_page(link)
+
+        # 1. Direct parse — works on legacy uprot pages without captcha.
+        # 2. Captcha solver — only reached when the direct parse finds nothing.
+        destination = self._parse_uprot_html(text) or await self._solve_uprot_captcha(text, link)
+        if not destination:
+            raise ExtractorError("Redirect link not found in uprot page")
+        return destination

    async def extract(self, url: str, **kwargs) -> Dict[str, Any]:
-        """Extract Maxstream URL."""
-        maxstream_url = await self.get_uprot(url)
-        response = await self._make_request(maxstream_url, headers={"accept-language": "en-US,en;q=0.5"})
+        """Extract Maxstream URL.

-        # Extract and decode URL
-        match = re.search(r"\}\('(.+)',.+,'(.+)'\.split", response.text)
-        if not match:
+        For /msfld/ folder URLs, callers must pass season=N&episode=M as
+        query parameters (forwarded by MFP routes as kwargs).
+
+        Optional persistent cache: if `mediaflow_proxy.services.uprot_url_cache`
+        is importable, cache hits skip captcha+chain entirely (<100ms).
+        """
+        season = kwargs.get("season")
+        episode = kwargs.get("episode")
+
+        cached = None
+        try:
+            from mediaflow_proxy.services import uprot_url_cache  # type: ignore
+
+            cached = uprot_url_cache.get(url, season=season, episode=episode)
+        except Exception:
+            pass
+
+        if cached:
+            logger.debug(f"uprot cache HIT: {url[:80]}")
+            maxstream_url = cached
+        else:
+            maxstream_url = await self.get_uprot(url, season=season, episode=episode)
+            maxstream_url = await self._follow_uprots_chain(maxstream_url)
+
+        # Fetch the maxstream embed page
+        cffi = await self._curl_cffi_fetch(
+            maxstream_url,
+            headers={**self.base_headers, "referer": "https://uprot.net/", "accept-language": "en-US,en;q=0.5"},
+        )
+        if cffi and cffi.get("ok"):
+            text = cffi["text"]
+        else:
+            response = await self._make_request(maxstream_url, headers={"accept-language": "en-US,en;q=0.5"})
+            text = response.text
+
+        if not cached:
+            try:
+                from mediaflow_proxy.services import uprot_url_cache  # type: ignore
+
+                uprot_url_cache.put(url, maxstream_url, season=season, episode=episode)
+            except Exception:
+                pass
+
+        # Direct sources check
+        m = re.search(r'sources:\s*\[\{src:\s*"([^"]+)"', text)
+        if m:
+            return {
+                "destination_url": m.group(1),
+                "request_headers": {**self.base_headers, "referer": maxstream_url},
+                "mediaflow_endpoint": self.mediaflow_endpoint,
+            }
+
+        # Packer fallback
+        m = re.search(r"\}\('(.+)',.+,'(.+)'\.split", text)
+        if not m:
+            m = re.search(r"eval\(function\(p,a,c,k,e,d\).+?\}\('(.+?)',.+?,'(.+?)'\.split", text, re.S)
+        if not m:
            raise ExtractorError("Failed to extract URL components")

-        s1 = match.group(2)
-        # Extract Terms
-        terms = s1.split("|")
-        urlset_index = terms.index("urlset")
-        hls_index = terms.index("hls")
-        sources_index = terms.index("sources")
-        result = terms[urlset_index + 1 : hls_index]
-        reversed_elements = result[::-1]
-        first_part = terms[hls_index + 1 : sources_index]
-        reversed_first_part = first_part[::-1]
-        first_url_part = ""
-        for first_part in reversed_first_part:
-            if "0" in first_part:
-                first_url_part += first_part
-            else:
-                first_url_part += first_part + "-"
+        terms = m.group(2).split("|")
+        try:
+            urlset_index = terms.index("urlset")
+            hls_index = terms.index("hls")
+            sources_index = terms.index("sources")
+        except ValueError as e:
+            raise ExtractorError(f"Missing components in packer: {e}")

-        base_url = f"https://{first_url_part}.host-cdn.net/hls/"
+        result_parts = terms[urlset_index + 1 : hls_index]
+        reversed_elements = result_parts[::-1]
+        first_part_terms = terms[hls_index + 1 : sources_index]
+        reversed_first_part = first_part_terms[::-1]
+
+        first_url_part = ""
+        for fp in reversed_first_part:
+            if "0" in fp:
+                first_url_part += fp
+            else:
+                first_url_part += fp + "-"
+
+        base_url = f"https://{first_url_part.rstrip('-')}.host-cdn.net/hls/"
        if len(reversed_elements) == 1:
            final_url = base_url + "," + reversed_elements[0] + ".urlset/master.m3u8"
-        lenght = len(reversed_elements)
-        i = 1
-        for element in reversed_elements:
-            base_url += element + ","
-            if lenght == i:
-                base_url += ".urlset/master.m3u8"
-            else:
-                i += 1
-        final_url = base_url
+        else:
+            final_url = base_url
+            for element in reversed_elements:
+                final_url += element + ","
+            final_url = final_url.rstrip(",") + ".urlset/master.m3u8"

        self.base_headers["referer"] = url
        return {
@@ -0,0 +1,25 @@
+from typing import Dict, Any
+
+from mediaflow_proxy.extractors.base import BaseExtractor
+from mediaflow_proxy.utils.packed import eval_solver
+
+
+class StreamHGExtractor(BaseExtractor):
+    """Extractor for StreamHG pages; results are routed to the HLS manifest proxy endpoint."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Endpoint name returned with every extraction result.
+        self.mediaflow_endpoint = "hls_manifest_proxy"
+
+    async def extract(self, url: str, **kwargs) -> Dict[str, Any]:
+        """Resolve *url* through ``eval_solver`` and package the result for the proxy."""
+        # Single capture pattern handed to eval_solver: the value of the "hls2" key.
+        hls2_patterns = [r'"hls2":"([^"]+)"']
+        stream_url = await eval_solver(self, url, {}, hls2_patterns)
+
+        return {
+            "destination_url": stream_url,
+            "request_headers": self.base_headers,
+            "mediaflow_endpoint": self.mediaflow_endpoint,
+        }
@@ -1,7 +1,6 @@
 import re
 from typing import Dict, Any
 from urllib.parse import urljoin, urlparse
-
 from mediaflow_proxy.extractors.base import BaseExtractor, ExtractorError


@@ -15,12 +14,23 @@ class VidmolyExtractor(BaseExtractor):
        if not parsed.hostname or "vidmoly" not in parsed.hostname:
            raise ExtractorError("VIDMOLY: Invalid domain")

+        embed_id_match = re.search(r"/embed-([a-zA-Z0-9]+)\.html", parsed.path)
+        if not embed_id_match:
+            raise ExtractorError("VIDMOLY: Could not extract embed ID from URL")
+        embed_id = embed_id_match.group(1)
+
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120 Safari/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.9",
+            "Connection": "keep-alive",
+            "Cookie": f"cf_turnstile_demo_pass_{embed_id}=1",
            "Referer": url,
-            "Sec-Fetch-Dest": "iframe",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "same-origin",
        }

        # --- Fetch embed page ---
@@ -33,11 +43,10 @@ class VidmolyExtractor(BaseExtractor):
            raise ExtractorError("VIDMOLY: Stream URL not found")

        master_url = match.group(1)
-
        if not master_url.startswith("http"):
            master_url = urljoin(url, master_url)

-        # --- Validate stream (prevents Stremio timeout) ---
+        # --- Validate stream ---
        try:
            test = await self._make_request(master_url, headers=headers)
        except Exception as e:
@@ -48,8 +57,6 @@ class VidmolyExtractor(BaseExtractor):
        if test.status >= 400:
            raise ExtractorError(f"VIDMOLY: Stream unavailable ({test.status})")

-        # Return MASTER playlist, not variant
-        # Let MediaFlow Proxy handle variants
        return {
            "destination_url": master_url,
            "request_headers": headers,
@@ -54,9 +54,9 @@ class VixCloudExtractor(BaseExtractor):
                "Origin": f"{site_url}",
            }

-            response = await self._make_request(site_url + '/api' + parts[1])
+            response = await self._make_request(site_url + "/api" + parts[1])

-            response = await self._make_request(site_url + '/' + response.json()['src'],headers=headers)
+            response = await self._make_request(site_url + "/" + response.json()["src"], headers=headers)

        if response.status != 200:
            raise ExtractorError("Failed to extract URL components, Invalid Request")