UnHided/mediaflow_proxy/extractors/doodstream.py

import logging
import re
import time
from urllib.parse import urlparse, urljoin

import aiohttp
from curl_cffi.requests import AsyncSession

from mediaflow_proxy.extractors.base import BaseExtractor, ExtractorError
from mediaflow_proxy.configs import settings

logger = logging.getLogger(__name__)

_DOOD_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)


class DoodStreamExtractor(BaseExtractor):
    """
    DoodStream / PlayMogo extractor.

    All DoodStream mirror domains (dsvplay.com, myvidplay.com, dood.to, …) now
    redirect to playmogo.com which sits behind Cloudflare and may require a
    Turnstile CAPTCHA before serving the pass_md5 URL.

    Extraction order:
    1. Byparr  — set BYPARR_URL (Firefox/Camoufox → Turnstile auto-validates,
                 not blocked by DisableDevtool.js)
    2. curl_cffi — Chrome impersonation; works when Turnstile is not triggered,
                   raises a descriptive error if captcha is detected.
    """

    async def extract(self, url: str, **kwargs):
        parsed = urlparse(url)
        video_id = parsed.path.rstrip("/").split("/")[-1]
        if not video_id:
            raise ExtractorError("Invalid DoodStream URL: no video ID found")

        if settings.byparr_url:
            try:
                return await self._extract_via_byparr(url, video_id)
            except ExtractorError:
                raise

        return await self._extract_via_curl_cffi(url, video_id)

    # ------------------------------------------------------------------
    # Path 1 – Byparr (Firefox/Camoufox → Turnstile auto-validates)
    # ------------------------------------------------------------------

    async def _extract_via_byparr(self, url: str, video_id: str) -> dict:
        """
        Use Byparr to bypass Cloudflare protection on the DoodStream embed page.

        Strategy: fetch the embed page without any injected script. Byparr's
        Firefox/Camoufox browser auto-passes Cloudflare's bot checks and often
        bypasses the Turnstile CAPTCHA gate directly, returning the embed HTML
        with pass_md5.  If the response doesn't contain pass_md5, reuse the CF
        cookies + UA from Byparr in a follow-up curl_cffi request (which avoids
        re-triggering the bot check).
        """
        endpoint = f"{settings.byparr_url.rstrip('/')}/v1"
        embed_url = url if "/e/" in url else f"https://{urlparse(url).netloc}/e/{video_id}"
        payload = {
            "cmd": "request.get",
            "url": embed_url,
            "maxTimeout": settings.byparr_timeout * 1000,
        }

        async with aiohttp.ClientSession() as session:
            async with session.post(
                endpoint,
                json=payload,
                timeout=aiohttp.ClientTimeout(total=settings.byparr_timeout + 15),
            ) as resp:
                if resp.status != 200:
                    raise ExtractorError(f"Byparr HTTP {resp.status}")
                data = await resp.json()

        if data.get("status") != "ok":
            raise ExtractorError(f"Byparr: {data.get('message', 'unknown error')}")

        solution = data.get("solution", {})
        final_url = solution.get("url", embed_url)
        if not final_url.startswith("http"):
            final_url = embed_url
        base_url = f"https://{urlparse(final_url).netloc}"
        html = solution.get("response", "")

        if "pass_md5" not in html:
            # Byparr may not have the pass_md5 in the initial response.
            # Try two recovery strategies in order:
            #
            # 1. Cookie reuse — if Byparr collected CF clearance cookies before
            #    the page loaded fully, inject them into a curl_cffi request.
            # 2. Plain curl_cffi — Chrome TLS impersonation without JS execution.
            raw_cookies = solution.get("cookies", [])
            cookies = {c["name"]: c["value"] for c in raw_cookies}
            ua = solution.get("userAgent", _DOOD_UA)

            if cookies:
                cf_domain = (
                    next(
                        (c.get("domain", "").lstrip(".") for c in raw_cookies if c.get("name") == "cf_clearance"),
                        None,
                    )
                    or "playmogo.com"
                )
                retry_url = f"https://{cf_domain}/e/{video_id}"
                logger.debug(
                    "Byparr response lacked pass_md5 (final_url=%s); retrying %s with CF cookies via curl_cffi",
                    final_url,
                    retry_url,
                )
                proxy = self._get_proxy(retry_url)
                async with AsyncSession() as s:
                    r = await s.get(
                        retry_url,
                        impersonate="chrome",
                        cookies=cookies,
                        headers={"User-Agent": ua, "Referer": f"https://{cf_domain}/"},
                        timeout=20,
                        **({"proxy": proxy} if proxy else {}),
                    )
                    html = r.text
                    final_url = str(r.url)
                    base_url = f"https://{urlparse(final_url).netloc}"

            if "pass_md5" not in html:
                logger.debug("Byparr cookie reuse also failed; falling back to curl_cffi for %s", embed_url)
                return await self._extract_via_curl_cffi(embed_url, video_id)

        return await self._parse_embed_html(html, base_url)

    # ------------------------------------------------------------------
    # Path 2 – curl_cffi (bypasses CF bot protection; Turnstile may block)
    # ------------------------------------------------------------------

    async def _extract_via_curl_cffi(self, url: str, video_id: str) -> dict:
        proxy = self._get_proxy(url)
        async with AsyncSession() as s:
            r = await s.get(
                url,
                impersonate="chrome",
                headers={"Referer": f"https://{urlparse(url).netloc}/"},
                timeout=30,
                allow_redirects=True,
                **({"proxy": proxy} if proxy else {}),
            )
        final_url = str(r.url)
        html = r.text
        base_url = f"https://{urlparse(final_url).netloc}"

        # Some pages embed a JS redirect instead of a real HTTP redirect.
        js_redirect = re.search(r'(?:window\.location|location\.href)\s*=\s*[\'"]https?://([^/\'"]+)', html)
        if js_redirect:
            redirected_host = js_redirect.group(1)
            redirect_url = f"https://{redirected_host}/e/{video_id}"
            logger.debug("JS redirect detected → %s", redirect_url)
            async with AsyncSession() as s2:
                r2 = await s2.get(
                    redirect_url,
                    impersonate="chrome",
                    headers={"Referer": f"https://{redirected_host}/"},
                    timeout=30,
                    allow_redirects=True,
                    **({"proxy": proxy} if proxy else {}),
                )
            final_url = str(r2.url)
            html = r2.text
            base_url = f"https://{urlparse(final_url).netloc}"

        if "pass_md5" not in html:
            if "turnstile" in html.lower() or "captcha_l" in html:
                raise ExtractorError(
                    "DoodStream: site is serving a Turnstile CAPTCHA that requires "
                    "browser interaction — cannot be bypassed automatically from this "
                    "network location. Try a residential IP or a VPN/proxy."
                )
            raise ExtractorError(f"DoodStream: pass_md5 not found in embed HTML ({final_url})")

        return await self._parse_embed_html(html, base_url)

    # ------------------------------------------------------------------
    # Common HTML parser
    # ------------------------------------------------------------------

    async def _parse_embed_html(self, html: str, base_url: str) -> dict:
        pass_match = re.search(r"(/pass_md5/[^'\"<>\s]+)", html)
        if not pass_match:
            raise ExtractorError("DoodStream: pass_md5 path not found in embed HTML")

        pass_url = urljoin(base_url, pass_match.group(1))
        ua = self.base_headers.get("user-agent") or _DOOD_UA
        headers = {
            "user-agent": ua,
            "referer": f"{base_url}/",
        }

        proxy = self._get_proxy(pass_url)
        async with AsyncSession() as s:
            r = await s.get(
                pass_url,
                impersonate="chrome",
                headers=headers,
                timeout=20,
                **({"proxy": proxy} if proxy else {}),
            )

        base_stream = r.text.strip()
        if not base_stream or "RELOAD" in base_stream:
            raise ExtractorError(
                "DoodStream: pass_md5 endpoint returned no stream URL "
                "(captcha session may have expired). "
                "Ensure BYPARR_URL is set for reliable extraction."
            )

        # CloudFlare R2 storage URLs are self-contained — no salt/token needed.
        if "cloudflarestorage." in base_stream.lower():
            return {
                "destination_url": base_stream,
                "request_headers": headers,
                "mediaflow_endpoint": "proxy_stream_endpoint",
            }

        token_match = re.search(r"token=([^&\s'\"]+)", html)
        if not token_match:
            raise ExtractorError("DoodStream: token not found in embed HTML")

        token = token_match.group(1)
        expiry = int(time.time())
        final_url = f"{base_stream}123456789?token={token}&expiry={expiry}"

        return {
            "destination_url": final_url,
            "request_headers": headers,
            "mediaflow_endpoint": "proxy_stream_endpoint",
        }