new version

2026-06-10 09:10:23 +00:00 · 2026-04-15 19:23:14 +02:00
parent 5120b19d0b
commit 8134936d59
135 changed files with 3013 additions and 1589 deletions
@@ -1,7 +1,7 @@
 import re
 import logging
 from typing import Any, Dict
-from urllib.parse import urlparse
+from urllib.parse import urljoin, urlparse

 from mediaflow_proxy.extractors.base import BaseExtractor, ExtractorError
 from mediaflow_proxy.utils.packed import unpack
@@ -14,7 +14,7 @@ class SportsonlineExtractor(BaseExtractor):

    Strategy:
    1. Fetch page -> find first <iframe src="...">
-    2. Fetch iframe with Referer=https://sportzonline.st/
+    2. Fetch iframe with dynamic source-page Referer/Origin
    3. Collect packed eval blocks; if >=2 use second (index 1) else first.
    4. Unpack P.A.C.K.E.R. and search var src="...m3u8".
    5. Return final m3u8 with referer header.
@@ -33,56 +33,125 @@ class SportsonlineExtractor(BaseExtractor):
        """
        Detect and extract packed eval blocks from HTML.
        """
-        # Find all eval(function...) blocks - more greedy to capture full packed code
-        pattern = re.compile(r"eval\(function\(p,a,c,k,e,.*?\)\)(?:\s*;|\s*<)", re.DOTALL)
-        raw_matches = pattern.findall(html)
+        raw_matches: list[str] = []
+        strict_eval_pattern = re.compile(r"eval\(function\(p,a,c,k,e,.*?\}\(.*?\)\)", re.DOTALL)
+        relaxed_eval_pattern = re.compile(r"eval\(function\(p,a,c,k,e,[dr]\).*?\}\(.*?\)\)", re.DOTALL)
+
+        # Prefer script-body extraction first. This is more resilient when the packed
+        # code has nested parentheses/semicolons that are hard to capture with a
+        # single regex.
+        script_pattern = re.compile(r"<script[^>]*>(.*?)</script>", re.IGNORECASE | re.DOTALL)
+        for script_body in script_pattern.findall(html):
+            if "eval(function(p,a,c,k,e" in script_body:
+                strict_matches = strict_eval_pattern.findall(script_body)
+                if strict_matches:
+                    raw_matches.extend(strict_matches)
+                    continue
+
+                relaxed_matches = relaxed_eval_pattern.findall(script_body)
+                if relaxed_matches:
+                    raw_matches.extend(relaxed_matches)
+
+        if raw_matches:
+            return raw_matches
+
+        # Fallback: direct eval(...) extraction from raw HTML.
+        raw_matches = strict_eval_pattern.findall(html)

        # If no matches with the strict pattern, try a more relaxed one
        if not raw_matches:
-            # Try to find eval(function and capture until we find the closing ))
-            pattern = re.compile(r"eval\(function\(p,a,c,k,e,[dr]\).*?\}\(.*?\)\)", re.DOTALL)
-            raw_matches = pattern.findall(html)
+            raw_matches = relaxed_eval_pattern.findall(html)

        return raw_matches

+    @staticmethod
+    def _extract_m3u8_candidate(text: str) -> str | None:  # first ".m3u8" capture found in text, else None
+        patterns = [  # tried in list order: an earlier pattern wins regardless of match position in text
+            r"var\s+src\s*=\s*[\"']([^\"']+\.m3u8[^\"']*)[\"']",  # var src="...m3u8"
+            r"src\s*=\s*[\"']([^\"']+\.m3u8[^\"']*)[\"']",  # src="...m3u8"
+            r"file\s*:\s*[\"']([^\"']+\.m3u8[^\"']*)[\"']",  # file: "...m3u8"
+            r"[\"']([^\"']*https?://[^\"']+\.m3u8[^\"']*)[\"']",  # quoted string containing an http(s) m3u8 URL
+            r"(https?://[^\s\"'>]+\.m3u8[^\s\"'>]*)",  # unquoted absolute http(s) URL
+            r"(//[^\s\"'>]+\.m3u8[^\s\"'>]*)",  # unquoted scheme-relative URL
+            r"(/[^\s\"'>]+\.m3u8[^\s\"'>]*)",  # unquoted path starting with "/"
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, text)
+            if match:
+                return match.group(1)  # raw capture; not resolved against any base URL here
+
+        return None
+
+    @staticmethod
+    def _normalize_stream_url(stream_url: str, base_url: str) -> str:  # clean stream_url and resolve it against base_url
+        cleaned = stream_url.strip().strip("\"'").replace("\\/", "/")  # trim whitespace and quotes, unescape "\/"
+        if cleaned.startswith("//"):  # scheme-relative: borrow the scheme of base_url
+            parsed_base = urlparse(base_url)
+            return f"{parsed_base.scheme or 'https'}:{cleaned}"  # falls back to https when base_url has no scheme
+        if not urlparse(cleaned).scheme:  # no scheme at all: treat as relative to base_url
+            return urljoin(base_url, cleaned)
+        return cleaned  # already carries a scheme; returned as cleaned
+
    async def extract(self, url: str, **kwargs) -> Dict[str, Any]:
        """Main extraction flow: fetch page, extract iframe, unpack and find m3u8."""
        try:
+            parsed_source = urlparse(url)
+            source_origin = f"{parsed_source.scheme}://{parsed_source.netloc}"
+            source_referer = self.base_headers.get("Referer") or self.base_headers.get("referer") or f"{source_origin}/"
+            user_agent = self.base_headers.get("User-Agent") or self.base_headers.get("user-agent") or "Mozilla/5.0"
+
            # Step 1: Fetch main page
            logger.info(f"Fetching main page: {url}")
-            main_response = await self._make_request(url, timeout=15)
+            main_response = await self._make_request(
+                url,
+                headers={
+                    "Referer": source_referer,
+                    "Origin": source_origin,
+                    "User-Agent": user_agent,
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                    "Accept-Language": "en-US,en;q=0.9,it;q=0.8",
+                    "Cache-Control": "no-cache",
+                },
+                timeout=15,
+            )
            main_html = main_response.text
+            parsed_main = urlparse(main_response.url)
+            main_origin = f"{parsed_main.scheme}://{parsed_main.netloc}"

-            # Extract first iframe
-            iframe_match = re.search(r'<iframe\s+src=["\']([^"\']+)["\']', main_html, re.IGNORECASE)
-            if not iframe_match:
-                raise ExtractorError("No iframe found on the page")
+            # Extract first iframe (src can appear in any attribute order)
+            iframe_match = re.search(r'<iframe[^>]+(?<!data-)src=["\']([^"\']+)["\']', main_html, re.IGNORECASE)
+            iframe_url = main_response.url
+            iframe_html = main_html

-            iframe_url = iframe_match.group(1)
+            if iframe_match:
+                iframe_url = self._normalize_stream_url(iframe_match.group(1), main_response.url)
+                logger.info(f"Found iframe URL: {iframe_url}")

-            # Normalize iframe URL
-            if iframe_url.startswith("//"):
-                iframe_url = "https:" + iframe_url
-            elif iframe_url.startswith("/"):
-                parsed_main = urlparse(url)
-                iframe_url = f"{parsed_main.scheme}://{parsed_main.netloc}{iframe_url}"
+                # Step 2: Fetch iframe with source page as referer
+                iframe_headers = {
+                    "Referer": main_response.url,
+                    "Origin": main_origin,
+                    "User-Agent": user_agent,
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                    "Accept-Language": "en-US,en;q=0.9,it;q=0.8",
+                    "Cache-Control": "no-cache",
+                }

-            logger.info(f"Found iframe URL: {iframe_url}")
+                iframe_response = await self._make_request(iframe_url, headers=iframe_headers, timeout=15)
+                iframe_html = iframe_response.text
+                iframe_url = iframe_response.url
+                logger.debug(f"Iframe HTML length: {len(iframe_html)}")
+            else:
+                logger.warning("No iframe found on page, attempting extraction from main HTML")

-            # Step 2: Fetch iframe with Referer
-            iframe_headers = {
-                "Referer": "https://sportzonline.st/",
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
-                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-                "Accept-Language": "en-US,en;q=0.9,it;q=0.8",
-                "Cache-Control": "no-cache",
+            parsed_iframe = urlparse(iframe_url)
+            playback_headers = {
+                "Referer": iframe_url,
+                "Origin": f"{parsed_iframe.scheme}://{parsed_iframe.netloc}",
+                "User-Agent": user_agent,
            }

-            iframe_response = await self._make_request(iframe_url, headers=iframe_headers, timeout=15)
-            iframe_html = iframe_response.text
-
-            logger.debug(f"Iframe HTML length: {len(iframe_html)}")
-
            # Step 3: Detect packed blocks
            packed_blocks = self._detect_packed_blocks(iframe_html)

@@ -91,21 +160,19 @@ class SportsonlineExtractor(BaseExtractor):
            if not packed_blocks:
                logger.warning("No packed blocks found, trying direct m3u8 search")
                # Fallback: try direct m3u8 search
-                direct_match = re.search(r'(https?://[^\s"\'>]+\.m3u8[^\s"\'>]*)', iframe_html)
+                direct_match = self._extract_m3u8_candidate(iframe_html)
                if direct_match:
-                    m3u8_url = direct_match.group(1)
+                    m3u8_url = self._normalize_stream_url(direct_match, iframe_url)
                    logger.info(f"Found direct m3u8 URL: {m3u8_url}")

                    return {
                        "destination_url": m3u8_url,
-                        "request_headers": {"Referer": iframe_url, "User-Agent": iframe_headers["User-Agent"]},
+                        "request_headers": playback_headers,
                        "mediaflow_endpoint": self.mediaflow_endpoint,
                    }
                else:
                    raise ExtractorError("No packed blocks or direct m3u8 URL found")

-            logger.info(f"Found {len(packed_blocks)} packed blocks")
-
            # Choose block: if >=2 use second (index 1), else first (index 0)
            chosen_idx = 1 if len(packed_blocks) > 1 else 0
            m3u8_url = None
@@ -123,22 +190,7 @@ class SportsonlineExtractor(BaseExtractor):

            # Search for var src="...m3u8" with multiple patterns
            if unpacked_code:
-                # Try multiple patterns as in the TypeScript version
-                patterns = [
-                    r'var\s+src\s*=\s*["\']([^"\']+)["\']',  # var src="..."
-                    r'src\s*=\s*["\']([^"\']+\.m3u8[^"\']*)["\']',  # src="...m3u8"
-                    r'file\s*:\s*["\']([^"\']+\.m3u8[^"\']*)["\']',  # file: "...m3u8"
-                    r'["\']([^"\']*https?://[^"\']+\.m3u8[^"\']*)["\']',  # any m3u8 URL
-                ]
-
-                for pattern in patterns:
-                    src_match = re.search(pattern, unpacked_code)
-                    if src_match:
-                        m3u8_url = src_match.group(1)
-                        # Verify it looks like a valid m3u8 URL
-                        if ".m3u8" in m3u8_url or "http" in m3u8_url:
-                            break
-                        m3u8_url = None
+                m3u8_url = self._extract_m3u8_candidate(unpacked_code)

            # If not found, try all other blocks
            if not m3u8_url:
@@ -148,36 +200,30 @@ class SportsonlineExtractor(BaseExtractor):
                        continue
                    try:
                        unpacked_code = unpack(block)
-                        # Use the same patterns as above
-                        for pattern in [
-                            r'var\s+src\s*=\s*["\']([^"\']+)["\']',
-                            r'src\s*=\s*["\']([^"\']+\.m3u8[^"\']*)["\']',
-                            r'file\s*:\s*["\']([^"\']+\.m3u8[^"\']*)["\']',
-                            r'["\']([^"\']*https?://[^"\']+\.m3u8[^"\']*)["\']',
-                        ]:
-                            src_match = re.search(pattern, unpacked_code)
-                            if src_match:
-                                test_url = src_match.group(1)
-                                if ".m3u8" in test_url or "http" in test_url:
-                                    m3u8_url = test_url
-                                    logger.info(f"Found m3u8 in block {i}")
-                                    break
-
+                        m3u8_url = self._extract_m3u8_candidate(unpacked_code)
                        if m3u8_url:
+                            logger.info(f"Found m3u8 in block {i}")
                            break
                    except Exception as e:
                        logger.debug(f"Failed to process block {i}: {e}")
                        continue

+            if not m3u8_url:
+                fallback_candidate = self._extract_m3u8_candidate(iframe_html)
+                if fallback_candidate:
+                    m3u8_url = fallback_candidate
+
            if not m3u8_url:
                raise ExtractorError("Could not extract m3u8 URL from packed code")

+            m3u8_url = self._normalize_stream_url(m3u8_url, iframe_url)
+
            logger.info(f"Successfully extracted m3u8 URL: {m3u8_url}")

            # Return stream configuration
            return {
                "destination_url": m3u8_url,
-                "request_headers": {"Referer": iframe_url, "User-Agent": iframe_headers["User-Agent"]},
+                "request_headers": playback_headers,
                "mediaflow_endpoint": self.mediaflow_endpoint,
            }