mirror of
https://github.com/UrloMythus/UnHided.git
synced 2026-06-10 09:10:23 +00:00
new version
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import re
|
||||
import logging
|
||||
from typing import Any, Dict
|
||||
from urllib.parse import urlparse
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
from mediaflow_proxy.extractors.base import BaseExtractor, ExtractorError
|
||||
from mediaflow_proxy.utils.packed import unpack
|
||||
@@ -14,7 +14,7 @@ class SportsonlineExtractor(BaseExtractor):
|
||||
|
||||
Strategy:
|
||||
1. Fetch page -> find first <iframe src="...">
|
||||
2. Fetch iframe with Referer=https://sportzonline.st/
|
||||
2. Fetch iframe with dynamic source-page Referer/Origin
|
||||
3. Collect packed eval blocks; if >=2 use second (index 1) else first.
|
||||
4. Unpack P.A.C.K.E.R. and search var src="...m3u8".
|
||||
5. Return final m3u8 with referer header.
|
||||
@@ -33,56 +33,125 @@ class SportsonlineExtractor(BaseExtractor):
|
||||
"""
|
||||
Detect and extract packed eval blocks from HTML.
|
||||
"""
|
||||
# Find all eval(function...) blocks - more greedy to capture full packed code
|
||||
pattern = re.compile(r"eval\(function\(p,a,c,k,e,.*?\)\)(?:\s*;|\s*<)", re.DOTALL)
|
||||
raw_matches = pattern.findall(html)
|
||||
raw_matches: list[str] = []
|
||||
strict_eval_pattern = re.compile(r"eval\(function\(p,a,c,k,e,.*?\}\(.*?\)\)", re.DOTALL)
|
||||
relaxed_eval_pattern = re.compile(r"eval\(function\(p,a,c,k,e,[dr]\).*?\}\(.*?\)\)", re.DOTALL)
|
||||
|
||||
# Prefer script-body extraction first. This is more resilient when the packed
|
||||
# code has nested parentheses/semicolons that are hard to capture with a
|
||||
# single regex.
|
||||
script_pattern = re.compile(r"<script[^>]*>(.*?)</script>", re.IGNORECASE | re.DOTALL)
|
||||
for script_body in script_pattern.findall(html):
|
||||
if "eval(function(p,a,c,k,e" in script_body:
|
||||
strict_matches = strict_eval_pattern.findall(script_body)
|
||||
if strict_matches:
|
||||
raw_matches.extend(strict_matches)
|
||||
continue
|
||||
|
||||
relaxed_matches = relaxed_eval_pattern.findall(script_body)
|
||||
if relaxed_matches:
|
||||
raw_matches.extend(relaxed_matches)
|
||||
|
||||
if raw_matches:
|
||||
return raw_matches
|
||||
|
||||
# Fallback: direct eval(...) extraction from raw HTML.
|
||||
raw_matches = strict_eval_pattern.findall(html)
|
||||
|
||||
# If no matches with the strict pattern, try a more relaxed one
|
||||
if not raw_matches:
|
||||
# Try to find eval(function and capture until we find the closing ))
|
||||
pattern = re.compile(r"eval\(function\(p,a,c,k,e,[dr]\).*?\}\(.*?\)\)", re.DOTALL)
|
||||
raw_matches = pattern.findall(html)
|
||||
raw_matches = relaxed_eval_pattern.findall(html)
|
||||
|
||||
return raw_matches
|
||||
|
||||
@staticmethod
|
||||
def _extract_m3u8_candidate(text: str) -> str | None:
|
||||
patterns = [
|
||||
r"var\s+src\s*=\s*[\"']([^\"']+\.m3u8[^\"']*)[\"']",
|
||||
r"src\s*=\s*[\"']([^\"']+\.m3u8[^\"']*)[\"']",
|
||||
r"file\s*:\s*[\"']([^\"']+\.m3u8[^\"']*)[\"']",
|
||||
r"[\"']([^\"']*https?://[^\"']+\.m3u8[^\"']*)[\"']",
|
||||
r"(https?://[^\s\"'>]+\.m3u8[^\s\"'>]*)",
|
||||
r"(//[^\s\"'>]+\.m3u8[^\s\"'>]*)",
|
||||
r"(/[^\s\"'>]+\.m3u8[^\s\"'>]*)",
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, text)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
def _normalize_stream_url(stream_url: str, base_url: str) -> str:
    """Turn a scraped URL fragment into an absolute URL.

    Cleans up artefacts left by regex scraping (surrounding whitespace and
    quotes, JavaScript-escaped slashes, HTML-encoded ampersands) and then
    resolves the result against *base_url*.

    Args:
        stream_url: Raw URL as captured from HTML or unpacked JavaScript.
            May be absolute, scheme-relative (``//host/...``) or relative.
        base_url: URL of the document the fragment was found in.

    Returns:
        An absolute URL. Scheme-relative input inherits the scheme of
        *base_url*, falling back to ``https`` when the base has none.
    """
    cleaned = stream_url.strip().strip("\"'").replace("\\/", "/")
    # URLs lifted from HTML attributes by regex still carry entity-encoded
    # ampersands. Only "&amp;" is decoded: html.unescape would also rewrite
    # semicolon-less legacy entities (e.g. "&para=") inside query strings.
    cleaned = cleaned.replace("&amp;", "&")

    if cleaned.startswith("//"):
        scheme = urlparse(base_url).scheme or "https"
        return f"{scheme}:{cleaned}"

    if not urlparse(cleaned).scheme:
        return urljoin(base_url, cleaned)

    return cleaned
|
||||
|
||||
async def extract(self, url: str, **kwargs) -> Dict[str, Any]:
|
||||
"""Main extraction flow: fetch page, extract iframe, unpack and find m3u8."""
|
||||
try:
|
||||
parsed_source = urlparse(url)
|
||||
source_origin = f"{parsed_source.scheme}://{parsed_source.netloc}"
|
||||
source_referer = self.base_headers.get("Referer") or self.base_headers.get("referer") or f"{source_origin}/"
|
||||
user_agent = self.base_headers.get("User-Agent") or self.base_headers.get("user-agent") or "Mozilla/5.0"
|
||||
|
||||
# Step 1: Fetch main page
|
||||
logger.info(f"Fetching main page: {url}")
|
||||
main_response = await self._make_request(url, timeout=15)
|
||||
main_response = await self._make_request(
|
||||
url,
|
||||
headers={
|
||||
"Referer": source_referer,
|
||||
"Origin": source_origin,
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9,it;q=0.8",
|
||||
"Cache-Control": "no-cache",
|
||||
},
|
||||
timeout=15,
|
||||
)
|
||||
main_html = main_response.text
|
||||
parsed_main = urlparse(main_response.url)
|
||||
main_origin = f"{parsed_main.scheme}://{parsed_main.netloc}"
|
||||
|
||||
# Extract first iframe
|
||||
iframe_match = re.search(r'<iframe\s+src=["\']([^"\']+)["\']', main_html, re.IGNORECASE)
|
||||
if not iframe_match:
|
||||
raise ExtractorError("No iframe found on the page")
|
||||
# Extract first iframe (src can appear in any attribute order)
|
||||
iframe_match = re.search(r'<iframe[^>]+(?<!data-)src=["\']([^"\']+)["\']', main_html, re.IGNORECASE)
|
||||
iframe_url = main_response.url
|
||||
iframe_html = main_html
|
||||
|
||||
iframe_url = iframe_match.group(1)
|
||||
if iframe_match:
|
||||
iframe_url = self._normalize_stream_url(iframe_match.group(1), main_response.url)
|
||||
logger.info(f"Found iframe URL: {iframe_url}")
|
||||
|
||||
# Normalize iframe URL
|
||||
if iframe_url.startswith("//"):
|
||||
iframe_url = "https:" + iframe_url
|
||||
elif iframe_url.startswith("/"):
|
||||
parsed_main = urlparse(url)
|
||||
iframe_url = f"{parsed_main.scheme}://{parsed_main.netloc}{iframe_url}"
|
||||
# Step 2: Fetch iframe with source page as referer
|
||||
iframe_headers = {
|
||||
"Referer": main_response.url,
|
||||
"Origin": main_origin,
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9,it;q=0.8",
|
||||
"Cache-Control": "no-cache",
|
||||
}
|
||||
|
||||
logger.info(f"Found iframe URL: {iframe_url}")
|
||||
iframe_response = await self._make_request(iframe_url, headers=iframe_headers, timeout=15)
|
||||
iframe_html = iframe_response.text
|
||||
iframe_url = iframe_response.url
|
||||
logger.debug(f"Iframe HTML length: {len(iframe_html)}")
|
||||
else:
|
||||
logger.warning("No iframe found on page, attempting extraction from main HTML")
|
||||
|
||||
# Step 2: Fetch iframe with Referer
|
||||
iframe_headers = {
|
||||
"Referer": "https://sportzonline.st/",
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9,it;q=0.8",
|
||||
"Cache-Control": "no-cache",
|
||||
parsed_iframe = urlparse(iframe_url)
|
||||
playback_headers = {
|
||||
"Referer": iframe_url,
|
||||
"Origin": f"{parsed_iframe.scheme}://{parsed_iframe.netloc}",
|
||||
"User-Agent": user_agent,
|
||||
}
|
||||
|
||||
iframe_response = await self._make_request(iframe_url, headers=iframe_headers, timeout=15)
|
||||
iframe_html = iframe_response.text
|
||||
|
||||
logger.debug(f"Iframe HTML length: {len(iframe_html)}")
|
||||
|
||||
# Step 3: Detect packed blocks
|
||||
packed_blocks = self._detect_packed_blocks(iframe_html)
|
||||
|
||||
@@ -91,21 +160,19 @@ class SportsonlineExtractor(BaseExtractor):
|
||||
if not packed_blocks:
|
||||
logger.warning("No packed blocks found, trying direct m3u8 search")
|
||||
# Fallback: try direct m3u8 search
|
||||
direct_match = re.search(r'(https?://[^\s"\'>]+\.m3u8[^\s"\'>]*)', iframe_html)
|
||||
direct_match = self._extract_m3u8_candidate(iframe_html)
|
||||
if direct_match:
|
||||
m3u8_url = direct_match.group(1)
|
||||
m3u8_url = self._normalize_stream_url(direct_match, iframe_url)
|
||||
logger.info(f"Found direct m3u8 URL: {m3u8_url}")
|
||||
|
||||
return {
|
||||
"destination_url": m3u8_url,
|
||||
"request_headers": {"Referer": iframe_url, "User-Agent": iframe_headers["User-Agent"]},
|
||||
"request_headers": playback_headers,
|
||||
"mediaflow_endpoint": self.mediaflow_endpoint,
|
||||
}
|
||||
else:
|
||||
raise ExtractorError("No packed blocks or direct m3u8 URL found")
|
||||
|
||||
logger.info(f"Found {len(packed_blocks)} packed blocks")
|
||||
|
||||
# Choose block: if >=2 use second (index 1), else first (index 0)
|
||||
chosen_idx = 1 if len(packed_blocks) > 1 else 0
|
||||
m3u8_url = None
|
||||
@@ -123,22 +190,7 @@ class SportsonlineExtractor(BaseExtractor):
|
||||
|
||||
# Search for var src="...m3u8" with multiple patterns
|
||||
if unpacked_code:
|
||||
# Try multiple patterns as in the TypeScript version
|
||||
patterns = [
|
||||
r'var\s+src\s*=\s*["\']([^"\']+)["\']', # var src="..."
|
||||
r'src\s*=\s*["\']([^"\']+\.m3u8[^"\']*)["\']', # src="...m3u8"
|
||||
r'file\s*:\s*["\']([^"\']+\.m3u8[^"\']*)["\']', # file: "...m3u8"
|
||||
r'["\']([^"\']*https?://[^"\']+\.m3u8[^"\']*)["\']', # any m3u8 URL
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
src_match = re.search(pattern, unpacked_code)
|
||||
if src_match:
|
||||
m3u8_url = src_match.group(1)
|
||||
# Verify it looks like a valid m3u8 URL
|
||||
if ".m3u8" in m3u8_url or "http" in m3u8_url:
|
||||
break
|
||||
m3u8_url = None
|
||||
m3u8_url = self._extract_m3u8_candidate(unpacked_code)
|
||||
|
||||
# If not found, try all other blocks
|
||||
if not m3u8_url:
|
||||
@@ -148,36 +200,30 @@ class SportsonlineExtractor(BaseExtractor):
|
||||
continue
|
||||
try:
|
||||
unpacked_code = unpack(block)
|
||||
# Use the same patterns as above
|
||||
for pattern in [
|
||||
r'var\s+src\s*=\s*["\']([^"\']+)["\']',
|
||||
r'src\s*=\s*["\']([^"\']+\.m3u8[^"\']*)["\']',
|
||||
r'file\s*:\s*["\']([^"\']+\.m3u8[^"\']*)["\']',
|
||||
r'["\']([^"\']*https?://[^"\']+\.m3u8[^"\']*)["\']',
|
||||
]:
|
||||
src_match = re.search(pattern, unpacked_code)
|
||||
if src_match:
|
||||
test_url = src_match.group(1)
|
||||
if ".m3u8" in test_url or "http" in test_url:
|
||||
m3u8_url = test_url
|
||||
logger.info(f"Found m3u8 in block {i}")
|
||||
break
|
||||
|
||||
m3u8_url = self._extract_m3u8_candidate(unpacked_code)
|
||||
if m3u8_url:
|
||||
logger.info(f"Found m3u8 in block {i}")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to process block {i}: {e}")
|
||||
continue
|
||||
|
||||
if not m3u8_url:
|
||||
fallback_candidate = self._extract_m3u8_candidate(iframe_html)
|
||||
if fallback_candidate:
|
||||
m3u8_url = fallback_candidate
|
||||
|
||||
if not m3u8_url:
|
||||
raise ExtractorError("Could not extract m3u8 URL from packed code")
|
||||
|
||||
m3u8_url = self._normalize_stream_url(m3u8_url, iframe_url)
|
||||
|
||||
logger.info(f"Successfully extracted m3u8 URL: {m3u8_url}")
|
||||
|
||||
# Return stream configuration
|
||||
return {
|
||||
"destination_url": m3u8_url,
|
||||
"request_headers": {"Referer": iframe_url, "User-Agent": iframe_headers["User-Agent"]},
|
||||
"request_headers": playback_headers,
|
||||
"mediaflow_endpoint": self.mediaflow_endpoint,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user