Files
UnHided/mediaflow_proxy/extractors/maxstream.py
T
UrloMythus bd208c63ff new version
2026-05-19 20:28:26 +02:00

656 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Maxstream URL extractor — full uprot bypass pipeline.
Solves the problem of `uprot.net` redirects on `/msf/`, `/msfi/` and
`/msfld/` paths used by Italian aggregators (CB01, EuroStreaming, etc).
Key features:
1. TLS-fingerprint-resistant fetch via curl_cffi (chrome131 impersonation)
2. 4-digit captcha solver with multi-engine OCR ensemble:
ddddocr (primary) → tesseract (fallback) → CF Workers AI (3rd, opt-in)
3. Honeypot URL filtering on the post-captcha page
4. uprots/uprotem → maxstream redirect chain follow with cookie continuity
5. /msfld/ folder picker (season + episode kwargs from MFP route)
6. Optional persistent URL cache (when paired with services/uprot_warmer.py)
All advanced features are guarded by lazy imports — if `curl_cffi`,
`pytesseract`, `Pillow` or `ddddocr` are not installed the extractor
falls back to the previous behaviour for `/msf/` URLs and skips
`/msfld/` cleanly.
Activation of the optional CF Workers AI OCR backend (environment variables):
CF_WORKER_OCR_URL e.g. https://easyproxy-ocr.user.workers.dev
CF_WORKER_OCR_AUTH Worker AUTH_TOKEN
Credits: pipeline ported from NelloStream
(https://github.com/vitouchiha/nello-stream) — `workers/cfworker.js`
functions `_uprotBypassWithCookies`, `_extractMaxstreamVideo`,
`_aiOcrDigits`, `_handleScheduledUprotRefresh`. All credit to Nello.
"""
import asyncio
import logging
import os
import re
from typing import Any, Dict, Optional
from urllib.parse import urljoin, urlparse, urlencode
from bs4 import BeautifulSoup
from mediaflow_proxy.extractors.base import BaseExtractor, ExtractorError
logger = logging.getLogger(__name__)
class MaxstreamExtractor(BaseExtractor):
"""Maxstream URL extractor with full uprot bypass pipeline."""
def __init__(self, *args, **kwargs):
    """Initialise the base extractor, then the per-instance uprot state."""
    super().__init__(*args, **kwargs)
    # Body of the most recent captcha-submit response. It is written by
    # _solve_uprot_captcha_once and read back by _solve_uprot_captcha so a
    # retry can work on the page returned by the previous submit.
    self._last_solve_text: Optional[str] = None
    # Cookie jar shared across the uprot → maxstream redirect chain.
    # PHPSESSID + captcha hash + uprot_session must travel together for
    # the post-captcha redirect to be honoured by the maxstream WAF.
    self.cookies: Dict[str, str] = {}
    self.mediaflow_endpoint = "hls_manifest_proxy"
# ───────────────────────── HTTP layer ──────────────────────────────
async def _curl_cffi_fetch(
    self,
    url: str,
    method: str = "GET",
    data: Optional[Any] = None,
    headers: Optional[Dict[str, str]] = None,
    allow_redirects: bool = True,
    timeout: int = 30,
) -> Optional[Dict[str, Any]]:
    """Run one HTTP request through curl_cffi with Chrome impersonation.

    The request uses ``impersonate="chrome131"`` so that uprot.net, which
    (per the original author's notes) inspects TLS fingerprints, sees a
    browser-like handshake. The blocking curl_cffi call is executed in the
    default executor so the event loop is not blocked.

    Args:
        url: Target URL.
        method: HTTP verb passed straight to curl_cffi.
        data: Optional request body. For a POST with a ``str``/``bytes``
            body a form-urlencoded content-type is supplied unless the
            caller already set one.
        headers: Extra headers layered on top of ``self.base_headers``.
        allow_redirects: Forwarded to curl_cffi.
        timeout: Forwarded to curl_cffi.

    Returns:
        ``None`` when curl_cffi is not importable. Otherwise a dict with
        the keys ``ok``, ``status``, ``text``, ``content``, ``url``,
        ``headers`` and ``cookies``; when the request itself raised, the
        dict carries ``ok=False``, ``status=0`` and an ``error`` message.
        Any cookies in the result are merged into ``self.cookies``.
    """
    try:
        from curl_cffi import requests as cffi_requests
    except ImportError:
        logger.debug("curl_cffi not installed — uprot bypass disabled")
        return None

    # Caller-supplied headers win over the extractor defaults.
    request_headers = {**self.base_headers, **(headers or {})}
    sends_raw_body = method.upper() == "POST" and isinstance(data, (str, bytes))
    if sends_raw_body and "content-type" not in request_headers:
        request_headers["content-type"] = "application/x-www-form-urlencoded"

    proxy = self._get_proxy(url)
    proxies_arg = None if not proxy else {"http": proxy, "https": proxy}

    def _blocking_request():
        try:
            # Snapshot the jar at call time so the request carries every
            # cookie collected on earlier hops.
            jar_snapshot = dict(self.cookies) if self.cookies else None
            response = cffi_requests.request(
                method,
                url,
                headers=request_headers,
                data=data,
                cookies=jar_snapshot,
                proxies=proxies_arg,
                impersonate="chrome131",
                timeout=timeout,
                allow_redirects=allow_redirects,
            )
            try:
                received = {c.name: c.value for c in response.cookies.jar}
            except Exception:
                # Fall back to the mapping view if the jar is not iterable
                # in the expected way.
                received = dict(response.cookies) if response.cookies else {}
            return {
                "ok": response.status_code < 400,
                "status": response.status_code,
                "text": response.text,
                "content": response.content,
                "url": str(response.url),
                "headers": dict(response.headers),
                "cookies": received,
            }
        except Exception as exc:
            # Transport failures are reported in-band rather than raised.
            return {
                "ok": False,
                "status": 0,
                "text": "",
                "content": b"",
                "url": url,
                "headers": {},
                "cookies": {},
                "error": str(exc),
            }

    outcome = await asyncio.get_running_loop().run_in_executor(None, _blocking_request)
    fresh_cookies = outcome.get("cookies")
    if fresh_cookies:
        self.cookies.update(fresh_cookies)
    return outcome
# ─────────────────────── Honeypot filter ───────────────────────────
@staticmethod
def _strip_uprot_honeypots(html: str) -> str:
"""Remove uprot's anti-bot honeypot blocks before URL extraction.
The post-captcha success page intentionally hides decoy URLs in:
1. HTML comments (<!-- … -->)
2. <div style="display:none">…</div> blocks containing fake
"Continue" buttons that point to placeholder URLs like
`maxstream.video/uprots/123456789012` (12 sequential digits).
A naive regex grabs the FIRST match (the honeypot). Strip both
before parsing so the regex/BS4 see only the visible-to-user DOM.
"""
no_comments = re.sub(r"<!--[\s\S]*?-->", "", html)
no_hidden = re.sub(
r"<div[^>]*style=[\"'][^\"']*display\s*:\s*none[^\"']*[\"'][^>]*>[\s\S]*?</div>",
"",
no_comments,
flags=re.IGNORECASE,
)
return no_hidden
# ─────────────────────── Redirect parser ───────────────────────────
def _parse_uprot_html(self, text: str) -> Optional[str]:
"""Parse a uprot success page and return the next-hop URL.
Strategy mirrored from NelloStream `_uprotBypassWithCookies`:
1. Strip honeypot blocks first
2. Prefer explicit `id="buttok"` CONTINUE button (uprot marker)
3. Fallback: <a><button>Continue</button></a> (case+spacing tolerant)
4. Last resort: a `/uprots/` or `/uprotem/` URL appearing exactly
once in the cleaned HTML (uprot scatters multiple decoys)
5. Generic stayonline.pro / maxstream.video regex with honeypot
literal filter
6. window.location / meta refresh / BS4 button fallbacks
"""
cleaned = self._strip_uprot_honeypots(text).replace("\\/", "/")
def _valid(c):
if not c:
return None
try:
p = urlparse(c)
if p.netloc and "maxstream.video" in p.netloc and p.path.startswith("/cdn-cgi/"):
return None
except Exception:
pass
return c
# 1. id="buttok" CONTINUE button
m = re.search(
r'href=["\'](https?://[^"\']+)["\'][^>]*>\s*<button[^>]*id=["\']buttok["\'][^>]*>\s*C\s*O\s*N\s*T\s*I\s*N\s*U\s*E',
cleaned,
re.IGNORECASE,
)
if m and _valid(m.group(1)):
return m.group(1)
# 2. Generic <a><button>Continue</button></a>
m = re.search(
r'href=["\'](https?://[^"\']+)["\'][^>]*>\s*<button[^>]*>\s*[Cc]\s*[Oo]\s*[Nn]\s*[Tt]\s*[Ii]\s*[Nn]\s*[Uu]\s*[Ee]',
cleaned,
)
if m and _valid(m.group(1)):
return m.group(1)
# 3. Unique uprots/uprotem URL
all_uprots = re.findall(
r'href=["\'](https?://[^"\']*uprot(?:s|em)/[^"\']+)["\']',
cleaned,
re.IGNORECASE,
)
if all_uprots:
counts: Dict[str, int] = {}
for u in all_uprots:
counts[u] = counts.get(u, 0) + 1
unique = [u for u, c in counts.items() if c == 1]
if unique and _valid(unique[0]):
return unique[0]
# 4. Generic stayonline / maxstream regex
m = re.search(
r'https?://(?:www\.)?(?:stayonline\.pro|maxstream\.video)[^"\'\s<>\\ ]+',
cleaned,
)
if m and "/uprots/123456789012" not in m.group(0) and _valid(m.group(0)):
return m.group(0)
# 5. window.location / meta refresh
m = re.search(r'window\.location(?:\.href)?\s*=\s*["\']([^"\']+)["\']', cleaned)
if m and _valid(m.group(1)):
return m.group(1)
m = re.search(r'content=["\']0;\s*url=([^"\']+)["\']', cleaned, re.I)
if m and _valid(m.group(1)):
return m.group(1)
# 6. BS4 buttons / forms (rare paths)
soup = BeautifulSoup(cleaned, "lxml")
for btn in soup.find_all(["a", "button"]):
t = btn.get_text().strip().lower()
if "continue" in t or "continua" in t or "vai al" in t:
href = btn.get("href")
if not href and btn.parent and btn.parent.name == "a":
href = btn.parent.get("href")
if href and "uprot.net" not in href and _valid(href):
return href
return None
def _parse_uprot_folder(self, text: str, season, episode) -> Optional[str]:
"""Parse a /msfld/ folder HTML and return the /msfi/ link for S{ss}E{ee}."""
try:
s_int = int(season)
e_int = int(episode)
except (TypeError, ValueError):
return None
s_pad = f"{s_int:02d}"
e_pad = f"{e_int:02d}"
patterns = [
rf"S{s_pad}E{e_pad}",
rf"\b0*{s_int}x0*{e_int}\b",
rf"\b0*{s_int}&#215;0*{e_int}\b",
rf"\b0*{s_int}×0*{e_int}\b",
]
for pat in patterns:
m = re.search(
rf"{pat}[\s\S]{{0,500}}?href=['\"]([^'\"]+/msfi/[^'\"]+)['\"]",
text,
re.I,
)
if m:
return m.group(1)
return None
# ─────────────────────── OCR backends ──────────────────────────────
@staticmethod
def _preprocess_captcha_png(img_bytes: bytes) -> bytes:
"""Binarize + denoise the captcha PNG to boost ddddocr accuracy."""
try:
from PIL import Image, ImageFilter
import io
img = Image.open(io.BytesIO(img_bytes)).convert("L")
img = img.point(lambda p: 255 if p >= 140 else 0, mode="L")
img = img.filter(ImageFilter.MaxFilter(3))
img = img.filter(ImageFilter.MinFilter(3))
out = io.BytesIO()
img.save(out, format="PNG")
return out.getvalue()
except Exception:
return img_bytes
@staticmethod
def _tesseract_classify(img_bytes: bytes) -> str:
try:
import pytesseract
from PIL import Image, ImageFilter
import io
img = Image.open(io.BytesIO(img_bytes)).convert("L")
img = img.point(lambda p: 255 if p >= 140 else 0, mode="L")
img = img.filter(ImageFilter.MaxFilter(3))
img = img.filter(ImageFilter.MinFilter(3))
return pytesseract.image_to_string(img, config="--psm 7 -c tessedit_char_whitelist=0123456789").strip()
except Exception:
return ""
@staticmethod
async def _cf_worker_ocr(img_bytes: bytes, expected_digits: int = 4) -> str:
"""Optional 3rd OCR backend: Cloudflare Workers AI vision LLM.
ddddocr + tesseract top out at ~50-65% on uprot's noisy captcha.
A vision LLM (Llama 4 Scout / Gemma 3 / LLaVA) gets ~80-90%.
POSTs the captcha PNG to a user-deployed CF Worker (see
docs/MAXSTREAM_UPROT.md for setup).
Activated only when both env vars are set:
CF_WORKER_OCR_URL
CF_WORKER_OCR_AUTH
Returns "" on any failure — caller falls through gracefully.
"""
base = (os.getenv("CF_WORKER_OCR_URL") or "").strip().rstrip("/")
if not base:
return ""
auth = (os.getenv("CF_WORKER_OCR_AUTH") or "").strip()
try:
import aiohttp
headers = {"content-type": "image/png"}
if auth:
headers["x-worker-auth"] = auth
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=20)) as s:
async with s.post(
f"{base}/?ocr=1&digits={expected_digits}",
data=img_bytes,
headers=headers,
) as resp:
if resp.status != 200:
return ""
data = await resp.json()
return (data.get("digits") or "").strip()
except Exception as e:
logger.debug(f"CF Worker OCR failed: {e}")
return ""
# ─────────────────── Captcha solver loop ───────────────────────────
async def _solve_uprot_captcha_once(self, text: str, original_url: str, preprocess: bool = False) -> Optional[str]:
    """Make one attempt at solving the captcha on a uprot page.

    Steps: locate the captcha image and the form, obtain the image bytes
    (inline ``data:`` URI or a fetch through ``_curl_cffi_fetch``), OCR
    them, POST the answer together with the form's other named fields,
    and parse the reply with ``_parse_uprot_html``.

    OCR engines are tried in order until one yields 3 or 4 digits:
    ddddocr, then tesseract, then the optional CF Worker backend.

    Args:
        text: HTML of the page carrying the captcha.
        original_url: URL that page was fetched from. It is used as the
            base for relative URLs, as the default form target and as the
            Referer of the submit.
        preprocess: When true, the image is run through
            ``_preprocess_captcha_png`` before OCR.

    Returns:
        The next-hop URL parsed from the submit response, or ``None`` if
        ddddocr is unavailable, no captcha image is found, the image
        cannot be obtained, no engine yields 3-4 digits, or the response
        contains no redirect link. Whenever a submit was made, its
        response text is stored in ``self._last_solve_text``.
    """
    try:
        import ddddocr
    except ImportError:
        logger.debug("ddddocr not installed — skipping captcha solve")
        return None

    soup = BeautifulSoup(text, "lxml")
    img_tag = soup.find("img", src=re.compile(r"data:image/|/captcha|/image/|captcha\.php"))
    img_url = img_tag.get("src") if img_tag else None
    if not img_url:
        m = re.search(
            r'<img[^>]+src=["\']([^"\']*(?:data:image/|captcha|image)[^"\']*)["\']',
            text,
        )
        img_url = m.group(1) if m else None
    if not img_url:
        return None

    form = soup.find("form")
    form_action = ((form.get("action") or "") if form else "").strip()
    if not form_action or form_action == "#":
        form_action = original_url
    else:
        # urljoin copes with absolute, root-relative, document-relative and
        # protocol-relative ("//host/path") actions alike.
        form_action = urljoin(original_url, form_action)

    # Obtain the captcha image bytes.
    if img_url.startswith("data:"):
        try:
            import base64

            _, b64 = img_url.split(",", 1)
            img_data = base64.b64decode(b64)
        except ValueError:
            # No comma in the data URI, or an invalid base64 payload.
            return None
    else:
        res = await self._curl_cffi_fetch(urljoin(original_url, img_url))
        if not res or not res.get("ok"):
            return None
        img_data = res.get("content") or b""
    if not img_data:
        # Nothing to OCR; an empty buffer would only make the engines fail.
        return None

    ocr_input = self._preprocess_captcha_png(img_data) if preprocess else img_data

    def _digits(value) -> str:
        return "".join(c for c in str(value) if c.isdigit())

    def _ok(n):
        # Accept 3-or-4 digit answers (uprot uses 4 today; legacy 3 still seen)
        return 3 <= n <= 4

    if not hasattr(self, "_ocr_engine"):
        # Built lazily and kept on the instance for later attempts.
        self._ocr_engine = ddddocr.DdddOcr(show_ad=False)
    try:
        res_digits = _digits(self._ocr_engine.classification(ocr_input))
    except Exception as e:
        # A primary-engine failure must not abort the solve: fall through
        # to the other engines instead.
        logger.debug("ddddocr classification failed: %s", e)
        res_digits = ""

    if not _ok(len(res_digits)):
        tess_digits = _digits(self._tesseract_classify(ocr_input))
        if _ok(len(tess_digits)):
            res_digits = tess_digits
        else:
            cf_digits = _digits(await self._cf_worker_ocr(ocr_input, expected_digits=4))
            if _ok(len(cf_digits)):
                res_digits = cf_digits
            else:
                return None

    # Work out the name of the answer field.
    captcha_input = soup.find("input", {"name": re.compile(r"captcha|code|val", re.I)})
    if captcha_input and captcha_input.get("name"):
        field_name = captcha_input["name"]
    else:
        # Capture the WHOLE attribute value: the server expects the field
        # name exactly as declared (e.g. "captch5_x"), not just its prefix.
        m = re.search(r'name=["\']((?:captcha|code|val|captch5)[^"\']*)["\']', text, re.I)
        field_name = m.group(1) if m else "captcha"

    post_data = {field_name: res_digits}
    if form:
        # Carry every other named control (hidden tokens etc.) unchanged.
        for inp in form.find_all(["input", "button", "select"]):
            n = inp.get("name")
            v = inp.get("value", "")
            if n and n not in post_data:
                post_data[n] = v

    headers = {**self.base_headers, "referer": original_url}
    result = await self._curl_cffi_fetch(form_action, method="POST", data=urlencode(post_data), headers=headers)
    if not result:
        return None
    solved_text = result.get("text") or ""
    self._last_solve_text = solved_text if isinstance(solved_text, str) else None
    return self._parse_uprot_html(solved_text)
async def _solve_uprot_captcha(self, text: str, original_url: str, max_attempts: int = 4) -> Optional[str]:
"""Solve the captcha with retries on fresh images.
Each wrong submit triggers uprot to serve a brand-new captcha
image; we feed that fresh page back into the next attempt instead
of OCRing the same image with different preprocessing.
"""
current = text
for attempt in range(1, max_attempts + 1):
preprocess = attempt % 2 == 0
result = await self._solve_uprot_captcha_once(current, original_url, preprocess=preprocess)
if result:
return result
new_text = self._last_solve_text
if new_text and new_text != current:
current = new_text
return None
# ──────────────────── Redirect chain ───────────────────────────────
async def _follow_uprots_chain(self, url: str, max_hops: int = 10) -> str:
"""Walk the uprots/uprotem → maxstream redirect chain manually.
After captcha, the URL we extract is usually
`maxstream.video/uprots/<token>` whose WAF only honours the token
when reached via the proper redirect chain (Referer + cookie
continuity from uprot.net). Direct GET → Error 131.
Walks hop-by-hop preserving cookies until landing on
`maxsun{N}.online/watchfree/...` or `maxstream.video/emvvv/<id>`,
then converts watchfree → emvvv so the existing packer extraction
works.
"""
if "/uprots/" not in url and "/uprotem/" not in url:
return url
current = url
for _ in range(max_hops):
res = await self._curl_cffi_fetch(
current,
headers={**self.base_headers, "referer": "https://uprot.net/"},
allow_redirects=False,
timeout=15,
)
if not res:
break
loc = (res.get("headers") or {}).get("location") or (res.get("headers") or {}).get("Location")
if not loc:
current = res.get("url") or current
break
current = urljoin(current, loc)
if "/uprots/" not in current and "/uprotem/" not in current:
break
if "watchfree/" in current:
try:
tail = current.split("watchfree/", 1)[1]
segments = [s for s in tail.split("/") if s]
if len(segments) >= 2:
current = f"https://maxstream.video/emvvv/{segments[1]}"
except Exception:
pass
return current
# ─────────────────────── Public flow ───────────────────────────────
async def get_uprot(self, link: str, season=None, episode=None) -> str:
    """Resolve a uprot URL to its maxstream destination.

    Supports:
    - /msf/{id} single movie (legacy alias /mse/)
    - /msfi/{id} single episode
    - /msfld/{id} folder of episodes (requires season + episode)

    Args:
        link: The uprot URL.
        season: Season number, required for ``/msfld/`` links.
        episode: Episode number, required for ``/msfld/`` links.

    Returns:
        The next-hop URL, taken either directly from the page or from the
        response to a solved captcha.

    Raises:
        ExtractorError: if a folder link lacks season/episode, the episode
            is not listed in the folder, or no redirect link can be
            obtained from the page.
    """

    async def _page_text(target: str) -> str:
        # Try curl_cffi first; fall back to BaseExtractor._make_request when
        # it is not installed or the response was not OK.
        cffi = await self._curl_cffi_fetch(target)
        if cffi and cffi.get("ok"):
            return cffi["text"]
        response = await self._make_request(target)
        return response.text

    # Map only the modern /msf/ single-video path to its legacy /mse/
    # alias. A naive str.replace("msf", "mse") corrupts /msfld/ into
    # /mseld/ (404) and /msfi/ into /msei/ (deprecated 500 on new IDs).
    link = link.replace("/msf/", "/mse/")

    is_folder = "/msfld/" in link
    if is_folder and (season is None or episode is None):
        # Fail before spending a request on a folder we cannot pick from.
        raise ExtractorError("msfld folder URL requires 'season' and 'episode' parameters")

    text = await _page_text(link)

    if is_folder:
        episode_link = self._parse_uprot_folder(text, season, episode)
        if not episode_link:
            raise ExtractorError(f"Episode S{season}E{episode} not found in msfld folder")
        # The href is returned exactly as written in the page; resolve it
        # against the folder URL so relative and protocol-relative links
        # are fetchable. Absolute links pass through unchanged.
        link = urljoin(link, episode_link)
        text = await _page_text(link)

    # 1. Direct parse — works on legacy uprot pages without captcha
    res = self._parse_uprot_html(text)
    if res:
        return res

    # 2. Captcha solver
    res = await self._solve_uprot_captcha(text, link)
    if res:
        return res

    raise ExtractorError("Redirect link not found in uprot page")
async def extract(self, url: str, **kwargs) -> Dict[str, Any]:
    """Resolve *url* to a playable Maxstream HLS URL.

    For /msfld/ folder URLs, callers must pass ``season`` and ``episode``
    (read from ``kwargs``).

    Flow:
    1. If ``mediaflow_proxy.services.uprot_url_cache`` is importable and
       holds an entry, use it; otherwise run ``get_uprot`` followed by
       ``_follow_uprots_chain``.
    2. Fetch the maxstream embed page (curl_cffi first, then
       ``_make_request``).
    3. On a cache miss, store the resolved URL in the cache (best effort).
    4. Return the ``sources: [{src: "..."}]`` URL if the page has one;
       otherwise rebuild the master playlist URL from the packed script's
       term table.

    Returns:
        Dict with ``destination_url``, ``request_headers`` and
        ``mediaflow_endpoint``.

    Raises:
        ExtractorError: if no packed script is found or its term table
            lacks ``urlset``, ``hls`` or ``sources``.
    """
    season = kwargs.get("season")
    episode = kwargs.get("episode")

    # The cache module is optional: any failure just means "no cache".
    cached = None
    try:
        from mediaflow_proxy.services import uprot_url_cache  # type: ignore

        cached = uprot_url_cache.get(url, season=season, episode=episode)
    except Exception:
        pass

    if cached:
        logger.debug(f"uprot cache HIT: {url[:80]}")
        maxstream_url = cached
    else:
        resolved = await self.get_uprot(url, season=season, episode=episode)
        maxstream_url = await self._follow_uprots_chain(resolved)

    # Fetch the maxstream embed page
    embed = await self._curl_cffi_fetch(
        maxstream_url,
        headers={**self.base_headers, "referer": "https://uprot.net/", "accept-language": "en-US,en;q=0.5"},
    )
    if embed and embed.get("ok"):
        text = embed["text"]
    else:
        fallback = await self._make_request(maxstream_url, headers={"accept-language": "en-US,en;q=0.5"})
        text = fallback.text

    if not cached:
        try:
            from mediaflow_proxy.services import uprot_url_cache  # type: ignore

            uprot_url_cache.put(url, maxstream_url, season=season, episode=episode)
        except Exception:
            pass

    # Direct sources check
    direct = re.search(r'sources:\s*\[\{src:\s*"([^"]+)"', text)
    if direct:
        return {
            "destination_url": direct.group(1),
            "request_headers": {**self.base_headers, "referer": maxstream_url},
            "mediaflow_endpoint": self.mediaflow_endpoint,
        }

    # Packer fallback: greedy single-line form first, then the explicit
    # eval(function(p,a,c,k,e,d)...) form across lines.
    packed = re.search(r"\}\('(.+)',.+,'(.+)'\.split", text) or re.search(
        r"eval\(function\(p,a,c,k,e,d\).+?\}\('(.+?)',.+?,'(.+?)'\.split", text, re.S
    )
    if not packed:
        raise ExtractorError("Failed to extract URL components")

    terms = packed.group(2).split("|")
    try:
        urlset_index, hls_index, sources_index = [terms.index(key) for key in ("urlset", "hls", "sources")]
    except ValueError as e:
        raise ExtractorError(f"Missing components in packer: {e}")

    # Terms between the markers are consumed in reverse order.
    stream_ids = terms[urlset_index + 1 : hls_index][::-1]
    host_terms = terms[hls_index + 1 : sources_index][::-1]

    # Host label: terms containing "0" are glued to what follows, every
    # other term is followed by a hyphen; trailing hyphens are dropped.
    host_label = "".join(term if "0" in term else term + "-" for term in host_terms)
    base_url = f"https://{host_label.rstrip('-')}.host-cdn.net/hls/"

    if len(stream_ids) == 1:
        final_url = f"{base_url},{stream_ids[0]}.urlset/master.m3u8"
    else:
        joined = base_url + "".join(f"{stream_id}," for stream_id in stream_ids)
        final_url = joined.rstrip(",") + ".urlset/master.m3u8"

    # The packer branch reports the caller's original URL as the referer
    # and returns the (mutated) shared header dict itself.
    self.base_headers["referer"] = url
    return {
        "destination_url": final_url,
        "request_headers": self.base_headers,
        "mediaflow_endpoint": self.mediaflow_endpoint,
    }