# NOTE(review): the following lines are web-UI scraping residue (file listing
# and commit metadata), not Python source; kept as a comment so the module
# remains importable.
# Files
# UnHided/mediaflow_proxy/remuxer/container_probe.py
# UrloMythus cfc6bbabc9 update
# 2026-02-19 20:15:03 +01:00
# 615 lines
# 22 KiB
# Python
"""
Container format probing -- MKV Cues and MP4 moov.
Pure Python probing using EBML parsing (MKV) and struct-based atom
scanning (MP4). No FFmpeg dependency.
Source-agnostic: accepts any MediaSource protocol implementation
(Telegram, HTTP, etc.) for byte-range reads.
Provides:
- probe_mkv_cues: probe MKV file to extract seek index (MKVCueIndex)
- probe_mp4_moov: probe MP4 file to extract moov atom and build seek index (MP4Index)
"""
import base64
import hashlib
import json
import logging
import struct
from mediaflow_proxy.utils import redis_utils
from mediaflow_proxy.remuxer.ebml_parser import (
MKVCueIndex,
build_cue_index,
parse_ebml_header,
parse_seek_head,
CUES,
INFO,
)
from mediaflow_proxy.remuxer.mp4_parser import (
MP4Index,
build_cue_points_from_moov,
is_mp4_header,
rewrite_moov_offsets,
)
logger = logging.getLogger(__name__)
# How much of the MKV header to fetch for SeekHead + Info parsing
_HEADER_PROBE_SIZE = 64 * 1024 # 64 KB
# Max Cues element size we'll attempt to fetch
_MAX_CUES_SIZE = 2 * 1024 * 1024 # 2 MB
# Redis cache for MKV Cue indexes
_CUE_INDEX_CACHE_PREFIX = "mfp:cue_index:"
_CUE_INDEX_CACHE_TTL = 3600 # 1 hour
# =============================================================================
# MKV Cues probing
# =============================================================================
def derive_cue_cache_key(
source_key: str = "",
*,
chat_id: str | int | None = None,
message_id: int | None = None,
file_id: str | None = None,
) -> str:
"""
Derive a deterministic cache key for a file's cue index.
Accepts either a pre-computed source_key (from MediaSource.cache_key)
or legacy Telegram-style parameters for backwards compatibility.
"""
if source_key:
return source_key
if file_id:
raw = f"file_id:{file_id}"
elif chat_id is not None and message_id is not None:
raw = f"chat:{chat_id}:msg:{message_id}"
else:
return ""
return hashlib.sha256(raw.encode()).hexdigest()[:16]
async def _get_cached_cue_index(cache_key: str) -> MKVCueIndex | None:
    """Look up a previously cached MKVCueIndex in Redis; None on any miss."""
    if not cache_key:
        return None
    redis = await redis_utils.get_redis()
    if redis is None:
        return None
    payload = await redis.get(f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}")
    if not payload:
        return None
    try:
        fields = json.loads(payload)
        # Binary fields are stored base64-encoded; empty string means absent.
        seek_header = base64.b64decode(fields["seek_header_b64"]) if fields.get("seek_header_b64") else b""
        codec_private = (
            base64.b64decode(fields["video_codec_private_b64"]) if fields.get("video_codec_private_b64") else b""
        )
        index = MKVCueIndex(
            duration_ms=fields["duration_ms"],
            timestamp_scale=fields["timestamp_scale"],
            # JSON turns tuples into lists; restore (timestamp, offset) pairs.
            cue_points=[(point[0], point[1]) for point in fields["cue_points"]],
            segment_data_offset=fields["segment_data_offset"],
            first_cluster_offset=fields.get("first_cluster_offset", 0),
            seek_header=seek_header,
            audio_codec_id=fields.get("audio_codec_id", ""),
            audio_bitrate=fields.get("audio_bitrate", 0),
            audio_channels=fields.get("audio_channels", 0),
            audio_sample_rate=fields.get("audio_sample_rate", 0.0),
            video_codec_id=fields.get("video_codec_id", ""),
            video_codec_private=codec_private,
            video_width=fields.get("video_width", 0),
            video_height=fields.get("video_height", 0),
            video_fps=fields.get("video_fps", 0.0),
            video_default_duration_ns=fields.get("video_default_duration_ns", 0),
        )
    except (KeyError, TypeError, json.JSONDecodeError) as exc:
        # Stale or malformed entry: treat as a cache miss rather than failing.
        logger.warning("[container_probe] Invalid cached cue index: %s", exc)
        return None
    logger.debug("[container_probe] Loaded cue index from cache: %s", cache_key)
    return index
async def _set_cached_cue_index(cache_key: str, index: MKVCueIndex) -> None:
    """Persist an MKVCueIndex to Redis with a TTL; no-op when Redis is unavailable."""
    if not cache_key:
        return
    redis = await redis_utils.get_redis()
    if redis is None:
        return
    # Binary fields travel as base64 text; empty string marks an absent blob.
    seek_header_b64 = base64.b64encode(index.seek_header).decode() if index.seek_header else ""
    codec_private_b64 = base64.b64encode(index.video_codec_private).decode() if index.video_codec_private else ""
    payload = json.dumps(
        {
            "duration_ms": index.duration_ms,
            "timestamp_scale": index.timestamp_scale,
            "cue_points": index.cue_points,
            "segment_data_offset": index.segment_data_offset,
            "first_cluster_offset": index.first_cluster_offset,
            "seek_header_b64": seek_header_b64,
            "audio_codec_id": index.audio_codec_id,
            "audio_bitrate": index.audio_bitrate,
            "audio_channels": index.audio_channels,
            "audio_sample_rate": index.audio_sample_rate,
            "video_codec_id": index.video_codec_id,
            "video_codec_private_b64": codec_private_b64,
            "video_width": index.video_width,
            "video_height": index.video_height,
            "video_fps": index.video_fps,
            "video_default_duration_ns": index.video_default_duration_ns,
        }
    )
    await redis.set(f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}", payload, ex=_CUE_INDEX_CACHE_TTL)
    logger.debug("[container_probe] Cached cue index: %s", cache_key)
async def probe_mkv_cues(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MKVCueIndex | None:
    """
    Probe an MKV file's EBML header and Cues to build a seek index.

    Pure Python -- parses EBML structures directly, no FFmpeg involved.
    Makes up to two small byte-range reads via the provided source:
    1. First ~64KB: EBML header + SeekHead + Info (skipped if header_data provided)
    2. Cues section: byte range from SeekHead's Cues position

    Args:
        source: A MediaSource protocol implementation, or any object with
            a ``stream(offset, limit)`` async generator method.
        file_size: Total file size in bytes. If 0, tries ``source.file_size``.
        cache_key: Optional cache key for Redis caching. If empty, tries
            ``source.cache_key``.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MKVCueIndex if successful, None if the file has no Cues or parsing fails.
    """
    # Resolve file_size and cache_key from source if not provided
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0)
    if not cache_key:
        cache_key = getattr(source, "cache_key", "")
    # Check cache first -- a hit skips all byte-range reads entirely.
    if cache_key:
        cached = await _get_cached_cue_index(cache_key)
        if cached:
            return cached
    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _HEADER_PROBE_SIZE
            header_data = b""
            # NOTE(review): assumes source.stream honors `limit` -- confirm
            # against the MediaSource implementations.
            async for chunk in source.stream(offset=0, limit=header_size):
                header_data += chunk
        # Too small to even contain an EBML header + SeekHead.
        if len(header_data) < 64:
            logger.warning("[container_probe] Header too small (%d bytes), cannot probe", len(header_data))
            return None
        # Step 2: Parse EBML header to find Segment data offset
        segment_data_offset = parse_ebml_header(header_data)
        # Step 3: Parse SeekHead to find Cues and Info positions
        seek_positions = parse_seek_head(header_data, segment_data_offset)
        if CUES not in seek_positions:
            logger.info("[container_probe] No Cues position in SeekHead, seeking not available")
            return None
        # SeekHead positions are relative to the Segment data start.
        cues_relative_offset = seek_positions[CUES]
        cues_absolute_offset = segment_data_offset + cues_relative_offset
        logger.info(
            "[container_probe] SeekHead: Cues at offset %d (absolute %d), Info at %s",
            cues_relative_offset,
            cues_absolute_offset,
            seek_positions.get(INFO, "not found"),
        )
        # Step 4: Fetch the Cues element (capped at _MAX_CUES_SIZE and at
        # the remaining bytes in the file when file_size is known).
        cues_max = file_size - cues_absolute_offset if file_size > 0 else _MAX_CUES_SIZE
        cues_fetch_size = min(_MAX_CUES_SIZE, cues_max)
        if cues_fetch_size <= 0:
            logger.warning("[container_probe] Cues offset %d beyond file size %d", cues_absolute_offset, file_size)
            return None
        cues_data = b""
        async for chunk in source.stream(offset=cues_absolute_offset, limit=cues_fetch_size):
            cues_data += chunk
        if len(cues_data) < 16:
            logger.warning("[container_probe] Cues data too small (%d bytes)", len(cues_data))
            return None
        # Step 5: Build the cue index
        index = build_cue_index(
            header_data=header_data,
            cues_data=cues_data,
            cues_file_offset=cues_absolute_offset,
            segment_data_offset=segment_data_offset,
        )
        # Cache the result for subsequent seeks on the same file.
        if cache_key:
            await _set_cached_cue_index(cache_key, index)
        return index
    except Exception as e:
        # Broad catch is deliberate: probing is best-effort and must never
        # break playback -- a failure just means seeking is unavailable.
        logger.warning("[container_probe] Failed to probe MKV cues: %s", e)
        return None
# =============================================================================
# MP4 Moov probing
# =============================================================================
# Redis cache for MP4 indexes
_MP4_INDEX_CACHE_PREFIX = "mfp:mp4_index:"
_MP4_INDEX_CACHE_TTL = 3600 # 1 hour
# How much to read from the start for ftyp + initial atom scanning
_MP4_HEADER_PROBE_SIZE = 64 * 1024 # 64 KB
# Max moov size we'll accept
_MAX_MOOV_SIZE = 50 * 1024 * 1024 # 50 MB
# How much to read from the end of the file to find moov
_MP4_TAIL_PROBE_SIZE = 512 * 1024 # 512 KB
async def _get_cached_mp4_index(cache_key: str) -> MP4Index | None:
    """Fetch a previously cached MP4Index from Redis; None on any miss."""
    if not cache_key:
        return None
    redis = await redis_utils.get_redis()
    if redis is None:
        return None
    payload = await redis.get(f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}")
    if not payload:
        return None
    try:
        fields = json.loads(payload)
        # ftyp travels as base64 text; empty string marks an absent blob.
        ftyp_blob = base64.b64decode(fields["ftyp_data_b64"]) if fields.get("ftyp_data_b64") else b""
        index = MP4Index(
            duration_ms=fields["duration_ms"],
            timescale=fields["timescale"],
            # JSON turns tuples into lists; restore (timestamp, offset) pairs.
            cue_points=[(point[0], point[1]) for point in fields["cue_points"]],
            moov_offset=fields["moov_offset"],
            moov_size=fields["moov_size"],
            ftyp_data=ftyp_blob,
            mdat_offset=fields["mdat_offset"],
            mdat_size=fields["mdat_size"],
            video_codec=fields.get("video_codec", ""),
            audio_codec=fields.get("audio_codec", ""),
            # moov_data is NOT cached (too large), it will be re-fetched
        )
    except (KeyError, TypeError, json.JSONDecodeError) as exc:
        # Stale or malformed entry: treat as a cache miss rather than failing.
        logger.warning("[container_probe] Invalid cached MP4 index: %s", exc)
        return None
    logger.debug("[container_probe] Loaded MP4 index from cache: %s", cache_key)
    return index
async def _set_cached_mp4_index(cache_key: str, index: MP4Index) -> None:
    """Cache an MP4Index in Redis (without moov_data)."""
    if not cache_key:
        return
    redis = await redis_utils.get_redis()
    if redis is None:
        return
    # moov_data is deliberately omitted -- it can be tens of MB and is
    # cheaper to re-fetch from the source on a cache hit.
    ftyp_b64 = base64.b64encode(index.ftyp_data).decode() if index.ftyp_data else ""
    payload = json.dumps(
        {
            "duration_ms": index.duration_ms,
            "timescale": index.timescale,
            "cue_points": index.cue_points,
            "moov_offset": index.moov_offset,
            "moov_size": index.moov_size,
            "ftyp_data_b64": ftyp_b64,
            "mdat_offset": index.mdat_offset,
            "mdat_size": index.mdat_size,
            "video_codec": index.video_codec,
            "audio_codec": index.audio_codec,
        }
    )
    await redis.set(f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}", payload, ex=_MP4_INDEX_CACHE_TTL)
    logger.debug("[container_probe] Cached MP4 index: %s", cache_key)
def _scan_top_level_atoms(data: bytes) -> list[tuple[bytes, int, int]]:
"""
Scan top-level atom headers from raw file bytes.
Returns:
List of (box_type, absolute_offset, total_size) for each atom found.
"""
atoms = []
offset = 0
while offset + 8 <= len(data):
size = struct.unpack_from(">I", data, offset)[0]
box_type = data[offset + 4 : offset + 8]
if size == 1: # Extended size
if offset + 16 > len(data):
break
size = struct.unpack_from(">Q", data, offset + 8)[0]
elif size == 0:
# Extends to end of file - we can't know the real size from
# a partial read, but record what we have
atoms.append((box_type, offset, 0))
break
if size < 8:
break
atoms.append((box_type, offset, size))
offset += size
return atoms
async def probe_mp4_moov(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MP4Index | None:
    """
    Probe an MP4 file's moov atom to build a seek index.

    Pure Python -- scans MP4 box headers with struct, no FFmpeg involved.
    Strategy:
    1. Read first ~64KB to check for ftyp (MP4 signature).
    2. Scan top-level atoms to find moov and mdat.
    3. If moov is at the start (faststart), read it from the header data.
    4. If moov is not in the header, read from the tail of the file.
    5. Parse moov sample tables to build cue points.

    Args:
        source: A MediaSource protocol implementation with stream(offset, limit).
        file_size: Total file size in bytes.
        cache_key: Optional cache key for Redis caching.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MP4Index if successful, None if not an MP4 or parsing fails.
    """
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0)
    if not cache_key:
        cache_key = getattr(source, "cache_key", "")
    # Check cache first
    if cache_key:
        cached = await _get_cached_mp4_index(cache_key)
        if cached:
            # Re-fetch moov_data (not cached due to size) and rewrite offsets
            if cached.moov_size > 0 and cached.moov_size <= _MAX_MOOV_SIZE:
                moov_data = b""
                async for chunk in source.stream(offset=cached.moov_offset, limit=cached.moov_size):
                    moov_data += chunk
                # Same offset rewrite as the cold path below: shift stco/co64
                # entries to match the ftyp+moov+mdat pipe layout.
                if cached.mdat_offset >= 0:
                    new_mdat_start = len(cached.ftyp_data) + cached.moov_size
                    offset_delta = new_mdat_start - cached.mdat_offset
                    if offset_delta != 0:
                        moov_data = rewrite_moov_offsets(moov_data, offset_delta)
                cached.moov_data = moov_data
            return cached
    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_MP4_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _MP4_HEADER_PROBE_SIZE
            header_data = b""
            async for chunk in source.stream(offset=0, limit=header_size):
                header_data += chunk
        if len(header_data) < 12:
            return None
        # Step 2: Check for ftyp
        if not is_mp4_header(header_data):
            return None
        logger.info("[container_probe] MP4 detected, scanning atoms (header=%d bytes)", len(header_data))
        # Step 3: Scan top-level atoms from header
        atoms = _scan_top_level_atoms(header_data)
        ftyp_offset = -1
        ftyp_size = 0
        moov_offset = -1
        moov_size = 0
        mdat_offset = -1
        mdat_size = 0
        for box_type, atom_offset, atom_size in atoms:
            if box_type == b"ftyp":
                ftyp_offset = atom_offset
                ftyp_size = atom_size
            elif box_type == b"moov":
                moov_offset = atom_offset
                moov_size = atom_size
            elif box_type == b"mdat":
                mdat_offset = atom_offset
                mdat_size = atom_size
        # Step 4: If moov not found in header, scan from tail
        if moov_offset < 0 and file_size > 0:
            tail_start = max(0, file_size - _MP4_TAIL_PROBE_SIZE)
            tail_data = b""
            async for chunk in source.stream(offset=tail_start, limit=file_size - tail_start):
                tail_data += chunk
            if tail_data:
                tail_atoms = _scan_top_level_atoms(tail_data)
                for box_type, rel_offset, atom_size in tail_atoms:
                    abs_offset = tail_start + rel_offset
                    if box_type == b"moov":
                        moov_offset = abs_offset
                        moov_size = atom_size
                    elif box_type == b"mdat" and mdat_offset < 0:
                        mdat_offset = abs_offset
                        mdat_size = atom_size
                # If the initial scan yielded no moov (tail_start may land
                # inside a large mdat payload producing garbage atom headers),
                # resync by scanning 8-byte aligned windows for b"moov".
                if moov_offset < 0:
                    needle = b"moov"
                    search_pos = 0
                    while search_pos + 8 <= len(tail_data):
                        idx = tail_data.find(needle, search_pos)
                        # idx < 4 means no room for the 4-byte size field
                        # that must precede the box type.
                        if idx < 0 or idx < 4:
                            break
                        candidate_size = struct.unpack_from(">I", tail_data, idx - 4)[0]
                        if 8 < candidate_size <= _MAX_MOOV_SIZE:
                            moov_offset = tail_start + idx - 4
                            moov_size = candidate_size
                            break
                        search_pos = idx + 4
        if moov_offset < 0:
            logger.info("[container_probe] No moov atom found in MP4")
            return None
        if moov_size <= 0 or moov_size > _MAX_MOOV_SIZE:
            logger.warning("[container_probe] moov size %d is invalid or too large", moov_size)
            return None
        logger.info(
            "[container_probe] MP4 atoms: moov at %d (%d bytes), mdat at %d (%d bytes)",
            moov_offset,
            moov_size,
            mdat_offset,
            mdat_size,
        )
        # Step 5: Fetch full moov atom
        # Check if moov is already contained in the header data we read
        if moov_offset + moov_size <= len(header_data):
            moov_data = header_data[moov_offset : moov_offset + moov_size]
        else:
            moov_data = b""
            async for chunk in source.stream(offset=moov_offset, limit=moov_size):
                moov_data += chunk
        if len(moov_data) < moov_size:
            logger.warning(
                "[container_probe] Incomplete moov: got %d of %d bytes",
                len(moov_data),
                moov_size,
            )
            return None
        # Step 6: Parse moov body (skip box header)
        # Determine header size: 16 bytes for extended (64-bit) size, else 8.
        raw_size = struct.unpack_from(">I", moov_data, 0)[0]
        hdr_size = 16 if raw_size == 1 else 8
        moov_body = moov_data[hdr_size:]
        cue_points, duration_ms, timescale, video_codec, audio_codec = build_cue_points_from_moov(moov_body)
        # If mdat wasn't found via header scan, it's likely right after ftyp
        # or right after moov. Common layouts:
        # ftyp + moov + mdat (faststart) or ftyp + mdat + moov
        if mdat_offset < 0:
            # Walk atoms to find mdat by scanning just enough from the file
            # In most cases, mdat is either before or after moov
            if moov_offset < file_size // 2:
                # moov is early -> mdat likely follows
                mdat_search_offset = moov_offset + moov_size
            else:
                # moov is late -> mdat likely right after ftyp
                # NOTE(review): assumes ftyp is the first atom in the file --
                # true for files that passed is_mp4_header, but verify.
                ftyp_size = struct.unpack_from(">I", header_data, 0)[0]
                if ftyp_size == 1:
                    ftyp_size = struct.unpack_from(">Q", header_data, 8)[0]
                mdat_search_offset = ftyp_size
            # Read a small amount to find the mdat header
            mdat_header = b""
            async for chunk in source.stream(offset=mdat_search_offset, limit=16):
                mdat_header += chunk
            if len(mdat_header) >= 8:
                box_type = mdat_header[4:8]
                if box_type == b"mdat":
                    mdat_offset = mdat_search_offset
                    raw_sz = struct.unpack_from(">I", mdat_header, 0)[0]
                    if raw_sz == 1 and len(mdat_header) >= 16:
                        mdat_size = struct.unpack_from(">Q", mdat_header, 8)[0]
                    else:
                        mdat_size = raw_sz
        # Step 7: Extract ftyp data (always in the header since it's the first atom)
        ftyp_data = b""
        if ftyp_offset >= 0 and ftyp_size > 0 and ftyp_offset + ftyp_size <= len(header_data):
            ftyp_data = header_data[ftyp_offset : ftyp_offset + ftyp_size]
        # Step 8: Rewrite moov chunk offsets for faststart pipe layout.
        # The pipe stream will be: ftyp + moov + mdat. The stco/co64
        # offsets in the original moov point to positions in the original
        # file. We need to shift them to account for the new layout.
        # New mdat position = ftyp_size + moov_size
        # Delta = new_mdat_position - original_mdat_offset
        if mdat_offset >= 0:
            new_mdat_start = len(ftyp_data) + moov_size
            offset_delta = new_mdat_start - mdat_offset
            if offset_delta != 0:
                moov_data = rewrite_moov_offsets(moov_data, offset_delta)
        index = MP4Index(
            duration_ms=duration_ms,
            timescale=timescale,
            cue_points=cue_points,
            moov_offset=moov_offset,
            moov_size=moov_size,
            moov_data=moov_data,
            ftyp_data=ftyp_data,
            mdat_offset=mdat_offset,
            mdat_size=mdat_size,
            video_codec=video_codec,
            audio_codec=audio_codec,
        )
        logger.info(
            "[container_probe] MP4 index: duration=%.1fs, %d cue points, video=%s, audio=%s",
            duration_ms / 1000.0,
            len(cue_points),
            video_codec,
            audio_codec,
        )
        if cache_key:
            await _set_cached_mp4_index(cache_key, index)
        return index
    except Exception as e:
        # Broad catch is deliberate: probing is best-effort and must never
        # break playback -- a failure just means seeking is unavailable.
        logger.warning("[container_probe] Failed to probe MP4 moov: %s", e)
        return None