# NOTE(review): the following lines are web-UI scraping residue (file listing
# and commit metadata), not Python source; kept as a comment so the module
# remains importable.
# Files
# UnHided/mediaflow_proxy/remuxer/container_probe.py
# UrloMythus cfc6bbabc9 update
# 2026-02-19 20:15:03 +01:00
# 615 lines
# 22 KiB
# Python
"""
Container format probing -- MKV Cues and MP4 moov.
Pure Python probing using EBML parsing (MKV) and struct-based atom
scanning (MP4). No FFmpeg dependency.
Source-agnostic: accepts any MediaSource protocol implementation
(Telegram, HTTP, etc.) for byte-range reads.
Provides:
- probe_mkv_cues: probe MKV file to extract seek index (MKVCueIndex)
- probe_mp4_moov: probe MP4 file to extract moov atom and build seek index (MP4Index)
"""
import base64
import hashlib
import json
import logging
import struct
from mediaflow_proxy.utils import redis_utils
from mediaflow_proxy.remuxer.ebml_parser import (
MKVCueIndex,
build_cue_index,
parse_ebml_header,
parse_seek_head,
CUES,
INFO,
)
from mediaflow_proxy.remuxer.mp4_parser import (
MP4Index,
build_cue_points_from_moov,
is_mp4_header,
rewrite_moov_offsets,
)
logger = logging.getLogger(__name__)
# How much of the MKV header to fetch for SeekHead + Info parsing
_HEADER_PROBE_SIZE = 64 * 1024 # 64 KB
# Max Cues element size we'll attempt to fetch
_MAX_CUES_SIZE = 2 * 1024 * 1024 # 2 MB
# Redis cache for MKV Cue indexes
_CUE_INDEX_CACHE_PREFIX = "mfp:cue_index:"
_CUE_INDEX_CACHE_TTL = 3600 # 1 hour
# =============================================================================
# MKV Cues probing
# =============================================================================
def derive_cue_cache_key(
source_key: str = "",
*,
chat_id: str | int | None = None,
message_id: int | None = None,
file_id: str | None = None,
) -> str:
"""
Derive a deterministic cache key for a file's cue index.
Accepts either a pre-computed source_key (from MediaSource.cache_key)
or legacy Telegram-style parameters for backwards compatibility.
"""
if source_key:
return source_key
if file_id:
raw = f"file_id:{file_id}"
elif chat_id is not None and message_id is not None:
raw = f"chat:{chat_id}:msg:{message_id}"
else:
return ""
return hashlib.sha256(raw.encode()).hexdigest()[:16]
async def _get_cached_cue_index(cache_key: str) -> MKVCueIndex | None:
    """Look up a previously cached MKVCueIndex in Redis; None on any miss."""
    if not cache_key:
        return None
    redis = await redis_utils.get_redis()
    if redis is None:
        return None
    payload = await redis.get(f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}")
    if not payload:
        return None
    try:
        fields = json.loads(payload)
        # Binary fields are stored base64-encoded; empty string means absent.
        seek_header = base64.b64decode(fields["seek_header_b64"]) if fields.get("seek_header_b64") else b""
        codec_private = (
            base64.b64decode(fields["video_codec_private_b64"]) if fields.get("video_codec_private_b64") else b""
        )
        index = MKVCueIndex(
            duration_ms=fields["duration_ms"],
            timestamp_scale=fields["timestamp_scale"],
            # JSON turns tuples into lists; restore (timestamp, offset) pairs.
            cue_points=[(point[0], point[1]) for point in fields["cue_points"]],
            segment_data_offset=fields["segment_data_offset"],
            first_cluster_offset=fields.get("first_cluster_offset", 0),
            seek_header=seek_header,
            audio_codec_id=fields.get("audio_codec_id", ""),
            audio_bitrate=fields.get("audio_bitrate", 0),
            audio_channels=fields.get("audio_channels", 0),
            audio_sample_rate=fields.get("audio_sample_rate", 0.0),
            video_codec_id=fields.get("video_codec_id", ""),
            video_codec_private=codec_private,
            video_width=fields.get("video_width", 0),
            video_height=fields.get("video_height", 0),
            video_fps=fields.get("video_fps", 0.0),
            video_default_duration_ns=fields.get("video_default_duration_ns", 0),
        )
    except (KeyError, TypeError, json.JSONDecodeError) as exc:
        # Stale or malformed entry: treat as a cache miss rather than failing.
        logger.warning("[container_probe] Invalid cached cue index: %s", exc)
        return None
    logger.debug("[container_probe] Loaded cue index from cache: %s", cache_key)
    return index
async def _set_cached_cue_index(cache_key: str, index: MKVCueIndex) -> None:
    """Persist an MKVCueIndex to Redis with a TTL; no-op when Redis is unavailable."""
    if not cache_key:
        return
    redis = await redis_utils.get_redis()
    if redis is None:
        return
    # Binary fields travel as base64 text; empty string marks an absent blob.
    seek_header_b64 = base64.b64encode(index.seek_header).decode() if index.seek_header else ""
    codec_private_b64 = base64.b64encode(index.video_codec_private).decode() if index.video_codec_private else ""
    payload = json.dumps(
        {
            "duration_ms": index.duration_ms,
            "timestamp_scale": index.timestamp_scale,
            "cue_points": index.cue_points,
            "segment_data_offset": index.segment_data_offset,
            "first_cluster_offset": index.first_cluster_offset,
            "seek_header_b64": seek_header_b64,
            "audio_codec_id": index.audio_codec_id,
            "audio_bitrate": index.audio_bitrate,
            "audio_channels": index.audio_channels,
            "audio_sample_rate": index.audio_sample_rate,
            "video_codec_id": index.video_codec_id,
            "video_codec_private_b64": codec_private_b64,
            "video_width": index.video_width,
            "video_height": index.video_height,
            "video_fps": index.video_fps,
            "video_default_duration_ns": index.video_default_duration_ns,
        }
    )
    await redis.set(f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}", payload, ex=_CUE_INDEX_CACHE_TTL)
    logger.debug("[container_probe] Cached cue index: %s", cache_key)
async def probe_mkv_cues(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MKVCueIndex | None:
    """
    Probe an MKV file's EBML header and Cues to build a seek index.

    Pure Python -- parses EBML structures directly, no FFmpeg involved.
    Makes up to two small byte-range reads via the provided source:
    1. First ~64KB: EBML header + SeekHead + Info (skipped if header_data provided)
    2. Cues section: byte range from SeekHead's Cues position

    Args:
        source: A MediaSource protocol implementation, or any object with
            a ``stream(offset, limit)`` async generator method.
        file_size: Total file size in bytes. If 0, tries ``source.file_size``.
        cache_key: Optional cache key for Redis caching. If empty, tries
            ``source.cache_key``.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MKVCueIndex if successful, None if the file has no Cues or parsing fails.
    """
    # Resolve file_size and cache_key from source if not provided
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0)
    if not cache_key:
        cache_key = getattr(source, "cache_key", "")
    # Check cache first -- a hit skips all byte-range reads entirely.
    if cache_key:
        cached = await _get_cached_cue_index(cache_key)
        if cached:
            return cached
    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _HEADER_PROBE_SIZE
            header_data = b""
            # NOTE(review): assumes source.stream honors `limit` -- confirm
            # against the MediaSource implementations.
            async for chunk in source.stream(offset=0, limit=header_size):
                header_data += chunk
        # Too small to even contain an EBML header + SeekHead.
        if len(header_data) < 64:
            logger.warning("[container_probe] Header too small (%d bytes), cannot probe", len(header_data))
            return None
        # Step 2: Parse EBML header to find Segment data offset
        segment_data_offset = parse_ebml_header(header_data)
        # Step 3: Parse SeekHead to find Cues and Info positions
        seek_positions = parse_seek_head(header_data, segment_data_offset)
        if CUES not in seek_positions:
            logger.info("[container_probe] No Cues position in SeekHead, seeking not available")
            return None
        # SeekHead positions are relative to the Segment data start.
        cues_relative_offset = seek_positions[CUES]
        cues_absolute_offset = segment_data_offset + cues_relative_offset
        logger.info(
            "[container_probe] SeekHead: Cues at offset %d (absolute %d), Info at %s",
            cues_relative_offset,
            cues_absolute_offset,
            seek_positions.get(INFO, "not found"),
        )
        # Step 4: Fetch the Cues element (capped at _MAX_CUES_SIZE and at
        # the remaining bytes in the file when file_size is known).
        cues_max = file_size - cues_absolute_offset if file_size > 0 else _MAX_CUES_SIZE
        cues_fetch_size = min(_MAX_CUES_SIZE, cues_max)
        if cues_fetch_size <= 0:
            logger.warning("[container_probe] Cues offset %d beyond file size %d", cues_absolute_offset, file_size)
            return None
        cues_data = b""
        async for chunk in source.stream(offset=cues_absolute_offset, limit=cues_fetch_size):
            cues_data += chunk
        if len(cues_data) < 16:
            logger.warning("[container_probe] Cues data too small (%d bytes)", len(cues_data))
            return None
        # Step 5: Build the cue index
        index = build_cue_index(
            header_data=header_data,
            cues_data=cues_data,
            cues_file_offset=cues_absolute_offset,
            segment_data_offset=segment_data_offset,
        )
        # Cache the result for subsequent seeks on the same file.
        if cache_key:
            await _set_cached_cue_index(cache_key, index)
        return index
    except Exception as e:
        # Broad catch is deliberate: probing is best-effort and must never
        # break playback -- a failure just means seeking is unavailable.
        logger.warning("[container_probe] Failed to probe MKV cues: %s", e)
        return None
# =============================================================================
# MP4 Moov probing
# =============================================================================
# Redis cache for MP4 indexes
_MP4_INDEX_CACHE_PREFIX = "mfp:mp4_index:"
_MP4_INDEX_CACHE_TTL = 3600 # 1 hour
# How much to read from the start for ftyp + initial atom scanning
_MP4_HEADER_PROBE_SIZE = 64 * 1024 # 64 KB
# Max moov size we'll accept
_MAX_MOOV_SIZE = 50 * 1024 * 1024 # 50 MB
# How much to read from the end of the file to find moov
_MP4_TAIL_PROBE_SIZE = 512 * 1024 # 512 KB
async def _get_cached_mp4_index(cache_key: str) -> MP4Index | None:
    """Fetch a previously cached MP4Index from Redis; None on any miss."""
    if not cache_key:
        return None
    redis = await redis_utils.get_redis()
    if redis is None:
        return None
    payload = await redis.get(f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}")
    if not payload:
        return None
    try:
        fields = json.loads(payload)
        # ftyp travels as base64 text; empty string marks an absent blob.
        ftyp_blob = base64.b64decode(fields["ftyp_data_b64"]) if fields.get("ftyp_data_b64") else b""
        index = MP4Index(
            duration_ms=fields["duration_ms"],
            timescale=fields["timescale"],
            # JSON turns tuples into lists; restore (timestamp, offset) pairs.
            cue_points=[(point[0], point[1]) for point in fields["cue_points"]],
            moov_offset=fields["moov_offset"],
            moov_size=fields["moov_size"],
            ftyp_data=ftyp_blob,
            mdat_offset=fields["mdat_offset"],
            mdat_size=fields["mdat_size"],
            video_codec=fields.get("video_codec", ""),
            audio_codec=fields.get("audio_codec", ""),
            # moov_data is NOT cached (too large), it will be re-fetched
        )
    except (KeyError, TypeError, json.JSONDecodeError) as exc:
        # Stale or malformed entry: treat as a cache miss rather than failing.
        logger.warning("[container_probe] Invalid cached MP4 index: %s", exc)
        return None
    logger.debug("[container_probe] Loaded MP4 index from cache: %s", cache_key)
    return index
async def _set_cached_mp4_index(cache_key: str, index: MP4Index) -> None:
    """Cache an MP4Index in Redis (without moov_data)."""
    if not cache_key:
        return
    redis = await redis_utils.get_redis()
    if redis is None:
        return
    # moov_data is deliberately omitted -- it can be tens of MB and is
    # cheaper to re-fetch from the source on a cache hit.
    ftyp_b64 = base64.b64encode(index.ftyp_data).decode() if index.ftyp_data else ""
    payload = json.dumps(
        {
            "duration_ms": index.duration_ms,
            "timescale": index.timescale,
            "cue_points": index.cue_points,
            "moov_offset": index.moov_offset,
            "moov_size": index.moov_size,
            "ftyp_data_b64": ftyp_b64,
            "mdat_offset": index.mdat_offset,
            "mdat_size": index.mdat_size,
            "video_codec": index.video_codec,
            "audio_codec": index.audio_codec,
        }
    )
    await redis.set(f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}", payload, ex=_MP4_INDEX_CACHE_TTL)
    logger.debug("[container_probe] Cached MP4 index: %s", cache_key)
def _scan_top_level_atoms(data: bytes) -> list[tuple[bytes, int, int]]:
"""
Scan top-level atom headers from raw file bytes.
Returns:
List of (box_type, absolute_offset, total_size) for each atom found.
"""
atoms = []
offset = 0
while offset + 8 <= len(data):
size = struct.unpack_from(">I", data, offset)[0]
box_type = data[offset + 4 : offset + 8]
if size == 1: # Extended size
if offset + 16 > len(data):
break
size = struct.unpack_from(">Q", data, offset + 8)[0]
elif size == 0:
# Extends to end of file - we can't know the real size from
# a partial read, but record what we have
atoms.append((box_type, offset, 0))
break
if size < 8:
break
atoms.append((box_type, offset, size))
offset += size
return atoms
async def probe_mp4_moov(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MP4Index | None:
    """
    Probe an MP4 file's moov atom to build a seek index.

    Pure Python -- scans MP4 box headers with struct, no FFmpeg involved.
    Strategy:
    1. Read first ~64KB to check for ftyp (MP4 signature).
    2. Scan top-level atoms to find moov and mdat.
    3. If moov is at the start (faststart), read it from the header data.
    4. If moov is not in the header, read from the tail of the file.
    5. Parse moov sample tables to build cue points.

    Args:
        source: A MediaSource protocol implementation with stream(offset, limit).
        file_size: Total file size in bytes.
        cache_key: Optional cache key for Redis caching.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MP4Index if successful, None if not an MP4 or parsing fails.
    """
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0)
    if not cache_key:
        cache_key = getattr(source, "cache_key", "")
    # Check cache first
    if cache_key:
        cached = await _get_cached_mp4_index(cache_key)
        if cached:
            # Re-fetch moov_data (not cached due to size) and rewrite offsets
            if cached.moov_size > 0 and cached.moov_size <= _MAX_MOOV_SIZE:
                moov_data = b""
                async for chunk in source.stream(offset=cached.moov_offset, limit=cached.moov_size):
                    moov_data += chunk
                # Same offset rewrite as the cold path below: shift stco/co64
                # entries to match the ftyp+moov+mdat pipe layout.
                if cached.mdat_offset >= 0:
                    new_mdat_start = len(cached.ftyp_data) + cached.moov_size
                    offset_delta = new_mdat_start - cached.mdat_offset
                    if offset_delta != 0:
                        moov_data = rewrite_moov_offsets(moov_data, offset_delta)
                cached.moov_data = moov_data
            return cached
    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_MP4_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _MP4_HEADER_PROBE_SIZE
            header_data = b""
            async for chunk in source.stream(offset=0, limit=header_size):
                header_data += chunk
        if len(header_data) < 12:
            return None
        # Step 2: Check for ftyp
        if not is_mp4_header(header_data):
            return None
        logger.info("[container_probe] MP4 detected, scanning atoms (header=%d bytes)", len(header_data))
        # Step 3: Scan top-level atoms from header
        atoms = _scan_top_level_atoms(header_data)
        ftyp_offset = -1
        ftyp_size = 0
        moov_offset = -1
        moov_size = 0
        mdat_offset = -1
        mdat_size = 0
        for box_type, atom_offset, atom_size in atoms:
            if box_type == b"ftyp":
                ftyp_offset = atom_offset
                ftyp_size = atom_size
            elif box_type == b"moov":
                moov_offset = atom_offset
                moov_size = atom_size
            elif box_type == b"mdat":
                mdat_offset = atom_offset
                mdat_size = atom_size
        # Step 4: If moov not found in header, scan from tail
        if moov_offset < 0 and file_size > 0:
            tail_start = max(0, file_size - _MP4_TAIL_PROBE_SIZE)
            tail_data = b""
            async for chunk in source.stream(offset=tail_start, limit=file_size - tail_start):
                tail_data += chunk
            if tail_data:
                tail_atoms = _scan_top_level_atoms(tail_data)
                for box_type, rel_offset, atom_size in tail_atoms:
                    abs_offset = tail_start + rel_offset
                    if box_type == b"moov":
                        moov_offset = abs_offset
                        moov_size = atom_size
                    elif box_type == b"mdat" and mdat_offset < 0:
                        mdat_offset = abs_offset
                        mdat_size = atom_size
                # If the initial scan yielded no moov (tail_start may land
                # inside a large mdat payload producing garbage atom headers),
                # resync by scanning 8-byte aligned windows for b"moov".
                if moov_offset < 0:
                    needle = b"moov"
                    search_pos = 0
                    while search_pos + 8 <= len(tail_data):
                        idx = tail_data.find(needle, search_pos)
                        # idx < 4 means no room for the 4-byte size field
                        # that must precede the box type.
                        if idx < 0 or idx < 4:
                            break
                        candidate_size = struct.unpack_from(">I", tail_data, idx - 4)[0]
                        if 8 < candidate_size <= _MAX_MOOV_SIZE:
                            moov_offset = tail_start + idx - 4
                            moov_size = candidate_size
                            break
                        search_pos = idx + 4
        if moov_offset < 0:
            logger.info("[container_probe] No moov atom found in MP4")
            return None
        if moov_size <= 0 or moov_size > _MAX_MOOV_SIZE:
            logger.warning("[container_probe] moov size %d is invalid or too large", moov_size)
            return None
        logger.info(
            "[container_probe] MP4 atoms: moov at %d (%d bytes), mdat at %d (%d bytes)",
            moov_offset,
            moov_size,
            mdat_offset,
            mdat_size,
        )
        # Step 5: Fetch full moov atom
        # Check if moov is already contained in the header data we read
        if moov_offset + moov_size <= len(header_data):
            moov_data = header_data[moov_offset : moov_offset + moov_size]
        else:
            moov_data = b""
            async for chunk in source.stream(offset=moov_offset, limit=moov_size):
                moov_data += chunk
        if len(moov_data) < moov_size:
            logger.warning(
                "[container_probe] Incomplete moov: got %d of %d bytes",
                len(moov_data),
                moov_size,
            )
            return None
        # Step 6: Parse moov body (skip box header)
        # Determine header size: 16 bytes for extended (64-bit) size, else 8.
        raw_size = struct.unpack_from(">I", moov_data, 0)[0]
        hdr_size = 16 if raw_size == 1 else 8
        moov_body = moov_data[hdr_size:]
        cue_points, duration_ms, timescale, video_codec, audio_codec = build_cue_points_from_moov(moov_body)
        # If mdat wasn't found via header scan, it's likely right after ftyp
        # or right after moov. Common layouts:
        # ftyp + moov + mdat (faststart) or ftyp + mdat + moov
        if mdat_offset < 0:
            # Walk atoms to find mdat by scanning just enough from the file
            # In most cases, mdat is either before or after moov
            if moov_offset < file_size // 2:
                # moov is early -> mdat likely follows
                mdat_search_offset = moov_offset + moov_size
            else:
                # moov is late -> mdat likely right after ftyp
                # NOTE(review): assumes ftyp is the first atom in the file --
                # true for files that passed is_mp4_header, but verify.
                ftyp_size = struct.unpack_from(">I", header_data, 0)[0]
                if ftyp_size == 1:
                    ftyp_size = struct.unpack_from(">Q", header_data, 8)[0]
                mdat_search_offset = ftyp_size
            # Read a small amount to find the mdat header
            mdat_header = b""
            async for chunk in source.stream(offset=mdat_search_offset, limit=16):
                mdat_header += chunk
            if len(mdat_header) >= 8:
                box_type = mdat_header[4:8]
                if box_type == b"mdat":
                    mdat_offset = mdat_search_offset
                    raw_sz = struct.unpack_from(">I", mdat_header, 0)[0]
                    if raw_sz == 1 and len(mdat_header) >= 16:
                        mdat_size = struct.unpack_from(">Q", mdat_header, 8)[0]
                    else:
                        mdat_size = raw_sz
        # Step 7: Extract ftyp data (always in the header since it's the first atom)
        ftyp_data = b""
        if ftyp_offset >= 0 and ftyp_size > 0 and ftyp_offset + ftyp_size <= len(header_data):
            ftyp_data = header_data[ftyp_offset : ftyp_offset + ftyp_size]
        # Step 8: Rewrite moov chunk offsets for faststart pipe layout.
        # The pipe stream will be: ftyp + moov + mdat. The stco/co64
        # offsets in the original moov point to positions in the original
        # file. We need to shift them to account for the new layout.
        # New mdat position = ftyp_size + moov_size
        # Delta = new_mdat_position - original_mdat_offset
        if mdat_offset >= 0:
            new_mdat_start = len(ftyp_data) + moov_size
            offset_delta = new_mdat_start - mdat_offset
            if offset_delta != 0:
                moov_data = rewrite_moov_offsets(moov_data, offset_delta)
        index = MP4Index(
            duration_ms=duration_ms,
            timescale=timescale,
            cue_points=cue_points,
            moov_offset=moov_offset,
            moov_size=moov_size,
            moov_data=moov_data,
            ftyp_data=ftyp_data,
            mdat_offset=mdat_offset,
            mdat_size=mdat_size,
            video_codec=video_codec,
            audio_codec=audio_codec,
        )
        logger.info(
            "[container_probe] MP4 index: duration=%.1fs, %d cue points, video=%s, audio=%s",
            duration_ms / 1000.0,
            len(cue_points),
            video_codec,
            audio_codec,
        )
        if cache_key:
            await _set_cached_mp4_index(cache_key, index)
        return index
    except Exception as e:
        # Broad catch is deliberate: probing is best-effort and must never
        # break playback -- a failure just means seeking is unavailable.
        logger.warning("[container_probe] Failed to probe MP4 moov: %s", e)
        return None