This commit is contained in:
UrloMythus
2026-02-19 20:15:03 +01:00
parent 7785e8c604
commit cfc6bbabc9
181 changed files with 32141 additions and 4629 deletions

View File

@@ -0,0 +1,18 @@
"""
Media remuxer package.
Provides pure Python implementations for media container parsing, remuxing,
and transcoding:
- ebml_parser: Minimal EBML/MKV parser for seeking and demuxing
- ts_muxer: fMP4 -> MPEG-TS remuxer
- mkv_demuxer: Streaming MKV demuxer
- mp4_muxer: MP4 box builder for standard moov-first MP4
- audio_transcoder: PyAV-based audio frame transcoding
- video_transcoder: GPU-accelerated video transcoding via PyAV
- pyav_demuxer: Universal PyAV-based streaming demuxer (any container)
- codec_utils: Codec compatibility detection and decision engine
- media_source: Abstract MediaSource protocol (Telegram, HTTP, etc.)
- transcode_handler: Shared transcode request orchestrator
- transcode_pipeline: MKV fast-path and universal transcode pipelines
"""

View File

@@ -0,0 +1,351 @@
"""
PyAV-based audio transcoder for frame-level codec conversion.
Transcodes audio frames between codecs using PyAV's CodecContext API
(Python bindings for FFmpeg's libavcodec). This provides in-process
audio transcoding without subprocess management or pipe overhead.
Supported input codecs: EAC3, AC3, AAC, Opus, Vorbis, FLAC, MP3
Output codec: AAC-LC (stereo, configurable bitrate)
Architecture:
raw_frame_bytes -> parse() -> decode() -> resample() -> encode() -> raw_aac_bytes
Usage:
transcoder = AudioTranscoder("eac3", sample_rate=48000, channels=6)
for raw_eac3_frame in frames:
aac_frames = transcoder.transcode(raw_eac3_frame)
for aac_data in aac_frames:
write(aac_data)
# Flush remaining frames
for aac_data in transcoder.flush():
write(aac_data)
"""
import logging
import av
from av.audio.resampler import AudioResampler
from mediaflow_proxy.remuxer.ebml_parser import (
CODEC_ID_AAC,
CODEC_ID_AC3,
CODEC_ID_EAC3,
CODEC_ID_FLAC,
CODEC_ID_OPUS,
CODEC_ID_VORBIS,
)
logger = logging.getLogger(__name__)
def _generate_silence_aac_frame() -> bytes | None:
    """Pre-encode a single silent AAC frame (48 kHz stereo, 1024 samples).

    PyAV's AAC encoder has an intermittent ``avcodec_send_frame`` bug when
    rapidly creating/destroying codec contexts, so we retry a few times.
    This function is called once at module load; the result is cached in
    ``_SILENCE_AAC_FRAME``.

    Returns:
        Raw AAC frame bytes, or None if every encode attempt failed.
    """
    for _attempt in range(10):
        try:
            enc = av.CodecContext.create("aac", "w")
            enc.sample_rate = 48000
            enc.layout = "stereo"
            enc.format = av.AudioFormat("fltp")
            enc.bit_rate = 192000
            enc.open()
            frame = av.AudioFrame(
                format=enc.format.name,
                layout=enc.layout.name,
                samples=enc.frame_size or 1024,
            )
            # av.AudioFrame does not zero-initialize its buffers: without
            # this, the "silence" frame could encode leftover garbage as
            # audible noise. Fill every plane with zeros explicitly.
            for plane in frame.planes:
                plane.update(bytes(plane.buffer_size))
            frame.sample_rate = enc.sample_rate
            frame.pts = 0
            for pkt in enc.encode(frame):
                return bytes(pkt)
            # AAC priming delay: first encode buffered; flush to retrieve
            for pkt in enc.encode(None):
                return bytes(pkt)
        except Exception:
            continue
    return None
# Module-level silence frame -- generated once, reused by every transcoder.
_SILENCE_AAC_FRAME: bytes | None = _generate_silence_aac_frame()
# Map MKV codec IDs to PyAV/FFmpeg codec names.
# Keys mix imported CODEC_ID_* constants and literal MKV CodecID strings
# (DTS/MP3 variants have no constant in ebml_parser).
_MKV_TO_FFMPEG_CODEC = {
    CODEC_ID_EAC3: "eac3",
    CODEC_ID_AC3: "ac3",
    CODEC_ID_AAC: "aac",
    CODEC_ID_OPUS: "opus",
    CODEC_ID_VORBIS: "vorbis",
    CODEC_ID_FLAC: "flac",
    "A_DTS": "dts",
    "A_MP3": "mp3",
    "A_MPEG/L3": "mp3",
}
# Codecs that need transcoding to AAC for browser playback.
# AAC is deliberately absent: it passes through untouched.
NEEDS_TRANSCODE = frozenset(
    {
        CODEC_ID_EAC3,
        CODEC_ID_AC3,
        CODEC_ID_OPUS,
        CODEC_ID_VORBIS,
        CODEC_ID_FLAC,
        "A_DTS",
        "A_MP3",
        "A_MPEG/L3",
    }
)
# Output AAC settings
_OUTPUT_CODEC = "aac"
_OUTPUT_SAMPLE_FORMAT = "fltp"  # AAC requires float planar
_OUTPUT_LAYOUT = "stereo"
# Map channel count -> FFmpeg layout name. Counts not listed here fall
# back to "stereo" in AudioTranscoder.__init__.
_CHANNEL_LAYOUT_MAP = {
    1: "mono",
    2: "stereo",
    3: "2.1",
    4: "quad",
    6: "5.1",
    8: "7.1",
}
def needs_transcode(codec_id: str) -> bool:
    """Check if an MKV audio codec needs transcoding for browser playback.

    Args:
        codec_id: MKV CodecID string (e.g. "A_EAC3").

    Returns:
        True when the codec is in NEEDS_TRANSCODE.
    """
    return codec_id in NEEDS_TRANSCODE
def get_ffmpeg_codec_name(mkv_codec_id: str) -> str | None:
    """Map an MKV CodecID to an FFmpeg codec name.

    Args:
        mkv_codec_id: MKV CodecID string (e.g. "A_EAC3").

    Returns:
        The FFmpeg/PyAV codec name, or None for unknown CodecIDs.
    """
    return _MKV_TO_FFMPEG_CODEC.get(mkv_codec_id)
class AudioTranscoder:
    """
    In-process audio transcoder using PyAV's CodecContext API.

    Decodes raw audio frames from one codec and encodes them to AAC-LC
    stereo, suitable for MP4 container and browser playback. No container
    I/O or subprocess involved -- operates directly on raw frame bytes.

    The transcoder handles sample format conversion and resampling
    automatically via AudioResampler.
    """

    def __init__(
        self,
        input_codec: str,
        input_sample_rate: int = 48000,
        input_channels: int = 6,
        output_sample_rate: int = 48000,
        output_channels: int = 2,
        output_bitrate: int = 192000,
    ) -> None:
        """
        Initialize the transcoder.

        Args:
            input_codec: FFmpeg codec name (e.g., "eac3", "ac3", "aac").
            input_sample_rate: Input sample rate in Hz.
            input_channels: Input channel count.
            output_sample_rate: Output sample rate in Hz (default 48000).
            output_channels: Output channel count (default 2 = stereo).
            output_bitrate: Output bitrate in bits/s (default 192000).

        Raises:
            Propagates PyAV errors when a codec context cannot be created
            or opened (e.g. unknown codec name).
        """
        # Set up decoder -- use layout to configure channel count
        # (PyAV's channels property is read-only; layout drives it)
        self._decoder = av.CodecContext.create(input_codec, "r")
        self._decoder.sample_rate = input_sample_rate
        input_layout = _CHANNEL_LAYOUT_MAP.get(input_channels, "stereo")
        self._decoder.layout = input_layout
        # Set up encoder
        self._encoder = av.CodecContext.create(_OUTPUT_CODEC, "w")
        self._encoder.sample_rate = output_sample_rate
        self._encoder.layout = _OUTPUT_LAYOUT
        self._encoder.format = av.AudioFormat(_OUTPUT_SAMPLE_FORMAT)
        self._encoder.bit_rate = output_bitrate
        self._encoder.open()
        # Set up resampler for format/rate/channel conversion
        self._resampler = AudioResampler(
            format=_OUTPUT_SAMPLE_FORMAT,
            layout=_OUTPUT_LAYOUT,
            rate=output_sample_rate,
        )
        self._input_codec = input_codec
        self._frames_decoded = 0
        self._frames_encoded = 0
        self._audio_specific_config: bytes | None = None
        logger.info(
            "[audio_transcoder] Initialized: %s %dHz %dch -> aac %dHz %dch @%dk",
            input_codec,
            input_sample_rate,
            input_channels,
            output_sample_rate,
            output_channels,
            output_bitrate // 1000,
        )

    @property
    def audio_specific_config(self) -> bytes | None:
        """
        AAC AudioSpecificConfig from the encoder (available after first encode).

        This is needed for the MP4 esds box. Read from the encoder's
        extradata and cached on first successful access.
        """
        if self._audio_specific_config is not None:
            return self._audio_specific_config
        # PyAV exposes extradata after the encoder is opened
        if self._encoder.extradata:
            self._audio_specific_config = bytes(self._encoder.extradata)
            return self._audio_specific_config
        return None

    @property
    def output_sample_rate(self) -> int:
        """Output sample rate in Hz (as configured on the encoder)."""
        return self._encoder.sample_rate

    @property
    def output_channels(self) -> int:
        """Output channel count reported by the encoder context."""
        # NOTE(review): `channels` is deprecated in recent PyAV in favour of
        # `layout.nb_channels` -- confirm against the pinned PyAV version.
        return self._encoder.channels

    @property
    def frame_size(self) -> int:
        """AAC frame size (samples per frame), typically 1024."""
        return self._encoder.frame_size or 1024

    def transcode(self, raw_frame_data: bytes) -> list[bytes]:
        """
        Transcode a raw audio frame from the input codec to AAC.

        Args:
            raw_frame_data: Raw audio frame bytes (one codec frame, e.g.,
                one EAC3 sync frame).

        Returns:
            List of raw AAC frame bytes. May return 0, 1, or more frames
            depending on codec frame sizes and buffering.
        """
        output: list[bytes] = []
        # Parse raw bytes into packets
        packets = self._decoder.parse(raw_frame_data)
        for packet in packets:
            # Decode to PCM frames
            try:
                decoded_frames = self._decoder.decode(packet)
            except av.error.InvalidDataError as e:
                # Corrupt frame: skip it rather than abort the stream.
                logger.debug("[audio_transcoder] Decode error (skipping frame): %s", e)
                continue
            for frame in decoded_frames:
                self._frames_decoded += 1
                # Resample to match encoder format
                resampled = self._resampler.resample(frame)
                if resampled is None:
                    continue
                # resampled can be a single frame or list of frames
                if not isinstance(resampled, list):
                    resampled = [resampled]
                for rs_frame in resampled:
                    # Encode to AAC
                    try:
                        encoded_packets = self._encoder.encode(rs_frame)
                    except av.error.InvalidDataError as e:
                        logger.debug("[audio_transcoder] Encode error: %s", e)
                        continue
                    for enc_packet in encoded_packets:
                        self._frames_encoded += 1
                        output.append(bytes(enc_packet))
        return output

    def flush(self) -> list[bytes]:
        """
        Flush the decoder and encoder buffers.

        Call this when the input stream ends to get remaining frames.

        Returns:
            List of remaining raw AAC frame bytes.
        """
        output: list[bytes] = []
        # Flush decoder
        try:
            for frame in self._decoder.decode(None):
                self._frames_decoded += 1
                resampled = self._resampler.resample(frame)
                if resampled is None:
                    continue
                if not isinstance(resampled, list):
                    resampled = [resampled]
                for rs_frame in resampled:
                    for enc_packet in self._encoder.encode(rs_frame):
                        self._frames_encoded += 1
                        output.append(bytes(enc_packet))
        except Exception as e:
            # Flushing is best-effort; a failure here only loses tail frames.
            logger.debug("[audio_transcoder] Decoder flush error: %s", e)
        # Flush resampler
        try:
            resampled = self._resampler.resample(None)
            if resampled is not None:
                if not isinstance(resampled, list):
                    resampled = [resampled]
                for rs_frame in resampled:
                    for enc_packet in self._encoder.encode(rs_frame):
                        self._frames_encoded += 1
                        output.append(bytes(enc_packet))
        except Exception as e:
            logger.debug("[audio_transcoder] Resampler flush error: %s", e)
        # Flush encoder
        try:
            for enc_packet in self._encoder.encode(None):
                self._frames_encoded += 1
                output.append(bytes(enc_packet))
        except Exception as e:
            logger.debug("[audio_transcoder] Encoder flush error: %s", e)
        logger.info(
            "[audio_transcoder] Flushed: %d decoded, %d encoded total",
            self._frames_decoded,
            self._frames_encoded,
        )
        return output

    def generate_silence_frame(self) -> bytes | None:
        """Return a pre-encoded silent AAC frame (module-level singleton)."""
        return _SILENCE_AAC_FRAME

    def close(self) -> None:
        """Release codec contexts (best-effort).

        Safe to call even when __init__ failed before both contexts were
        created, and tolerates PyAV AudioCodecContext versions that have
        no close() method.
        """
        # getattr guards: __del__ may invoke close() on a half-constructed
        # instance (e.g. CodecContext.create raised for an unknown codec).
        for ctx in (getattr(self, "_decoder", None), getattr(self, "_encoder", None)):
            if ctx is None:
                continue
            try:
                if hasattr(ctx, "close"):
                    ctx.close()
            except Exception:
                pass

    def __del__(self) -> None:
        # Swallow everything: __del__ runs even after a failed __init__ and
        # during interpreter shutdown, where raising is never useful.
        try:
            self.close()
        except Exception:
            pass

View File

@@ -0,0 +1,515 @@
"""
Codec decision engine for browser compatibility detection.
Determines whether video/audio streams need transcoding for browser
playback and selects appropriate output codecs.
"""
import logging
import struct
logger = logging.getLogger(__name__)
# ────────────────────────────────────────────────────────────────────
# Browser-compatible codecs (work natively in HTML5 <video>)
# ────────────────────────────────────────────────────────────────────
BROWSER_VIDEO_CODECS = frozenset(
    {
        "V_MPEG4/ISO/AVC",  # H.264/AVC -- universal
        "h264",
        "avc1",  # FFmpeg/PyAV names
    }
)
BROWSER_AUDIO_CODECS = frozenset(
    {
        "A_AAC",  # AAC-LC -- universal
        "A_AAC/MPEG2/LC",
        "A_AAC/MPEG4/LC",
        "aac",  # FFmpeg/PyAV name
    }
)
# ────────────────────────────────────────────────────────────────────
# Video codecs that need re-encoding to H.264
# ────────────────────────────────────────────────────────────────────
VIDEO_NEEDS_REENCODE = frozenset(
    {
        "V_MPEGH/ISO/HEVC",  # H.265/HEVC (Chrome/Firefox don't support)
        "V_MPEG2",  # MPEG-2 (DVD-era)
        "V_MPEG4/ISO/SP",  # MPEG-4 Part 2 Simple Profile
        "V_MPEG4/ISO/ASP",  # MPEG-4 Part 2 Advanced Simple (DivX/Xvid)
        "V_MPEG4/ISO/AP",  # MPEG-4 Part 2 Advanced Profile
        "V_MPEG4/MS/V3",  # MS MPEG-4 v3 (WMV)
        "V_MS/VFW/FOURCC",  # Generic VFW (VC-1, etc.)
        "V_REAL/RV10",
        "V_REAL/RV20",
        "V_REAL/RV30",
        "V_REAL/RV40",
        "V_THEORA",
        "V_VP8",
        "V_VP9",  # VP9 in MKV (needs WebM container for browser)
        "V_AV1",  # AV1 (partial support, safer to reencode)
        # PyAV / FFmpeg codec names
        "hevc",
        "h265",
        "mpeg2video",
        "mpeg4",
        "vc1",
        "vp8",
        "vp9",
        "av1",
        "theora",
        "wmv3",
        "rv30",
        "rv40",
    }
)
# ────────────────────────────────────────────────────────────────────
# Audio codecs that need transcoding to AAC
# (superset of the list in audio_transcoder.py, uses both MKV and
# PyAV codec names for universal lookup)
# ────────────────────────────────────────────────────────────────────
AUDIO_NEEDS_TRANSCODE = frozenset(
    {
        # MKV CodecIDs
        "A_EAC3",
        "A_AC3",
        "A_DTS",
        "A_DTS/EXPRESS",
        "A_DTS/LOSSLESS",
        "A_OPUS",
        "A_VORBIS",
        "A_FLAC",
        "A_TRUEHD",
        "A_MLP",
        "A_PCM/INT/LIT",
        "A_PCM/INT/BIG",
        "A_PCM/FLOAT/IEEE",
        "A_REAL/28_8",
        "A_REAL/COOK",
        "A_REAL/SIPR",
        "A_REAL/ATRC",
        "A_MS/ACM",  # Generic Windows audio
        "A_MP3",
        "A_MPEG/L3",
        # PyAV / FFmpeg names
        "eac3",
        "ac3",
        "dts",
        "dca",
        "truehd",
        "mlp",
        "mp3",
        "opus",
        "vorbis",
        "flac",
        "pcm_s16le",
        "pcm_s24le",
        "pcm_f32le",
        "wmav2",
        "wmavoice",
        "wmapro",
        "cook",
        "sipr",
        "atrac3",
    }
)
# Map PyAV codec names to MKV CodecIDs (for the MKV fast-path)
_PYAV_TO_MKV_VIDEO = {
    "h264": "V_MPEG4/ISO/AVC",
    "hevc": "V_MPEGH/ISO/HEVC",
    "h265": "V_MPEGH/ISO/HEVC",
    "mpeg2video": "V_MPEG2",
    "vp8": "V_VP8",
    "vp9": "V_VP9",
    "av1": "V_AV1",
}
_PYAV_TO_MKV_AUDIO = {
    "aac": "A_AAC",
    "eac3": "A_EAC3",
    "ac3": "A_AC3",
    "dts": "A_DTS",
    "opus": "A_OPUS",
    "vorbis": "A_VORBIS",
    "flac": "A_FLAC",
    "mp3": "A_MPEG/L3",
    "truehd": "A_TRUEHD",
}
# ────────────────────────────────────────────────────────────────────
# NAL unit format conversion (Annex B ↔ AVCC)
# ────────────────────────────────────────────────────────────────────
# H.264 NAL types that belong in the init segment (avcC), not in samples.
# 7 = SPS, 8 = PPS, 9 = AUD (access unit delimiter, dropped entirely).
_H264_PARAM_NAL_TYPES = frozenset({7, 8, 9})  # SPS, PPS, AUD
def _find_annexb_nals(data: bytes) -> list[tuple[int, int]]:
"""
Find all NAL unit [start, end) byte ranges in Annex B formatted data.
Handles both 3-byte (00 00 01) and 4-byte (00 00 00 01) start codes.
Returns a list of (start, end) tuples pointing into *data*.
"""
size = len(data)
nals: list[tuple[int, int]] = []
i = 0
while i < size - 2:
# Scan for 0x000001 or 0x00000001
if data[i] != 0:
i += 1
continue
if data[i + 1] != 0:
i += 2
continue
if data[i + 2] == 1:
nal_start = i + 3
elif data[i + 2] == 0 and i + 3 < size and data[i + 3] == 1:
nal_start = i + 4
else:
i += 1
continue
# Record end of previous NAL
if nals:
nals[-1] = (nals[-1][0], i)
nals.append((nal_start, size))
i = nal_start
return nals
def is_annexb(data: bytes) -> bool:
    """
    Return True if *data* starts with an Annex B start code.

    Disambiguates AVCC (4-byte length prefix) from Annex B when the data
    begins with ``00 00 01 xx``: interpreting the first 4 bytes as a
    big-endian AVCC length, if that length fits within the buffer AND the
    following byte is a plausible H.264 NAL header, the data is treated
    as AVCC (not Annex B).
    """
    if len(data) < 5:
        return False
    prefix = bytes(data[:4])
    if prefix == b"\x00\x00\x00\x01":
        return True
    if prefix[:3] != b"\x00\x00\x01":
        return False
    # Could still be AVCC whose length happens to be 0x000001xx (256..511).
    possible_len = int.from_bytes(prefix, "big")
    if 0 < possible_len <= len(data) - 4:
        header = data[4]
        forbidden_zero = header & 0x80
        unit_type = header & 0x1F
        if not forbidden_zero and 1 <= unit_type <= 12:
            # Valid length + valid NAL header -> plausible AVCC.
            return False
    return True
def annexb_to_avcc(data: bytes, filter_ps: bool = True) -> bytes:
    """
    Convert Annex B (start-code-prefixed) NAL units to AVCC
    (4-byte length-prefixed) format suitable for fMP4 samples.

    Args:
        data: H.264 access unit in Annex B format.
        filter_ps: If True, strip SPS/PPS/AUD NAL units (they belong
            in the avcC box of the init segment, not in samples).

    Returns:
        The same NAL units with 4-byte big-endian length prefixes. May be
        empty when every NAL was filtered out (e.g. a packet holding only
        SPS/PPS/AUD) -- callers should drop such samples, since emitting
        the original Annex-B bytes would corrupt the fMP4 stream.
    """
    if not data or not is_annexb(data):
        # Empty input, or already AVCC: nothing to convert.
        return data
    nal_ranges = _find_annexb_nals(data)
    if not nal_ranges:
        return data
    converted = bytearray()
    for begin, stop in nal_ranges:
        # Drop zero-padding that precedes the next start code.
        while stop > begin and data[stop - 1] == 0:
            stop -= 1
        if stop <= begin:
            continue
        if filter_ps and (data[begin] & 0x1F) in _H264_PARAM_NAL_TYPES:
            continue
        converted += (stop - begin).to_bytes(4, "big")
        converted += data[begin:stop]
    return bytes(converted)
# H.264 profiles that require the avcC High Profile extension fields
# (chroma_format_idc, bit_depth_luma/chroma, numSpsExt).
_HIGH_PROFILE_IDCS = frozenset({100, 110, 122, 244, 44, 83, 86, 118, 128, 138, 139, 134})
def _fix_avcc_high_profile(avcc: bytes) -> bytes:
"""
Ensure an avcC record includes High Profile extension bytes.
The ISO/IEC 14496-15 spec requires additional fields after the PPS
section when ``AVCProfileIndication`` is 100 (High), 110, 122, or 244.
Some MKV muxers omit these, causing decoders to not know the chroma
format or bit depth, which leads to widespread decode errors.
If the extensions are missing, appends the defaults for 4:2:0 / 8-bit
with zero extended SPS sets.
"""
if len(avcc) < 7:
return avcc
if avcc[0] != 1:
return avcc # Not an avcC record
profile_idc = avcc[1]
if profile_idc not in _HIGH_PROFILE_IDCS:
return avcc # Not a High Profile variant, no extensions needed
# Walk past SPS and PPS sections to find where extensions should be
off = 5
num_sps = avcc[off] & 0x1F
off += 1
for _ in range(num_sps):
if off + 2 > len(avcc):
return avcc
sps_len = struct.unpack(">H", avcc[off : off + 2])[0]
off += 2 + sps_len
if off >= len(avcc):
return avcc
num_pps = avcc[off]
off += 1
for _ in range(num_pps):
if off + 2 > len(avcc):
return avcc
pps_len = struct.unpack(">H", avcc[off : off + 2])[0]
off += 2 + pps_len
# If there are already bytes after the PPS section, extensions exist
if off < len(avcc):
return avcc
# Append default High Profile extensions:
# chroma_format_idc = 1 (4:2:0) -> 0xFC | 0x01 = 0xFD (reserved 111111 + 01)
# bit_depth_luma_minus8 = 0 -> 0xF8 | 0x00 = 0xF8 (reserved 11111 + 000)
# bit_depth_chroma_minus8 = 0 -> 0xF8 | 0x00 = 0xF8 (reserved 11111 + 000)
# numOfSequenceParameterSetExt = 0
ext = bytearray(avcc)
ext.append(0xFD) # 111111_01 : chroma_format_idc = 1
ext.append(0xF8) # 11111_000 : bit_depth_luma_minus8 = 0
ext.append(0xF8) # 11111_000 : bit_depth_chroma_minus8 = 0
ext.append(0x00) # numOfSequenceParameterSetExt = 0
return bytes(ext)
def ensure_avcc_extradata(extradata: bytes) -> bytes:
    """
    Ensure h264 extradata is in avcC format for the fMP4 init segment.

    PyAV returns extradata in the container's native format:
      - MKV/MP4: avcC (first byte 0x01)
      - MPEG-TS: Annex B (starts with zero bytes)
    Annex B input is converted by collecting its SPS/PPS NAL units into a
    freshly built avcC record; avcC input is validated and has missing
    High Profile extension fields repaired.
    """
    if not extradata or len(extradata) < 4:
        return extradata
    if extradata[0] == 0x01:
        # Already avcC (configurationVersion == 1): just repair extensions.
        return _fix_avcc_high_profile(extradata)
    nal_ranges = _find_annexb_nals(extradata)
    if not nal_ranges:
        return extradata
    sps_units: list[bytes] = []
    pps_units: list[bytes] = []
    for begin, stop in nal_ranges:
        # Strip zero-padding before the next start code.
        while stop > begin and extradata[stop - 1] == 0:
            stop -= 1
        if stop <= begin:
            continue
        kind = extradata[begin] & 0x1F
        if kind == 7:  # SPS
            sps_units.append(extradata[begin:stop])
        elif kind == 8:  # PPS
            pps_units.append(extradata[begin:stop])
    if not sps_units or len(sps_units[0]) < 4:
        return extradata  # can't build avcC without a usable SPS
    first_sps = sps_units[0]
    out = bytearray(
        [
            1,  # configurationVersion
            first_sps[1],  # AVCProfileIndication
            first_sps[2],  # profile_compatibility
            first_sps[3],  # AVCLevelIndication
            0xFF,  # 6 bits reserved + lengthSizeMinusOne=3 -> 4-byte NAL lengths
            0xE0 | len(sps_units),  # 3 bits reserved + numOfSPS
        ]
    )
    for unit in sps_units:
        out += struct.pack(">H", len(unit)) + unit
    out.append(len(pps_units))  # numOfPPS
    for unit in pps_units:
        out += struct.pack(">H", len(unit)) + unit
    return _fix_avcc_high_profile(bytes(out))
def extract_sps_pps_from_annexb(data: bytes) -> bytes:
    """
    Build avcC-format extradata from SPS/PPS NAL units in Annex B data.

    Hardware encoders like VideoToolbox embed SPS/PPS as in-band NAL
    units in their first keyframe output rather than setting extradata
    on the codec context; this recovers those parameter sets for the
    fMP4 init segment.

    Returns:
        avcC bytes if a usable SPS was found, empty bytes otherwise.
    """
    if not data or not is_annexb(data):
        return b""
    nal_ranges = _find_annexb_nals(data)
    if not nal_ranges:
        return b""
    sps_units: list[bytes] = []
    pps_units: list[bytes] = []
    for begin, stop in nal_ranges:
        # Strip trailing zero-padding.
        while stop > begin and data[stop - 1] == 0:
            stop -= 1
        if stop <= begin:
            continue
        kind = data[begin] & 0x1F
        if kind == 7:  # SPS
            sps_units.append(data[begin:stop])
        elif kind == 8:  # PPS
            pps_units.append(data[begin:stop])
    if not sps_units or len(sps_units[0]) < 4:
        return b""
    first_sps = sps_units[0]
    avcc = bytearray(
        [
            1,  # configurationVersion
            first_sps[1],  # AVCProfileIndication
            first_sps[2],  # profile_compatibility
            first_sps[3],  # AVCLevelIndication
            0xFF,  # 6 bits reserved + lengthSizeMinusOne=3
            0xE0 | len(sps_units),  # 3 bits reserved + numOfSPS
        ]
    )
    for unit in sps_units:
        avcc += struct.pack(">H", len(unit)) + unit
    avcc.append(len(pps_units))  # numOfPPS
    for unit in pps_units:
        avcc += struct.pack(">H", len(unit)) + unit
    return bytes(avcc)
def video_needs_reencode(codec_id: str) -> bool:
    """Check if a video codec requires re-encoding for browser playback.

    Empty/None codec IDs are treated as not needing re-encode.
    """
    return bool(codec_id) and codec_id in VIDEO_NEEDS_REENCODE
def audio_needs_transcode(codec_id: str) -> bool:
    """Check if an audio codec requires transcoding for browser playback.

    Empty/None codec IDs are treated as not needing transcode.
    """
    return bool(codec_id) and codec_id in AUDIO_NEEDS_TRANSCODE
def is_browser_compatible(video_codec: str, audio_codec: str) -> bool:
    """
    Check if a video+audio combination is fully browser-compatible.

    Returns True only if BOTH video and audio can be played natively in
    an HTML5 <video> element inside an MP4 container. An empty codec
    string (stream absent) counts as compatible.
    """
    if video_codec and video_codec not in BROWSER_VIDEO_CODECS:
        return False
    return not audio_codec or audio_codec in BROWSER_AUDIO_CODECS
class TranscodeDecision:
    """Result of analyzing a stream's codec compatibility."""

    __slots__ = ("transcode_video", "transcode_audio", "video_codec", "audio_codec")

    def __init__(self, video_codec: str = "", audio_codec: str = "") -> None:
        self.video_codec = video_codec
        self.audio_codec = audio_codec
        self.transcode_video = video_needs_reencode(video_codec)
        self.transcode_audio = audio_needs_transcode(audio_codec)

    @property
    def needs_transcode(self) -> bool:
        """True if at least one stream must be converted."""
        return self.transcode_video or self.transcode_audio

    @property
    def passthrough_ok(self) -> bool:
        """True if the stream can be served untouched to a browser."""
        return not (self.transcode_video or self.transcode_audio)

    def __repr__(self) -> str:
        actions = [
            label
            for flag, label in (
                (self.transcode_video, f"video:{self.video_codec}->h264"),
                (self.transcode_audio, f"audio:{self.audio_codec}->aac"),
            )
            if flag
        ]
        summary = ", ".join(actions) if actions else "passthrough"
        return f"TranscodeDecision({summary})"

View File

@@ -0,0 +1,614 @@
"""
Container format probing -- MKV Cues and MP4 moov.
Pure Python probing using EBML parsing (MKV) and struct-based atom
scanning (MP4). No FFmpeg dependency.
Source-agnostic: accepts any MediaSource protocol implementation
(Telegram, HTTP, etc.) for byte-range reads.
Provides:
- probe_mkv_cues: probe MKV file to extract seek index (MKVCueIndex)
- probe_mp4_moov: probe MP4 file to extract moov atom and build seek index (MP4Index)
"""
import base64
import hashlib
import json
import logging
import struct
from mediaflow_proxy.utils import redis_utils
from mediaflow_proxy.remuxer.ebml_parser import (
MKVCueIndex,
build_cue_index,
parse_ebml_header,
parse_seek_head,
CUES,
INFO,
)
from mediaflow_proxy.remuxer.mp4_parser import (
MP4Index,
build_cue_points_from_moov,
is_mp4_header,
rewrite_moov_offsets,
)
logger = logging.getLogger(__name__)
# How much of the MKV header to fetch for SeekHead + Info parsing
_HEADER_PROBE_SIZE = 64 * 1024  # 64 KB
# Max Cues element size we'll attempt to fetch
_MAX_CUES_SIZE = 2 * 1024 * 1024  # 2 MB
# Redis cache for MKV Cue indexes (key: _CUE_INDEX_CACHE_PREFIX + cache_key)
_CUE_INDEX_CACHE_PREFIX = "mfp:cue_index:"
_CUE_INDEX_CACHE_TTL = 3600  # 1 hour -- expired entries are re-probed on demand
# =============================================================================
# MKV Cues probing
# =============================================================================
def derive_cue_cache_key(
source_key: str = "",
*,
chat_id: str | int | None = None,
message_id: int | None = None,
file_id: str | None = None,
) -> str:
"""
Derive a deterministic cache key for a file's cue index.
Accepts either a pre-computed source_key (from MediaSource.cache_key)
or legacy Telegram-style parameters for backwards compatibility.
"""
if source_key:
return source_key
if file_id:
raw = f"file_id:{file_id}"
elif chat_id is not None and message_id is not None:
raw = f"chat:{chat_id}:msg:{message_id}"
else:
return ""
return hashlib.sha256(raw.encode()).hexdigest()[:16]
async def _get_cached_cue_index(cache_key: str) -> MKVCueIndex | None:
    """Try to load a MKVCueIndex from Redis cache.

    Returns None when the key is empty, Redis is unavailable, the entry
    is missing, or the cached JSON payload fails validation.
    """
    if not cache_key:
        return None
    r = await redis_utils.get_redis()
    if r is None:
        return None
    redis_key = f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}"
    data = await r.get(redis_key)
    if not data:
        return None
    try:
        d = json.loads(data)
        # Binary fields are stored base64-encoded inside the JSON payload.
        seek_header = b""
        if d.get("seek_header_b64"):
            seek_header = base64.b64decode(d["seek_header_b64"])
        video_codec_private = b""
        if d.get("video_codec_private_b64"):
            video_codec_private = base64.b64decode(d["video_codec_private_b64"])
        index = MKVCueIndex(
            duration_ms=d["duration_ms"],
            timestamp_scale=d["timestamp_scale"],
            # JSON round-trips tuples as lists; rebuild the 2-tuples.
            cue_points=[(cp[0], cp[1]) for cp in d["cue_points"]],
            segment_data_offset=d["segment_data_offset"],
            first_cluster_offset=d.get("first_cluster_offset", 0),
            seek_header=seek_header,
            audio_codec_id=d.get("audio_codec_id", ""),
            audio_bitrate=d.get("audio_bitrate", 0),
            audio_channels=d.get("audio_channels", 0),
            audio_sample_rate=d.get("audio_sample_rate", 0.0),
            video_codec_id=d.get("video_codec_id", ""),
            video_codec_private=video_codec_private,
            video_width=d.get("video_width", 0),
            video_height=d.get("video_height", 0),
            video_fps=d.get("video_fps", 0.0),
            video_default_duration_ns=d.get("video_default_duration_ns", 0),
        )
        logger.debug("[container_probe] Loaded cue index from cache: %s", cache_key)
        return index
    except (KeyError, TypeError, json.JSONDecodeError) as e:
        # A malformed/stale entry degrades to a cache miss (re-probe).
        logger.warning("[container_probe] Invalid cached cue index: %s", e)
        return None
async def _set_cached_cue_index(cache_key: str, index: MKVCueIndex) -> None:
    """Cache a MKVCueIndex in Redis.

    No-op when cache_key is empty or Redis is unavailable. Binary fields
    (seek_header, video_codec_private) are base64-encoded so the payload
    stays valid JSON. Entries expire after _CUE_INDEX_CACHE_TTL seconds.
    """
    if not cache_key:
        return
    r = await redis_utils.get_redis()
    if r is None:
        return
    redis_key = f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}"
    data = json.dumps(
        {
            "duration_ms": index.duration_ms,
            "timestamp_scale": index.timestamp_scale,
            "cue_points": index.cue_points,
            "segment_data_offset": index.segment_data_offset,
            "first_cluster_offset": index.first_cluster_offset,
            "seek_header_b64": base64.b64encode(index.seek_header).decode() if index.seek_header else "",
            "audio_codec_id": index.audio_codec_id,
            "audio_bitrate": index.audio_bitrate,
            "audio_channels": index.audio_channels,
            "audio_sample_rate": index.audio_sample_rate,
            "video_codec_id": index.video_codec_id,
            "video_codec_private_b64": base64.b64encode(index.video_codec_private).decode()
            if index.video_codec_private
            else "",
            "video_width": index.video_width,
            "video_height": index.video_height,
            "video_fps": index.video_fps,
            "video_default_duration_ns": index.video_default_duration_ns,
        }
    )
    await r.set(redis_key, data, ex=_CUE_INDEX_CACHE_TTL)
    logger.debug("[container_probe] Cached cue index: %s", cache_key)
async def probe_mkv_cues(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MKVCueIndex | None:
    """
    Probe an MKV file's EBML header and Cues to build a seek index.

    Pure Python -- parses EBML structures directly, no FFmpeg involved.
    Makes up to two small byte-range reads via the provided source:
    1. First ~64KB: EBML header + SeekHead + Info (skipped if header_data provided)
    2. Cues section: byte range from SeekHead's Cues position

    Args:
        source: A MediaSource protocol implementation, or any object with
            a ``stream(offset, limit)`` async generator method.
        file_size: Total file size in bytes. If 0, tries ``source.file_size``.
        cache_key: Optional cache key for Redis caching. If empty, tries
            ``source.cache_key``.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MKVCueIndex if successful, None if the file has no Cues or parsing fails.
    """
    # Resolve file_size and cache_key from source if not provided.
    # ``or`` normalizes sources whose attributes exist but are None,
    # which would otherwise break the `file_size > 0` comparisons below.
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0) or 0
    if not cache_key:
        cache_key = getattr(source, "cache_key", "") or ""
    # Check cache first
    if cache_key:
        cached = await _get_cached_cue_index(cache_key)
        if cached:
            return cached
    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _HEADER_PROBE_SIZE
            # Collect chunks and join once -- repeated ``bytes +=`` is
            # quadratic in the number of chunks.
            header_chunks: list[bytes] = []
            async for chunk in source.stream(offset=0, limit=header_size):
                header_chunks.append(chunk)
            header_data = b"".join(header_chunks)
        if len(header_data) < 64:
            logger.warning("[container_probe] Header too small (%d bytes), cannot probe", len(header_data))
            return None
        # Step 2: Parse EBML header to find Segment data offset
        segment_data_offset = parse_ebml_header(header_data)
        # Step 3: Parse SeekHead to find Cues and Info positions
        seek_positions = parse_seek_head(header_data, segment_data_offset)
        if CUES not in seek_positions:
            logger.info("[container_probe] No Cues position in SeekHead, seeking not available")
            return None
        cues_relative_offset = seek_positions[CUES]
        cues_absolute_offset = segment_data_offset + cues_relative_offset
        logger.info(
            "[container_probe] SeekHead: Cues at offset %d (absolute %d), Info at %s",
            cues_relative_offset,
            cues_absolute_offset,
            seek_positions.get(INFO, "not found"),
        )
        # Step 4: Fetch the Cues element
        cues_max = file_size - cues_absolute_offset if file_size > 0 else _MAX_CUES_SIZE
        cues_fetch_size = min(_MAX_CUES_SIZE, cues_max)
        if cues_fetch_size <= 0:
            logger.warning("[container_probe] Cues offset %d beyond file size %d", cues_absolute_offset, file_size)
            return None
        cues_chunks: list[bytes] = []
        async for chunk in source.stream(offset=cues_absolute_offset, limit=cues_fetch_size):
            cues_chunks.append(chunk)
        cues_data = b"".join(cues_chunks)
        if len(cues_data) < 16:
            logger.warning("[container_probe] Cues data too small (%d bytes)", len(cues_data))
            return None
        # Step 5: Build the cue index
        index = build_cue_index(
            header_data=header_data,
            cues_data=cues_data,
            cues_file_offset=cues_absolute_offset,
            segment_data_offset=segment_data_offset,
        )
        # Cache the result
        if cache_key:
            await _set_cached_cue_index(cache_key, index)
        return index
    except Exception as e:
        # Broad catch is intentional: probing is best-effort; any parse or
        # I/O failure just disables seeking instead of breaking playback.
        logger.warning("[container_probe] Failed to probe MKV cues: %s", e)
        return None
# =============================================================================
# MP4 Moov probing
# =============================================================================
# Redis cache for MP4 indexes (key: _MP4_INDEX_CACHE_PREFIX + cache_key)
_MP4_INDEX_CACHE_PREFIX = "mfp:mp4_index:"
_MP4_INDEX_CACHE_TTL = 3600  # 1 hour
# How much to read from the start for ftyp + initial atom scanning
_MP4_HEADER_PROBE_SIZE = 64 * 1024  # 64 KB
# Max moov size we'll accept (larger moovs are rejected rather than fetched)
_MAX_MOOV_SIZE = 50 * 1024 * 1024  # 50 MB
# How much to read from the end of the file to find moov
_MP4_TAIL_PROBE_SIZE = 512 * 1024  # 512 KB
async def _get_cached_mp4_index(cache_key: str) -> MP4Index | None:
    """Fetch a previously cached MP4Index from Redis, if available.

    Returns None when caching is disabled, Redis is unreachable, the key
    is absent, or the cached payload cannot be decoded. ``moov_data`` is
    never cached (too large) and must be re-fetched by the caller.
    """
    if not cache_key:
        return None
    client = await redis_utils.get_redis()
    if client is None:
        return None
    payload = await client.get(f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}")
    if not payload:
        return None
    try:
        doc = json.loads(payload)
        ftyp_bytes = base64.b64decode(doc["ftyp_data_b64"]) if doc.get("ftyp_data_b64") else b""
        index = MP4Index(
            duration_ms=doc["duration_ms"],
            timescale=doc["timescale"],
            cue_points=[(entry[0], entry[1]) for entry in doc["cue_points"]],
            moov_offset=doc["moov_offset"],
            moov_size=doc["moov_size"],
            ftyp_data=ftyp_bytes,
            mdat_offset=doc["mdat_offset"],
            mdat_size=doc["mdat_size"],
            video_codec=doc.get("video_codec", ""),
            audio_codec=doc.get("audio_codec", ""),
            # moov_data is NOT cached (too large), it will be re-fetched
        )
        logger.debug("[container_probe] Loaded MP4 index from cache: %s", cache_key)
        return index
    except (KeyError, TypeError, json.JSONDecodeError) as e:
        logger.warning("[container_probe] Invalid cached MP4 index: %s", e)
        return None
async def _set_cached_mp4_index(cache_key: str, index: MP4Index) -> None:
    """Persist an MP4Index to Redis with a TTL (moov_data deliberately excluded)."""
    if not cache_key:
        return
    client = await redis_utils.get_redis()
    if client is None:
        return
    doc = {
        "duration_ms": index.duration_ms,
        "timescale": index.timescale,
        "cue_points": index.cue_points,
        "moov_offset": index.moov_offset,
        "moov_size": index.moov_size,
        "ftyp_data_b64": base64.b64encode(index.ftyp_data).decode() if index.ftyp_data else "",
        "mdat_offset": index.mdat_offset,
        "mdat_size": index.mdat_size,
        "video_codec": index.video_codec,
        "audio_codec": index.audio_codec,
    }
    await client.set(
        f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}",
        json.dumps(doc),
        ex=_MP4_INDEX_CACHE_TTL,
    )
    logger.debug("[container_probe] Cached MP4 index: %s", cache_key)
def _scan_top_level_atoms(data: bytes) -> list[tuple[bytes, int, int]]:
"""
Scan top-level atom headers from raw file bytes.
Returns:
List of (box_type, absolute_offset, total_size) for each atom found.
"""
atoms = []
offset = 0
while offset + 8 <= len(data):
size = struct.unpack_from(">I", data, offset)[0]
box_type = data[offset + 4 : offset + 8]
if size == 1: # Extended size
if offset + 16 > len(data):
break
size = struct.unpack_from(">Q", data, offset + 8)[0]
elif size == 0:
# Extends to end of file - we can't know the real size from
# a partial read, but record what we have
atoms.append((box_type, offset, 0))
break
if size < 8:
break
atoms.append((box_type, offset, size))
offset += size
return atoms
async def probe_mp4_moov(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MP4Index | None:
    """
    Probe an MP4 file's moov atom to build a seek index.

    Pure Python -- scans MP4 box headers with struct, no FFmpeg involved.

    Strategy:
    1. Read first ~64KB to check for ftyp (MP4 signature).
    2. Scan top-level atoms to find moov and mdat.
    3. If moov is at the start (faststart), read it from the header data.
    4. If moov is not in the header, read from the tail of the file.
    5. Parse moov sample tables to build cue points.

    Args:
        source: A MediaSource protocol implementation with stream(offset, limit).
        file_size: Total file size in bytes.
        cache_key: Optional cache key for Redis caching.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MP4Index if successful, None if not an MP4 or parsing fails.
        Never raises: any parsing/IO failure is logged and mapped to None.
    """
    # Fall back to source-provided metadata when the caller didn't supply it.
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0)
    if not cache_key:
        cache_key = getattr(source, "cache_key", "")
    # Check cache first
    if cache_key:
        cached = await _get_cached_mp4_index(cache_key)
        if cached:
            # Re-fetch moov_data (not cached due to size) and rewrite offsets
            if cached.moov_size > 0 and cached.moov_size <= _MAX_MOOV_SIZE:
                moov_data = b""
                async for chunk in source.stream(offset=cached.moov_offset, limit=cached.moov_size):
                    moov_data += chunk
                if cached.mdat_offset >= 0:
                    # Same shift as Step 8 below: stco/co64 must point into the
                    # pipe layout (ftyp + moov + mdat), not the original file.
                    new_mdat_start = len(cached.ftyp_data) + cached.moov_size
                    offset_delta = new_mdat_start - cached.mdat_offset
                    if offset_delta != 0:
                        moov_data = rewrite_moov_offsets(moov_data, offset_delta)
                cached.moov_data = moov_data
            return cached
    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_MP4_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _MP4_HEADER_PROBE_SIZE
            header_data = b""
            async for chunk in source.stream(offset=0, limit=header_size):
                header_data += chunk
        if len(header_data) < 12:
            return None
        # Step 2: Check for ftyp
        if not is_mp4_header(header_data):
            return None
        logger.info("[container_probe] MP4 detected, scanning atoms (header=%d bytes)", len(header_data))
        # Step 3: Scan top-level atoms from header
        atoms = _scan_top_level_atoms(header_data)
        ftyp_offset = -1
        ftyp_size = 0
        moov_offset = -1
        moov_size = 0
        mdat_offset = -1
        mdat_size = 0
        for box_type, atom_offset, atom_size in atoms:
            if box_type == b"ftyp":
                ftyp_offset = atom_offset
                ftyp_size = atom_size
            elif box_type == b"moov":
                moov_offset = atom_offset
                moov_size = atom_size
            elif box_type == b"mdat":
                mdat_offset = atom_offset
                mdat_size = atom_size
        # Step 4: If moov not found in header, scan from tail
        # (non-faststart files place moov after mdat, near end of file).
        if moov_offset < 0 and file_size > 0:
            tail_start = max(0, file_size - _MP4_TAIL_PROBE_SIZE)
            tail_data = b""
            async for chunk in source.stream(offset=tail_start, limit=file_size - tail_start):
                tail_data += chunk
            if tail_data:
                tail_atoms = _scan_top_level_atoms(tail_data)
                for box_type, rel_offset, atom_size in tail_atoms:
                    abs_offset = tail_start + rel_offset
                    if box_type == b"moov":
                        moov_offset = abs_offset
                        moov_size = atom_size
                    elif box_type == b"mdat" and mdat_offset < 0:
                        mdat_offset = abs_offset
                        mdat_size = atom_size
                # If the initial scan yielded no moov (tail_start may land
                # inside a large mdat payload producing garbage atom headers),
                # resync by scanning 8-byte aligned windows for b"moov".
                if moov_offset < 0:
                    needle = b"moov"
                    search_pos = 0
                    while search_pos + 8 <= len(tail_data):
                        idx = tail_data.find(needle, search_pos)
                        if idx < 0 or idx < 4:
                            break
                        # The 4 bytes preceding the fourcc hold the box size;
                        # accept the match only if that size is plausible.
                        candidate_size = struct.unpack_from(">I", tail_data, idx - 4)[0]
                        if 8 < candidate_size <= _MAX_MOOV_SIZE:
                            moov_offset = tail_start + idx - 4
                            moov_size = candidate_size
                            break
                        search_pos = idx + 4
        if moov_offset < 0:
            logger.info("[container_probe] No moov atom found in MP4")
            return None
        if moov_size <= 0 or moov_size > _MAX_MOOV_SIZE:
            logger.warning("[container_probe] moov size %d is invalid or too large", moov_size)
            return None
        logger.info(
            "[container_probe] MP4 atoms: moov at %d (%d bytes), mdat at %d (%d bytes)",
            moov_offset,
            moov_size,
            mdat_offset,
            mdat_size,
        )
        # Step 5: Fetch full moov atom
        # Check if moov is already contained in the header data we read
        if moov_offset + moov_size <= len(header_data):
            moov_data = header_data[moov_offset : moov_offset + moov_size]
        else:
            moov_data = b""
            async for chunk in source.stream(offset=moov_offset, limit=moov_size):
                moov_data += chunk
        if len(moov_data) < moov_size:
            logger.warning(
                "[container_probe] Incomplete moov: got %d of %d bytes",
                len(moov_data),
                moov_size,
            )
            return None
        # Step 6: Parse moov body (skip box header)
        # Determine header size (8 bytes standard, 16 with 64-bit extended size)
        raw_size = struct.unpack_from(">I", moov_data, 0)[0]
        hdr_size = 16 if raw_size == 1 else 8
        moov_body = moov_data[hdr_size:]
        cue_points, duration_ms, timescale, video_codec, audio_codec = build_cue_points_from_moov(moov_body)
        # If mdat wasn't found via header scan, it's likely right after ftyp
        # or right after moov. Common layouts:
        #   ftyp + moov + mdat (faststart) or ftyp + mdat + moov
        if mdat_offset < 0:
            # Walk atoms to find mdat by scanning just enough from the file
            # In most cases, mdat is either before or after moov
            if moov_offset < file_size // 2:
                # moov is early -> mdat likely follows
                mdat_search_offset = moov_offset + moov_size
            else:
                # moov is late -> mdat likely right after ftyp
                ftyp_size = struct.unpack_from(">I", header_data, 0)[0]
                if ftyp_size == 1:
                    ftyp_size = struct.unpack_from(">Q", header_data, 8)[0]
                mdat_search_offset = ftyp_size
            # Read a small amount to find the mdat header
            mdat_header = b""
            async for chunk in source.stream(offset=mdat_search_offset, limit=16):
                mdat_header += chunk
            if len(mdat_header) >= 8:
                box_type = mdat_header[4:8]
                if box_type == b"mdat":
                    mdat_offset = mdat_search_offset
                    raw_sz = struct.unpack_from(">I", mdat_header, 0)[0]
                    if raw_sz == 1 and len(mdat_header) >= 16:
                        mdat_size = struct.unpack_from(">Q", mdat_header, 8)[0]
                    else:
                        mdat_size = raw_sz
        # Step 7: Extract ftyp data (always in the header since it's the first atom)
        ftyp_data = b""
        if ftyp_offset >= 0 and ftyp_size > 0 and ftyp_offset + ftyp_size <= len(header_data):
            ftyp_data = header_data[ftyp_offset : ftyp_offset + ftyp_size]
        # Step 8: Rewrite moov chunk offsets for faststart pipe layout.
        # The pipe stream will be: ftyp + moov + mdat. The stco/co64
        # offsets in the original moov point to positions in the original
        # file. We need to shift them to account for the new layout.
        #   New mdat position = ftyp_size + moov_size
        #   Delta = new_mdat_position - original_mdat_offset
        if mdat_offset >= 0:
            new_mdat_start = len(ftyp_data) + moov_size
            offset_delta = new_mdat_start - mdat_offset
            if offset_delta != 0:
                moov_data = rewrite_moov_offsets(moov_data, offset_delta)
        index = MP4Index(
            duration_ms=duration_ms,
            timescale=timescale,
            cue_points=cue_points,
            moov_offset=moov_offset,
            moov_size=moov_size,
            moov_data=moov_data,
            ftyp_data=ftyp_data,
            mdat_offset=mdat_offset,
            mdat_size=mdat_size,
            video_codec=video_codec,
            audio_codec=audio_codec,
        )
        logger.info(
            "[container_probe] MP4 index: duration=%.1fs, %d cue points, video=%s, audio=%s",
            duration_ms / 1000.0,
            len(cue_points),
            video_codec,
            audio_codec,
        )
        if cache_key:
            await _set_cached_mp4_index(cache_key, index)
        return index
    except Exception as e:
        logger.warning("[container_probe] Failed to probe MP4 moov: %s", e)
        return None

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,151 @@
"""
HLS VOD playlist generator for on-the-fly fMP4 transcoding.
Produces an M3U8 VOD playlist from an ``MKVCueIndex`` or ``MP4Index``.
Consecutive keyframes that are closer together than the target segment
duration are merged into a single HLS segment, matching the behaviour
of ``ffmpeg -hls_time``.
The init segment is referenced via ``#EXT-X-MAP``.
Requires ``#EXT-X-VERSION:7`` for fMP4 (CMAF) segments.
"""
from __future__ import annotations
import math
from typing import TYPE_CHECKING
if TYPE_CHECKING:
pass
def merge_cue_points(
    cue_points: list[tuple[float, int]],
    target_duration_ms: float = 5000.0,
) -> list[tuple[float, int]]:
    """Merge consecutive keyframes into segments of *>= target_duration_ms*.

    Mirrors ``ffmpeg -hls_time`` behaviour: a keyframe starts a new segment
    only when it lies at least ``target_duration_ms`` after the current
    segment's start; anything closer is absorbed into the current segment.

    Side-effects:
    * Duplicate byte offsets are removed (earliest timestamp wins).
    * Very short "runt" segments (e.g. 0.3 s) are eliminated.

    Args:
        cue_points: Sorted ``(time_ms, byte_offset)`` list.
        target_duration_ms: Minimum segment duration in milliseconds.

    Returns:
        A reduced list of ``(time_ms, byte_offset)`` tuples representing
        the merged segment boundaries.
    """
    if not cue_points:
        return []
    # Deduplicate byte offsets, keeping the EARLIEST timestamp per offset.
    # Some MKV files expose several cue times for one cluster offset; keeping
    # a later duplicate would desync segment start times from the actual
    # bytes and can cause timestamp regressions. Sorting by (time, offset)
    # makes "earliest wins" deterministic.
    unique: list[tuple[float, int]] = []
    offsets_seen: set[int] = set()
    for ts, off in sorted(cue_points, key=lambda cp: (cp[0], cp[1])):
        if off in offsets_seen:
            continue
        offsets_seen.add(off)
        unique.append((ts, off))
    if not unique:
        return []
    # Greedily absorb keyframes until the target window has elapsed.
    boundaries: list[tuple[float, int]] = [unique[0]]
    for candidate in unique[1:]:
        if candidate[0] - boundaries[-1][0] >= target_duration_ms:
            boundaries.append(candidate)
    return boundaries
def generate_vod_playlist(
    cue_points: list[tuple[float, int]],
    duration_ms: float,
    init_url: str,
    segment_url_template: str,
    target_segment_duration_ms: float = 5000.0,
) -> str:
    """Build an HLS VOD M3U8 playlist from cue-point data.

    Keyframes closer together than *target_segment_duration_ms* are merged
    into one segment (same policy as ``ffmpeg -hls_time``). Segment URLs
    come from *segment_url_template* with its ``{seg}``, ``{start_ms}`` and
    ``{end_ms}`` placeholders substituted per segment.

    Args:
        cue_points: Sorted list of ``(time_ms, byte_offset)`` tuples.
        duration_ms: Total media duration in milliseconds.
        init_url: URL for the fMP4 init segment (``#EXT-X-MAP`` URI).
        segment_url_template: URL template containing ``{seg}``,
            ``{start_ms}`` and ``{end_ms}`` placeholders.
        target_segment_duration_ms: Target minimum segment duration.

    Returns:
        Complete M3U8 playlist string ('' when there are no cue points).
    """
    if not cue_points:
        return ""
    boundaries = merge_cue_points(cue_points, target_segment_duration_ms)
    # Derive (start_ms, end_ms, duration_s) per segment; the final segment
    # runs to the end of the media. Durations are floored at 1 ms.
    segments: list[tuple[float, float, float]] = []
    for idx, (seg_start, _offset) in enumerate(boundaries):
        seg_end = boundaries[idx + 1][0] if idx + 1 < len(boundaries) else duration_ms
        segments.append((seg_start, seg_end, max((seg_end - seg_start) / 1000.0, 0.001)))
    if not segments:
        return ""
    # EXT-X-TARGETDURATION must be >= the longest segment, rounded up.
    longest = max(seg[2] for seg in segments)
    target_duration = max(math.ceil(longest), 1)
    out: list[str] = [
        "#EXTM3U",
        "#EXT-X-VERSION:7",
        f"#EXT-X-TARGETDURATION:{target_duration}",
        "#EXT-X-PLAYLIST-TYPE:VOD",
        "#EXT-X-MEDIA-SEQUENCE:0",
        f'#EXT-X-MAP:URI="{init_url}"',
    ]
    for seg_num, (seg_start, seg_end, seg_dur) in enumerate(segments):
        out.append(f"#EXTINF:{seg_dur:.3f},")
        out.append(
            segment_url_template.replace("{seg}", str(seg_num))
            .replace("{start_ms}", str(int(seg_start)))
            .replace("{end_ms}", str(int(seg_end)))
        )
    out.append("#EXT-X-ENDLIST")
    out.append("")  # trailing newline
    return "\n".join(out)

View File

@@ -0,0 +1,234 @@
"""
Abstract media source protocol for source-agnostic transcode pipeline.
Decouples the transcode pipeline, MKV cue probing, and seeking logic
from any specific transport (Telegram, HTTP, etc.). Each transport
implements the MediaSource protocol to provide byte-range streaming.
"""
import hashlib
import logging
from collections.abc import AsyncIterator
from typing import Protocol, runtime_checkable
from urllib.parse import urlparse, unquote
from mediaflow_proxy.utils.http_client import create_aiohttp_session
from mediaflow_proxy.utils.telegram import telegram_manager
logger = logging.getLogger(__name__)
# Extensions mapped to container format hints used by transcode_handler
# EBML-based containers (Matroska/WebM).
_MKV_EXTENSIONS = frozenset({".mkv", ".webm"})
# ISO BMFF / QuickTime-family containers.
_MP4_EXTENSIONS = frozenset({".mp4", ".m4v", ".mov", ".m4a", ".3gp"})
def _extract_extension(path: str) -> str:
"""Extract lowercase file extension (e.g. '.mkv') from a path or URL."""
# Strip query/fragment first for URL paths
dot_pos = path.rfind(".")
if dot_pos < 0:
return ""
ext = path[dot_pos:].lower()
# Trim anything after the extension (query params from raw paths)
for ch in ("?", "#", "&"):
idx = ext.find(ch)
if idx > 0:
ext = ext[:idx]
return ext
def filename_hint_from_url(url: str) -> str:
    """Derive a lowercase extension hint (e.g. '.mkv', '.mp4') from a URL's path."""
    try:
        return _extract_extension(unquote(urlparse(url).path))
    except Exception:
        # Malformed URLs simply yield no hint.
        return ""
def filename_hint_from_name(filename: str) -> str:
    """Derive a lowercase extension hint from a bare filename string."""
    if not filename:
        return ""
    return _extract_extension(filename)
@runtime_checkable
class MediaSource(Protocol):
    """
    Protocol for streaming media byte ranges.

    Implementations must provide:
    - stream(): async iterator of bytes from offset/limit
    - file_size: total file size in bytes
    - cache_key: deterministic key for caching (cue index, etc.)
    - filename_hint: optional file extension hint (e.g. '.mkv', '.mp4')

    ``@runtime_checkable`` permits ``isinstance(obj, MediaSource)`` checks
    (structural: only the presence of members is verified).
    """

    @property
    def file_size(self) -> int:
        """Total file size in bytes."""
        ...

    @property
    def cache_key(self) -> str:
        """Deterministic cache key derived from the source identity."""
        ...

    @property
    def filename_hint(self) -> str:
        """Optional file extension hint (e.g. '.mkv', '.mp4') for format detection."""
        ...

    async def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
        """
        Stream bytes from the source.

        Args:
            offset: Byte offset to start from.
            limit: Number of bytes to read. None = read to end.

        Yields:
            Chunks of bytes.
        """
        ...
class TelegramMediaSource:
"""
MediaSource backed by Telegram MTProto downloads.
Supports two download modes:
* **parallel** (default): Uses ``ParallelTransferrer`` with multiple
MTProtoSender connections for maximum throughput. Best for full-file
streaming (e.g. ``/proxy/telegram/stream``).
* **single** (``use_single_client=True``): Uses Telethon's built-in
``iter_download`` over the existing client connection. Avoids the
overhead of creating/destroying extra connections for each request,
ideal for small byte-range fetches like HLS segments and probe
headers.
"""
def __init__(
self,
telegram_ref,
file_size: int,
file_name: str = "",
*,
use_single_client: bool = False,
) -> None:
self._ref = telegram_ref
self._file_size = file_size
self._filename_hint = filename_hint_from_name(file_name)
self._use_single_client = use_single_client
@property
def file_size(self) -> int:
return self._file_size
@property
def cache_key(self) -> str:
ref = self._ref
if ref.file_id:
raw = f"file_id:{ref.file_id}"
elif ref.chat_id is not None and ref.message_id is not None:
raw = f"chat:{ref.chat_id}:msg:{ref.message_id}"
else:
return ""
return hashlib.sha256(raw.encode()).hexdigest()[:16]
@property
def filename_hint(self) -> str:
return self._filename_hint
async def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
effective_limit = limit or self._file_size
if self._use_single_client:
async for chunk in telegram_manager.stream_media_single(
self._ref,
offset=offset,
limit=effective_limit,
file_size=self._file_size,
):
yield chunk
else:
async for chunk in telegram_manager.stream_media(
self._ref,
offset=offset,
limit=effective_limit,
file_size=self._file_size,
):
yield chunk
class HTTPMediaSource:
"""MediaSource backed by HTTP byte-range requests via aiohttp."""
def __init__(self, url: str, headers: dict | None = None, file_size: int = 0) -> None:
self._url = url
self._headers = headers or {}
self._file_size = file_size
self._filename_hint = filename_hint_from_url(url)
@property
def file_size(self) -> int:
return self._file_size
@property
def cache_key(self) -> str:
return hashlib.sha256(self._url.encode()).hexdigest()[:16]
@property
def filename_hint(self) -> str:
return self._filename_hint
async def resolve_file_size(self) -> int:
"""Perform a HEAD request to determine file size if not already known."""
if self._file_size > 0:
return self._file_size
async with create_aiohttp_session(self._url, headers=self._headers) as (session, proxy_url):
async with session.head(
self._url,
headers=self._headers,
proxy=proxy_url,
allow_redirects=True,
) as resp:
cl = resp.headers.get("content-length")
if cl:
self._file_size = int(cl)
else:
# Try GET with range to get content-range
async with session.get(
self._url,
headers={**self._headers, "range": "bytes=0-0"},
proxy=proxy_url,
allow_redirects=True,
) as range_resp:
cr = range_resp.headers.get("content-range", "")
if "/" in cr:
try:
self._file_size = int(cr.split("/")[-1])
except ValueError:
pass
return self._file_size
async def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
headers = dict(self._headers)
if offset > 0 or limit is not None:
end = ""
if limit is not None:
end = str(offset + limit - 1)
headers["range"] = f"bytes={offset}-{end}"
async with create_aiohttp_session(self._url, headers=headers) as (session, proxy_url):
async with session.get(
self._url,
headers=headers,
proxy=proxy_url,
allow_redirects=True,
) as resp:
resp.raise_for_status()
async for chunk in resp.content.iter_any():
yield chunk

View File

@@ -0,0 +1,469 @@
"""
Streaming MKV demuxer.
Reads an MKV byte stream via an async iterator and yields individual media
frames (MKVFrame) with absolute timestamps. Designed for on-the-fly remuxing
without buffering the entire file.
Architecture:
AsyncIterator[bytes] -> StreamBuffer -> EBML parsing -> MKVFrame yields
The demuxer works in two phases:
1. read_header(): Consume bytes until Tracks is fully parsed, returning
a list of MKVTrack with codec metadata.
2. iter_frames(): Yield MKVFrame objects from Cluster/SimpleBlock data
as clusters arrive.
"""
import logging
from collections import deque
from collections.abc import AsyncIterator
from dataclasses import dataclass, field

from mediaflow_proxy.remuxer.ebml_parser import (
    CLUSTER,
    CLUSTER_TIMESTAMP,
    EBML_HEADER,
    INFO,
    MKVFrame,
    MKVTrack,
    SEGMENT,
    SIMPLE_BLOCK,
    BLOCK_GROUP,
    TRACKS,
    TIMESTAMP_SCALE,
    DURATION,
    UNKNOWN_SIZE,
    extract_block_frames,
    parse_tracks,
    read_element_id,
    read_element_size,
    read_float,
    read_uint,
    _parse_block_group,
    iter_elements,
)
logger = logging.getLogger(__name__)
class StreamBuffer:
    """
    Accumulating byte buffer for streaming EBML parsing.

    Collects chunks from an async byte source and provides read-ahead
    capabilities for EBML element parsing. Supports consuming parsed
    bytes to keep memory usage bounded.

    Chunks are stored in a deque so that removing from the front is O(1)
    per chunk; the previous list-based version paid an O(n) shift on every
    ``list.pop(0)`` while consuming.
    """

    def __init__(self) -> None:
        self._chunks: deque[bytes] = deque()
        self._total: int = 0  # Bytes currently buffered
        self._consumed: int = 0  # Logical bytes consumed (for offset tracking)

    @property
    def available(self) -> int:
        """Number of buffered bytes available for reading."""
        return self._total

    @property
    def consumed(self) -> int:
        """Total bytes consumed so far (for absolute offset tracking)."""
        return self._consumed

    def append(self, data: bytes) -> None:
        """Add bytes to the buffer (empty chunks are ignored)."""
        if data:
            self._chunks.append(data)
            self._total += len(data)

    def peek(self, size: int) -> bytes:
        """Read up to size bytes without consuming."""
        if size <= 0:
            return b""
        result = bytearray()
        remaining = size
        for chunk in self._chunks:
            if remaining <= 0:
                break
            take = min(len(chunk), remaining)
            result.extend(chunk[:take])
            remaining -= take
        return bytes(result)

    def get_all(self) -> bytes:
        """Get all buffered data as a single bytes object (without consuming).

        Coalesces the internal chunks into one as a side effect, so repeated
        calls are cheap until new data is appended.
        """
        if len(self._chunks) == 1:
            return self._chunks[0]
        data = b"".join(self._chunks)
        self._chunks = deque([data])
        return data

    def consume(self, size: int) -> bytes:
        """Remove and return size bytes from the front of the buffer."""
        if size <= 0:
            return b""
        if size > self._total:
            size = self._total
        result = bytearray()
        remaining = size
        while remaining > 0 and self._chunks:
            chunk = self._chunks[0]
            if len(chunk) <= remaining:
                # Whole chunk consumed: O(1) removal from the left.
                result.extend(chunk)
                remaining -= len(chunk)
                self._chunks.popleft()
            else:
                # Partial chunk: keep the tail in place.
                result.extend(chunk[:remaining])
                self._chunks[0] = chunk[remaining:]
                remaining = 0
        consumed = len(result)
        self._total -= consumed
        self._consumed += consumed
        return bytes(result)

    def skip(self, size: int) -> int:
        """Discard size bytes from the front. Returns actual bytes skipped."""
        if size <= 0:
            return 0
        actual = min(size, self._total)
        remaining = actual
        while remaining > 0 and self._chunks:
            chunk = self._chunks[0]
            if len(chunk) <= remaining:
                remaining -= len(chunk)
                self._chunks.popleft()
            else:
                self._chunks[0] = chunk[remaining:]
                remaining = 0
        self._total -= actual
        self._consumed += actual
        return actual
@dataclass
class MKVHeader:
    """Parsed MKV header metadata, populated by MKVDemuxer.read_header()."""

    # Track metadata (codec IDs, numbers) parsed from the Tracks element.
    tracks: list[MKVTrack] = field(default_factory=list)
    # Nanoseconds per timestamp tick (TimestampScale element).
    timestamp_scale_ns: int = 1_000_000  # Default 1ms
    # Total duration in milliseconds; stays 0.0 if Info has no Duration.
    duration_ms: float = 0.0
    segment_data_offset: int = 0  # Absolute byte offset of Segment children
class MKVDemuxer:
    """
    Streaming async MKV demuxer.

    Reads an MKV byte stream from an async iterator and provides:
    - read_header(): Parse EBML header + Segment metadata + Tracks
    - iter_frames(): Yield MKVFrame objects from Clusters

    Usage:
        demuxer = MKVDemuxer()
        header = await demuxer.read_header(source)
        async for frame in demuxer.iter_frames(source):
            process(frame)
    """

    # Minimum bytes to try parsing an element header (ID + size)
    _MIN_ELEMENT_HEADER = 12

    def __init__(self) -> None:
        self._buf = StreamBuffer()
        self._header: MKVHeader | None = None
        self._scale_ms: float = 1.0  # timestamp_scale / 1_000_000

    @property
    def header(self) -> MKVHeader | None:
        """Parsed header metadata, or None until read_header() completes."""
        return self._header

    async def read_header(self, source: AsyncIterator[bytes]) -> MKVHeader:
        """
        Read and parse the MKV header (EBML header, Segment, Info, Tracks).

        Consumes bytes from source until Tracks is fully parsed. Any leftover
        bytes (start of first Cluster) remain in the internal buffer for
        iter_frames().

        Returns:
            MKVHeader with track info and timing metadata.

        Raises:
            ValueError: If the stream is not MKV (bad EBML/Segment headers)
                or ends before an EBML header can be read.
        """
        header = MKVHeader()
        # Phase 1: Accumulate enough data for EBML header + Segment header
        await self._ensure_bytes(source, 64)
        data = self._buf.get_all()
        if len(data) < 4:
            raise ValueError(
                f"Source ended prematurely: got {len(data)} bytes, need at least an EBML header (source disconnected?)"
            )
        pos = 0
        # Parse EBML Header
        eid, pos = read_element_id(data, pos)
        if eid != EBML_HEADER:
            raise ValueError(f"Not an MKV file: expected EBML header, got 0x{eid:X}")
        size, pos = read_element_size(data, pos)
        if size == UNKNOWN_SIZE:
            raise ValueError("EBML header has unknown size")
        pos += size  # Skip EBML header content
        # Parse Segment element header
        eid, pos = read_element_id(data, pos)
        if eid != SEGMENT:
            raise ValueError(f"Expected Segment, got 0x{eid:X}")
        _seg_size, pos = read_element_size(data, pos)
        header.segment_data_offset = self._buf.consumed + pos
        # Phase 2: Parse Segment children until we have Tracks
        # We need to iterate top-level Segment children: SeekHead, Info, Tracks
        # Stop when we hit the first Cluster (media data).
        tracks_found = False
        while not tracks_found:
            # Ensure we have enough for element header
            await self._ensure_bytes(source, pos + self._MIN_ELEMENT_HEADER)
            data = self._buf.get_all()
            if pos >= len(data):
                break
            try:
                eid, pos2 = read_element_id(data, pos)
                size, pos3 = read_element_size(data, pos2)
            except (ValueError, IndexError):
                # Possibly a header truncated at the buffer edge: fetch a
                # little more and retry once before giving up.
                await self._ensure_bytes(source, pos + 32)
                data = self._buf.get_all()
                try:
                    eid, pos2 = read_element_id(data, pos)
                    size, pos3 = read_element_size(data, pos2)
                except (ValueError, IndexError):
                    break
            if eid == CLUSTER:
                # Reached media data; header parsing is done.
                # Don't consume the Cluster -- leave it for iter_frames.
                break
            if size == UNKNOWN_SIZE:
                # Can't handle unknown-size elements in header
                logger.warning("[mkv_demuxer] Unknown-size element 0x%X in header at pos %d", eid, pos)
                break
            # Ensure we have the full element
            elem_end = pos3 + size
            await self._ensure_bytes(source, elem_end)
            data = self._buf.get_all()
            if eid == INFO:
                self._parse_info_element(data, pos3, pos3 + size, header)
            elif eid == TRACKS:
                header.tracks = parse_tracks(data, pos3, pos3 + size)
                tracks_found = True
                logger.info(
                    "[mkv_demuxer] Parsed %d tracks: %s",
                    len(header.tracks),
                    ", ".join(f"#{t.track_number}={t.codec_id}" for t in header.tracks),
                )
            pos = elem_end
        # Consume everything up to the current position (Cluster boundary)
        self._buf.consume(pos)
        # Set timing scale
        self._scale_ms = header.timestamp_scale_ns / 1_000_000.0
        self._header = header
        return header

    async def iter_frames(self, source: AsyncIterator[bytes]) -> AsyncIterator[MKVFrame]:
        """
        Yield MKVFrame objects from Cluster/SimpleBlock data.

        Must be called after read_header(). Continues consuming bytes from
        source, parsing Clusters and yielding individual frames. Known-size
        Clusters are buffered fully before parsing; unknown-size Clusters
        are parsed child-by-child.
        """
        if self._header is None:
            raise RuntimeError("read_header() must be called before iter_frames()")
        while True:
            # Try to read the next element header
            if not await self._ensure_bytes_soft(source, self._MIN_ELEMENT_HEADER):
                break
            data = self._buf.get_all()
            pos = 0
            try:
                eid, pos2 = read_element_id(data, pos)
                size, pos3 = read_element_size(data, pos2)
            except (ValueError, IndexError):
                # Try to get more data
                if not await self._ensure_bytes_soft(source, len(data) + 4096):
                    break
                data = self._buf.get_all()
                try:
                    eid, pos2 = read_element_id(data, pos)
                    size, pos3 = read_element_size(data, pos2)
                except (ValueError, IndexError):
                    break
            if eid == CLUSTER:
                if size == UNKNOWN_SIZE:
                    # Unknown-size Cluster: parse children until we hit the next
                    # Cluster or run out of data
                    self._buf.consume(pos3)  # consume Cluster header
                    async for frame in self._parse_unknown_size_cluster(source):
                        yield frame
                else:
                    # Known-size Cluster: ensure we have all data
                    elem_end = pos3 + size
                    await self._ensure_bytes(source, elem_end)
                    data = self._buf.get_all()
                    for frame in self._parse_cluster_data(data, pos3, pos3 + size):
                        yield frame
                    self._buf.consume(elem_end)
            else:
                # Skip non-Cluster top-level elements
                if size == UNKNOWN_SIZE:
                    break
                elem_end = pos3 + size
                if elem_end > len(data):
                    # Need to skip bytes we don't have yet
                    self._buf.consume(len(data))
                    skip_remaining = elem_end - len(data)
                    await self._skip_bytes(source, skip_remaining)
                else:
                    self._buf.consume(elem_end)

    def _parse_info_element(self, data: bytes, start: int, end: int, header: MKVHeader) -> None:
        """Parse Info element children for timestamp scale and duration."""
        for eid, off, size, _ in iter_elements(data, start, end):
            if eid == TIMESTAMP_SCALE:
                header.timestamp_scale_ns = read_uint(data, off, size)
            elif eid == DURATION:
                # Duration is stored in timestamp ticks; convert to ms using
                # the scale parsed so far (TimestampScale precedes Duration
                # in the files this handles -- order per iter_elements).
                scale = header.timestamp_scale_ns / 1_000_000.0
                header.duration_ms = read_float(data, off, size) * scale

    def _parse_cluster_data(self, data: bytes, start: int, end: int) -> list[MKVFrame]:
        """Parse a known-size Cluster and return its frames."""
        cluster_timecode = 0
        frames = []
        for eid, data_off, size, _ in iter_elements(data, start, end):
            if eid == CLUSTER_TIMESTAMP:
                cluster_timecode = read_uint(data, data_off, size)
            elif eid == SIMPLE_BLOCK:
                for track_num, rel_tc, flags, frame_list in extract_block_frames(data, data_off, size):
                    # Bit 0x80 of the SimpleBlock flags marks a keyframe.
                    is_kf = bool(flags & 0x80)
                    abs_ts_ms = (cluster_timecode + rel_tc) * self._scale_ms
                    for frame_data in frame_list:
                        frames.append(
                            MKVFrame(
                                track_number=track_num,
                                timestamp_ms=abs_ts_ms,
                                is_keyframe=is_kf,
                                data=frame_data,
                            )
                        )
            elif eid == BLOCK_GROUP:
                _parse_block_group(data, data_off, data_off + size, cluster_timecode, self._scale_ms, frames)
        return frames

    async def _parse_unknown_size_cluster(self, source: AsyncIterator[bytes]) -> AsyncIterator[MKVFrame]:
        """Parse an unknown-size Cluster by reading children until next Cluster."""
        cluster_timecode = 0
        while True:
            if not await self._ensure_bytes_soft(source, self._MIN_ELEMENT_HEADER):
                break
            data = self._buf.get_all()
            pos = 0
            try:
                eid, pos2 = read_element_id(data, pos)
                size, pos3 = read_element_size(data, pos2)
            except (ValueError, IndexError):
                if not await self._ensure_bytes_soft(source, len(data) + 4096):
                    break
                data = self._buf.get_all()
                try:
                    eid, pos2 = read_element_id(data, pos)
                    size, pos3 = read_element_size(data, pos2)
                except (ValueError, IndexError):
                    break
            # A new Cluster or top-level element signals end of current Cluster
            if eid == CLUSTER or eid == SEGMENT:
                break
            if size == UNKNOWN_SIZE:
                break
            elem_end = pos3 + size
            await self._ensure_bytes(source, elem_end)
            data = self._buf.get_all()
            if eid == CLUSTER_TIMESTAMP:
                cluster_timecode = read_uint(data, pos3, size)
            elif eid == SIMPLE_BLOCK:
                for track_num, rel_tc, flags, frame_list in extract_block_frames(data, pos3, size):
                    # Bit 0x80 of the SimpleBlock flags marks a keyframe.
                    is_kf = bool(flags & 0x80)
                    abs_ts_ms = (cluster_timecode + rel_tc) * self._scale_ms
                    for frame_data in frame_list:
                        yield MKVFrame(
                            track_number=track_num,
                            timestamp_ms=abs_ts_ms,
                            is_keyframe=is_kf,
                            data=frame_data,
                        )
            elif eid == BLOCK_GROUP:
                bg_frames = []
                _parse_block_group(data, pos3, pos3 + size, cluster_timecode, self._scale_ms, bg_frames)
                for frame in bg_frames:
                    yield frame
            self._buf.consume(elem_end)

    async def _ensure_bytes(self, source: AsyncIterator[bytes], needed: int) -> None:
        """Fill the buffer until it holds at least 'needed' bytes.

        Returns silently (does NOT raise) if the source is exhausted first;
        callers must re-check buffer availability after the call.
        """
        while self._buf.available < needed:
            try:
                chunk = await source.__anext__()
                self._buf.append(chunk)
            except StopAsyncIteration:
                return

    async def _ensure_bytes_soft(self, source: AsyncIterator[bytes], needed: int) -> bool:
        """Fill the buffer to 'needed' bytes if possible.

        Returns True when the target is reached; on exhaustion (or an empty
        chunk) returns True if any bytes remain buffered, else False.
        """
        while self._buf.available < needed:
            try:
                chunk = await source.__anext__()
                if not chunk:
                    return self._buf.available > 0
                self._buf.append(chunk)
            except StopAsyncIteration:
                return self._buf.available > 0
        return True

    async def _skip_bytes(self, source: AsyncIterator[bytes], count: int) -> None:
        """Skip count bytes from the source without buffering."""
        remaining = count
        while remaining > 0:
            try:
                chunk = await source.__anext__()
                if len(chunk) <= remaining:
                    remaining -= len(chunk)
                else:
                    # Put the excess back
                    self._buf.append(chunk[remaining:])
                    remaining = 0
            except StopAsyncIteration:
                break

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,834 @@
"""
MP4 container parser for moov atom probing.
Provides:
- MP4Index: seek index extracted from MP4 moov atom (parallel to MKVCueIndex)
- Top-level atom scanning
- Sample table parsers (stco, co64, stss, stsz, stts, stsc)
- Moov-to-cue-point builder
- rewrite_moov_offsets: adjust stco/co64 in moov for file rearrangement
The parsers are the inverse of the builder functions in mp4_muxer.py.
Box navigation reuses the pattern from ts_muxer.py's read_box/find_box/iter_boxes.
"""
import bisect
import logging
import struct
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
# =============================================================================
# MP4 Box Utilities
# =============================================================================
# Minimum bytes needed to read a standard box header
_BOX_HEADER_SIZE = 8
# ftyp brands that identify MP4/MOV containers
_MP4_BRANDS = {
b"isom",
b"iso2",
b"iso3",
b"iso4",
b"iso5",
b"iso6",
b"mp41",
b"mp42",
b"M4V ",
b"M4A ",
b"f4v ",
b"kddi",
b"avc1",
b"qt ",
b"MSNV",
b"dash",
b"3gp4",
b"3gp5",
b"3gp6",
}
def is_mp4_header(data: bytes) -> bool:
    """
    Check if the data starts with an ftyp box (MP4 signature).

    Args:
        data: Leading bytes of a file (may be a truncated probe buffer).

    Returns:
        True when the data begins with a plausible ftyp box whose major
        brand (when readable) is a known MP4/MOV brand.
    """
    if len(data) < 8:
        return False
    size = struct.unpack_from(">I", data, 0)[0]
    if data[4:8] != b"ftyp":
        return False
    if size < 12:
        # A real ftyp always carries at least a 4-byte major brand.
        return False
    if len(data) >= 12:
        # Fix: validate the major brand whenever the bytes are present,
        # even if the declared box size exceeds the probe buffer.
        # (Previously a truncated read skipped the brand check entirely.)
        return data[8:12] in _MP4_BRANDS
    # Plausible ftyp, but truncated before the brand bytes.
    return True
def read_box_header(data: bytes, offset: int) -> tuple[bytes, int, int] | None:
"""
Read a box header at the given offset.
Returns:
(box_type, header_size, total_box_size) or None if not enough data.
"""
if offset + 8 > len(data):
return None
size, box_type = struct.unpack_from(">I4s", data, offset)
header_size = 8
if size == 1: # Extended size (64-bit)
if offset + 16 > len(data):
return None
size = struct.unpack_from(">Q", data, offset + 8)[0]
header_size = 16
elif size == 0: # Box extends to end of data
size = len(data) - offset
return box_type, header_size, size
def iter_top_level_boxes(data: bytes):
    """
    Iterate over top-level box headers in a buffer.

    Yields:
        (box_type, header_size, total_size, data_offset) where data_offset
        points at the first byte of the box body.
    """
    cursor = 0
    limit = len(data)
    while cursor < limit:
        header = read_box_header(data, cursor)
        if header is None:
            return
        box_type, header_size, box_size = header
        yield box_type, header_size, box_size, cursor + header_size
        if box_size == 0:
            # Zero-size box cannot be advanced past; stop iterating.
            return
        cursor += box_size
def find_box(data: bytes, target: bytes) -> bytes | None:
    """Find the first box of type *target* and return its body (data after header)."""
    for box_type, header_size, total_size, body_start in iter_top_level_boxes(data):
        if box_type != target:
            continue
        body_len = total_size - header_size
        return data[body_start : body_start + body_len]
    return None
def iter_boxes(data: bytes):
    """Iterate over child boxes: yields (box_type, box_body_bytes)."""
    for box_type, header_size, total_size, body_start in iter_top_level_boxes(data):
        body_end = body_start + (total_size - header_size)
        yield box_type, data[body_start:body_end]
# =============================================================================
# Sample Table Parsers (inverse of mp4_muxer.py builders)
# =============================================================================
def parse_full_box_header(data: bytes) -> tuple[int, int, int]:
    """
    Parse an ISO-BMFF full box header (version byte + 24-bit flags).

    Returns:
        (version, flags, header_size) where header_size is 4 bytes,
        or (0, 0, 0) when fewer than 4 bytes are available.
    """
    if len(data) < 4:
        return 0, 0, 0
    flags = int.from_bytes(data[1:4], "big")
    return data[0], flags, 4
def parse_stco(data: bytes) -> list[int]:
    """
    Parse Chunk Offset box (stco) - 32-bit offsets.
    Layout: version(1) + flags(3) + entry_count(4) + [offset(4)]...
    Returns an empty list when the box is too small or truncated.
    """
    if len(data) < 8:
        return []
    _, _, hdr = parse_full_box_header(data)
    count = struct.unpack_from(">I", data, hdr)[0]
    table_start = hdr + 4
    if len(data) < table_start + count * 4:
        return []
    # Decode the whole offset table in one struct call.
    return list(struct.unpack_from(f">{count}I", data, table_start))
def parse_co64(data: bytes) -> list[int]:
    """
    Parse Chunk Offset box (co64) - 64-bit offsets.
    Layout: version(1) + flags(3) + entry_count(4) + [offset(8)]...
    Returns an empty list when the box is too small or truncated.
    """
    if len(data) < 8:
        return []
    _, _, hdr = parse_full_box_header(data)
    count = struct.unpack_from(">I", data, hdr)[0]
    table_start = hdr + 4
    if len(data) < table_start + count * 8:
        return []
    # Decode the whole 64-bit offset table in one struct call.
    return list(struct.unpack_from(f">{count}Q", data, table_start))
def parse_stss(data: bytes) -> list[int]:
    """
    Parse Sync Sample box (stss) - keyframe sample indices (1-based).
    Layout: version(1) + flags(3) + entry_count(4) + [sample_number(4)]...
    Returns an empty list when the box is too small or truncated.
    """
    if len(data) < 8:
        return []
    _, _, hdr = parse_full_box_header(data)
    count = struct.unpack_from(">I", data, hdr)[0]
    table_start = hdr + 4
    if len(data) < table_start + count * 4:
        return []
    return list(struct.unpack_from(f">{count}I", data, table_start))
def parse_stsz(data: bytes) -> tuple[int, list[int]]:
    """
    Parse Sample Size box (stsz).
    Layout: version(1) + flags(3) + sample_size(4) + sample_count(4) + [size(4)]...

    Returns:
        (uniform_size, sizes_list).
        If uniform_size > 0, all samples have that size and sizes_list is empty.
        Otherwise, sizes_list contains per-sample sizes.
    """
    if len(data) < 12:
        return 0, []
    _, _, hdr = parse_full_box_header(data)
    uniform, count = struct.unpack_from(">II", data, hdr)
    if uniform > 0:
        # Constant-size samples: no per-sample table follows.
        return uniform, []
    table_start = hdr + 8
    if len(data) < table_start + count * 4:
        return 0, []
    return 0, list(struct.unpack_from(f">{count}I", data, table_start))
def parse_stts(data: bytes) -> list[tuple[int, int]]:
    """
    Parse Time-to-Sample box (stts) - run-length encoded durations.
    Layout: version(1) + flags(3) + entry_count(4) + [sample_count(4) + sample_delta(4)]...

    Returns:
        List of (sample_count, sample_delta) entries.
    """
    if len(data) < 8:
        return []
    _, _, hdr = parse_full_box_header(data)
    count = struct.unpack_from(">I", data, hdr)[0]
    table_start = hdr + 4
    if len(data) < table_start + count * 8:
        return []
    # Decode all pairs at once, then regroup into (count, delta) tuples.
    flat = struct.unpack_from(f">{count * 2}I", data, table_start)
    return [(flat[i], flat[i + 1]) for i in range(0, len(flat), 2)]
def parse_stsc(data: bytes) -> list[tuple[int, int, int]]:
    """
    Parse Sample-to-Chunk box (stsc).
    Layout: version(1) + flags(3) + entry_count(4) +
            [first_chunk(4) + samples_per_chunk(4) + sample_desc_index(4)]...

    Returns:
        List of (first_chunk, samples_per_chunk, sample_desc_index) entries.
        first_chunk is 1-based.
    """
    if len(data) < 8:
        return []
    _, _, hdr = parse_full_box_header(data)
    count = struct.unpack_from(">I", data, hdr)[0]
    table_start = hdr + 4
    if len(data) < table_start + count * 12:
        return []
    # Decode all triples at once, then regroup.
    flat = struct.unpack_from(f">{count * 3}I", data, table_start)
    return [(flat[i], flat[i + 1], flat[i + 2]) for i in range(0, len(flat), 3)]
def parse_mdhd(data: bytes) -> tuple[int, int]:
    """
    Parse Media Header box (mdhd) for timescale and duration.

    Handles both version 0 (32-bit times) and version 1 (64-bit times).

    Returns:
        (timescale, duration) in media timescale units, or (0, 0) when
        the box is too short.
    """
    if len(data) < 4:
        return 0, 0
    if data[0] == 1:
        # Version 1: version(1)+flags(3)+creation(8)+modification(8)
        # precede the timescale, so it sits at offset 20.
        if len(data) < 32:
            return 0, 0
        timescale, duration = struct.unpack_from(">IQ", data, 20)
        return timescale, duration
    # Version 0: version(1)+flags(3)+creation(4)+modification(4)
    if len(data) < 20:
        return 0, 0
    timescale, duration = struct.unpack_from(">II", data, 12)
    return timescale, duration
def parse_stsd_codec(data: bytes) -> str:
    """
    Parse Sample Description box (stsd) to extract the codec FourCC.

    Returns the first sample entry's type as a stripped ASCII string
    (e.g. "avc1", "hvc1", "mp4a"), or "" when unavailable.
    """
    # Need version(1)+flags(3)+entry_count(4) plus the first entry's
    # size(4)+type(4) header: 16 bytes minimum.
    if len(data) < 16:
        return ""
    fourcc = data[12:16]
    try:
        return fourcc.decode("ascii").strip()
    except (UnicodeDecodeError, ValueError):
        return ""
# =============================================================================
# MP4 Index (parallel to MKVCueIndex)
# =============================================================================
@dataclass
class MP4Index:
    """
    Seek index extracted from an MP4 file's moov atom.

    Parallel to ``MKVCueIndex`` for MKV files. Provides keyframe-indexed
    cue points for time-based seeking and the raw moov bytes needed to
    reconstruct a streamable (faststart) MP4 for on-the-fly demuxing.
    """

    duration_ms: float = 0.0
    timescale: int = 0
    cue_points: list[tuple[float, int]] = field(default_factory=list)  # [(time_ms, byte_offset), ...]
    moov_offset: int = 0  # Absolute file offset where moov atom starts
    moov_size: int = 0  # Total size of the moov atom (header + body)
    moov_data: bytes = b""  # Raw moov atom bytes (for prepending to mdat pipe)
    ftyp_data: bytes = b""  # Raw ftyp atom bytes (for prepending before moov)
    mdat_offset: int = 0  # Absolute file offset where mdat atom starts
    mdat_size: int = 0  # Total size of the mdat atom
    video_codec: str = ""  # e.g. "avc1", "hvc1", "mp4v"
    audio_codec: str = ""  # e.g. "mp4a", "ac-3"

    def byte_offset_for_time(self, time_ms: float) -> tuple[int, float]:
        """
        Find the byte offset of the nearest keyframe at or before time_ms.

        Returns:
            (absolute_byte_offset, actual_keyframe_time_ms); (0, 0.0) when
            no cue points exist. Times before the first cue clamp to it.
        """
        if not self.cue_points:
            return 0, 0.0
        keyframe_times = [cue_time for cue_time, _offset in self.cue_points]
        idx = max(bisect.bisect_right(keyframe_times, time_ms) - 1, 0)
        cue_time_ms, byte_offset = self.cue_points[idx]
        return byte_offset, cue_time_ms
# =============================================================================
# Moov -> Cue Points Builder
# =============================================================================
def _find_nested_box(data: bytes, *path: bytes) -> bytes | None:
    """
    Descend a box hierarchy by name: ``_find_nested_box(data, b"trak", b"mdia")``.

    Returns the body of the innermost box, or None if any step is missing.
    """
    node = data
    for box_name in path:
        node = find_box(node, box_name)
        if node is None:
            return None
    return node
def build_cue_points_from_moov(moov_body: bytes) -> tuple[list[tuple[float, int]], float, int, str, str]:
    """
    Parse a moov body to build keyframe-indexed cue points.

    Walks the first video trak's stbl to extract:
    - Chunk offsets (stco/co64)
    - Keyframe sample indices (stss)
    - Sample sizes (stsz)
    - Sample durations (stts)
    - Sample-to-chunk mapping (stsc)
    - Timescale and duration from mdhd

    Args:
        moov_body: Body bytes of the moov box (outer header already stripped).

    Returns:
        (cue_points, duration_ms, timescale, video_codec, audio_codec).
        cue_points is a list of (time_ms, absolute_byte_offset) pairs, one
        per video keyframe; empty when required tables are missing.
    """
    cue_points: list[tuple[float, int]] = []
    duration_ms = 0.0
    timescale = 0
    video_codec = ""
    audio_codec = ""
    # Find all traks
    video_stbl = None
    video_mdhd = None
    offset = 0
    data = moov_body
    while offset < len(data):
        result = read_box_header(data, offset)
        if result is None:
            break
        box_type, hdr_size, total_size = result
        if box_type == b"trak":
            trak_body = data[offset + hdr_size : offset + total_size]
            # Check handler type to identify video/audio
            hdlr_data = _find_nested_box(trak_body, b"mdia", b"hdlr")
            handler_type = b""
            if hdlr_data and len(hdlr_data) >= 12:
                # hdlr: version(1)+flags(3)+pre_defined(4)+handler_type(4)
                handler_type = hdlr_data[8:12]
            if handler_type == b"vide" and video_stbl is None:
                # Only the FIRST video trak is indexed; later ones ignored.
                video_stbl = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl")
                video_mdhd_data = _find_nested_box(trak_body, b"mdia", b"mdhd")
                if video_mdhd_data:
                    video_mdhd = video_mdhd_data
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if stsd_data:
                    video_codec = parse_stsd_codec(stsd_data)
            elif handler_type == b"soun" and not audio_codec:
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if stsd_data:
                    audio_codec = parse_stsd_codec(stsd_data)
        elif box_type == b"mvhd":
            # Fallback: parse mvhd for timescale/duration if no mdhd
            mvhd_body = data[offset + hdr_size : offset + total_size]
            if len(mvhd_body) >= 20:
                version = mvhd_body[0]
                if version == 1:
                    if len(mvhd_body) >= 28:
                        ts = struct.unpack_from(">I", mvhd_body, 20)[0]
                        dur = struct.unpack_from(">Q", mvhd_body, 24)[0]
                        if timescale == 0:
                            timescale = ts
                            duration_ms = dur / ts * 1000.0 if ts else 0.0
                else:
                    ts = struct.unpack_from(">I", mvhd_body, 12)[0]
                    dur = struct.unpack_from(">I", mvhd_body, 16)[0]
                    if timescale == 0:
                        timescale = ts
                        duration_ms = dur / ts * 1000.0 if ts else 0.0
        if total_size == 0:
            break
        offset += total_size
    # Parse mdhd for video timescale (more precise than mvhd)
    if video_mdhd:
        ts, dur = parse_mdhd(video_mdhd)
        if ts > 0:
            timescale = ts
            duration_ms = dur / ts * 1000.0
    if video_stbl is None:
        logger.warning("[mp4_parser] No video stbl found in moov")
        return cue_points, duration_ms, timescale, video_codec, audio_codec
    # Parse sample tables from video stbl
    stco_data = find_box(video_stbl, b"stco")
    co64_data = find_box(video_stbl, b"co64")
    stss_data = find_box(video_stbl, b"stss")
    stsz_data = find_box(video_stbl, b"stsz")
    stts_data = find_box(video_stbl, b"stts")
    stsc_data = find_box(video_stbl, b"stsc")
    # Chunk offsets (co64 takes priority over stco when both exist)
    chunk_offsets = parse_co64(co64_data) if co64_data else (parse_stco(stco_data) if stco_data else [])
    # Keyframe sample numbers (1-based)
    keyframe_samples = set(parse_stss(stss_data)) if stss_data else set()
    all_are_keyframes = not stss_data  # No stss means all samples are sync
    # Sample sizes
    uniform_size, size_list = parse_stsz(stsz_data) if stsz_data else (0, [])
    # Sample durations (run-length encoded)
    stts_entries = parse_stts(stts_data) if stts_data else []
    # Sample-to-chunk mapping
    stsc_entries = parse_stsc(stsc_data) if stsc_data else []
    if not chunk_offsets or timescale == 0:
        logger.warning(
            "[mp4_parser] Missing data: chunks=%d, timescale=%d",
            len(chunk_offsets),
            timescale,
        )
        return cue_points, duration_ms, timescale, video_codec, audio_codec
    # Expand stts to per-sample durations
    sample_durations: list[int] = []
    for count, delta in stts_entries:
        sample_durations.extend([delta] * count)
    # Expand stsc to determine which samples belong to which chunk
    # Build a mapping: chunk_index (0-based) -> samples_per_chunk
    total_chunks = len(chunk_offsets)
    chunk_sample_counts: list[int] = [0] * total_chunks
    if stsc_entries:
        for i, (first_chunk, spc, _sdi) in enumerate(stsc_entries):
            # first_chunk is 1-based; each entry applies until the next one.
            start = first_chunk - 1
            if i + 1 < len(stsc_entries):
                end = stsc_entries[i + 1][0] - 1
            else:
                end = total_chunks
            for c in range(start, end):
                if c < total_chunks:
                    chunk_sample_counts[c] = spc
    else:
        # Default: 1 sample per chunk
        chunk_sample_counts = [1] * total_chunks
    # Count total samples
    total_samples = sum(chunk_sample_counts)
    # Get per-sample sizes
    if uniform_size > 0:
        sample_sizes = [uniform_size] * total_samples
    else:
        sample_sizes = size_list
    # Build cumulative timestamp for each sample and map keyframes to byte offsets
    current_sample = 0  # 0-based sample index
    current_time = 0  # in timescale units
    for chunk_idx, chunk_offset in enumerate(chunk_offsets):
        spc = chunk_sample_counts[chunk_idx] if chunk_idx < len(chunk_sample_counts) else 1
        byte_pos = chunk_offset
        for s in range(spc):
            sample_num = current_sample + 1  # 1-based for stss comparison
            is_keyframe = all_are_keyframes or sample_num in keyframe_samples
            if is_keyframe:
                time_ms = current_time / timescale * 1000.0
                cue_points.append((time_ms, byte_pos))
            # Advance byte position by this sample's size
            if current_sample < len(sample_sizes):
                byte_pos += sample_sizes[current_sample]
            # Advance timestamp
            if current_sample < len(sample_durations):
                current_time += sample_durations[current_sample]
            current_sample += 1
    logger.info(
        "[mp4_parser] Built %d cue points from %d samples, duration=%.1fs, video=%s, audio=%s",
        len(cue_points),
        total_samples,
        duration_ms / 1000.0,
        video_codec,
        audio_codec,
    )
    return cue_points, duration_ms, timescale, video_codec, audio_codec
# =============================================================================
# Moov Offset Rewriting (for faststart pipe construction)
# =============================================================================
def _rewrite_stco_in_place(data: bytearray, box_start: int, box_size: int, delta: int) -> int:
"""Rewrite stco chunk offsets by adding delta. Returns number of entries fixed."""
# FullBox header: version(1) + flags(3) = 4 bytes
body_start = box_start + 4
if body_start + 4 > box_start + box_size:
return 0
entry_count = struct.unpack_from(">I", data, body_start)[0]
pos = body_start + 4
for _ in range(entry_count):
if pos + 4 > box_start + box_size:
break
old_val = struct.unpack_from(">I", data, pos)[0]
struct.pack_into(">I", data, pos, old_val + delta)
pos += 4
return entry_count
def _rewrite_co64_in_place(data: bytearray, box_start: int, box_size: int, delta: int) -> int:
"""Rewrite co64 chunk offsets by adding delta. Returns number of entries fixed."""
body_start = box_start + 4
if body_start + 4 > box_start + box_size:
return 0
entry_count = struct.unpack_from(">I", data, body_start)[0]
pos = body_start + 4
for _ in range(entry_count):
if pos + 8 > box_start + box_size:
break
old_val = struct.unpack_from(">Q", data, pos)[0]
struct.pack_into(">Q", data, pos, old_val + delta)
pos += 8
return entry_count
def _walk_and_rewrite(data: bytearray, start: int, end: int, delta: int) -> int:
    """
    Recursively walk boxes within [start, end) looking for stco/co64 boxes
    and rewriting their offsets.

    Args:
        data: Mutable buffer holding the box tree (typically a whole moov).
        start: Offset of the first child box header to inspect.
        end: Exclusive end of the region to walk.
        delta: Value added to every chunk offset found.

    Returns total number of offset entries rewritten.
    """
    total = 0
    offset = start
    while offset + 8 <= end:
        size = struct.unpack_from(">I", data, offset)[0]
        box_type = data[offset + 4 : offset + 8]
        hdr_size = 8
        if size == 1:
            # 64-bit largesize variant of the box header.
            if offset + 16 > end:
                break
            size = struct.unpack_from(">Q", data, offset + 8)[0]
            hdr_size = 16
        elif size == 0:
            # size 0 means "extends to the end of the enclosing region".
            size = end - offset
        if size < 8 or offset + size > end:
            # Malformed or truncated box: abandon this level.
            break
        body_start = offset + hdr_size
        body_end = offset + size
        if box_type == b"stco":
            total += _rewrite_stco_in_place(data, body_start, size - hdr_size, delta)
        elif box_type == b"co64":
            total += _rewrite_co64_in_place(data, body_start, size - hdr_size, delta)
        elif box_type in (b"moov", b"trak", b"mdia", b"minf", b"stbl"):
            # Container box -- recurse into children
            total += _walk_and_rewrite(data, body_start, body_end, delta)
        offset += size
    return total
def extract_video_track_from_moov(moov_data: bytes):
    """
    Extract video codec configuration from an MP4 moov atom.

    Walks the moov box tree to find the first video trak, extracts its
    resolution and codec-private data (avcC/hvcC), and returns a synthetic
    ``MKVTrack`` suitable for building an fMP4 init segment.

    Args:
        moov_data: Complete moov box bytes INCLUDING the outer header.

    Returns:
        An ``MKVTrack`` with video metadata, or ``None`` if no video track
        is found.
    """
    # Local import avoids a module-level cycle with the EBML parser package.
    from mediaflow_proxy.remuxer.ebml_parser import (
        CODEC_ID_H264,
        CODEC_ID_H265,
        MKVTrack,
    )

    # Strip the moov box header to get the body
    if len(moov_data) < 8:
        return None
    raw_size = struct.unpack_from(">I", moov_data, 0)[0]
    hdr_size = 16 if raw_size == 1 else 8
    moov_body = moov_data[hdr_size:]
    # Walk traks looking for video handler
    # NOTE(review): unlike build_cue_points_from_moov, there is no
    # "total_size == 0" guard here; a zero-size box would loop forever
    # if read_box_header ever reported one — confirm inputs are trusted.
    offset = 0
    while offset < len(moov_body):
        result = read_box_header(moov_body, offset)
        if result is None:
            break
        box_type, box_hdr_size, total_size = result
        if box_type == b"trak":
            trak_body = moov_body[offset + box_hdr_size : offset + total_size]
            # Check handler type
            hdlr_data = _find_nested_box(trak_body, b"mdia", b"hdlr")
            handler_type = b""
            if hdlr_data and len(hdlr_data) >= 12:
                handler_type = hdlr_data[8:12]
            if handler_type == b"vide":
                # Found video trak -- extract stsd for codec config
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if not stsd_data or len(stsd_data) < 16:
                    offset += total_size
                    continue
                codec_name = parse_stsd_codec(stsd_data)
                # Map MP4 codec names to MKV codec IDs
                if codec_name in ("avc1", "avc3"):
                    mkv_codec_id = CODEC_ID_H264
                elif codec_name in ("hvc1", "hev1"):
                    mkv_codec_id = CODEC_ID_H265
                else:
                    # Unknown FourCC: keep it visible in a synthetic ID.
                    mkv_codec_id = f"V_MP4/{codec_name}"
                # Extract codec private (avcC or hvcC box) from inside the
                # sample entry. The stsd structure is:
                #   version(1) + flags(3) + entry_count(4)
                #   then entry: size(4) + type(4) + ... + nested boxes
                # The avcC/hvcC is a child box of the sample entry.
                codec_private = b""
                width = 0
                height = 0
                # Parse sample entry to get width/height and codec config
                entry_start = 8  # skip version+flags+entry_count
                if entry_start + 8 <= len(stsd_data):
                    entry_size = struct.unpack_from(">I", stsd_data, entry_start)[0]
                    entry_body_start = entry_start + 8  # skip size+type
                    entry_end = min(entry_start + entry_size, len(stsd_data))
                    # Visual sample entry: 6 reserved + 2 data_ref_idx + ...
                    # At offset 24 from entry body start: width(2) + height(2)
                    vis_offset = entry_body_start + 24
                    if vis_offset + 4 <= entry_end:
                        width = struct.unpack_from(">H", stsd_data, vis_offset)[0]
                        height = struct.unpack_from(">H", stsd_data, vis_offset + 2)[0]
                    # Scan nested boxes for avcC or hvcC
                    # Visual sample entry fixed fields = 70 bytes from entry body
                    nested_start = entry_body_start + 70
                    if nested_start < entry_end:
                        nested_data = stsd_data[nested_start:entry_end]
                        for target in (b"avcC", b"hvcC"):
                            found = find_box(nested_data, target)
                            if found:
                                codec_private = found
                                break
                # Get duration from mdhd if available
                default_duration_ns = 0
                mdhd_data = _find_nested_box(trak_body, b"mdia", b"mdhd")
                if mdhd_data and len(mdhd_data) >= 20:
                    version = mdhd_data[0]
                    if version == 1 and len(mdhd_data) >= 28:
                        ts = struct.unpack_from(">I", mdhd_data, 20)[0]
                        dur = struct.unpack_from(">Q", mdhd_data, 24)[0]
                    else:
                        ts = struct.unpack_from(">I", mdhd_data, 12)[0]
                        dur = struct.unpack_from(">I", mdhd_data, 16)[0]
                    if ts > 0 and dur > 0:
                        # Rough estimate: assume 24fps if we can't determine.
                        # NOTE(review): ts/dur are read but never used to
                        # derive the real frame duration — possibly intended
                        # future work.
                        default_duration_ns = int(1_000_000_000 / 24)
                return MKVTrack(
                    track_number=1,
                    track_type=1,  # video
                    codec_id=mkv_codec_id,
                    codec_private=codec_private,
                    pixel_width=width,
                    pixel_height=height,
                    default_duration_ns=default_duration_ns,
                )
        offset += total_size
    return None
def rewrite_moov_offsets(moov_data: bytes, delta: int) -> bytes:
    """
    Rewrite all stco/co64 chunk offsets in a moov atom by adding ``delta``.

    Needed when rearranging an MP4 for pipe streaming: the original moov's
    chunk offsets point into the original file layout, so when moov is
    prepended before mdat every offset must be shifted by
    ``delta = moov_size - original_mdat_offset``.

    Args:
        moov_data: Raw bytes of the complete moov box (header + body).
        delta: Offset adjustment to add to every chunk offset.

    Returns:
        Modified moov bytes with updated chunk offsets.
    """
    buf = bytearray(moov_data)
    # Skip the moov box's own header (16 bytes for the 64-bit largesize form).
    declared_size = struct.unpack_from(">I", buf, 0)[0]
    header_size = 16 if declared_size == 1 else 8
    rewritten = _walk_and_rewrite(buf, header_size, len(buf), delta)
    logger.info("[mp4_parser] Rewrote %d chunk offset entries (delta=%+d)", rewritten, delta)
    return bytes(buf)

View File

@@ -0,0 +1,608 @@
"""
Universal PyAV-based streaming demuxer.
Bridges async byte streams to PyAV's synchronous I/O using an OS pipe,
allowing on-the-fly demuxing of any container format (MKV, MP4, TS,
FLV, WebM, etc.) from an async source.
Architecture:
AsyncIterator[bytes] --> async feeder task --> queue.Queue --> writer thread (pipe)
|
OS pipe (kernel buffer)
|
demux thread: av.open + discover + demux
|
queue.Queue --> run_in_executor consumer
Performance: Uses plain threading.Queue on both sides (writer input and
packet output) to avoid per-item ``run_coroutine_threadsafe`` overhead.
The async/thread bridge is done via ``run_in_executor`` on the consumer
side and a dedicated asyncio task on the producer side.
For MP4 inputs, the caller (transcode_handler) prepends the moov atom
to the stream so PyAV receives a "faststart"-style MP4 through the pipe.
This allows true on-the-fly demuxing for all container formats.
"""
import asyncio
import logging
import os
import queue
import threading
from collections.abc import AsyncIterator
from dataclasses import dataclass
import av
logger = logging.getLogger(__name__)
# Sentinel object to signal end-of-stream in queues
_SENTINEL = object()
@dataclass(slots=True)
class DemuxedStream:
    """Metadata about a demuxed stream, captured at container-open time."""

    index: int  # Stream index within the source container
    codec_name: str
    codec_type: str  # "video" or "audio"
    # Video-specific
    width: int = 0
    height: int = 0
    fps: float = 0.0
    pixel_format: str = ""
    # Audio-specific
    sample_rate: int = 0
    channels: int = 0
    # Timing: stream time_base as a rational (num/den); defaults to 1/1000 (ms)
    time_base_num: int = 1
    time_base_den: int = 1000
    duration_seconds: float = 0.0
    # Raw codec extradata (e.g. SPS/PPS for H.264, AudioSpecificConfig for AAC)
    extradata: bytes = b""
@dataclass(slots=True)
class DemuxedPacket:
"""A demuxed packet with timing info."""
stream_index: int
codec_type: str # "video" or "audio"
data: bytes
pts: int # Presentation timestamp in stream time_base units
dts: int # Decode timestamp in stream time_base units
duration: int # Duration in stream time_base units
is_keyframe: bool
time_base_num: int
time_base_den: int
# Optional decoded frame when decode_video/decode_audio is True
# av.VideoFrame for video, av.AudioFrame for audio
decoded_frame: object = None
@property
def pts_seconds(self) -> float:
if self.time_base_den == 0:
return 0.0
return self.pts * self.time_base_num / self.time_base_den
@property
def dts_seconds(self) -> float:
if self.time_base_den == 0:
return 0.0
return self.dts * self.time_base_num / self.time_base_den
@property
def duration_seconds(self) -> float:
if self.time_base_den == 0:
return 0.0
return self.duration * self.time_base_num / self.time_base_den
class PyAVDemuxer:
    """
    Streaming demuxer using PyAV with pipe-based I/O.

    All container I/O happens in background threads. The writer thread
    feeds source bytes into a pipe; a single demux thread opens the
    container, discovers streams, and demuxes packets -- all on the
    same file object, ensuring the pipe's read cursor is never lost.

    Performance optimisation: both the writer-input side and the
    packet-output side use plain ``queue.Queue`` (no event-loop
    involvement per item). The async/thread bridge is done via
    ``run_in_executor`` on the consumer and an asyncio task on the
    producer, eliminating ~1700 ``run_coroutine_threadsafe`` round-trips
    per 30 s of 4K content.

    Usage:
        demuxer = PyAVDemuxer()
        await demuxer.start(source_async_iter)
        # demuxer.video_stream / audio_stream are now available
        async for packet in demuxer.iter_packets():
            if packet.codec_type == "video":
                ...
    """
    def __init__(self, decode_video: bool = False, decode_audio: bool = False) -> None:
        """
        Args:
            decode_video: If True, the demux thread will decode video packets
                using the container's codec context and attach decoded frames
                to DemuxedPacket.decoded_frame. This avoids format conversion
                issues with standalone decoders (HVCC vs Annex B).
            decode_audio: If True, the demux thread will decode audio packets
                using the container's codec context and attach decoded frames
                to DemuxedPacket.decoded_frame. This is needed for codecs like
                Vorbis/Opus where the standalone decoder requires codec headers
                that are only available in the container context. Can also be
                set after start() returns (before packets are consumed) via
                the ``enable_audio_decode()`` method.
        """
        self._decode_video = decode_video
        self._decode_audio = decode_audio
        # Events gate the demux thread until the caller has decided whether
        # to decode each stream kind (see enable_video/audio_decode()).
        self._video_decode_decided = threading.Event()
        self._audio_decode_decided = threading.Event()
        # If decode flags were set at construction time, mark decided immediately
        if decode_video:
            self._video_decode_decided.set()
        if decode_audio:
            self._audio_decode_decided.set()
        # Populated by the demux thread during start()
        self._container: av.InputContainer | None = None
        self._video_stream: DemuxedStream | None = None
        self._audio_stream: DemuxedStream | None = None
        # Thread-safe queues (no event-loop involvement per put/get)
        self._packet_queue: queue.Queue | None = None
        self._source_queue: queue.Queue | None = None
        self._demux_thread: threading.Thread | None = None
        self._writer_thread: threading.Thread | None = None
        self._feeder_task: asyncio.Task | None = None
        # Pipe fds; ownership moves to the writer thread / demux thread.
        self._write_fd: int | None = None
        self._read_fd: int | None = None
    @property
    def video_stream(self) -> DemuxedStream | None:
        """Selected video stream metadata; None until start() completes or if absent."""
        return self._video_stream

    @property
    def audio_stream(self) -> DemuxedStream | None:
        """Selected audio stream metadata; None until start() completes or if absent."""
        return self._audio_stream
    def enable_video_decode(self, enable: bool = True) -> None:
        """
        Enable or disable in-thread video decoding.

        Call this after ``start()`` returns (stream metadata is available)
        but before consuming packets via ``iter_packets()``. The demux
        thread waits for this signal before processing video packets.
        """
        # Order matters: the flag must be set before the event is signalled,
        # since the demux thread reads the flag right after the event fires.
        self._decode_video = enable
        self._video_decode_decided.set()
    def enable_audio_decode(self, enable: bool = True) -> None:
        """
        Enable or disable in-thread audio decoding.

        Call this after ``start()`` returns (stream metadata is available)
        but before consuming packets via ``iter_packets()``. The demux
        thread waits for this signal before processing audio packets.
        """
        # Order matters: the flag must be set before the event is signalled,
        # since the demux thread reads the flag right after the event fires.
        self._decode_audio = enable
        self._audio_decode_decided.set()
# ── Writer side ──────────────────────────────────────────────────
async def _async_feeder(self, source: AsyncIterator[bytes]) -> None:
"""
Async task: pull chunks from the async source and push them
into a plain ``queue.Queue`` for the writer thread.
This replaces the old per-chunk ``run_coroutine_threadsafe``
pattern, batching the async-to-sync bridge into one task.
``queue.Queue.put()`` is a blocking call, so we use
``run_in_executor`` to avoid blocking the event loop when the
queue is full.
"""
loop = asyncio.get_running_loop()
sq = self._source_queue
try:
async for chunk in source:
await loop.run_in_executor(None, sq.put, chunk)
except (asyncio.CancelledError, GeneratorExit):
pass
except Exception:
pass
finally:
sq.put(_SENTINEL)
def _write_chunks_sync(self) -> None:
"""
Writer thread: pull pre-buffered chunks from ``_source_queue``
and write to the OS pipe. No event-loop interaction.
"""
write_fd = self._write_fd
sq = self._source_queue
try:
while True:
chunk = sq.get(timeout=30.0)
if chunk is _SENTINEL:
break
os.write(write_fd, chunk)
except Exception:
pass
finally:
try:
os.close(write_fd)
except OSError:
pass
self._write_fd = None
# ── Demux side ───────────────────────────────────────────────────
async def start(self, source: AsyncIterator[bytes]) -> None:
    """
    Start pipe-based streaming: writer thread feeds the pipe, a single
    demux thread opens the container, discovers streams, and begins
    enqueuing packets.

    After this returns, ``video_stream`` and ``audio_stream`` are
    populated and packets are being enqueued for ``iter_packets()``.

    Args:
        source: Async iterator yielding raw container bytes (any format
            FFmpeg can probe from a non-seekable pipe).
    """
    loop = asyncio.get_running_loop()
    # Create OS pipe -- write end is owned by the writer thread, read end
    # is handed to the demux thread via os.fdopen below.
    self._read_fd, self._write_fd = os.pipe()
    # Source buffer queue (async feeder task -> writer thread)
    self._source_queue = queue.Queue(maxsize=256)
    # Kick off the async feeder task
    self._feeder_task = asyncio.create_task(self._async_feeder(source))
    # Start writer thread (drains source_queue into the pipe)
    self._writer_thread = threading.Thread(
        target=self._write_chunks_sync,
        daemon=True,
        name="pyav-writer",
    )
    self._writer_thread.start()
    # Packet queue for demux-thread -> async consumer bridge
    self._packet_queue = queue.Queue(maxsize=128)
    streams_ready = threading.Event()

    def _open_and_demux() -> None:
        """
        Single background thread: open container, discover streams,
        demux all packets.

        Critical: av.open(), _discover_streams(), and container.demux()
        all happen on the same file object in the same thread. This
        ensures the pipe read cursor is never lost between open and demux.
        """
        pkt_count = 0
        pq = self._packet_queue
        try:
            # Open container from read end of pipe
            read_file = os.fdopen(self._read_fd, "rb")
            self._read_fd = None  # ownership transferred
            self._container = av.open(
                read_file,
                mode="r",
                options={
                    # Tolerate mid-stream joins / broken data in live TS
                    "err_detect": "ignore_err",
                    "fflags": "+discardcorrupt+genpts",
                },
            )
            self._discover_streams()
            # Signal stream metadata is available
            streams_ready.set()
            if self._video_stream is None and self._audio_stream is None:
                logger.warning("[pyav_demuxer] No video or audio streams found")
                return
            # Select streams to demux
            streams_to_demux = []
            if self._video_stream is not None:
                streams_to_demux.append(self._container.streams[self._video_stream.index])
            if self._audio_stream is not None:
                streams_to_demux.append(self._container.streams[self._audio_stream.index])
            # Wait for the caller to decide on video/audio decoding
            # (if not already decided at construction time).
            if not self._video_decode_decided.is_set():
                self._video_decode_decided.wait(timeout=10.0)
            if not self._audio_decode_decided.is_set():
                self._audio_decode_decided.wait(timeout=10.0)
            # Cache stream objects and time_base for the hot loop
            video_stream_obj = (
                self._container.streams[self._video_stream.index] if self._video_stream is not None else None
            )
            audio_stream_obj = (
                self._container.streams[self._audio_stream.index] if self._audio_stream is not None else None
            )
            video_tb_num = video_stream_obj.time_base.numerator if video_stream_obj else 1
            video_tb_den = video_stream_obj.time_base.denominator if video_stream_obj else 1
            audio_tb_num = audio_stream_obj.time_base.numerator if audio_stream_obj else 1
            audio_tb_den = audio_stream_obj.time_base.denominator if audio_stream_obj else 1
            decode_video = self._decode_video
            decode_audio = self._decode_audio
            # Demux and enqueue packets -- plain queue.put(), no event loop
            for packet in self._container.demux(*streams_to_demux):
                if packet.size == 0:
                    continue
                stream = self._container.streams[packet.stream_index]
                is_video = stream.type == "video"
                is_audio = stream.type == "audio"
                # Optionally decode video packets in-thread
                if decode_video and is_video and video_stream_obj is not None:
                    try:
                        frames = video_stream_obj.codec_context.decode(packet)
                    except Exception:
                        frames = []
                    for frame in frames:
                        # Decoded frames carry pts only; dts mirrors pts here.
                        pq.put(
                            DemuxedPacket(
                                stream_index=packet.stream_index,
                                codec_type="video",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=int(packet.duration) if packet.duration is not None else 0,
                                is_keyframe=frame.key_frame,
                                time_base_num=video_tb_num,
                                time_base_den=video_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1
                # Optionally decode audio packets in-thread
                elif decode_audio and is_audio and audio_stream_obj is not None:
                    try:
                        frames = audio_stream_obj.codec_context.decode(packet)
                    except Exception:
                        frames = []
                    for frame in frames:
                        pq.put(
                            DemuxedPacket(
                                stream_index=packet.stream_index,
                                codec_type="audio",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=int(packet.duration) if packet.duration is not None else 0,
                                is_keyframe=False,
                                time_base_num=audio_tb_num,
                                time_base_den=audio_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1
                else:
                    # Pass-through path: forward the compressed packet as-is.
                    tb_num = video_tb_num if is_video else audio_tb_num
                    tb_den = video_tb_den if is_video else audio_tb_den
                    pq.put(
                        DemuxedPacket(
                            stream_index=packet.stream_index,
                            codec_type=stream.type,
                            data=bytes(packet),
                            pts=int(packet.pts) if packet.pts is not None else 0,
                            dts=int(packet.dts) if packet.dts is not None else 0,
                            duration=int(packet.duration) if packet.duration is not None else 0,
                            is_keyframe=packet.is_keyframe,
                            time_base_num=tb_num,
                            time_base_den=tb_den,
                        )
                    )
                    pkt_count += 1
            # Flush the video decoder if we were decoding
            if decode_video and video_stream_obj is not None:
                try:
                    for frame in video_stream_obj.codec_context.decode(None):
                        pq.put(
                            DemuxedPacket(
                                stream_index=video_stream_obj.index,
                                codec_type="video",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=0,
                                is_keyframe=frame.key_frame,
                                time_base_num=video_tb_num,
                                time_base_den=video_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1
                except Exception:
                    pass
            # Flush the audio decoder if we were decoding
            if decode_audio and audio_stream_obj is not None:
                try:
                    for frame in audio_stream_obj.codec_context.decode(None):
                        pq.put(
                            DemuxedPacket(
                                stream_index=audio_stream_obj.index,
                                codec_type="audio",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=0,
                                is_keyframe=False,
                                time_base_num=audio_tb_num,
                                time_base_den=audio_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1
                except Exception:
                    pass
            logger.info("[pyav_demuxer] Demux complete: %d packets", pkt_count)
        except Exception as e:
            # "Invalid data" is expected noise on truncated/live inputs.
            if "Invalid data" not in str(e):
                logger.debug("[pyav_demuxer] Demux thread error: %s", e)
            # Ensure streams_ready is set even on error
            streams_ready.set()
        finally:
            pq.put(_SENTINEL)

    self._demux_thread = threading.Thread(target=_open_and_demux, daemon=True, name="pyav-demux")
    self._demux_thread.start()
    # Wait for stream discovery before returning.
    # Use run_in_executor to avoid blocking the event loop.
    await loop.run_in_executor(None, streams_ready.wait)
async def iter_packets(self) -> AsyncIterator[DemuxedPacket]:
    """
    Yield demuxed packets produced by the background demux thread.

    The blocking ``queue.get()`` runs on the default executor so the
    event loop is never stalled; one executor hop per packet replaces
    the heavier per-packet ``run_coroutine_threadsafe`` pattern.

    ``start()`` must be called first.
    """
    if self._packet_queue is None:
        raise RuntimeError("Call start() before iter_packets()")
    event_loop = asyncio.get_running_loop()
    get_next = self._packet_queue.get
    try:
        while (item := await event_loop.run_in_executor(None, get_next)) is not _SENTINEL:
            yield item
        # Normal end-of-stream: let the demux thread finish before cleanup.
        demux_thread = self._demux_thread
        if demux_thread is not None:
            demux_thread.join(timeout=5.0)
    except GeneratorExit:
        logger.debug("[pyav_demuxer] Generator closed")
    except asyncio.CancelledError:
        logger.debug("[pyav_demuxer] Cancelled")
    finally:
        self._cleanup()
def _discover_streams(self) -> None:
    """Record metadata for the first video and first audio stream found."""
    if self._container is None:
        return
    for stream in self._container.streams:
        if stream.type == "video" and self._video_stream is None:
            ctx = stream.codec_context
            frame_rate = float(stream.average_rate) if stream.average_rate else 24.0
            self._video_stream = DemuxedStream(
                index=stream.index,
                codec_name=ctx.name if ctx else stream.codec.name,
                codec_type="video",
                width=ctx.width if ctx else 0,
                height=ctx.height if ctx else 0,
                fps=frame_rate,
                pixel_format=str(ctx.pix_fmt) if ctx and ctx.pix_fmt else "yuv420p",
                time_base_num=stream.time_base.numerator,
                time_base_den=stream.time_base.denominator,
                duration_seconds=float(stream.duration * stream.time_base) if stream.duration else 0.0,
                extradata=bytes(ctx.extradata) if ctx and ctx.extradata else b"",
            )
            logger.info(
                "[pyav_demuxer] Video: %s %dx%d @%.1ffps",
                self._video_stream.codec_name,
                self._video_stream.width,
                self._video_stream.height,
                self._video_stream.fps,
            )
        elif stream.type == "audio" and self._audio_stream is None:
            ctx = stream.codec_context
            # NOTE(review): codec_context.channels is deprecated in newer
            # PyAV releases in favor of layout.nb_channels -- confirm the
            # pinned PyAV version before changing.
            self._audio_stream = DemuxedStream(
                index=stream.index,
                codec_name=ctx.name if ctx else stream.codec.name,
                codec_type="audio",
                sample_rate=ctx.sample_rate if ctx else 0,
                channels=ctx.channels if ctx else 0,
                time_base_num=stream.time_base.numerator,
                time_base_den=stream.time_base.denominator,
                duration_seconds=float(stream.duration * stream.time_base) if stream.duration else 0.0,
                extradata=bytes(ctx.extradata) if ctx and ctx.extradata else b"",
            )
            logger.info(
                "[pyav_demuxer] Audio: %s %dHz %dch",
                self._audio_stream.codec_name,
                self._audio_stream.sample_rate,
                self._audio_stream.channels,
            )
def _cleanup(self) -> None:
    """Stop threads and release all resources safely.

    The order is critical to avoid SIGSEGV from closing the container
    while the demux thread is still calling container.demux():

    1. Cancel the feeder task (stops new bytes being queued).
    2. Put a sentinel into the source queue so the writer thread
       unblocks and exits. The writer's ``finally`` closes the pipe
       write-end, which causes the demux thread to see EOF.
    3. Join the writer thread (wait for it to drain and exit).
    4. Join the demux thread (it finishes after pipe EOF).
    5. ONLY THEN close the container (no thread is using it).
    6. Close any remaining pipe FDs (read end, if still open).

    Idempotent: every step tolerates already-released state, so calling
    this twice (e.g. from both a cancelled consumer and normal teardown)
    is safe.
    """
    # 1. Cancel feeder task
    if self._feeder_task is not None:
        self._feeder_task.cancel()
        self._feeder_task = None
    # 2. Unblock writer thread so it exits and closes the pipe
    if self._source_queue is not None:
        try:
            # put_nowait: if the queue is full the writer is already awake
            # and draining, so a dropped sentinel is harmless here.
            self._source_queue.put_nowait(_SENTINEL)
        except Exception:
            pass
    # 3. Join writer thread (it closes _write_fd in its finally block)
    if self._writer_thread is not None:
        self._writer_thread.join(timeout=5.0)
        self._writer_thread = None
    # 4. Join demux thread -- must finish before we close the container
    if self._demux_thread is not None:
        self._demux_thread.join(timeout=5.0)
        self._demux_thread = None
    # 5. Now safe to close the container (no thread is using it)
    if self._container is not None:
        try:
            self._container.close()
        except Exception:
            pass
        self._container = None
    # 6. Close any remaining pipe FDs
    for fd_name in ("_read_fd", "_write_fd"):
        fd = getattr(self, fd_name, None)
        if fd is not None:
            try:
                os.close(fd)
            except OSError:
                pass
            setattr(self, fd_name, None)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,403 @@
"""
GPU-accelerated video transcoder with runtime detection.
Detects available hardware encoders/decoders at first use and selects
the best available backend:
- NVIDIA: h264_nvenc / hevc_cuvid (NVENC + CUDA)
- Apple macOS: h264_videotoolbox / hevc_videotoolbox
- Intel Linux: h264_vaapi / h264_qsv
- Fallback: libx264 (CPU)
The transcoder operates at the packet/frame level via PyAV, suitable
for integration into the streaming pipeline.
"""
import logging
from dataclasses import dataclass, field
from enum import Enum
from fractions import Fraction
import av
from mediaflow_proxy.configs import settings
logger = logging.getLogger(__name__)
class HWAccelType(Enum):
    """Hardware acceleration backend identifiers (values used in logs)."""

    NONE = "none"  # CPU-only fallback (libx264)
    NVIDIA = "nvidia"  # NVENC encode / CUVID decode
    VIDEOTOOLBOX = "videotoolbox"  # Apple VideoToolbox (macOS)
    VAAPI = "vaapi"  # VA-API (Linux)
    QSV = "qsv"  # Intel Quick Sync Video
@dataclass
class HWCapability:
    """Detected hardware acceleration capability."""

    # Selected backend; NONE means CPU libx264 encoding.
    accel_type: HWAccelType = HWAccelType.NONE
    # Encoder codec name to use for H.264 output.
    h264_encoder: str = "libx264"
    h264_decoder: str | None = None  # None = use default software decoder
    hevc_decoder: str | None = None  # None = use default software decoder
    # Every hardware encoder/decoder name that probed successfully (for logging).
    available_encoders: list[str] = field(default_factory=list)
    available_decoders: list[str] = field(default_factory=list)
# Module-level singleton -- populated on first call to get_hw_capability()
_hw_capability: HWCapability | None = None
def _probe_codec(name: str, mode: str = "w") -> bool:
    """
    Return True if PyAV exposes a codec with the given name.

    Args:
        name: Codec name (e.g. 'h264_videotoolbox').
        mode: 'w' for encoder, 'r' for decoder.
    """
    try:
        av.Codec(name, mode)
    except Exception:
        # Unknown codec, or a PyAV/FFmpeg build without this backend.
        return False
    return True
def _detect_hw_capability() -> HWCapability:
    """
    Probe the runtime environment for hardware encoder/decoder availability.

    Checks NVIDIA, Apple VideoToolbox, Intel VAAPI/QSV in priority order.
    Falls back to libx264 CPU encoding.
    """
    candidate_encoders = (
        "h264_nvenc",
        "hevc_nvenc",
        "h264_videotoolbox",
        "hevc_videotoolbox",
        "h264_vaapi",
        "hevc_vaapi",
        "h264_qsv",
        "hevc_qsv",
    )
    candidate_decoders = (
        "h264_cuvid",
        "hevc_cuvid",
        "h264_qsv",
        "hevc_qsv",
    )

    cap = HWCapability()
    cap.available_encoders = [c for c in candidate_encoders if _probe_codec(c, "w")]
    cap.available_decoders = [c for c in candidate_decoders if _probe_codec(c, "r")]
    encoders = cap.available_encoders
    decoders = cap.available_decoders

    if "h264_nvenc" in encoders:
        # Priority 1: NVIDIA NVENC (+CUVID decoders when present)
        cap.accel_type = HWAccelType.NVIDIA
        cap.h264_encoder = "h264_nvenc"
        cap.h264_decoder = "h264_cuvid" if "h264_cuvid" in decoders else None
        cap.hevc_decoder = "hevc_cuvid" if "hevc_cuvid" in decoders else None
    elif "h264_videotoolbox" in encoders:
        # Priority 2: Apple VideoToolbox
        # (its decoders are used automatically via hwaccel)
        cap.accel_type = HWAccelType.VIDEOTOOLBOX
        cap.h264_encoder = "h264_videotoolbox"
    elif "h264_vaapi" in encoders:
        # Priority 3: Intel VAAPI (Linux)
        cap.accel_type = HWAccelType.VAAPI
        cap.h264_encoder = "h264_vaapi"
    elif "h264_qsv" in encoders:
        # Priority 4: Intel QSV
        cap.accel_type = HWAccelType.QSV
        cap.h264_encoder = "h264_qsv"
        cap.h264_decoder = "h264_qsv" if "h264_qsv" in decoders else None
        cap.hevc_decoder = "hevc_qsv" if "hevc_qsv" in decoders else None
    else:
        # Fallback: CPU
        cap.accel_type = HWAccelType.NONE
        cap.h264_encoder = "libx264"
    return cap
def get_hw_capability() -> HWCapability:
    """Get the detected hardware acceleration capability (cached singleton)."""
    global _hw_capability
    if _hw_capability is not None:
        return _hw_capability
    cap = _detect_hw_capability()
    _hw_capability = cap
    # Log the outcome once, on first detection only.
    if settings.transcode_prefer_gpu and cap.accel_type != HWAccelType.NONE:
        logger.info(
            "[video_transcoder] GPU acceleration: %s (encoder=%s, decoders=%s)",
            cap.accel_type.value,
            cap.h264_encoder,
            cap.available_decoders or "software",
        )
    else:
        logger.info(
            "[video_transcoder] Using CPU encoder: %s (available HW: encoders=%s, decoders=%s)",
            cap.h264_encoder,
            cap.available_encoders or "none",
            cap.available_decoders or "none",
        )
    return cap
class VideoTranscoder:
    """
    In-process video transcoder using PyAV.

    Decodes input video packets and re-encodes to H.264 using the best
    available hardware encoder (or CPU libx264 fallback).

    Operates at the frame level: caller provides raw video packets (from
    PyAV demuxer), transcoder returns encoded H.264 NAL data suitable
    for the fMP4 muxer.
    """

    def __init__(
        self,
        input_codec_name: str,
        width: int,
        height: int,
        fps: float = 24.0,
        pixel_format: str = "yuv420p",
        force_software: bool = False,
    ) -> None:
        """
        Create decoder + encoder contexts for one video stream.

        Args:
            input_codec_name: FFmpeg codec name of the source (e.g. 'hevc').
            width: Source width in pixels (rounded up to even for H.264).
            height: Source height in pixels (rounded up to even for H.264).
            fps: Frame rate; drives the encoder time base and GOP size.
            pixel_format: Source pixel format hint (frames are reformatted
                to yuv420p before encoding regardless).
            force_software: If True, skip GPU backends even when available.

        Raises:
            Propagates whatever av.CodecContext.create()/open() raises for
            unknown or unavailable codec names.
        """
        # Fix: initialize teardown-state attributes FIRST so close()/__del__
        # are safe even if codec construction below raises. Previously these
        # were assigned last, so a failing __init__ left a partially built
        # instance whose __del__ hit AttributeError ("Exception ignored in
        # __del__"), masking the original error.
        self._flushed = False  # Prevents double-flush which causes SIGSEGV
        # Tracks whether the standalone decoder was actually used (via
        # decode_packet). When the demux thread decodes frames in-thread
        # (decode_video=True), the standalone decoder is never fed packets
        # and flushing it is wasted work.
        self._decoder_used = False
        self._encoder = None
        self._decoder = None
        self._frames_decoded = 0
        self._frames_encoded = 0

        hw = get_hw_capability()
        use_gpu = settings.transcode_prefer_gpu and hw.accel_type != HWAccelType.NONE and not force_software

        # --- Decoder ---
        hw_decoder = None
        if use_gpu:
            if "hevc" in input_codec_name or "h265" in input_codec_name:
                hw_decoder = hw.hevc_decoder
            else:
                hw_decoder = hw.h264_decoder
        decoder_name = hw_decoder or input_codec_name
        self._decoder = av.CodecContext.create(decoder_name, "r")

        # --- Encoder ---
        encoder_name = hw.h264_encoder if use_gpu else "libx264"
        # H.264 requires even dimensions
        enc_width = width if width % 2 == 0 else width + 1
        enc_height = height if height % 2 == 0 else height + 1
        self._encoder = av.CodecContext.create(encoder_name, "w")
        self._encoder.width = enc_width
        self._encoder.height = enc_height
        self._encoder.pix_fmt = "yuv420p"  # H.264 requires yuv420p
        self._encoder.time_base = Fraction(1, int(fps * 1000))
        self._encoder.framerate = Fraction(int(fps * 1000), 1000)
        self._encoder.bit_rate = _parse_bitrate(settings.transcode_video_bitrate)
        self._encoder.gop_size = int(fps * 2)  # Keyframe every ~2 seconds

        # Encoder options based on backend
        opts = {}
        if encoder_name == "libx264":
            opts["preset"] = settings.transcode_video_preset
            opts["tune"] = "zerolatency"
            opts["profile"] = "high"
        elif "nvenc" in encoder_name:
            opts["preset"] = "p4"  # NVENC preset (p1=fastest .. p7=slowest)
            opts["tune"] = "ll"  # Low latency
            opts["rc"] = "vbr"
        elif "videotoolbox" in encoder_name:
            opts["realtime"] = "1"
            opts["allow_sw"] = "1"  # Fallback to software if HW busy
        elif "vaapi" in encoder_name:
            opts["rc_mode"] = "VBR"
        elif "qsv" in encoder_name:
            opts["preset"] = "medium"
        self._encoder.options = opts
        self._encoder.open()

        self._input_codec = input_codec_name
        self._encoder_name = encoder_name
        self._width = enc_width
        self._height = enc_height
        logger.info(
            "[video_transcoder] Initialized: %s -> %s (%s), %dx%d @%.1ffps %dk",
            input_codec_name,
            encoder_name,
            hw.accel_type.value,
            enc_width,
            enc_height,
            fps,
            self._encoder.bit_rate // 1000 if self._encoder.bit_rate else 0,
        )

    @property
    def codec_private_data(self) -> bytes | None:
        """H.264 extradata (SPS/PPS) from the encoder, for the fMP4 init segment."""
        # Guarded against a closed transcoder (encoder already released).
        if self._encoder is not None and self._encoder.extradata:
            return bytes(self._encoder.extradata)
        return None

    @property
    def width(self) -> int:
        """Encoder output width (even; may be source width + 1)."""
        return self._width

    @property
    def height(self) -> int:
        """Encoder output height (even; may be source height + 1)."""
        return self._height

    def transcode_frame(self, frame: av.VideoFrame) -> list[tuple[bytes, bool, int, int]]:
        """
        Encode a decoded video frame to H.264.

        Args:
            frame: A decoded av.VideoFrame.

        Returns:
            List of (nal_data, is_keyframe, pts, dts) tuples. May be empty
            while the encoder is buffering.
        """
        self._frames_decoded += 1
        output = []
        # Ensure correct pixel format for encoder
        if frame.format.name != self._encoder.pix_fmt:
            frame = frame.reformat(format=self._encoder.pix_fmt)
        try:
            for packet in self._encoder.encode(frame):
                self._frames_encoded += 1
                output.append(
                    (
                        bytes(packet),
                        packet.is_keyframe,
                        int(packet.pts) if packet.pts is not None else 0,
                        int(packet.dts) if packet.dts is not None else 0,
                    )
                )
        except av.error.InvalidDataError as e:
            logger.debug("[video_transcoder] Encode error: %s", e)
        return output

    def decode_packet(self, packet: av.Packet) -> list[av.VideoFrame]:
        """Decode a video packet into frames (standalone-decoder path)."""
        self._decoder_used = True
        try:
            return list(self._decoder.decode(packet))
        except av.error.InvalidDataError as e:
            logger.debug("[video_transcoder] Decode error: %s", e)
            return []

    def flush(self) -> list[tuple[bytes, bool, int, int]]:
        """
        Flush encoder (and decoder, if it was used) buffers.

        When ``decode_video=True`` is used in PyAVDemuxer, the demux thread
        decodes frames using the container's codec context. In that case the
        standalone ``_decoder`` here is never fed any packets, so flushing
        it is skipped -- avoiding a stall that added ~5 s on some backends.

        Safe to call multiple times -- subsequent calls return an empty list.

        Returns:
            List of (nal_data, is_keyframe, pts, dts) tuples drained from
            the codec buffers.
        """
        if self._flushed:
            return []
        self._flushed = True
        output = []
        # Flush decoder only if it was actually used (via decode_packet)
        if self._decoder_used:
            try:
                for frame in self._decoder.decode(None):
                    self._frames_decoded += 1
                    if frame.format.name != self._encoder.pix_fmt:
                        frame = frame.reformat(format=self._encoder.pix_fmt)
                    for packet in self._encoder.encode(frame):
                        self._frames_encoded += 1
                        output.append(
                            (
                                bytes(packet),
                                packet.is_keyframe,
                                int(packet.pts) if packet.pts is not None else 0,
                                int(packet.dts) if packet.dts is not None else 0,
                            )
                        )
            except Exception as e:
                logger.debug("[video_transcoder] Decoder flush error: %s", e)
        else:
            logger.debug("[video_transcoder] Skipping decoder flush (decoder not used)")
        # Flush encoder
        try:
            for packet in self._encoder.encode(None):
                self._frames_encoded += 1
                output.append(
                    (
                        bytes(packet),
                        packet.is_keyframe,
                        int(packet.pts) if packet.pts is not None else 0,
                        int(packet.dts) if packet.dts is not None else 0,
                    )
                )
        except Exception as e:
            logger.debug("[video_transcoder] Encoder flush error: %s", e)
        logger.info(
            "[video_transcoder] Flushed: %d decoded, %d encoded total (decoder_used=%s)",
            self._frames_decoded,
            self._frames_encoded,
            self._decoder_used,
        )
        return output

    def close(self) -> None:
        """Release codec contexts.

        Flushes the encoder (if not already flushed) before releasing to avoid
        SIGSEGV when libx264 or hardware encoders have buffered frames at
        teardown time. Double-flushing is the most common cause of SIGSEGV
        in the transcode pipeline.

        PyAV codec contexts are released via garbage collection (no explicit
        close method), so we flush first to ensure native buffers are drained
        before the C-level codec is freed. Safe to call multiple times and on
        partially-constructed instances.
        """
        # flush() is idempotent -- safe to call even if already flushed.
        # getattr-guard: tolerate instances whose __init__ raised early.
        if getattr(self, "_encoder", None) is not None:
            self.flush()
        # Release references -- GC will free the native codec contexts
        self._encoder = None
        self._decoder = None

    def __del__(self) -> None:
        # Finalizers must never raise: an exception here is only printed as
        # "Exception ignored in __del__" and would mask the real error when
        # __init__ fails partway, or when logging breaks at interpreter
        # shutdown.
        try:
            self.close()
        except Exception:
            pass
def _parse_bitrate(bitrate_str: str) -> int:
"""Parse a bitrate string like '4M', '2000k', '5000000' to int bits/s."""
s = bitrate_str.strip().lower()
if s.endswith("m"):
return int(float(s[:-1]) * 1_000_000)
if s.endswith("k"):
return int(float(s[:-1]) * 1_000)
return int(s)