Files
UnHided/mediaflow_proxy/remuxer/mp4_muxer.py
UrloMythus cfc6bbabc9 update
2026-02-19 20:15:03 +01:00

1377 lines
50 KiB
Python

"""
Pure Python MP4 box builder for both standard and fragmented MP4.
Supports two modes:
1. Standard MP4 (moov-first): For progressive download with HTTP Range seeking.
File layout: ftyp | moov (full sample tables) | mdat
2. Fragmented MP4 (fMP4): For on-the-fly streaming via StreamingResponse.
Init segment: ftyp | moov (empty_moov with mvex)
Media segments: moof (tfhd + tfdt + trun) | mdat
The fMP4 mode is used for the transcode pipeline where MKV frames are
demuxed, audio is transcoded, and fMP4 fragments are streamed out
immediately without buffering the entire file.
"""
import logging
import struct
from dataclasses import dataclass, field
from mediaflow_proxy.remuxer.ebml_parser import MKVTrack, CODEC_ID_H264, CODEC_ID_H265
logger = logging.getLogger(__name__)
# =============================================================================
# Sample metadata
# =============================================================================
@dataclass
class SampleEntry:
    """Metadata for a single sample (frame) in the MP4 file.

    One instance feeds the stts/stsz/stss/ctts tables; the raw sample
    bytes are stored separately (in the mdat chunk list).
    """

    size: int  # Sample size in bytes
    duration: int  # Duration in track timescale ticks
    is_sync: bool  # True for keyframes (video) or all audio samples
    composition_offset: int = 0  # CTS offset (for B-frames)
@dataclass
class TrackSamples:
    """Collected sample metadata for one track during muxing."""

    samples: list[SampleEntry] = field(default_factory=list)
    chunk_offsets: list[int] = field(default_factory=list)  # Absolute byte offset of each chunk in mdat
    total_size: int = 0  # Total bytes of all samples
    total_duration: int = 0  # Total duration in timescale ticks

    def add(self, sample: SampleEntry) -> None:
        """Append *sample* and update the running size/duration totals."""
        self.samples.append(sample)
        self.total_size += sample.size
        self.total_duration += sample.duration
# =============================================================================
# Box building primitives
# =============================================================================
def build_box(box_type: bytes, payload: bytes) -> bytes:
    """Build a standard MP4 box: [4-byte size][4-byte type][payload].

    The size field counts the 8-byte header plus the payload.
    """
    return struct.pack(">I4s", 8 + len(payload), box_type) + payload
def build_full_box(box_type: bytes, version: int, flags: int, payload: bytes) -> bytes:
    """Build a full box with version and flags.

    A "full box" prepends a 32-bit (version << 24 | flags) word to the
    payload before wrapping it in a standard box header.
    """
    version_flags = struct.pack(">I", (version << 24) | (flags & 0xFFFFFF))
    body = version_flags + payload
    return struct.pack(">I", 8 + len(body)) + box_type + body
def build_box_header_large(box_type: bytes, total_size: int) -> bytes:
    """Build a box header for large boxes using 64-bit extended size.

    A compact size of 1 tells parsers to read the true size from the
    following uint64 ("largesize" in ISO 14496-12).
    """
    return b"\x00\x00\x00\x01" + box_type + struct.pack(">Q", total_size)
# =============================================================================
# ftyp box
# =============================================================================
def build_ftyp() -> bytes:
    """Build the File Type box for isom/iso2/mp41 compatible MP4."""
    payload = b"".join(
        (
            b"isom",                    # major brand
            struct.pack(">I", 0x200),   # minor version
            b"isom", b"iso2", b"mp41",  # compatible brands
        )
    )
    return build_box(b"ftyp", payload)
# =============================================================================
# moov box and children
# =============================================================================
def build_mvhd(timescale: int, duration: int, next_track_id: int = 3) -> bytes:
    """Build Movie Header box (mvhd), version 0.

    Args:
        timescale: Movie timescale in ticks per second.
        duration: Movie duration in movie-timescale ticks.
        next_track_id: Value for the next_track_ID field. Defaults to 3
            because this muxer always writes track 1 (video) and
            track 2 (audio); callers adding more tracks can override it.

    Returns:
        Serialized mvhd full box (version 0, 32-bit times).
    """
    payload = bytearray()
    payload.extend(struct.pack(">I", 0))  # creation_time
    payload.extend(struct.pack(">I", 0))  # modification_time
    payload.extend(struct.pack(">I", timescale))
    payload.extend(struct.pack(">I", duration))
    payload.extend(struct.pack(">I", 0x00010000))  # rate = 1.0 (16.16 fixed)
    payload.extend(struct.pack(">H", 0x0100))  # volume = 1.0 (8.8 fixed)
    payload.extend(b"\x00" * 10)  # reserved
    # Unity matrix (3x3, each 4 bytes, 9 values = 36 bytes)
    payload.extend(struct.pack(">9I", 0x00010000, 0, 0, 0, 0x00010000, 0, 0, 0, 0x40000000))
    payload.extend(b"\x00" * 24)  # pre_defined
    payload.extend(struct.pack(">I", next_track_id))
    return build_full_box(b"mvhd", 0, 0, bytes(payload))
def build_tkhd(track_id: int, duration: int, width: int = 0, height: int = 0, is_audio: bool = False) -> bytes:
    """Build Track Header box (tkhd), version 0.

    duration is in MOVIE timescale ticks; width/height are stored as
    16.16 fixed-point and stay 0 for audio tracks.
    """
    flags = 0x000003  # track_enabled | track_in_movie
    body = b"".join(
        (
            struct.pack(">I", 0),  # creation_time
            struct.pack(">I", 0),  # modification_time
            struct.pack(">I", track_id),
            b"\x00" * 4,  # reserved
            struct.pack(">I", duration),
            b"\x00" * 8,  # reserved
            struct.pack(">H", 0),  # layer
            struct.pack(">H", 1 if is_audio else 0),  # alternate_group
            struct.pack(">H", 0x0100 if is_audio else 0),  # volume (1.0 for audio)
            b"\x00" * 2,  # reserved
            # Unity display matrix
            struct.pack(">9I", 0x00010000, 0, 0, 0, 0x00010000, 0, 0, 0, 0x40000000),
            # Width and height as 16.16 fixed-point
            struct.pack(">I", width << 16),
            struct.pack(">I", height << 16),
        )
    )
    return build_full_box(b"tkhd", 0, flags, body)
def build_mdhd(timescale: int, duration: int) -> bytes:
    """Build Media Header box (mdhd), version 0.

    duration is in the TRACK's own timescale. 0x55C4 is the packed
    ISO-639-2 code for 'und' (undetermined language).
    """
    body = struct.pack(">IIIIHH", 0, 0, timescale, duration, 0x55C4, 0)
    return build_full_box(b"mdhd", 0, 0, body)
def build_hdlr(handler_type: bytes, name: str) -> bytes:
    """Build Handler Reference box (hdlr).

    handler_type is a 4-byte code (b"vide" / b"soun"); name is a
    human-readable label stored NUL-terminated.
    """
    body = (
        b"\x00" * 4  # pre_defined
        + handler_type  # handler_type (4 bytes)
        + b"\x00" * 12  # reserved
        + name.encode("utf-8")
        + b"\x00"  # NUL terminator for the handler name
    )
    return build_full_box(b"hdlr", 0, 0, body)
def build_vmhd() -> bytes:
    """Build Video Media Header box (vmhd).

    graphicsmode=0 (copy) with opcolor {0,0,0}; the flags field is 1.
    """
    return build_full_box(b"vmhd", 0, 1, struct.pack(">4H", 0, 0, 0, 0))
def build_smhd() -> bytes:
    """Build Sound Media Header box (smhd).

    balance=0 (centered) followed by two reserved zero bytes.
    """
    return build_full_box(b"smhd", 0, 0, struct.pack(">Hxx", 0))
def build_dref() -> bytes:
    """Build Data Reference box (dref) with a self-contained URL entry.

    flags=1 on the url entry marks the media data as living in this
    same file (no external reference).
    """
    entry = build_full_box(b"url ", 0, 1, b"")
    return build_full_box(b"dref", 0, 0, struct.pack(">I", 1) + entry)
def build_dinf() -> bytes:
    """Build Data Information box (dinf) wrapping the default dref."""
    dref = build_dref()
    return build_box(b"dinf", dref)
# =============================================================================
# Sample table boxes (stbl)
# =============================================================================
def build_stsd_video(track: MKVTrack) -> bytes:
    """Build Sample Description box (stsd) for a video track.

    Raises:
        ValueError: If the track's codec is neither H.264 nor H.265.
    """
    # Map codec IDs to their sample-entry builders.
    entry_builders = {
        CODEC_ID_H264: _build_avc1_entry,
        CODEC_ID_H265: _build_hvc1_entry,
    }
    builder = entry_builders.get(track.codec_id)
    if builder is None:
        raise ValueError(f"Unsupported video codec: {track.codec_id}")
    entry = builder(track)
    return build_full_box(b"stsd", 0, 0, struct.pack(">I", 1) + entry)
def _build_colr_nclx(
    colour_primaries: int = 1,
    transfer_characteristics: int = 1,
    matrix_coefficients: int = 1,
    full_range: bool = False,
) -> bytes:
    """
    Build a colr box with nclx (video colour) information.

    Defaults to BT.709 (the standard for HD content), matching ffmpeg's
    default behaviour for fMP4 output.
    """
    # The top bit of the final byte is the full_range flag.
    range_byte = 0x80 if full_range else 0x00
    body = b"nclx" + struct.pack(
        ">HHHB", colour_primaries, transfer_characteristics, matrix_coefficients, range_byte
    )
    return build_box(b"colr", body)
def _build_pasp(h_spacing: int = 1, v_spacing: int = 1) -> bytes:
    """
    Build a pasp (Pixel Aspect Ratio) box.

    Default 1:1 (square pixels), which is the norm for HD content.
    """
    payload = struct.pack(">I", h_spacing) + struct.pack(">I", v_spacing)
    return build_box(b"pasp", payload)
def _build_avc1_entry(track: MKVTrack) -> bytes:
    """Build an avc3 VisualSampleEntry.

    Uses ``avc3`` instead of ``avc1`` to allow in-band SPS/PPS parameter
    set updates in sample data. Many MKV sources embed mid-stream PPS
    changes in the bitstream; ``avc3`` signals to the player that these
    may appear in any sample, avoiding "non-existing PPS" decode errors.
    """
    # Fixed-layout VisualSampleEntry header: 6 reserved bytes,
    # data_reference_index=1, 16 bytes pre_defined/reserved, width, height,
    # 72 dpi horizontal/vertical resolution, 4 reserved bytes,
    # frame_count=1, zeroed 32-byte compressorname, depth=24, pre_defined=-1.
    header = struct.pack(
        ">6xH16xHHII4xH32xHh",
        1,
        track.pixel_width,
        track.pixel_height,
        0x00480000,
        0x00480000,
        1,
        0x0018,
        -1,
    )
    parts = [header]
    if track.codec_private:
        # avcC decoder configuration comes straight from MKV CodecPrivate
        parts.append(build_box(b"avcC", track.codec_private))
    parts.append(_build_colr_nclx())  # nclx colour information (BT.709)
    parts.append(_build_pasp())  # pixel aspect ratio (1:1)
    return build_box(b"avc3", b"".join(parts))
def _build_hvc1_entry(track: MKVTrack) -> bytes:
    """Build an hvc1 VisualSampleEntry."""
    # Fixed-layout VisualSampleEntry header: 6 reserved bytes,
    # data_reference_index=1, 16 bytes pre_defined/reserved, width, height,
    # 72 dpi horizontal/vertical resolution, 4 reserved bytes,
    # frame_count=1, zeroed 32-byte compressorname, depth=24, pre_defined=-1.
    header = struct.pack(
        ">6xH16xHHII4xH32xHh",
        1,
        track.pixel_width,
        track.pixel_height,
        0x00480000,
        0x00480000,
        1,
        0x0018,
        -1,
    )
    parts = [header]
    if track.codec_private:
        # hvcC decoder configuration comes straight from MKV CodecPrivate
        parts.append(build_box(b"hvcC", track.codec_private))
    parts.append(_build_colr_nclx())  # nclx colour information (BT.709)
    parts.append(_build_pasp())  # pixel aspect ratio (1:1)
    return build_box(b"hvc1", b"".join(parts))
def build_stsd_audio(sample_rate: int, channels: int, audio_specific_config: bytes) -> bytes:
    """Build Sample Description box (stsd) for an AAC audio track."""
    mp4a = _build_mp4a_entry(sample_rate, channels, audio_specific_config)
    # entry_count=1, followed by the single mp4a entry
    return build_full_box(b"stsd", 0, 0, struct.pack(">I", 1) + mp4a)
def _build_mp4a_entry(sample_rate: int, channels: int, asc: bytes) -> bytes:
    """Build an mp4a AudioSampleEntry with esds box.

    AudioSampleEntry layout: 6 reserved bytes, data_reference_index=1,
    8 reserved bytes, channelcount, samplesize=16, 4 reserved bytes,
    samplerate as 16.16 fixed-point.
    """
    header = struct.pack(">6xH8xHH4xI", 1, channels, 16, sample_rate << 16)
    return build_box(b"mp4a", header + _build_esds(sample_rate, channels, asc))
def _build_esds(sample_rate: int, channels: int, asc: bytes) -> bytes:
    """Build an Elementary Stream Descriptor box (esds) for AAC.

    Nests DecoderSpecificInfo inside DecoderConfigDescriptor inside
    ES_Descriptor, each with expandable-length encoding.
    """
    # DecoderConfigDescriptor body
    dec_config = (
        b"\x40"  # objectTypeIndication = Audio ISO/IEC 14496-3 (AAC)
        + b"\x15"  # streamType=5 (audio) upstream=0 reserved=1
        + b"\x00\x00\x00"  # bufferSizeDB (3 bytes)
        + struct.pack(">II", 192000, 192000)  # maxBitrate / avgBitrate
        + _build_descriptor(0x05, asc)  # DecoderSpecificInfo (AudioSpecificConfig)
    )
    # ES_Descriptor body: ES_ID=1, priority 0, then nested descriptors
    es_desc = (
        struct.pack(">H", 1)
        + b"\x00"
        + _build_descriptor(0x04, dec_config)
        + _build_descriptor(0x06, b"\x02")  # SLConfigDescriptor (predefined=2 for MP4)
    )
    return build_full_box(b"esds", 0, 0, _build_descriptor(0x03, es_desc))
def _build_descriptor(tag: int, data: bytes) -> bytes:
"""Build an ISO 14496-1 descriptor with expandable length encoding."""
length = len(data)
result = bytearray()
result.append(tag)
# Expandable length: use 4 bytes (most compatible)
result.append(0x80 | ((length >> 21) & 0x7F))
result.append(0x80 | ((length >> 14) & 0x7F))
result.append(0x80 | ((length >> 7) & 0x7F))
result.append(length & 0x7F)
result.extend(data)
return bytes(result)
def build_stts(samples: list[SampleEntry]) -> bytes:
    """
    Build Time-to-Sample box (stts) with run-length encoding.

    Groups consecutive samples with the same duration into
    (sample_count, sample_delta) runs.
    """
    runs: list[tuple[int, int]] = []
    for sample in samples:
        if runs and runs[-1][1] == sample.duration:
            # Extend the current run
            runs[-1] = (runs[-1][0] + 1, sample.duration)
        else:
            runs.append((1, sample.duration))
    payload = bytearray(struct.pack(">I", len(runs)))
    for count, delta in runs:
        payload.extend(struct.pack(">II", count, delta))
    return build_full_box(b"stts", 0, 0, bytes(payload))
def build_stss(samples: list[SampleEntry]) -> bytes | None:
    """
    Build Sync Sample box (stss) listing keyframe indices.

    Returns None if all samples are sync (audio tracks), as stss is
    only needed when not all samples are sync points.
    """
    sync_indices = [i for i, s in enumerate(samples, start=1) if s.is_sync]  # 1-based
    if len(sync_indices) == len(samples):
        # Every sample is a random-access point; omit stss entirely.
        return None
    payload = struct.pack(">I", len(sync_indices)) + b"".join(
        struct.pack(">I", idx) for idx in sync_indices
    )
    return build_full_box(b"stss", 0, 0, payload)
def build_ctts(samples: list[SampleEntry]) -> bytes | None:
    """
    Build Composition Time-to-Sample box (ctts) for B-frame offsets.

    Emits version 0 (unsigned offsets) when every offset is non-negative,
    and version 1 (signed offsets, per ISO 14496-12) when any offset is
    negative — version 0 cannot represent negative values and packing
    them with ">I" would raise struct.error.

    Returns None if no samples have composition offsets.
    """
    has_offsets = any(s.composition_offset != 0 for s in samples)
    if not has_offsets:
        return None
    # Run-length encode consecutive equal offsets
    entries = []
    current_offset = samples[0].composition_offset
    current_count = 1
    for s in samples[1:]:
        if s.composition_offset == current_offset:
            current_count += 1
        else:
            entries.append((current_count, current_offset))
            current_offset = s.composition_offset
            current_count = 1
    entries.append((current_count, current_offset))
    # Version 0 is more widely supported, so keep it unless a signed
    # offset forces version 1.
    version = 1 if any(offset < 0 for _, offset in entries) else 0
    entry_fmt = ">Ii" if version else ">II"
    payload = bytearray()
    payload.extend(struct.pack(">I", len(entries)))
    for count, offset in entries:
        payload.extend(struct.pack(entry_fmt, count, offset))
    return build_full_box(b"ctts", version, 0, bytes(payload))
def build_stsz(samples: list[SampleEntry]) -> bytes:
    """Build Sample Size box (stsz).

    Uses the compact uniform form when every sample shares one size;
    otherwise sample_size=0 signals a per-sample size table.
    """
    sizes = [s.size for s in samples]
    uniform = bool(sizes) and len(set(sizes)) == 1
    payload = bytearray()
    if uniform:
        # One shared size; no per-sample table needed.
        payload.extend(struct.pack(">II", sizes[0], len(sizes)))
    else:
        payload.extend(struct.pack(">II", 0, len(sizes)))
        for size in sizes:
            payload.extend(struct.pack(">I", size))
    return build_full_box(b"stsz", 0, 0, bytes(payload))
def build_stsc(num_chunks: int) -> bytes:
    """
    Build Sample-to-Chunk box (stsc).

    For simplicity, we use one sample per chunk (each sample gets its
    own chunk offset). This is slightly less compact but much simpler
    and fully correct.

    NOTE: ``num_chunks`` is unused (kept for API compatibility); the
    single mapping entry covers every chunk implicitly.
    """
    # entry_count=1, then first_chunk=1 / samples_per_chunk=1 / desc_index=1
    return build_full_box(b"stsc", 0, 0, struct.pack(">IIII", 1, 1, 1, 1))
def build_stco(offsets: list[int]) -> bytes:
    """Build Chunk Offset box (stco, 32-bit offsets)."""
    payload = struct.pack(">I", len(offsets)) + b"".join(
        struct.pack(">I", off) for off in offsets
    )
    return build_full_box(b"stco", 0, 0, payload)
def build_co64(offsets: list[int]) -> bytes:
    """Build Chunk Offset box (co64, 64-bit offsets) for large files."""
    payload = struct.pack(">I", len(offsets)) + b"".join(
        struct.pack(">Q", off) for off in offsets
    )
    return build_full_box(b"co64", 0, 0, payload)
# =============================================================================
# Track building (assembles trak box hierarchy)
# =============================================================================
def build_stbl(track_samples: TrackSamples, stsd: bytes) -> bytes:
    """Build the Sample Table box (stbl) for a track.

    Assembles stsd/stts/(stss)/(ctts)/stsz/stsc plus either stco or
    co64, depending on whether any chunk offset exceeds 32 bits.
    """
    samples = track_samples.samples
    offsets = track_samples.chunk_offsets
    parts = [stsd, build_stts(samples)]
    stss = build_stss(samples)
    if stss is not None:
        parts.append(stss)
    ctts = build_ctts(samples)
    if ctts is not None:
        parts.append(ctts)
    parts.append(build_stsz(samples))
    parts.append(build_stsc(len(offsets)))
    # co64 only when some chunk lies beyond the 32-bit range
    if any(off > 0xFFFFFFFF for off in offsets):
        parts.append(build_co64(offsets))
    else:
        parts.append(build_stco(offsets))
    return build_box(b"stbl", b"".join(parts))
def build_minf(is_audio: bool, stbl: bytes) -> bytes:
    """Build Media Information box (minf).

    The media header is smhd for audio, vmhd for video.
    """
    media_header = build_smhd() if is_audio else build_vmhd()
    return build_box(b"minf", media_header + build_dinf() + stbl)
def build_mdia(timescale: int, duration: int, handler_type: bytes, handler_name: str, minf: bytes) -> bytes:
    """Build Media box (mdia): mdhd + hdlr + minf."""
    parts = (
        build_mdhd(timescale, duration),
        build_hdlr(handler_type, handler_name),
        minf,
    )
    return build_box(b"mdia", b"".join(parts))
def build_video_trak(
    track: MKVTrack,
    track_id: int,
    timescale: int,
    track_samples: TrackSamples,
    movie_timescale: int,
) -> bytes:
    """Build a complete video trak box."""
    track_duration = track_samples.total_duration
    # tkhd durations are expressed in the MOVIE timescale, not the track's.
    movie_duration = int(track_duration * movie_timescale / timescale) if timescale > 0 else 0
    tkhd = build_tkhd(track_id, movie_duration, width=track.pixel_width, height=track.pixel_height)
    stbl = build_stbl(track_samples, build_stsd_video(track))
    minf = build_minf(is_audio=False, stbl=stbl)
    mdia = build_mdia(timescale, track_duration, b"vide", "VideoHandler", minf)
    return build_box(b"trak", tkhd + mdia)
def build_audio_trak(
    track_id: int,
    timescale: int,
    track_samples: TrackSamples,
    movie_timescale: int,
    sample_rate: int,
    channels: int,
    audio_specific_config: bytes,
) -> bytes:
    """Build a complete audio trak box."""
    track_duration = track_samples.total_duration
    # tkhd durations are expressed in the MOVIE timescale, not the track's.
    movie_duration = int(track_duration * movie_timescale / timescale) if timescale > 0 else 0
    tkhd = build_tkhd(track_id, movie_duration, is_audio=True)
    stbl = build_stbl(track_samples, build_stsd_audio(sample_rate, channels, audio_specific_config))
    minf = build_minf(is_audio=True, stbl=stbl)
    mdia = build_mdia(timescale, track_duration, b"soun", "SoundHandler", minf)
    return build_box(b"trak", tkhd + mdia)
# =============================================================================
# Complete moov builder
# =============================================================================
def build_moov(
    video_track: MKVTrack,
    audio_track_info: dict,
    video_samples: TrackSamples,
    audio_samples: TrackSamples,
    mdat_offset: int,
    video_timescale: int = 90000,
    audio_timescale: int = 48000,
    movie_timescale: int = 1000,
) -> bytes:
    """
    Build the complete moov box with all track metadata.

    Args:
        video_track: MKVTrack with video codec info.
        audio_track_info: Dict with keys: sample_rate, channels, audio_specific_config.
        video_samples: Collected video sample metadata (chunk_offsets already set).
        audio_samples: Collected audio sample metadata (chunk_offsets already set).
        mdat_offset: Byte offset where mdat data starts. NOTE: unused here
            (kept for API compatibility); the actual offsets are read from
            the TrackSamples chunk_offsets.
        video_timescale: Video track timescale (default 90000 for 90kHz).
        audio_timescale: Audio track timescale (typically sample_rate).
        movie_timescale: Movie header timescale (default 1000 = ms).

    Returns:
        Complete moov box bytes.
    """

    def to_movie_ticks(duration: int, timescale: int) -> int:
        # Convert a track-timescale duration into movie-timescale ticks.
        if timescale > 0 and duration > 0:
            return int(duration * movie_timescale / timescale)
        return 0

    movie_duration = max(
        to_movie_ticks(video_samples.total_duration, video_timescale),
        to_movie_ticks(audio_samples.total_duration, audio_timescale),
    )
    video_trak = build_video_trak(
        video_track,
        track_id=1,
        timescale=video_timescale,
        track_samples=video_samples,
        movie_timescale=movie_timescale,
    )
    audio_trak = build_audio_trak(
        track_id=2,
        timescale=audio_timescale,
        track_samples=audio_samples,
        movie_timescale=movie_timescale,
        sample_rate=audio_track_info["sample_rate"],
        channels=audio_track_info["channels"],
        audio_specific_config=audio_track_info["audio_specific_config"],
    )
    mvhd = build_mvhd(movie_timescale, movie_duration)
    return build_box(b"moov", mvhd + video_trak + audio_trak)
# =============================================================================
# mdat box header
# =============================================================================
def build_mdat_header(data_size: int) -> bytes:
    """
    Build the mdat box header.

    Uses the compact 8-byte header when the whole box fits in a 32-bit
    size field, otherwise the extended form: size=1 followed by a
    64-bit size that includes the 16-byte header itself.
    """
    if 8 + data_size <= 0xFFFFFFFF:
        return struct.pack(">I", 8 + data_size) + b"mdat"
    return struct.pack(">I", 1) + b"mdat" + struct.pack(">Q", 16 + data_size)
# =============================================================================
# MP4 Builder (high-level orchestrator)
# =============================================================================
class MP4Builder:
    """
    High-level MP4 file builder.

    Collects video and audio samples during a transcode pass, then produces
    a complete moov-first MP4 file.

    Usage:
        builder = MP4Builder(video_track, audio_sample_rate=48000,
                             audio_channels=2, audio_specific_config=asc)
        for frame in video_frames:
            builder.add_video_sample(frame.data, frame.duration_ticks, frame.is_keyframe)
        for frame in audio_frames:
            builder.add_audio_sample(frame.data, frame.duration_ticks)
        moov_bytes, mdat_header, sample_data_list = builder.finalize()
    """

    def __init__(
        self,
        video_track: MKVTrack,
        audio_sample_rate: int = 48000,
        audio_channels: int = 2,
        audio_specific_config: bytes = b"",
        video_timescale: int = 90000,
        audio_timescale: int = 48000,
    ) -> None:
        self._video_track = video_track
        # Audio description dict forwarded verbatim to build_moov()
        self._audio_info = {
            "sample_rate": audio_sample_rate,
            "channels": audio_channels,
            "audio_specific_config": audio_specific_config,
        }
        self._video_timescale = video_timescale
        self._audio_timescale = audio_timescale
        self._video_samples = TrackSamples()
        self._audio_samples = TrackSamples()
        self._mdat_chunks: list[bytes] = []  # Interleaved sample data
        self._mdat_size: int = 0
        self._sample_order: list[str] = []  # "v" or "a" for each mdat chunk

    def add_video_sample(self, data: bytes, duration_ticks: int, is_keyframe: bool) -> None:
        """Add a video sample (H.264/H.265 NALUs) to the builder."""
        entry = SampleEntry(size=len(data), duration=duration_ticks, is_sync=is_keyframe)
        self._video_samples.add(entry)
        self._mdat_chunks.append(data)
        self._mdat_size += len(data)
        self._sample_order.append("v")

    def add_audio_sample(self, data: bytes, duration_ticks: int) -> None:
        """Add an audio sample (AAC frame) to the builder.

        Audio samples are always marked as sync points.
        """
        entry = SampleEntry(size=len(data), duration=duration_ticks, is_sync=True)
        self._audio_samples.add(entry)
        self._mdat_chunks.append(data)
        self._mdat_size += len(data)
        self._sample_order.append("a")

    @property
    def video_sample_count(self) -> int:
        """Number of video samples added so far."""
        return len(self._video_samples.samples)

    @property
    def audio_sample_count(self) -> int:
        """Number of audio samples added so far."""
        return len(self._audio_samples.samples)

    @property
    def mdat_size(self) -> int:
        """Total bytes of raw sample data collected for the mdat box."""
        return self._mdat_size

    def finalize(self) -> tuple[bytes, bytes, list[bytes]]:
        """
        Build the final MP4 file components.

        Since moov needs accurate chunk offsets (stco/co64) that depend on
        moov's own size, we do a two-pass approach:
        1. Build moov with placeholder offsets to determine its size
        2. Rebuild moov with correct offsets

        Returns:
            (ftyp_moov_bytes, mdat_header_bytes, mdat_chunk_list)
            Concatenating these gives the complete MP4 file.
        """
        ftyp = build_ftyp()
        # Build mdat header
        mdat_hdr = build_mdat_header(self._mdat_size)
        # Pass 1: Build moov with placeholder (0) offsets to measure its size
        self._compute_chunk_offsets(0)  # Placeholder base
        moov_pass1 = build_moov(
            self._video_track,
            self._audio_info,
            self._video_samples,
            self._audio_samples,
            mdat_offset=0,
            video_timescale=self._video_timescale,
            audio_timescale=self._audio_timescale,
        )
        # Calculate actual mdat data start:
        # ftyp + moov + mdat_header
        mdat_data_start = len(ftyp) + len(moov_pass1) + len(mdat_hdr)
        # Pass 2: Rebuild moov with correct chunk offsets
        self._compute_chunk_offsets(mdat_data_start)
        moov_final = build_moov(
            self._video_track,
            self._audio_info,
            self._video_samples,
            self._audio_samples,
            mdat_offset=mdat_data_start,
            video_timescale=self._video_timescale,
            audio_timescale=self._audio_timescale,
        )
        # Verify moov size didn't change (it shouldn't since offsets are same width)
        if len(moov_final) != len(moov_pass1):
            # Size changed (e.g., offsets crossed 32/64-bit boundary). Redo.
            # NOTE(review): a single retry assumes the size is stable after
            # one boundary crossing; offsets only grow, so co64 cannot
            # revert to stco on the retry.
            mdat_data_start = len(ftyp) + len(moov_final) + len(mdat_hdr)
            self._compute_chunk_offsets(mdat_data_start)
            moov_final = build_moov(
                self._video_track,
                self._audio_info,
                self._video_samples,
                self._audio_samples,
                mdat_offset=mdat_data_start,
                video_timescale=self._video_timescale,
                audio_timescale=self._audio_timescale,
            )
        header_bytes = ftyp + moov_final
        logger.info(
            "[mp4_muxer] Finalized: ftyp=%d moov=%d mdat=%d (header=%d) video=%d samples audio=%d samples",
            len(ftyp),
            len(moov_final),
            self._mdat_size,
            len(mdat_hdr),
            len(self._video_samples.samples),
            len(self._audio_samples.samples),
        )
        return header_bytes, mdat_hdr, self._mdat_chunks

    def _compute_chunk_offsets(self, mdat_data_start: int) -> None:
        """Compute absolute byte offsets for each sample in the mdat."""
        # Samples were added interleaved (video/audio/video/audio...)
        # so mdat_chunks[i] corresponds to samples in order.
        # We need to assign offsets per track.
        video_offsets = []
        audio_offsets = []
        offset = mdat_data_start
        vi = 0  # video samples consumed so far
        ai = 0  # audio samples consumed so far
        for chunk in self._mdat_chunks:
            chunk_size = len(chunk)
            # Determine if this chunk is video or audio based on sample order
            if vi < len(self._video_samples.samples) and (
                ai >= len(self._audio_samples.samples) or self._is_video_sample(vi, ai)
            ):
                video_offsets.append(offset)
                vi += 1
            else:
                audio_offsets.append(offset)
                ai += 1
            offset += chunk_size
        self._video_samples.chunk_offsets = video_offsets
        self._audio_samples.chunk_offsets = audio_offsets

    def _is_video_sample(self, vi: int, ai: int) -> bool:
        """
        Determine if the next mdat chunk at position (vi+ai) is a video sample.

        This relies on the add order tracking. We use a simple scheme:
        samples are added in their interleaved order, and we track which
        indices are video vs audio.
        """
        # The _mdat_chunks list contains samples in the order they were added.
        # We need to know the order. For now, use the _sample_order tracker.
        idx = vi + ai
        if idx < len(self._sample_order):
            return self._sample_order[idx] == "v"
        return vi < len(self._video_samples.samples)

    def update_audio_specific_config(self, asc: bytes) -> None:
        """Update the AudioSpecificConfig (e.g., after first encode)."""
        self._audio_info["audio_specific_config"] = asc
# =============================================================================
# Fragmented MP4 (fMP4) builder for streaming output
# =============================================================================
#
# fMP4 layout:
# Init segment: ftyp + moov (mvhd + mvex/trex + trak[video] + trak[audio])
# Media segments: moof (mfhd + traf[tfhd + tfdt + trun]) + mdat
#
# The moov in fMP4 has empty sample tables (stts/stsz/stsc/stco with 0 entries)
# and an mvex box with trex entries signaling fragmented mode.
# =============================================================================
def _build_empty_stbl(stsd: bytes) -> bytes:
    """Build an stbl with empty sample tables (for fMP4 init segment).

    Fragmented MP4 keeps sample metadata in moof boxes, so the moov
    sample tables are present but hold zero entries.
    """
    empty_u32 = struct.pack(">I", 0)
    children = (
        stsd
        + build_full_box(b"stts", 0, 0, empty_u32)
        + build_full_box(b"stsc", 0, 0, empty_u32)
        + build_full_box(b"stsz", 0, 0, struct.pack(">II", 0, 0))
        + build_full_box(b"stco", 0, 0, empty_u32)
    )
    return build_box(b"stbl", children)
def build_fmp4_init_segment(
    video_track: MKVTrack,
    audio_sample_rate: int,
    audio_channels: int,
    audio_specific_config: bytes,
    video_timescale: int = 90000,
    audio_timescale: int = 48000,
    duration_ms: float = 0.0,
) -> bytes:
    """
    Build an fMP4 initialization segment (ftyp + moov with empty_moov).

    The moov contains track descriptions (codec config) and mvex/trex
    entries signaling fragmented mode. No sample data.

    Args:
        video_track: MKVTrack with video codec info.
        audio_sample_rate: Output audio sample rate.
        audio_channels: Output audio channel count.
        audio_specific_config: AAC AudioSpecificConfig bytes.
        video_timescale: Video track timescale (default 90000).
        audio_timescale: Audio track timescale (default sample_rate).
        duration_ms: Total duration in ms (0 = unknown/live).

    Returns:
        Complete init segment bytes (ftyp + moov).
    """
    movie_timescale = 1000  # ms
    known_duration = duration_ms > 0
    movie_duration = int(duration_ms) if known_duration else 0

    def track_duration(timescale: int) -> int:
        # mdhd durations use the track's own timescale.
        return int(duration_ms * timescale / 1000.0) if known_duration else 0

    # Video trak with an empty stbl (codec config only, no samples)
    video_tkhd = build_tkhd(
        1, movie_duration, width=video_track.pixel_width, height=video_track.pixel_height
    )
    video_stbl = _build_empty_stbl(build_stsd_video(video_track))
    video_minf = build_minf(is_audio=False, stbl=video_stbl)
    video_mdia = build_mdia(
        video_timescale, track_duration(video_timescale), b"vide", "VideoHandler", video_minf
    )
    video_trak = build_box(b"trak", video_tkhd + video_mdia)

    # Audio trak with an empty stbl
    audio_tkhd = build_tkhd(2, movie_duration, is_audio=True)
    audio_stbl = _build_empty_stbl(
        build_stsd_audio(audio_sample_rate, audio_channels, audio_specific_config)
    )
    audio_minf = build_minf(is_audio=True, stbl=audio_stbl)
    audio_mdia = build_mdia(
        audio_timescale, track_duration(audio_timescale), b"soun", "SoundHandler", audio_minf
    )
    audio_trak = build_box(b"trak", audio_tkhd + audio_mdia)

    # mvex (Movie Extends) signals fragmented mode; one trex per track.
    # default_sample_flags is 0x00000000 (same as ffmpeg), deferring all
    # sample flag decisions to each fragment's tfhd/trun. This avoids
    # global defaults that could confuse strict browser parsers.
    def _trex(track_id: int) -> bytes:
        return build_full_box(
            b"trex",
            0,
            0,
            struct.pack(
                ">IIIII",
                track_id,  # track_ID
                1,  # default_sample_description_index
                0,  # default_sample_duration
                0,  # default_sample_size
                0x00000000,  # default_sample_flags (deferred per fragment)
            ),
        )

    mvex = build_box(b"mvex", _trex(1) + _trex(2))
    mvhd = build_mvhd(movie_timescale, movie_duration)
    moov = build_box(b"moov", mvhd + video_trak + audio_trak + mvex)
    return _build_fmp4_ftyp() + moov
def _build_fmp4_ftyp() -> bytes:
    """Build the ``ftyp`` box for a fragmented MP4 (major brand isom, minor 0x200)."""
    # Major brand, minor version, then the compatible-brand list.
    compatible_brands = (b"isom", b"iso6", b"mp41", b"msdh", b"msix")
    parts = [b"isom", struct.pack(">I", 0x200)]
    parts.extend(compatible_brands)
    return build_box(b"ftyp", b"".join(parts))
@dataclass
class FragmentSample:
    """One media sample queued for inclusion in the next fMP4 fragment."""

    data: bytes  # raw sample payload (copied verbatim into the mdat box)
    duration: int  # duration in track-timescale ticks
    is_sync: bool = False  # True for video keyframes and for every audio sample
    composition_offset: int = 0  # CTS offset (PTS - DTS) for B-frame reordering

    @property
    def size(self) -> int:
        """Byte length of the sample payload."""
        return len(self.data)
def build_fmp4_fragment(
    sequence_number: int,
    track_id: int,
    base_decode_time: int,
    samples: list[FragmentSample],
) -> bytes:
    """
    Build an fMP4 media segment (moof + mdat) for a single track.

    Args:
        sequence_number: Fragment sequence number (1-based, incrementing).
        track_id: Track ID (1=video, 2=audio).
        base_decode_time: Decode time of the first sample in track timescale.
        samples: List of samples for this fragment.

    Returns:
        Complete moof + mdat bytes, or b"" when *samples* is empty.
    """
    if not samples:
        return b""
    # mdat payload: sample data concatenated back-to-back, in decode order.
    mdat_payload = b"".join(s.data for s in samples)
    # Build trun (Track Fragment Run) flags:
    #   0x000001 data_offset_present
    #   0x000100 sample_duration_present
    #   0x000200 sample_size_present
    #   0x000004 first_sample_flags_present (keyframe indication)
    #   0x000800 sample_composition_time_offsets_present (only when needed)
    has_cts = any(s.composition_offset != 0 for s in samples)
    trun_flags = 0x000001 | 0x000100 | 0x000200  # data_offset + duration + size
    if has_cts:
        trun_flags |= 0x000800  # sample_composition_time_offsets_present
    # Use first_sample_flags for keyframe indication
    trun_flags |= 0x000004  # first_sample_flags_present
    trun_payload = bytearray()
    trun_payload.extend(struct.pack(">I", len(samples)))  # sample_count
    # data_offset is not known until the full moof size is known; write a
    # 4-byte placeholder now and remember its position for patching below.
    data_offset_pos = len(trun_payload)
    trun_payload.extend(struct.pack(">i", 0))  # data_offset placeholder
    # first_sample_flags: overrides tfhd.default_sample_flags for sample 0.
    if samples[0].is_sync:
        first_flags = 0x02000000  # sample_depends_on=2 (does not depend, i.e., sync)
    else:
        first_flags = 0x01010000  # sample_depends_on=1, is_non_sync=1
    trun_payload.extend(struct.pack(">I", first_flags))
    # Per-sample entries: duration, size, and (optionally) CTS offset.
    for s in samples:
        trun_payload.extend(struct.pack(">I", s.duration))
        trun_payload.extend(struct.pack(">I", s.size))
        if has_cts:
            trun_payload.extend(struct.pack(">i", s.composition_offset))
    # Use version 1 when CTS offsets are present: version 0 offsets are
    # unsigned, version 1 allows the signed offsets B-frames require.
    trun_version = 1 if has_cts else 0
    trun = build_full_box(b"trun", trun_version, trun_flags, bytes(trun_payload))
    # tfdt (Track Fragment Decode Time) - version 1 for 64-bit time
    tfdt_payload = struct.pack(">Q", base_decode_time)
    tfdt = build_full_box(b"tfdt", 1, 0, tfdt_payload)
    # tfhd (Track Fragment Header)
    # Flags: 0x020000 = default_base_is_moof
    #        0x000020 = default_sample_flags_present
    # Since trex.default_sample_flags is 0x00000000, we set per-fragment
    # defaults here (matching ffmpeg behaviour):
    #   - Video: 0x01010000 (sample_depends_on=1, is_non_sync=1)
    #   - Audio: 0x02000000 (sample_depends_on=2 = independent)
    # The trun.first_sample_flags overrides this for keyframes.
    is_video = track_id == 1
    default_sample_flags = 0x01010000 if is_video else 0x02000000
    tfhd_flags = 0x020000 | 0x000020  # default_base_is_moof + default_sample_flags_present
    tfhd_payload = struct.pack(">II", track_id, default_sample_flags)
    tfhd = build_full_box(b"tfhd", 0, tfhd_flags, tfhd_payload)
    # traf
    traf = build_box(b"traf", tfhd + tfdt + trun)
    # mfhd (Movie Fragment Header)
    mfhd = build_full_box(b"mfhd", 0, 0, struct.pack(">I", sequence_number))
    # moof
    moof = build_box(b"moof", mfhd + traf)
    # Patch data_offset in trun: distance from moof start to mdat payload
    # start. The mdat box header is 8 bytes, so data_offset = moof_size + 8.
    data_offset = len(moof) + 8  # 8 = mdat box header
    # Rather than doing fragile byte-offset math inside the assembled moof,
    # patch the placeholder in the saved trun payload and rebuild the
    # enclosing boxes; the sizes are unchanged so the result is identical
    # except for the patched field.
    trun_payload_fixed = bytearray(trun_payload)
    struct.pack_into(">i", trun_payload_fixed, data_offset_pos, data_offset)
    trun_fixed = build_full_box(b"trun", trun_version, trun_flags, bytes(trun_payload_fixed))
    # Rebuild traf -> moof with fixed trun
    traf_fixed = build_box(b"traf", tfhd + tfdt + trun_fixed)
    moof_fixed = build_box(b"moof", mfhd + traf_fixed)
    # Internal invariant: patching a fixed-width field must not change sizes.
    assert len(moof_fixed) == len(moof), "moof size changed after data_offset patch"
    # mdat
    mdat = build_box(b"mdat", mdat_payload)
    return bytes(moof_fixed) + mdat
class FMP4StreamMuxer:
"""
Streaming fMP4 muxer that produces fragments on-the-fly.
Usage:
muxer = FMP4StreamMuxer(video_track, audio_sample_rate, ...)
init_seg = muxer.build_init_segment()
yield init_seg
for frame in demuxed_frames:
muxer.add_frame(frame)
fragment = muxer.flush_fragment()
if fragment:
yield fragment
final = muxer.flush_final()
if final:
yield final
"""
def __init__(
self,
video_track: MKVTrack,
audio_sample_rate: int = 48000,
audio_channels: int = 2,
audio_specific_config: bytes = b"",
video_timescale: int = 90000,
audio_timescale: int = 48000,
duration_ms: float = 0.0,
fragment_duration_ms: float = 2000.0,
start_decode_time_ms: float = 0.0,
audio_frame_size: int = 0,
) -> None:
self._video_track = video_track
self._audio_sample_rate = audio_sample_rate
self._audio_channels = audio_channels
self._audio_specific_config = audio_specific_config
self._video_timescale = video_timescale
self._audio_timescale = audio_timescale
self._duration_ms = duration_ms
self._fragment_duration_ms = fragment_duration_ms
# Fragment accumulation
self._video_samples: list[FragmentSample] = []
self._audio_samples: list[FragmentSample] = []
self._sequence_number = 1
# Track decode times (in timescale ticks).
# When producing HLS segments, start_decode_time_ms places this
# segment's tfdt at the correct position in the global timeline.
self._video_decode_time = int(start_decode_time_ms * video_timescale / 1000.0)
# For audio, we must align the tfdt to exact frame boundaries to
# avoid DTS discontinuities at segment borders. AAC frames are
# exactly ``audio_frame_size`` samples each (typically 1024). If
# the caller provides audio_frame_size, compute the audio base
# time as the exact number of whole frames that fit before this
# segment's start time.
if audio_frame_size > 0 and start_decode_time_ms > 0:
total_samples_before = start_decode_time_ms / 1000.0 * audio_timescale
whole_frames_before = int(total_samples_before / audio_frame_size)
self._audio_decode_time = whole_frames_before * audio_frame_size
else:
self._audio_decode_time = int(start_decode_time_ms * audio_timescale / 1000.0)
# Track accumulated duration for fragment boundary detection
self._fragment_video_duration = 0 # video ticks accumulated in current fragment
self._fragment_threshold = int(fragment_duration_ms * video_timescale / 1000.0)
@property
def video_position_ticks(self) -> int:
"""Current video decode position (timescale ticks from stream start)."""
return self._video_decode_time + self._fragment_video_duration
def advance_video_decode_time(self, ticks: int) -> None:
"""Advance the video base decode time by *ticks*.
Can be used to adjust the segment's starting decode position when
the first emitted frame doesn't align with the tfdt origin.
"""
self._video_decode_time += ticks
def build_init_segment(self) -> bytes:
"""Build and return the fMP4 init segment (ftyp + moov)."""
return build_fmp4_init_segment(
video_track=self._video_track,
audio_sample_rate=self._audio_sample_rate,
audio_channels=self._audio_channels,
audio_specific_config=self._audio_specific_config,
video_timescale=self._video_timescale,
audio_timescale=self._audio_timescale,
duration_ms=self._duration_ms,
)
    def update_audio_specific_config(self, asc: bytes) -> None:
        """Replace the stored AAC AudioSpecificConfig.

        Call before :meth:`build_init_segment` if possible, since the ASC is
        embedded in the init segment's sample description.
        """
        self._audio_specific_config = asc
def add_video_sample(
self,
data: bytes,
duration_ticks: int,
is_keyframe: bool,
pts_ticks: int | None = None,
) -> None:
"""
Add a video sample to the current fragment.
Args:
data: Raw video NALUs.
duration_ticks: Duration in video timescale ticks.
is_keyframe: Whether this is an IDR/sync sample.
pts_ticks: Presentation timestamp in video timescale ticks.
Used to compute composition_time_offset for B-frame
reordering. If None, assumes PTS == DTS (no B-frames).
"""
# Compute composition_time_offset = PTS - DTS
# DTS is the running decode time for this fragment
cts_offset = 0
if pts_ticks is not None:
dts = self._video_decode_time + self._fragment_video_duration
cts_offset = pts_ticks - dts
self._video_samples.append(
FragmentSample(
data=data,
duration=duration_ticks,
is_sync=is_keyframe,
composition_offset=cts_offset,
)
)
self._fragment_video_duration += duration_ticks
def add_audio_sample(self, data: bytes, duration_ticks: int) -> None:
"""Add an audio sample to the current fragment."""
self._audio_samples.append(
FragmentSample(
data=data,
duration=duration_ticks,
is_sync=True,
)
)
def should_flush(self) -> bool:
"""Check if the current fragment has enough data to emit."""
# Flush on keyframe boundaries after accumulating enough duration
if self._fragment_video_duration < self._fragment_threshold:
return False
# Only flush at a keyframe boundary (if there's a pending keyframe)
if len(self._video_samples) > 1 and self._video_samples[-1].is_sync:
return True
return False
def flush_fragment(self, force: bool = False) -> bytes | None:
"""
Flush the current fragment if ready.
Args:
force: Force flush even if fragment duration threshold isn't reached.
Returns:
Fragment bytes (moof+mdat for video + moof+mdat for audio) or None.
"""
if not force and not self.should_flush():
return None
if not self._video_samples and not self._audio_samples:
return None
result = bytearray()
# When flushing at a keyframe, the last sample (the new keyframe)
# belongs to the NEXT fragment. Split there.
if not force and len(self._video_samples) > 1 and self._video_samples[-1].is_sync:
video_to_emit = self._video_samples[:-1]
video_remaining = [self._video_samples[-1]]
else:
video_to_emit = self._video_samples
video_remaining = []
# Emit video fragment
if video_to_emit:
frag = build_fmp4_fragment(
sequence_number=self._sequence_number,
track_id=1,
base_decode_time=self._video_decode_time,
samples=video_to_emit,
)
result.extend(frag)
self._sequence_number += 1
emitted_duration = sum(s.duration for s in video_to_emit)
self._video_decode_time += emitted_duration
# Emit audio fragment (matching time range)
if self._audio_samples:
frag = build_fmp4_fragment(
sequence_number=self._sequence_number,
track_id=2,
base_decode_time=self._audio_decode_time,
samples=self._audio_samples,
)
result.extend(frag)
self._sequence_number += 1
emitted_audio_duration = sum(s.duration for s in self._audio_samples)
self._audio_decode_time += emitted_audio_duration
self._audio_samples = []
# Reset for next fragment
self._video_samples = video_remaining
self._fragment_video_duration = sum(s.duration for s in video_remaining)
return bytes(result) if result else None
    def flush_final(self) -> bytes | None:
        """Emit all remaining buffered samples as the final fragment.

        Returns:
            Fragment bytes, or None when nothing is buffered.
        """
        return self.flush_fragment(force=True)