""" Pure Python MP4 box builder for both standard and fragmented MP4. Supports two modes: 1. Standard MP4 (moov-first): For progressive download with HTTP Range seeking. File layout: ftyp | moov (full sample tables) | mdat 2. Fragmented MP4 (fMP4): For on-the-fly streaming via StreamingResponse. Init segment: ftyp | moov (empty_moov with mvex) Media segments: moof (tfhd + tfdt + trun) | mdat The fMP4 mode is used for the transcode pipeline where MKV frames are demuxed, audio is transcoded, and fMP4 fragments are streamed out immediately without buffering the entire file. """ import logging import struct from dataclasses import dataclass, field from mediaflow_proxy.remuxer.ebml_parser import MKVTrack, CODEC_ID_H264, CODEC_ID_H265 logger = logging.getLogger(__name__) # ============================================================================= # Sample metadata # ============================================================================= @dataclass class SampleEntry: """Metadata for a single sample (frame) in the MP4 file.""" size: int # Sample size in bytes duration: int # Duration in track timescale ticks is_sync: bool # True for keyframes (video) or all audio samples composition_offset: int = 0 # CTS offset (for B-frames) @dataclass class TrackSamples: """Collected sample metadata for one track during muxing.""" samples: list[SampleEntry] = field(default_factory=list) chunk_offsets: list[int] = field(default_factory=list) # Absolute byte offset of each chunk in mdat total_size: int = 0 # Total bytes of all samples total_duration: int = 0 # Total duration in timescale ticks def add(self, sample: SampleEntry) -> None: self.samples.append(sample) self.total_size += sample.size self.total_duration += sample.duration # ============================================================================= # Box building primitives # ============================================================================= def build_box(box_type: bytes, payload: bytes) -> bytes: """Build 
a standard MP4 box: [4-byte size][4-byte type][payload].""" size = 8 + len(payload) return struct.pack(">I", size) + box_type + payload def build_full_box(box_type: bytes, version: int, flags: int, payload: bytes) -> bytes: """Build a full box with version and flags.""" inner = struct.pack(">I", (version << 24) | (flags & 0xFFFFFF)) + payload return build_box(box_type, inner) def build_box_header_large(box_type: bytes, total_size: int) -> bytes: """Build a box header for large boxes using 64-bit extended size.""" # size=1 signals extended size; actual size follows as uint64 return struct.pack(">I", 1) + box_type + struct.pack(">Q", total_size) # ============================================================================= # ftyp box # ============================================================================= def build_ftyp() -> bytes: """Build the File Type box for isom/iso2/mp41 compatible MP4.""" payload = b"isom" # major brand payload += struct.pack(">I", 0x200) # minor version payload += b"isom" + b"iso2" + b"mp41" # compatible brands return build_box(b"ftyp", payload) # ============================================================================= # moov box and children # ============================================================================= def build_mvhd(timescale: int, duration: int) -> bytes: """Build Movie Header box (mvhd), version 0.""" payload = bytearray() payload.extend(struct.pack(">I", 0)) # creation_time payload.extend(struct.pack(">I", 0)) # modification_time payload.extend(struct.pack(">I", timescale)) payload.extend(struct.pack(">I", duration)) payload.extend(struct.pack(">I", 0x00010000)) # rate = 1.0 payload.extend(struct.pack(">H", 0x0100)) # volume = 1.0 payload.extend(b"\x00" * 10) # reserved # Unity matrix (3x3, each 4 bytes, 9 values = 36 bytes) payload.extend(struct.pack(">9I", 0x00010000, 0, 0, 0, 0x00010000, 0, 0, 0, 0x40000000)) payload.extend(b"\x00" * 24) # pre_defined payload.extend(struct.pack(">I", 3)) # next_track_ID 
(1=video, 2=audio, next=3) return build_full_box(b"mvhd", 0, 0, bytes(payload)) def build_tkhd(track_id: int, duration: int, width: int = 0, height: int = 0, is_audio: bool = False) -> bytes: """Build Track Header box (tkhd), version 0.""" flags = 0x000003 # track_enabled | track_in_movie payload = bytearray() payload.extend(struct.pack(">I", 0)) # creation_time payload.extend(struct.pack(">I", 0)) # modification_time payload.extend(struct.pack(">I", track_id)) payload.extend(b"\x00" * 4) # reserved payload.extend(struct.pack(">I", duration)) payload.extend(b"\x00" * 8) # reserved payload.extend(struct.pack(">H", 0)) # layer payload.extend(struct.pack(">H", 0 if not is_audio else 1)) # alternate_group payload.extend(struct.pack(">H", 0x0100 if is_audio else 0)) # volume payload.extend(b"\x00" * 2) # reserved # Unity matrix payload.extend(struct.pack(">9I", 0x00010000, 0, 0, 0, 0x00010000, 0, 0, 0, 0x40000000)) # Width and height as 16.16 fixed-point payload.extend(struct.pack(">I", width << 16)) payload.extend(struct.pack(">I", height << 16)) return build_full_box(b"tkhd", 0, flags, bytes(payload)) def build_mdhd(timescale: int, duration: int) -> bytes: """Build Media Header box (mdhd), version 0.""" payload = bytearray() payload.extend(struct.pack(">I", 0)) # creation_time payload.extend(struct.pack(">I", 0)) # modification_time payload.extend(struct.pack(">I", timescale)) payload.extend(struct.pack(">I", duration)) payload.extend(struct.pack(">H", 0x55C4)) # language: 'und' payload.extend(struct.pack(">H", 0)) # pre_defined return build_full_box(b"mdhd", 0, 0, bytes(payload)) def build_hdlr(handler_type: bytes, name: str) -> bytes: """Build Handler Reference box (hdlr).""" payload = bytearray() payload.extend(b"\x00" * 4) # pre_defined payload.extend(handler_type) # handler_type (4 bytes) payload.extend(b"\x00" * 12) # reserved payload.extend(name.encode("utf-8") + b"\x00") return build_full_box(b"hdlr", 0, 0, bytes(payload)) def build_vmhd() -> bytes: """Build 
Video Media Header box (vmhd).""" payload = struct.pack(">H", 0) # graphicsmode payload += struct.pack(">3H", 0, 0, 0) # opcolor return build_full_box(b"vmhd", 0, 1, payload) # flags=1 def build_smhd() -> bytes: """Build Sound Media Header box (smhd).""" payload = struct.pack(">H", 0) # balance payload += b"\x00\x00" # reserved return build_full_box(b"smhd", 0, 0, payload) def build_dref() -> bytes: """Build Data Reference box (dref) with a self-contained URL entry.""" url_box = build_full_box(b"url ", 0, 1, b"") # flags=1 = self-contained payload = struct.pack(">I", 1) + url_box # entry_count=1 return build_full_box(b"dref", 0, 0, payload) def build_dinf() -> bytes: """Build Data Information box (dinf).""" return build_box(b"dinf", build_dref()) # ============================================================================= # Sample table boxes (stbl) # ============================================================================= def build_stsd_video(track: MKVTrack) -> bytes: """Build Sample Description box (stsd) for a video track.""" # Build the codec-specific sample entry if track.codec_id == CODEC_ID_H264: entry = _build_avc1_entry(track) elif track.codec_id == CODEC_ID_H265: entry = _build_hvc1_entry(track) else: raise ValueError(f"Unsupported video codec: {track.codec_id}") payload = struct.pack(">I", 1) + entry # entry_count=1 return build_full_box(b"stsd", 0, 0, payload) def _build_colr_nclx( colour_primaries: int = 1, transfer_characteristics: int = 1, matrix_coefficients: int = 1, full_range: bool = False, ) -> bytes: """ Build a colr box with nclx (video colour) information. Defaults to BT.709 (the standard for HD content), matching ffmpeg's default behaviour for fMP4 output. 
""" payload = b"nclx" payload += struct.pack(">HHH", colour_primaries, transfer_characteristics, matrix_coefficients) payload += struct.pack("B", 0x80 if full_range else 0x00) return build_box(b"colr", payload) def _build_pasp(h_spacing: int = 1, v_spacing: int = 1) -> bytes: """ Build a pasp (Pixel Aspect Ratio) box. Default 1:1 (square pixels), which is the norm for HD content. """ return build_box(b"pasp", struct.pack(">II", h_spacing, v_spacing)) def _build_avc1_entry(track: MKVTrack) -> bytes: """Build an avc3 VisualSampleEntry. Uses ``avc3`` instead of ``avc1`` to allow in-band SPS/PPS parameter set updates in sample data. Many MKV sources embed mid-stream PPS changes in the bitstream; ``avc3`` signals to the player that these may appear in any sample, avoiding "non-existing PPS" decode errors. """ payload = bytearray() payload.extend(b"\x00" * 6) # reserved payload.extend(struct.pack(">H", 1)) # data_reference_index payload.extend(b"\x00" * 16) # pre_defined + reserved payload.extend(struct.pack(">H", track.pixel_width)) payload.extend(struct.pack(">H", track.pixel_height)) payload.extend(struct.pack(">I", 0x00480000)) # horizresolution 72 dpi payload.extend(struct.pack(">I", 0x00480000)) # vertresolution 72 dpi payload.extend(b"\x00" * 4) # reserved payload.extend(struct.pack(">H", 1)) # frame_count payload.extend(b"\x00" * 32) # compressorname payload.extend(struct.pack(">H", 0x0018)) # depth = 24 payload.extend(struct.pack(">h", -1)) # pre_defined # avcC box from CodecPrivate if track.codec_private: avcc = build_box(b"avcC", track.codec_private) payload.extend(avcc) # colr box -- nclx colour information (BT.709) payload.extend(_build_colr_nclx()) # pasp box -- pixel aspect ratio (1:1) payload.extend(_build_pasp()) return build_box(b"avc3", bytes(payload)) def _build_hvc1_entry(track: MKVTrack) -> bytes: """Build an hvc1 VisualSampleEntry.""" payload = bytearray() payload.extend(b"\x00" * 6) # reserved payload.extend(struct.pack(">H", 1)) # 
data_reference_index payload.extend(b"\x00" * 16) # pre_defined + reserved payload.extend(struct.pack(">H", track.pixel_width)) payload.extend(struct.pack(">H", track.pixel_height)) payload.extend(struct.pack(">I", 0x00480000)) # horizresolution payload.extend(struct.pack(">I", 0x00480000)) # vertresolution payload.extend(b"\x00" * 4) # reserved payload.extend(struct.pack(">H", 1)) # frame_count payload.extend(b"\x00" * 32) # compressorname payload.extend(struct.pack(">H", 0x0018)) # depth payload.extend(struct.pack(">h", -1)) # pre_defined # hvcC box from CodecPrivate if track.codec_private: hvcc = build_box(b"hvcC", track.codec_private) payload.extend(hvcc) # colr box -- nclx colour information (BT.709) payload.extend(_build_colr_nclx()) # pasp box -- pixel aspect ratio (1:1) payload.extend(_build_pasp()) return build_box(b"hvc1", bytes(payload)) def build_stsd_audio(sample_rate: int, channels: int, audio_specific_config: bytes) -> bytes: """Build Sample Description box (stsd) for an AAC audio track.""" entry = _build_mp4a_entry(sample_rate, channels, audio_specific_config) payload = struct.pack(">I", 1) + entry # entry_count=1 return build_full_box(b"stsd", 0, 0, payload) def _build_mp4a_entry(sample_rate: int, channels: int, asc: bytes) -> bytes: """Build an mp4a AudioSampleEntry with esds box.""" payload = bytearray() payload.extend(b"\x00" * 6) # reserved payload.extend(struct.pack(">H", 1)) # data_reference_index payload.extend(b"\x00" * 8) # reserved payload.extend(struct.pack(">H", channels)) payload.extend(struct.pack(">H", 16)) # sample_size (16-bit) payload.extend(b"\x00" * 4) # pre_defined + reserved payload.extend(struct.pack(">I", sample_rate << 16)) # sample_rate 16.16 # esds box esds = _build_esds(sample_rate, channels, asc) payload.extend(esds) return build_box(b"mp4a", bytes(payload)) def _build_esds(sample_rate: int, channels: int, asc: bytes) -> bytes: """Build an Elementary Stream Descriptor box (esds) for AAC.""" # ES_Descriptor es_desc = 
bytearray() es_desc.extend(struct.pack(">H", 1)) # ES_ID es_desc.append(0x00) # stream priority # DecoderConfigDescriptor dec_config = bytearray() dec_config.append(0x40) # objectTypeIndication = Audio ISO/IEC 14496-3 (AAC) dec_config.append(0x15) # streamType=5 (audio) upstream=0 reserved=1 dec_config.extend(b"\x00\x00\x00") # bufferSizeDB (3 bytes) dec_config.extend(struct.pack(">I", 192000)) # maxBitrate dec_config.extend(struct.pack(">I", 192000)) # avgBitrate # DecoderSpecificInfo (AudioSpecificConfig) dec_specific = _build_descriptor(0x05, asc) dec_config.extend(dec_specific) dec_config_desc = _build_descriptor(0x04, bytes(dec_config)) es_desc.extend(dec_config_desc) # SLConfigDescriptor (predefined=2 for MP4) sl_config = _build_descriptor(0x06, b"\x02") es_desc.extend(sl_config) es_descriptor = _build_descriptor(0x03, bytes(es_desc)) payload = es_descriptor return build_full_box(b"esds", 0, 0, payload) def _build_descriptor(tag: int, data: bytes) -> bytes: """Build an ISO 14496-1 descriptor with expandable length encoding.""" length = len(data) result = bytearray() result.append(tag) # Expandable length: use 4 bytes (most compatible) result.append(0x80 | ((length >> 21) & 0x7F)) result.append(0x80 | ((length >> 14) & 0x7F)) result.append(0x80 | ((length >> 7) & 0x7F)) result.append(length & 0x7F) result.extend(data) return bytes(result) def build_stts(samples: list[SampleEntry]) -> bytes: """ Build Time-to-Sample box (stts) with run-length encoding. Groups consecutive samples with the same duration. 
""" if not samples: return build_full_box(b"stts", 0, 0, struct.pack(">I", 0)) # Run-length encode durations entries = [] current_duration = samples[0].duration current_count = 1 for s in samples[1:]: if s.duration == current_duration: current_count += 1 else: entries.append((current_count, current_duration)) current_duration = s.duration current_count = 1 entries.append((current_count, current_duration)) payload = bytearray() payload.extend(struct.pack(">I", len(entries))) for count, delta in entries: payload.extend(struct.pack(">II", count, delta)) return build_full_box(b"stts", 0, 0, bytes(payload)) def build_stss(samples: list[SampleEntry]) -> bytes | None: """ Build Sync Sample box (stss) listing keyframe indices. Returns None if all samples are sync (audio tracks), as stss is only needed when not all samples are sync points. """ sync_indices = [i + 1 for i, s in enumerate(samples) if s.is_sync] # 1-based if len(sync_indices) == len(samples): return None # All samples are sync; omit stss payload = bytearray() payload.extend(struct.pack(">I", len(sync_indices))) for idx in sync_indices: payload.extend(struct.pack(">I", idx)) return build_full_box(b"stss", 0, 0, bytes(payload)) def build_ctts(samples: list[SampleEntry]) -> bytes | None: """ Build Composition Time-to-Sample box (ctts) for B-frame offsets. Returns None if no samples have composition offsets. 
""" has_offsets = any(s.composition_offset != 0 for s in samples) if not has_offsets: return None # Run-length encode offsets entries = [] current_offset = samples[0].composition_offset current_count = 1 for s in samples[1:]: if s.composition_offset == current_offset: current_count += 1 else: entries.append((current_count, current_offset)) current_offset = s.composition_offset current_count = 1 entries.append((current_count, current_offset)) payload = bytearray() payload.extend(struct.pack(">I", len(entries))) for count, offset in entries: payload.extend(struct.pack(">II", count, offset)) return build_full_box(b"ctts", 0, 0, bytes(payload)) def build_stsz(samples: list[SampleEntry]) -> bytes: """Build Sample Size box (stsz).""" payload = bytearray() # Check if all samples are the same size if samples: first_size = samples[0].size all_same = all(s.size == first_size for s in samples) else: all_same = True first_size = 0 if all_same and samples: payload.extend(struct.pack(">I", first_size)) # sample_size (uniform) payload.extend(struct.pack(">I", len(samples))) # sample_count else: payload.extend(struct.pack(">I", 0)) # sample_size = 0 (variable) payload.extend(struct.pack(">I", len(samples))) for s in samples: payload.extend(struct.pack(">I", s.size)) return build_full_box(b"stsz", 0, 0, bytes(payload)) def build_stsc(num_chunks: int) -> bytes: """ Build Sample-to-Chunk box (stsc). For simplicity, we use one sample per chunk (each sample gets its own chunk offset). This is slightly less compact but much simpler and fully correct. 
""" payload = bytearray() payload.extend(struct.pack(">I", 1)) # entry_count payload.extend(struct.pack(">III", 1, 1, 1)) # first_chunk=1, samples_per_chunk=1, desc_index=1 return build_full_box(b"stsc", 0, 0, bytes(payload)) def build_stco(offsets: list[int]) -> bytes: """Build Chunk Offset box (stco, 32-bit offsets).""" payload = bytearray() payload.extend(struct.pack(">I", len(offsets))) for off in offsets: payload.extend(struct.pack(">I", off)) return build_full_box(b"stco", 0, 0, bytes(payload)) def build_co64(offsets: list[int]) -> bytes: """Build Chunk Offset box (co64, 64-bit offsets) for large files.""" payload = bytearray() payload.extend(struct.pack(">I", len(offsets))) for off in offsets: payload.extend(struct.pack(">Q", off)) return build_full_box(b"co64", 0, 0, bytes(payload)) # ============================================================================= # Track building (assembles trak box hierarchy) # ============================================================================= def build_stbl(track_samples: TrackSamples, stsd: bytes) -> bytes: """Build the Sample Table box (stbl) for a track.""" children = bytearray() children.extend(stsd) children.extend(build_stts(track_samples.samples)) stss = build_stss(track_samples.samples) if stss is not None: children.extend(stss) ctts = build_ctts(track_samples.samples) if ctts is not None: children.extend(ctts) children.extend(build_stsz(track_samples.samples)) children.extend(build_stsc(len(track_samples.chunk_offsets))) # Use co64 if any offset exceeds 32-bit range needs_64 = any(off > 0xFFFFFFFF for off in track_samples.chunk_offsets) if needs_64: children.extend(build_co64(track_samples.chunk_offsets)) else: children.extend(build_stco(track_samples.chunk_offsets)) return build_box(b"stbl", bytes(children)) def build_minf(is_audio: bool, stbl: bytes) -> bytes: """Build Media Information box (minf).""" children = bytearray() if is_audio: children.extend(build_smhd()) else: children.extend(build_vmhd()) 
children.extend(build_dinf()) children.extend(stbl) return build_box(b"minf", bytes(children)) def build_mdia(timescale: int, duration: int, handler_type: bytes, handler_name: str, minf: bytes) -> bytes: """Build Media box (mdia).""" children = bytearray() children.extend(build_mdhd(timescale, duration)) children.extend(build_hdlr(handler_type, handler_name)) children.extend(minf) return build_box(b"mdia", bytes(children)) def build_video_trak( track: MKVTrack, track_id: int, timescale: int, track_samples: TrackSamples, movie_timescale: int, ) -> bytes: """Build a complete video trak box.""" duration_in_track = track_samples.total_duration # Convert track duration to movie timescale for tkhd if timescale > 0: duration_in_movie = int(duration_in_track * movie_timescale / timescale) else: duration_in_movie = 0 tkhd = build_tkhd(track_id, duration_in_movie, width=track.pixel_width, height=track.pixel_height) stsd = build_stsd_video(track) stbl = build_stbl(track_samples, stsd) minf = build_minf(is_audio=False, stbl=stbl) mdia = build_mdia(timescale, duration_in_track, b"vide", "VideoHandler", minf) return build_box(b"trak", tkhd + mdia) def build_audio_trak( track_id: int, timescale: int, track_samples: TrackSamples, movie_timescale: int, sample_rate: int, channels: int, audio_specific_config: bytes, ) -> bytes: """Build a complete audio trak box.""" duration_in_track = track_samples.total_duration if timescale > 0: duration_in_movie = int(duration_in_track * movie_timescale / timescale) else: duration_in_movie = 0 tkhd = build_tkhd(track_id, duration_in_movie, is_audio=True) stsd = build_stsd_audio(sample_rate, channels, audio_specific_config) stbl = build_stbl(track_samples, stsd) minf = build_minf(is_audio=True, stbl=stbl) mdia = build_mdia(timescale, duration_in_track, b"soun", "SoundHandler", minf) return build_box(b"trak", tkhd + mdia) # ============================================================================= # Complete moov builder # 
# =============================================================================


def build_moov(
    video_track: MKVTrack,
    audio_track_info: dict,
    video_samples: TrackSamples,
    audio_samples: TrackSamples,
    mdat_offset: int,
    video_timescale: int = 90000,
    audio_timescale: int = 48000,
    movie_timescale: int = 1000,
) -> bytes:
    """
    Assemble the moov box: mvhd followed by the video trak (ID 1) and the
    audio trak (ID 2).

    Args:
        video_track: MKVTrack with video codec info.
        audio_track_info: Dict with keys: sample_rate, channels, audio_specific_config.
        video_samples: Collected video sample metadata.
        audio_samples: Collected audio sample metadata.
        mdat_offset: Byte offset where mdat data starts (after ftyp + moov + mdat header).
            Not read by this function; chunk offsets come from the sample collections.
        video_timescale: Video track timescale (default 90000 for 90kHz).
        audio_timescale: Audio track timescale (typically sample_rate).
        movie_timescale: Movie header timescale (default 1000 = ms).

    Returns:
        Complete moov box bytes.
    """

    def _in_movie_ticks(collected: TrackSamples, track_timescale: int) -> int:
        # A track with no usable timescale or no duration contributes nothing.
        if track_timescale <= 0 or collected.total_duration <= 0:
            return 0
        return int(collected.total_duration * movie_timescale / track_timescale)

    # The movie lasts as long as its longest track.
    movie_duration = max(
        _in_movie_ticks(video_samples, video_timescale),
        _in_movie_ticks(audio_samples, audio_timescale),
    )

    mvhd = build_mvhd(movie_timescale, movie_duration)
    video_trak = build_video_trak(
        video_track,
        track_id=1,
        timescale=video_timescale,
        track_samples=video_samples,
        movie_timescale=movie_timescale,
    )
    audio_trak = build_audio_trak(
        track_id=2,
        timescale=audio_timescale,
        track_samples=audio_samples,
        movie_timescale=movie_timescale,
        sample_rate=audio_track_info["sample_rate"],
        channels=audio_track_info["channels"],
        audio_specific_config=audio_track_info["audio_specific_config"],
    )

    return build_box(b"moov", mvhd + video_trak + audio_trak)
# =============================================================================
# mdat box header
# =============================================================================


def build_mdat_header(data_size: int) -> bytes:
    """
    Build the mdat box header.

    Uses the 8-byte form when header + data fits in 32 bits, otherwise the
    16-byte extended form (size field = 1 followed by a 64-bit total size).
    """
    total = 8 + data_size  # header(8) + data
    if total <= 0xFFFFFFFF:
        return struct.pack(">I", total) + b"mdat"

    # Extended size: size field = 1, then 8-byte actual size
    total_ext = 16 + data_size  # header(16) + data
    return struct.pack(">I", 1) + b"mdat" + struct.pack(">Q", total_ext)


# =============================================================================
# MP4 Builder (high-level orchestrator)
# =============================================================================


class MP4Builder:
    """
    High-level MP4 file builder.

    Collects video and audio samples during a transcode pass, then
    produces a complete moov-first MP4 file.

    Usage:
        builder = MP4Builder(video_track, audio_sample_rate=48000, audio_channels=2, audio_specific_config=asc)

        for frame in video_frames:
            builder.add_video_sample(frame.data, frame.duration_ticks, frame.is_keyframe)

        for frame in audio_frames:
            builder.add_audio_sample(frame.data, frame.duration_ticks)

        moov_bytes, mdat_header, sample_data_list = builder.finalize()
    """

    # Upper bound on moov layout passes in finalize(). Each of the two tracks
    # can switch from stco to co64 at most once, so three passes always
    # suffice; the extra headroom only guards against an unexpected loop.
    _MAX_LAYOUT_PASSES = 8

    def __init__(
        self,
        video_track: MKVTrack,
        audio_sample_rate: int = 48000,
        audio_channels: int = 2,
        audio_specific_config: bytes = b"",
        video_timescale: int = 90000,
        audio_timescale: int = 48000,
    ) -> None:
        self._video_track = video_track
        self._audio_info = {
            "sample_rate": audio_sample_rate,
            "channels": audio_channels,
            "audio_specific_config": audio_specific_config,
        }
        self._video_timescale = video_timescale
        self._audio_timescale = audio_timescale

        self._video_samples = TrackSamples()
        self._audio_samples = TrackSamples()
        self._mdat_chunks: list[bytes] = []  # Interleaved sample data
        self._mdat_size: int = 0
        self._sample_order: list[str] = []  # "v" or "a" for each mdat chunk

    def add_video_sample(
        self, data: bytes, duration_ticks: int, is_keyframe: bool, composition_offset: int = 0
    ) -> None:
        """
        Add a video sample (H.264/H.265 NALUs) to the builder.

        Args:
            data: Sample payload appended to the mdat.
            duration_ticks: Sample duration in video timescale ticks.
            is_keyframe: Whether the sample is a sync sample.
            composition_offset: CTS offset in video timescale ticks (default 0).
        """
        entry = SampleEntry(
            size=len(data),
            duration=duration_ticks,
            is_sync=is_keyframe,
            composition_offset=composition_offset,
        )
        self._video_samples.add(entry)
        self._mdat_chunks.append(data)
        self._mdat_size += len(data)
        self._sample_order.append("v")

    def add_audio_sample(self, data: bytes, duration_ticks: int) -> None:
        """Add an audio sample (AAC frame) to the builder; every audio sample is a sync sample."""
        entry = SampleEntry(size=len(data), duration=duration_ticks, is_sync=True)
        self._audio_samples.add(entry)
        self._mdat_chunks.append(data)
        self._mdat_size += len(data)
        self._sample_order.append("a")

    @property
    def video_sample_count(self) -> int:
        """Number of video samples added so far."""
        return len(self._video_samples.samples)

    @property
    def audio_sample_count(self) -> int:
        """Number of audio samples added so far."""
        return len(self._audio_samples.samples)

    @property
    def mdat_size(self) -> int:
        """Total payload bytes (video + audio) collected for the mdat."""
        return self._mdat_size

    def finalize(self) -> tuple[bytes, bytes, list[bytes]]:
        """
        Build the final MP4 file components.

        The moov holds absolute chunk offsets (stco/co64) that depend on the
        moov's own size, so the layout is solved iteratively: build the moov,
        derive where the mdat payload starts, rebuild with those offsets, and
        repeat until the moov size no longer changes. The size only changes
        when a track's offsets cross the 32-bit boundary and its table
        switches from stco to co64.

        Returns:
            (ftyp_moov_bytes, mdat_header_bytes, mdat_chunk_list)
            Concatenating these gives the complete MP4 file.

        Raises:
            RuntimeError: If the layout fails to stabilise (not expected).
        """
        ftyp = build_ftyp()
        mdat_hdr = build_mdat_header(self._mdat_size)
        fixed_size = len(ftyp) + len(mdat_hdr)

        # First pass uses a placeholder base of 0 purely to measure the moov.
        moov = self._build_moov_at(0)
        for _ in range(self._MAX_LAYOUT_PASSES):
            # mdat payload starts after ftyp + moov + mdat header
            mdat_data_start = fixed_size + len(moov)
            candidate = self._build_moov_at(mdat_data_start)
            stable = len(candidate) == len(moov)
            moov = candidate
            if stable:
                break
        else:
            raise RuntimeError("moov size did not stabilise while computing chunk offsets")

        header_bytes = ftyp + moov

        logger.info(
            "[mp4_muxer] Finalized: ftyp=%d moov=%d mdat=%d (header=%d) video=%d samples audio=%d samples",
            len(ftyp),
            len(moov),
            self._mdat_size,
            len(mdat_hdr),
            len(self._video_samples.samples),
            len(self._audio_samples.samples),
        )

        return header_bytes, mdat_hdr, self._mdat_chunks

    def _build_moov_at(self, mdat_data_start: int) -> bytes:
        """Recompute chunk offsets for *mdat_data_start* and build the moov from them."""
        self._compute_chunk_offsets(mdat_data_start)
        return build_moov(
            self._video_track,
            self._audio_info,
            self._video_samples,
            self._audio_samples,
            mdat_offset=mdat_data_start,
            video_timescale=self._video_timescale,
            audio_timescale=self._audio_timescale,
        )

    def _compute_chunk_offsets(self, mdat_data_start: int) -> None:
        """
        Compute absolute byte offsets for each sample in the mdat.

        Chunks are laid out in the order they were added; ``_sample_order``
        records, per chunk, whether it belongs to the video ("v") or audio
        ("a") track, so each offset is routed to the matching track.
        """
        offsets_by_kind: dict[str, list[int]] = {"v": [], "a": []}
        position = mdat_data_start
        for kind, chunk in zip(self._sample_order, self._mdat_chunks):
            offsets_by_kind[kind].append(position)
            position += len(chunk)

        self._video_samples.chunk_offsets = offsets_by_kind["v"]
        self._audio_samples.chunk_offsets = offsets_by_kind["a"]

    def _is_video_sample(self, vi: int, ai: int) -> bool:
        """
        Determine if the next mdat chunk at position (vi+ai) is a video sample.

        Looks the position up in ``_sample_order``; past the end of that
        list, falls back to whether any video samples remain.
        """
        idx = vi + ai
        if idx < len(self._sample_order):
            return self._sample_order[idx] == "v"
        return vi < len(self._video_samples.samples)

    def update_audio_specific_config(self, asc: bytes) -> None:
        """Update the AudioSpecificConfig (e.g., after first encode)."""
        self._audio_info["audio_specific_config"] = asc


# =============================================================================
# Fragmented MP4 (fMP4) builder for streaming output
# =============================================================================
#
# fMP4 layout:
#   Init segment:   ftyp + moov (mvhd + mvex/trex + trak[video] + trak[audio])
#   Media segments: moof (mfhd + traf[tfhd + tfdt + trun]) + mdat
#
# The moov in fMP4 has empty sample tables (stts/stsz/stsc/stco with 0 entries)
# and an mvex box with trex entries signaling fragmented mode.
# =============================================================================


def _build_empty_stbl(stsd: bytes) -> bytes:
    """Build an stbl with empty sample tables (for fMP4 init segment).

    Args:
        stsd: Serialized stsd box to place first inside the stbl.

    Returns:
        The stbl box: ``stsd`` followed by zero-entry stts, stsc, stsz and
        stco boxes (in a fragmented file the sample tables live in the
        fragments, not in moov).
    """
    children = bytearray(stsd)
    children.extend(build_full_box(b"stts", 0, 0, struct.pack(">I", 0)))
    children.extend(build_full_box(b"stsc", 0, 0, struct.pack(">I", 0)))
    children.extend(build_full_box(b"stsz", 0, 0, struct.pack(">II", 0, 0)))
    children.extend(build_full_box(b"stco", 0, 0, struct.pack(">I", 0)))
    return build_box(b"stbl", bytes(children))


def build_fmp4_init_segment(
    video_track: MKVTrack,
    audio_sample_rate: int,
    audio_channels: int,
    audio_specific_config: bytes,
    video_timescale: int = 90000,
    audio_timescale: int = 48000,
    duration_ms: float = 0.0,
) -> bytes:
    """
    Build an fMP4 initialization segment (ftyp + moov with empty_moov).

    The moov contains track descriptions (codec config) and mvex/trex
    entries signaling fragmented mode. No sample data.

    Args:
        video_track: MKVTrack with video codec info.
        audio_sample_rate: Output audio sample rate.
        audio_channels: Output audio channel count.
        audio_specific_config: AAC AudioSpecificConfig bytes.
        video_timescale: Video track timescale (default 90000).
        audio_timescale: Audio track timescale (default 48000; it is NOT
            derived from ``audio_sample_rate``, pass it explicitly if the
            two should match).
        duration_ms: Total duration in ms (0 or negative = unknown/live).

    Returns:
        Complete init segment bytes (ftyp + moov).
    """
    ftyp = _build_fmp4_ftyp()

    has_duration = duration_ms > 0
    movie_timescale = 1000  # ms
    # tkhd durations are expressed in the movie timescale, so the same value
    # is used for mvhd and for both tkhd boxes.
    movie_duration = int(duration_ms) if has_duration else 0

    mvhd = build_mvhd(movie_timescale, movie_duration)

    # Video trak (with empty stbl); track_ID 1
    video_duration = int(duration_ms * video_timescale / 1000.0) if has_duration else 0
    video_tkhd = build_tkhd(
        1,
        movie_duration,
        width=video_track.pixel_width,
        height=video_track.pixel_height,
    )
    video_stbl = _build_empty_stbl(build_stsd_video(video_track))
    video_minf = build_minf(is_audio=False, stbl=video_stbl)
    video_mdia = build_mdia(video_timescale, video_duration, b"vide", "VideoHandler", video_minf)
    video_trak = build_box(b"trak", video_tkhd + video_mdia)

    # Audio trak (with empty stbl); track_ID 2
    audio_duration = int(duration_ms * audio_timescale / 1000.0) if has_duration else 0
    audio_tkhd = build_tkhd(2, movie_duration, is_audio=True)
    audio_stbl = _build_empty_stbl(
        build_stsd_audio(audio_sample_rate, audio_channels, audio_specific_config)
    )
    audio_minf = build_minf(is_audio=True, stbl=audio_stbl)
    audio_mdia = build_mdia(audio_timescale, audio_duration, b"soun", "SoundHandler", audio_minf)
    audio_trak = build_box(b"trak", audio_tkhd + audio_mdia)

    # mvex (Movie Extends) - signals this is a fragmented MP4.
    # One trex (Track Extends) per track. default_sample_flags is left at
    # 0x00000000 (same as ffmpeg), deferring all sample flag decisions to the
    # per-fragment tfhd/trun. This avoids global defaults that could confuse
    # strict browser parsers.
    trex_boxes = b"".join(
        build_full_box(
            b"trex",
            0,
            0,
            struct.pack(
                ">IIIII",
                trex_track_id,  # track_ID
                1,  # default_sample_description_index
                0,  # default_sample_duration
                0,  # default_sample_size
                0x00000000,  # default_sample_flags (deferred to tfhd per fragment)
            ),
        )
        for trex_track_id in (1, 2)
    )
    mvex = build_box(b"mvex", trex_boxes)

    moov = build_box(b"moov", mvhd + video_trak + audio_trak + mvex)
    return ftyp + moov


def _build_fmp4_ftyp() -> bytes:
    """Build ftyp box for fragmented MP4.

    Returns:
        ftyp box with major brand ``isom``, minor version 0x200 and the
        compatible brands ``isom``, ``iso6``, ``mp41``, ``msdh``, ``msix``.
    """
    major_brand = b"isom"
    minor_version = struct.pack(">I", 0x200)
    compatible_brands = b"isom" + b"iso6" + b"mp41" + b"msdh" + b"msix"
    return build_box(b"ftyp", major_brand + minor_version + compatible_brands)


@dataclass
class FragmentSample:
    """A single sample to be written into an fMP4 fragment."""

    data: bytes  # Raw sample payload, copied verbatim into mdat
    duration: int  # In track timescale
    is_sync: bool = False  # True for keyframes / independently decodable samples
    composition_offset: int = 0  # PTS - DTS in track timescale (may be negative)

    @property
    def size(self) -> int:
        """Sample size in bytes (length of ``data``)."""
        return len(self.data)


def build_fmp4_fragment(
    sequence_number: int,
    track_id: int,
    base_decode_time: int,
    samples: list[FragmentSample],
) -> bytes:
    """
    Build an fMP4 media segment (moof + mdat) for a single track.

    Sample flags are signalled as compactly as possible:

    * tfhd.default_sample_flags carries the per-track default (non-sync for
      track 1 / video, sync for every other track).
    * If every sample after the first matches that default, only
      trun.first_sample_flags is written (flag 0x000004).
    * Otherwise (e.g. a second keyframe inside a video fragment) the trun
      carries explicit per-sample flags (flag 0x000400), so that no keyframe
      is mislabelled as a non-sync sample.

    Args:
        sequence_number: Fragment sequence number (1-based, incrementing).
        track_id: Track ID (1=video, 2=audio).
        base_decode_time: Decode time of the first sample in track timescale.
        samples: List of samples for this fragment.

    Returns:
        Complete moof + mdat bytes, or ``b""`` when ``samples`` is empty.
    """
    if not samples:
        return b""

    sync_flags = 0x02000000  # sample_depends_on=2 (does not depend on others, i.e. sync)
    non_sync_flags = 0x01010000  # sample_depends_on=1, is_non_sync=1

    # Since trex.default_sample_flags is 0x00000000, the per-fragment default
    # is set in tfhd (matching ffmpeg behaviour).
    is_video = track_id == 1
    default_sample_flags = non_sync_flags if is_video else sync_flags

    flags_per_sample = [sync_flags if s.is_sync else non_sync_flags for s in samples]
    has_cts = any(s.composition_offset != 0 for s in samples)
    # first_sample_flags can only override sample 0; any later deviation from
    # the tfhd default needs explicit per-sample flags.
    explicit_flags = any(f != default_sample_flags for f in flags_per_sample[1:])

    # trun flags:
    #   0x000001 data_offset_present
    #   0x000004 first_sample_flags_present
    #   0x000100 sample_duration_present
    #   0x000200 sample_size_present
    #   0x000400 sample_flags_present
    #   0x000800 sample_composition_time_offsets_present
    trun_flags = 0x000001 | 0x000100 | 0x000200
    trun_flags |= 0x000400 if explicit_flags else 0x000004
    if has_cts:
        trun_flags |= 0x000800
    # Use version 1 when CTS offsets are present (supports signed offsets for B-frames)
    trun_version = 1 if has_cts else 0

    # Per-sample entries, in spec order: duration, size, [flags], [cts offset]
    entries = bytearray()
    for sample, flags in zip(samples, flags_per_sample):
        entries.extend(struct.pack(">II", sample.duration, sample.size))
        if explicit_flags:
            entries.extend(struct.pack(">I", flags))
        if has_cts:
            entries.extend(struct.pack(">i", sample.composition_offset))
    entries_bytes = bytes(entries)

    # tfdt (Track Fragment Decode Time) - version 1 for 64-bit time
    tfdt = build_full_box(b"tfdt", 1, 0, struct.pack(">Q", base_decode_time))

    # tfhd (Track Fragment Header)
    #   0x020000 = default_base_is_moof
    #   0x000020 = default_sample_flags_present
    tfhd = build_full_box(
        b"tfhd",
        0,
        0x020000 | 0x000020,
        struct.pack(">II", track_id, default_sample_flags),
    )

    # mfhd (Movie Fragment Header)
    mfhd = build_full_box(b"mfhd", 0, 0, struct.pack(">I", sequence_number))

    def assemble_moof(data_offset: int) -> bytes:
        """Build the complete moof with the given trun.data_offset."""
        trun_payload = struct.pack(">Ii", len(samples), data_offset)
        if not explicit_flags:
            trun_payload += struct.pack(">I", flags_per_sample[0])  # first_sample_flags
        trun = build_full_box(b"trun", trun_version, trun_flags, trun_payload + entries_bytes)
        traf = build_box(b"traf", tfhd + tfdt + trun)
        return build_box(b"moof", mfhd + traf)

    # data_offset is measured from the start of moof to the first mdat payload
    # byte, so the moof size must be known first. The field has a fixed width,
    # therefore a first pass with a placeholder yields the final size.
    moof_size = len(assemble_moof(0))
    moof = assemble_moof(moof_size + 8)  # 8 = mdat box header

    # Internal invariant: patching a fixed-width field must not resize moof.
    assert len(moof) == moof_size, "moof size changed after data_offset patch"

    mdat = build_box(b"mdat", b"".join(s.data for s in samples))
    return bytes(moof) + mdat


class FMP4StreamMuxer:
    """
    Streaming fMP4 muxer that produces fragments on-the-fly.

    Usage:
        muxer = FMP4StreamMuxer(video_track, audio_sample_rate, ...)
        init_seg = muxer.build_init_segment()
        yield init_seg

        for frame in demuxed_frames:
            if frame_is_video:
                muxer.add_video_sample(data, duration_ticks, is_keyframe, pts_ticks)
            else:
                muxer.add_audio_sample(data, duration_ticks)
            fragment = muxer.flush_fragment()
            if fragment:
                yield fragment

        final = muxer.flush_final()
        if final:
            yield final
    """

    def __init__(
        self,
        video_track: MKVTrack,
        audio_sample_rate: int = 48000,
        audio_channels: int = 2,
        audio_specific_config: bytes = b"",
        video_timescale: int = 90000,
        audio_timescale: int = 48000,
        duration_ms: float = 0.0,
        fragment_duration_ms: float = 2000.0,
        start_decode_time_ms: float = 0.0,
        audio_frame_size: int = 0,
    ) -> None:
        """
        Args:
            video_track: MKVTrack with video codec info (used for the init segment).
            audio_sample_rate: Output audio sample rate.
            audio_channels: Output audio channel count.
            audio_specific_config: AAC AudioSpecificConfig bytes.
            video_timescale: Video track timescale in ticks per second.
            audio_timescale: Audio track timescale in ticks per second.
            duration_ms: Total duration in ms written to the init segment.
            fragment_duration_ms: Minimum video duration accumulated before a
                non-forced flush is allowed.
            start_decode_time_ms: Position of the first sample on the global
                timeline; seeds both tracks' base decode time.
            audio_frame_size: Samples per audio frame. When > 0 (and the start
                time is > 0) the audio base decode time is rounded down to a
                whole number of frames.
        """
        self._video_track = video_track
        self._audio_sample_rate = audio_sample_rate
        self._audio_channels = audio_channels
        self._audio_specific_config = audio_specific_config
        self._video_timescale = video_timescale
        self._audio_timescale = audio_timescale
        self._duration_ms = duration_ms
        self._fragment_duration_ms = fragment_duration_ms

        # Fragment accumulation
        self._video_samples: list[FragmentSample] = []
        self._audio_samples: list[FragmentSample] = []
        self._sequence_number = 1

        # Track decode times (in timescale ticks).
        # When producing HLS segments, start_decode_time_ms places this
        # segment's tfdt at the correct position in the global timeline.
        self._video_decode_time = int(start_decode_time_ms * video_timescale / 1000.0)

        # For audio, the tfdt must be aligned to exact frame boundaries to
        # avoid DTS discontinuities at segment borders. AAC frames are exactly
        # ``audio_frame_size`` samples each (typically 1024). If the caller
        # provides audio_frame_size, the audio base time is the exact number
        # of whole frames that fit before this segment's start time.
        if audio_frame_size > 0 and start_decode_time_ms > 0:
            total_samples_before = start_decode_time_ms / 1000.0 * audio_timescale
            whole_frames_before = int(total_samples_before / audio_frame_size)
            self._audio_decode_time = whole_frames_before * audio_frame_size
        else:
            self._audio_decode_time = int(start_decode_time_ms * audio_timescale / 1000.0)

        # Track accumulated duration for fragment boundary detection
        self._fragment_video_duration = 0  # video ticks accumulated in current fragment
        self._fragment_threshold = int(fragment_duration_ms * video_timescale / 1000.0)

    @property
    def video_position_ticks(self) -> int:
        """Current video decode position (timescale ticks from stream start)."""
        return self._video_decode_time + self._fragment_video_duration

    def advance_video_decode_time(self, ticks: int) -> None:
        """Advance the video base decode time by *ticks*.

        Can be used to adjust the segment's starting decode position when
        the first emitted frame doesn't align with the tfdt origin.
        """
        self._video_decode_time += ticks

    def build_init_segment(self) -> bytes:
        """Build and return the fMP4 init segment (ftyp + moov)."""
        return build_fmp4_init_segment(
            video_track=self._video_track,
            audio_sample_rate=self._audio_sample_rate,
            audio_channels=self._audio_channels,
            audio_specific_config=self._audio_specific_config,
            video_timescale=self._video_timescale,
            audio_timescale=self._audio_timescale,
            duration_ms=self._duration_ms,
        )

    def update_audio_specific_config(self, asc: bytes) -> None:
        """Update the AAC AudioSpecificConfig (call before build_init_segment if possible)."""
        self._audio_specific_config = asc

    def add_video_sample(
        self,
        data: bytes,
        duration_ticks: int,
        is_keyframe: bool,
        pts_ticks: int | None = None,
    ) -> None:
        """
        Add a video sample to the current fragment.

        Args:
            data: Raw video NALUs.
            duration_ticks: Duration in video timescale ticks.
            is_keyframe: Whether this is an IDR/sync sample.
            pts_ticks: Presentation timestamp in video timescale ticks.
                Used to compute composition_time_offset for B-frame reordering.
                If None, assumes PTS == DTS (no B-frames).
        """
        # composition_time_offset = PTS - DTS, where DTS is the running decode
        # time (base decode time + duration already queued in this fragment).
        cts_offset = 0
        if pts_ticks is not None:
            cts_offset = pts_ticks - self.video_position_ticks

        self._video_samples.append(
            FragmentSample(
                data=data,
                duration=duration_ticks,
                is_sync=is_keyframe,
                composition_offset=cts_offset,
            )
        )
        self._fragment_video_duration += duration_ticks

    def add_audio_sample(self, data: bytes, duration_ticks: int) -> None:
        """Add an audio sample to the current fragment (always marked as sync)."""
        self._audio_samples.append(
            FragmentSample(
                data=data,
                duration=duration_ticks,
                is_sync=True,
            )
        )

    def should_flush(self) -> bool:
        """Check if the current fragment has enough data to emit.

        Returns:
            True when the queued video duration has reached the fragment
            threshold AND the most recently added video sample is a keyframe
            preceded by at least one other queued sample (so the fragment can
            be cut right before that keyframe).
        """
        if self._fragment_video_duration < self._fragment_threshold:
            return False
        return len(self._video_samples) > 1 and self._video_samples[-1].is_sync

    def flush_fragment(self, force: bool = False) -> bytes | None:
        """
        Flush the current fragment if ready.

        Args:
            force: Force flush even if fragment duration threshold isn't reached.

        Returns:
            Fragment bytes (moof+mdat for video + moof+mdat for audio) or None.
        """
        if not force and not self.should_flush():
            return None
        if not self._video_samples and not self._audio_samples:
            return None

        result = bytearray()

        # When flushing at a keyframe, the last sample (the new keyframe)
        # belongs to the NEXT fragment. Split there. A forced flush emits
        # everything that is queued.
        split_at_keyframe = (
            not force and len(self._video_samples) > 1 and self._video_samples[-1].is_sync
        )
        if split_at_keyframe:
            video_to_emit = self._video_samples[:-1]
            video_remaining = [self._video_samples[-1]]
        else:
            video_to_emit = self._video_samples
            video_remaining = []

        # Emit video fragment
        if video_to_emit:
            result.extend(
                build_fmp4_fragment(
                    sequence_number=self._sequence_number,
                    track_id=1,
                    base_decode_time=self._video_decode_time,
                    samples=video_to_emit,
                )
            )
            self._sequence_number += 1
            self._video_decode_time += sum(s.duration for s in video_to_emit)

        # Emit audio fragment: every queued audio sample goes out, each track
        # fragment consuming its own sequence number.
        if self._audio_samples:
            result.extend(
                build_fmp4_fragment(
                    sequence_number=self._sequence_number,
                    track_id=2,
                    base_decode_time=self._audio_decode_time,
                    samples=self._audio_samples,
                )
            )
            self._sequence_number += 1
            self._audio_decode_time += sum(s.duration for s in self._audio_samples)
            self._audio_samples = []

        # Reset for next fragment; a carried-over keyframe already counts
        # towards the next fragment's duration.
        self._video_samples = video_remaining
        self._fragment_video_duration = sum(s.duration for s in video_remaining)

        return bytes(result) if result else None

    def flush_final(self) -> bytes | None:
        """Flush any remaining samples as the final fragment."""
        return self.flush_fragment(force=True)