Files
UnHided/mediaflow_proxy/remuxer/mp4_muxer.py
UrloMythus cfc6bbabc9 update
2026-02-19 20:15:03 +01:00

1377 lines
50 KiB
Python

"""
Pure Python MP4 box builder for both standard and fragmented MP4.
Supports two modes:
1. Standard MP4 (moov-first): For progressive download with HTTP Range seeking.
File layout: ftyp | moov (full sample tables) | mdat
2. Fragmented MP4 (fMP4): For on-the-fly streaming via StreamingResponse.
Init segment: ftyp | moov (empty_moov with mvex)
Media segments: moof (tfhd + tfdt + trun) | mdat
The fMP4 mode is used for the transcode pipeline where MKV frames are
demuxed, audio is transcoded, and fMP4 fragments are streamed out
immediately without buffering the entire file.
"""
import logging
import struct
from dataclasses import dataclass, field
from mediaflow_proxy.remuxer.ebml_parser import MKVTrack, CODEC_ID_H264, CODEC_ID_H265
logger = logging.getLogger(__name__)
# =============================================================================
# Sample metadata
# =============================================================================
@dataclass
class SampleEntry:
    """Metadata for a single sample (frame) in the MP4 file.

    One instance feeds the stts/stsz/stss/ctts tables; the raw sample
    bytes are stored separately (in the mdat chunk list).
    """

    size: int  # Sample size in bytes
    duration: int  # Duration in track timescale ticks
    is_sync: bool  # True for keyframes (video) or all audio samples
    composition_offset: int = 0  # CTS offset (for B-frames)
@dataclass
class TrackSamples:
    """Collected sample metadata for one track during muxing."""

    samples: list[SampleEntry] = field(default_factory=list)
    chunk_offsets: list[int] = field(default_factory=list)  # Absolute byte offset of each chunk in mdat
    total_size: int = 0  # Total bytes of all samples
    total_duration: int = 0  # Total duration in timescale ticks

    def add(self, sample: SampleEntry) -> None:
        """Append *sample* and update the running size/duration totals."""
        self.samples.append(sample)
        self.total_size += sample.size
        self.total_duration += sample.duration
# =============================================================================
# Box building primitives
# =============================================================================
def build_box(box_type: bytes, payload: bytes) -> bytes:
    """Build a standard MP4 box: [4-byte size][4-byte type][payload].

    The size field counts the 8-byte header plus the payload.
    """
    return struct.pack(">I4s", 8 + len(payload), box_type) + payload
def build_full_box(box_type: bytes, version: int, flags: int, payload: bytes) -> bytes:
    """Build a full box with version and flags.

    A "full box" prepends a 32-bit (version << 24 | flags) word to the
    payload before wrapping it in a standard box header.
    """
    version_flags = struct.pack(">I", (version << 24) | (flags & 0xFFFFFF))
    body = version_flags + payload
    return struct.pack(">I", 8 + len(body)) + box_type + body
def build_box_header_large(box_type: bytes, total_size: int) -> bytes:
    """Build a box header for large boxes using 64-bit extended size.

    A compact size of 1 tells parsers to read the true size from the
    following uint64 ("largesize" in ISO 14496-12).
    """
    return b"\x00\x00\x00\x01" + box_type + struct.pack(">Q", total_size)
# =============================================================================
# ftyp box
# =============================================================================
def build_ftyp() -> bytes:
    """Build the File Type box for isom/iso2/mp41 compatible MP4."""
    payload = b"".join(
        (
            b"isom",                    # major brand
            struct.pack(">I", 0x200),   # minor version
            b"isom", b"iso2", b"mp41",  # compatible brands
        )
    )
    return build_box(b"ftyp", payload)
# =============================================================================
# moov box and children
# =============================================================================
def build_mvhd(timescale: int, duration: int, next_track_id: int = 3) -> bytes:
    """Build Movie Header box (mvhd), version 0.

    Args:
        timescale: Movie timescale in ticks per second.
        duration: Movie duration in movie-timescale ticks.
        next_track_id: Value for the next_track_ID field. Defaults to 3
            because this muxer always writes track 1 (video) and
            track 2 (audio); callers adding more tracks can override it.

    Returns:
        Serialized mvhd full box (version 0, 32-bit times).
    """
    payload = bytearray()
    payload.extend(struct.pack(">I", 0))  # creation_time
    payload.extend(struct.pack(">I", 0))  # modification_time
    payload.extend(struct.pack(">I", timescale))
    payload.extend(struct.pack(">I", duration))
    payload.extend(struct.pack(">I", 0x00010000))  # rate = 1.0 (16.16 fixed)
    payload.extend(struct.pack(">H", 0x0100))  # volume = 1.0 (8.8 fixed)
    payload.extend(b"\x00" * 10)  # reserved
    # Unity matrix (3x3, each 4 bytes, 9 values = 36 bytes)
    payload.extend(struct.pack(">9I", 0x00010000, 0, 0, 0, 0x00010000, 0, 0, 0, 0x40000000))
    payload.extend(b"\x00" * 24)  # pre_defined
    payload.extend(struct.pack(">I", next_track_id))
    return build_full_box(b"mvhd", 0, 0, bytes(payload))
def build_tkhd(track_id: int, duration: int, width: int = 0, height: int = 0, is_audio: bool = False) -> bytes:
    """Build Track Header box (tkhd), version 0.

    duration is in MOVIE timescale ticks; width/height are stored as
    16.16 fixed-point and stay 0 for audio tracks.
    """
    flags = 0x000003  # track_enabled | track_in_movie
    body = b"".join(
        (
            struct.pack(">I", 0),  # creation_time
            struct.pack(">I", 0),  # modification_time
            struct.pack(">I", track_id),
            b"\x00" * 4,  # reserved
            struct.pack(">I", duration),
            b"\x00" * 8,  # reserved
            struct.pack(">H", 0),  # layer
            struct.pack(">H", 1 if is_audio else 0),  # alternate_group
            struct.pack(">H", 0x0100 if is_audio else 0),  # volume (1.0 for audio)
            b"\x00" * 2,  # reserved
            # Unity display matrix
            struct.pack(">9I", 0x00010000, 0, 0, 0, 0x00010000, 0, 0, 0, 0x40000000),
            # Width and height as 16.16 fixed-point
            struct.pack(">I", width << 16),
            struct.pack(">I", height << 16),
        )
    )
    return build_full_box(b"tkhd", 0, flags, body)
def build_mdhd(timescale: int, duration: int) -> bytes:
    """Build Media Header box (mdhd), version 0.

    duration is in the TRACK's own timescale. 0x55C4 is the packed
    ISO-639-2 code for 'und' (undetermined language).
    """
    body = struct.pack(">IIIIHH", 0, 0, timescale, duration, 0x55C4, 0)
    return build_full_box(b"mdhd", 0, 0, body)
def build_hdlr(handler_type: bytes, name: str) -> bytes:
    """Build Handler Reference box (hdlr).

    handler_type is a 4-byte code (b"vide" / b"soun"); name is a
    human-readable label stored NUL-terminated.
    """
    body = (
        b"\x00" * 4  # pre_defined
        + handler_type  # handler_type (4 bytes)
        + b"\x00" * 12  # reserved
        + name.encode("utf-8")
        + b"\x00"  # NUL terminator for the handler name
    )
    return build_full_box(b"hdlr", 0, 0, body)
def build_vmhd() -> bytes:
    """Build Video Media Header box (vmhd).

    graphicsmode=0 (copy) with opcolor {0,0,0}; the flags field is 1.
    """
    return build_full_box(b"vmhd", 0, 1, struct.pack(">4H", 0, 0, 0, 0))
def build_smhd() -> bytes:
    """Build Sound Media Header box (smhd).

    balance=0 (centered) followed by two reserved zero bytes.
    """
    return build_full_box(b"smhd", 0, 0, struct.pack(">Hxx", 0))
def build_dref() -> bytes:
    """Build Data Reference box (dref) with a self-contained URL entry.

    flags=1 on the url entry marks the media data as living in this
    same file (no external reference).
    """
    entry = build_full_box(b"url ", 0, 1, b"")
    return build_full_box(b"dref", 0, 0, struct.pack(">I", 1) + entry)
def build_dinf() -> bytes:
    """Build Data Information box (dinf) wrapping the default dref."""
    dref = build_dref()
    return build_box(b"dinf", dref)
# =============================================================================
# Sample table boxes (stbl)
# =============================================================================
def build_stsd_video(track: MKVTrack) -> bytes:
    """Build Sample Description box (stsd) for a video track.

    Raises:
        ValueError: If the track's codec is neither H.264 nor H.265.
    """
    # Map codec IDs to their sample-entry builders.
    entry_builders = {
        CODEC_ID_H264: _build_avc1_entry,
        CODEC_ID_H265: _build_hvc1_entry,
    }
    builder = entry_builders.get(track.codec_id)
    if builder is None:
        raise ValueError(f"Unsupported video codec: {track.codec_id}")
    entry = builder(track)
    return build_full_box(b"stsd", 0, 0, struct.pack(">I", 1) + entry)
def _build_colr_nclx(
    colour_primaries: int = 1,
    transfer_characteristics: int = 1,
    matrix_coefficients: int = 1,
    full_range: bool = False,
) -> bytes:
    """
    Build a colr box with nclx (video colour) information.

    Defaults to BT.709 (the standard for HD content), matching ffmpeg's
    default behaviour for fMP4 output.
    """
    # The top bit of the final byte is the full_range flag.
    range_byte = 0x80 if full_range else 0x00
    body = b"nclx" + struct.pack(
        ">HHHB", colour_primaries, transfer_characteristics, matrix_coefficients, range_byte
    )
    return build_box(b"colr", body)
def _build_pasp(h_spacing: int = 1, v_spacing: int = 1) -> bytes:
    """
    Build a pasp (Pixel Aspect Ratio) box.

    Default 1:1 (square pixels), which is the norm for HD content.
    """
    payload = struct.pack(">I", h_spacing) + struct.pack(">I", v_spacing)
    return build_box(b"pasp", payload)
def _build_avc1_entry(track: MKVTrack) -> bytes:
    """Build an avc3 VisualSampleEntry.

    Uses ``avc3`` instead of ``avc1`` to allow in-band SPS/PPS parameter
    set updates in sample data. Many MKV sources embed mid-stream PPS
    changes in the bitstream; ``avc3`` signals to the player that these
    may appear in any sample, avoiding "non-existing PPS" decode errors.
    """
    # Fixed-layout VisualSampleEntry header: 6 reserved bytes,
    # data_reference_index=1, 16 bytes pre_defined/reserved, width, height,
    # 72 dpi horizontal/vertical resolution, 4 reserved bytes,
    # frame_count=1, zeroed 32-byte compressorname, depth=24, pre_defined=-1.
    header = struct.pack(
        ">6xH16xHHII4xH32xHh",
        1,
        track.pixel_width,
        track.pixel_height,
        0x00480000,
        0x00480000,
        1,
        0x0018,
        -1,
    )
    parts = [header]
    if track.codec_private:
        # avcC decoder configuration comes straight from MKV CodecPrivate
        parts.append(build_box(b"avcC", track.codec_private))
    parts.append(_build_colr_nclx())  # nclx colour information (BT.709)
    parts.append(_build_pasp())  # pixel aspect ratio (1:1)
    return build_box(b"avc3", b"".join(parts))
def _build_hvc1_entry(track: MKVTrack) -> bytes:
    """Build an hvc1 VisualSampleEntry."""
    # Fixed-layout VisualSampleEntry header: 6 reserved bytes,
    # data_reference_index=1, 16 bytes pre_defined/reserved, width, height,
    # 72 dpi horizontal/vertical resolution, 4 reserved bytes,
    # frame_count=1, zeroed 32-byte compressorname, depth=24, pre_defined=-1.
    header = struct.pack(
        ">6xH16xHHII4xH32xHh",
        1,
        track.pixel_width,
        track.pixel_height,
        0x00480000,
        0x00480000,
        1,
        0x0018,
        -1,
    )
    parts = [header]
    if track.codec_private:
        # hvcC decoder configuration comes straight from MKV CodecPrivate
        parts.append(build_box(b"hvcC", track.codec_private))
    parts.append(_build_colr_nclx())  # nclx colour information (BT.709)
    parts.append(_build_pasp())  # pixel aspect ratio (1:1)
    return build_box(b"hvc1", b"".join(parts))
def build_stsd_audio(sample_rate: int, channels: int, audio_specific_config: bytes) -> bytes:
    """Build Sample Description box (stsd) for an AAC audio track."""
    mp4a = _build_mp4a_entry(sample_rate, channels, audio_specific_config)
    # entry_count=1, followed by the single mp4a entry
    return build_full_box(b"stsd", 0, 0, struct.pack(">I", 1) + mp4a)
def _build_mp4a_entry(sample_rate: int, channels: int, asc: bytes) -> bytes:
    """Build an mp4a AudioSampleEntry with esds box.

    AudioSampleEntry layout: 6 reserved bytes, data_reference_index=1,
    8 reserved bytes, channelcount, samplesize=16, 4 reserved bytes,
    samplerate as 16.16 fixed-point.
    """
    header = struct.pack(">6xH8xHH4xI", 1, channels, 16, sample_rate << 16)
    return build_box(b"mp4a", header + _build_esds(sample_rate, channels, asc))
def _build_esds(sample_rate: int, channels: int, asc: bytes) -> bytes:
    """Build an Elementary Stream Descriptor box (esds) for AAC.

    Nests DecoderSpecificInfo inside DecoderConfigDescriptor inside
    ES_Descriptor, each with expandable-length encoding.
    """
    # DecoderConfigDescriptor body
    dec_config = (
        b"\x40"  # objectTypeIndication = Audio ISO/IEC 14496-3 (AAC)
        + b"\x15"  # streamType=5 (audio) upstream=0 reserved=1
        + b"\x00\x00\x00"  # bufferSizeDB (3 bytes)
        + struct.pack(">II", 192000, 192000)  # maxBitrate / avgBitrate
        + _build_descriptor(0x05, asc)  # DecoderSpecificInfo (AudioSpecificConfig)
    )
    # ES_Descriptor body: ES_ID=1, priority 0, then nested descriptors
    es_desc = (
        struct.pack(">H", 1)
        + b"\x00"
        + _build_descriptor(0x04, dec_config)
        + _build_descriptor(0x06, b"\x02")  # SLConfigDescriptor (predefined=2 for MP4)
    )
    return build_full_box(b"esds", 0, 0, _build_descriptor(0x03, es_desc))
def _build_descriptor(tag: int, data: bytes) -> bytes:
"""Build an ISO 14496-1 descriptor with expandable length encoding."""
length = len(data)
result = bytearray()
result.append(tag)
# Expandable length: use 4 bytes (most compatible)
result.append(0x80 | ((length >> 21) & 0x7F))
result.append(0x80 | ((length >> 14) & 0x7F))
result.append(0x80 | ((length >> 7) & 0x7F))
result.append(length & 0x7F)
result.extend(data)
return bytes(result)
def build_stts(samples: list[SampleEntry]) -> bytes:
    """
    Build Time-to-Sample box (stts) with run-length encoding.

    Groups consecutive samples with the same duration into
    (sample_count, sample_delta) runs.
    """
    runs: list[tuple[int, int]] = []
    for sample in samples:
        if runs and runs[-1][1] == sample.duration:
            # Extend the current run
            runs[-1] = (runs[-1][0] + 1, sample.duration)
        else:
            runs.append((1, sample.duration))
    payload = bytearray(struct.pack(">I", len(runs)))
    for count, delta in runs:
        payload.extend(struct.pack(">II", count, delta))
    return build_full_box(b"stts", 0, 0, bytes(payload))
def build_stss(samples: list[SampleEntry]) -> bytes | None:
    """
    Build Sync Sample box (stss) listing keyframe indices.

    Returns None if all samples are sync (audio tracks), as stss is
    only needed when not all samples are sync points.
    """
    sync_indices = [i for i, s in enumerate(samples, start=1) if s.is_sync]  # 1-based
    if len(sync_indices) == len(samples):
        # Every sample is a random-access point; omit stss entirely.
        return None
    payload = struct.pack(">I", len(sync_indices)) + b"".join(
        struct.pack(">I", idx) for idx in sync_indices
    )
    return build_full_box(b"stss", 0, 0, payload)
def build_ctts(samples: list[SampleEntry]) -> bytes | None:
    """
    Build Composition Time-to-Sample box (ctts) for B-frame offsets.

    Emits version 0 (unsigned offsets) when every offset is non-negative,
    and version 1 (signed offsets, per ISO 14496-12) when any offset is
    negative — version 0 cannot represent negative values and packing
    them with ">I" would raise struct.error.

    Returns None if no samples have composition offsets.
    """
    has_offsets = any(s.composition_offset != 0 for s in samples)
    if not has_offsets:
        return None
    # Run-length encode consecutive equal offsets
    entries = []
    current_offset = samples[0].composition_offset
    current_count = 1
    for s in samples[1:]:
        if s.composition_offset == current_offset:
            current_count += 1
        else:
            entries.append((current_count, current_offset))
            current_offset = s.composition_offset
            current_count = 1
    entries.append((current_count, current_offset))
    # Version 0 is more widely supported, so keep it unless a signed
    # offset forces version 1.
    version = 1 if any(offset < 0 for _, offset in entries) else 0
    entry_fmt = ">Ii" if version else ">II"
    payload = bytearray()
    payload.extend(struct.pack(">I", len(entries)))
    for count, offset in entries:
        payload.extend(struct.pack(entry_fmt, count, offset))
    return build_full_box(b"ctts", version, 0, bytes(payload))
def build_stsz(samples: list[SampleEntry]) -> bytes:
    """Build Sample Size box (stsz).

    Uses the compact uniform form when every sample shares one size;
    otherwise sample_size=0 signals a per-sample size table.
    """
    sizes = [s.size for s in samples]
    uniform = bool(sizes) and len(set(sizes)) == 1
    payload = bytearray()
    if uniform:
        # One shared size; no per-sample table needed.
        payload.extend(struct.pack(">II", sizes[0], len(sizes)))
    else:
        payload.extend(struct.pack(">II", 0, len(sizes)))
        for size in sizes:
            payload.extend(struct.pack(">I", size))
    return build_full_box(b"stsz", 0, 0, bytes(payload))
def build_stsc(num_chunks: int) -> bytes:
    """
    Build Sample-to-Chunk box (stsc).

    For simplicity, we use one sample per chunk (each sample gets its
    own chunk offset). This is slightly less compact but much simpler
    and fully correct.

    NOTE: ``num_chunks`` is unused (kept for API compatibility); the
    single mapping entry covers every chunk implicitly.
    """
    # entry_count=1, then first_chunk=1 / samples_per_chunk=1 / desc_index=1
    return build_full_box(b"stsc", 0, 0, struct.pack(">IIII", 1, 1, 1, 1))
def build_stco(offsets: list[int]) -> bytes:
    """Build Chunk Offset box (stco, 32-bit offsets)."""
    payload = struct.pack(">I", len(offsets)) + b"".join(
        struct.pack(">I", off) for off in offsets
    )
    return build_full_box(b"stco", 0, 0, payload)
def build_co64(offsets: list[int]) -> bytes:
    """Build Chunk Offset box (co64, 64-bit offsets) for large files."""
    payload = struct.pack(">I", len(offsets)) + b"".join(
        struct.pack(">Q", off) for off in offsets
    )
    return build_full_box(b"co64", 0, 0, payload)
# =============================================================================
# Track building (assembles trak box hierarchy)
# =============================================================================
def build_stbl(track_samples: TrackSamples, stsd: bytes) -> bytes:
    """Build the Sample Table box (stbl) for a track.

    Assembles stsd/stts/(stss)/(ctts)/stsz/stsc plus either stco or
    co64, depending on whether any chunk offset exceeds 32 bits.
    """
    samples = track_samples.samples
    offsets = track_samples.chunk_offsets
    parts = [stsd, build_stts(samples)]
    stss = build_stss(samples)
    if stss is not None:
        parts.append(stss)
    ctts = build_ctts(samples)
    if ctts is not None:
        parts.append(ctts)
    parts.append(build_stsz(samples))
    parts.append(build_stsc(len(offsets)))
    # co64 only when some chunk lies beyond the 32-bit range
    if any(off > 0xFFFFFFFF for off in offsets):
        parts.append(build_co64(offsets))
    else:
        parts.append(build_stco(offsets))
    return build_box(b"stbl", b"".join(parts))
def build_minf(is_audio: bool, stbl: bytes) -> bytes:
    """Build Media Information box (minf).

    The media header is smhd for audio, vmhd for video.
    """
    media_header = build_smhd() if is_audio else build_vmhd()
    return build_box(b"minf", media_header + build_dinf() + stbl)
def build_mdia(timescale: int, duration: int, handler_type: bytes, handler_name: str, minf: bytes) -> bytes:
    """Build Media box (mdia): mdhd + hdlr + minf."""
    parts = (
        build_mdhd(timescale, duration),
        build_hdlr(handler_type, handler_name),
        minf,
    )
    return build_box(b"mdia", b"".join(parts))
def build_video_trak(
    track: MKVTrack,
    track_id: int,
    timescale: int,
    track_samples: TrackSamples,
    movie_timescale: int,
) -> bytes:
    """Build a complete video trak box."""
    track_duration = track_samples.total_duration
    # tkhd durations are expressed in the MOVIE timescale, not the track's.
    movie_duration = int(track_duration * movie_timescale / timescale) if timescale > 0 else 0
    tkhd = build_tkhd(track_id, movie_duration, width=track.pixel_width, height=track.pixel_height)
    stbl = build_stbl(track_samples, build_stsd_video(track))
    minf = build_minf(is_audio=False, stbl=stbl)
    mdia = build_mdia(timescale, track_duration, b"vide", "VideoHandler", minf)
    return build_box(b"trak", tkhd + mdia)
def build_audio_trak(
    track_id: int,
    timescale: int,
    track_samples: TrackSamples,
    movie_timescale: int,
    sample_rate: int,
    channels: int,
    audio_specific_config: bytes,
) -> bytes:
    """Build a complete audio trak box."""
    track_duration = track_samples.total_duration
    # tkhd durations are expressed in the MOVIE timescale, not the track's.
    movie_duration = int(track_duration * movie_timescale / timescale) if timescale > 0 else 0
    tkhd = build_tkhd(track_id, movie_duration, is_audio=True)
    stbl = build_stbl(track_samples, build_stsd_audio(sample_rate, channels, audio_specific_config))
    minf = build_minf(is_audio=True, stbl=stbl)
    mdia = build_mdia(timescale, track_duration, b"soun", "SoundHandler", minf)
    return build_box(b"trak", tkhd + mdia)
# =============================================================================
# Complete moov builder
# =============================================================================
def build_moov(
    video_track: MKVTrack,
    audio_track_info: dict,
    video_samples: TrackSamples,
    audio_samples: TrackSamples,
    mdat_offset: int,
    video_timescale: int = 90000,
    audio_timescale: int = 48000,
    movie_timescale: int = 1000,
) -> bytes:
    """
    Build the complete moov box with all track metadata.

    Args:
        video_track: MKVTrack with video codec info.
        audio_track_info: Dict with keys: sample_rate, channels, audio_specific_config.
        video_samples: Collected video sample metadata (chunk_offsets already set).
        audio_samples: Collected audio sample metadata (chunk_offsets already set).
        mdat_offset: Byte offset where mdat data starts. NOTE: unused here
            (kept for API compatibility); the actual offsets are read from
            the TrackSamples chunk_offsets.
        video_timescale: Video track timescale (default 90000 for 90kHz).
        audio_timescale: Audio track timescale (typically sample_rate).
        movie_timescale: Movie header timescale (default 1000 = ms).

    Returns:
        Complete moov box bytes.
    """

    def to_movie_ticks(duration: int, timescale: int) -> int:
        # Convert a track-timescale duration into movie-timescale ticks.
        if timescale > 0 and duration > 0:
            return int(duration * movie_timescale / timescale)
        return 0

    movie_duration = max(
        to_movie_ticks(video_samples.total_duration, video_timescale),
        to_movie_ticks(audio_samples.total_duration, audio_timescale),
    )
    video_trak = build_video_trak(
        video_track,
        track_id=1,
        timescale=video_timescale,
        track_samples=video_samples,
        movie_timescale=movie_timescale,
    )
    audio_trak = build_audio_trak(
        track_id=2,
        timescale=audio_timescale,
        track_samples=audio_samples,
        movie_timescale=movie_timescale,
        sample_rate=audio_track_info["sample_rate"],
        channels=audio_track_info["channels"],
        audio_specific_config=audio_track_info["audio_specific_config"],
    )
    mvhd = build_mvhd(movie_timescale, movie_duration)
    return build_box(b"moov", mvhd + video_trak + audio_trak)
# =============================================================================
# mdat box header
# =============================================================================
def build_mdat_header(data_size: int) -> bytes:
    """
    Build the mdat box header.

    Uses the compact 8-byte header when the whole box fits in a 32-bit
    size field, otherwise the extended form: size=1 followed by a
    64-bit size that includes the 16-byte header itself.
    """
    if 8 + data_size <= 0xFFFFFFFF:
        return struct.pack(">I", 8 + data_size) + b"mdat"
    return struct.pack(">I", 1) + b"mdat" + struct.pack(">Q", 16 + data_size)
# =============================================================================
# MP4 Builder (high-level orchestrator)
# =============================================================================
class MP4Builder:
    """
    High-level MP4 file builder.

    Collects video and audio samples during a transcode pass, then produces
    a complete moov-first MP4 file.

    Usage:
        builder = MP4Builder(video_track, audio_sample_rate=48000,
                             audio_channels=2, audio_specific_config=asc)
        for frame in video_frames:
            builder.add_video_sample(frame.data, frame.duration_ticks, frame.is_keyframe)
        for frame in audio_frames:
            builder.add_audio_sample(frame.data, frame.duration_ticks)
        moov_bytes, mdat_header, sample_data_list = builder.finalize()
    """

    def __init__(
        self,
        video_track: MKVTrack,
        audio_sample_rate: int = 48000,
        audio_channels: int = 2,
        audio_specific_config: bytes = b"",
        video_timescale: int = 90000,
        audio_timescale: int = 48000,
    ) -> None:
        self._video_track = video_track
        # Audio description dict forwarded verbatim to build_moov()
        self._audio_info = {
            "sample_rate": audio_sample_rate,
            "channels": audio_channels,
            "audio_specific_config": audio_specific_config,
        }
        self._video_timescale = video_timescale
        self._audio_timescale = audio_timescale
        self._video_samples = TrackSamples()
        self._audio_samples = TrackSamples()
        self._mdat_chunks: list[bytes] = []  # Interleaved sample data
        self._mdat_size: int = 0
        self._sample_order: list[str] = []  # "v" or "a" for each mdat chunk

    def add_video_sample(self, data: bytes, duration_ticks: int, is_keyframe: bool) -> None:
        """Add a video sample (H.264/H.265 NALUs) to the builder."""
        entry = SampleEntry(size=len(data), duration=duration_ticks, is_sync=is_keyframe)
        self._video_samples.add(entry)
        self._mdat_chunks.append(data)
        self._mdat_size += len(data)
        self._sample_order.append("v")

    def add_audio_sample(self, data: bytes, duration_ticks: int) -> None:
        """Add an audio sample (AAC frame) to the builder.

        Audio samples are always marked as sync points.
        """
        entry = SampleEntry(size=len(data), duration=duration_ticks, is_sync=True)
        self._audio_samples.add(entry)
        self._mdat_chunks.append(data)
        self._mdat_size += len(data)
        self._sample_order.append("a")

    @property
    def video_sample_count(self) -> int:
        """Number of video samples added so far."""
        return len(self._video_samples.samples)

    @property
    def audio_sample_count(self) -> int:
        """Number of audio samples added so far."""
        return len(self._audio_samples.samples)

    @property
    def mdat_size(self) -> int:
        """Total bytes of raw sample data collected for the mdat box."""
        return self._mdat_size

    def finalize(self) -> tuple[bytes, bytes, list[bytes]]:
        """
        Build the final MP4 file components.

        Since moov needs accurate chunk offsets (stco/co64) that depend on
        moov's own size, we do a two-pass approach:
        1. Build moov with placeholder offsets to determine its size
        2. Rebuild moov with correct offsets

        Returns:
            (ftyp_moov_bytes, mdat_header_bytes, mdat_chunk_list)
            Concatenating these gives the complete MP4 file.
        """
        ftyp = build_ftyp()
        # Build mdat header
        mdat_hdr = build_mdat_header(self._mdat_size)
        # Pass 1: Build moov with placeholder (0) offsets to measure its size
        self._compute_chunk_offsets(0)  # Placeholder base
        moov_pass1 = build_moov(
            self._video_track,
            self._audio_info,
            self._video_samples,
            self._audio_samples,
            mdat_offset=0,
            video_timescale=self._video_timescale,
            audio_timescale=self._audio_timescale,
        )
        # Calculate actual mdat data start:
        # ftyp + moov + mdat_header
        mdat_data_start = len(ftyp) + len(moov_pass1) + len(mdat_hdr)
        # Pass 2: Rebuild moov with correct chunk offsets
        self._compute_chunk_offsets(mdat_data_start)
        moov_final = build_moov(
            self._video_track,
            self._audio_info,
            self._video_samples,
            self._audio_samples,
            mdat_offset=mdat_data_start,
            video_timescale=self._video_timescale,
            audio_timescale=self._audio_timescale,
        )
        # Verify moov size didn't change (it shouldn't since offsets are same width)
        if len(moov_final) != len(moov_pass1):
            # Size changed (e.g., offsets crossed 32/64-bit boundary). Redo.
            # NOTE(review): a single retry assumes the size is stable after
            # one boundary crossing; offsets only grow, so co64 cannot
            # revert to stco on the retry.
            mdat_data_start = len(ftyp) + len(moov_final) + len(mdat_hdr)
            self._compute_chunk_offsets(mdat_data_start)
            moov_final = build_moov(
                self._video_track,
                self._audio_info,
                self._video_samples,
                self._audio_samples,
                mdat_offset=mdat_data_start,
                video_timescale=self._video_timescale,
                audio_timescale=self._audio_timescale,
            )
        header_bytes = ftyp + moov_final
        logger.info(
            "[mp4_muxer] Finalized: ftyp=%d moov=%d mdat=%d (header=%d) video=%d samples audio=%d samples",
            len(ftyp),
            len(moov_final),
            self._mdat_size,
            len(mdat_hdr),
            len(self._video_samples.samples),
            len(self._audio_samples.samples),
        )
        return header_bytes, mdat_hdr, self._mdat_chunks

    def _compute_chunk_offsets(self, mdat_data_start: int) -> None:
        """Compute absolute byte offsets for each sample in the mdat."""
        # Samples were added interleaved (video/audio/video/audio...)
        # so mdat_chunks[i] corresponds to samples in order.
        # We need to assign offsets per track.
        video_offsets = []
        audio_offsets = []
        offset = mdat_data_start
        vi = 0  # video samples consumed so far
        ai = 0  # audio samples consumed so far
        for chunk in self._mdat_chunks:
            chunk_size = len(chunk)
            # Determine if this chunk is video or audio based on sample order
            if vi < len(self._video_samples.samples) and (
                ai >= len(self._audio_samples.samples) or self._is_video_sample(vi, ai)
            ):
                video_offsets.append(offset)
                vi += 1
            else:
                audio_offsets.append(offset)
                ai += 1
            offset += chunk_size
        self._video_samples.chunk_offsets = video_offsets
        self._audio_samples.chunk_offsets = audio_offsets

    def _is_video_sample(self, vi: int, ai: int) -> bool:
        """
        Determine if the next mdat chunk at position (vi+ai) is a video sample.

        This relies on the add order tracking. We use a simple scheme:
        samples are added in their interleaved order, and we track which
        indices are video vs audio.
        """
        # The _mdat_chunks list contains samples in the order they were added.
        # We need to know the order. For now, use the _sample_order tracker.
        idx = vi + ai
        if idx < len(self._sample_order):
            return self._sample_order[idx] == "v"
        return vi < len(self._video_samples.samples)

    def update_audio_specific_config(self, asc: bytes) -> None:
        """Update the AudioSpecificConfig (e.g., after first encode)."""
        self._audio_info["audio_specific_config"] = asc
# =============================================================================
# Fragmented MP4 (fMP4) builder for streaming output
# =============================================================================
#
# fMP4 layout:
# Init segment: ftyp + moov (mvhd + mvex/trex + trak[video] + trak[audio])
# Media segments: moof (mfhd + traf[tfhd + tfdt + trun]) + mdat
#
# The moov in fMP4 has empty sample tables (stts/stsz/stsc/stco with 0 entries)
# and an mvex box with trex entries signaling fragmented mode.
# =============================================================================
def _build_empty_stbl(stsd: bytes) -> bytes:
    """Build an stbl with empty sample tables (for fMP4 init segment).

    Fragmented MP4 keeps sample metadata in moof boxes, so the moov
    sample tables are present but hold zero entries.
    """
    empty_u32 = struct.pack(">I", 0)
    children = (
        stsd
        + build_full_box(b"stts", 0, 0, empty_u32)
        + build_full_box(b"stsc", 0, 0, empty_u32)
        + build_full_box(b"stsz", 0, 0, struct.pack(">II", 0, 0))
        + build_full_box(b"stco", 0, 0, empty_u32)
    )
    return build_box(b"stbl", children)
def build_fmp4_init_segment(
    video_track: MKVTrack,
    audio_sample_rate: int,
    audio_channels: int,
    audio_specific_config: bytes,
    video_timescale: int = 90000,
    audio_timescale: int = 48000,
    duration_ms: float = 0.0,
) -> bytes:
    """
    Build an fMP4 initialization segment (ftyp + moov with empty_moov).

    The moov contains track descriptions (codec config) and mvex/trex
    entries signaling fragmented mode. No sample data.

    Args:
        video_track: MKVTrack with video codec info.
        audio_sample_rate: Output audio sample rate.
        audio_channels: Output audio channel count.
        audio_specific_config: AAC AudioSpecificConfig bytes.
        video_timescale: Video track timescale (default 90000).
        audio_timescale: Audio track timescale (default sample_rate).
        duration_ms: Total duration in ms (0 = unknown/live).

    Returns:
        Complete init segment bytes (ftyp + moov).
    """
    movie_timescale = 1000  # ms
    known_duration = duration_ms > 0
    movie_duration = int(duration_ms) if known_duration else 0

    def track_duration(timescale: int) -> int:
        # mdhd durations use the track's own timescale.
        return int(duration_ms * timescale / 1000.0) if known_duration else 0

    # Video trak with an empty stbl (codec config only, no samples)
    video_tkhd = build_tkhd(
        1, movie_duration, width=video_track.pixel_width, height=video_track.pixel_height
    )
    video_stbl = _build_empty_stbl(build_stsd_video(video_track))
    video_minf = build_minf(is_audio=False, stbl=video_stbl)
    video_mdia = build_mdia(
        video_timescale, track_duration(video_timescale), b"vide", "VideoHandler", video_minf
    )
    video_trak = build_box(b"trak", video_tkhd + video_mdia)

    # Audio trak with an empty stbl
    audio_tkhd = build_tkhd(2, movie_duration, is_audio=True)
    audio_stbl = _build_empty_stbl(
        build_stsd_audio(audio_sample_rate, audio_channels, audio_specific_config)
    )
    audio_minf = build_minf(is_audio=True, stbl=audio_stbl)
    audio_mdia = build_mdia(
        audio_timescale, track_duration(audio_timescale), b"soun", "SoundHandler", audio_minf
    )
    audio_trak = build_box(b"trak", audio_tkhd + audio_mdia)

    # mvex (Movie Extends) signals fragmented mode; one trex per track.
    # default_sample_flags is 0x00000000 (same as ffmpeg), deferring all
    # sample flag decisions to each fragment's tfhd/trun. This avoids
    # global defaults that could confuse strict browser parsers.
    def _trex(track_id: int) -> bytes:
        return build_full_box(
            b"trex",
            0,
            0,
            struct.pack(
                ">IIIII",
                track_id,  # track_ID
                1,  # default_sample_description_index
                0,  # default_sample_duration
                0,  # default_sample_size
                0x00000000,  # default_sample_flags (deferred per fragment)
            ),
        )

    mvex = build_box(b"mvex", _trex(1) + _trex(2))
    mvhd = build_mvhd(movie_timescale, movie_duration)
    moov = build_box(b"moov", mvhd + video_trak + audio_trak + mvex)
    return _build_fmp4_ftyp() + moov
def _build_fmp4_ftyp() -> bytes:
    """Build the ``ftyp`` box for a fragmented MP4 (major brand isom, minor 0x200)."""
    # Major brand, minor version, then the compatible-brand list.
    compatible_brands = (b"isom", b"iso6", b"mp41", b"msdh", b"msix")
    parts = [b"isom", struct.pack(">I", 0x200)]
    parts.extend(compatible_brands)
    return build_box(b"ftyp", b"".join(parts))
@dataclass
class FragmentSample:
    """One media sample queued for inclusion in the next fMP4 fragment."""

    data: bytes  # raw sample payload (copied verbatim into the mdat box)
    duration: int  # duration in track-timescale ticks
    is_sync: bool = False  # True for video keyframes and for every audio sample
    composition_offset: int = 0  # CTS offset (PTS - DTS) for B-frame reordering

    @property
    def size(self) -> int:
        """Byte length of the sample payload."""
        return len(self.data)
def build_fmp4_fragment(
    sequence_number: int,
    track_id: int,
    base_decode_time: int,
    samples: list[FragmentSample],
) -> bytes:
    """
    Build an fMP4 media segment (moof + mdat) for a single track.

    Args:
        sequence_number: Fragment sequence number (1-based, incrementing).
        track_id: Track ID (1=video, 2=audio).
        base_decode_time: Decode time of the first sample in track timescale.
        samples: List of samples for this fragment.

    Returns:
        Complete moof + mdat bytes, or b"" when *samples* is empty.
    """
    if not samples:
        return b""
    # mdat payload: sample data concatenated back-to-back, in decode order.
    mdat_payload = b"".join(s.data for s in samples)
    # Build trun (Track Fragment Run) flags:
    #   0x000001 data_offset_present
    #   0x000100 sample_duration_present
    #   0x000200 sample_size_present
    #   0x000004 first_sample_flags_present (keyframe indication)
    #   0x000800 sample_composition_time_offsets_present (only when needed)
    has_cts = any(s.composition_offset != 0 for s in samples)
    trun_flags = 0x000001 | 0x000100 | 0x000200  # data_offset + duration + size
    if has_cts:
        trun_flags |= 0x000800  # sample_composition_time_offsets_present
    # Use first_sample_flags for keyframe indication
    trun_flags |= 0x000004  # first_sample_flags_present
    trun_payload = bytearray()
    trun_payload.extend(struct.pack(">I", len(samples)))  # sample_count
    # data_offset is not known until the full moof size is known; write a
    # 4-byte placeholder now and remember its position for patching below.
    data_offset_pos = len(trun_payload)
    trun_payload.extend(struct.pack(">i", 0))  # data_offset placeholder
    # first_sample_flags: overrides tfhd.default_sample_flags for sample 0.
    if samples[0].is_sync:
        first_flags = 0x02000000  # sample_depends_on=2 (does not depend, i.e., sync)
    else:
        first_flags = 0x01010000  # sample_depends_on=1, is_non_sync=1
    trun_payload.extend(struct.pack(">I", first_flags))
    # Per-sample entries: duration, size, and (optionally) CTS offset.
    for s in samples:
        trun_payload.extend(struct.pack(">I", s.duration))
        trun_payload.extend(struct.pack(">I", s.size))
        if has_cts:
            trun_payload.extend(struct.pack(">i", s.composition_offset))
    # Use version 1 when CTS offsets are present: version 0 offsets are
    # unsigned, version 1 allows the signed offsets B-frames require.
    trun_version = 1 if has_cts else 0
    trun = build_full_box(b"trun", trun_version, trun_flags, bytes(trun_payload))
    # tfdt (Track Fragment Decode Time) - version 1 for 64-bit time
    tfdt_payload = struct.pack(">Q", base_decode_time)
    tfdt = build_full_box(b"tfdt", 1, 0, tfdt_payload)
    # tfhd (Track Fragment Header)
    # Flags: 0x020000 = default_base_is_moof
    #        0x000020 = default_sample_flags_present
    # Since trex.default_sample_flags is 0x00000000, we set per-fragment
    # defaults here (matching ffmpeg behaviour):
    #   - Video: 0x01010000 (sample_depends_on=1, is_non_sync=1)
    #   - Audio: 0x02000000 (sample_depends_on=2 = independent)
    # The trun.first_sample_flags overrides this for keyframes.
    is_video = track_id == 1
    default_sample_flags = 0x01010000 if is_video else 0x02000000
    tfhd_flags = 0x020000 | 0x000020  # default_base_is_moof + default_sample_flags_present
    tfhd_payload = struct.pack(">II", track_id, default_sample_flags)
    tfhd = build_full_box(b"tfhd", 0, tfhd_flags, tfhd_payload)
    # traf
    traf = build_box(b"traf", tfhd + tfdt + trun)
    # mfhd (Movie Fragment Header)
    mfhd = build_full_box(b"mfhd", 0, 0, struct.pack(">I", sequence_number))
    # moof
    moof = build_box(b"moof", mfhd + traf)
    # Patch data_offset in trun: distance from moof start to mdat payload
    # start. The mdat box header is 8 bytes, so data_offset = moof_size + 8.
    data_offset = len(moof) + 8  # 8 = mdat box header
    # Rather than doing fragile byte-offset math inside the assembled moof,
    # patch the placeholder in the saved trun payload and rebuild the
    # enclosing boxes; the sizes are unchanged so the result is identical
    # except for the patched field.
    trun_payload_fixed = bytearray(trun_payload)
    struct.pack_into(">i", trun_payload_fixed, data_offset_pos, data_offset)
    trun_fixed = build_full_box(b"trun", trun_version, trun_flags, bytes(trun_payload_fixed))
    # Rebuild traf -> moof with fixed trun
    traf_fixed = build_box(b"traf", tfhd + tfdt + trun_fixed)
    moof_fixed = build_box(b"moof", mfhd + traf_fixed)
    # Internal invariant: patching a fixed-width field must not change sizes.
    assert len(moof_fixed) == len(moof), "moof size changed after data_offset patch"
    # mdat
    mdat = build_box(b"mdat", mdat_payload)
    return bytes(moof_fixed) + mdat
class FMP4StreamMuxer:
"""
Streaming fMP4 muxer that produces fragments on-the-fly.
Usage:
muxer = FMP4StreamMuxer(video_track, audio_sample_rate, ...)
init_seg = muxer.build_init_segment()
yield init_seg
for frame in demuxed_frames:
muxer.add_frame(frame)
fragment = muxer.flush_fragment()
if fragment:
yield fragment
final = muxer.flush_final()
if final:
yield final
"""
def __init__(
self,
video_track: MKVTrack,
audio_sample_rate: int = 48000,
audio_channels: int = 2,
audio_specific_config: bytes = b"",
video_timescale: int = 90000,
audio_timescale: int = 48000,
duration_ms: float = 0.0,
fragment_duration_ms: float = 2000.0,
start_decode_time_ms: float = 0.0,
audio_frame_size: int = 0,
) -> None:
self._video_track = video_track
self._audio_sample_rate = audio_sample_rate
self._audio_channels = audio_channels
self._audio_specific_config = audio_specific_config
self._video_timescale = video_timescale
self._audio_timescale = audio_timescale
self._duration_ms = duration_ms
self._fragment_duration_ms = fragment_duration_ms
# Fragment accumulation
self._video_samples: list[FragmentSample] = []
self._audio_samples: list[FragmentSample] = []
self._sequence_number = 1
# Track decode times (in timescale ticks).
# When producing HLS segments, start_decode_time_ms places this
# segment's tfdt at the correct position in the global timeline.
self._video_decode_time = int(start_decode_time_ms * video_timescale / 1000.0)
# For audio, we must align the tfdt to exact frame boundaries to
# avoid DTS discontinuities at segment borders. AAC frames are
# exactly ``audio_frame_size`` samples each (typically 1024). If
# the caller provides audio_frame_size, compute the audio base
# time as the exact number of whole frames that fit before this
# segment's start time.
if audio_frame_size > 0 and start_decode_time_ms > 0:
total_samples_before = start_decode_time_ms / 1000.0 * audio_timescale
whole_frames_before = int(total_samples_before / audio_frame_size)
self._audio_decode_time = whole_frames_before * audio_frame_size
else:
self._audio_decode_time = int(start_decode_time_ms * audio_timescale / 1000.0)
# Track accumulated duration for fragment boundary detection
self._fragment_video_duration = 0 # video ticks accumulated in current fragment
self._fragment_threshold = int(fragment_duration_ms * video_timescale / 1000.0)
@property
def video_position_ticks(self) -> int:
"""Current video decode position (timescale ticks from stream start)."""
return self._video_decode_time + self._fragment_video_duration
def advance_video_decode_time(self, ticks: int) -> None:
"""Advance the video base decode time by *ticks*.
Can be used to adjust the segment's starting decode position when
the first emitted frame doesn't align with the tfdt origin.
"""
self._video_decode_time += ticks
def build_init_segment(self) -> bytes:
"""Build and return the fMP4 init segment (ftyp + moov)."""
return build_fmp4_init_segment(
video_track=self._video_track,
audio_sample_rate=self._audio_sample_rate,
audio_channels=self._audio_channels,
audio_specific_config=self._audio_specific_config,
video_timescale=self._video_timescale,
audio_timescale=self._audio_timescale,
duration_ms=self._duration_ms,
)
    def update_audio_specific_config(self, asc: bytes) -> None:
        """Replace the stored AAC AudioSpecificConfig.

        Call before :meth:`build_init_segment` if possible, since the ASC is
        embedded in the init segment's sample description.
        """
        self._audio_specific_config = asc
def add_video_sample(
self,
data: bytes,
duration_ticks: int,
is_keyframe: bool,
pts_ticks: int | None = None,
) -> None:
"""
Add a video sample to the current fragment.
Args:
data: Raw video NALUs.
duration_ticks: Duration in video timescale ticks.
is_keyframe: Whether this is an IDR/sync sample.
pts_ticks: Presentation timestamp in video timescale ticks.
Used to compute composition_time_offset for B-frame
reordering. If None, assumes PTS == DTS (no B-frames).
"""
# Compute composition_time_offset = PTS - DTS
# DTS is the running decode time for this fragment
cts_offset = 0
if pts_ticks is not None:
dts = self._video_decode_time + self._fragment_video_duration
cts_offset = pts_ticks - dts
self._video_samples.append(
FragmentSample(
data=data,
duration=duration_ticks,
is_sync=is_keyframe,
composition_offset=cts_offset,
)
)
self._fragment_video_duration += duration_ticks
def add_audio_sample(self, data: bytes, duration_ticks: int) -> None:
"""Add an audio sample to the current fragment."""
self._audio_samples.append(
FragmentSample(
data=data,
duration=duration_ticks,
is_sync=True,
)
)
def should_flush(self) -> bool:
"""Check if the current fragment has enough data to emit."""
# Flush on keyframe boundaries after accumulating enough duration
if self._fragment_video_duration < self._fragment_threshold:
return False
# Only flush at a keyframe boundary (if there's a pending keyframe)
if len(self._video_samples) > 1 and self._video_samples[-1].is_sync:
return True
return False
def flush_fragment(self, force: bool = False) -> bytes | None:
"""
Flush the current fragment if ready.
Args:
force: Force flush even if fragment duration threshold isn't reached.
Returns:
Fragment bytes (moof+mdat for video + moof+mdat for audio) or None.
"""
if not force and not self.should_flush():
return None
if not self._video_samples and not self._audio_samples:
return None
result = bytearray()
# When flushing at a keyframe, the last sample (the new keyframe)
# belongs to the NEXT fragment. Split there.
if not force and len(self._video_samples) > 1 and self._video_samples[-1].is_sync:
video_to_emit = self._video_samples[:-1]
video_remaining = [self._video_samples[-1]]
else:
video_to_emit = self._video_samples
video_remaining = []
# Emit video fragment
if video_to_emit:
frag = build_fmp4_fragment(
sequence_number=self._sequence_number,
track_id=1,
base_decode_time=self._video_decode_time,
samples=video_to_emit,
)
result.extend(frag)
self._sequence_number += 1
emitted_duration = sum(s.duration for s in video_to_emit)
self._video_decode_time += emitted_duration
# Emit audio fragment (matching time range)
if self._audio_samples:
frag = build_fmp4_fragment(
sequence_number=self._sequence_number,
track_id=2,
base_decode_time=self._audio_decode_time,
samples=self._audio_samples,
)
result.extend(frag)
self._sequence_number += 1
emitted_audio_duration = sum(s.duration for s in self._audio_samples)
self._audio_decode_time += emitted_audio_duration
self._audio_samples = []
# Reset for next fragment
self._video_samples = video_remaining
self._fragment_video_duration = sum(s.duration for s in video_remaining)
return bytes(result) if result else None
    def flush_final(self) -> bytes | None:
        """Emit all remaining buffered samples as the final fragment.

        Returns:
            Fragment bytes, or None when nothing is buffered.
        """
        return self.flush_fragment(force=True)