mirror of
https://github.com/UrloMythus/UnHided.git
synced 2026-04-11 11:50:51 +00:00
835 lines
27 KiB
Python
835 lines
27 KiB
Python
"""
|
|
MP4 container parser for moov atom probing.
|
|
|
|
Provides:
|
|
- MP4Index: seek index extracted from MP4 moov atom (parallel to MKVCueIndex)
|
|
- Top-level atom scanning
|
|
- Sample table parsers (stco, co64, stss, stsz, stts, stsc)
|
|
- Moov-to-cue-point builder
|
|
- rewrite_moov_offsets: adjust stco/co64 in moov for file rearrangement
|
|
|
|
The parsers are the inverse of the builder functions in mp4_muxer.py.
|
|
Box navigation reuses the pattern from ts_muxer.py's read_box/find_box/iter_boxes.
|
|
"""
|
|
|
|
import bisect
|
|
import logging
|
|
import struct
|
|
from dataclasses import dataclass, field
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# =============================================================================
|
|
# MP4 Box Utilities
|
|
# =============================================================================
|
|
|
|
# Minimum bytes needed to read a standard box header: size(4) + type(4).
_BOX_HEADER_SIZE = 8

# ftyp major brands that identify MP4/MOV containers.
# Every entry must be exactly four bytes long because it is compared against
# the 4-byte major_brand field of the ftyp box; brands shorter than four
# characters are padded with trailing spaces.
_MP4_BRANDS = {
    b"isom",
    b"iso2",
    b"iso3",
    b"iso4",
    b"iso5",
    b"iso6",
    b"mp41",
    b"mp42",
    b"M4V ",
    b"M4A ",
    b"f4v ",
    b"kddi",
    b"avc1",
    b"qt  ",  # QuickTime: "qt" followed by two spaces
    b"MSNV",
    b"dash",
    b"3gp4",
    b"3gp5",
    b"3gp6",
}
|
|
|
|
|
|
def is_mp4_header(data: bytes) -> bool:
    """Report whether ``data`` starts with an ftyp box (MP4 signature).

    A complete ftyp box is accepted only when its major brand is listed in
    ``_MP4_BRANDS``.  When the declared box size is larger than the bytes
    supplied, the ``ftyp`` tag alone is accepted, since the box might be
    valid but truncated.
    """
    # The 4-byte size and the 4-byte type are the minimum we can judge on.
    if len(data) < 8 or data[4:8] != b"ftyp":
        return False

    (declared_size,) = struct.unpack_from(">I", data, 0)
    if declared_size < 12:
        # Too small to carry a major brand at all.
        return False
    if declared_size > len(data):
        # might be valid but truncated
        return True
    return data[8:12] in _MP4_BRANDS
|
|
|
|
|
|
def read_box_header(data: bytes, offset: int) -> tuple[bytes, int, int] | None:
|
|
"""
|
|
Read a box header at the given offset.
|
|
|
|
Returns:
|
|
(box_type, header_size, total_box_size) or None if not enough data.
|
|
"""
|
|
if offset + 8 > len(data):
|
|
return None
|
|
|
|
size, box_type = struct.unpack_from(">I4s", data, offset)
|
|
header_size = 8
|
|
|
|
if size == 1: # Extended size (64-bit)
|
|
if offset + 16 > len(data):
|
|
return None
|
|
size = struct.unpack_from(">Q", data, offset + 8)[0]
|
|
header_size = 16
|
|
elif size == 0: # Box extends to end of data
|
|
size = len(data) - offset
|
|
|
|
return box_type, header_size, size
|
|
|
|
|
|
def iter_top_level_boxes(data: bytes):
    """
    Iterate over top-level box headers.

    Boxes are assumed to be laid end-to-end starting at offset 0 of
    ``data``.  Iteration ends when no further header can be read, or right
    after a box whose declared size is smaller than its own header: such a
    box is malformed and there is no trustworthy position for the next
    header, so continuing would only report garbage parsed at a misaligned
    offset.

    Yields:
        (box_type, header_size, total_size, data_offset)
    """
    offset = 0
    data_len = len(data)
    while offset < data_len:
        header = read_box_header(data, offset)
        if header is None:
            break
        box_type, header_size, total_size = header
        yield box_type, header_size, total_size, offset + header_size
        if total_size < header_size:
            # Covers the zero-size case (which would never advance) as well
            # as sizes too small to even hold the header just read.
            break
        offset += total_size
|
|
|
|
|
|
def find_box(data: bytes, target: bytes) -> bytes | None:
    """Return the body (data after header) of the first box of type ``target``, or None."""
    for kind, hdr_len, box_len, body_start in iter_top_level_boxes(data):
        if kind != target:
            continue
        # body_start already points past the header, so step back to the
        # start of the box before adding its total length.
        box_start = body_start - hdr_len
        return data[body_start : box_start + box_len]
    return None
|
|
|
|
|
|
def iter_boxes(data: bytes):
    """Walk the boxes in ``data``, yielding (box_type, box_body_bytes) for each."""
    for kind, hdr_len, box_len, body_start in iter_top_level_boxes(data):
        # The body is whatever remains of the box once the header is removed.
        body_end = body_start + (box_len - hdr_len)
        yield kind, data[body_start:body_end]
|
|
|
|
|
|
# =============================================================================
|
|
# Sample Table Parsers (inverse of mp4_muxer.py builders)
|
|
# =============================================================================
|
|
|
|
|
|
def parse_full_box_header(data: bytes) -> tuple[int, int, int]:
    """
    Split the full box prefix (1-byte version + 3-byte flags) off a box body.

    Returns:
        (version, flags, header_size).  header_size is 4 on success; when
        fewer than 4 bytes are supplied all three values are 0.
    """
    if len(data) < 4:
        return 0, 0, 0
    # flags is a 24-bit big-endian integer stored right after the version byte.
    return data[0], int.from_bytes(data[1:4], "big"), 4
|
|
|
|
|
|
def parse_stco(data: bytes) -> list[int]:
    """
    Parse Chunk Offset box (stco) - 32-bit offsets.

    Layout: version(1) + flags(3) + entry_count(4) + [offset(4)]...

    Args:
        data: The stco box body (the bytes after the size/type header).

    Returns:
        The chunk offsets in table order, or an empty list when the body is
        too short for its fixed fields or for the entry count it declares.
    """
    if len(data) < 8:
        return []
    # entry_count sits right after the 4-byte version + flags prefix.
    (entry_count,) = struct.unpack_from(">I", data, 4)
    table_start = 8
    if len(data) < table_start + entry_count * 4:
        return []
    # Decode the whole table in one struct call instead of one call per entry.
    return list(struct.unpack_from(f">{entry_count}I", data, table_start))
|
|
|
|
|
|
def parse_co64(data: bytes) -> list[int]:
    """
    Parse Chunk Offset box (co64) - 64-bit offsets.

    Layout: version(1) + flags(3) + entry_count(4) + [offset(8)]...

    Args:
        data: The co64 box body (the bytes after the size/type header).

    Returns:
        The chunk offsets in table order, or an empty list when the body is
        too short for its fixed fields or for the entry count it declares.
    """
    if len(data) < 8:
        return []
    # entry_count sits right after the 4-byte version + flags prefix.
    (entry_count,) = struct.unpack_from(">I", data, 4)
    table_start = 8
    if len(data) < table_start + entry_count * 8:
        return []
    # Decode the whole table in one struct call instead of one call per entry.
    return list(struct.unpack_from(f">{entry_count}Q", data, table_start))
|
|
|
|
|
|
def parse_stss(data: bytes) -> list[int]:
    """
    Parse Sync Sample box (stss) - keyframe indices (1-based).

    Layout: version(1) + flags(3) + entry_count(4) + [sample_number(4)]...

    Args:
        data: The stss box body (the bytes after the size/type header).

    Returns:
        The sample numbers in table order, or an empty list when the body
        is too short for its fixed fields or for the entry count it declares.
    """
    if len(data) < 8:
        return []
    # entry_count sits right after the 4-byte version + flags prefix.
    (entry_count,) = struct.unpack_from(">I", data, 4)
    table_start = 8
    if len(data) < table_start + entry_count * 4:
        return []
    # Decode the whole table in one struct call instead of one call per entry.
    return list(struct.unpack_from(f">{entry_count}I", data, table_start))
|
|
|
|
|
|
def parse_stsz(data: bytes) -> tuple[int, list[int]]:
    """
    Parse Sample Size box (stsz).

    Layout: version(1) + flags(3) + sample_size(4) + sample_count(4) + [size(4)]...

    Args:
        data: The stsz box body (the bytes after the size/type header).

    Returns:
        (uniform_size, sizes_list).
        If uniform_size > 0, all samples have that size and sizes_list is empty.
        Otherwise, sizes_list contains per-sample sizes.
        (0, []) is returned when the body is too short for its fixed fields
        or for the sample count it declares.
    """
    if len(data) < 12:
        return 0, []
    # sample_size and sample_count follow the 4-byte version + flags prefix.
    sample_size, sample_count = struct.unpack_from(">II", data, 4)
    if sample_size > 0:
        return sample_size, []

    table_start = 12
    if len(data) < table_start + sample_count * 4:
        return 0, []
    # Decode the whole table in one struct call instead of one call per entry.
    return 0, list(struct.unpack_from(f">{sample_count}I", data, table_start))
|
|
|
|
|
|
def parse_stts(data: bytes) -> list[tuple[int, int]]:
    """
    Parse Time-to-Sample box (stts) - run-length encoded durations.

    Layout: version(1) + flags(3) + entry_count(4) + [sample_count(4) + sample_delta(4)]...

    Args:
        data: The stts box body (the bytes after the size/type header).

    Returns:
        List of (sample_count, sample_delta) entries, or an empty list when
        the body is too short for its fixed fields or for the entry count it
        declares.
    """
    if len(data) < 8:
        return []
    # entry_count sits right after the 4-byte version + flags prefix.
    (entry_count,) = struct.unpack_from(">I", data, 4)
    table_start = 8
    table_end = table_start + entry_count * 8
    if len(data) < table_end:
        return []
    # iter_unpack hands back one (count, delta) tuple per 8-byte record; the
    # memoryview slice bounds it to the declared table without copying.
    return list(struct.iter_unpack(">II", memoryview(data)[table_start:table_end]))
|
|
|
|
|
|
def parse_stsc(data: bytes) -> list[tuple[int, int, int]]:
    """
    Parse Sample-to-Chunk box (stsc).

    Layout: version(1) + flags(3) + entry_count(4) +
            [first_chunk(4) + samples_per_chunk(4) + sample_desc_index(4)]...

    Args:
        data: The stsc box body (the bytes after the size/type header).

    Returns:
        List of (first_chunk, samples_per_chunk, sample_desc_index) entries.
        first_chunk is 1-based.  An empty list is returned when the body is
        too short for its fixed fields or for the entry count it declares.
    """
    if len(data) < 8:
        return []
    # entry_count sits right after the 4-byte version + flags prefix.
    (entry_count,) = struct.unpack_from(">I", data, 4)
    table_start = 8
    table_end = table_start + entry_count * 12
    if len(data) < table_end:
        return []
    # iter_unpack hands back one 3-tuple per 12-byte record; the memoryview
    # slice bounds it to the declared table without copying.
    return list(struct.iter_unpack(">III", memoryview(data)[table_start:table_end]))
|
|
|
|
|
|
def parse_mdhd(data: bytes) -> tuple[int, int]:
    """
    Read timescale and duration out of a Media Header box (mdhd) body.

    Returns:
        (timescale, duration) in media timescale units, or (0, 0) when the
        body is too short for the layout its version byte selects.
    """
    if len(data) < 4:
        return 0, 0

    if data[0] == 1:
        # 64-bit: skip version(1)+flags(3)+creation(8)+modification(8);
        # timescale is 32-bit, duration is 64-bit.
        required, layout, fields_at = 32, ">IQ", 20
    else:
        # 32-bit: skip version(1)+flags(3)+creation(4)+modification(4);
        # timescale and duration are both 32-bit.
        required, layout, fields_at = 20, ">II", 12

    if len(data) < required:
        return 0, 0
    timescale, duration = struct.unpack_from(layout, data, fields_at)
    return timescale, duration
|
|
|
|
|
|
def parse_stsd_codec(data: bytes) -> str:
    """
    Extract the codec FourCC of the first entry in a Sample Description box (stsd).

    Returns the codec name as a string (e.g. "avc1", "hvc1", "mp4a"), or an
    empty string when the body is too short or the FourCC is not ASCII.
    """
    # version(1)+flags(3)+entry_count(4), then the first entry's
    # size(4)+type(4): the type occupies bytes 12..15.
    if len(data) < 16:
        return ""
    fourcc = data[12:16]
    try:
        name = fourcc.decode("ascii")
    except (UnicodeDecodeError, ValueError):
        return ""
    return name.strip()
|
|
|
|
|
|
# =============================================================================
|
|
# MP4 Index (parallel to MKVCueIndex)
|
|
# =============================================================================
|
|
|
|
|
|
@dataclass
|
|
class MP4Index:
|
|
"""
|
|
Seek index extracted from an MP4 file's moov atom.
|
|
|
|
Parallel to ``MKVCueIndex`` for MKV files. Provides keyframe-indexed
|
|
cue points for time-based seeking and the raw moov bytes needed to
|
|
reconstruct a streamable (faststart) MP4 for on-the-fly demuxing.
|
|
"""
|
|
|
|
duration_ms: float = 0.0
|
|
timescale: int = 0
|
|
cue_points: list[tuple[float, int]] = field(default_factory=list) # [(time_ms, byte_offset), ...]
|
|
moov_offset: int = 0 # Absolute file offset where moov atom starts
|
|
moov_size: int = 0 # Total size of the moov atom (header + body)
|
|
moov_data: bytes = b"" # Raw moov atom bytes (for prepending to mdat pipe)
|
|
ftyp_data: bytes = b"" # Raw ftyp atom bytes (for prepending before moov)
|
|
mdat_offset: int = 0 # Absolute file offset where mdat atom starts
|
|
mdat_size: int = 0 # Total size of the mdat atom
|
|
video_codec: str = "" # e.g. "avc1", "hvc1", "mp4v"
|
|
audio_codec: str = "" # e.g. "mp4a", "ac-3"
|
|
|
|
def byte_offset_for_time(self, time_ms: float) -> tuple[int, float]:
|
|
"""
|
|
Find the byte offset for the nearest keyframe at or before time_ms.
|
|
|
|
Returns:
|
|
(absolute_byte_offset, actual_keyframe_time_ms)
|
|
"""
|
|
if not self.cue_points:
|
|
return 0, 0.0
|
|
|
|
times = [cp[0] for cp in self.cue_points]
|
|
idx = bisect.bisect_right(times, time_ms) - 1
|
|
if idx < 0:
|
|
idx = 0
|
|
|
|
cue_time_ms, byte_offset = self.cue_points[idx]
|
|
return byte_offset, cue_time_ms
|
|
|
|
|
|
# =============================================================================
|
|
# Moov -> Cue Points Builder
|
|
# =============================================================================
|
|
|
|
|
|
def _find_nested_box(data: bytes, *path: bytes) -> bytes | None:
    """Descend a box hierarchy, e.g. ``_find_nested_box(data, b"trak", b"mdia")``.

    Each name in ``path`` is looked up inside the body found for the
    previous one.  Returns the body of the last box, or None as soon as any
    box along the path is missing.  With an empty path ``data`` itself is
    returned.
    """
    node: bytes | None = data
    for name in path:
        node = find_box(node, name)
        if node is None:
            break
    return node
|
|
|
|
|
|
def _iter_stts_durations(stts_entries: list[tuple[int, int]]):
    """Lazily yield one duration per sample from run-length encoded stts entries."""
    for run_length, delta in stts_entries:
        for _ in range(run_length):
            yield delta


def _chunk_sample_counts_from_stsc(stsc_entries: list[tuple[int, int, int]], total_chunks: int) -> list[int]:
    """Expand stsc runs into a list mapping chunk index (0-based) to samples per chunk."""
    if not stsc_entries:
        # Default: 1 sample per chunk
        return [1] * total_chunks

    counts = [0] * total_chunks
    for i, (first_chunk, spc, _sdi) in enumerate(stsc_entries):
        # first_chunk is 1-based; a run lasts until the next entry's
        # first_chunk, or to the final chunk for the last entry.
        run_start = first_chunk - 1
        if i + 1 < len(stsc_entries):
            run_end = stsc_entries[i + 1][0] - 1
        else:
            run_end = total_chunks
        # Clamp to the chunks that actually exist so a malformed table can
        # neither wrap around to the end of the list (first_chunk == 0) nor
        # force a loop over billions of non-existent chunks.
        run_start = max(run_start, 0)
        run_end = min(run_end, total_chunks)
        if run_start < run_end:
            counts[run_start:run_end] = [spc] * (run_end - run_start)
    return counts


def build_cue_points_from_moov(moov_body: bytes) -> tuple[list[tuple[float, int]], float, int, str, str]:
    """
    Parse a moov body to build keyframe-indexed cue points.

    Walks the first video trak's stbl to extract:
    - Chunk offsets (stco/co64)
    - Keyframe sample indices (stss)
    - Sample sizes (stsz)
    - Sample durations (stts)
    - Sample-to-chunk mapping (stsc)
    - Timescale and duration from mdhd

    The codec FourCCs of the first video trak and the first audio trak are
    reported as well.  Timescale/duration come from the video trak's mdhd
    when it has a non-zero timescale, otherwise from mvhd.

    Args:
        moov_body: The moov box body (the bytes after the moov box header).

    Returns:
        (cue_points, duration_ms, timescale, video_codec, audio_codec)
        cue_points is a list of (time_ms, byte_offset) tuples; it is empty
        when no video stbl, no chunk offsets or no timescale is available.
    """
    cue_points: list[tuple[float, int]] = []
    duration_ms = 0.0
    timescale = 0
    video_codec = ""
    audio_codec = ""

    # Find all traks
    video_stbl = None
    video_mdhd = None

    offset = 0
    while offset < len(moov_body):
        result = read_box_header(moov_body, offset)
        if result is None:
            break
        box_type, hdr_size, total_size = result

        if box_type == b"trak":
            trak_body = moov_body[offset + hdr_size : offset + total_size]

            # Check handler type to identify video/audio
            hdlr_data = _find_nested_box(trak_body, b"mdia", b"hdlr")
            handler_type = b""
            if hdlr_data and len(hdlr_data) >= 12:
                # hdlr: version(1)+flags(3)+pre_defined(4)+handler_type(4)
                handler_type = hdlr_data[8:12]

            if handler_type == b"vide" and video_stbl is None:
                video_stbl = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl")
                video_mdhd_data = _find_nested_box(trak_body, b"mdia", b"mdhd")
                if video_mdhd_data:
                    video_mdhd = video_mdhd_data

                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if stsd_data:
                    video_codec = parse_stsd_codec(stsd_data)

            elif handler_type == b"soun" and not audio_codec:
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if stsd_data:
                    audio_codec = parse_stsd_codec(stsd_data)

        elif box_type == b"mvhd":
            # Fallback: parse mvhd for timescale/duration if no mdhd.
            # The timescale and duration fields are read at the same offsets
            # as in mdhd for both versions, so parse_mdhd's bounds-checked
            # decoding is reused; it yields (0, 0) for a truncated body.
            mvhd_body = moov_body[offset + hdr_size : offset + total_size]
            ts, dur = parse_mdhd(mvhd_body)
            if timescale == 0 and ts:
                timescale = ts
                duration_ms = dur / ts * 1000.0

        if total_size < hdr_size:
            # Malformed size (including 0): the next header position cannot
            # be trusted, and a size of 0 would never advance.
            break
        offset += total_size

    # Parse mdhd for video timescale (more precise than mvhd)
    if video_mdhd:
        ts, dur = parse_mdhd(video_mdhd)
        if ts > 0:
            timescale = ts
            duration_ms = dur / ts * 1000.0

    if video_stbl is None:
        logger.warning("[mp4_parser] No video stbl found in moov")
        return cue_points, duration_ms, timescale, video_codec, audio_codec

    # Parse sample tables from video stbl
    stco_data = find_box(video_stbl, b"stco")
    co64_data = find_box(video_stbl, b"co64")
    stss_data = find_box(video_stbl, b"stss")
    stsz_data = find_box(video_stbl, b"stsz")
    stts_data = find_box(video_stbl, b"stts")
    stsc_data = find_box(video_stbl, b"stsc")

    # Chunk offsets
    chunk_offsets = parse_co64(co64_data) if co64_data else (parse_stco(stco_data) if stco_data else [])

    # Keyframe sample numbers (1-based)
    keyframe_samples = set(parse_stss(stss_data)) if stss_data else set()
    all_are_keyframes = not stss_data  # No stss means all samples are sync

    # Sample sizes
    uniform_size, size_list = parse_stsz(stsz_data) if stsz_data else (0, [])

    # Sample durations (run-length encoded)
    stts_entries = parse_stts(stts_data) if stts_data else []

    # Sample-to-chunk mapping
    stsc_entries = parse_stsc(stsc_data) if stsc_data else []

    if not chunk_offsets or timescale == 0:
        logger.warning(
            "[mp4_parser] Missing data: chunks=%d, timescale=%d",
            len(chunk_offsets),
            timescale,
        )
        return cue_points, duration_ms, timescale, video_codec, audio_codec

    chunk_sample_counts = _chunk_sample_counts_from_stsc(stsc_entries, len(chunk_offsets))
    total_samples = sum(chunk_sample_counts)

    # Durations are consumed one per sample as the walk proceeds rather than
    # expanded into a list up front, so an stts entry with an enormous
    # sample_count cannot trigger a huge allocation.
    durations = _iter_stts_durations(stts_entries)

    # Build cumulative timestamp for each sample and map keyframes to byte offsets
    sample_index = 0  # 0-based sample index
    current_time = 0  # in timescale units

    for chunk_offset, samples_in_chunk in zip(chunk_offsets, chunk_sample_counts):
        byte_pos = chunk_offset

        for _ in range(samples_in_chunk):
            # stss sample numbers are 1-based
            if all_are_keyframes or (sample_index + 1) in keyframe_samples:
                cue_points.append((current_time / timescale * 1000.0, byte_pos))

            # Advance byte position by this sample's size; samples beyond
            # the end of the size table leave the position unchanged.
            if uniform_size > 0:
                byte_pos += uniform_size
            elif sample_index < len(size_list):
                byte_pos += size_list[sample_index]

            # Advance timestamp; once stts is exhausted the time stops moving.
            current_time += next(durations, 0)

            sample_index += 1

    logger.info(
        "[mp4_parser] Built %d cue points from %d samples, duration=%.1fs, video=%s, audio=%s",
        len(cue_points),
        total_samples,
        duration_ms / 1000.0,
        video_codec,
        audio_codec,
    )

    return cue_points, duration_ms, timescale, video_codec, audio_codec
|
|
|
|
|
|
# =============================================================================
|
|
# Moov Offset Rewriting (for faststart pipe construction)
|
|
# =============================================================================
|
|
|
|
|
|
def _rewrite_stco_in_place(data: bytearray, box_start: int, box_size: int, delta: int) -> int:
    """Rewrite stco chunk offsets by adding delta. Returns number of entries fixed.

    Args:
        data: Buffer holding the box; modified in place.
        box_start: Index in ``data`` of the stco body (its version byte).
        box_size: Length of the stco body in bytes.
        delta: Amount added to every 32-bit chunk offset.

    Returns:
        The number of offsets actually rewritten.  If the body is shorter
        than its entry_count claims, only the entries that fit completely
        are rewritten and counted.

    Raises:
        struct.error: If a shifted offset does not fit in an unsigned
            32-bit field.  The buffer is left unmodified in that case.
    """
    box_end = box_start + box_size
    # FullBox header: version(1) + flags(3) = 4 bytes
    count_pos = box_start + 4
    table_start = count_pos + 4
    if table_start > box_end:
        return 0

    (entry_count,) = struct.unpack_from(">I", data, count_pos)
    # Never touch bytes past the end of the box, whatever entry_count says.
    usable = min(entry_count, (box_end - table_start) // 4)
    if usable <= 0:
        return 0

    layout = f">{usable}I"
    shifted = [old + delta for old in struct.unpack_from(layout, data, table_start)]
    # Pack the whole table before writing so that an out-of-range value
    # raises while the buffer is still untouched.
    data[table_start : table_start + usable * 4] = struct.pack(layout, *shifted)
    return usable
|
|
|
|
|
|
def _rewrite_co64_in_place(data: bytearray, box_start: int, box_size: int, delta: int) -> int:
    """Rewrite co64 chunk offsets by adding delta. Returns number of entries fixed.

    Args:
        data: Buffer holding the box; modified in place.
        box_start: Index in ``data`` of the co64 body (its version byte).
        box_size: Length of the co64 body in bytes.
        delta: Amount added to every 64-bit chunk offset.

    Returns:
        The number of offsets actually rewritten.  If the body is shorter
        than its entry_count claims, only the entries that fit completely
        are rewritten and counted.

    Raises:
        struct.error: If a shifted offset does not fit in an unsigned
            64-bit field.  The buffer is left unmodified in that case.
    """
    box_end = box_start + box_size
    # FullBox header: version(1) + flags(3) = 4 bytes
    count_pos = box_start + 4
    table_start = count_pos + 4
    if table_start > box_end:
        return 0

    (entry_count,) = struct.unpack_from(">I", data, count_pos)
    # Never touch bytes past the end of the box, whatever entry_count says.
    usable = min(entry_count, (box_end - table_start) // 8)
    if usable <= 0:
        return 0

    layout = f">{usable}Q"
    shifted = [old + delta for old in struct.unpack_from(layout, data, table_start)]
    # Pack the whole table before writing so that an out-of-range value
    # raises while the buffer is still untouched.
    data[table_start : table_start + usable * 8] = struct.pack(layout, *shifted)
    return usable
|
|
|
|
|
|
def _walk_and_rewrite(data: bytearray, start: int, end: int, delta: int) -> int:
    """
    Visit every box in ``data[start:end]`` and shift the entries of each
    stco/co64 box found by ``delta``, descending into the container boxes
    moov, trak, mdia, minf and stbl.

    The walk stops at the first box whose size is below 8 or reaches past
    ``end``.

    Returns total number of offset entries rewritten.
    """
    containers = (b"moov", b"trak", b"mdia", b"minf", b"stbl")
    rewritten = 0
    cursor = start

    while cursor + 8 <= end:
        (box_len,) = struct.unpack_from(">I", data, cursor)
        kind = data[cursor + 4 : cursor + 8]
        header_len = 8

        if box_len == 1:
            # 64-bit size follows the type field.
            if cursor + 16 > end:
                break
            (box_len,) = struct.unpack_from(">Q", data, cursor + 8)
            header_len = 16
        elif box_len == 0:
            # Box runs to the end of the enclosing range.
            box_len = end - cursor

        box_end = cursor + box_len
        if box_len < 8 or box_end > end:
            break

        payload_start = cursor + header_len
        payload_len = box_len - header_len

        if kind == b"stco":
            rewritten += _rewrite_stco_in_place(data, payload_start, payload_len, delta)
        elif kind == b"co64":
            rewritten += _rewrite_co64_in_place(data, payload_start, payload_len, delta)
        elif kind in containers:
            # Container box -- recurse into children
            rewritten += _walk_and_rewrite(data, payload_start, box_end, delta)

        cursor = box_end

    return rewritten
|
|
|
|
|
|
def _read_visual_sample_entry(stsd_data: bytes) -> tuple[int, int, bytes]:
    """Return (width, height, codec_private) from the first sample entry of an stsd body.

    ``stsd_data`` must be at least 16 bytes long.  codec_private is the body
    of the first avcC (preferred) or hvcC child box, or empty bytes when
    neither is present.
    """
    # The stsd structure is:
    #   version(1) + flags(3) + entry_count(4)
    #   then entry: size(4) + type(4) + ... + nested boxes
    entry_start = 8  # skip version+flags+entry_count
    entry_body_start = entry_start + 8  # skip size+type
    entry_size = struct.unpack_from(">I", stsd_data, entry_start)[0]
    entry_end = min(entry_start + entry_size, len(stsd_data))

    # Visual sample entry: 6 reserved + 2 data_ref_idx + ...
    # At offset 24 from entry body start: width(2) + height(2)
    width = 0
    height = 0
    vis_offset = entry_body_start + 24
    if vis_offset + 4 <= entry_end:
        width, height = struct.unpack_from(">HH", stsd_data, vis_offset)

    # Scan nested boxes for avcC or hvcC
    # Visual sample entry fixed fields = 70 bytes from entry body
    codec_private = b""
    nested_start = entry_body_start + 70
    if nested_start < entry_end:
        nested_data = stsd_data[nested_start:entry_end]
        for target in (b"avcC", b"hvcC"):
            found = find_box(nested_data, target)
            if found:
                codec_private = found
                break

    return width, height, codec_private


def extract_video_track_from_moov(moov_data: bytes):
    """
    Extract video codec configuration from an MP4 moov atom.

    Walks the moov box tree to find the first video trak that has a usable
    stsd, extracts its resolution and codec-private data (avcC/hvcC), and
    returns a synthetic ``MKVTrack`` suitable for building an fMP4 init
    segment.

    Args:
        moov_data: Raw bytes of the complete moov box (header + body).

    Returns:
        An ``MKVTrack`` with video metadata, or ``None`` if no video track
        is found.
    """
    from mediaflow_proxy.remuxer.ebml_parser import (
        CODEC_ID_H264,
        CODEC_ID_H265,
        MKVTrack,
    )

    # Strip the moov box header to get the body
    if len(moov_data) < 8:
        return None
    raw_size = struct.unpack_from(">I", moov_data, 0)[0]
    hdr_size = 16 if raw_size == 1 else 8
    moov_body = moov_data[hdr_size:]

    # Walk traks looking for video handler
    offset = 0
    while offset < len(moov_body):
        result = read_box_header(moov_body, offset)
        if result is None:
            break
        box_type, box_hdr_size, total_size = result

        if total_size < box_hdr_size:
            # Malformed size.  A size of 0 would never advance the offset
            # (an endless loop) and anything below the header size leaves no
            # trustworthy position for the next box.
            break

        if box_type == b"trak":
            trak_body = moov_body[offset + box_hdr_size : offset + total_size]

            # Check handler type
            # hdlr: version(1)+flags(3)+pre_defined(4)+handler_type(4)
            hdlr_data = _find_nested_box(trak_body, b"mdia", b"hdlr")
            handler_type = b""
            if hdlr_data and len(hdlr_data) >= 12:
                handler_type = hdlr_data[8:12]

            if handler_type == b"vide":
                # Found video trak -- extract stsd for codec config.  A trak
                # without a usable stsd is skipped and the walk continues.
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if stsd_data and len(stsd_data) >= 16:
                    codec_name = parse_stsd_codec(stsd_data)

                    # Map MP4 codec names to MKV codec IDs
                    if codec_name in ("avc1", "avc3"):
                        mkv_codec_id = CODEC_ID_H264
                    elif codec_name in ("hvc1", "hev1"):
                        mkv_codec_id = CODEC_ID_H265
                    else:
                        mkv_codec_id = f"V_MP4/{codec_name}"

                    width, height, codec_private = _read_visual_sample_entry(stsd_data)

                    # Get duration from mdhd if available.  parse_mdhd
                    # bounds-checks both mdhd versions and yields (0, 0)
                    # for a truncated body.
                    default_duration_ns = 0
                    mdhd_data = _find_nested_box(trak_body, b"mdia", b"mdhd")
                    ts, dur = parse_mdhd(mdhd_data) if mdhd_data else (0, 0)
                    if ts > 0 and dur > 0:
                        # Rough estimate: assume 24fps if we can't determine.
                        default_duration_ns = int(1_000_000_000 / 24)

                    return MKVTrack(
                        track_number=1,
                        track_type=1,  # video
                        codec_id=mkv_codec_id,
                        codec_private=codec_private,
                        pixel_width=width,
                        pixel_height=height,
                        default_duration_ns=default_duration_ns,
                    )

        offset += total_size

    return None
|
|
|
|
|
|
def rewrite_moov_offsets(moov_data: bytes, delta: int) -> bytes:
    """
    Return a copy of a moov atom with ``delta`` added to every stco/co64
    chunk offset.

    Chunk offsets in moov refer to positions in the original file layout.
    When the file is rearranged for pipe streaming (moov placed ahead of
    mdat), those offsets have to be shifted by the distance the media data
    moved; the caller supplies that distance as ``delta``.

    Args:
        moov_data: Raw bytes of the complete moov box (header + body).
        delta: Offset adjustment to add to every chunk offset.

    Returns:
        Modified moov bytes with updated chunk offsets.
    """
    patched = bytearray(moov_data)

    # A size field of 1 means a 64-bit size follows, making the moov header
    # 16 bytes long instead of 8.
    (size_field,) = struct.unpack_from(">I", patched, 0)
    body_start = 8 if size_field != 1 else 16

    rewritten = _walk_and_rewrite(patched, body_start, len(patched), delta)
    logger.info("[mp4_parser] Rewrote %d chunk offset entries (delta=%+d)", rewritten, delta)

    return bytes(patched)
|