""" Pure Python EBML/MKV parser for media remuxing. Provides two levels of MKV parsing: Level 1 (Seeking): Parse EBML Header, SeekHead, Info, and Cues to build a time-to-byte-offset map for fast seeking. Level 2 (Demuxing): Parse Tracks for codec metadata (CodecID, CodecPrivate, video/audio parameters) and Cluster/SimpleBlock/BlockGroup for extracting individual media frames with timestamps. """ import bisect import logging import struct from dataclasses import dataclass, field logger = logging.getLogger(__name__) # ============================================================================= # EBML Element IDs (Matroska spec) # ============================================================================= # Top-level EBML_HEADER = 0x1A45DFA3 SEGMENT = 0x18538067 # SeekHead SEEK_HEAD = 0x114D9B74 SEEK = 0x4DBB SEEK_ID = 0x53AB SEEK_POSITION = 0x53AC # Info INFO = 0x1549A966 TIMESTAMP_SCALE = 0x2AD7B1 DURATION = 0x4489 # Tracks TRACKS = 0x1654AE6B TRACK_ENTRY = 0xAE TRACK_NUMBER = 0xD7 TRACK_UID = 0x73C5 TRACK_TYPE = 0x83 CODEC_ID = 0x86 CODEC_PRIVATE = 0x63A2 DEFAULT_DURATION = 0x23E383 CODEC_DELAY = 0x56AA SEEK_PRE_ROLL = 0x56BB # Video track settings VIDEO = 0xE0 PIXEL_WIDTH = 0xB0 PIXEL_HEIGHT = 0xBA DISPLAY_WIDTH = 0x54B0 DISPLAY_HEIGHT = 0x54BA # Audio track settings AUDIO = 0xE1 SAMPLING_FREQUENCY = 0xB5 OUTPUT_SAMPLING_FREQUENCY = 0x78B5 CHANNELS = 0x9F BIT_DEPTH = 0x6264 # Cluster CLUSTER = 0x1F43B675 CLUSTER_TIMESTAMP = 0xE7 SIMPLE_BLOCK = 0xA3 BLOCK_GROUP = 0xA0 BLOCK = 0xA1 BLOCK_DURATION = 0x9B # Cues CUES = 0x1C53BB6B CUE_POINT = 0xBB CUE_TIME = 0xB3 CUE_TRACK_POSITIONS = 0xB7 CUE_TRACK = 0xF7 CUE_CLUSTER_POSITION = 0xF1 # Container elements (have children, not raw data) _CONTAINER_IDS = frozenset( { EBML_HEADER, SEGMENT, SEEK_HEAD, SEEK, INFO, TRACKS, TRACK_ENTRY, VIDEO, AUDIO, CLUSTER, BLOCK_GROUP, CUES, CUE_POINT, CUE_TRACK_POSITIONS, } ) # Unknown/indeterminate size sentinel UNKNOWN_SIZE = -1 # ============================================================================= # Low-level EBML parsing # ============================================================================= def read_vint(data: bytes, pos: int) -> tuple[int, int, int]: """ Read a variable-length integer (VINT) from EBML data. Returns: (raw_value, value_without_marker, new_pos) raw_value includes the VINT marker bit. value_without_marker has the marker bit masked off (for element sizes). """ if pos >= len(data): raise ValueError(f"EBML VINT: position {pos} beyond data length {len(data)}") first = data[pos] if first == 0: raise ValueError(f"EBML VINT: invalid leading byte 0x00 at pos {pos}") # Determine length from leading byte length = 1 mask = 0x80 while mask and not (first & mask): length += 1 mask >>= 1 if pos + length > len(data): raise ValueError(f"EBML VINT: need {length} bytes at pos {pos}, only {len(data) - pos} available") # Read the raw value raw = 0 for i in range(length): raw = (raw << 8) | data[pos + i] # Mask off the leading marker bit for size values value = raw & ~(1 << (7 * length)) # Check for unknown/indeterminate size (all value bits set) all_ones = (1 << (7 * length)) - 1 if value == all_ones: value = UNKNOWN_SIZE return raw, value, pos + length def read_element_id(data: bytes, pos: int) -> tuple[int, int]: """ Read an EBML element ID. Returns: (element_id, new_pos) """ raw, _, new_pos = read_vint(data, pos) return raw, new_pos def read_element_size(data: bytes, pos: int) -> tuple[int, int]: """ Read an EBML element data size. Returns: (size, new_pos) where size may be UNKNOWN_SIZE (-1) """ _, value, new_pos = read_vint(data, pos) return value, new_pos def read_uint(data: bytes, pos: int, length: int) -> int: """Read an unsigned integer of N bytes (big-endian).""" if length == 0: return 0 value = 0 for i in range(length): value = (value << 8) | data[pos + i] return value def read_float(data: bytes, pos: int, length: int) -> float: """Read a 4 or 8 byte IEEE float (big-endian).""" if length == 4: return struct.unpack(">f", data[pos : pos + 4])[0] elif length == 8: return struct.unpack(">d", data[pos : pos + 8])[0] raise ValueError(f"EBML float must be 4 or 8 bytes, got {length}") def read_element_id_bytes(data: bytes, pos: int) -> tuple[bytes, int]: """ Read an EBML element ID and return it as raw bytes (for SeekID matching). Returns: (id_bytes, new_pos) """ if pos >= len(data): raise ValueError(f"read_element_id_bytes: pos {pos} beyond data length {len(data)}") first = data[pos] length = 1 mask = 0x80 while mask and not (first & mask): length += 1 mask >>= 1 return data[pos : pos + length], pos + length # ============================================================================= # Element iteration # ============================================================================= def iter_elements(data: bytes, start: int, end: int): """ Iterate over EBML elements within a range. Yields: (element_id, data_offset, data_size, element_start) element_start is the byte position of the element ID. data_offset is where the element's data begins (after ID + size). data_size is the declared size (may be UNKNOWN_SIZE). """ pos = start while pos < end: try: element_start = pos eid, pos2 = read_element_id(data, pos) size, pos3 = read_element_size(data, pos2) except (ValueError, IndexError): break yield eid, pos3, size, element_start if size == UNKNOWN_SIZE: break pos = pos3 + size # ============================================================================= # High-level MKV parsing # ============================================================================= # MKV Track types (needed early for build_cue_index) TRACK_TYPE_VIDEO = 1 TRACK_TYPE_AUDIO = 2 TRACK_TYPE_SUBTITLE = 17 # Common MKV codec IDs (needed early for bitrate parsing in build_cue_index) CODEC_ID_H264 = "V_MPEG4/ISO/AVC" CODEC_ID_H265 = "V_MPEGH/ISO/HEVC" CODEC_ID_AAC = "A_AAC" CODEC_ID_AC3 = "A_AC3" CODEC_ID_EAC3 = "A_EAC3" CODEC_ID_OPUS = "A_OPUS" CODEC_ID_VORBIS = "A_VORBIS" CODEC_ID_FLAC = "A_FLAC" CODEC_ID_SRT = "S_TEXT/UTF8" CODEC_ID_ASS = "S_TEXT/ASS" @dataclass class MKVCueIndex: """Seek index extracted from an MKV file's Cues element.""" duration_ms: float = 0.0 timestamp_scale: int = 1_000_000 # nanoseconds per tick (default = 1ms) cue_points: list[tuple[float, int]] = field(default_factory=list) # [(time_ms, byte_offset), ...] segment_data_offset: int = 0 # Byte offset where Segment data begins in the file first_cluster_offset: int = 0 # Absolute file offset of the first Cluster element seek_header: bytes = b"" # Synthetic MKV header for seeking (EBML + Segment(UNKNOWN) + Info + Tracks) # Track metadata for size estimation and init segment generation audio_codec_id: str = "" # e.g. "A_EAC3", "A_AC3" audio_bitrate: int = 0 # Input audio bitrate in bits/s (from frame header parsing) audio_channels: int = 0 audio_sample_rate: float = 0.0 video_codec_id: str = "" # e.g. "V_MPEG4/ISO/AVC" video_codec_private: bytes = b"" # avcC / hvcC extradata for init segment video_width: int = 0 video_height: int = 0 video_fps: float = 0.0 # From default_duration_ns (0 = unknown) video_default_duration_ns: int = 0 # Raw default_duration_ns for MKVTrack def byte_offset_for_time(self, time_ms: float) -> tuple[int, float]: """ Find the cluster byte offset for the nearest keyframe at or before time_ms. Returns: (absolute_byte_offset, actual_keyframe_time_ms) """ if not self.cue_points: return 0, 0.0 # cue_points is sorted by time_ms times = [cp[0] for cp in self.cue_points] idx = bisect.bisect_right(times, time_ms) - 1 if idx < 0: idx = 0 cue_time_ms, cluster_offset = self.cue_points[idx] # cluster_offset is relative to Segment data start absolute_offset = self.segment_data_offset + cluster_offset return absolute_offset, cue_time_ms def estimate_fmp4_size( self, mkv_file_size: int, output_audio_bitrate: int = 192000, ) -> int | None: """ Estimate the total fMP4 output size from known MKV file size. Uses the audio bitrate delta (input vs output) and duration to calculate how much the audio track shrinks or grows. Video is copied unchanged. Container overhead difference is accounted for. Returns: Estimated fMP4 size in bytes, or None if insufficient metadata. """ if self.duration_ms <= 0 or self.audio_bitrate <= 0: return None duration_s = self.duration_ms / 1000.0 input_audio_bytes = self.audio_bitrate * duration_s / 8 output_audio_bytes = output_audio_bitrate * duration_s / 8 audio_delta = output_audio_bytes - input_audio_bytes # negative = shrinks # fMP4 container overhead: ~430 bytes/s for moof/trun boxes # (2s fragments = ~3 MB for 2h file) fmp4_overhead = int(duration_s * 430) estimated = int(mkv_file_size + audio_delta + fmp4_overhead) return max(estimated, 0) def parse_ebml_header(data: bytes) -> int: """ Validate EBML header and find the Segment element. Returns: Byte offset where the Segment element's data begins (after its header). """ pos = 0 # Parse EBML Header element eid, pos = read_element_id(data, pos) if eid != EBML_HEADER: raise ValueError(f"Not an EBML file: expected 0x{EBML_HEADER:X}, got 0x{eid:X}") size, pos = read_element_size(data, pos) if size == UNKNOWN_SIZE: raise ValueError("EBML header has unknown size") # Skip EBML header content pos += size # Next should be Segment eid, pos = read_element_id(data, pos) if eid != SEGMENT: raise ValueError(f"Expected Segment element 0x{SEGMENT:X}, got 0x{eid:X}") _size, pos = read_element_size(data, pos) # pos is now at the start of Segment's children return pos def build_seek_header(header_data: bytes, first_cluster_offset: int) -> bytes: """ Build a synthetic MKV header for seeking by rewriting the Segment size to UNKNOWN (-1). When FFmpeg receives MKV data via pipe, it needs the container header (EBML header + Segment + Info + Tracks) to initialize decoders. For seeking, we stream cluster data from a mid-file byte offset, so the Segment's original declared size becomes wrong. Rewriting it to UNKNOWN_SIZE (0x01FFFFFFFFFFFFFF in EBML) tells FFmpeg to read until EOF, which is correct for a live/truncated stream. Args: header_data: Original file header bytes (at least first_cluster_offset bytes). first_cluster_offset: Byte offset of the first Cluster element. Returns: Modified header bytes (EBML header through Tracks) with Segment size set to UNKNOWN_SIZE. """ pos = 0 # Skip EBML Header element eid, pos = read_element_id(header_data, pos) size, pos = read_element_size(header_data, pos) ebml_end = pos + size # We now have the EBML header: header_data[0:ebml_end] # Next is the Segment element ID segment_id_start = ebml_end eid, segment_id_end = read_element_id(header_data, segment_id_start) _segment_size, segment_data_start = read_element_size(header_data, segment_id_end) # Build the new header: # 1. Original EBML header (unchanged) # 2. Segment element ID (unchanged) # 3. Segment size rewritten to UNKNOWN_SIZE (8-byte VINT: 0x01 FF FF FF FF FF FF FF) # 4. Segment children from original (Info, Tracks, etc.) up to first Cluster result = bytearray() result.extend(header_data[:segment_id_end]) # EBML header + Segment ID result.extend(b"\x01\xff\xff\xff\xff\xff\xff\xff") # UNKNOWN_SIZE (8 bytes) result.extend(header_data[segment_data_start:first_cluster_offset]) # Info + Tracks return bytes(result) def parse_seek_head(data: bytes, segment_data_offset: int) -> dict[int, int]: """ Parse the SeekHead element to find positions of top-level elements. Scans from segment_data_offset for the SeekHead element, then parses its Seek entries. Returns: Dict mapping element_id -> byte_offset (relative to segment_data_offset). """ positions = {} for eid, data_off, size, _ in iter_elements(data, segment_data_offset, len(data)): if eid == SEEK_HEAD: # Parse Seek entries within SeekHead end = data_off + size for seek_eid, seek_off, seek_size, _ in iter_elements(data, data_off, end): if seek_eid == SEEK: seek_id_value = None seek_position = None seek_end = seek_off + seek_size for child_eid, child_off, child_size, _ in iter_elements(data, seek_off, seek_end): if child_eid == SEEK_ID: # SeekID is stored as the raw element ID bytes seek_id_value = read_uint(data, child_off, child_size) elif child_eid == SEEK_POSITION: seek_position = read_uint(data, child_off, child_size) if seek_id_value is not None and seek_position is not None: positions[seek_id_value] = seek_position break # Only need the first SeekHead # Stop if we hit Cluster data (SeekHead is always before Clusters) if eid == CLUSTER: break return positions def parse_info(data: bytes, info_offset: int) -> tuple[int, float]: """ Parse the Info element to extract TimestampScale and Duration. Args: data: Buffer containing the Info element. info_offset: Start of the Info element (at the element ID). Returns: (timestamp_scale_ns, duration_ticks) """ timestamp_scale = 1_000_000 # default: 1ms duration = 0.0 # Read element header eid, pos = read_element_id(data, info_offset) if eid != INFO: raise ValueError(f"Expected Info element 0x{INFO:X}, got 0x{eid:X}") size, pos = read_element_size(data, pos) end = pos + size for child_eid, child_off, child_size, _ in iter_elements(data, pos, end): if child_eid == TIMESTAMP_SCALE: timestamp_scale = read_uint(data, child_off, child_size) elif child_eid == DURATION: duration = read_float(data, child_off, child_size) return timestamp_scale, duration def parse_cues(data: bytes, cues_offset: int, timestamp_scale_ns: int) -> list[tuple[float, int]]: """ Parse the Cues element into a sorted list of (time_ms, cluster_byte_offset). Args: data: Buffer containing the Cues element. cues_offset: Start of the Cues element (at the element ID). timestamp_scale_ns: Nanoseconds per timestamp tick from Info. Returns: Sorted list of (time_ms, cluster_byte_offset_relative_to_segment). """ cue_points = [] ns_per_ms = 1_000_000 # 1ms = 1,000,000 ns scale_ms = timestamp_scale_ns / ns_per_ms # ticks -> ms multiplier # Read Cues element header eid, pos = read_element_id(data, cues_offset) if eid != CUES: raise ValueError(f"Expected Cues element 0x{CUES:X}, got 0x{eid:X}") size, pos = read_element_size(data, pos) end = pos + size for cp_eid, cp_off, cp_size, _ in iter_elements(data, pos, end): if cp_eid != CUE_POINT: continue cue_time_ticks = 0 cluster_position = None cp_end = cp_off + cp_size for child_eid, child_off, child_size, _ in iter_elements(data, cp_off, cp_end): if child_eid == CUE_TIME: cue_time_ticks = read_uint(data, child_off, child_size) elif child_eid == CUE_TRACK_POSITIONS: # Parse CueTrackPositions for CueClusterPosition ctp_end = child_off + child_size for ctp_eid, ctp_off, ctp_size, _ in iter_elements(data, child_off, ctp_end): if ctp_eid == CUE_CLUSTER_POSITION: cluster_position = read_uint(data, ctp_off, ctp_size) break # Take the first track's position if cluster_position is not None: time_ms = cue_time_ticks * scale_ms cue_points.append((time_ms, cluster_position)) cue_points.sort(key=lambda x: x[0]) return cue_points def parse_eac3_bitrate(frame_data: bytes) -> int | None: """ Parse an EAC3 (E-AC-3 / Dolby Digital Plus) sync frame header to determine the bitrate in bits per second. EAC3 frame layout (ETSI TS 102 366): - Sync word: 0x0B77 (2 bytes) - Byte 2-3: strmtyp(2) | substreamid(3) | frmsiz(11) - Byte 4 bits 7-6: fscod (sample rate code) - Byte 4 bits 5-4: numblkscod (if fscod != 0b11) Returns: Bitrate in bits/s, or None if parsing fails. """ if len(frame_data) < 6 or frame_data[0] != 0x0B or frame_data[1] != 0x77: return None # frmsiz is bits [5:15] of the 16-bit word at offset 2 word2 = (frame_data[2] << 8) | frame_data[3] frmsiz = word2 & 0x07FF # 11 bits frame_bytes = (frmsiz + 1) * 2 # fscod is bits [7:6] of byte 4 fscod = (frame_data[4] >> 6) & 0x03 sample_rates = {0: 48000, 1: 44100, 2: 32000} if fscod == 3: # fscod2 in bits [5:4], numblkscod is always 6 blocks fscod2 = (frame_data[4] >> 4) & 0x03 sr_map2 = {0: 24000, 1: 22050, 2: 16000} sr = sr_map2.get(fscod2, 48000) num_blocks = 6 else: sr = sample_rates.get(fscod, 48000) numblkscod = (frame_data[4] >> 4) & 0x03 num_blocks = [1, 2, 3, 6][numblkscod] # bitrate = frame_bytes * 8 * sample_rate / (256 * num_blocks) bitrate = frame_bytes * 8 * sr // (256 * num_blocks) return bitrate def parse_ac3_bitrate(frame_data: bytes) -> int | None: """ Parse an AC3 (Dolby Digital) sync frame header to determine the bitrate in bits per second. AC3 frame layout (ATSC A/52): - Sync word: 0x0B77 (2 bytes) - Bytes 2-3: CRC1 - Byte 4 bits 7-6: fscod (sample rate code) - Byte 4 bits 5-0: frmsizecod (frame size code) Returns: Bitrate in bits/s, or None if parsing fails. """ if len(frame_data) < 5 or frame_data[0] != 0x0B or frame_data[1] != 0x77: return None fscod = (frame_data[4] >> 6) & 0x03 frmsizecod = frame_data[4] & 0x3F # AC3 bitrate table (kbps) indexed by frmsizecod // 2 _AC3_BITRATES_KBPS = [ 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384, 448, 512, 576, 640, ] idx = frmsizecod >> 1 if fscod > 2 or idx >= len(_AC3_BITRATES_KBPS): return None return _AC3_BITRATES_KBPS[idx] * 1000 def _extract_first_audio_frame( header_data: bytes, cluster_start: int, audio_track_number: int, ) -> bytes | None: """ Extract the first audio frame from the first Cluster in header data. Scans SimpleBlocks and BlockGroups within the Cluster for a block belonging to the given audio track number. Args: header_data: Buffer containing the MKV header + start of first Cluster. cluster_start: Byte offset of the Cluster element's children (after ID+size). audio_track_number: Track number of the audio track. Returns: Raw audio frame bytes, or None if not found within the data. """ try: # Read Cluster element header to get children start eid, id_end = read_element_id(header_data, cluster_start) if eid != CLUSTER: return None size, children_start = read_element_size(header_data, id_end) children_end = min( children_start + size if size != UNKNOWN_SIZE else len(header_data), len(header_data), ) for eid, data_off, size, _ in iter_elements(header_data, children_start, children_end): if eid == SIMPLE_BLOCK: for track_num, _, _, frame_list in extract_block_frames(header_data, data_off, size): if track_num == audio_track_number and frame_list: return frame_list[0] elif eid == BLOCK_GROUP: bg_end = data_off + size for child_eid, child_off, child_size, _ in iter_elements(header_data, data_off, bg_end): if child_eid == BLOCK: for track_num, _, _, frame_list in extract_block_frames(header_data, child_off, child_size): if track_num == audio_track_number and frame_list: return frame_list[0] except (ValueError, IndexError): pass return None def build_cue_index( header_data: bytes, cues_data: bytes, cues_file_offset: int, segment_data_offset: int, ) -> MKVCueIndex: """ Build a complete MKVCueIndex from pre-fetched header and Cues data. This is the main entry point for building the seek index. It expects: - header_data: the first N bytes of the file (enough for EBML header + SeekHead + Info) - cues_data: the bytes containing the Cues element - cues_file_offset: the absolute file offset where cues_data starts - segment_data_offset: where the Segment's children begin in the file Returns: MKVCueIndex with duration and cue points. """ # Parse Info, Tracks from header data and find the first Cluster offset. # Scan top-level Segment children for Info (metadata), Tracks (codec info), # and Cluster (media data start). timestamp_scale = 1_000_000 duration_ticks = 0.0 first_cluster_offset = 0 # absolute file offset of first Cluster element tracks: list = [] for eid, data_off, size, elem_start in iter_elements(header_data, segment_data_offset, len(header_data)): if eid == INFO: end = data_off + size for child_eid, child_off, child_size, _ in iter_elements(header_data, data_off, end): if child_eid == TIMESTAMP_SCALE: timestamp_scale = read_uint(header_data, child_off, child_size) elif child_eid == DURATION: duration_ticks = read_float(header_data, child_off, child_size) elif eid == TRACKS: tracks = parse_tracks(header_data, data_off, data_off + size) elif eid == CLUSTER: # elem_start is the byte offset in header_data where the Cluster # element ID begins -- everything before this is the MKV header # that FFmpeg needs for codec initialization when seeking. first_cluster_offset = elem_start break # Parse Cues from cues_data. # cues_data starts with the Cues element header (ID + size), so we must # first skip past it to reach the CuePoint children inside. cue_points = [] ns_per_ms = 1_000_000 scale_ms = timestamp_scale / ns_per_ms # Read the Cues element header to find where children start cues_eid, cues_id_end = read_element_id(cues_data, 0) if cues_eid != CUES: logger.warning("[ebml] Expected Cues element (0x%X), got 0x%X", CUES, cues_eid) return MKVCueIndex( duration_ms=duration_ticks * scale_ms, timestamp_scale=timestamp_scale, cue_points=[], segment_data_offset=segment_data_offset, ) cues_size, cues_children_start = read_element_size(cues_data, cues_id_end) cues_children_end = ( min(cues_children_start + cues_size, len(cues_data)) if cues_size != UNKNOWN_SIZE else len(cues_data) ) for cp_eid, cp_off, cp_size, _ in iter_elements(cues_data, cues_children_start, cues_children_end): if cp_eid != CUE_POINT: continue cue_time_ticks = 0 cluster_position = None cp_end = cp_off + cp_size for child_eid, child_off, child_size, _ in iter_elements(cues_data, cp_off, cp_end): if child_eid == CUE_TIME: cue_time_ticks = read_uint(cues_data, child_off, child_size) elif child_eid == CUE_TRACK_POSITIONS: ctp_end = child_off + child_size for ctp_eid, ctp_off, ctp_size, _ in iter_elements(cues_data, child_off, ctp_end): if ctp_eid == CUE_CLUSTER_POSITION: cluster_position = read_uint(cues_data, ctp_off, ctp_size) break if cluster_position is not None: time_ms = cue_time_ticks * scale_ms cue_points.append((time_ms, cluster_position)) cue_points.sort(key=lambda x: x[0]) duration_ms = duration_ticks * scale_ms # Build synthetic seek header (MKV header with Segment size = UNKNOWN) seek_header = b"" if first_cluster_offset > 0: try: seek_header = build_seek_header(header_data, first_cluster_offset) except Exception as e: logger.warning("[ebml] Failed to build seek header: %s", e) # Extract track metadata for size estimation and init segment generation audio_codec_id = "" audio_bitrate = 0 audio_channels = 0 audio_sample_rate = 0.0 video_codec_id = "" video_codec_private = b"" video_width = 0 video_height = 0 video_fps = 0.0 video_default_duration_ns = 0 audio_track = None for t in tracks: if t.track_type == TRACK_TYPE_AUDIO and not audio_track: audio_track = t audio_codec_id = t.codec_id audio_channels = t.channels audio_sample_rate = t.sample_rate elif t.track_type == TRACK_TYPE_VIDEO and not video_codec_id: video_codec_id = t.codec_id video_codec_private = t.codec_private video_width = t.pixel_width video_height = t.pixel_height video_default_duration_ns = t.default_duration_ns if t.default_duration_ns > 0: video_fps = 1_000_000_000.0 / t.default_duration_ns # Try to determine audio bitrate from the first audio frame in the Cluster if audio_track and first_cluster_offset > 0: frame_data = _extract_first_audio_frame( header_data, first_cluster_offset, audio_track.track_number, ) if frame_data: if audio_codec_id == CODEC_ID_EAC3: audio_bitrate = parse_eac3_bitrate(frame_data) or 0 elif audio_codec_id == CODEC_ID_AC3: audio_bitrate = parse_ac3_bitrate(frame_data) or 0 if audio_bitrate > 0: logger.info( "[ebml] Detected audio: %s %d kbps %dch %.0fHz", audio_codec_id, audio_bitrate // 1000, audio_channels, audio_sample_rate, ) index = MKVCueIndex( duration_ms=duration_ms, timestamp_scale=timestamp_scale, cue_points=cue_points, segment_data_offset=segment_data_offset, first_cluster_offset=first_cluster_offset, seek_header=seek_header, audio_codec_id=audio_codec_id, audio_bitrate=audio_bitrate, audio_channels=audio_channels, audio_sample_rate=audio_sample_rate, video_codec_id=video_codec_id, video_codec_private=video_codec_private, video_width=video_width, video_height=video_height, video_fps=video_fps, video_default_duration_ns=video_default_duration_ns, ) logger.info( "[ebml] Built cue index: duration=%.1fs, %d cue points, segment_offset=%d, " "first_cluster=%d, seek_header=%d bytes, audio=%s @%dkbps", duration_ms / 1000, len(cue_points), segment_data_offset, first_cluster_offset, len(seek_header), audio_codec_id or "none", audio_bitrate // 1000 if audio_bitrate else 0, ) return index # ============================================================================= # Level 2: Track and Frame parsing for demuxing # ============================================================================= # (Track type and codec ID constants are defined above in the High-level section # since they are also needed by build_cue_index.) @dataclass class MKVTrack: """Metadata for a single track extracted from MKV Tracks element.""" track_number: int = 0 track_uid: int = 0 track_type: int = 0 # 1=video, 2=audio, 17=subtitle codec_id: str = "" # e.g. "V_MPEG4/ISO/AVC", "A_EAC3" codec_private: bytes = b"" # Codec-specific init data (avcC, AudioSpecificConfig, etc.) default_duration_ns: int = 0 # Default frame duration in nanoseconds codec_delay_ns: int = 0 # Codec delay in nanoseconds seek_pre_roll_ns: int = 0 # Seek pre-roll in nanoseconds # Video fields pixel_width: int = 0 pixel_height: int = 0 display_width: int = 0 display_height: int = 0 # Audio fields sample_rate: float = 0.0 output_sample_rate: float = 0.0 # OutputSamplingFrequency (for SBR/HE-AAC) channels: int = 0 bit_depth: int = 0 @property def is_video(self) -> bool: return self.track_type == TRACK_TYPE_VIDEO @property def is_audio(self) -> bool: return self.track_type == TRACK_TYPE_AUDIO @property def is_subtitle(self) -> bool: return self.track_type == TRACK_TYPE_SUBTITLE @property def effective_sample_rate(self) -> float: """Return OutputSamplingFrequency if set, else SamplingFrequency.""" return self.output_sample_rate if self.output_sample_rate > 0 else self.sample_rate @property def frame_duration_ms(self) -> float: """Default frame duration in milliseconds, or 0 if not set.""" if self.default_duration_ns > 0: return self.default_duration_ns / 1_000_000.0 return 0.0 @dataclass class MKVFrame: """A single media frame extracted from an MKV Cluster.""" track_number: int timestamp_ms: float # Absolute timestamp in milliseconds is_keyframe: bool data: bytes duration_ms: float = 0.0 # Duration if known (from BlockDuration or DefaultDuration) def read_string(data: bytes, pos: int, length: int) -> str: """Read a UTF-8 string of N bytes, stripping null terminators.""" raw = data[pos : pos + length] return raw.rstrip(b"\x00").decode("utf-8", errors="replace") def parse_tracks(data: bytes, start: int, end: int) -> list[MKVTrack]: """ Parse the Tracks element children to extract track metadata. Args: data: Buffer containing the Tracks element children. start: Start offset of the Tracks children (after Tracks ID + size). end: End offset of the Tracks children. Returns: List of MKVTrack for each TrackEntry found. """ tracks = [] for eid, data_off, size, _ in iter_elements(data, start, end): if eid != TRACK_ENTRY: continue track = MKVTrack() te_end = data_off + size for child_eid, child_off, child_size, _ in iter_elements(data, data_off, te_end): if child_eid == TRACK_NUMBER: track.track_number = read_uint(data, child_off, child_size) elif child_eid == TRACK_UID: track.track_uid = read_uint(data, child_off, child_size) elif child_eid == TRACK_TYPE: track.track_type = read_uint(data, child_off, child_size) elif child_eid == CODEC_ID: track.codec_id = read_string(data, child_off, child_size) elif child_eid == CODEC_PRIVATE: track.codec_private = bytes(data[child_off : child_off + child_size]) elif child_eid == DEFAULT_DURATION: track.default_duration_ns = read_uint(data, child_off, child_size) elif child_eid == CODEC_DELAY: track.codec_delay_ns = read_uint(data, child_off, child_size) elif child_eid == SEEK_PRE_ROLL: track.seek_pre_roll_ns = read_uint(data, child_off, child_size) elif child_eid == VIDEO: _parse_video_settings(data, child_off, child_off + child_size, track) elif child_eid == AUDIO: _parse_audio_settings(data, child_off, child_off + child_size, track) if track.track_number > 0: tracks.append(track) return tracks def _parse_video_settings(data: bytes, start: int, end: int, track: MKVTrack) -> None: """Parse Video element children into MKVTrack fields.""" for eid, off, size, _ in iter_elements(data, start, end): if eid == PIXEL_WIDTH: track.pixel_width = read_uint(data, off, size) elif eid == PIXEL_HEIGHT: track.pixel_height = read_uint(data, off, size) elif eid == DISPLAY_WIDTH: track.display_width = read_uint(data, off, size) elif eid == DISPLAY_HEIGHT: track.display_height = read_uint(data, off, size) def _parse_audio_settings(data: bytes, start: int, end: int, track: MKVTrack) -> None: """Parse Audio element children into MKVTrack fields.""" for eid, off, size, _ in iter_elements(data, start, end): if eid == SAMPLING_FREQUENCY: track.sample_rate = read_float(data, off, size) elif eid == OUTPUT_SAMPLING_FREQUENCY: track.output_sample_rate = read_float(data, off, size) elif eid == CHANNELS: track.channels = read_uint(data, off, size) elif eid == BIT_DEPTH: track.bit_depth = read_uint(data, off, size) def parse_block_header(data: bytes, pos: int) -> tuple[int, int, int, int]: """ Parse the header of a SimpleBlock or Block element. The block header starts with: - Track number (VINT, variable length) - Relative timecode (int16, signed, big-endian) - Flags byte (keyframe, lacing, etc.) Args: data: Buffer containing the block data. pos: Start of the block data (after element ID + size). Returns: (track_number, relative_timecode, flags, header_end_pos) flags bit layout for SimpleBlock: - bit 7 (0x80): keyframe - bits 2-1 (0x06): lacing (0=none, 1=Xiph, 2=fixed, 3=EBML) - bit 0 (0x01): discardable """ # Track number is a VINT (but uses the raw value, not the size-masked value) _, track_number, pos2 = read_vint(data, pos) # For track number, we use the raw VINT value with marker bit removed # Actually, the Matroska spec says track number in Block uses the same # VINT encoding as element sizes, so the marker-stripped value is correct. # Relative timecode: signed 16-bit big-endian timecode_raw = (data[pos2] << 8) | data[pos2 + 1] if timecode_raw >= 0x8000: timecode_raw -= 0x10000 pos3 = pos2 + 2 # Flags flags = data[pos3] pos4 = pos3 + 1 return track_number, timecode_raw, flags, pos4 def extract_block_frames(data: bytes, pos: int, block_size: int) -> list[tuple[int, int, int, list[bytes]]]: """ Parse a SimpleBlock or Block and extract the frame data. Handles all four lacing modes: no lacing, Xiph, fixed-size, and EBML. Args: data: Buffer containing the block. pos: Start of the block data (after element ID + size). block_size: Total size of the block data. Returns: List of (track_number, relative_timecode, flags, [frame_bytes, ...]) """ block_end = pos + block_size track_number, rel_timecode, flags, header_end = parse_block_header(data, pos) lacing = (flags >> 1) & 0x03 # bits 2-1 if lacing == 0: # No lacing: single frame = rest of block frame_data = bytes(data[header_end:block_end]) return [(track_number, rel_timecode, flags, [frame_data])] # Laced: first byte after header is the number of frames minus one num_frames = data[header_end] + 1 lace_pos = header_end + 1 if lacing == 2: # Fixed-size lacing: all frames are the same size remaining = block_end - lace_pos frame_size = remaining // num_frames frames = [] for _ in range(num_frames): frames.append(bytes(data[lace_pos : lace_pos + frame_size])) lace_pos += frame_size return [(track_number, rel_timecode, flags, frames)] elif lacing == 1: # Xiph lacing: sizes encoded as sum of 255s + remainder frame_sizes = [] for _ in range(num_frames - 1): size = 0 while lace_pos < block_end: val = data[lace_pos] lace_pos += 1 size += val if val < 255: break frame_sizes.append(size) # Last frame gets remaining bytes. # lace_pos is already past the size bytes; frame data starts at lace_pos. frames = [] frame_data_start = lace_pos for sz in frame_sizes: frames.append(bytes(data[frame_data_start : frame_data_start + sz])) frame_data_start += sz frames.append(bytes(data[frame_data_start:block_end])) return [(track_number, rel_timecode, flags, frames)] elif lacing == 3: # EBML lacing: first size is VINT, subsequent are signed VINT deltas frame_sizes = [] # First frame size _, first_size, lace_pos = read_vint(data, lace_pos) frame_sizes.append(first_size) prev_size = first_size for _ in range(num_frames - 2): # Read VINT for delta (signed: subtract midpoint) raw, value, lace_pos = read_vint(data, lace_pos) # Determine VINT length to compute the sign bias vint_len = 0 test = raw while test > 0: test >>= 8 vint_len += 1 if vint_len == 0: vint_len = 1 # Signed delta: value - ((2^(7*vint_len - 1)) - 1) bias = (1 << (7 * vint_len - 1)) - 1 delta = value - bias current_size = prev_size + delta frame_sizes.append(current_size) prev_size = current_size frames = [] frame_data_start = lace_pos for sz in frame_sizes: frames.append(bytes(data[frame_data_start : frame_data_start + sz])) frame_data_start += sz frames.append(bytes(data[frame_data_start:block_end])) return [(track_number, rel_timecode, flags, frames)] return [(track_number, rel_timecode, flags, [bytes(data[header_end:block_end])])] def parse_cluster_frames( data: bytes, cluster_start: int, cluster_end: int, timestamp_scale_ns: int, ) -> tuple[float, list[MKVFrame]]: """ Parse a single Cluster element and extract all frames. Args: data: Buffer containing the Cluster element children. cluster_start: Start of Cluster children (after Cluster ID + size). cluster_end: End of Cluster data. timestamp_scale_ns: Nanoseconds per timestamp tick. Returns: (cluster_timestamp_ms, list_of_MKVFrame) """ scale_ms = timestamp_scale_ns / 1_000_000.0 cluster_timecode = 0 frames = [] for eid, data_off, size, _ in iter_elements(data, cluster_start, cluster_end): if eid == CLUSTER_TIMESTAMP: cluster_timecode = read_uint(data, data_off, size) elif eid == SIMPLE_BLOCK: for track_num, rel_tc, flags, frame_list in extract_block_frames(data, data_off, size): is_kf = bool(flags & 0x80) abs_ts_ms = (cluster_timecode + rel_tc) * scale_ms for frame_data in frame_list: frames.append( MKVFrame( track_number=track_num, timestamp_ms=abs_ts_ms, is_keyframe=is_kf, data=frame_data, ) ) elif eid == BLOCK_GROUP: _parse_block_group(data, data_off, data_off + size, cluster_timecode, scale_ms, frames) cluster_ts_ms = cluster_timecode * scale_ms return cluster_ts_ms, frames def _parse_block_group( data: bytes, start: int, end: int, cluster_timecode: int, scale_ms: float, frames: list[MKVFrame], ) -> None: """Parse a BlockGroup and append frames to the list.""" block_data_off = 0 block_data_size = 0 duration_ticks = 0 for eid, off, size, _ in iter_elements(data, start, end): if eid == BLOCK: block_data_off = off block_data_size = size elif eid == BLOCK_DURATION: duration_ticks = read_uint(data, off, size) if block_data_off == 0: return for track_num, rel_tc, flags, frame_list in extract_block_frames(data, block_data_off, block_data_size): # Block within BlockGroup: keyframe flag is NOT in the flags byte # (unlike SimpleBlock). Keyframe is inferred from context or absence # of ReferenceBlock. For simplicity, treat first block as keyframe if # there's no ReferenceBlock -- but we don't parse that here. # Default to non-keyframe for BlockGroup blocks. abs_ts_ms = (cluster_timecode + rel_tc) * scale_ms dur_ms = duration_ticks * scale_ms if duration_ticks > 0 else 0.0 for frame_data in frame_list: frames.append( MKVFrame( track_number=track_num, timestamp_ms=abs_ts_ms, is_keyframe=False, data=frame_data, duration_ms=dur_ms, ) )