This commit is contained in:
UrloMythus
2026-02-19 20:15:03 +01:00
parent 7785e8c604
commit cfc6bbabc9
181 changed files with 32141 additions and 4629 deletions

View File

@@ -0,0 +1,18 @@
"""
Media remuxer package.
Provides pure Python implementations for media container parsing, remuxing,
and transcoding:
- ebml_parser: Minimal EBML/MKV parser for seeking and demuxing
- ts_muxer: fMP4 -> MPEG-TS remuxer
- mkv_demuxer: Streaming MKV demuxer
- mp4_muxer: MP4 box builder for standard moov-first MP4
- audio_transcoder: PyAV-based audio frame transcoding
- video_transcoder: GPU-accelerated video transcoding via PyAV
- pyav_demuxer: Universal PyAV-based streaming demuxer (any container)
- codec_utils: Codec compatibility detection and decision engine
- media_source: Abstract MediaSource protocol (Telegram, HTTP, etc.)
- transcode_handler: Shared transcode request orchestrator
- transcode_pipeline: MKV fast-path and universal transcode pipelines
"""

View File

@@ -0,0 +1,351 @@
"""
PyAV-based audio transcoder for frame-level codec conversion.
Transcodes audio frames between codecs using PyAV's CodecContext API
(Python bindings for FFmpeg's libavcodec). This provides in-process
audio transcoding without subprocess management or pipe overhead.
Supported input codecs: EAC3, AC3, AAC, Opus, Vorbis, FLAC, MP3
Output codec: AAC-LC (stereo, configurable bitrate)
Architecture:
raw_frame_bytes -> parse() -> decode() -> resample() -> encode() -> raw_aac_bytes
Usage:
transcoder = AudioTranscoder("eac3", sample_rate=48000, channels=6)
for raw_eac3_frame in frames:
aac_frames = transcoder.transcode(raw_eac3_frame)
for aac_data in aac_frames:
write(aac_data)
# Flush remaining frames
for aac_data in transcoder.flush():
write(aac_data)
"""
import logging
import av
from av.audio.resampler import AudioResampler
from mediaflow_proxy.remuxer.ebml_parser import (
CODEC_ID_AAC,
CODEC_ID_AC3,
CODEC_ID_EAC3,
CODEC_ID_FLAC,
CODEC_ID_OPUS,
CODEC_ID_VORBIS,
)
logger = logging.getLogger(__name__)
def _generate_silence_aac_frame() -> bytes | None:
    """Pre-encode a single silent AAC frame (48 kHz stereo, 1024 samples).

    PyAV's AAC encoder has an intermittent ``avcodec_send_frame`` bug when
    rapidly creating/destroying codec contexts, so we retry a few times.
    This function is called once at module load; the result is cached in
    ``_SILENCE_AAC_FRAME``.

    Returns:
        Raw AAC frame bytes, or None if every encode attempt failed.
    """
    for _attempt in range(10):
        try:
            enc = av.CodecContext.create("aac", "w")
            enc.sample_rate = 48000
            enc.layout = "stereo"
            enc.format = av.AudioFormat("fltp")
            enc.bit_rate = 192000
            enc.open()
            frame = av.AudioFrame(
                format=enc.format.name,
                layout=enc.layout.name,
                samples=enc.frame_size or 1024,
            )
            # av.AudioFrame does not zero-initialize its buffers: without
            # this, the "silence" frame could encode leftover garbage as
            # audible noise. Fill every plane with zeros explicitly.
            for plane in frame.planes:
                plane.update(bytes(plane.buffer_size))
            frame.sample_rate = enc.sample_rate
            frame.pts = 0
            for pkt in enc.encode(frame):
                return bytes(pkt)
            # AAC priming delay: first encode buffered; flush to retrieve
            for pkt in enc.encode(None):
                return bytes(pkt)
        except Exception:
            continue
    return None
# Module-level silence frame -- generated once, reused by every transcoder.
_SILENCE_AAC_FRAME: bytes | None = _generate_silence_aac_frame()
# Map MKV codec IDs to PyAV/FFmpeg codec names.
# Keys mix imported CODEC_ID_* constants and literal MKV CodecID strings
# (DTS/MP3 variants have no constant in ebml_parser).
_MKV_TO_FFMPEG_CODEC = {
    CODEC_ID_EAC3: "eac3",
    CODEC_ID_AC3: "ac3",
    CODEC_ID_AAC: "aac",
    CODEC_ID_OPUS: "opus",
    CODEC_ID_VORBIS: "vorbis",
    CODEC_ID_FLAC: "flac",
    "A_DTS": "dts",
    "A_MP3": "mp3",
    "A_MPEG/L3": "mp3",
}
# Codecs that need transcoding to AAC for browser playback.
# AAC is deliberately absent: it passes through untouched.
NEEDS_TRANSCODE = frozenset(
    {
        CODEC_ID_EAC3,
        CODEC_ID_AC3,
        CODEC_ID_OPUS,
        CODEC_ID_VORBIS,
        CODEC_ID_FLAC,
        "A_DTS",
        "A_MP3",
        "A_MPEG/L3",
    }
)
# Output AAC settings
_OUTPUT_CODEC = "aac"
_OUTPUT_SAMPLE_FORMAT = "fltp"  # AAC requires float planar
_OUTPUT_LAYOUT = "stereo"
# Map channel count -> FFmpeg layout name. Counts not listed here fall
# back to "stereo" in AudioTranscoder.__init__.
_CHANNEL_LAYOUT_MAP = {
    1: "mono",
    2: "stereo",
    3: "2.1",
    4: "quad",
    6: "5.1",
    8: "7.1",
}
def needs_transcode(codec_id: str) -> bool:
    """Check if an MKV audio codec needs transcoding for browser playback.

    Args:
        codec_id: MKV CodecID string (e.g. "A_EAC3").

    Returns:
        True when the codec is in NEEDS_TRANSCODE.
    """
    return codec_id in NEEDS_TRANSCODE
def get_ffmpeg_codec_name(mkv_codec_id: str) -> str | None:
    """Map an MKV CodecID to an FFmpeg codec name.

    Args:
        mkv_codec_id: MKV CodecID string (e.g. "A_EAC3").

    Returns:
        The FFmpeg/PyAV codec name, or None for unknown CodecIDs.
    """
    return _MKV_TO_FFMPEG_CODEC.get(mkv_codec_id)
class AudioTranscoder:
    """
    In-process audio transcoder using PyAV's CodecContext API.

    Decodes raw audio frames from one codec and encodes them to AAC-LC
    stereo, suitable for MP4 container and browser playback. No container
    I/O or subprocess involved -- operates directly on raw frame bytes.

    The transcoder handles sample format conversion and resampling
    automatically via AudioResampler.
    """

    def __init__(
        self,
        input_codec: str,
        input_sample_rate: int = 48000,
        input_channels: int = 6,
        output_sample_rate: int = 48000,
        output_channels: int = 2,
        output_bitrate: int = 192000,
    ) -> None:
        """
        Initialize the transcoder.

        Args:
            input_codec: FFmpeg codec name (e.g., "eac3", "ac3", "aac").
            input_sample_rate: Input sample rate in Hz.
            input_channels: Input channel count.
            output_sample_rate: Output sample rate in Hz (default 48000).
            output_channels: Output channel count (default 2 = stereo).
            output_bitrate: Output bitrate in bits/s (default 192000).

        Raises:
            Propagates PyAV errors when a codec context cannot be created
            or opened (e.g. unknown codec name).
        """
        # Set up decoder -- use layout to configure channel count
        # (PyAV's channels property is read-only; layout drives it)
        self._decoder = av.CodecContext.create(input_codec, "r")
        self._decoder.sample_rate = input_sample_rate
        input_layout = _CHANNEL_LAYOUT_MAP.get(input_channels, "stereo")
        self._decoder.layout = input_layout
        # Set up encoder
        self._encoder = av.CodecContext.create(_OUTPUT_CODEC, "w")
        self._encoder.sample_rate = output_sample_rate
        self._encoder.layout = _OUTPUT_LAYOUT
        self._encoder.format = av.AudioFormat(_OUTPUT_SAMPLE_FORMAT)
        self._encoder.bit_rate = output_bitrate
        self._encoder.open()
        # Set up resampler for format/rate/channel conversion
        self._resampler = AudioResampler(
            format=_OUTPUT_SAMPLE_FORMAT,
            layout=_OUTPUT_LAYOUT,
            rate=output_sample_rate,
        )
        self._input_codec = input_codec
        self._frames_decoded = 0
        self._frames_encoded = 0
        self._audio_specific_config: bytes | None = None
        logger.info(
            "[audio_transcoder] Initialized: %s %dHz %dch -> aac %dHz %dch @%dk",
            input_codec,
            input_sample_rate,
            input_channels,
            output_sample_rate,
            output_channels,
            output_bitrate // 1000,
        )

    @property
    def audio_specific_config(self) -> bytes | None:
        """
        AAC AudioSpecificConfig from the encoder (available after first encode).

        This is needed for the MP4 esds box. Read from the encoder's
        extradata and cached on first successful access.
        """
        if self._audio_specific_config is not None:
            return self._audio_specific_config
        # PyAV exposes extradata after the encoder is opened
        if self._encoder.extradata:
            self._audio_specific_config = bytes(self._encoder.extradata)
            return self._audio_specific_config
        return None

    @property
    def output_sample_rate(self) -> int:
        """Output sample rate in Hz (as configured on the encoder)."""
        return self._encoder.sample_rate

    @property
    def output_channels(self) -> int:
        """Output channel count reported by the encoder context."""
        # NOTE(review): `channels` is deprecated in recent PyAV in favour of
        # `layout.nb_channels` -- confirm against the pinned PyAV version.
        return self._encoder.channels

    @property
    def frame_size(self) -> int:
        """AAC frame size (samples per frame), typically 1024."""
        return self._encoder.frame_size or 1024

    def transcode(self, raw_frame_data: bytes) -> list[bytes]:
        """
        Transcode a raw audio frame from the input codec to AAC.

        Args:
            raw_frame_data: Raw audio frame bytes (one codec frame, e.g.,
                one EAC3 sync frame).

        Returns:
            List of raw AAC frame bytes. May return 0, 1, or more frames
            depending on codec frame sizes and buffering.
        """
        output: list[bytes] = []
        # Parse raw bytes into packets
        packets = self._decoder.parse(raw_frame_data)
        for packet in packets:
            # Decode to PCM frames
            try:
                decoded_frames = self._decoder.decode(packet)
            except av.error.InvalidDataError as e:
                # Corrupt frame: skip it rather than abort the stream.
                logger.debug("[audio_transcoder] Decode error (skipping frame): %s", e)
                continue
            for frame in decoded_frames:
                self._frames_decoded += 1
                # Resample to match encoder format
                resampled = self._resampler.resample(frame)
                if resampled is None:
                    continue
                # resampled can be a single frame or list of frames
                if not isinstance(resampled, list):
                    resampled = [resampled]
                for rs_frame in resampled:
                    # Encode to AAC
                    try:
                        encoded_packets = self._encoder.encode(rs_frame)
                    except av.error.InvalidDataError as e:
                        logger.debug("[audio_transcoder] Encode error: %s", e)
                        continue
                    for enc_packet in encoded_packets:
                        self._frames_encoded += 1
                        output.append(bytes(enc_packet))
        return output

    def flush(self) -> list[bytes]:
        """
        Flush the decoder and encoder buffers.

        Call this when the input stream ends to get remaining frames.

        Returns:
            List of remaining raw AAC frame bytes.
        """
        output: list[bytes] = []
        # Flush decoder
        try:
            for frame in self._decoder.decode(None):
                self._frames_decoded += 1
                resampled = self._resampler.resample(frame)
                if resampled is None:
                    continue
                if not isinstance(resampled, list):
                    resampled = [resampled]
                for rs_frame in resampled:
                    for enc_packet in self._encoder.encode(rs_frame):
                        self._frames_encoded += 1
                        output.append(bytes(enc_packet))
        except Exception as e:
            # Flushing is best-effort; a failure here only loses tail frames.
            logger.debug("[audio_transcoder] Decoder flush error: %s", e)
        # Flush resampler
        try:
            resampled = self._resampler.resample(None)
            if resampled is not None:
                if not isinstance(resampled, list):
                    resampled = [resampled]
                for rs_frame in resampled:
                    for enc_packet in self._encoder.encode(rs_frame):
                        self._frames_encoded += 1
                        output.append(bytes(enc_packet))
        except Exception as e:
            logger.debug("[audio_transcoder] Resampler flush error: %s", e)
        # Flush encoder
        try:
            for enc_packet in self._encoder.encode(None):
                self._frames_encoded += 1
                output.append(bytes(enc_packet))
        except Exception as e:
            logger.debug("[audio_transcoder] Encoder flush error: %s", e)
        logger.info(
            "[audio_transcoder] Flushed: %d decoded, %d encoded total",
            self._frames_decoded,
            self._frames_encoded,
        )
        return output

    def generate_silence_frame(self) -> bytes | None:
        """Return a pre-encoded silent AAC frame (module-level singleton)."""
        return _SILENCE_AAC_FRAME

    def close(self) -> None:
        """Release codec contexts (best-effort).

        Safe to call even when __init__ failed before both contexts were
        created, and tolerates PyAV AudioCodecContext versions that have
        no close() method.
        """
        # getattr guards: __del__ may invoke close() on a half-constructed
        # instance (e.g. CodecContext.create raised for an unknown codec).
        for ctx in (getattr(self, "_decoder", None), getattr(self, "_encoder", None)):
            if ctx is None:
                continue
            try:
                if hasattr(ctx, "close"):
                    ctx.close()
            except Exception:
                pass

    def __del__(self) -> None:
        # Swallow everything: __del__ runs even after a failed __init__ and
        # during interpreter shutdown, where raising is never useful.
        try:
            self.close()
        except Exception:
            pass

View File

@@ -0,0 +1,515 @@
"""
Codec decision engine for browser compatibility detection.
Determines whether video/audio streams need transcoding for browser
playback and selects appropriate output codecs.
"""
import logging
import struct
logger = logging.getLogger(__name__)
# ────────────────────────────────────────────────────────────────────
# Browser-compatible codecs (work natively in HTML5 <video>)
# ────────────────────────────────────────────────────────────────────
BROWSER_VIDEO_CODECS = frozenset(
    {
        "V_MPEG4/ISO/AVC",  # H.264/AVC -- universal
        "h264",
        "avc1",  # FFmpeg/PyAV names
    }
)
BROWSER_AUDIO_CODECS = frozenset(
    {
        "A_AAC",  # AAC-LC -- universal
        "A_AAC/MPEG2/LC",
        "A_AAC/MPEG4/LC",
        "aac",  # FFmpeg/PyAV name
    }
)
# ────────────────────────────────────────────────────────────────────
# Video codecs that need re-encoding to H.264
# ────────────────────────────────────────────────────────────────────
VIDEO_NEEDS_REENCODE = frozenset(
    {
        "V_MPEGH/ISO/HEVC",  # H.265/HEVC (Chrome/Firefox don't support)
        "V_MPEG2",  # MPEG-2 (DVD-era)
        "V_MPEG4/ISO/SP",  # MPEG-4 Part 2 Simple Profile
        "V_MPEG4/ISO/ASP",  # MPEG-4 Part 2 Advanced Simple (DivX/Xvid)
        "V_MPEG4/ISO/AP",  # MPEG-4 Part 2 Advanced Profile
        "V_MPEG4/MS/V3",  # MS MPEG-4 v3 (WMV)
        "V_MS/VFW/FOURCC",  # Generic VFW (VC-1, etc.)
        "V_REAL/RV10",
        "V_REAL/RV20",
        "V_REAL/RV30",
        "V_REAL/RV40",
        "V_THEORA",
        "V_VP8",
        "V_VP9",  # VP9 in MKV (needs WebM container for browser)
        "V_AV1",  # AV1 (partial support, safer to reencode)
        # PyAV / FFmpeg codec names
        "hevc",
        "h265",
        "mpeg2video",
        "mpeg4",
        "vc1",
        "vp8",
        "vp9",
        "av1",
        "theora",
        "wmv3",
        "rv30",
        "rv40",
    }
)
# ────────────────────────────────────────────────────────────────────
# Audio codecs that need transcoding to AAC
# (superset of the list in audio_transcoder.py, uses both MKV and
# PyAV codec names for universal lookup)
# ────────────────────────────────────────────────────────────────────
AUDIO_NEEDS_TRANSCODE = frozenset(
    {
        # MKV CodecIDs
        "A_EAC3",
        "A_AC3",
        "A_DTS",
        "A_DTS/EXPRESS",
        "A_DTS/LOSSLESS",
        "A_OPUS",
        "A_VORBIS",
        "A_FLAC",
        "A_TRUEHD",
        "A_MLP",
        "A_PCM/INT/LIT",
        "A_PCM/INT/BIG",
        "A_PCM/FLOAT/IEEE",
        "A_REAL/28_8",
        "A_REAL/COOK",
        "A_REAL/SIPR",
        "A_REAL/ATRC",
        "A_MS/ACM",  # Generic Windows audio
        "A_MP3",
        "A_MPEG/L3",
        # PyAV / FFmpeg names
        "eac3",
        "ac3",
        "dts",
        "dca",
        "truehd",
        "mlp",
        "mp3",
        "opus",
        "vorbis",
        "flac",
        "pcm_s16le",
        "pcm_s24le",
        "pcm_f32le",
        "wmav2",
        "wmavoice",
        "wmapro",
        "cook",
        "sipr",
        "atrac3",
    }
)
# Map PyAV codec names to MKV CodecIDs (for the MKV fast-path)
_PYAV_TO_MKV_VIDEO = {
    "h264": "V_MPEG4/ISO/AVC",
    "hevc": "V_MPEGH/ISO/HEVC",
    "h265": "V_MPEGH/ISO/HEVC",
    "mpeg2video": "V_MPEG2",
    "vp8": "V_VP8",
    "vp9": "V_VP9",
    "av1": "V_AV1",
}
_PYAV_TO_MKV_AUDIO = {
    "aac": "A_AAC",
    "eac3": "A_EAC3",
    "ac3": "A_AC3",
    "dts": "A_DTS",
    "opus": "A_OPUS",
    "vorbis": "A_VORBIS",
    "flac": "A_FLAC",
    "mp3": "A_MPEG/L3",
    "truehd": "A_TRUEHD",
}
# ────────────────────────────────────────────────────────────────────
# NAL unit format conversion (Annex B ↔ AVCC)
# ────────────────────────────────────────────────────────────────────
# H.264 NAL types that belong in the init segment (avcC), not in samples.
# 7 = SPS, 8 = PPS, 9 = AUD (access unit delimiter, dropped entirely).
_H264_PARAM_NAL_TYPES = frozenset({7, 8, 9})  # SPS, PPS, AUD
def _find_annexb_nals(data: bytes) -> list[tuple[int, int]]:
"""
Find all NAL unit [start, end) byte ranges in Annex B formatted data.
Handles both 3-byte (00 00 01) and 4-byte (00 00 00 01) start codes.
Returns a list of (start, end) tuples pointing into *data*.
"""
size = len(data)
nals: list[tuple[int, int]] = []
i = 0
while i < size - 2:
# Scan for 0x000001 or 0x00000001
if data[i] != 0:
i += 1
continue
if data[i + 1] != 0:
i += 2
continue
if data[i + 2] == 1:
nal_start = i + 3
elif data[i + 2] == 0 and i + 3 < size and data[i + 3] == 1:
nal_start = i + 4
else:
i += 1
continue
# Record end of previous NAL
if nals:
nals[-1] = (nals[-1][0], i)
nals.append((nal_start, size))
i = nal_start
return nals
def is_annexb(data: bytes) -> bool:
    """
    Return True if *data* starts with an Annex B start code.

    Disambiguates AVCC (4-byte length prefix) from Annex B when the data
    begins with ``00 00 01 xx``: interpreting the first 4 bytes as a
    big-endian AVCC length, if that length fits within the buffer AND the
    following byte is a plausible H.264 NAL header, the data is treated
    as AVCC (not Annex B).
    """
    if len(data) < 5:
        return False
    prefix = bytes(data[:4])
    if prefix == b"\x00\x00\x00\x01":
        return True
    if prefix[:3] != b"\x00\x00\x01":
        return False
    # Could still be AVCC whose length happens to be 0x000001xx (256..511).
    possible_len = int.from_bytes(prefix, "big")
    if 0 < possible_len <= len(data) - 4:
        header = data[4]
        forbidden_zero = header & 0x80
        unit_type = header & 0x1F
        if not forbidden_zero and 1 <= unit_type <= 12:
            # Valid length + valid NAL header -> plausible AVCC.
            return False
    return True
def annexb_to_avcc(data: bytes, filter_ps: bool = True) -> bytes:
    """
    Convert Annex B (start-code-prefixed) NAL units to AVCC
    (4-byte length-prefixed) format suitable for fMP4 samples.

    Args:
        data: H.264 access unit in Annex B format.
        filter_ps: If True, strip SPS/PPS/AUD NAL units (they belong
            in the avcC box of the init segment, not in samples).

    Returns:
        The same NAL units with 4-byte big-endian length prefixes. May be
        empty when every NAL was filtered out (e.g. a packet holding only
        SPS/PPS/AUD) -- callers should drop such samples, since emitting
        the original Annex-B bytes would corrupt the fMP4 stream.
    """
    if not data or not is_annexb(data):
        # Empty input, or already AVCC: nothing to convert.
        return data
    nal_ranges = _find_annexb_nals(data)
    if not nal_ranges:
        return data
    converted = bytearray()
    for begin, stop in nal_ranges:
        # Drop zero-padding that precedes the next start code.
        while stop > begin and data[stop - 1] == 0:
            stop -= 1
        if stop <= begin:
            continue
        if filter_ps and (data[begin] & 0x1F) in _H264_PARAM_NAL_TYPES:
            continue
        converted += (stop - begin).to_bytes(4, "big")
        converted += data[begin:stop]
    return bytes(converted)
# H.264 profiles that require the avcC High Profile extension fields
# (chroma_format_idc, bit_depth_luma/chroma, numSpsExt).
_HIGH_PROFILE_IDCS = frozenset({100, 110, 122, 244, 44, 83, 86, 118, 128, 138, 139, 134})
def _fix_avcc_high_profile(avcc: bytes) -> bytes:
"""
Ensure an avcC record includes High Profile extension bytes.
The ISO/IEC 14496-15 spec requires additional fields after the PPS
section when ``AVCProfileIndication`` is 100 (High), 110, 122, or 244.
Some MKV muxers omit these, causing decoders to not know the chroma
format or bit depth, which leads to widespread decode errors.
If the extensions are missing, appends the defaults for 4:2:0 / 8-bit
with zero extended SPS sets.
"""
if len(avcc) < 7:
return avcc
if avcc[0] != 1:
return avcc # Not an avcC record
profile_idc = avcc[1]
if profile_idc not in _HIGH_PROFILE_IDCS:
return avcc # Not a High Profile variant, no extensions needed
# Walk past SPS and PPS sections to find where extensions should be
off = 5
num_sps = avcc[off] & 0x1F
off += 1
for _ in range(num_sps):
if off + 2 > len(avcc):
return avcc
sps_len = struct.unpack(">H", avcc[off : off + 2])[0]
off += 2 + sps_len
if off >= len(avcc):
return avcc
num_pps = avcc[off]
off += 1
for _ in range(num_pps):
if off + 2 > len(avcc):
return avcc
pps_len = struct.unpack(">H", avcc[off : off + 2])[0]
off += 2 + pps_len
# If there are already bytes after the PPS section, extensions exist
if off < len(avcc):
return avcc
# Append default High Profile extensions:
# chroma_format_idc = 1 (4:2:0) -> 0xFC | 0x01 = 0xFD (reserved 111111 + 01)
# bit_depth_luma_minus8 = 0 -> 0xF8 | 0x00 = 0xF8 (reserved 11111 + 000)
# bit_depth_chroma_minus8 = 0 -> 0xF8 | 0x00 = 0xF8 (reserved 11111 + 000)
# numOfSequenceParameterSetExt = 0
ext = bytearray(avcc)
ext.append(0xFD) # 111111_01 : chroma_format_idc = 1
ext.append(0xF8) # 11111_000 : bit_depth_luma_minus8 = 0
ext.append(0xF8) # 11111_000 : bit_depth_chroma_minus8 = 0
ext.append(0x00) # numOfSequenceParameterSetExt = 0
return bytes(ext)
def ensure_avcc_extradata(extradata: bytes) -> bytes:
    """
    Ensure h264 extradata is in avcC format for the fMP4 init segment.

    PyAV returns extradata in the container's native format:
      - MKV/MP4: avcC (first byte 0x01)
      - MPEG-TS: Annex B (starts with zero bytes)
    Annex B input is converted by collecting its SPS/PPS NAL units into a
    freshly built avcC record; avcC input is validated and has missing
    High Profile extension fields repaired.
    """
    if not extradata or len(extradata) < 4:
        return extradata
    if extradata[0] == 0x01:
        # Already avcC (configurationVersion == 1): just repair extensions.
        return _fix_avcc_high_profile(extradata)
    nal_ranges = _find_annexb_nals(extradata)
    if not nal_ranges:
        return extradata
    sps_units: list[bytes] = []
    pps_units: list[bytes] = []
    for begin, stop in nal_ranges:
        # Strip zero-padding before the next start code.
        while stop > begin and extradata[stop - 1] == 0:
            stop -= 1
        if stop <= begin:
            continue
        kind = extradata[begin] & 0x1F
        if kind == 7:  # SPS
            sps_units.append(extradata[begin:stop])
        elif kind == 8:  # PPS
            pps_units.append(extradata[begin:stop])
    if not sps_units or len(sps_units[0]) < 4:
        return extradata  # can't build avcC without a usable SPS
    first_sps = sps_units[0]
    out = bytearray(
        [
            1,  # configurationVersion
            first_sps[1],  # AVCProfileIndication
            first_sps[2],  # profile_compatibility
            first_sps[3],  # AVCLevelIndication
            0xFF,  # 6 bits reserved + lengthSizeMinusOne=3 -> 4-byte NAL lengths
            0xE0 | len(sps_units),  # 3 bits reserved + numOfSPS
        ]
    )
    for unit in sps_units:
        out += struct.pack(">H", len(unit)) + unit
    out.append(len(pps_units))  # numOfPPS
    for unit in pps_units:
        out += struct.pack(">H", len(unit)) + unit
    return _fix_avcc_high_profile(bytes(out))
def extract_sps_pps_from_annexb(data: bytes) -> bytes:
    """
    Build avcC-format extradata from SPS/PPS NAL units in Annex B data.

    Hardware encoders like VideoToolbox embed SPS/PPS as in-band NAL
    units in their first keyframe output rather than setting extradata
    on the codec context; this recovers those parameter sets for the
    fMP4 init segment.

    Returns:
        avcC bytes if a usable SPS was found, empty bytes otherwise.
    """
    if not data or not is_annexb(data):
        return b""
    nal_ranges = _find_annexb_nals(data)
    if not nal_ranges:
        return b""
    sps_units: list[bytes] = []
    pps_units: list[bytes] = []
    for begin, stop in nal_ranges:
        # Strip trailing zero-padding.
        while stop > begin and data[stop - 1] == 0:
            stop -= 1
        if stop <= begin:
            continue
        kind = data[begin] & 0x1F
        if kind == 7:  # SPS
            sps_units.append(data[begin:stop])
        elif kind == 8:  # PPS
            pps_units.append(data[begin:stop])
    if not sps_units or len(sps_units[0]) < 4:
        return b""
    first_sps = sps_units[0]
    avcc = bytearray(
        [
            1,  # configurationVersion
            first_sps[1],  # AVCProfileIndication
            first_sps[2],  # profile_compatibility
            first_sps[3],  # AVCLevelIndication
            0xFF,  # 6 bits reserved + lengthSizeMinusOne=3
            0xE0 | len(sps_units),  # 3 bits reserved + numOfSPS
        ]
    )
    for unit in sps_units:
        avcc += struct.pack(">H", len(unit)) + unit
    avcc.append(len(pps_units))  # numOfPPS
    for unit in pps_units:
        avcc += struct.pack(">H", len(unit)) + unit
    return bytes(avcc)
def video_needs_reencode(codec_id: str) -> bool:
    """Check if a video codec requires re-encoding for browser playback.

    Empty/None codec IDs are treated as not needing re-encode.
    """
    return bool(codec_id) and codec_id in VIDEO_NEEDS_REENCODE
def audio_needs_transcode(codec_id: str) -> bool:
    """Check if an audio codec requires transcoding for browser playback.

    Empty/None codec IDs are treated as not needing transcode.
    """
    return bool(codec_id) and codec_id in AUDIO_NEEDS_TRANSCODE
def is_browser_compatible(video_codec: str, audio_codec: str) -> bool:
    """
    Check if a video+audio combination is fully browser-compatible.

    Returns True only if BOTH video and audio can be played natively in
    an HTML5 <video> element inside an MP4 container. An empty codec
    string (stream absent) counts as compatible.
    """
    if video_codec and video_codec not in BROWSER_VIDEO_CODECS:
        return False
    return not audio_codec or audio_codec in BROWSER_AUDIO_CODECS
class TranscodeDecision:
    """Result of analyzing a stream's codec compatibility."""

    __slots__ = ("transcode_video", "transcode_audio", "video_codec", "audio_codec")

    def __init__(self, video_codec: str = "", audio_codec: str = "") -> None:
        self.video_codec = video_codec
        self.audio_codec = audio_codec
        self.transcode_video = video_needs_reencode(video_codec)
        self.transcode_audio = audio_needs_transcode(audio_codec)

    @property
    def needs_transcode(self) -> bool:
        """True if at least one stream must be converted."""
        return self.transcode_video or self.transcode_audio

    @property
    def passthrough_ok(self) -> bool:
        """True if the stream can be served untouched to a browser."""
        return not (self.transcode_video or self.transcode_audio)

    def __repr__(self) -> str:
        actions = [
            label
            for flag, label in (
                (self.transcode_video, f"video:{self.video_codec}->h264"),
                (self.transcode_audio, f"audio:{self.audio_codec}->aac"),
            )
            if flag
        ]
        summary = ", ".join(actions) if actions else "passthrough"
        return f"TranscodeDecision({summary})"

View File

@@ -0,0 +1,614 @@
"""
Container format probing -- MKV Cues and MP4 moov.
Pure Python probing using EBML parsing (MKV) and struct-based atom
scanning (MP4). No FFmpeg dependency.
Source-agnostic: accepts any MediaSource protocol implementation
(Telegram, HTTP, etc.) for byte-range reads.
Provides:
- probe_mkv_cues: probe MKV file to extract seek index (MKVCueIndex)
- probe_mp4_moov: probe MP4 file to extract moov atom and build seek index (MP4Index)
"""
import base64
import hashlib
import json
import logging
import struct
from mediaflow_proxy.utils import redis_utils
from mediaflow_proxy.remuxer.ebml_parser import (
MKVCueIndex,
build_cue_index,
parse_ebml_header,
parse_seek_head,
CUES,
INFO,
)
from mediaflow_proxy.remuxer.mp4_parser import (
MP4Index,
build_cue_points_from_moov,
is_mp4_header,
rewrite_moov_offsets,
)
logger = logging.getLogger(__name__)
# How much of the MKV header to fetch for SeekHead + Info parsing
_HEADER_PROBE_SIZE = 64 * 1024  # 64 KB
# Max Cues element size we'll attempt to fetch
_MAX_CUES_SIZE = 2 * 1024 * 1024  # 2 MB
# Redis cache for MKV Cue indexes (key: _CUE_INDEX_CACHE_PREFIX + cache_key)
_CUE_INDEX_CACHE_PREFIX = "mfp:cue_index:"
_CUE_INDEX_CACHE_TTL = 3600  # 1 hour -- expired entries are re-probed on demand
# =============================================================================
# MKV Cues probing
# =============================================================================
def derive_cue_cache_key(
source_key: str = "",
*,
chat_id: str | int | None = None,
message_id: int | None = None,
file_id: str | None = None,
) -> str:
"""
Derive a deterministic cache key for a file's cue index.
Accepts either a pre-computed source_key (from MediaSource.cache_key)
or legacy Telegram-style parameters for backwards compatibility.
"""
if source_key:
return source_key
if file_id:
raw = f"file_id:{file_id}"
elif chat_id is not None and message_id is not None:
raw = f"chat:{chat_id}:msg:{message_id}"
else:
return ""
return hashlib.sha256(raw.encode()).hexdigest()[:16]
async def _get_cached_cue_index(cache_key: str) -> MKVCueIndex | None:
    """Try to load a MKVCueIndex from Redis cache.

    Returns None when the key is empty, Redis is unavailable, the entry
    is missing, or the cached JSON payload fails validation.
    """
    if not cache_key:
        return None
    r = await redis_utils.get_redis()
    if r is None:
        return None
    redis_key = f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}"
    data = await r.get(redis_key)
    if not data:
        return None
    try:
        d = json.loads(data)
        # Binary fields are stored base64-encoded inside the JSON payload.
        seek_header = b""
        if d.get("seek_header_b64"):
            seek_header = base64.b64decode(d["seek_header_b64"])
        video_codec_private = b""
        if d.get("video_codec_private_b64"):
            video_codec_private = base64.b64decode(d["video_codec_private_b64"])
        index = MKVCueIndex(
            duration_ms=d["duration_ms"],
            timestamp_scale=d["timestamp_scale"],
            # JSON round-trips tuples as lists; rebuild the 2-tuples.
            cue_points=[(cp[0], cp[1]) for cp in d["cue_points"]],
            segment_data_offset=d["segment_data_offset"],
            first_cluster_offset=d.get("first_cluster_offset", 0),
            seek_header=seek_header,
            audio_codec_id=d.get("audio_codec_id", ""),
            audio_bitrate=d.get("audio_bitrate", 0),
            audio_channels=d.get("audio_channels", 0),
            audio_sample_rate=d.get("audio_sample_rate", 0.0),
            video_codec_id=d.get("video_codec_id", ""),
            video_codec_private=video_codec_private,
            video_width=d.get("video_width", 0),
            video_height=d.get("video_height", 0),
            video_fps=d.get("video_fps", 0.0),
            video_default_duration_ns=d.get("video_default_duration_ns", 0),
        )
        logger.debug("[container_probe] Loaded cue index from cache: %s", cache_key)
        return index
    except (KeyError, TypeError, json.JSONDecodeError) as e:
        # A malformed/stale entry degrades to a cache miss (re-probe).
        logger.warning("[container_probe] Invalid cached cue index: %s", e)
        return None
async def _set_cached_cue_index(cache_key: str, index: MKVCueIndex) -> None:
    """Cache a MKVCueIndex in Redis.

    No-op when cache_key is empty or Redis is unavailable. Binary fields
    (seek_header, video_codec_private) are base64-encoded so the payload
    stays valid JSON. Entries expire after _CUE_INDEX_CACHE_TTL seconds.
    """
    if not cache_key:
        return
    r = await redis_utils.get_redis()
    if r is None:
        return
    redis_key = f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}"
    data = json.dumps(
        {
            "duration_ms": index.duration_ms,
            "timestamp_scale": index.timestamp_scale,
            "cue_points": index.cue_points,
            "segment_data_offset": index.segment_data_offset,
            "first_cluster_offset": index.first_cluster_offset,
            "seek_header_b64": base64.b64encode(index.seek_header).decode() if index.seek_header else "",
            "audio_codec_id": index.audio_codec_id,
            "audio_bitrate": index.audio_bitrate,
            "audio_channels": index.audio_channels,
            "audio_sample_rate": index.audio_sample_rate,
            "video_codec_id": index.video_codec_id,
            "video_codec_private_b64": base64.b64encode(index.video_codec_private).decode()
            if index.video_codec_private
            else "",
            "video_width": index.video_width,
            "video_height": index.video_height,
            "video_fps": index.video_fps,
            "video_default_duration_ns": index.video_default_duration_ns,
        }
    )
    await r.set(redis_key, data, ex=_CUE_INDEX_CACHE_TTL)
    logger.debug("[container_probe] Cached cue index: %s", cache_key)
async def probe_mkv_cues(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MKVCueIndex | None:
    """
    Probe an MKV file's EBML header and Cues to build a seek index.

    Pure Python -- parses EBML structures directly, no FFmpeg involved.
    Makes up to two small byte-range reads via the provided source:
    1. First ~64KB: EBML header + SeekHead + Info (skipped if header_data provided)
    2. Cues section: byte range from SeekHead's Cues position

    Args:
        source: A MediaSource protocol implementation, or any object with
            a ``stream(offset, limit)`` async generator method.
        file_size: Total file size in bytes. If 0, tries ``source.file_size``.
        cache_key: Optional cache key for Redis caching. If empty, tries
            ``source.cache_key``.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MKVCueIndex if successful, None if the file has no Cues or parsing fails.
    """
    # Resolve file_size and cache_key from source if not provided.
    # ``or`` normalizes sources whose attributes exist but are None,
    # which would otherwise break the `file_size > 0` comparisons below.
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0) or 0
    if not cache_key:
        cache_key = getattr(source, "cache_key", "") or ""
    # Check cache first
    if cache_key:
        cached = await _get_cached_cue_index(cache_key)
        if cached:
            return cached
    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _HEADER_PROBE_SIZE
            # Collect chunks and join once -- repeated ``bytes +=`` is
            # quadratic in the number of chunks.
            header_chunks: list[bytes] = []
            async for chunk in source.stream(offset=0, limit=header_size):
                header_chunks.append(chunk)
            header_data = b"".join(header_chunks)
        if len(header_data) < 64:
            logger.warning("[container_probe] Header too small (%d bytes), cannot probe", len(header_data))
            return None
        # Step 2: Parse EBML header to find Segment data offset
        segment_data_offset = parse_ebml_header(header_data)
        # Step 3: Parse SeekHead to find Cues and Info positions
        seek_positions = parse_seek_head(header_data, segment_data_offset)
        if CUES not in seek_positions:
            logger.info("[container_probe] No Cues position in SeekHead, seeking not available")
            return None
        cues_relative_offset = seek_positions[CUES]
        cues_absolute_offset = segment_data_offset + cues_relative_offset
        logger.info(
            "[container_probe] SeekHead: Cues at offset %d (absolute %d), Info at %s",
            cues_relative_offset,
            cues_absolute_offset,
            seek_positions.get(INFO, "not found"),
        )
        # Step 4: Fetch the Cues element
        cues_max = file_size - cues_absolute_offset if file_size > 0 else _MAX_CUES_SIZE
        cues_fetch_size = min(_MAX_CUES_SIZE, cues_max)
        if cues_fetch_size <= 0:
            logger.warning("[container_probe] Cues offset %d beyond file size %d", cues_absolute_offset, file_size)
            return None
        cues_chunks: list[bytes] = []
        async for chunk in source.stream(offset=cues_absolute_offset, limit=cues_fetch_size):
            cues_chunks.append(chunk)
        cues_data = b"".join(cues_chunks)
        if len(cues_data) < 16:
            logger.warning("[container_probe] Cues data too small (%d bytes)", len(cues_data))
            return None
        # Step 5: Build the cue index
        index = build_cue_index(
            header_data=header_data,
            cues_data=cues_data,
            cues_file_offset=cues_absolute_offset,
            segment_data_offset=segment_data_offset,
        )
        # Cache the result
        if cache_key:
            await _set_cached_cue_index(cache_key, index)
        return index
    except Exception as e:
        # Broad catch is intentional: probing is best-effort; any parse or
        # I/O failure just disables seeking instead of breaking playback.
        logger.warning("[container_probe] Failed to probe MKV cues: %s", e)
        return None
# =============================================================================
# MP4 Moov probing
# =============================================================================
# Redis cache for MP4 indexes (key: _MP4_INDEX_CACHE_PREFIX + cache_key)
_MP4_INDEX_CACHE_PREFIX = "mfp:mp4_index:"
_MP4_INDEX_CACHE_TTL = 3600  # 1 hour
# How much to read from the start for ftyp + initial atom scanning
_MP4_HEADER_PROBE_SIZE = 64 * 1024  # 64 KB
# Max moov size we'll accept (larger moovs are rejected rather than fetched)
_MAX_MOOV_SIZE = 50 * 1024 * 1024  # 50 MB
# How much to read from the end of the file to find moov
_MP4_TAIL_PROBE_SIZE = 512 * 1024  # 512 KB
async def _get_cached_mp4_index(cache_key: str) -> MP4Index | None:
    """Fetch a previously cached MP4Index from Redis, if available.

    Returns None when caching is disabled, Redis is unreachable, the key
    is absent, or the cached payload cannot be decoded. ``moov_data`` is
    never cached (too large) and must be re-fetched by the caller.
    """
    if not cache_key:
        return None
    client = await redis_utils.get_redis()
    if client is None:
        return None
    payload = await client.get(f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}")
    if not payload:
        return None
    try:
        doc = json.loads(payload)
        ftyp_bytes = base64.b64decode(doc["ftyp_data_b64"]) if doc.get("ftyp_data_b64") else b""
        index = MP4Index(
            duration_ms=doc["duration_ms"],
            timescale=doc["timescale"],
            cue_points=[(entry[0], entry[1]) for entry in doc["cue_points"]],
            moov_offset=doc["moov_offset"],
            moov_size=doc["moov_size"],
            ftyp_data=ftyp_bytes,
            mdat_offset=doc["mdat_offset"],
            mdat_size=doc["mdat_size"],
            video_codec=doc.get("video_codec", ""),
            audio_codec=doc.get("audio_codec", ""),
            # moov_data is NOT cached (too large), it will be re-fetched
        )
        logger.debug("[container_probe] Loaded MP4 index from cache: %s", cache_key)
        return index
    except (KeyError, TypeError, json.JSONDecodeError) as e:
        logger.warning("[container_probe] Invalid cached MP4 index: %s", e)
        return None
async def _set_cached_mp4_index(cache_key: str, index: MP4Index) -> None:
    """Persist an MP4Index to Redis with a TTL (moov_data deliberately excluded)."""
    if not cache_key:
        return
    client = await redis_utils.get_redis()
    if client is None:
        return
    doc = {
        "duration_ms": index.duration_ms,
        "timescale": index.timescale,
        "cue_points": index.cue_points,
        "moov_offset": index.moov_offset,
        "moov_size": index.moov_size,
        "ftyp_data_b64": base64.b64encode(index.ftyp_data).decode() if index.ftyp_data else "",
        "mdat_offset": index.mdat_offset,
        "mdat_size": index.mdat_size,
        "video_codec": index.video_codec,
        "audio_codec": index.audio_codec,
    }
    await client.set(
        f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}",
        json.dumps(doc),
        ex=_MP4_INDEX_CACHE_TTL,
    )
    logger.debug("[container_probe] Cached MP4 index: %s", cache_key)
def _scan_top_level_atoms(data: bytes) -> list[tuple[bytes, int, int]]:
"""
Scan top-level atom headers from raw file bytes.
Returns:
List of (box_type, absolute_offset, total_size) for each atom found.
"""
atoms = []
offset = 0
while offset + 8 <= len(data):
size = struct.unpack_from(">I", data, offset)[0]
box_type = data[offset + 4 : offset + 8]
if size == 1: # Extended size
if offset + 16 > len(data):
break
size = struct.unpack_from(">Q", data, offset + 8)[0]
elif size == 0:
# Extends to end of file - we can't know the real size from
# a partial read, but record what we have
atoms.append((box_type, offset, 0))
break
if size < 8:
break
atoms.append((box_type, offset, size))
offset += size
return atoms
async def probe_mp4_moov(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MP4Index | None:
    """
    Probe an MP4 file's moov atom to build a seek index.

    Pure Python -- scans MP4 box headers with struct, no FFmpeg involved.

    Strategy:
    1. Read first ~64KB to check for ftyp (MP4 signature).
    2. Scan top-level atoms to find moov and mdat.
    3. If moov is at the start (faststart), read it from the header data.
    4. If moov is not in the header, read from the tail of the file.
    5. Parse moov sample tables to build cue points.

    Args:
        source: A MediaSource protocol implementation with stream(offset, limit).
        file_size: Total file size in bytes.
        cache_key: Optional cache key for Redis caching.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MP4Index if successful, None if not an MP4 or parsing fails.
        Never raises: any parsing/IO failure is logged and mapped to None.
    """
    # Fall back to source-provided metadata when the caller didn't supply it.
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0)
    if not cache_key:
        cache_key = getattr(source, "cache_key", "")
    # Check cache first
    if cache_key:
        cached = await _get_cached_mp4_index(cache_key)
        if cached:
            # Re-fetch moov_data (not cached due to size) and rewrite offsets
            if cached.moov_size > 0 and cached.moov_size <= _MAX_MOOV_SIZE:
                moov_data = b""
                async for chunk in source.stream(offset=cached.moov_offset, limit=cached.moov_size):
                    moov_data += chunk
                if cached.mdat_offset >= 0:
                    # Same shift as Step 8 below: stco/co64 must point into the
                    # pipe layout (ftyp + moov + mdat), not the original file.
                    new_mdat_start = len(cached.ftyp_data) + cached.moov_size
                    offset_delta = new_mdat_start - cached.mdat_offset
                    if offset_delta != 0:
                        moov_data = rewrite_moov_offsets(moov_data, offset_delta)
                cached.moov_data = moov_data
            return cached
    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_MP4_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _MP4_HEADER_PROBE_SIZE
            header_data = b""
            async for chunk in source.stream(offset=0, limit=header_size):
                header_data += chunk
        if len(header_data) < 12:
            return None
        # Step 2: Check for ftyp
        if not is_mp4_header(header_data):
            return None
        logger.info("[container_probe] MP4 detected, scanning atoms (header=%d bytes)", len(header_data))
        # Step 3: Scan top-level atoms from header
        atoms = _scan_top_level_atoms(header_data)
        ftyp_offset = -1
        ftyp_size = 0
        moov_offset = -1
        moov_size = 0
        mdat_offset = -1
        mdat_size = 0
        for box_type, atom_offset, atom_size in atoms:
            if box_type == b"ftyp":
                ftyp_offset = atom_offset
                ftyp_size = atom_size
            elif box_type == b"moov":
                moov_offset = atom_offset
                moov_size = atom_size
            elif box_type == b"mdat":
                mdat_offset = atom_offset
                mdat_size = atom_size
        # Step 4: If moov not found in header, scan from tail
        # (non-faststart files place moov after mdat, near end of file).
        if moov_offset < 0 and file_size > 0:
            tail_start = max(0, file_size - _MP4_TAIL_PROBE_SIZE)
            tail_data = b""
            async for chunk in source.stream(offset=tail_start, limit=file_size - tail_start):
                tail_data += chunk
            if tail_data:
                tail_atoms = _scan_top_level_atoms(tail_data)
                for box_type, rel_offset, atom_size in tail_atoms:
                    abs_offset = tail_start + rel_offset
                    if box_type == b"moov":
                        moov_offset = abs_offset
                        moov_size = atom_size
                    elif box_type == b"mdat" and mdat_offset < 0:
                        mdat_offset = abs_offset
                        mdat_size = atom_size
                # If the initial scan yielded no moov (tail_start may land
                # inside a large mdat payload producing garbage atom headers),
                # resync by scanning 8-byte aligned windows for b"moov".
                if moov_offset < 0:
                    needle = b"moov"
                    search_pos = 0
                    while search_pos + 8 <= len(tail_data):
                        idx = tail_data.find(needle, search_pos)
                        if idx < 0 or idx < 4:
                            break
                        # The 4 bytes preceding the fourcc hold the box size;
                        # accept the match only if that size is plausible.
                        candidate_size = struct.unpack_from(">I", tail_data, idx - 4)[0]
                        if 8 < candidate_size <= _MAX_MOOV_SIZE:
                            moov_offset = tail_start + idx - 4
                            moov_size = candidate_size
                            break
                        search_pos = idx + 4
        if moov_offset < 0:
            logger.info("[container_probe] No moov atom found in MP4")
            return None
        if moov_size <= 0 or moov_size > _MAX_MOOV_SIZE:
            logger.warning("[container_probe] moov size %d is invalid or too large", moov_size)
            return None
        logger.info(
            "[container_probe] MP4 atoms: moov at %d (%d bytes), mdat at %d (%d bytes)",
            moov_offset,
            moov_size,
            mdat_offset,
            mdat_size,
        )
        # Step 5: Fetch full moov atom
        # Check if moov is already contained in the header data we read
        if moov_offset + moov_size <= len(header_data):
            moov_data = header_data[moov_offset : moov_offset + moov_size]
        else:
            moov_data = b""
            async for chunk in source.stream(offset=moov_offset, limit=moov_size):
                moov_data += chunk
        if len(moov_data) < moov_size:
            logger.warning(
                "[container_probe] Incomplete moov: got %d of %d bytes",
                len(moov_data),
                moov_size,
            )
            return None
        # Step 6: Parse moov body (skip box header)
        # Determine header size (8 bytes standard, 16 with 64-bit extended size)
        raw_size = struct.unpack_from(">I", moov_data, 0)[0]
        hdr_size = 16 if raw_size == 1 else 8
        moov_body = moov_data[hdr_size:]
        cue_points, duration_ms, timescale, video_codec, audio_codec = build_cue_points_from_moov(moov_body)
        # If mdat wasn't found via header scan, it's likely right after ftyp
        # or right after moov. Common layouts:
        #   ftyp + moov + mdat (faststart) or ftyp + mdat + moov
        if mdat_offset < 0:
            # Walk atoms to find mdat by scanning just enough from the file
            # In most cases, mdat is either before or after moov
            if moov_offset < file_size // 2:
                # moov is early -> mdat likely follows
                mdat_search_offset = moov_offset + moov_size
            else:
                # moov is late -> mdat likely right after ftyp
                ftyp_size = struct.unpack_from(">I", header_data, 0)[0]
                if ftyp_size == 1:
                    ftyp_size = struct.unpack_from(">Q", header_data, 8)[0]
                mdat_search_offset = ftyp_size
            # Read a small amount to find the mdat header
            mdat_header = b""
            async for chunk in source.stream(offset=mdat_search_offset, limit=16):
                mdat_header += chunk
            if len(mdat_header) >= 8:
                box_type = mdat_header[4:8]
                if box_type == b"mdat":
                    mdat_offset = mdat_search_offset
                    raw_sz = struct.unpack_from(">I", mdat_header, 0)[0]
                    if raw_sz == 1 and len(mdat_header) >= 16:
                        mdat_size = struct.unpack_from(">Q", mdat_header, 8)[0]
                    else:
                        mdat_size = raw_sz
        # Step 7: Extract ftyp data (always in the header since it's the first atom)
        ftyp_data = b""
        if ftyp_offset >= 0 and ftyp_size > 0 and ftyp_offset + ftyp_size <= len(header_data):
            ftyp_data = header_data[ftyp_offset : ftyp_offset + ftyp_size]
        # Step 8: Rewrite moov chunk offsets for faststart pipe layout.
        # The pipe stream will be: ftyp + moov + mdat. The stco/co64
        # offsets in the original moov point to positions in the original
        # file. We need to shift them to account for the new layout.
        #   New mdat position = ftyp_size + moov_size
        #   Delta = new_mdat_position - original_mdat_offset
        if mdat_offset >= 0:
            new_mdat_start = len(ftyp_data) + moov_size
            offset_delta = new_mdat_start - mdat_offset
            if offset_delta != 0:
                moov_data = rewrite_moov_offsets(moov_data, offset_delta)
        index = MP4Index(
            duration_ms=duration_ms,
            timescale=timescale,
            cue_points=cue_points,
            moov_offset=moov_offset,
            moov_size=moov_size,
            moov_data=moov_data,
            ftyp_data=ftyp_data,
            mdat_offset=mdat_offset,
            mdat_size=mdat_size,
            video_codec=video_codec,
            audio_codec=audio_codec,
        )
        logger.info(
            "[container_probe] MP4 index: duration=%.1fs, %d cue points, video=%s, audio=%s",
            duration_ms / 1000.0,
            len(cue_points),
            video_codec,
            audio_codec,
        )
        if cache_key:
            await _set_cached_mp4_index(cache_key, index)
        return index
    except Exception as e:
        logger.warning("[container_probe] Failed to probe MP4 moov: %s", e)
        return None

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,151 @@
"""
HLS VOD playlist generator for on-the-fly fMP4 transcoding.
Produces an M3U8 VOD playlist from an ``MKVCueIndex`` or ``MP4Index``.
Consecutive keyframes that are closer together than the target segment
duration are merged into a single HLS segment, matching the behaviour
of ``ffmpeg -hls_time``.
The init segment is referenced via ``#EXT-X-MAP``.
Requires ``#EXT-X-VERSION:7`` for fMP4 (CMAF) segments.
"""
from __future__ import annotations
import math
from typing import TYPE_CHECKING
if TYPE_CHECKING:
pass
def merge_cue_points(
    cue_points: list[tuple[float, int]],
    target_duration_ms: float = 5000.0,
) -> list[tuple[float, int]]:
    """Merge consecutive keyframes into segments of *>= target_duration_ms*.

    Mirrors ``ffmpeg -hls_time`` behaviour: a keyframe starts a new segment
    only when it lies at least ``target_duration_ms`` after the current
    segment's start; anything closer is absorbed into the current segment.

    Side-effects:
    * Duplicate byte offsets are removed (earliest timestamp wins).
    * Very short "runt" segments (e.g. 0.3 s) are eliminated.

    Args:
        cue_points: Sorted ``(time_ms, byte_offset)`` list.
        target_duration_ms: Minimum segment duration in milliseconds.

    Returns:
        A reduced list of ``(time_ms, byte_offset)`` tuples representing
        the merged segment boundaries.
    """
    if not cue_points:
        return []
    # Deduplicate byte offsets, keeping the EARLIEST timestamp per offset.
    # Some MKV files expose several cue times for one cluster offset; keeping
    # a later duplicate would desync segment start times from the actual
    # bytes and can cause timestamp regressions. Sorting by (time, offset)
    # makes "earliest wins" deterministic.
    unique: list[tuple[float, int]] = []
    offsets_seen: set[int] = set()
    for ts, off in sorted(cue_points, key=lambda cp: (cp[0], cp[1])):
        if off in offsets_seen:
            continue
        offsets_seen.add(off)
        unique.append((ts, off))
    if not unique:
        return []
    # Greedily absorb keyframes until the target window has elapsed.
    boundaries: list[tuple[float, int]] = [unique[0]]
    for candidate in unique[1:]:
        if candidate[0] - boundaries[-1][0] >= target_duration_ms:
            boundaries.append(candidate)
    return boundaries
def generate_vod_playlist(
    cue_points: list[tuple[float, int]],
    duration_ms: float,
    init_url: str,
    segment_url_template: str,
    target_segment_duration_ms: float = 5000.0,
) -> str:
    """Build an HLS VOD M3U8 playlist from cue-point data.

    Keyframes closer together than *target_segment_duration_ms* are merged
    into one segment (same policy as ``ffmpeg -hls_time``). Segment URLs
    come from *segment_url_template* with its ``{seg}``, ``{start_ms}`` and
    ``{end_ms}`` placeholders substituted per segment.

    Args:
        cue_points: Sorted list of ``(time_ms, byte_offset)`` tuples.
        duration_ms: Total media duration in milliseconds.
        init_url: URL for the fMP4 init segment (``#EXT-X-MAP`` URI).
        segment_url_template: URL template containing ``{seg}``,
            ``{start_ms}`` and ``{end_ms}`` placeholders.
        target_segment_duration_ms: Target minimum segment duration.

    Returns:
        Complete M3U8 playlist string ('' when there are no cue points).
    """
    if not cue_points:
        return ""
    boundaries = merge_cue_points(cue_points, target_segment_duration_ms)
    # Derive (start_ms, end_ms, duration_s) per segment; the final segment
    # runs to the end of the media. Durations are floored at 1 ms.
    segments: list[tuple[float, float, float]] = []
    for idx, (seg_start, _offset) in enumerate(boundaries):
        seg_end = boundaries[idx + 1][0] if idx + 1 < len(boundaries) else duration_ms
        segments.append((seg_start, seg_end, max((seg_end - seg_start) / 1000.0, 0.001)))
    if not segments:
        return ""
    # EXT-X-TARGETDURATION must be >= the longest segment, rounded up.
    longest = max(seg[2] for seg in segments)
    target_duration = max(math.ceil(longest), 1)
    out: list[str] = [
        "#EXTM3U",
        "#EXT-X-VERSION:7",
        f"#EXT-X-TARGETDURATION:{target_duration}",
        "#EXT-X-PLAYLIST-TYPE:VOD",
        "#EXT-X-MEDIA-SEQUENCE:0",
        f'#EXT-X-MAP:URI="{init_url}"',
    ]
    for seg_num, (seg_start, seg_end, seg_dur) in enumerate(segments):
        out.append(f"#EXTINF:{seg_dur:.3f},")
        out.append(
            segment_url_template.replace("{seg}", str(seg_num))
            .replace("{start_ms}", str(int(seg_start)))
            .replace("{end_ms}", str(int(seg_end)))
        )
    out.append("#EXT-X-ENDLIST")
    out.append("")  # trailing newline
    return "\n".join(out)

View File

@@ -0,0 +1,234 @@
"""
Abstract media source protocol for source-agnostic transcode pipeline.
Decouples the transcode pipeline, MKV cue probing, and seeking logic
from any specific transport (Telegram, HTTP, etc.). Each transport
implements the MediaSource protocol to provide byte-range streaming.
"""
import hashlib
import logging
from collections.abc import AsyncIterator
from typing import Protocol, runtime_checkable
from urllib.parse import urlparse, unquote
from mediaflow_proxy.utils.http_client import create_aiohttp_session
from mediaflow_proxy.utils.telegram import telegram_manager
logger = logging.getLogger(__name__)
# Extensions mapped to container format hints used by transcode_handler
# EBML-based containers (Matroska/WebM).
_MKV_EXTENSIONS = frozenset({".mkv", ".webm"})
# ISO BMFF / QuickTime-family containers.
_MP4_EXTENSIONS = frozenset({".mp4", ".m4v", ".mov", ".m4a", ".3gp"})
def _extract_extension(path: str) -> str:
"""Extract lowercase file extension (e.g. '.mkv') from a path or URL."""
# Strip query/fragment first for URL paths
dot_pos = path.rfind(".")
if dot_pos < 0:
return ""
ext = path[dot_pos:].lower()
# Trim anything after the extension (query params from raw paths)
for ch in ("?", "#", "&"):
idx = ext.find(ch)
if idx > 0:
ext = ext[:idx]
return ext
def filename_hint_from_url(url: str) -> str:
    """Derive a lowercase extension hint (e.g. '.mkv', '.mp4') from a URL's path."""
    try:
        return _extract_extension(unquote(urlparse(url).path))
    except Exception:
        # Malformed URLs simply yield no hint.
        return ""
def filename_hint_from_name(filename: str) -> str:
    """Derive a lowercase extension hint from a bare filename string."""
    if not filename:
        return ""
    return _extract_extension(filename)
@runtime_checkable
class MediaSource(Protocol):
    """
    Protocol for streaming media byte ranges.

    Implementations must provide:
    - stream(): async iterator of bytes from offset/limit
    - file_size: total file size in bytes
    - cache_key: deterministic key for caching (cue index, etc.)
    - filename_hint: optional file extension hint (e.g. '.mkv', '.mp4')

    ``@runtime_checkable`` permits ``isinstance(obj, MediaSource)`` checks
    (structural: only the presence of members is verified).
    """

    @property
    def file_size(self) -> int:
        """Total file size in bytes."""
        ...

    @property
    def cache_key(self) -> str:
        """Deterministic cache key derived from the source identity."""
        ...

    @property
    def filename_hint(self) -> str:
        """Optional file extension hint (e.g. '.mkv', '.mp4') for format detection."""
        ...

    async def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
        """
        Stream bytes from the source.

        Args:
            offset: Byte offset to start from.
            limit: Number of bytes to read. None = read to end.

        Yields:
            Chunks of bytes.
        """
        ...
class TelegramMediaSource:
"""
MediaSource backed by Telegram MTProto downloads.
Supports two download modes:
* **parallel** (default): Uses ``ParallelTransferrer`` with multiple
MTProtoSender connections for maximum throughput. Best for full-file
streaming (e.g. ``/proxy/telegram/stream``).
* **single** (``use_single_client=True``): Uses Telethon's built-in
``iter_download`` over the existing client connection. Avoids the
overhead of creating/destroying extra connections for each request,
ideal for small byte-range fetches like HLS segments and probe
headers.
"""
def __init__(
self,
telegram_ref,
file_size: int,
file_name: str = "",
*,
use_single_client: bool = False,
) -> None:
self._ref = telegram_ref
self._file_size = file_size
self._filename_hint = filename_hint_from_name(file_name)
self._use_single_client = use_single_client
@property
def file_size(self) -> int:
return self._file_size
@property
def cache_key(self) -> str:
ref = self._ref
if ref.file_id:
raw = f"file_id:{ref.file_id}"
elif ref.chat_id is not None and ref.message_id is not None:
raw = f"chat:{ref.chat_id}:msg:{ref.message_id}"
else:
return ""
return hashlib.sha256(raw.encode()).hexdigest()[:16]
@property
def filename_hint(self) -> str:
return self._filename_hint
async def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
effective_limit = limit or self._file_size
if self._use_single_client:
async for chunk in telegram_manager.stream_media_single(
self._ref,
offset=offset,
limit=effective_limit,
file_size=self._file_size,
):
yield chunk
else:
async for chunk in telegram_manager.stream_media(
self._ref,
offset=offset,
limit=effective_limit,
file_size=self._file_size,
):
yield chunk
class HTTPMediaSource:
"""MediaSource backed by HTTP byte-range requests via aiohttp."""
def __init__(self, url: str, headers: dict | None = None, file_size: int = 0) -> None:
self._url = url
self._headers = headers or {}
self._file_size = file_size
self._filename_hint = filename_hint_from_url(url)
@property
def file_size(self) -> int:
return self._file_size
@property
def cache_key(self) -> str:
return hashlib.sha256(self._url.encode()).hexdigest()[:16]
@property
def filename_hint(self) -> str:
return self._filename_hint
async def resolve_file_size(self) -> int:
"""Perform a HEAD request to determine file size if not already known."""
if self._file_size > 0:
return self._file_size
async with create_aiohttp_session(self._url, headers=self._headers) as (session, proxy_url):
async with session.head(
self._url,
headers=self._headers,
proxy=proxy_url,
allow_redirects=True,
) as resp:
cl = resp.headers.get("content-length")
if cl:
self._file_size = int(cl)
else:
# Try GET with range to get content-range
async with session.get(
self._url,
headers={**self._headers, "range": "bytes=0-0"},
proxy=proxy_url,
allow_redirects=True,
) as range_resp:
cr = range_resp.headers.get("content-range", "")
if "/" in cr:
try:
self._file_size = int(cr.split("/")[-1])
except ValueError:
pass
return self._file_size
async def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
headers = dict(self._headers)
if offset > 0 or limit is not None:
end = ""
if limit is not None:
end = str(offset + limit - 1)
headers["range"] = f"bytes={offset}-{end}"
async with create_aiohttp_session(self._url, headers=headers) as (session, proxy_url):
async with session.get(
self._url,
headers=headers,
proxy=proxy_url,
allow_redirects=True,
) as resp:
resp.raise_for_status()
async for chunk in resp.content.iter_any():
yield chunk

View File

@@ -0,0 +1,469 @@
"""
Streaming MKV demuxer.
Reads an MKV byte stream via an async iterator and yields individual media
frames (MKVFrame) with absolute timestamps. Designed for on-the-fly remuxing
without buffering the entire file.
Architecture:
AsyncIterator[bytes] -> StreamBuffer -> EBML parsing -> MKVFrame yields
The demuxer works in two phases:
1. read_header(): Consume bytes until Tracks is fully parsed, returning
a list of MKVTrack with codec metadata.
2. iter_frames(): Yield MKVFrame objects from Cluster/SimpleBlock data
as clusters arrive.
"""
import logging
from collections import deque
from collections.abc import AsyncIterator
from dataclasses import dataclass, field

from mediaflow_proxy.remuxer.ebml_parser import (
    CLUSTER,
    CLUSTER_TIMESTAMP,
    EBML_HEADER,
    INFO,
    MKVFrame,
    MKVTrack,
    SEGMENT,
    SIMPLE_BLOCK,
    BLOCK_GROUP,
    TRACKS,
    TIMESTAMP_SCALE,
    DURATION,
    UNKNOWN_SIZE,
    extract_block_frames,
    parse_tracks,
    read_element_id,
    read_element_size,
    read_float,
    read_uint,
    _parse_block_group,
    iter_elements,
)
logger = logging.getLogger(__name__)
class StreamBuffer:
    """
    Accumulating byte buffer for streaming EBML parsing.

    Collects chunks from an async byte source and provides read-ahead
    capabilities for EBML element parsing. Supports consuming parsed
    bytes to keep memory usage bounded.

    Chunks are stored in a deque so that removing from the front is O(1)
    per chunk; the previous list-based version paid an O(n) shift on every
    ``list.pop(0)`` while consuming.
    """

    def __init__(self) -> None:
        self._chunks: deque[bytes] = deque()
        self._total: int = 0  # Bytes currently buffered
        self._consumed: int = 0  # Logical bytes consumed (for offset tracking)

    @property
    def available(self) -> int:
        """Number of buffered bytes available for reading."""
        return self._total

    @property
    def consumed(self) -> int:
        """Total bytes consumed so far (for absolute offset tracking)."""
        return self._consumed

    def append(self, data: bytes) -> None:
        """Add bytes to the buffer (empty chunks are ignored)."""
        if data:
            self._chunks.append(data)
            self._total += len(data)

    def peek(self, size: int) -> bytes:
        """Read up to size bytes without consuming."""
        if size <= 0:
            return b""
        result = bytearray()
        remaining = size
        for chunk in self._chunks:
            if remaining <= 0:
                break
            take = min(len(chunk), remaining)
            result.extend(chunk[:take])
            remaining -= take
        return bytes(result)

    def get_all(self) -> bytes:
        """Get all buffered data as a single bytes object (without consuming).

        Coalesces the internal chunks into one as a side effect, so repeated
        calls are cheap until new data is appended.
        """
        if len(self._chunks) == 1:
            return self._chunks[0]
        data = b"".join(self._chunks)
        self._chunks = deque([data])
        return data

    def consume(self, size: int) -> bytes:
        """Remove and return size bytes from the front of the buffer."""
        if size <= 0:
            return b""
        if size > self._total:
            size = self._total
        result = bytearray()
        remaining = size
        while remaining > 0 and self._chunks:
            chunk = self._chunks[0]
            if len(chunk) <= remaining:
                # Whole chunk consumed: O(1) removal from the left.
                result.extend(chunk)
                remaining -= len(chunk)
                self._chunks.popleft()
            else:
                # Partial chunk: keep the tail in place.
                result.extend(chunk[:remaining])
                self._chunks[0] = chunk[remaining:]
                remaining = 0
        consumed = len(result)
        self._total -= consumed
        self._consumed += consumed
        return bytes(result)

    def skip(self, size: int) -> int:
        """Discard size bytes from the front. Returns actual bytes skipped."""
        if size <= 0:
            return 0
        actual = min(size, self._total)
        remaining = actual
        while remaining > 0 and self._chunks:
            chunk = self._chunks[0]
            if len(chunk) <= remaining:
                remaining -= len(chunk)
                self._chunks.popleft()
            else:
                self._chunks[0] = chunk[remaining:]
                remaining = 0
        self._total -= actual
        self._consumed += actual
        return actual
@dataclass
class MKVHeader:
    """Parsed MKV header metadata, populated by MKVDemuxer.read_header()."""

    # Track metadata (codec IDs, numbers) parsed from the Tracks element.
    tracks: list[MKVTrack] = field(default_factory=list)
    # Nanoseconds per timestamp tick (TimestampScale element).
    timestamp_scale_ns: int = 1_000_000  # Default 1ms
    # Total duration in milliseconds; stays 0.0 if Info has no Duration.
    duration_ms: float = 0.0
    segment_data_offset: int = 0  # Absolute byte offset of Segment children
class MKVDemuxer:
    """
    Streaming async MKV demuxer.

    Reads an MKV byte stream from an async iterator and provides:
    - read_header(): Parse EBML header + Segment metadata + Tracks
    - iter_frames(): Yield MKVFrame objects from Clusters

    Usage:
        demuxer = MKVDemuxer()
        header = await demuxer.read_header(source)
        async for frame in demuxer.iter_frames(source):
            process(frame)
    """

    # Minimum bytes to try parsing an element header (ID + size)
    _MIN_ELEMENT_HEADER = 12

    def __init__(self) -> None:
        self._buf = StreamBuffer()
        self._header: MKVHeader | None = None
        self._scale_ms: float = 1.0  # timestamp_scale / 1_000_000

    @property
    def header(self) -> MKVHeader | None:
        """Parsed header metadata, or None until read_header() completes."""
        return self._header

    async def read_header(self, source: AsyncIterator[bytes]) -> MKVHeader:
        """
        Read and parse the MKV header (EBML header, Segment, Info, Tracks).

        Consumes bytes from source until Tracks is fully parsed. Any leftover
        bytes (start of first Cluster) remain in the internal buffer for
        iter_frames().

        Returns:
            MKVHeader with track info and timing metadata.

        Raises:
            ValueError: If the stream is not MKV (bad EBML/Segment headers)
                or ends before an EBML header can be read.
        """
        header = MKVHeader()
        # Phase 1: Accumulate enough data for EBML header + Segment header
        await self._ensure_bytes(source, 64)
        data = self._buf.get_all()
        if len(data) < 4:
            raise ValueError(
                f"Source ended prematurely: got {len(data)} bytes, need at least an EBML header (source disconnected?)"
            )
        pos = 0
        # Parse EBML Header
        eid, pos = read_element_id(data, pos)
        if eid != EBML_HEADER:
            raise ValueError(f"Not an MKV file: expected EBML header, got 0x{eid:X}")
        size, pos = read_element_size(data, pos)
        if size == UNKNOWN_SIZE:
            raise ValueError("EBML header has unknown size")
        pos += size  # Skip EBML header content
        # Parse Segment element header
        eid, pos = read_element_id(data, pos)
        if eid != SEGMENT:
            raise ValueError(f"Expected Segment, got 0x{eid:X}")
        _seg_size, pos = read_element_size(data, pos)
        header.segment_data_offset = self._buf.consumed + pos
        # Phase 2: Parse Segment children until we have Tracks
        # We need to iterate top-level Segment children: SeekHead, Info, Tracks
        # Stop when we hit the first Cluster (media data).
        tracks_found = False
        while not tracks_found:
            # Ensure we have enough for element header
            await self._ensure_bytes(source, pos + self._MIN_ELEMENT_HEADER)
            data = self._buf.get_all()
            if pos >= len(data):
                break
            try:
                eid, pos2 = read_element_id(data, pos)
                size, pos3 = read_element_size(data, pos2)
            except (ValueError, IndexError):
                # Possibly a header truncated at the buffer edge: fetch a
                # little more and retry once before giving up.
                await self._ensure_bytes(source, pos + 32)
                data = self._buf.get_all()
                try:
                    eid, pos2 = read_element_id(data, pos)
                    size, pos3 = read_element_size(data, pos2)
                except (ValueError, IndexError):
                    break
            if eid == CLUSTER:
                # Reached media data; header parsing is done.
                # Don't consume the Cluster -- leave it for iter_frames.
                break
            if size == UNKNOWN_SIZE:
                # Can't handle unknown-size elements in header
                logger.warning("[mkv_demuxer] Unknown-size element 0x%X in header at pos %d", eid, pos)
                break
            # Ensure we have the full element
            elem_end = pos3 + size
            await self._ensure_bytes(source, elem_end)
            data = self._buf.get_all()
            if eid == INFO:
                self._parse_info_element(data, pos3, pos3 + size, header)
            elif eid == TRACKS:
                header.tracks = parse_tracks(data, pos3, pos3 + size)
                tracks_found = True
                logger.info(
                    "[mkv_demuxer] Parsed %d tracks: %s",
                    len(header.tracks),
                    ", ".join(f"#{t.track_number}={t.codec_id}" for t in header.tracks),
                )
            pos = elem_end
        # Consume everything up to the current position (Cluster boundary)
        self._buf.consume(pos)
        # Set timing scale
        self._scale_ms = header.timestamp_scale_ns / 1_000_000.0
        self._header = header
        return header

    async def iter_frames(self, source: AsyncIterator[bytes]) -> AsyncIterator[MKVFrame]:
        """
        Yield MKVFrame objects from Cluster/SimpleBlock data.

        Must be called after read_header(). Continues consuming bytes from
        source, parsing Clusters and yielding individual frames. Known-size
        Clusters are buffered fully before parsing; unknown-size Clusters
        are parsed child-by-child.
        """
        if self._header is None:
            raise RuntimeError("read_header() must be called before iter_frames()")
        while True:
            # Try to read the next element header
            if not await self._ensure_bytes_soft(source, self._MIN_ELEMENT_HEADER):
                break
            data = self._buf.get_all()
            pos = 0
            try:
                eid, pos2 = read_element_id(data, pos)
                size, pos3 = read_element_size(data, pos2)
            except (ValueError, IndexError):
                # Try to get more data
                if not await self._ensure_bytes_soft(source, len(data) + 4096):
                    break
                data = self._buf.get_all()
                try:
                    eid, pos2 = read_element_id(data, pos)
                    size, pos3 = read_element_size(data, pos2)
                except (ValueError, IndexError):
                    break
            if eid == CLUSTER:
                if size == UNKNOWN_SIZE:
                    # Unknown-size Cluster: parse children until we hit the next
                    # Cluster or run out of data
                    self._buf.consume(pos3)  # consume Cluster header
                    async for frame in self._parse_unknown_size_cluster(source):
                        yield frame
                else:
                    # Known-size Cluster: ensure we have all data
                    elem_end = pos3 + size
                    await self._ensure_bytes(source, elem_end)
                    data = self._buf.get_all()
                    for frame in self._parse_cluster_data(data, pos3, pos3 + size):
                        yield frame
                    self._buf.consume(elem_end)
            else:
                # Skip non-Cluster top-level elements
                if size == UNKNOWN_SIZE:
                    break
                elem_end = pos3 + size
                if elem_end > len(data):
                    # Need to skip bytes we don't have yet
                    self._buf.consume(len(data))
                    skip_remaining = elem_end - len(data)
                    await self._skip_bytes(source, skip_remaining)
                else:
                    self._buf.consume(elem_end)

    def _parse_info_element(self, data: bytes, start: int, end: int, header: MKVHeader) -> None:
        """Parse Info element children for timestamp scale and duration."""
        for eid, off, size, _ in iter_elements(data, start, end):
            if eid == TIMESTAMP_SCALE:
                header.timestamp_scale_ns = read_uint(data, off, size)
            elif eid == DURATION:
                # Duration is stored in timestamp ticks; convert to ms using
                # the scale parsed so far (TimestampScale precedes Duration
                # in the files this handles -- order per iter_elements).
                scale = header.timestamp_scale_ns / 1_000_000.0
                header.duration_ms = read_float(data, off, size) * scale

    def _parse_cluster_data(self, data: bytes, start: int, end: int) -> list[MKVFrame]:
        """Parse a known-size Cluster and return its frames."""
        cluster_timecode = 0
        frames = []
        for eid, data_off, size, _ in iter_elements(data, start, end):
            if eid == CLUSTER_TIMESTAMP:
                cluster_timecode = read_uint(data, data_off, size)
            elif eid == SIMPLE_BLOCK:
                for track_num, rel_tc, flags, frame_list in extract_block_frames(data, data_off, size):
                    # Bit 0x80 of the SimpleBlock flags marks a keyframe.
                    is_kf = bool(flags & 0x80)
                    abs_ts_ms = (cluster_timecode + rel_tc) * self._scale_ms
                    for frame_data in frame_list:
                        frames.append(
                            MKVFrame(
                                track_number=track_num,
                                timestamp_ms=abs_ts_ms,
                                is_keyframe=is_kf,
                                data=frame_data,
                            )
                        )
            elif eid == BLOCK_GROUP:
                _parse_block_group(data, data_off, data_off + size, cluster_timecode, self._scale_ms, frames)
        return frames

    async def _parse_unknown_size_cluster(self, source: AsyncIterator[bytes]) -> AsyncIterator[MKVFrame]:
        """Parse an unknown-size Cluster by reading children until next Cluster."""
        cluster_timecode = 0
        while True:
            if not await self._ensure_bytes_soft(source, self._MIN_ELEMENT_HEADER):
                break
            data = self._buf.get_all()
            pos = 0
            try:
                eid, pos2 = read_element_id(data, pos)
                size, pos3 = read_element_size(data, pos2)
            except (ValueError, IndexError):
                if not await self._ensure_bytes_soft(source, len(data) + 4096):
                    break
                data = self._buf.get_all()
                try:
                    eid, pos2 = read_element_id(data, pos)
                    size, pos3 = read_element_size(data, pos2)
                except (ValueError, IndexError):
                    break
            # A new Cluster or top-level element signals end of current Cluster
            if eid == CLUSTER or eid == SEGMENT:
                break
            if size == UNKNOWN_SIZE:
                break
            elem_end = pos3 + size
            await self._ensure_bytes(source, elem_end)
            data = self._buf.get_all()
            if eid == CLUSTER_TIMESTAMP:
                cluster_timecode = read_uint(data, pos3, size)
            elif eid == SIMPLE_BLOCK:
                for track_num, rel_tc, flags, frame_list in extract_block_frames(data, pos3, size):
                    # Bit 0x80 of the SimpleBlock flags marks a keyframe.
                    is_kf = bool(flags & 0x80)
                    abs_ts_ms = (cluster_timecode + rel_tc) * self._scale_ms
                    for frame_data in frame_list:
                        yield MKVFrame(
                            track_number=track_num,
                            timestamp_ms=abs_ts_ms,
                            is_keyframe=is_kf,
                            data=frame_data,
                        )
            elif eid == BLOCK_GROUP:
                bg_frames = []
                _parse_block_group(data, pos3, pos3 + size, cluster_timecode, self._scale_ms, bg_frames)
                for frame in bg_frames:
                    yield frame
            self._buf.consume(elem_end)

    async def _ensure_bytes(self, source: AsyncIterator[bytes], needed: int) -> None:
        """Fill the buffer until it holds at least 'needed' bytes.

        Returns silently (does NOT raise) if the source is exhausted first;
        callers must re-check buffer availability after the call.
        """
        while self._buf.available < needed:
            try:
                chunk = await source.__anext__()
                self._buf.append(chunk)
            except StopAsyncIteration:
                return

    async def _ensure_bytes_soft(self, source: AsyncIterator[bytes], needed: int) -> bool:
        """Fill the buffer to 'needed' bytes if possible.

        Returns True when the target is reached; on exhaustion (or an empty
        chunk) returns True if any bytes remain buffered, else False.
        """
        while self._buf.available < needed:
            try:
                chunk = await source.__anext__()
                if not chunk:
                    return self._buf.available > 0
                self._buf.append(chunk)
            except StopAsyncIteration:
                return self._buf.available > 0
        return True

    async def _skip_bytes(self, source: AsyncIterator[bytes], count: int) -> None:
        """Skip count bytes from the source without buffering."""
        remaining = count
        while remaining > 0:
            try:
                chunk = await source.__anext__()
                if len(chunk) <= remaining:
                    remaining -= len(chunk)
                else:
                    # Put the excess back
                    self._buf.append(chunk[remaining:])
                    remaining = 0
            except StopAsyncIteration:
                break

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,834 @@
"""
MP4 container parser for moov atom probing.
Provides:
- MP4Index: seek index extracted from MP4 moov atom (parallel to MKVCueIndex)
- Top-level atom scanning
- Sample table parsers (stco, co64, stss, stsz, stts, stsc)
- Moov-to-cue-point builder
- rewrite_moov_offsets: adjust stco/co64 in moov for file rearrangement
The parsers are the inverse of the builder functions in mp4_muxer.py.
Box navigation reuses the pattern from ts_muxer.py's read_box/find_box/iter_boxes.
"""
import bisect
import logging
import struct
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
# =============================================================================
# MP4 Box Utilities
# =============================================================================
# Minimum bytes needed to read a standard box header
_BOX_HEADER_SIZE = 8
# ftyp brands that identify MP4/MOV containers
_MP4_BRANDS = {
b"isom",
b"iso2",
b"iso3",
b"iso4",
b"iso5",
b"iso6",
b"mp41",
b"mp42",
b"M4V ",
b"M4A ",
b"f4v ",
b"kddi",
b"avc1",
b"qt ",
b"MSNV",
b"dash",
b"3gp4",
b"3gp5",
b"3gp6",
}
def is_mp4_header(data: bytes) -> bool:
    """
    Check if the data starts with an ftyp box (MP4 signature).

    Args:
        data: Leading bytes of a file (may be a truncated probe buffer).

    Returns:
        True when the data begins with a plausible ftyp box whose major
        brand (when readable) is a known MP4/MOV brand.
    """
    if len(data) < 8:
        return False
    size = struct.unpack_from(">I", data, 0)[0]
    if data[4:8] != b"ftyp":
        return False
    if size < 12:
        # A real ftyp always carries at least a 4-byte major brand.
        return False
    if len(data) >= 12:
        # Fix: validate the major brand whenever the bytes are present,
        # even if the declared box size exceeds the probe buffer.
        # (Previously a truncated read skipped the brand check entirely.)
        return data[8:12] in _MP4_BRANDS
    # Plausible ftyp, but truncated before the brand bytes.
    return True
def read_box_header(data: bytes, offset: int) -> tuple[bytes, int, int] | None:
"""
Read a box header at the given offset.
Returns:
(box_type, header_size, total_box_size) or None if not enough data.
"""
if offset + 8 > len(data):
return None
size, box_type = struct.unpack_from(">I4s", data, offset)
header_size = 8
if size == 1: # Extended size (64-bit)
if offset + 16 > len(data):
return None
size = struct.unpack_from(">Q", data, offset + 8)[0]
header_size = 16
elif size == 0: # Box extends to end of data
size = len(data) - offset
return box_type, header_size, size
def iter_top_level_boxes(data: bytes):
    """
    Iterate over top-level box headers in a buffer.

    Yields:
        (box_type, header_size, total_size, data_offset) where data_offset
        points at the first byte of the box body.
    """
    cursor = 0
    limit = len(data)
    while cursor < limit:
        header = read_box_header(data, cursor)
        if header is None:
            return
        box_type, header_size, box_size = header
        yield box_type, header_size, box_size, cursor + header_size
        if box_size == 0:
            # Zero-size box cannot be advanced past; stop iterating.
            return
        cursor += box_size
def find_box(data: bytes, target: bytes) -> bytes | None:
    """Find the first box of type *target* and return its body (data after header)."""
    for box_type, header_size, total_size, body_start in iter_top_level_boxes(data):
        if box_type != target:
            continue
        body_len = total_size - header_size
        return data[body_start : body_start + body_len]
    return None
def iter_boxes(data: bytes):
    """Iterate over child boxes: yields (box_type, box_body_bytes)."""
    for box_type, header_size, total_size, body_start in iter_top_level_boxes(data):
        body_end = body_start + (total_size - header_size)
        yield box_type, data[body_start:body_end]
# =============================================================================
# Sample Table Parsers (inverse of mp4_muxer.py builders)
# =============================================================================
def parse_full_box_header(data: bytes) -> tuple[int, int, int]:
    """
    Parse an ISO-BMFF full box header (version byte + 24-bit flags).

    Returns:
        (version, flags, header_size) where header_size is 4 bytes,
        or (0, 0, 0) when fewer than 4 bytes are available.
    """
    if len(data) < 4:
        return 0, 0, 0
    flags = int.from_bytes(data[1:4], "big")
    return data[0], flags, 4
def parse_stco(data: bytes) -> list[int]:
    """
    Parse Chunk Offset box (stco) - 32-bit offsets.
    Layout: version(1) + flags(3) + entry_count(4) + [offset(4)]...
    Returns an empty list when the box is too small or truncated.
    """
    if len(data) < 8:
        return []
    _, _, hdr = parse_full_box_header(data)
    count = struct.unpack_from(">I", data, hdr)[0]
    table_start = hdr + 4
    if len(data) < table_start + count * 4:
        return []
    # Decode the whole offset table in one struct call.
    return list(struct.unpack_from(f">{count}I", data, table_start))
def parse_co64(data: bytes) -> list[int]:
    """
    Parse Chunk Offset box (co64) - 64-bit offsets.
    Layout: version(1) + flags(3) + entry_count(4) + [offset(8)]...
    Returns an empty list when the box is too small or truncated.
    """
    if len(data) < 8:
        return []
    _, _, hdr = parse_full_box_header(data)
    count = struct.unpack_from(">I", data, hdr)[0]
    table_start = hdr + 4
    if len(data) < table_start + count * 8:
        return []
    # Decode the whole 64-bit offset table in one struct call.
    return list(struct.unpack_from(f">{count}Q", data, table_start))
def parse_stss(data: bytes) -> list[int]:
    """
    Parse Sync Sample box (stss) - keyframe sample indices (1-based).
    Layout: version(1) + flags(3) + entry_count(4) + [sample_number(4)]...
    Returns an empty list when the box is too small or truncated.
    """
    if len(data) < 8:
        return []
    _, _, hdr = parse_full_box_header(data)
    count = struct.unpack_from(">I", data, hdr)[0]
    table_start = hdr + 4
    if len(data) < table_start + count * 4:
        return []
    return list(struct.unpack_from(f">{count}I", data, table_start))
def parse_stsz(data: bytes) -> tuple[int, list[int]]:
    """
    Parse Sample Size box (stsz).
    Layout: version(1) + flags(3) + sample_size(4) + sample_count(4) + [size(4)]...

    Returns:
        (uniform_size, sizes_list).
        If uniform_size > 0, all samples have that size and sizes_list is empty.
        Otherwise, sizes_list contains per-sample sizes.
    """
    if len(data) < 12:
        return 0, []
    _, _, hdr = parse_full_box_header(data)
    uniform, count = struct.unpack_from(">II", data, hdr)
    if uniform > 0:
        # Constant-size samples: no per-sample table follows.
        return uniform, []
    table_start = hdr + 8
    if len(data) < table_start + count * 4:
        return 0, []
    return 0, list(struct.unpack_from(f">{count}I", data, table_start))
def parse_stts(data: bytes) -> list[tuple[int, int]]:
    """
    Parse Time-to-Sample box (stts) - run-length encoded durations.
    Layout: version(1) + flags(3) + entry_count(4) + [sample_count(4) + sample_delta(4)]...

    Returns:
        List of (sample_count, sample_delta) entries.
    """
    if len(data) < 8:
        return []
    _, _, hdr = parse_full_box_header(data)
    count = struct.unpack_from(">I", data, hdr)[0]
    table_start = hdr + 4
    if len(data) < table_start + count * 8:
        return []
    # Decode all pairs at once, then regroup into (count, delta) tuples.
    flat = struct.unpack_from(f">{count * 2}I", data, table_start)
    return [(flat[i], flat[i + 1]) for i in range(0, len(flat), 2)]
def parse_stsc(data: bytes) -> list[tuple[int, int, int]]:
    """
    Parse Sample-to-Chunk box (stsc).
    Layout: version(1) + flags(3) + entry_count(4) +
            [first_chunk(4) + samples_per_chunk(4) + sample_desc_index(4)]...

    Returns:
        List of (first_chunk, samples_per_chunk, sample_desc_index) entries.
        first_chunk is 1-based.
    """
    if len(data) < 8:
        return []
    _, _, hdr = parse_full_box_header(data)
    count = struct.unpack_from(">I", data, hdr)[0]
    table_start = hdr + 4
    if len(data) < table_start + count * 12:
        return []
    # Decode all triples at once, then regroup.
    flat = struct.unpack_from(f">{count * 3}I", data, table_start)
    return [(flat[i], flat[i + 1], flat[i + 2]) for i in range(0, len(flat), 3)]
def parse_mdhd(data: bytes) -> tuple[int, int]:
    """
    Parse Media Header box (mdhd) for timescale and duration.

    Handles both version 0 (32-bit times) and version 1 (64-bit times).

    Returns:
        (timescale, duration) in media timescale units, or (0, 0) when
        the box is too short.
    """
    if len(data) < 4:
        return 0, 0
    if data[0] == 1:
        # Version 1: version(1)+flags(3)+creation(8)+modification(8)
        # precede the timescale, so it sits at offset 20.
        if len(data) < 32:
            return 0, 0
        timescale, duration = struct.unpack_from(">IQ", data, 20)
        return timescale, duration
    # Version 0: version(1)+flags(3)+creation(4)+modification(4)
    if len(data) < 20:
        return 0, 0
    timescale, duration = struct.unpack_from(">II", data, 12)
    return timescale, duration
def parse_stsd_codec(data: bytes) -> str:
    """
    Parse Sample Description box (stsd) to extract the codec FourCC.

    Returns the first sample entry's type as a stripped ASCII string
    (e.g. "avc1", "hvc1", "mp4a"), or "" when unavailable.
    """
    # Need version(1)+flags(3)+entry_count(4) plus the first entry's
    # size(4)+type(4) header: 16 bytes minimum.
    if len(data) < 16:
        return ""
    fourcc = data[12:16]
    try:
        return fourcc.decode("ascii").strip()
    except (UnicodeDecodeError, ValueError):
        return ""
# =============================================================================
# MP4 Index (parallel to MKVCueIndex)
# =============================================================================
@dataclass
class MP4Index:
    """
    Seek index extracted from an MP4 file's moov atom.

    Parallel to ``MKVCueIndex`` for MKV files. Provides keyframe-indexed
    cue points for time-based seeking and the raw moov bytes needed to
    reconstruct a streamable (faststart) MP4 for on-the-fly demuxing.
    """

    duration_ms: float = 0.0
    timescale: int = 0
    cue_points: list[tuple[float, int]] = field(default_factory=list)  # [(time_ms, byte_offset), ...]
    moov_offset: int = 0  # Absolute file offset where moov atom starts
    moov_size: int = 0  # Total size of the moov atom (header + body)
    moov_data: bytes = b""  # Raw moov atom bytes (for prepending to mdat pipe)
    ftyp_data: bytes = b""  # Raw ftyp atom bytes (for prepending before moov)
    mdat_offset: int = 0  # Absolute file offset where mdat atom starts
    mdat_size: int = 0  # Total size of the mdat atom
    video_codec: str = ""  # e.g. "avc1", "hvc1", "mp4v"
    audio_codec: str = ""  # e.g. "mp4a", "ac-3"

    def byte_offset_for_time(self, time_ms: float) -> tuple[int, float]:
        """
        Find the byte offset of the nearest keyframe at or before time_ms.

        Returns:
            (absolute_byte_offset, actual_keyframe_time_ms); (0, 0.0) when
            no cue points exist. Times before the first cue clamp to it.
        """
        if not self.cue_points:
            return 0, 0.0
        keyframe_times = [cue_time for cue_time, _offset in self.cue_points]
        idx = max(bisect.bisect_right(keyframe_times, time_ms) - 1, 0)
        cue_time_ms, byte_offset = self.cue_points[idx]
        return byte_offset, cue_time_ms
# =============================================================================
# Moov -> Cue Points Builder
# =============================================================================
def _find_nested_box(data: bytes, *path: bytes) -> bytes | None:
    """
    Descend a box hierarchy by name: ``_find_nested_box(data, b"trak", b"mdia")``.

    Returns the body of the innermost box, or None if any step is missing.
    """
    node = data
    for box_name in path:
        node = find_box(node, box_name)
        if node is None:
            return None
    return node
def build_cue_points_from_moov(moov_body: bytes) -> tuple[list[tuple[float, int]], float, int, str, str]:
    """
    Parse a moov body to build keyframe-indexed cue points.

    Walks the first video trak's stbl to extract:
    - Chunk offsets (stco/co64)
    - Keyframe sample indices (stss)
    - Sample sizes (stsz)
    - Sample durations (stts)
    - Sample-to-chunk mapping (stsc)
    - Timescale and duration from mdhd

    Args:
        moov_body: Body bytes of the moov box (outer header already stripped).

    Returns:
        (cue_points, duration_ms, timescale, video_codec, audio_codec).
        cue_points is a list of (time_ms, absolute_byte_offset) pairs, one
        per video keyframe; empty when required tables are missing.
    """
    cue_points: list[tuple[float, int]] = []
    duration_ms = 0.0
    timescale = 0
    video_codec = ""
    audio_codec = ""
    # Find all traks
    video_stbl = None
    video_mdhd = None
    offset = 0
    data = moov_body
    while offset < len(data):
        result = read_box_header(data, offset)
        if result is None:
            break
        box_type, hdr_size, total_size = result
        if box_type == b"trak":
            trak_body = data[offset + hdr_size : offset + total_size]
            # Check handler type to identify video/audio
            hdlr_data = _find_nested_box(trak_body, b"mdia", b"hdlr")
            handler_type = b""
            if hdlr_data and len(hdlr_data) >= 12:
                # hdlr: version(1)+flags(3)+pre_defined(4)+handler_type(4)
                handler_type = hdlr_data[8:12]
            if handler_type == b"vide" and video_stbl is None:
                # Only the FIRST video trak is indexed; later ones ignored.
                video_stbl = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl")
                video_mdhd_data = _find_nested_box(trak_body, b"mdia", b"mdhd")
                if video_mdhd_data:
                    video_mdhd = video_mdhd_data
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if stsd_data:
                    video_codec = parse_stsd_codec(stsd_data)
            elif handler_type == b"soun" and not audio_codec:
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if stsd_data:
                    audio_codec = parse_stsd_codec(stsd_data)
        elif box_type == b"mvhd":
            # Fallback: parse mvhd for timescale/duration if no mdhd
            mvhd_body = data[offset + hdr_size : offset + total_size]
            if len(mvhd_body) >= 20:
                version = mvhd_body[0]
                if version == 1:
                    if len(mvhd_body) >= 28:
                        ts = struct.unpack_from(">I", mvhd_body, 20)[0]
                        dur = struct.unpack_from(">Q", mvhd_body, 24)[0]
                        if timescale == 0:
                            timescale = ts
                            duration_ms = dur / ts * 1000.0 if ts else 0.0
                else:
                    ts = struct.unpack_from(">I", mvhd_body, 12)[0]
                    dur = struct.unpack_from(">I", mvhd_body, 16)[0]
                    if timescale == 0:
                        timescale = ts
                        duration_ms = dur / ts * 1000.0 if ts else 0.0
        if total_size == 0:
            break
        offset += total_size
    # Parse mdhd for video timescale (more precise than mvhd)
    if video_mdhd:
        ts, dur = parse_mdhd(video_mdhd)
        if ts > 0:
            timescale = ts
            duration_ms = dur / ts * 1000.0
    if video_stbl is None:
        logger.warning("[mp4_parser] No video stbl found in moov")
        return cue_points, duration_ms, timescale, video_codec, audio_codec
    # Parse sample tables from video stbl
    stco_data = find_box(video_stbl, b"stco")
    co64_data = find_box(video_stbl, b"co64")
    stss_data = find_box(video_stbl, b"stss")
    stsz_data = find_box(video_stbl, b"stsz")
    stts_data = find_box(video_stbl, b"stts")
    stsc_data = find_box(video_stbl, b"stsc")
    # Chunk offsets (co64 takes priority over stco when both exist)
    chunk_offsets = parse_co64(co64_data) if co64_data else (parse_stco(stco_data) if stco_data else [])
    # Keyframe sample numbers (1-based)
    keyframe_samples = set(parse_stss(stss_data)) if stss_data else set()
    all_are_keyframes = not stss_data  # No stss means all samples are sync
    # Sample sizes
    uniform_size, size_list = parse_stsz(stsz_data) if stsz_data else (0, [])
    # Sample durations (run-length encoded)
    stts_entries = parse_stts(stts_data) if stts_data else []
    # Sample-to-chunk mapping
    stsc_entries = parse_stsc(stsc_data) if stsc_data else []
    if not chunk_offsets or timescale == 0:
        logger.warning(
            "[mp4_parser] Missing data: chunks=%d, timescale=%d",
            len(chunk_offsets),
            timescale,
        )
        return cue_points, duration_ms, timescale, video_codec, audio_codec
    # Expand stts to per-sample durations
    sample_durations: list[int] = []
    for count, delta in stts_entries:
        sample_durations.extend([delta] * count)
    # Expand stsc to determine which samples belong to which chunk
    # Build a mapping: chunk_index (0-based) -> samples_per_chunk
    total_chunks = len(chunk_offsets)
    chunk_sample_counts: list[int] = [0] * total_chunks
    if stsc_entries:
        for i, (first_chunk, spc, _sdi) in enumerate(stsc_entries):
            # first_chunk is 1-based; each entry applies until the next one.
            start = first_chunk - 1
            if i + 1 < len(stsc_entries):
                end = stsc_entries[i + 1][0] - 1
            else:
                end = total_chunks
            for c in range(start, end):
                if c < total_chunks:
                    chunk_sample_counts[c] = spc
    else:
        # Default: 1 sample per chunk
        chunk_sample_counts = [1] * total_chunks
    # Count total samples
    total_samples = sum(chunk_sample_counts)
    # Get per-sample sizes
    if uniform_size > 0:
        sample_sizes = [uniform_size] * total_samples
    else:
        sample_sizes = size_list
    # Build cumulative timestamp for each sample and map keyframes to byte offsets
    current_sample = 0  # 0-based sample index
    current_time = 0  # in timescale units
    for chunk_idx, chunk_offset in enumerate(chunk_offsets):
        spc = chunk_sample_counts[chunk_idx] if chunk_idx < len(chunk_sample_counts) else 1
        byte_pos = chunk_offset
        for s in range(spc):
            sample_num = current_sample + 1  # 1-based for stss comparison
            is_keyframe = all_are_keyframes or sample_num in keyframe_samples
            if is_keyframe:
                time_ms = current_time / timescale * 1000.0
                cue_points.append((time_ms, byte_pos))
            # Advance byte position by this sample's size
            if current_sample < len(sample_sizes):
                byte_pos += sample_sizes[current_sample]
            # Advance timestamp
            if current_sample < len(sample_durations):
                current_time += sample_durations[current_sample]
            current_sample += 1
    logger.info(
        "[mp4_parser] Built %d cue points from %d samples, duration=%.1fs, video=%s, audio=%s",
        len(cue_points),
        total_samples,
        duration_ms / 1000.0,
        video_codec,
        audio_codec,
    )
    return cue_points, duration_ms, timescale, video_codec, audio_codec
# =============================================================================
# Moov Offset Rewriting (for faststart pipe construction)
# =============================================================================
def _rewrite_stco_in_place(data: bytearray, box_start: int, box_size: int, delta: int) -> int:
"""Rewrite stco chunk offsets by adding delta. Returns number of entries fixed."""
# FullBox header: version(1) + flags(3) = 4 bytes
body_start = box_start + 4
if body_start + 4 > box_start + box_size:
return 0
entry_count = struct.unpack_from(">I", data, body_start)[0]
pos = body_start + 4
for _ in range(entry_count):
if pos + 4 > box_start + box_size:
break
old_val = struct.unpack_from(">I", data, pos)[0]
struct.pack_into(">I", data, pos, old_val + delta)
pos += 4
return entry_count
def _rewrite_co64_in_place(data: bytearray, box_start: int, box_size: int, delta: int) -> int:
"""Rewrite co64 chunk offsets by adding delta. Returns number of entries fixed."""
body_start = box_start + 4
if body_start + 4 > box_start + box_size:
return 0
entry_count = struct.unpack_from(">I", data, body_start)[0]
pos = body_start + 4
for _ in range(entry_count):
if pos + 8 > box_start + box_size:
break
old_val = struct.unpack_from(">Q", data, pos)[0]
struct.pack_into(">Q", data, pos, old_val + delta)
pos += 8
return entry_count
def _walk_and_rewrite(data: bytearray, start: int, end: int, delta: int) -> int:
    """
    Recursively walk boxes within [start, end) looking for stco/co64 boxes
    and rewriting their offsets.

    Args:
        data: Mutable buffer holding the box tree (typically a whole moov).
        start: Offset of the first child box header to inspect.
        end: Exclusive end of the region to walk.
        delta: Value added to every chunk offset found.

    Returns total number of offset entries rewritten.
    """
    total = 0
    offset = start
    while offset + 8 <= end:
        size = struct.unpack_from(">I", data, offset)[0]
        box_type = data[offset + 4 : offset + 8]
        hdr_size = 8
        if size == 1:
            # 64-bit largesize variant of the box header.
            if offset + 16 > end:
                break
            size = struct.unpack_from(">Q", data, offset + 8)[0]
            hdr_size = 16
        elif size == 0:
            # size 0 means "extends to the end of the enclosing region".
            size = end - offset
        if size < 8 or offset + size > end:
            # Malformed or truncated box: abandon this level.
            break
        body_start = offset + hdr_size
        body_end = offset + size
        if box_type == b"stco":
            total += _rewrite_stco_in_place(data, body_start, size - hdr_size, delta)
        elif box_type == b"co64":
            total += _rewrite_co64_in_place(data, body_start, size - hdr_size, delta)
        elif box_type in (b"moov", b"trak", b"mdia", b"minf", b"stbl"):
            # Container box -- recurse into children
            total += _walk_and_rewrite(data, body_start, body_end, delta)
        offset += size
    return total
def extract_video_track_from_moov(moov_data: bytes):
    """
    Extract video codec configuration from an MP4 moov atom.

    Walks the moov box tree to find the first video trak, extracts its
    resolution and codec-private data (avcC/hvcC), and returns a synthetic
    ``MKVTrack`` suitable for building an fMP4 init segment.

    Args:
        moov_data: Complete moov box bytes INCLUDING the outer header.

    Returns:
        An ``MKVTrack`` with video metadata, or ``None`` if no video track
        is found.
    """
    # Local import avoids a module-level cycle with the EBML parser package.
    from mediaflow_proxy.remuxer.ebml_parser import (
        CODEC_ID_H264,
        CODEC_ID_H265,
        MKVTrack,
    )

    # Strip the moov box header to get the body
    if len(moov_data) < 8:
        return None
    raw_size = struct.unpack_from(">I", moov_data, 0)[0]
    hdr_size = 16 if raw_size == 1 else 8
    moov_body = moov_data[hdr_size:]
    # Walk traks looking for video handler
    # NOTE(review): unlike build_cue_points_from_moov, there is no
    # "total_size == 0" guard here; a zero-size box would loop forever
    # if read_box_header ever reported one — confirm inputs are trusted.
    offset = 0
    while offset < len(moov_body):
        result = read_box_header(moov_body, offset)
        if result is None:
            break
        box_type, box_hdr_size, total_size = result
        if box_type == b"trak":
            trak_body = moov_body[offset + box_hdr_size : offset + total_size]
            # Check handler type
            hdlr_data = _find_nested_box(trak_body, b"mdia", b"hdlr")
            handler_type = b""
            if hdlr_data and len(hdlr_data) >= 12:
                handler_type = hdlr_data[8:12]
            if handler_type == b"vide":
                # Found video trak -- extract stsd for codec config
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if not stsd_data or len(stsd_data) < 16:
                    offset += total_size
                    continue
                codec_name = parse_stsd_codec(stsd_data)
                # Map MP4 codec names to MKV codec IDs
                if codec_name in ("avc1", "avc3"):
                    mkv_codec_id = CODEC_ID_H264
                elif codec_name in ("hvc1", "hev1"):
                    mkv_codec_id = CODEC_ID_H265
                else:
                    # Unknown FourCC: keep it visible in a synthetic ID.
                    mkv_codec_id = f"V_MP4/{codec_name}"
                # Extract codec private (avcC or hvcC box) from inside the
                # sample entry. The stsd structure is:
                #   version(1) + flags(3) + entry_count(4)
                #   then entry: size(4) + type(4) + ... + nested boxes
                # The avcC/hvcC is a child box of the sample entry.
                codec_private = b""
                width = 0
                height = 0
                # Parse sample entry to get width/height and codec config
                entry_start = 8  # skip version+flags+entry_count
                if entry_start + 8 <= len(stsd_data):
                    entry_size = struct.unpack_from(">I", stsd_data, entry_start)[0]
                    entry_body_start = entry_start + 8  # skip size+type
                    entry_end = min(entry_start + entry_size, len(stsd_data))
                    # Visual sample entry: 6 reserved + 2 data_ref_idx + ...
                    # At offset 24 from entry body start: width(2) + height(2)
                    vis_offset = entry_body_start + 24
                    if vis_offset + 4 <= entry_end:
                        width = struct.unpack_from(">H", stsd_data, vis_offset)[0]
                        height = struct.unpack_from(">H", stsd_data, vis_offset + 2)[0]
                    # Scan nested boxes for avcC or hvcC
                    # Visual sample entry fixed fields = 70 bytes from entry body
                    nested_start = entry_body_start + 70
                    if nested_start < entry_end:
                        nested_data = stsd_data[nested_start:entry_end]
                        for target in (b"avcC", b"hvcC"):
                            found = find_box(nested_data, target)
                            if found:
                                codec_private = found
                                break
                # Get duration from mdhd if available
                default_duration_ns = 0
                mdhd_data = _find_nested_box(trak_body, b"mdia", b"mdhd")
                if mdhd_data and len(mdhd_data) >= 20:
                    version = mdhd_data[0]
                    if version == 1 and len(mdhd_data) >= 28:
                        ts = struct.unpack_from(">I", mdhd_data, 20)[0]
                        dur = struct.unpack_from(">Q", mdhd_data, 24)[0]
                    else:
                        ts = struct.unpack_from(">I", mdhd_data, 12)[0]
                        dur = struct.unpack_from(">I", mdhd_data, 16)[0]
                    if ts > 0 and dur > 0:
                        # Rough estimate: assume 24fps if we can't determine.
                        # NOTE(review): ts/dur are read but never used to
                        # derive the real frame duration — possibly intended
                        # future work.
                        default_duration_ns = int(1_000_000_000 / 24)
                return MKVTrack(
                    track_number=1,
                    track_type=1,  # video
                    codec_id=mkv_codec_id,
                    codec_private=codec_private,
                    pixel_width=width,
                    pixel_height=height,
                    default_duration_ns=default_duration_ns,
                )
        offset += total_size
    return None
def rewrite_moov_offsets(moov_data: bytes, delta: int) -> bytes:
    """
    Rewrite all stco/co64 chunk offsets in a moov atom by adding ``delta``.

    Needed when rearranging an MP4 for pipe streaming: the original moov's
    chunk offsets point into the original file layout, so when moov is
    prepended before mdat every offset must be shifted by
    ``delta = moov_size - original_mdat_offset``.

    Args:
        moov_data: Raw bytes of the complete moov box (header + body).
        delta: Offset adjustment to add to every chunk offset.

    Returns:
        Modified moov bytes with updated chunk offsets.
    """
    buf = bytearray(moov_data)
    # Skip the moov box's own header (16 bytes for the 64-bit largesize form).
    declared_size = struct.unpack_from(">I", buf, 0)[0]
    header_size = 16 if declared_size == 1 else 8
    rewritten = _walk_and_rewrite(buf, header_size, len(buf), delta)
    logger.info("[mp4_parser] Rewrote %d chunk offset entries (delta=%+d)", rewritten, delta)
    return bytes(buf)

View File

@@ -0,0 +1,608 @@
"""
Universal PyAV-based streaming demuxer.
Bridges async byte streams to PyAV's synchronous I/O using an OS pipe,
allowing on-the-fly demuxing of any container format (MKV, MP4, TS,
FLV, WebM, etc.) from an async source.
Architecture:
AsyncIterator[bytes] --> async feeder task --> queue.Queue --> writer thread (pipe)
|
OS pipe (kernel buffer)
|
demux thread: av.open + discover + demux
|
queue.Queue --> run_in_executor consumer
Performance: Uses plain threading.Queue on both sides (writer input and
packet output) to avoid per-item ``run_coroutine_threadsafe`` overhead.
The async/thread bridge is done via ``run_in_executor`` on the consumer
side and a dedicated asyncio task on the producer side.
For MP4 inputs, the caller (transcode_handler) prepends the moov atom
to the stream so PyAV receives a "faststart"-style MP4 through the pipe.
This allows true on-the-fly demuxing for all container formats.
"""
import asyncio
import logging
import os
import queue
import threading
from collections.abc import AsyncIterator
from dataclasses import dataclass
import av
logger = logging.getLogger(__name__)
# Sentinel object to signal end-of-stream in queues
_SENTINEL = object()
@dataclass(slots=True)
class DemuxedStream:
    """Metadata about a demuxed stream, captured at container-open time."""

    index: int  # Stream index within the source container
    codec_name: str
    codec_type: str  # "video" or "audio"
    # Video-specific
    width: int = 0
    height: int = 0
    fps: float = 0.0
    pixel_format: str = ""
    # Audio-specific
    sample_rate: int = 0
    channels: int = 0
    # Timing: stream time_base as a rational (num/den); defaults to 1/1000 (ms)
    time_base_num: int = 1
    time_base_den: int = 1000
    duration_seconds: float = 0.0
    # Raw codec extradata (e.g. SPS/PPS for H.264, AudioSpecificConfig for AAC)
    extradata: bytes = b""
@dataclass(slots=True)
class DemuxedPacket:
"""A demuxed packet with timing info."""
stream_index: int
codec_type: str # "video" or "audio"
data: bytes
pts: int # Presentation timestamp in stream time_base units
dts: int # Decode timestamp in stream time_base units
duration: int # Duration in stream time_base units
is_keyframe: bool
time_base_num: int
time_base_den: int
# Optional decoded frame when decode_video/decode_audio is True
# av.VideoFrame for video, av.AudioFrame for audio
decoded_frame: object = None
@property
def pts_seconds(self) -> float:
if self.time_base_den == 0:
return 0.0
return self.pts * self.time_base_num / self.time_base_den
@property
def dts_seconds(self) -> float:
if self.time_base_den == 0:
return 0.0
return self.dts * self.time_base_num / self.time_base_den
@property
def duration_seconds(self) -> float:
if self.time_base_den == 0:
return 0.0
return self.duration * self.time_base_num / self.time_base_den
class PyAVDemuxer:
    """
    Streaming demuxer using PyAV with pipe-based I/O.

    All container I/O happens in background threads. The writer thread
    feeds source bytes into a pipe; a single demux thread opens the
    container, discovers streams, and demuxes packets -- all on the
    same file object, ensuring the pipe's read cursor is never lost.

    Performance optimisation: both the writer-input side and the
    packet-output side use plain ``queue.Queue`` (no event-loop
    involvement per item). The async/thread bridge is done via
    ``run_in_executor`` on the consumer and an asyncio task on the
    producer, eliminating ~1700 ``run_coroutine_threadsafe`` round-trips
    per 30 s of 4K content.

    Usage:
        demuxer = PyAVDemuxer()
        await demuxer.start(source_async_iter)
        # demuxer.video_stream / audio_stream are now available
        async for packet in demuxer.iter_packets():
            if packet.codec_type == "video":
                ...
    """
    def __init__(self, decode_video: bool = False, decode_audio: bool = False) -> None:
        """
        Args:
            decode_video: If True, the demux thread will decode video packets
                using the container's codec context and attach decoded frames
                to DemuxedPacket.decoded_frame. This avoids format conversion
                issues with standalone decoders (HVCC vs Annex B).
            decode_audio: If True, the demux thread will decode audio packets
                using the container's codec context and attach decoded frames
                to DemuxedPacket.decoded_frame. This is needed for codecs like
                Vorbis/Opus where the standalone decoder requires codec headers
                that are only available in the container context. Can also be
                set after start() returns (before packets are consumed) via
                the ``enable_audio_decode()`` method.
        """
        self._decode_video = decode_video
        self._decode_audio = decode_audio
        # Events gate the demux thread until the caller has decided whether
        # to decode each stream kind (see enable_video/audio_decode()).
        self._video_decode_decided = threading.Event()
        self._audio_decode_decided = threading.Event()
        # If decode flags were set at construction time, mark decided immediately
        if decode_video:
            self._video_decode_decided.set()
        if decode_audio:
            self._audio_decode_decided.set()
        # Populated by the demux thread during start()
        self._container: av.InputContainer | None = None
        self._video_stream: DemuxedStream | None = None
        self._audio_stream: DemuxedStream | None = None
        # Thread-safe queues (no event-loop involvement per put/get)
        self._packet_queue: queue.Queue | None = None
        self._source_queue: queue.Queue | None = None
        self._demux_thread: threading.Thread | None = None
        self._writer_thread: threading.Thread | None = None
        self._feeder_task: asyncio.Task | None = None
        # Pipe fds; ownership moves to the writer thread / demux thread.
        self._write_fd: int | None = None
        self._read_fd: int | None = None
    @property
    def video_stream(self) -> DemuxedStream | None:
        """Selected video stream metadata; None until start() completes or if absent."""
        return self._video_stream

    @property
    def audio_stream(self) -> DemuxedStream | None:
        """Selected audio stream metadata; None until start() completes or if absent."""
        return self._audio_stream
    def enable_video_decode(self, enable: bool = True) -> None:
        """
        Enable or disable in-thread video decoding.

        Call this after ``start()`` returns (stream metadata is available)
        but before consuming packets via ``iter_packets()``. The demux
        thread waits for this signal before processing video packets.
        """
        # Order matters: the flag must be set before the event is signalled,
        # since the demux thread reads the flag right after the event fires.
        self._decode_video = enable
        self._video_decode_decided.set()
    def enable_audio_decode(self, enable: bool = True) -> None:
        """
        Enable or disable in-thread audio decoding.

        Call this after ``start()`` returns (stream metadata is available)
        but before consuming packets via ``iter_packets()``. The demux
        thread waits for this signal before processing audio packets.
        """
        # Order matters: the flag must be set before the event is signalled,
        # since the demux thread reads the flag right after the event fires.
        self._decode_audio = enable
        self._audio_decode_decided.set()
# ── Writer side ──────────────────────────────────────────────────
async def _async_feeder(self, source: AsyncIterator[bytes]) -> None:
"""
Async task: pull chunks from the async source and push them
into a plain ``queue.Queue`` for the writer thread.
This replaces the old per-chunk ``run_coroutine_threadsafe``
pattern, batching the async-to-sync bridge into one task.
``queue.Queue.put()`` is a blocking call, so we use
``run_in_executor`` to avoid blocking the event loop when the
queue is full.
"""
loop = asyncio.get_running_loop()
sq = self._source_queue
try:
async for chunk in source:
await loop.run_in_executor(None, sq.put, chunk)
except (asyncio.CancelledError, GeneratorExit):
pass
except Exception:
pass
finally:
sq.put(_SENTINEL)
def _write_chunks_sync(self) -> None:
"""
Writer thread: pull pre-buffered chunks from ``_source_queue``
and write to the OS pipe. No event-loop interaction.
"""
write_fd = self._write_fd
sq = self._source_queue
try:
while True:
chunk = sq.get(timeout=30.0)
if chunk is _SENTINEL:
break
os.write(write_fd, chunk)
except Exception:
pass
finally:
try:
os.close(write_fd)
except OSError:
pass
self._write_fd = None
# ── Demux side ───────────────────────────────────────────────────
async def start(self, source: AsyncIterator[bytes]) -> None:
    """
    Start pipe-based streaming: writer thread feeds the pipe, a single
    demux thread opens the container, discovers streams, and begins
    enqueuing packets.

    After this returns, ``video_stream`` and ``audio_stream`` are
    populated and packets are being enqueued for ``iter_packets()``.

    Args:
        source: Async iterator yielding raw container bytes (any format
            FFmpeg can probe from a non-seekable pipe).
    """
    loop = asyncio.get_running_loop()
    # Create OS pipe -- write end is owned by the writer thread, read end
    # is handed to the demux thread via os.fdopen below.
    self._read_fd, self._write_fd = os.pipe()
    # Source buffer queue (async feeder task -> writer thread)
    self._source_queue = queue.Queue(maxsize=256)
    # Kick off the async feeder task
    self._feeder_task = asyncio.create_task(self._async_feeder(source))
    # Start writer thread (drains source_queue into the pipe)
    self._writer_thread = threading.Thread(
        target=self._write_chunks_sync,
        daemon=True,
        name="pyav-writer",
    )
    self._writer_thread.start()
    # Packet queue for demux-thread -> async consumer bridge
    self._packet_queue = queue.Queue(maxsize=128)
    streams_ready = threading.Event()

    def _open_and_demux() -> None:
        """
        Single background thread: open container, discover streams,
        demux all packets.

        Critical: av.open(), _discover_streams(), and container.demux()
        all happen on the same file object in the same thread. This
        ensures the pipe read cursor is never lost between open and demux.
        """
        pkt_count = 0
        pq = self._packet_queue
        try:
            # Open container from read end of pipe
            read_file = os.fdopen(self._read_fd, "rb")
            self._read_fd = None  # ownership transferred
            self._container = av.open(
                read_file,
                mode="r",
                options={
                    # Tolerate mid-stream joins / broken data in live TS
                    "err_detect": "ignore_err",
                    "fflags": "+discardcorrupt+genpts",
                },
            )
            self._discover_streams()
            # Signal stream metadata is available
            streams_ready.set()
            if self._video_stream is None and self._audio_stream is None:
                logger.warning("[pyav_demuxer] No video or audio streams found")
                return
            # Select streams to demux
            streams_to_demux = []
            if self._video_stream is not None:
                streams_to_demux.append(self._container.streams[self._video_stream.index])
            if self._audio_stream is not None:
                streams_to_demux.append(self._container.streams[self._audio_stream.index])
            # Wait for the caller to decide on video/audio decoding
            # (if not already decided at construction time).
            if not self._video_decode_decided.is_set():
                self._video_decode_decided.wait(timeout=10.0)
            if not self._audio_decode_decided.is_set():
                self._audio_decode_decided.wait(timeout=10.0)
            # Cache stream objects and time_base for the hot loop
            video_stream_obj = (
                self._container.streams[self._video_stream.index] if self._video_stream is not None else None
            )
            audio_stream_obj = (
                self._container.streams[self._audio_stream.index] if self._audio_stream is not None else None
            )
            video_tb_num = video_stream_obj.time_base.numerator if video_stream_obj else 1
            video_tb_den = video_stream_obj.time_base.denominator if video_stream_obj else 1
            audio_tb_num = audio_stream_obj.time_base.numerator if audio_stream_obj else 1
            audio_tb_den = audio_stream_obj.time_base.denominator if audio_stream_obj else 1
            decode_video = self._decode_video
            decode_audio = self._decode_audio
            # Demux and enqueue packets -- plain queue.put(), no event loop
            for packet in self._container.demux(*streams_to_demux):
                if packet.size == 0:
                    continue
                stream = self._container.streams[packet.stream_index]
                is_video = stream.type == "video"
                is_audio = stream.type == "audio"
                # Optionally decode video packets in-thread
                if decode_video and is_video and video_stream_obj is not None:
                    try:
                        frames = video_stream_obj.codec_context.decode(packet)
                    except Exception:
                        frames = []
                    for frame in frames:
                        # Decoded frames carry pts only; dts mirrors pts here.
                        pq.put(
                            DemuxedPacket(
                                stream_index=packet.stream_index,
                                codec_type="video",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=int(packet.duration) if packet.duration is not None else 0,
                                is_keyframe=frame.key_frame,
                                time_base_num=video_tb_num,
                                time_base_den=video_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1
                # Optionally decode audio packets in-thread
                elif decode_audio and is_audio and audio_stream_obj is not None:
                    try:
                        frames = audio_stream_obj.codec_context.decode(packet)
                    except Exception:
                        frames = []
                    for frame in frames:
                        pq.put(
                            DemuxedPacket(
                                stream_index=packet.stream_index,
                                codec_type="audio",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=int(packet.duration) if packet.duration is not None else 0,
                                is_keyframe=False,
                                time_base_num=audio_tb_num,
                                time_base_den=audio_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1
                else:
                    # Pass-through path: forward the compressed packet as-is.
                    tb_num = video_tb_num if is_video else audio_tb_num
                    tb_den = video_tb_den if is_video else audio_tb_den
                    pq.put(
                        DemuxedPacket(
                            stream_index=packet.stream_index,
                            codec_type=stream.type,
                            data=bytes(packet),
                            pts=int(packet.pts) if packet.pts is not None else 0,
                            dts=int(packet.dts) if packet.dts is not None else 0,
                            duration=int(packet.duration) if packet.duration is not None else 0,
                            is_keyframe=packet.is_keyframe,
                            time_base_num=tb_num,
                            time_base_den=tb_den,
                        )
                    )
                    pkt_count += 1
            # Flush the video decoder if we were decoding
            if decode_video and video_stream_obj is not None:
                try:
                    for frame in video_stream_obj.codec_context.decode(None):
                        pq.put(
                            DemuxedPacket(
                                stream_index=video_stream_obj.index,
                                codec_type="video",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=0,
                                is_keyframe=frame.key_frame,
                                time_base_num=video_tb_num,
                                time_base_den=video_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1
                except Exception:
                    pass
            # Flush the audio decoder if we were decoding
            if decode_audio and audio_stream_obj is not None:
                try:
                    for frame in audio_stream_obj.codec_context.decode(None):
                        pq.put(
                            DemuxedPacket(
                                stream_index=audio_stream_obj.index,
                                codec_type="audio",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=0,
                                is_keyframe=False,
                                time_base_num=audio_tb_num,
                                time_base_den=audio_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1
                except Exception:
                    pass
            logger.info("[pyav_demuxer] Demux complete: %d packets", pkt_count)
        except Exception as e:
            # "Invalid data" is expected noise on truncated/live inputs.
            if "Invalid data" not in str(e):
                logger.debug("[pyav_demuxer] Demux thread error: %s", e)
            # Ensure streams_ready is set even on error
            streams_ready.set()
        finally:
            pq.put(_SENTINEL)

    self._demux_thread = threading.Thread(target=_open_and_demux, daemon=True, name="pyav-demux")
    self._demux_thread.start()
    # Wait for stream discovery before returning.
    # Use run_in_executor to avoid blocking the event loop.
    await loop.run_in_executor(None, streams_ready.wait)
async def iter_packets(self) -> AsyncIterator[DemuxedPacket]:
    """
    Yield demuxed packets produced by the background demux thread.

    The blocking ``queue.get()`` runs on the default executor so the
    event loop is never stalled; one executor hop per packet replaces
    the heavier per-packet ``run_coroutine_threadsafe`` pattern.

    ``start()`` must be called first.
    """
    if self._packet_queue is None:
        raise RuntimeError("Call start() before iter_packets()")
    event_loop = asyncio.get_running_loop()
    get_next = self._packet_queue.get
    try:
        while (item := await event_loop.run_in_executor(None, get_next)) is not _SENTINEL:
            yield item
        # Normal end-of-stream: let the demux thread finish before cleanup.
        demux_thread = self._demux_thread
        if demux_thread is not None:
            demux_thread.join(timeout=5.0)
    except GeneratorExit:
        logger.debug("[pyav_demuxer] Generator closed")
    except asyncio.CancelledError:
        logger.debug("[pyav_demuxer] Cancelled")
    finally:
        self._cleanup()
def _discover_streams(self) -> None:
    """Record metadata for the first video and first audio stream found."""
    if self._container is None:
        return
    for stream in self._container.streams:
        if stream.type == "video" and self._video_stream is None:
            ctx = stream.codec_context
            frame_rate = float(stream.average_rate) if stream.average_rate else 24.0
            self._video_stream = DemuxedStream(
                index=stream.index,
                codec_name=ctx.name if ctx else stream.codec.name,
                codec_type="video",
                width=ctx.width if ctx else 0,
                height=ctx.height if ctx else 0,
                fps=frame_rate,
                pixel_format=str(ctx.pix_fmt) if ctx and ctx.pix_fmt else "yuv420p",
                time_base_num=stream.time_base.numerator,
                time_base_den=stream.time_base.denominator,
                duration_seconds=float(stream.duration * stream.time_base) if stream.duration else 0.0,
                extradata=bytes(ctx.extradata) if ctx and ctx.extradata else b"",
            )
            logger.info(
                "[pyav_demuxer] Video: %s %dx%d @%.1ffps",
                self._video_stream.codec_name,
                self._video_stream.width,
                self._video_stream.height,
                self._video_stream.fps,
            )
        elif stream.type == "audio" and self._audio_stream is None:
            ctx = stream.codec_context
            # NOTE(review): codec_context.channels is deprecated in newer
            # PyAV releases in favor of layout.nb_channels -- confirm the
            # pinned PyAV version before changing.
            self._audio_stream = DemuxedStream(
                index=stream.index,
                codec_name=ctx.name if ctx else stream.codec.name,
                codec_type="audio",
                sample_rate=ctx.sample_rate if ctx else 0,
                channels=ctx.channels if ctx else 0,
                time_base_num=stream.time_base.numerator,
                time_base_den=stream.time_base.denominator,
                duration_seconds=float(stream.duration * stream.time_base) if stream.duration else 0.0,
                extradata=bytes(ctx.extradata) if ctx and ctx.extradata else b"",
            )
            logger.info(
                "[pyav_demuxer] Audio: %s %dHz %dch",
                self._audio_stream.codec_name,
                self._audio_stream.sample_rate,
                self._audio_stream.channels,
            )
def _cleanup(self) -> None:
    """Stop threads and release all resources safely.

    The order is critical to avoid SIGSEGV from closing the container
    while the demux thread is still calling container.demux():

    1. Cancel the feeder task (stops new bytes being queued).
    2. Put a sentinel into the source queue so the writer thread
       unblocks and exits. The writer's ``finally`` closes the pipe
       write-end, which causes the demux thread to see EOF.
    3. Join the writer thread (wait for it to drain and exit).
    4. Join the demux thread (it finishes after pipe EOF).
    5. ONLY THEN close the container (no thread is using it).
    6. Close any remaining pipe FDs (read end, if still open).

    Idempotent: every step tolerates already-released state, so calling
    this twice (e.g. from both a cancelled consumer and normal teardown)
    is safe.
    """
    # 1. Cancel feeder task
    if self._feeder_task is not None:
        self._feeder_task.cancel()
        self._feeder_task = None
    # 2. Unblock writer thread so it exits and closes the pipe
    if self._source_queue is not None:
        try:
            # put_nowait: if the queue is full the writer is already awake
            # and draining, so a dropped sentinel is harmless here.
            self._source_queue.put_nowait(_SENTINEL)
        except Exception:
            pass
    # 3. Join writer thread (it closes _write_fd in its finally block)
    if self._writer_thread is not None:
        self._writer_thread.join(timeout=5.0)
        self._writer_thread = None
    # 4. Join demux thread -- must finish before we close the container
    if self._demux_thread is not None:
        self._demux_thread.join(timeout=5.0)
        self._demux_thread = None
    # 5. Now safe to close the container (no thread is using it)
    if self._container is not None:
        try:
            self._container.close()
        except Exception:
            pass
        self._container = None
    # 6. Close any remaining pipe FDs
    for fd_name in ("_read_fd", "_write_fd"):
        fd = getattr(self, fd_name, None)
        if fd is not None:
            try:
                os.close(fd)
            except OSError:
                pass
            setattr(self, fd_name, None)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,403 @@
"""
GPU-accelerated video transcoder with runtime detection.
Detects available hardware encoders/decoders at first use and selects
the best available backend:
- NVIDIA: h264_nvenc / hevc_cuvid (NVENC + CUDA)
- Apple macOS: h264_videotoolbox / hevc_videotoolbox
- Intel Linux: h264_vaapi / h264_qsv
- Fallback: libx264 (CPU)
The transcoder operates at the packet/frame level via PyAV, suitable
for integration into the streaming pipeline.
"""
import logging
from dataclasses import dataclass, field
from enum import Enum
from fractions import Fraction
import av
from mediaflow_proxy.configs import settings
logger = logging.getLogger(__name__)
class HWAccelType(Enum):
    """Hardware acceleration backend identifiers (values used in logs)."""

    NONE = "none"  # CPU-only fallback (libx264)
    NVIDIA = "nvidia"  # NVENC encode / CUVID decode
    VIDEOTOOLBOX = "videotoolbox"  # Apple VideoToolbox (macOS)
    VAAPI = "vaapi"  # VA-API (Linux)
    QSV = "qsv"  # Intel Quick Sync Video
@dataclass
class HWCapability:
    """Detected hardware acceleration capability."""

    # Selected backend; NONE means CPU libx264 encoding.
    accel_type: HWAccelType = HWAccelType.NONE
    # Encoder codec name to use for H.264 output.
    h264_encoder: str = "libx264"
    h264_decoder: str | None = None  # None = use default software decoder
    hevc_decoder: str | None = None  # None = use default software decoder
    # Every hardware encoder/decoder name that probed successfully (for logging).
    available_encoders: list[str] = field(default_factory=list)
    available_decoders: list[str] = field(default_factory=list)
# Module-level singleton -- populated on first call to get_hw_capability()
_hw_capability: HWCapability | None = None
def _probe_codec(name: str, mode: str = "w") -> bool:
    """
    Return True if PyAV exposes a codec with the given name.

    Args:
        name: Codec name (e.g. 'h264_videotoolbox').
        mode: 'w' for encoder, 'r' for decoder.
    """
    try:
        av.Codec(name, mode)
    except Exception:
        # Unknown codec, or a PyAV/FFmpeg build without this backend.
        return False
    return True
def _detect_hw_capability() -> HWCapability:
    """
    Probe the runtime environment for hardware encoder/decoder availability.

    Checks NVIDIA, Apple VideoToolbox, Intel VAAPI/QSV in priority order.
    Falls back to libx264 CPU encoding.
    """
    candidate_encoders = (
        "h264_nvenc",
        "hevc_nvenc",
        "h264_videotoolbox",
        "hevc_videotoolbox",
        "h264_vaapi",
        "hevc_vaapi",
        "h264_qsv",
        "hevc_qsv",
    )
    candidate_decoders = (
        "h264_cuvid",
        "hevc_cuvid",
        "h264_qsv",
        "hevc_qsv",
    )

    cap = HWCapability()
    cap.available_encoders = [c for c in candidate_encoders if _probe_codec(c, "w")]
    cap.available_decoders = [c for c in candidate_decoders if _probe_codec(c, "r")]
    encoders = cap.available_encoders
    decoders = cap.available_decoders

    if "h264_nvenc" in encoders:
        # Priority 1: NVIDIA NVENC (+CUVID decoders when present)
        cap.accel_type = HWAccelType.NVIDIA
        cap.h264_encoder = "h264_nvenc"
        cap.h264_decoder = "h264_cuvid" if "h264_cuvid" in decoders else None
        cap.hevc_decoder = "hevc_cuvid" if "hevc_cuvid" in decoders else None
    elif "h264_videotoolbox" in encoders:
        # Priority 2: Apple VideoToolbox
        # (its decoders are used automatically via hwaccel)
        cap.accel_type = HWAccelType.VIDEOTOOLBOX
        cap.h264_encoder = "h264_videotoolbox"
    elif "h264_vaapi" in encoders:
        # Priority 3: Intel VAAPI (Linux)
        cap.accel_type = HWAccelType.VAAPI
        cap.h264_encoder = "h264_vaapi"
    elif "h264_qsv" in encoders:
        # Priority 4: Intel QSV
        cap.accel_type = HWAccelType.QSV
        cap.h264_encoder = "h264_qsv"
        cap.h264_decoder = "h264_qsv" if "h264_qsv" in decoders else None
        cap.hevc_decoder = "hevc_qsv" if "hevc_qsv" in decoders else None
    else:
        # Fallback: CPU
        cap.accel_type = HWAccelType.NONE
        cap.h264_encoder = "libx264"
    return cap
def get_hw_capability() -> HWCapability:
    """Get the detected hardware acceleration capability (cached singleton)."""
    global _hw_capability
    if _hw_capability is not None:
        return _hw_capability
    cap = _detect_hw_capability()
    _hw_capability = cap
    # Log the outcome once, on first detection only.
    if settings.transcode_prefer_gpu and cap.accel_type != HWAccelType.NONE:
        logger.info(
            "[video_transcoder] GPU acceleration: %s (encoder=%s, decoders=%s)",
            cap.accel_type.value,
            cap.h264_encoder,
            cap.available_decoders or "software",
        )
    else:
        logger.info(
            "[video_transcoder] Using CPU encoder: %s (available HW: encoders=%s, decoders=%s)",
            cap.h264_encoder,
            cap.available_encoders or "none",
            cap.available_decoders or "none",
        )
    return cap
class VideoTranscoder:
    """
    In-process video transcoder using PyAV.

    Decodes input video packets and re-encodes to H.264 using the best
    available hardware encoder (or CPU libx264 fallback).

    Operates at the frame level: caller provides raw video packets (from
    PyAV demuxer), transcoder returns encoded H.264 NAL data suitable
    for the fMP4 muxer.
    """

    def __init__(
        self,
        input_codec_name: str,
        width: int,
        height: int,
        fps: float = 24.0,
        pixel_format: str = "yuv420p",
        force_software: bool = False,
    ) -> None:
        """
        Create decoder + encoder contexts for one video stream.

        Args:
            input_codec_name: FFmpeg codec name of the source (e.g. 'hevc').
            width: Source width in pixels (rounded up to even for H.264).
            height: Source height in pixels (rounded up to even for H.264).
            fps: Frame rate; drives the encoder time base and GOP size.
            pixel_format: Source pixel format hint (frames are reformatted
                to yuv420p before encoding regardless).
            force_software: If True, skip GPU backends even when available.

        Raises:
            Propagates whatever av.CodecContext.create()/open() raises for
            unknown or unavailable codec names.
        """
        # Fix: initialize teardown-state attributes FIRST so close()/__del__
        # are safe even if codec construction below raises. Previously these
        # were assigned last, so a failing __init__ left a partially built
        # instance whose __del__ hit AttributeError ("Exception ignored in
        # __del__"), masking the original error.
        self._flushed = False  # Prevents double-flush which causes SIGSEGV
        # Tracks whether the standalone decoder was actually used (via
        # decode_packet). When the demux thread decodes frames in-thread
        # (decode_video=True), the standalone decoder is never fed packets
        # and flushing it is wasted work.
        self._decoder_used = False
        self._encoder = None
        self._decoder = None
        self._frames_decoded = 0
        self._frames_encoded = 0

        hw = get_hw_capability()
        use_gpu = settings.transcode_prefer_gpu and hw.accel_type != HWAccelType.NONE and not force_software

        # --- Decoder ---
        hw_decoder = None
        if use_gpu:
            if "hevc" in input_codec_name or "h265" in input_codec_name:
                hw_decoder = hw.hevc_decoder
            else:
                hw_decoder = hw.h264_decoder
        decoder_name = hw_decoder or input_codec_name
        self._decoder = av.CodecContext.create(decoder_name, "r")

        # --- Encoder ---
        encoder_name = hw.h264_encoder if use_gpu else "libx264"
        # H.264 requires even dimensions
        enc_width = width if width % 2 == 0 else width + 1
        enc_height = height if height % 2 == 0 else height + 1
        self._encoder = av.CodecContext.create(encoder_name, "w")
        self._encoder.width = enc_width
        self._encoder.height = enc_height
        self._encoder.pix_fmt = "yuv420p"  # H.264 requires yuv420p
        self._encoder.time_base = Fraction(1, int(fps * 1000))
        self._encoder.framerate = Fraction(int(fps * 1000), 1000)
        self._encoder.bit_rate = _parse_bitrate(settings.transcode_video_bitrate)
        self._encoder.gop_size = int(fps * 2)  # Keyframe every ~2 seconds

        # Encoder options based on backend
        opts = {}
        if encoder_name == "libx264":
            opts["preset"] = settings.transcode_video_preset
            opts["tune"] = "zerolatency"
            opts["profile"] = "high"
        elif "nvenc" in encoder_name:
            opts["preset"] = "p4"  # NVENC preset (p1=fastest .. p7=slowest)
            opts["tune"] = "ll"  # Low latency
            opts["rc"] = "vbr"
        elif "videotoolbox" in encoder_name:
            opts["realtime"] = "1"
            opts["allow_sw"] = "1"  # Fallback to software if HW busy
        elif "vaapi" in encoder_name:
            opts["rc_mode"] = "VBR"
        elif "qsv" in encoder_name:
            opts["preset"] = "medium"
        self._encoder.options = opts
        self._encoder.open()

        self._input_codec = input_codec_name
        self._encoder_name = encoder_name
        self._width = enc_width
        self._height = enc_height
        logger.info(
            "[video_transcoder] Initialized: %s -> %s (%s), %dx%d @%.1ffps %dk",
            input_codec_name,
            encoder_name,
            hw.accel_type.value,
            enc_width,
            enc_height,
            fps,
            self._encoder.bit_rate // 1000 if self._encoder.bit_rate else 0,
        )

    @property
    def codec_private_data(self) -> bytes | None:
        """H.264 extradata (SPS/PPS) from the encoder, for the fMP4 init segment."""
        # Guarded against a closed transcoder (encoder already released).
        if self._encoder is not None and self._encoder.extradata:
            return bytes(self._encoder.extradata)
        return None

    @property
    def width(self) -> int:
        """Encoder output width (even; may be source width + 1)."""
        return self._width

    @property
    def height(self) -> int:
        """Encoder output height (even; may be source height + 1)."""
        return self._height

    def transcode_frame(self, frame: av.VideoFrame) -> list[tuple[bytes, bool, int, int]]:
        """
        Encode a decoded video frame to H.264.

        Args:
            frame: A decoded av.VideoFrame.

        Returns:
            List of (nal_data, is_keyframe, pts, dts) tuples. May be empty
            while the encoder is buffering.
        """
        self._frames_decoded += 1
        output = []
        # Ensure correct pixel format for encoder
        if frame.format.name != self._encoder.pix_fmt:
            frame = frame.reformat(format=self._encoder.pix_fmt)
        try:
            for packet in self._encoder.encode(frame):
                self._frames_encoded += 1
                output.append(
                    (
                        bytes(packet),
                        packet.is_keyframe,
                        int(packet.pts) if packet.pts is not None else 0,
                        int(packet.dts) if packet.dts is not None else 0,
                    )
                )
        except av.error.InvalidDataError as e:
            logger.debug("[video_transcoder] Encode error: %s", e)
        return output

    def decode_packet(self, packet: av.Packet) -> list[av.VideoFrame]:
        """Decode a video packet into frames (standalone-decoder path)."""
        self._decoder_used = True
        try:
            return list(self._decoder.decode(packet))
        except av.error.InvalidDataError as e:
            logger.debug("[video_transcoder] Decode error: %s", e)
            return []

    def flush(self) -> list[tuple[bytes, bool, int, int]]:
        """
        Flush encoder (and decoder, if it was used) buffers.

        When ``decode_video=True`` is used in PyAVDemuxer, the demux thread
        decodes frames using the container's codec context. In that case the
        standalone ``_decoder`` here is never fed any packets, so flushing
        it is skipped -- avoiding a stall that added ~5 s on some backends.

        Safe to call multiple times -- subsequent calls return an empty list.

        Returns:
            List of (nal_data, is_keyframe, pts, dts) tuples drained from
            the codec buffers.
        """
        if self._flushed:
            return []
        self._flushed = True
        output = []
        # Flush decoder only if it was actually used (via decode_packet)
        if self._decoder_used:
            try:
                for frame in self._decoder.decode(None):
                    self._frames_decoded += 1
                    if frame.format.name != self._encoder.pix_fmt:
                        frame = frame.reformat(format=self._encoder.pix_fmt)
                    for packet in self._encoder.encode(frame):
                        self._frames_encoded += 1
                        output.append(
                            (
                                bytes(packet),
                                packet.is_keyframe,
                                int(packet.pts) if packet.pts is not None else 0,
                                int(packet.dts) if packet.dts is not None else 0,
                            )
                        )
            except Exception as e:
                logger.debug("[video_transcoder] Decoder flush error: %s", e)
        else:
            logger.debug("[video_transcoder] Skipping decoder flush (decoder not used)")
        # Flush encoder
        try:
            for packet in self._encoder.encode(None):
                self._frames_encoded += 1
                output.append(
                    (
                        bytes(packet),
                        packet.is_keyframe,
                        int(packet.pts) if packet.pts is not None else 0,
                        int(packet.dts) if packet.dts is not None else 0,
                    )
                )
        except Exception as e:
            logger.debug("[video_transcoder] Encoder flush error: %s", e)
        logger.info(
            "[video_transcoder] Flushed: %d decoded, %d encoded total (decoder_used=%s)",
            self._frames_decoded,
            self._frames_encoded,
            self._decoder_used,
        )
        return output

    def close(self) -> None:
        """Release codec contexts.

        Flushes the encoder (if not already flushed) before releasing to avoid
        SIGSEGV when libx264 or hardware encoders have buffered frames at
        teardown time. Double-flushing is the most common cause of SIGSEGV
        in the transcode pipeline.

        PyAV codec contexts are released via garbage collection (no explicit
        close method), so we flush first to ensure native buffers are drained
        before the C-level codec is freed. Safe to call multiple times and on
        partially-constructed instances.
        """
        # flush() is idempotent -- safe to call even if already flushed.
        # getattr-guard: tolerate instances whose __init__ raised early.
        if getattr(self, "_encoder", None) is not None:
            self.flush()
        # Release references -- GC will free the native codec contexts
        self._encoder = None
        self._decoder = None

    def __del__(self) -> None:
        # Finalizers must never raise: an exception here is only printed as
        # "Exception ignored in __del__" and would mask the real error when
        # __init__ fails partway, or when logging breaks at interpreter
        # shutdown.
        try:
            self.close()
        except Exception:
            pass
def _parse_bitrate(bitrate_str: str) -> int:
"""Parse a bitrate string like '4M', '2000k', '5000000' to int bits/s."""
s = bitrate_str.strip().lower()
if s.endswith("m"):
return int(float(s[:-1]) * 1_000_000)
if s.endswith("k"):
return int(float(s[:-1]) * 1_000)
return int(s)