mirror of
https://github.com/UrloMythus/UnHided.git
synced 2026-06-10 09:10:23 +00:00
update
This commit is contained in:
@@ -0,0 +1,18 @@
|
||||
"""
|
||||
Media remuxer package.
|
||||
|
||||
Provides pure Python implementations for media container parsing, remuxing,
|
||||
and transcoding:
|
||||
|
||||
- ebml_parser: Minimal EBML/MKV parser for seeking and demuxing
|
||||
- ts_muxer: fMP4 -> MPEG-TS remuxer
|
||||
- mkv_demuxer: Streaming MKV demuxer
|
||||
- mp4_muxer: MP4 box builder for standard moov-first MP4
|
||||
- audio_transcoder: PyAV-based audio frame transcoding
|
||||
- video_transcoder: GPU-accelerated video transcoding via PyAV
|
||||
- pyav_demuxer: Universal PyAV-based streaming demuxer (any container)
|
||||
- codec_utils: Codec compatibility detection and decision engine
|
||||
- media_source: Abstract MediaSource protocol (Telegram, HTTP, etc.)
|
||||
- transcode_handler: Shared transcode request orchestrator
|
||||
- transcode_pipeline: MKV fast-path and universal transcode pipelines
|
||||
"""
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,351 @@
|
||||
"""
|
||||
PyAV-based audio transcoder for frame-level codec conversion.
|
||||
|
||||
Transcodes audio frames between codecs using PyAV's CodecContext API
|
||||
(Python bindings for FFmpeg's libavcodec). This provides in-process
|
||||
audio transcoding without subprocess management or pipe overhead.
|
||||
|
||||
Supported input codecs: EAC3, AC3, AAC, Opus, Vorbis, FLAC, MP3
|
||||
Output codec: AAC-LC (stereo, configurable bitrate)
|
||||
|
||||
Architecture:
|
||||
raw_frame_bytes -> parse() -> decode() -> resample() -> encode() -> raw_aac_bytes
|
||||
|
||||
Usage:
|
||||
transcoder = AudioTranscoder("eac3", input_sample_rate=48000, input_channels=6)
|
||||
for raw_eac3_frame in frames:
|
||||
aac_frames = transcoder.transcode(raw_eac3_frame)
|
||||
for aac_data in aac_frames:
|
||||
write(aac_data)
|
||||
# Flush remaining frames
|
||||
for aac_data in transcoder.flush():
|
||||
write(aac_data)
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
import av
|
||||
from av.audio.resampler import AudioResampler
|
||||
|
||||
from mediaflow_proxy.remuxer.ebml_parser import (
|
||||
CODEC_ID_AAC,
|
||||
CODEC_ID_AC3,
|
||||
CODEC_ID_EAC3,
|
||||
CODEC_ID_FLAC,
|
||||
CODEC_ID_OPUS,
|
||||
CODEC_ID_VORBIS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _generate_silence_aac_frame() -> bytes | None:
    """Pre-encode a single silent AAC frame (48 kHz stereo, 192 kbit/s).

    PyAV's AAC encoder has an intermittent ``avcodec_send_frame`` bug when
    rapidly creating/destroying codec contexts, so the whole sequence is
    retried up to ten times.

    This function is called once at module load; the result is cached in
    ``_SILENCE_AAC_FRAME``.

    Returns:
        The raw bytes of the first AAC packet the encoder emits, or ``None``
        when every attempt failed (the last error is logged).
    """
    last_error: Exception | None = None

    for _attempt in range(10):
        try:
            enc = av.CodecContext.create("aac", "w")
            enc.sample_rate = 48000
            enc.layout = "stereo"
            enc.format = av.AudioFormat("fltp")
            enc.bit_rate = 192000
            enc.open()

            frame = av.AudioFrame(
                format=enc.format.name,
                layout=enc.layout.name,
                samples=enc.frame_size or 1024,
            )
            # A freshly allocated frame is not guaranteed to hold zeroed
            # samples; overwrite every plane explicitly so the encoded frame
            # really is silence rather than whatever was in the buffer.
            for plane in frame.planes:
                plane.update(bytes(plane.buffer_size))
            frame.sample_rate = enc.sample_rate
            frame.pts = 0

            for pkt in enc.encode(frame):
                return bytes(pkt)
            # AAC priming delay: first encode buffered; flush to retrieve
            for pkt in enc.encode(None):
                return bytes(pkt)
        except Exception as exc:  # best-effort: retry, but remember why it failed
            last_error = exc
            continue

    logger.warning(
        "[audio_transcoder] Could not pre-encode a silent AAC frame: %s",
        last_error,
    )
    return None
|
||||
|
||||
|
||||
# Module-level silence frame -- generated once, reused by every transcoder.
|
||||
_SILENCE_AAC_FRAME: bytes | None = _generate_silence_aac_frame()
|
||||
|
||||
# Map MKV codec IDs to PyAV/FFmpeg codec names
|
||||
_MKV_TO_FFMPEG_CODEC = {
|
||||
CODEC_ID_EAC3: "eac3",
|
||||
CODEC_ID_AC3: "ac3",
|
||||
CODEC_ID_AAC: "aac",
|
||||
CODEC_ID_OPUS: "opus",
|
||||
CODEC_ID_VORBIS: "vorbis",
|
||||
CODEC_ID_FLAC: "flac",
|
||||
"A_DTS": "dts",
|
||||
"A_MP3": "mp3",
|
||||
"A_MPEG/L3": "mp3",
|
||||
}
|
||||
|
||||
# Codecs that need transcoding to AAC for browser playback
|
||||
NEEDS_TRANSCODE = frozenset(
|
||||
{
|
||||
CODEC_ID_EAC3,
|
||||
CODEC_ID_AC3,
|
||||
CODEC_ID_OPUS,
|
||||
CODEC_ID_VORBIS,
|
||||
CODEC_ID_FLAC,
|
||||
"A_DTS",
|
||||
"A_MP3",
|
||||
"A_MPEG/L3",
|
||||
}
|
||||
)
|
||||
|
||||
# Output AAC settings
|
||||
_OUTPUT_CODEC = "aac"
|
||||
_OUTPUT_SAMPLE_FORMAT = "fltp" # AAC requires float planar
|
||||
_OUTPUT_LAYOUT = "stereo"
|
||||
|
||||
# Map channel count -> FFmpeg layout name
|
||||
_CHANNEL_LAYOUT_MAP = {
|
||||
1: "mono",
|
||||
2: "stereo",
|
||||
3: "2.1",
|
||||
4: "quad",
|
||||
6: "5.1",
|
||||
8: "7.1",
|
||||
}
|
||||
|
||||
|
||||
def needs_transcode(codec_id: str) -> bool:
|
||||
"""Check if an MKV audio codec needs transcoding for browser playback."""
|
||||
return codec_id in NEEDS_TRANSCODE
|
||||
|
||||
|
||||
def get_ffmpeg_codec_name(mkv_codec_id: str) -> str | None:
|
||||
"""Map an MKV CodecID to an FFmpeg codec name."""
|
||||
return _MKV_TO_FFMPEG_CODEC.get(mkv_codec_id)
|
||||
|
||||
|
||||
class AudioTranscoder:
|
||||
"""
|
||||
In-process audio transcoder using PyAV's CodecContext API.
|
||||
|
||||
Decodes raw audio frames from one codec and encodes them to AAC-LC
|
||||
stereo, suitable for MP4 container and browser playback. No container
|
||||
I/O or subprocess involved -- operates directly on raw frame bytes.
|
||||
|
||||
The transcoder handles sample format conversion and resampling
|
||||
automatically via AudioResampler.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_codec: str,
|
||||
input_sample_rate: int = 48000,
|
||||
input_channels: int = 6,
|
||||
output_sample_rate: int = 48000,
|
||||
output_channels: int = 2,
|
||||
output_bitrate: int = 192000,
|
||||
) -> None:
|
||||
"""
|
||||
Initialize the transcoder.
|
||||
|
||||
Args:
|
||||
input_codec: FFmpeg codec name (e.g., "eac3", "ac3", "aac").
|
||||
input_sample_rate: Input sample rate in Hz.
|
||||
input_channels: Input channel count.
|
||||
output_sample_rate: Output sample rate in Hz (default 48000).
|
||||
output_channels: Output channel count (default 2 = stereo).
|
||||
output_bitrate: Output bitrate in bits/s (default 192000).
|
||||
"""
|
||||
# Set up decoder -- use layout to configure channel count
|
||||
# (PyAV's channels property is read-only; layout drives it)
|
||||
self._decoder = av.CodecContext.create(input_codec, "r")
|
||||
self._decoder.sample_rate = input_sample_rate
|
||||
input_layout = _CHANNEL_LAYOUT_MAP.get(input_channels, "stereo")
|
||||
self._decoder.layout = input_layout
|
||||
|
||||
# Set up encoder
|
||||
self._encoder = av.CodecContext.create(_OUTPUT_CODEC, "w")
|
||||
self._encoder.sample_rate = output_sample_rate
|
||||
self._encoder.layout = _OUTPUT_LAYOUT
|
||||
self._encoder.format = av.AudioFormat(_OUTPUT_SAMPLE_FORMAT)
|
||||
self._encoder.bit_rate = output_bitrate
|
||||
self._encoder.open()
|
||||
|
||||
# Set up resampler for format/rate/channel conversion
|
||||
self._resampler = AudioResampler(
|
||||
format=_OUTPUT_SAMPLE_FORMAT,
|
||||
layout=_OUTPUT_LAYOUT,
|
||||
rate=output_sample_rate,
|
||||
)
|
||||
|
||||
self._input_codec = input_codec
|
||||
self._frames_decoded = 0
|
||||
self._frames_encoded = 0
|
||||
self._audio_specific_config: bytes | None = None
|
||||
|
||||
logger.info(
|
||||
"[audio_transcoder] Initialized: %s %dHz %dch -> aac %dHz %dch @%dk",
|
||||
input_codec,
|
||||
input_sample_rate,
|
||||
input_channels,
|
||||
output_sample_rate,
|
||||
output_channels,
|
||||
output_bitrate // 1000,
|
||||
)
|
||||
|
||||
@property
|
||||
def audio_specific_config(self) -> bytes | None:
|
||||
"""
|
||||
AAC AudioSpecificConfig from the encoder (available after first encode).
|
||||
|
||||
This is needed for the MP4 esds box.
|
||||
"""
|
||||
if self._audio_specific_config is not None:
|
||||
return self._audio_specific_config
|
||||
|
||||
# PyAV exposes extradata after the encoder is opened
|
||||
if self._encoder.extradata:
|
||||
self._audio_specific_config = bytes(self._encoder.extradata)
|
||||
return self._audio_specific_config
|
||||
return None
|
||||
|
||||
@property
|
||||
def output_sample_rate(self) -> int:
|
||||
return self._encoder.sample_rate
|
||||
|
||||
@property
|
||||
def output_channels(self) -> int:
|
||||
return self._encoder.channels
|
||||
|
||||
@property
|
||||
def frame_size(self) -> int:
|
||||
"""AAC frame size (samples per frame), typically 1024."""
|
||||
return self._encoder.frame_size or 1024
|
||||
|
||||
def transcode(self, raw_frame_data: bytes) -> list[bytes]:
|
||||
"""
|
||||
Transcode a raw audio frame from the input codec to AAC.
|
||||
|
||||
Args:
|
||||
raw_frame_data: Raw audio frame bytes (one codec frame, e.g.,
|
||||
one EAC3 sync frame).
|
||||
|
||||
Returns:
|
||||
List of raw AAC frame bytes. May return 0, 1, or more frames
|
||||
depending on codec frame sizes and buffering.
|
||||
"""
|
||||
output = []
|
||||
|
||||
# Parse raw bytes into packets
|
||||
packets = self._decoder.parse(raw_frame_data)
|
||||
|
||||
for packet in packets:
|
||||
# Decode to PCM frames
|
||||
try:
|
||||
decoded_frames = self._decoder.decode(packet)
|
||||
except av.error.InvalidDataError as e:
|
||||
logger.debug("[audio_transcoder] Decode error (skipping frame): %s", e)
|
||||
continue
|
||||
|
||||
for frame in decoded_frames:
|
||||
self._frames_decoded += 1
|
||||
|
||||
# Resample to match encoder format
|
||||
resampled = self._resampler.resample(frame)
|
||||
if resampled is None:
|
||||
continue
|
||||
|
||||
# resampled can be a single frame or list of frames
|
||||
if not isinstance(resampled, list):
|
||||
resampled = [resampled]
|
||||
|
||||
for rs_frame in resampled:
|
||||
# Encode to AAC
|
||||
try:
|
||||
encoded_packets = self._encoder.encode(rs_frame)
|
||||
except av.error.InvalidDataError as e:
|
||||
logger.debug("[audio_transcoder] Encode error: %s", e)
|
||||
continue
|
||||
|
||||
for enc_packet in encoded_packets:
|
||||
self._frames_encoded += 1
|
||||
output.append(bytes(enc_packet))
|
||||
|
||||
return output
|
||||
|
||||
def flush(self) -> list[bytes]:
|
||||
"""
|
||||
Flush the decoder and encoder buffers.
|
||||
|
||||
Call this when the input stream ends to get remaining frames.
|
||||
|
||||
Returns:
|
||||
List of remaining raw AAC frame bytes.
|
||||
"""
|
||||
output = []
|
||||
|
||||
# Flush decoder
|
||||
try:
|
||||
for frame in self._decoder.decode(None):
|
||||
self._frames_decoded += 1
|
||||
resampled = self._resampler.resample(frame)
|
||||
if resampled is None:
|
||||
continue
|
||||
if not isinstance(resampled, list):
|
||||
resampled = [resampled]
|
||||
for rs_frame in resampled:
|
||||
for enc_packet in self._encoder.encode(rs_frame):
|
||||
self._frames_encoded += 1
|
||||
output.append(bytes(enc_packet))
|
||||
except Exception as e:
|
||||
logger.debug("[audio_transcoder] Decoder flush error: %s", e)
|
||||
|
||||
# Flush resampler
|
||||
try:
|
||||
resampled = self._resampler.resample(None)
|
||||
if resampled is not None:
|
||||
if not isinstance(resampled, list):
|
||||
resampled = [resampled]
|
||||
for rs_frame in resampled:
|
||||
for enc_packet in self._encoder.encode(rs_frame):
|
||||
self._frames_encoded += 1
|
||||
output.append(bytes(enc_packet))
|
||||
except Exception as e:
|
||||
logger.debug("[audio_transcoder] Resampler flush error: %s", e)
|
||||
|
||||
# Flush encoder
|
||||
try:
|
||||
for enc_packet in self._encoder.encode(None):
|
||||
self._frames_encoded += 1
|
||||
output.append(bytes(enc_packet))
|
||||
except Exception as e:
|
||||
logger.debug("[audio_transcoder] Encoder flush error: %s", e)
|
||||
|
||||
logger.info(
|
||||
"[audio_transcoder] Flushed: %d decoded, %d encoded total",
|
||||
self._frames_decoded,
|
||||
self._frames_encoded,
|
||||
)
|
||||
return output
|
||||
|
||||
def generate_silence_frame(self) -> bytes | None:
|
||||
"""Return a pre-encoded silent AAC frame (module-level singleton)."""
|
||||
return _SILENCE_AAC_FRAME
|
||||
|
||||
def close(self) -> None:
|
||||
"""Release codec contexts (best-effort; PyAV AudioCodecContext may not have close())."""
|
||||
for ctx in (self._decoder, self._encoder):
|
||||
try:
|
||||
if hasattr(ctx, "close"):
|
||||
ctx.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def __del__(self) -> None:
|
||||
self.close()
|
||||
@@ -0,0 +1,515 @@
|
||||
"""
|
||||
Codec decision engine for browser compatibility detection.
|
||||
|
||||
Determines whether video/audio streams need transcoding for browser
|
||||
playback and selects appropriate output codecs.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import struct
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
# Browser-compatible codecs (work natively in HTML5 <video>)
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
BROWSER_VIDEO_CODECS = frozenset(
|
||||
{
|
||||
"V_MPEG4/ISO/AVC", # H.264/AVC -- universal
|
||||
"h264",
|
||||
"avc1", # FFmpeg/PyAV names
|
||||
}
|
||||
)
|
||||
|
||||
BROWSER_AUDIO_CODECS = frozenset(
|
||||
{
|
||||
"A_AAC", # AAC-LC -- universal
|
||||
"A_AAC/MPEG2/LC",
|
||||
"A_AAC/MPEG4/LC",
|
||||
"aac", # FFmpeg/PyAV name
|
||||
}
|
||||
)
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
# Video codecs that need re-encoding to H.264
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
VIDEO_NEEDS_REENCODE = frozenset(
|
||||
{
|
||||
"V_MPEGH/ISO/HEVC", # H.265/HEVC (Chrome/Firefox don't support)
|
||||
"V_MPEG2", # MPEG-2 (DVD-era)
|
||||
"V_MPEG4/ISO/SP", # MPEG-4 Part 2 Simple Profile
|
||||
"V_MPEG4/ISO/ASP", # MPEG-4 Part 2 Advanced Simple (DivX/Xvid)
|
||||
"V_MPEG4/ISO/AP", # MPEG-4 Part 2 Advanced Profile
|
||||
"V_MPEG4/MS/V3", # MS MPEG-4 v3 (WMV)
|
||||
"V_MS/VFW/FOURCC", # Generic VFW (VC-1, etc.)
|
||||
"V_REAL/RV10",
|
||||
"V_REAL/RV20",
|
||||
"V_REAL/RV30",
|
||||
"V_REAL/RV40",
|
||||
"V_THEORA",
|
||||
"V_VP8",
|
||||
"V_VP9", # VP9 in MKV (needs WebM container for browser)
|
||||
"V_AV1", # AV1 (partial support, safer to reencode)
|
||||
# PyAV / FFmpeg codec names
|
||||
"hevc",
|
||||
"h265",
|
||||
"mpeg2video",
|
||||
"mpeg4",
|
||||
"vc1",
|
||||
"vp8",
|
||||
"vp9",
|
||||
"av1",
|
||||
"theora",
|
||||
"wmv3",
|
||||
"rv30",
|
||||
"rv40",
|
||||
}
|
||||
)
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
# Audio codecs that need transcoding to AAC
|
||||
# (superset of the list in audio_transcoder.py, uses both MKV and
|
||||
# PyAV codec names for universal lookup)
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
AUDIO_NEEDS_TRANSCODE = frozenset(
|
||||
{
|
||||
# MKV CodecIDs
|
||||
"A_EAC3",
|
||||
"A_AC3",
|
||||
"A_DTS",
|
||||
"A_DTS/EXPRESS",
|
||||
"A_DTS/LOSSLESS",
|
||||
"A_OPUS",
|
||||
"A_VORBIS",
|
||||
"A_FLAC",
|
||||
"A_TRUEHD",
|
||||
"A_MLP",
|
||||
"A_PCM/INT/LIT",
|
||||
"A_PCM/INT/BIG",
|
||||
"A_PCM/FLOAT/IEEE",
|
||||
"A_REAL/28_8",
|
||||
"A_REAL/COOK",
|
||||
"A_REAL/SIPR",
|
||||
"A_REAL/ATRC",
|
||||
"A_MS/ACM", # Generic Windows audio
|
||||
"A_MP3",
|
||||
"A_MPEG/L3",
|
||||
# PyAV / FFmpeg names
|
||||
"eac3",
|
||||
"ac3",
|
||||
"dts",
|
||||
"dca",
|
||||
"truehd",
|
||||
"mlp",
|
||||
"mp3",
|
||||
"opus",
|
||||
"vorbis",
|
||||
"flac",
|
||||
"pcm_s16le",
|
||||
"pcm_s24le",
|
||||
"pcm_f32le",
|
||||
"wmav2",
|
||||
"wmavoice",
|
||||
"wmapro",
|
||||
"cook",
|
||||
"sipr",
|
||||
"atrac3",
|
||||
}
|
||||
)
|
||||
|
||||
# Map PyAV codec names to MKV CodecIDs (for the MKV fast-path)
|
||||
_PYAV_TO_MKV_VIDEO = {
|
||||
"h264": "V_MPEG4/ISO/AVC",
|
||||
"hevc": "V_MPEGH/ISO/HEVC",
|
||||
"h265": "V_MPEGH/ISO/HEVC",
|
||||
"mpeg2video": "V_MPEG2",
|
||||
"vp8": "V_VP8",
|
||||
"vp9": "V_VP9",
|
||||
"av1": "V_AV1",
|
||||
}
|
||||
|
||||
_PYAV_TO_MKV_AUDIO = {
|
||||
"aac": "A_AAC",
|
||||
"eac3": "A_EAC3",
|
||||
"ac3": "A_AC3",
|
||||
"dts": "A_DTS",
|
||||
"opus": "A_OPUS",
|
||||
"vorbis": "A_VORBIS",
|
||||
"flac": "A_FLAC",
|
||||
"mp3": "A_MPEG/L3",
|
||||
"truehd": "A_TRUEHD",
|
||||
}
|
||||
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
# NAL unit format conversion (Annex B ↔ AVCC)
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
|
||||
# H.264 NAL types that belong in the init segment (avcC), not in samples
|
||||
_H264_PARAM_NAL_TYPES = frozenset({7, 8, 9}) # SPS, PPS, AUD
|
||||
|
||||
|
||||
def _find_annexb_nals(data: bytes) -> list[tuple[int, int]]:
|
||||
"""
|
||||
Find all NAL unit [start, end) byte ranges in Annex B formatted data.
|
||||
|
||||
Handles both 3-byte (00 00 01) and 4-byte (00 00 00 01) start codes.
|
||||
Returns a list of (start, end) tuples pointing into *data*.
|
||||
"""
|
||||
size = len(data)
|
||||
nals: list[tuple[int, int]] = []
|
||||
i = 0
|
||||
|
||||
while i < size - 2:
|
||||
# Scan for 0x000001 or 0x00000001
|
||||
if data[i] != 0:
|
||||
i += 1
|
||||
continue
|
||||
if data[i + 1] != 0:
|
||||
i += 2
|
||||
continue
|
||||
if data[i + 2] == 1:
|
||||
nal_start = i + 3
|
||||
elif data[i + 2] == 0 and i + 3 < size and data[i + 3] == 1:
|
||||
nal_start = i + 4
|
||||
else:
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Record end of previous NAL
|
||||
if nals:
|
||||
nals[-1] = (nals[-1][0], i)
|
||||
nals.append((nal_start, size))
|
||||
i = nal_start
|
||||
|
||||
return nals
|
||||
|
||||
|
||||
def is_annexb(data: bytes) -> bool:
    """
    Tell whether *data* begins with an Annex B start code.

    A buffer shorter than five bytes is never treated as Annex B. A leading
    ``00 00 00 01`` is always accepted. A leading ``00 00 01`` is ambiguous:
    it can also be the top three bytes of a 4-byte AVCC length prefix
    (lengths 256..511). In that case the AVCC reading wins when the length
    fits in the buffer and the following byte is a NAL header with the
    forbidden bit clear and a NAL type in 1..12.
    """
    if len(data) < 5:
        return False

    prefix = bytes(data[:4])
    if prefix == b"\x00\x00\x00\x01":
        return True
    if prefix[:3] != b"\x00\x00\x01":
        return False

    # Ambiguous 3-byte case: try the AVCC interpretation first.
    declared_len = int.from_bytes(prefix, "big")
    length_fits = 0 < declared_len <= len(data) - 4
    if length_fits:
        header = data[4]
        forbidden_bit_clear = (header & 0x80) == 0
        if forbidden_bit_clear and 1 <= (header & 0x1F) <= 12:
            return False  # plausible AVCC sample

    return True
|
||||
|
||||
|
||||
def annexb_to_avcc(data: bytes, filter_ps: bool = True) -> bytes:
    """
    Rewrite Annex B (start-code-prefixed) NAL units as AVCC
    (4-byte big-endian length-prefixed) NAL units for fMP4 samples.

    Args:
        data: H.264 access unit in Annex B format.
        filter_ps: When True, NAL types listed in ``_H264_PARAM_NAL_TYPES``
            (SPS/PPS/AUD) are dropped; they belong in the avcC box of the
            init segment, not in samples.

    Returns:
        The converted sample. *data* is returned untouched when it is empty,
        is not detected as Annex B, or contains no start code. The result is
        empty when every NAL was filtered out, so callers can drop the
        sample instead of writing Annex B bytes into an fMP4 sample.
    """
    if not data or not is_annexb(data):
        return data  # Already AVCC or empty

    ranges = _find_annexb_nals(data)
    if not ranges:
        return data

    pieces: list[bytes] = []
    for begin, stop in ranges:
        # Zero bytes at the tail are padding ahead of the next start code.
        payload = data[begin:stop].rstrip(b"\x00")
        if not payload:
            continue
        if filter_ps and (payload[0] & 0x1F) in _H264_PARAM_NAL_TYPES:
            continue
        pieces.append(len(payload).to_bytes(4, "big"))
        pieces.append(payload)

    return b"".join(pieces)
|
||||
|
||||
|
||||
# H.264 profiles that require the avcC High Profile extension fields
|
||||
# (chroma_format_idc, bit_depth_luma/chroma, numSpsExt).
|
||||
_HIGH_PROFILE_IDCS = frozenset({100, 110, 122, 244, 44, 83, 86, 118, 128, 138, 139, 134})
|
||||
|
||||
|
||||
def _fix_avcc_high_profile(avcc: bytes) -> bytes:
|
||||
"""
|
||||
Ensure an avcC record includes High Profile extension bytes.
|
||||
|
||||
The ISO/IEC 14496-15 spec requires additional fields after the PPS
|
||||
section when ``AVCProfileIndication`` is 100 (High), 110, 122, or 244.
|
||||
Some MKV muxers omit these, causing decoders to not know the chroma
|
||||
format or bit depth, which leads to widespread decode errors.
|
||||
|
||||
If the extensions are missing, appends the defaults for 4:2:0 / 8-bit
|
||||
with zero extended SPS sets.
|
||||
"""
|
||||
if len(avcc) < 7:
|
||||
return avcc
|
||||
if avcc[0] != 1:
|
||||
return avcc # Not an avcC record
|
||||
|
||||
profile_idc = avcc[1]
|
||||
if profile_idc not in _HIGH_PROFILE_IDCS:
|
||||
return avcc # Not a High Profile variant, no extensions needed
|
||||
|
||||
# Walk past SPS and PPS sections to find where extensions should be
|
||||
off = 5
|
||||
num_sps = avcc[off] & 0x1F
|
||||
off += 1
|
||||
for _ in range(num_sps):
|
||||
if off + 2 > len(avcc):
|
||||
return avcc
|
||||
sps_len = struct.unpack(">H", avcc[off : off + 2])[0]
|
||||
off += 2 + sps_len
|
||||
|
||||
if off >= len(avcc):
|
||||
return avcc
|
||||
num_pps = avcc[off]
|
||||
off += 1
|
||||
for _ in range(num_pps):
|
||||
if off + 2 > len(avcc):
|
||||
return avcc
|
||||
pps_len = struct.unpack(">H", avcc[off : off + 2])[0]
|
||||
off += 2 + pps_len
|
||||
|
||||
# If there are already bytes after the PPS section, extensions exist
|
||||
if off < len(avcc):
|
||||
return avcc
|
||||
|
||||
# Append default High Profile extensions:
|
||||
# chroma_format_idc = 1 (4:2:0) -> 0xFC | 0x01 = 0xFD (reserved 111111 + 01)
|
||||
# bit_depth_luma_minus8 = 0 -> 0xF8 | 0x00 = 0xF8 (reserved 11111 + 000)
|
||||
# bit_depth_chroma_minus8 = 0 -> 0xF8 | 0x00 = 0xF8 (reserved 11111 + 000)
|
||||
# numOfSequenceParameterSetExt = 0
|
||||
ext = bytearray(avcc)
|
||||
ext.append(0xFD) # 111111_01 : chroma_format_idc = 1
|
||||
ext.append(0xF8) # 11111_000 : bit_depth_luma_minus8 = 0
|
||||
ext.append(0xF8) # 11111_000 : bit_depth_chroma_minus8 = 0
|
||||
ext.append(0x00) # numOfSequenceParameterSetExt = 0
|
||||
return bytes(ext)
|
||||
|
||||
|
||||
def _collect_parameter_sets(data: bytes) -> tuple[list[bytes], list[bytes]]:
    """Return the (SPS list, PPS list) found in Annex B formatted *data*."""
    sps_list: list[bytes] = []
    pps_list: list[bytes] = []

    for start, end in _find_annexb_nals(data):
        # Strip trailing zero-padding that precedes the next start code
        nal = bytes(data[start:end]).rstrip(b"\x00")
        if not nal:
            continue
        nal_type = nal[0] & 0x1F
        if nal_type == 7:  # SPS
            sps_list.append(nal)
        elif nal_type == 8:  # PPS
            pps_list.append(nal)

    return sps_list, pps_list


def _build_avcc_record(sps_list: list[bytes], pps_list: list[bytes]) -> bytes:
    """Build avcC box content from parameter sets; empty bytes if no usable SPS."""
    if not sps_list or len(sps_list[0]) < 4:
        return b""  # Can't build avcC without a complete SPS header

    # Drop exact duplicates (order preserved) and respect the field widths:
    # numOfSPS is 5 bits, numOfPPS is 8 bits. Without the caps an oversized
    # list would corrupt the count byte or raise from bytearray.append().
    sps_list = list(dict.fromkeys(sps_list))[:31]
    pps_list = list(dict.fromkeys(pps_list))[:255]

    first_sps = sps_list[0]
    avcc = bytearray()
    avcc.append(1)  # configurationVersion
    avcc.append(first_sps[1])  # AVCProfileIndication
    avcc.append(first_sps[2])  # profile_compatibility
    avcc.append(first_sps[3])  # AVCLevelIndication
    avcc.append(0xFF)  # 6 bits reserved + lengthSizeMinusOne=3 -> 4-byte NAL lengths
    avcc.append(0xE0 | len(sps_list))  # 3 bits reserved + numOfSPS

    for s in sps_list:
        avcc.extend(struct.pack(">H", len(s)))
        avcc.extend(s)

    avcc.append(len(pps_list))  # numOfPPS
    for p in pps_list:
        avcc.extend(struct.pack(">H", len(p)))
        avcc.extend(p)

    return bytes(avcc)


def ensure_avcc_extradata(extradata: bytes) -> bytes:
    """
    Ensure h264 extradata is in avcC format for the fMP4 init segment.

    PyAV returns extradata in the container's native format:
    - MKV/MP4: avcC format (starts with 0x01)
    - MPEG-TS: Annex B format (starts with 0x00 0x00)

    If Annex B, parses SPS/PPS NAL units and builds proper avcC.
    If already avcC, validates and fixes High Profile extension fields.

    Returns:
        avcC bytes, or *extradata* unchanged when it is empty, shorter than
        four bytes, or contains no usable SPS.
    """
    if not extradata or len(extradata) < 4:
        return extradata

    # Already avcC format (configurationVersion == 1)
    if extradata[0] == 0x01:
        return _fix_avcc_high_profile(extradata)

    # Parse Annex B NAL units to extract SPS and PPS
    record = _build_avcc_record(*_collect_parameter_sets(extradata))
    if not record:
        return extradata
    return _fix_avcc_high_profile(record)


def extract_sps_pps_from_annexb(data: bytes) -> bytes:
    """
    Extract SPS and PPS NAL units from Annex B encoded data and build
    an avcC-format extradata blob.

    Hardware encoders like VideoToolbox embed SPS/PPS as in-band NAL
    units in their first keyframe output rather than setting extradata
    on the codec context. This function finds those parameter sets
    and returns proper avcC bytes suitable for the fMP4 init segment.
    The record goes through the same High Profile extension fix as
    ``ensure_avcc_extradata`` so both entry points produce the same layout.

    Returns:
        avcC bytes if SPS/PPS were found, empty bytes otherwise.
    """
    if not data or not is_annexb(data):
        return b""

    record = _build_avcc_record(*_collect_parameter_sets(data))
    if not record:
        return b""
    return _fix_avcc_high_profile(record)
|
||||
|
||||
|
||||
def video_needs_reencode(codec_id: str) -> bool:
    """Tell whether *codec_id* is listed in ``VIDEO_NEEDS_REENCODE``.

    An empty or missing codec id never requires re-encoding.
    """
    return bool(codec_id) and codec_id in VIDEO_NEEDS_REENCODE
|
||||
|
||||
|
||||
def audio_needs_transcode(codec_id: str) -> bool:
    """Tell whether *codec_id* is listed in ``AUDIO_NEEDS_TRANSCODE``.

    An empty or missing codec id never requires transcoding.
    """
    return bool(codec_id) and codec_id in AUDIO_NEEDS_TRANSCODE
|
||||
|
||||
|
||||
def is_browser_compatible(video_codec: str, audio_codec: str) -> bool:
    """
    Report whether a video+audio pair can be played by a browser as-is.

    Each codec passes when it is listed in the matching ``BROWSER_*_CODECS``
    set or when it is empty (no such stream). The result is True only when
    both the video and the audio codec pass.
    """
    checks = (
        (video_codec, BROWSER_VIDEO_CODECS),
        (audio_codec, BROWSER_AUDIO_CODECS),
    )
    # Build the full list first so both codecs are always evaluated.
    verdicts = [codec in allowed or not codec for codec, allowed in checks]
    return all(verdicts)
|
||||
|
||||
|
||||
class TranscodeDecision:
    """Outcome of checking a stream's video and audio codecs for compatibility."""

    __slots__ = ("transcode_video", "transcode_audio", "video_codec", "audio_codec")

    def __init__(self, video_codec: str = "", audio_codec: str = "") -> None:
        """Record both codec ids and whether each one needs converting."""
        self.video_codec = video_codec
        self.audio_codec = audio_codec
        self.transcode_video = video_needs_reencode(video_codec)
        self.transcode_audio = audio_needs_transcode(audio_codec)

    @property
    def needs_transcode(self) -> bool:
        """True if the video or the audio stream needs transcoding."""
        return self.transcode_video or self.transcode_audio

    @property
    def passthrough_ok(self) -> bool:
        """True if neither stream needs transcoding."""
        return not self.needs_transcode

    def __repr__(self) -> str:
        # One "<kind>:<codec>-><target>" label per stream that is converted.
        plan = (
            (self.transcode_video, "video", self.video_codec, "h264"),
            (self.transcode_audio, "audio", self.audio_codec, "aac"),
        )
        labels = [f"{kind}:{codec}->{target}" for wanted, kind, codec, target in plan if wanted]
        return f"TranscodeDecision({', '.join(labels or ['passthrough'])})"
|
||||
@@ -0,0 +1,614 @@
|
||||
"""
|
||||
Container format probing -- MKV Cues and MP4 moov.
|
||||
|
||||
Pure Python probing using EBML parsing (MKV) and struct-based atom
|
||||
scanning (MP4). No FFmpeg dependency.
|
||||
|
||||
Source-agnostic: accepts any MediaSource protocol implementation
|
||||
(Telegram, HTTP, etc.) for byte-range reads.
|
||||
|
||||
Provides:
|
||||
- probe_mkv_cues: probe MKV file to extract seek index (MKVCueIndex)
|
||||
- probe_mp4_moov: probe MP4 file to extract moov atom and build seek index (MP4Index)
|
||||
"""
|
||||
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import struct
|
||||
|
||||
from mediaflow_proxy.utils import redis_utils
|
||||
from mediaflow_proxy.remuxer.ebml_parser import (
|
||||
MKVCueIndex,
|
||||
build_cue_index,
|
||||
parse_ebml_header,
|
||||
parse_seek_head,
|
||||
CUES,
|
||||
INFO,
|
||||
)
|
||||
from mediaflow_proxy.remuxer.mp4_parser import (
|
||||
MP4Index,
|
||||
build_cue_points_from_moov,
|
||||
is_mp4_header,
|
||||
rewrite_moov_offsets,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# How much of the MKV header to fetch for SeekHead + Info parsing
|
||||
_HEADER_PROBE_SIZE = 64 * 1024 # 64 KB
|
||||
|
||||
# Max Cues element size we'll attempt to fetch
|
||||
_MAX_CUES_SIZE = 2 * 1024 * 1024 # 2 MB
|
||||
|
||||
# Redis cache for MKV Cue indexes
|
||||
_CUE_INDEX_CACHE_PREFIX = "mfp:cue_index:"
|
||||
_CUE_INDEX_CACHE_TTL = 3600 # 1 hour
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MKV Cues probing
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def derive_cue_cache_key(
|
||||
source_key: str = "",
|
||||
*,
|
||||
chat_id: str | int | None = None,
|
||||
message_id: int | None = None,
|
||||
file_id: str | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
Derive a deterministic cache key for a file's cue index.
|
||||
|
||||
Accepts either a pre-computed source_key (from MediaSource.cache_key)
|
||||
or legacy Telegram-style parameters for backwards compatibility.
|
||||
"""
|
||||
if source_key:
|
||||
return source_key
|
||||
if file_id:
|
||||
raw = f"file_id:{file_id}"
|
||||
elif chat_id is not None and message_id is not None:
|
||||
raw = f"chat:{chat_id}:msg:{message_id}"
|
||||
else:
|
||||
return ""
|
||||
return hashlib.sha256(raw.encode()).hexdigest()[:16]
|
||||
|
||||
|
||||
async def _get_cached_cue_index(cache_key: str) -> MKVCueIndex | None:
    """
    Try to load a MKVCueIndex from Redis cache.

    Args:
        cache_key: Key suffix; the Redis key is the cue-index prefix plus
            this value. An empty key disables the lookup.

    Returns:
        The reconstructed MKVCueIndex, or None when the key is empty, Redis
        is unavailable, nothing is stored, or the stored payload cannot be
        decoded.
    """
    if not cache_key:
        return None
    r = await redis_utils.get_redis()
    if r is None:
        return None
    redis_key = f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}"
    data = await r.get(redis_key)
    if not data:
        return None
    try:
        d = json.loads(data)
        # Binary fields are stored base64-encoded; an empty string means "absent".
        seek_header = b""
        if d.get("seek_header_b64"):
            seek_header = base64.b64decode(d["seek_header_b64"])
        video_codec_private = b""
        if d.get("video_codec_private_b64"):
            video_codec_private = base64.b64decode(d["video_codec_private_b64"])
        index = MKVCueIndex(
            duration_ms=d["duration_ms"],
            timestamp_scale=d["timestamp_scale"],
            # JSON stores tuples as lists; rebuild the (time, offset) pairs.
            cue_points=[(cp[0], cp[1]) for cp in d["cue_points"]],
            segment_data_offset=d["segment_data_offset"],
            first_cluster_offset=d.get("first_cluster_offset", 0),
            seek_header=seek_header,
            audio_codec_id=d.get("audio_codec_id", ""),
            audio_bitrate=d.get("audio_bitrate", 0),
            audio_channels=d.get("audio_channels", 0),
            audio_sample_rate=d.get("audio_sample_rate", 0.0),
            video_codec_id=d.get("video_codec_id", ""),
            video_codec_private=video_codec_private,
            video_width=d.get("video_width", 0),
            video_height=d.get("video_height", 0),
            video_fps=d.get("video_fps", 0.0),
            video_default_duration_ns=d.get("video_default_duration_ns", 0),
        )
        logger.debug("[container_probe] Loaded cue index from cache: %s", cache_key)
        return index
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and the binascii.Error
        # raised by b64decode; IndexError covers truncated cue-point entries.
        # A corrupt entry is treated as a cache miss instead of an error.
        logger.warning("[container_probe] Invalid cached cue index: %s", e)
        return None
|
||||
|
||||
|
||||
async def _set_cached_cue_index(cache_key: str, index: MKVCueIndex) -> None:
    """
    Store ``index`` in Redis as JSON under the cue-index prefix.

    Does nothing when ``cache_key`` is empty or Redis is unavailable. The
    entry expires after ``_CUE_INDEX_CACHE_TTL`` seconds. Binary fields are
    base64-encoded; empty ones are stored as ``""``.
    """
    if not cache_key:
        return
    client = await redis_utils.get_redis()
    if client is None:
        return

    def _b64(blob: bytes) -> str:
        # JSON cannot carry raw bytes.
        return base64.b64encode(blob).decode() if blob else ""

    payload = {
        "duration_ms": index.duration_ms,
        "timestamp_scale": index.timestamp_scale,
        "cue_points": index.cue_points,
        "segment_data_offset": index.segment_data_offset,
        "first_cluster_offset": index.first_cluster_offset,
        "seek_header_b64": _b64(index.seek_header),
        "audio_codec_id": index.audio_codec_id,
        "audio_bitrate": index.audio_bitrate,
        "audio_channels": index.audio_channels,
        "audio_sample_rate": index.audio_sample_rate,
        "video_codec_id": index.video_codec_id,
        "video_codec_private_b64": _b64(index.video_codec_private),
        "video_width": index.video_width,
        "video_height": index.video_height,
        "video_fps": index.video_fps,
        "video_default_duration_ns": index.video_default_duration_ns,
    }
    target_key = f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}"
    serialized = json.dumps(payload)
    await client.set(target_key, serialized, ex=_CUE_INDEX_CACHE_TTL)
    logger.debug("[container_probe] Cached cue index: %s", cache_key)
|
||||
|
||||
|
||||
async def probe_mkv_cues(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MKVCueIndex | None:
    """
    Probe an MKV file's EBML header and Cues to build a seek index.

    Pure Python -- parses EBML structures directly, no FFmpeg involved.

    Makes up to two small byte-range reads via the provided source:
      1. First ~64KB: EBML header + SeekHead + Info (skipped if header_data provided)
      2. Cues section: byte range from SeekHead's Cues position

    Args:
        source: A MediaSource protocol implementation, or any object with
            a ``stream(offset, limit)`` async generator method.
        file_size: Total file size in bytes. If 0, tries ``source.file_size``.
        cache_key: Optional cache key for Redis caching. If empty, tries
            ``source.cache_key``.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MKVCueIndex if successful, None if the file has no Cues or parsing fails.
    """
    # Resolve file_size and cache_key from source if not provided
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0)
    if not cache_key:
        cache_key = getattr(source, "cache_key", "")

    # Check cache first
    if cache_key:
        cached = await _get_cached_cue_index(cache_key)
        if cached:
            return cached

    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _HEADER_PROBE_SIZE
            # Collect chunks and join once: repeated ``bytes +=`` copies the
            # whole buffer for every chunk.
            header_data = b"".join([chunk async for chunk in source.stream(offset=0, limit=header_size)])

        if len(header_data) < 64:
            logger.warning("[container_probe] Header too small (%d bytes), cannot probe", len(header_data))
            return None

        # Step 2: Parse EBML header to find Segment data offset
        segment_data_offset = parse_ebml_header(header_data)

        # Step 3: Parse SeekHead to find Cues and Info positions
        seek_positions = parse_seek_head(header_data, segment_data_offset)

        if CUES not in seek_positions:
            logger.info("[container_probe] No Cues position in SeekHead, seeking not available")
            return None

        # SeekHead positions are relative to the start of the Segment data.
        cues_relative_offset = seek_positions[CUES]
        cues_absolute_offset = segment_data_offset + cues_relative_offset

        logger.info(
            "[container_probe] SeekHead: Cues at offset %d (absolute %d), Info at %s",
            cues_relative_offset,
            cues_absolute_offset,
            seek_positions.get(INFO, "not found"),
        )

        # Step 4: Fetch the Cues element (never more than _MAX_CUES_SIZE,
        # and never past the end of the file when its size is known)
        cues_max = file_size - cues_absolute_offset if file_size > 0 else _MAX_CUES_SIZE
        cues_fetch_size = min(_MAX_CUES_SIZE, cues_max)
        if cues_fetch_size <= 0:
            logger.warning("[container_probe] Cues offset %d beyond file size %d", cues_absolute_offset, file_size)
            return None

        cues_data = b"".join(
            [chunk async for chunk in source.stream(offset=cues_absolute_offset, limit=cues_fetch_size)]
        )

        if len(cues_data) < 16:
            logger.warning("[container_probe] Cues data too small (%d bytes)", len(cues_data))
            return None

        # Step 5: Build the cue index
        index = build_cue_index(
            header_data=header_data,
            cues_data=cues_data,
            cues_file_offset=cues_absolute_offset,
            segment_data_offset=segment_data_offset,
        )

    except Exception as e:
        logger.warning("[container_probe] Failed to probe MKV cues: %s", e)
        return None

    # Cache the result. Caching is best-effort: a failed cache write must not
    # throw away an index that was probed successfully.
    if cache_key:
        try:
            await _set_cached_cue_index(cache_key, index)
        except Exception as e:
            logger.warning("[container_probe] Failed to cache cue index: %s", e)

    return index
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MP4 Moov probing
|
||||
# =============================================================================
|
||||
|
||||
# Redis cache for MP4 indexes
|
||||
_MP4_INDEX_CACHE_PREFIX = "mfp:mp4_index:"
|
||||
_MP4_INDEX_CACHE_TTL = 3600 # 1 hour
|
||||
|
||||
# How much to read from the start for ftyp + initial atom scanning
|
||||
_MP4_HEADER_PROBE_SIZE = 64 * 1024 # 64 KB
|
||||
|
||||
# Max moov size we'll accept
|
||||
_MAX_MOOV_SIZE = 50 * 1024 * 1024 # 50 MB
|
||||
|
||||
# How much to read from the end of the file to find moov
|
||||
_MP4_TAIL_PROBE_SIZE = 512 * 1024 # 512 KB
|
||||
|
||||
|
||||
async def _get_cached_mp4_index(cache_key: str) -> MP4Index | None:
    """
    Try to load an MP4Index from Redis cache.

    The cached entry does not contain ``moov_data``; the returned index is
    built without it and the caller has to fetch the moov bytes again.

    Args:
        cache_key: Key suffix; the Redis key is the MP4-index prefix plus
            this value. An empty key disables the lookup.

    Returns:
        The reconstructed MP4Index, or None when the key is empty, Redis is
        unavailable, nothing is stored, or the stored payload cannot be
        decoded.
    """
    if not cache_key:
        return None
    r = await redis_utils.get_redis()
    if r is None:
        return None
    redis_key = f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}"
    data = await r.get(redis_key)
    if not data:
        return None
    try:
        d = json.loads(data)
        # ftyp bytes are stored base64-encoded; an empty string means "absent".
        ftyp_data = b""
        if d.get("ftyp_data_b64"):
            ftyp_data = base64.b64decode(d["ftyp_data_b64"])
        index = MP4Index(
            duration_ms=d["duration_ms"],
            timescale=d["timescale"],
            # JSON stores tuples as lists; rebuild the (time, offset) pairs.
            cue_points=[(cp[0], cp[1]) for cp in d["cue_points"]],
            moov_offset=d["moov_offset"],
            moov_size=d["moov_size"],
            ftyp_data=ftyp_data,
            mdat_offset=d["mdat_offset"],
            mdat_size=d["mdat_size"],
            video_codec=d.get("video_codec", ""),
            audio_codec=d.get("audio_codec", ""),
            # moov_data is NOT cached (too large), it will be re-fetched
        )
        logger.debug("[container_probe] Loaded MP4 index from cache: %s", cache_key)
        return index
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and the binascii.Error
        # raised by b64decode; IndexError covers truncated cue-point entries.
        # A corrupt entry is treated as a cache miss instead of an error.
        logger.warning("[container_probe] Invalid cached MP4 index: %s", e)
        return None
|
||||
|
||||
|
||||
async def _set_cached_mp4_index(cache_key: str, index: MP4Index) -> None:
    """
    Store ``index`` in Redis as JSON under the MP4-index prefix.

    ``moov_data`` is deliberately left out of the payload. Does nothing when
    ``cache_key`` is empty or Redis is unavailable. The entry expires after
    ``_MP4_INDEX_CACHE_TTL`` seconds.
    """
    if not cache_key:
        return
    client = await redis_utils.get_redis()
    if client is None:
        return

    # JSON cannot carry raw bytes, so ftyp is base64-encoded ("" when empty).
    ftyp_b64 = base64.b64encode(index.ftyp_data).decode() if index.ftyp_data else ""
    payload = {
        "duration_ms": index.duration_ms,
        "timescale": index.timescale,
        "cue_points": index.cue_points,
        "moov_offset": index.moov_offset,
        "moov_size": index.moov_size,
        "ftyp_data_b64": ftyp_b64,
        "mdat_offset": index.mdat_offset,
        "mdat_size": index.mdat_size,
        "video_codec": index.video_codec,
        "audio_codec": index.audio_codec,
    }
    target_key = f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}"
    serialized = json.dumps(payload)
    await client.set(target_key, serialized, ex=_MP4_INDEX_CACHE_TTL)
    logger.debug("[container_probe] Cached MP4 index: %s", cache_key)
|
||||
|
||||
|
||||
def _scan_top_level_atoms(data: bytes) -> list[tuple[bytes, int, int]]:
|
||||
"""
|
||||
Scan top-level atom headers from raw file bytes.
|
||||
|
||||
Returns:
|
||||
List of (box_type, absolute_offset, total_size) for each atom found.
|
||||
"""
|
||||
atoms = []
|
||||
offset = 0
|
||||
while offset + 8 <= len(data):
|
||||
size = struct.unpack_from(">I", data, offset)[0]
|
||||
box_type = data[offset + 4 : offset + 8]
|
||||
|
||||
if size == 1: # Extended size
|
||||
if offset + 16 > len(data):
|
||||
break
|
||||
size = struct.unpack_from(">Q", data, offset + 8)[0]
|
||||
elif size == 0:
|
||||
# Extends to end of file - we can't know the real size from
|
||||
# a partial read, but record what we have
|
||||
atoms.append((box_type, offset, 0))
|
||||
break
|
||||
|
||||
if size < 8:
|
||||
break
|
||||
|
||||
atoms.append((box_type, offset, size))
|
||||
offset += size
|
||||
|
||||
return atoms
|
||||
|
||||
|
||||
async def probe_mp4_moov(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MP4Index | None:
    """
    Probe an MP4 file's moov atom to build a seek index.

    Pure Python -- scans MP4 box headers with struct, no FFmpeg involved.

    Strategy:
      1. Read first ~64KB to check for ftyp (MP4 signature).
      2. Scan top-level atoms to find moov and mdat.
      3. If moov is at the start (faststart), read it from the header data.
      4. If moov is not in the header, read from the tail of the file.
      5. Parse moov sample tables to build cue points.

    On a cache hit only the moov bytes are fetched again (they are not
    cached); if that fetch comes back short the file is probed from scratch.

    Args:
        source: A MediaSource protocol implementation with stream(offset, limit).
        file_size: Total file size in bytes. If 0, tries ``source.file_size``.
        cache_key: Optional cache key for Redis caching. If empty, tries
            ``source.cache_key``.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MP4Index if successful, None if not an MP4 or parsing fails.
    """
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0)
    if not cache_key:
        cache_key = getattr(source, "cache_key", "")

    # Check cache first
    if cache_key:
        cached = await _get_cached_mp4_index(cache_key)
        # Re-fetch moov_data (not cached due to size) and rewrite offsets
        if cached and 0 < cached.moov_size <= _MAX_MOOV_SIZE:
            moov_data = b"".join(
                [chunk async for chunk in source.stream(offset=cached.moov_offset, limit=cached.moov_size)]
            )
            if len(moov_data) >= cached.moov_size:
                if cached.mdat_offset >= 0:
                    new_mdat_start = len(cached.ftyp_data) + cached.moov_size
                    offset_delta = new_mdat_start - cached.mdat_offset
                    if offset_delta != 0:
                        moov_data = rewrite_moov_offsets(moov_data, offset_delta)
                cached.moov_data = moov_data
                return cached
            # A truncated moov must not be handed out as a valid index;
            # fall through and probe the file again.
            logger.warning(
                "[container_probe] Incomplete cached moov: got %d of %d bytes, re-probing",
                len(moov_data),
                cached.moov_size,
            )

    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_MP4_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _MP4_HEADER_PROBE_SIZE
            # Collect chunks and join once: repeated ``bytes +=`` copies the
            # whole buffer for every chunk.
            header_data = b"".join([chunk async for chunk in source.stream(offset=0, limit=header_size)])

        if len(header_data) < 12:
            return None

        # Step 2: Check for ftyp
        if not is_mp4_header(header_data):
            return None

        logger.info("[container_probe] MP4 detected, scanning atoms (header=%d bytes)", len(header_data))

        # Step 3: Scan top-level atoms from header
        atoms = _scan_top_level_atoms(header_data)

        ftyp_offset = -1
        ftyp_size = 0
        moov_offset = -1
        moov_size = 0
        mdat_offset = -1
        mdat_size = 0

        for box_type, atom_offset, atom_size in atoms:
            if box_type == b"ftyp":
                ftyp_offset = atom_offset
                ftyp_size = atom_size
            elif box_type == b"moov":
                moov_offset = atom_offset
                moov_size = atom_size
            elif box_type == b"mdat":
                mdat_offset = atom_offset
                mdat_size = atom_size

        # Step 4: If moov not found in header, scan from tail
        if moov_offset < 0 and file_size > 0:
            tail_start = max(0, file_size - _MP4_TAIL_PROBE_SIZE)
            tail_data = b"".join(
                [chunk async for chunk in source.stream(offset=tail_start, limit=file_size - tail_start)]
            )

            if tail_data:
                tail_atoms = _scan_top_level_atoms(tail_data)
                for box_type, rel_offset, atom_size in tail_atoms:
                    abs_offset = tail_start + rel_offset
                    if box_type == b"moov":
                        moov_offset = abs_offset
                        moov_size = atom_size
                    elif box_type == b"mdat" and mdat_offset < 0:
                        mdat_offset = abs_offset
                        mdat_size = atom_size

                # If the initial scan yielded no moov (tail_start may land
                # inside a large mdat payload producing garbage atom headers),
                # resync by searching the tail for the b"moov" type tag and
                # validating the 4-byte size that precedes it.
                if moov_offset < 0:
                    needle = b"moov"
                    search_pos = 0
                    while True:
                        idx = tail_data.find(needle, search_pos)
                        if idx < 0:
                            break
                        search_pos = idx + 4
                        if idx < 4:
                            # No room for a size field before this match;
                            # keep looking instead of giving up.
                            continue
                        candidate_size = struct.unpack_from(">I", tail_data, idx - 4)[0]
                        candidate_offset = tail_start + idx - 4
                        # A real moov box cannot run past the end of the file.
                        if (
                            8 < candidate_size <= _MAX_MOOV_SIZE
                            and candidate_offset + candidate_size <= file_size
                        ):
                            moov_offset = candidate_offset
                            moov_size = candidate_size
                            break

        if moov_offset < 0:
            logger.info("[container_probe] No moov atom found in MP4")
            return None

        if moov_size <= 0 or moov_size > _MAX_MOOV_SIZE:
            logger.warning("[container_probe] moov size %d is invalid or too large", moov_size)
            return None

        logger.info(
            "[container_probe] MP4 atoms: moov at %d (%d bytes), mdat at %d (%d bytes)",
            moov_offset,
            moov_size,
            mdat_offset,
            mdat_size,
        )

        # Step 5: Fetch full moov atom
        # Check if moov is already contained in the header data we read
        if moov_offset + moov_size <= len(header_data):
            moov_data = header_data[moov_offset : moov_offset + moov_size]
        else:
            moov_data = b"".join([chunk async for chunk in source.stream(offset=moov_offset, limit=moov_size)])

        if len(moov_data) < moov_size:
            logger.warning(
                "[container_probe] Incomplete moov: got %d of %d bytes",
                len(moov_data),
                moov_size,
            )
            return None

        # Step 6: Parse moov body (skip box header)
        # Determine header size (16 bytes when the box uses an extended size)
        raw_size = struct.unpack_from(">I", moov_data, 0)[0]
        hdr_size = 16 if raw_size == 1 else 8
        moov_body = moov_data[hdr_size:]

        cue_points, duration_ms, timescale, video_codec, audio_codec = build_cue_points_from_moov(moov_body)

        # If mdat wasn't found via header scan, it's likely right after ftyp
        # or right after moov. Common layouts:
        #   ftyp + moov + mdat (faststart)  or  ftyp + mdat + moov
        if mdat_offset < 0:
            if moov_offset < file_size // 2:
                # moov is early -> mdat likely follows
                mdat_search_offset = moov_offset + moov_size
            else:
                # moov is late -> mdat likely right after the first atom.
                # Use a separate local so the ftyp_size found by the scan
                # (needed in step 7) is not overwritten.
                first_atom_size = struct.unpack_from(">I", header_data, 0)[0]
                if first_atom_size == 1:
                    first_atom_size = struct.unpack_from(">Q", header_data, 8)[0]
                mdat_search_offset = first_atom_size

            # Read a small amount to find the mdat header
            mdat_header = b"".join([chunk async for chunk in source.stream(offset=mdat_search_offset, limit=16)])
            if len(mdat_header) >= 8:
                box_type = mdat_header[4:8]
                if box_type == b"mdat":
                    mdat_offset = mdat_search_offset
                    raw_sz = struct.unpack_from(">I", mdat_header, 0)[0]
                    if raw_sz == 1 and len(mdat_header) >= 16:
                        mdat_size = struct.unpack_from(">Q", mdat_header, 8)[0]
                    else:
                        mdat_size = raw_sz

        # Step 7: Extract ftyp data (always in the header since it's the first atom)
        ftyp_data = b""
        if ftyp_offset >= 0 and ftyp_size > 0 and ftyp_offset + ftyp_size <= len(header_data):
            ftyp_data = header_data[ftyp_offset : ftyp_offset + ftyp_size]

        # Step 8: Rewrite moov chunk offsets for faststart pipe layout.
        # The pipe stream will be: ftyp + moov + mdat. The stco/co64
        # offsets in the original moov point to positions in the original
        # file. We need to shift them to account for the new layout.
        # New mdat position = ftyp_size + moov_size
        # Delta = new_mdat_position - original_mdat_offset
        if mdat_offset >= 0:
            new_mdat_start = len(ftyp_data) + moov_size
            offset_delta = new_mdat_start - mdat_offset
            if offset_delta != 0:
                moov_data = rewrite_moov_offsets(moov_data, offset_delta)

        index = MP4Index(
            duration_ms=duration_ms,
            timescale=timescale,
            cue_points=cue_points,
            moov_offset=moov_offset,
            moov_size=moov_size,
            moov_data=moov_data,
            ftyp_data=ftyp_data,
            mdat_offset=mdat_offset,
            mdat_size=mdat_size,
            video_codec=video_codec,
            audio_codec=audio_codec,
        )

        logger.info(
            "[container_probe] MP4 index: duration=%.1fs, %d cue points, video=%s, audio=%s",
            duration_ms / 1000.0,
            len(cue_points),
            video_codec,
            audio_codec,
        )

    except Exception as e:
        logger.warning("[container_probe] Failed to probe MP4 moov: %s", e)
        return None

    # Caching is best-effort: a failed cache write must not throw away an
    # index that was probed successfully.
    if cache_key:
        try:
            await _set_cached_mp4_index(cache_key, index)
        except Exception as e:
            logger.warning("[container_probe] Failed to cache MP4 index: %s", e)

    return index
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,151 @@
|
||||
"""
|
||||
HLS VOD playlist generator for on-the-fly fMP4 transcoding.
|
||||
|
||||
Produces an M3U8 VOD playlist from an ``MKVCueIndex`` or ``MP4Index``.
|
||||
Consecutive keyframes that are closer together than the target segment
|
||||
duration are merged into a single HLS segment, matching the behaviour
|
||||
of ``ffmpeg -hls_time``.
|
||||
|
||||
The init segment is referenced via ``#EXT-X-MAP``.
|
||||
|
||||
Requires ``#EXT-X-VERSION:7`` for fMP4 (CMAF) segments.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
|
||||
|
||||
def merge_cue_points(
    cue_points: list[tuple[float, int]],
    target_duration_ms: float = 5000.0,
) -> list[tuple[float, int]]:
    """Merge consecutive keyframes into segments of *>= target_duration_ms*.

    This replicates the logic of ``ffmpeg -hls_time``: a new segment
    boundary is created only when a keyframe is encountered **at least**
    ``target_duration_ms`` after the start of the current segment.
    Keyframes that fall within the target window are absorbed into the
    current segment.

    Side-effects:
      * Eliminates duplicate byte-offset entries (previously handled by
        ``deduplicate_cue_points``).
      * Eliminates very short "runt" segments (e.g. 0.3 s).

    Args:
        cue_points: ``(time_ms, byte_offset)`` list. The input is sorted
            here, so it does not have to arrive in order.
        target_duration_ms: Minimum segment duration in milliseconds.

    Returns:
        A reduced list of ``(time_ms, byte_offset)`` tuples representing
        the merged segment boundaries. Empty when ``cue_points`` is empty.
    """
    if not cue_points:
        return []

    # Normalize duplicate offsets first: keep the earliest timestamp for each
    # byte offset. Some MKV files expose multiple cue times for the same
    # cluster offset; if we keep a later duplicate, segment start times no
    # longer match the actual bytes and can produce timestamp regressions.
    # Sorting by (time, offset) ensures earliest time wins deterministically.
    deduped: list[tuple[float, int]] = []
    seen_offsets: set[int] = set()
    for time_ms, byte_offset in sorted(cue_points, key=lambda x: (x[0], x[1])):
        if byte_offset not in seen_offsets:
            seen_offsets.add(byte_offset)
            deduped.append((time_ms, byte_offset))

    # cue_points is non-empty here, so deduped has at least one entry and the
    # first cue always opens the first segment.
    merged: list[tuple[float, int]] = [deduped[0]]
    for cue in deduped[1:]:
        # Measured from the start of the segment currently being built.
        if cue[0] - merged[-1][0] >= target_duration_ms:
            merged.append(cue)
    return merged
|
||||
|
||||
|
||||
def generate_vod_playlist(
    cue_points: list[tuple[float, int]],
    duration_ms: float,
    init_url: str,
    segment_url_template: str,
    target_segment_duration_ms: float = 5000.0,
) -> str:
    """Build an HLS VOD M3U8 playlist from cue-point data.

    Consecutive keyframes that are closer than *target_segment_duration_ms*
    are merged into a single segment (matching ``ffmpeg -hls_time``).

    Segment URLs are produced by replacing the ``{seg}``, ``{start_ms}``
    and ``{end_ms}`` placeholders in the template with the segment's index
    and its time range in whole milliseconds.

    Args:
        cue_points: List of ``(time_ms, byte_offset)`` tuples.
        duration_ms: Total media duration in milliseconds; used as the end
            time of the last segment.
        init_url: URL for the fMP4 init segment (``#EXT-X-MAP`` URI).
        segment_url_template: URL template containing ``{seg}``,
            ``{start_ms}`` and ``{end_ms}`` placeholders.
        target_segment_duration_ms: Target minimum segment duration.

    Returns:
        Complete M3U8 playlist string (ending with a newline), or ``""``
        when ``cue_points`` is empty.
    """
    if not cue_points:
        return ""

    # Non-empty input always yields at least one merged boundary, so there
    # is always at least one segment below.
    merged = merge_cue_points(cue_points, target_segment_duration_ms)

    # Each segment runs from its own boundary to the next one; the last
    # segment runs to the end of the media.
    starts = [time_ms for time_ms, _ in merged]
    ends = starts[1:] + [duration_ms]
    # Per-segment (start_ms, end_ms, duration_s); the duration is floored at
    # 1 ms so EXTINF is never zero or negative.
    segments: list[tuple[float, float, float]] = [
        (start_ms, end_ms, max((end_ms - start_ms) / 1000.0, 0.001)) for start_ms, end_ms in zip(starts, ends)
    ]

    # TARGETDURATION is the longest segment rounded up, at least 1 second.
    target_duration = max(math.ceil(max(dur_s for _, _, dur_s in segments)), 1)

    lines: list[str] = [
        "#EXTM3U",
        "#EXT-X-VERSION:7",
        f"#EXT-X-TARGETDURATION:{target_duration}",
        "#EXT-X-PLAYLIST-TYPE:VOD",
        "#EXT-X-MEDIA-SEQUENCE:0",
        f'#EXT-X-MAP:URI="{init_url}"',
    ]

    for seg_num, (start_ms, end_ms, dur_s) in enumerate(segments):
        lines.append(f"#EXTINF:{dur_s:.3f},")
        lines.append(
            segment_url_template.replace("{seg}", str(seg_num))
            .replace("{start_ms}", str(int(start_ms)))
            .replace("{end_ms}", str(int(end_ms)))
        )

    lines.append("#EXT-X-ENDLIST")
    lines.append("")  # trailing newline

    return "\n".join(lines)
|
||||
@@ -0,0 +1,234 @@
|
||||
"""
|
||||
Abstract media source protocol for source-agnostic transcode pipeline.
|
||||
|
||||
Decouples the transcode pipeline, MKV cue probing, and seeking logic
|
||||
from any specific transport (Telegram, HTTP, etc.). Each transport
|
||||
implements the MediaSource protocol to provide byte-range streaming.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
from collections.abc import AsyncIterator
|
||||
from typing import Protocol, runtime_checkable
|
||||
from urllib.parse import urlparse, unquote
|
||||
|
||||
from mediaflow_proxy.utils.http_client import create_aiohttp_session
|
||||
from mediaflow_proxy.utils.telegram import telegram_manager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Extensions mapped to container format hints used by transcode_handler
|
||||
_MKV_EXTENSIONS = frozenset({".mkv", ".webm"})
|
||||
_MP4_EXTENSIONS = frozenset({".mp4", ".m4v", ".mov", ".m4a", ".3gp"})
|
||||
|
||||
|
||||
def _extract_extension(path: str) -> str:
|
||||
"""Extract lowercase file extension (e.g. '.mkv') from a path or URL."""
|
||||
# Strip query/fragment first for URL paths
|
||||
dot_pos = path.rfind(".")
|
||||
if dot_pos < 0:
|
||||
return ""
|
||||
ext = path[dot_pos:].lower()
|
||||
# Trim anything after the extension (query params from raw paths)
|
||||
for ch in ("?", "#", "&"):
|
||||
idx = ext.find(ch)
|
||||
if idx > 0:
|
||||
ext = ext[:idx]
|
||||
return ext
|
||||
|
||||
|
||||
def filename_hint_from_url(url: str) -> str:
    """
    Derive a filename hint (e.g. '.mkv', '.mp4') from a URL.

    Only the URL's path component is considered, after percent-decoding.
    Any error while parsing yields an empty hint.
    """
    try:
        url_path = urlparse(url).path
        hint = _extract_extension(unquote(url_path))
    except Exception:
        # Best-effort: an unparseable URL simply has no hint.
        hint = ""
    return hint
|
||||
|
||||
|
||||
def filename_hint_from_name(filename: str) -> str:
    """Return the extension hint for ``filename``, or '' when it is empty."""
    if not filename:
        return ""
    return _extract_extension(filename)
|
||||
|
||||
|
||||
@runtime_checkable
class MediaSource(Protocol):
    """
    Protocol for streaming media byte ranges.

    Implementations must provide:
      - stream(): async iterator of bytes from offset/limit
      - file_size: total file size in bytes
      - cache_key: deterministic key for caching (cue index, etc.)
      - filename_hint: optional file extension hint (e.g. '.mkv', '.mp4')

    The protocol is ``runtime_checkable``, so ``isinstance`` checks verify
    only that these members exist, not their signatures.
    """

    @property
    def file_size(self) -> int:
        """Total file size in bytes."""
        ...

    @property
    def cache_key(self) -> str:
        """Deterministic cache key derived from the source identity."""
        ...

    @property
    def filename_hint(self) -> str:
        """Optional file extension hint (e.g. '.mkv', '.mp4') for format detection."""
        ...

    # Declared with plain ``def``: implementations are async generators, and
    # calling one returns the AsyncIterator directly. An ``async def`` with
    # no ``yield`` would instead describe a coroutine that has to be awaited
    # to obtain the iterator, which async-generator implementations do not
    # satisfy under a type checker.
    def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
        """
        Stream bytes from the source.

        Args:
            offset: Byte offset to start from.
            limit: Number of bytes to read. None = read to end.

        Yields:
            Chunks of bytes.
        """
        ...
|
||||
|
||||
|
||||
class TelegramMediaSource:
    """
    MediaSource backed by Telegram MTProto downloads.

    Supports two download modes:

    * **parallel** (default): Uses ``ParallelTransferrer`` with multiple
      MTProtoSender connections for maximum throughput. Best for full-file
      streaming (e.g. ``/proxy/telegram/stream``).

    * **single** (``use_single_client=True``): Uses Telethon's built-in
      ``iter_download`` over the existing client connection. Avoids the
      overhead of creating/destroying extra connections for each request,
      ideal for small byte-range fetches like HLS segments and probe
      headers.
    """

    def __init__(
        self,
        telegram_ref,
        file_size: int,
        file_name: str = "",
        *,
        use_single_client: bool = False,
    ) -> None:
        """
        Args:
            telegram_ref: Reference to the Telegram media; this class reads
                its ``file_id``, ``chat_id`` and ``message_id`` attributes.
            file_size: Total file size in bytes.
            file_name: Original file name, used only to derive the
                extension hint.
            use_single_client: Select the single-connection download mode.
        """
        self._ref = telegram_ref
        self._file_size = file_size
        self._filename_hint = filename_hint_from_name(file_name)
        self._use_single_client = use_single_client

    @property
    def file_size(self) -> int:
        """Total file size in bytes, as given to the constructor."""
        return self._file_size

    @property
    def cache_key(self) -> str:
        """
        First 16 hex chars of the SHA-256 of the media identity.

        The identity is ``file_id`` when truthy, otherwise the
        ``chat_id``/``message_id`` pair; '' when neither is available.
        """
        ref = self._ref
        if ref.file_id:
            raw = f"file_id:{ref.file_id}"
        elif ref.chat_id is not None and ref.message_id is not None:
            raw = f"chat:{ref.chat_id}:msg:{ref.message_id}"
        else:
            return ""
        return hashlib.sha256(raw.encode()).hexdigest()[:16]

    @property
    def filename_hint(self) -> str:
        """Extension hint derived from the file name ('' if unknown)."""
        return self._filename_hint

    async def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
        """
        Yield chunks of the media starting at ``offset``.

        Args:
            offset: Byte offset to start from.
            limit: Number of bytes to read. None = read to end; 0 yields
                nothing.
        """
        if limit is None:
            # NOTE(review): the full file size is passed even when offset > 0;
            # presumably telegram_manager clamps to the end of the file -- confirm.
            effective_limit = self._file_size
        elif limit == 0:
            # An explicit zero-byte read must not fall back to "whole file"
            # (which the previous ``limit or file_size`` did).
            return
        else:
            effective_limit = limit

        # Both download modes take the same arguments; pick one and iterate.
        if self._use_single_client:
            download = telegram_manager.stream_media_single
        else:
            download = telegram_manager.stream_media

        async for chunk in download(
            self._ref,
            offset=offset,
            limit=effective_limit,
            file_size=self._file_size,
        ):
            yield chunk
|
||||
|
||||
|
||||
class HTTPMediaSource:
    """MediaSource backed by HTTP byte-range requests via aiohttp."""

    def __init__(self, url: str, headers: dict | None = None, file_size: int = 0) -> None:
        """
        Args:
            url: Source URL; also the basis of the cache key and the
                extension hint.
            headers: Extra request headers sent with every request.
            file_size: Known file size in bytes, or 0 to resolve it later
                via ``resolve_file_size()``.
        """
        self._url = url
        self._headers = headers or {}
        self._file_size = file_size
        self._filename_hint = filename_hint_from_url(url)

    @property
    def file_size(self) -> int:
        """Total file size in bytes (0 while unknown)."""
        return self._file_size

    @property
    def cache_key(self) -> str:
        """First 16 hex chars of the SHA-256 of the URL."""
        return hashlib.sha256(self._url.encode()).hexdigest()[:16]

    @property
    def filename_hint(self) -> str:
        """Extension hint derived from the URL path ('' if unknown)."""
        return self._filename_hint

    @staticmethod
    def _parse_size(value: str | None) -> int:
        """Parse a header value as a non-negative byte count; 0 if missing or invalid."""
        try:
            return max(int(value), 0)
        except (TypeError, ValueError):
            return 0

    async def resolve_file_size(self) -> int:
        """
        Perform a HEAD request to determine file size if not already known.

        Falls back to a one-byte ranged GET and reads the total from
        ``Content-Range`` when HEAD fails or gives no usable length.

        Returns:
            The file size in bytes, or 0 if it could not be determined.
        """
        if self._file_size > 0:
            return self._file_size

        async with create_aiohttp_session(self._url, headers=self._headers) as (session, proxy_url):
            size = 0
            async with session.head(
                self._url,
                headers=self._headers,
                proxy=proxy_url,
                allow_redirects=True,
            ) as resp:
                # Only trust Content-Length of a successful response: on an
                # error status (e.g. 405 for servers that reject HEAD) it
                # describes the error body, not the media file.
                if resp.status < 400:
                    size = self._parse_size(resp.headers.get("content-length"))

            if size <= 0:
                # Try GET with range to get content-range
                async with session.get(
                    self._url,
                    headers={**self._headers, "range": "bytes=0-0"},
                    proxy=proxy_url,
                    allow_redirects=True,
                ) as range_resp:
                    cr = range_resp.headers.get("content-range", "")
                    if "/" in cr:
                        # "bytes 0-0/<total>"; an unknown total ("*") parses to 0.
                        size = self._parse_size(cr.split("/")[-1])

            if size > 0:
                self._file_size = size
        return self._file_size

    async def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
        """
        Yield chunks of the resource starting at ``offset``.

        A ``Range`` header is sent only when an offset or limit is given.

        Args:
            offset: Byte offset to start from.
            limit: Number of bytes to read. None = read to end; 0 yields
                nothing.

        Raises:
            Whatever ``resp.raise_for_status()`` raises on an HTTP error status.
        """
        if limit == 0:
            # A zero-byte read would otherwise build the inverted range
            # "bytes=N-(N-1)".
            return

        headers = dict(self._headers)

        if offset > 0 or limit is not None:
            # HTTP ranges are inclusive; an open end means "to end of file".
            end = "" if limit is None else str(offset + limit - 1)
            headers["range"] = f"bytes={offset}-{end}"

        async with create_aiohttp_session(self._url, headers=headers) as (session, proxy_url):
            async with session.get(
                self._url,
                headers=headers,
                proxy=proxy_url,
                allow_redirects=True,
            ) as resp:
                resp.raise_for_status()
                async for chunk in resp.content.iter_any():
                    yield chunk
|
||||
@@ -0,0 +1,469 @@
|
||||
"""
|
||||
Streaming MKV demuxer.
|
||||
|
||||
Reads an MKV byte stream via an async iterator and yields individual media
|
||||
frames (MKVFrame) with absolute timestamps. Designed for on-the-fly remuxing
|
||||
without buffering the entire file.
|
||||
|
||||
Architecture:
|
||||
AsyncIterator[bytes] -> StreamBuffer -> EBML parsing -> MKVFrame yields
|
||||
|
||||
The demuxer works in two phases:
|
||||
1. read_header(): Consume bytes until Tracks is fully parsed, returning
|
||||
a list of MKVTrack with codec metadata.
|
||||
2. iter_frames(): Yield MKVFrame objects from Cluster/SimpleBlock data
|
||||
as clusters arrive.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections.abc import AsyncIterator
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from mediaflow_proxy.remuxer.ebml_parser import (
|
||||
CLUSTER,
|
||||
CLUSTER_TIMESTAMP,
|
||||
EBML_HEADER,
|
||||
INFO,
|
||||
MKVFrame,
|
||||
MKVTrack,
|
||||
SEGMENT,
|
||||
SIMPLE_BLOCK,
|
||||
BLOCK_GROUP,
|
||||
TRACKS,
|
||||
TIMESTAMP_SCALE,
|
||||
DURATION,
|
||||
UNKNOWN_SIZE,
|
||||
extract_block_frames,
|
||||
parse_tracks,
|
||||
read_element_id,
|
||||
read_element_size,
|
||||
read_float,
|
||||
read_uint,
|
||||
_parse_block_group,
|
||||
iter_elements,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StreamBuffer:
    """
    Accumulating byte buffer for streaming EBML parsing.

    Collects chunks from an async byte source and provides read-ahead
    capabilities for EBML element parsing. Supports consuming parsed
    bytes to keep memory usage bounded.
    """

    def __init__(self) -> None:
        self._chunks: list[bytes] = []
        self._total: int = 0
        self._consumed: int = 0  # Logical bytes consumed (for offset tracking)

    @property
    def available(self) -> int:
        """Number of buffered bytes available for reading."""
        return self._total

    @property
    def consumed(self) -> int:
        """Total bytes consumed so far (for absolute offset tracking)."""
        return self._consumed

    def append(self, data: bytes) -> None:
        """Add bytes to the buffer; empty data is ignored."""
        if data:
            self._chunks.append(data)
            self._total += len(data)

    def peek(self, size: int) -> bytes:
        """Read up to size bytes from the front without consuming them."""
        if size <= 0 or not self._chunks:
            return b""

        head = self._chunks[0]
        if len(head) >= size:
            # Common case: the request is satisfied by the first chunk alone.
            return bytes(head[:size])

        parts = []
        remaining = size
        for chunk in self._chunks:
            if remaining <= 0:
                break
            piece = chunk[:remaining]
            parts.append(piece)
            remaining -= len(piece)
        return b"".join(parts)

    def get_all(self) -> bytes:
        """
        Get all buffered data as a single bytes object (without consuming).

        Multiple chunks are merged in place so repeated calls are cheap.
        An empty buffer returns ``b""`` and leaves the chunk list empty.
        """
        if not self._chunks:
            return b""
        if len(self._chunks) > 1:
            self._chunks = [b"".join(self._chunks)]
        return self._chunks[0]

    def consume(self, size: int) -> bytes:
        """
        Remove and return size bytes from the front of the buffer.

        The request is clamped to the number of buffered bytes, so fewer
        bytes than asked for may be returned.
        """
        if size <= 0:
            return b""
        remaining = min(size, self._total)

        parts = []
        while remaining > 0 and self._chunks:
            head = self._chunks[0]
            if len(head) <= remaining:
                parts.append(head)
                remaining -= len(head)
                self._chunks.pop(0)
            else:
                # Split the front chunk: hand out the prefix, keep the rest.
                parts.append(head[:remaining])
                self._chunks[0] = head[remaining:]
                remaining = 0

        result = b"".join(parts)
        self._total -= len(result)
        self._consumed += len(result)
        return result

    def skip(self, size: int) -> int:
        """Discard size bytes from the front. Returns actual bytes skipped."""
        if size <= 0:
            return 0
        actual = min(size, self._total)
        remaining = actual
        while remaining > 0 and self._chunks:
            head = self._chunks[0]
            if len(head) <= remaining:
                remaining -= len(head)
                self._chunks.pop(0)
            else:
                self._chunks[0] = head[remaining:]
                remaining = 0
        self._total -= actual
        self._consumed += actual
        return actual
|
||||
|
||||
|
||||
@dataclass
class MKVHeader:
    """Parsed MKV header metadata."""

    # Track entries produced by parse_tracks(); empty until Tracks is parsed.
    tracks: list[MKVTrack] = field(default_factory=list)
    # Value of the Info/TimestampScale element, in nanoseconds per tick.
    timestamp_scale_ns: int = 1_000_000  # Default 1ms
    # Info/Duration multiplied by the timestamp scale, expressed in milliseconds.
    duration_ms: float = 0.0
    segment_data_offset: int = 0  # Absolute byte offset of Segment children
|
||||
|
||||
|
||||
class MKVDemuxer:
    """
    Streaming async MKV demuxer.

    Reads an MKV byte stream from an async iterator and provides:
    - read_header(): Parse EBML header + Segment metadata + Tracks
    - iter_frames(): Yield MKVFrame objects from Clusters

    Usage:
        demuxer = MKVDemuxer()
        header = await demuxer.read_header(source)
        async for frame in demuxer.iter_frames(source):
            process(frame)
    """

    # Minimum bytes to try parsing an element header (ID + size)
    _MIN_ELEMENT_HEADER = 12

    def __init__(self) -> None:
        self._buf = StreamBuffer()
        self._header: MKVHeader | None = None
        self._scale_ms: float = 1.0  # timestamp_scale / 1_000_000

    @property
    def header(self) -> MKVHeader | None:
        """Header parsed by read_header(), or None before it has run."""
        return self._header

    async def read_header(self, source: AsyncIterator[bytes]) -> MKVHeader:
        """
        Read and parse the MKV header (EBML header, Segment, Info, Tracks).

        Consumes bytes from source until Tracks is fully parsed. Any leftover
        bytes (start of first Cluster) remain in the internal buffer for
        iter_frames().

        Returns:
            MKVHeader with track info and timing metadata.

        Raises:
            ValueError: if the stream is too short, does not start with an
                EBML header, or the EBML header is not followed by a Segment.
        """
        header = MKVHeader()

        # Phase 1: Accumulate enough data for EBML header + Segment header
        await self._ensure_bytes(source, 64)

        data = self._buf.get_all()
        if len(data) < 4:
            raise ValueError(
                f"Source ended prematurely: got {len(data)} bytes, need at least an EBML header (source disconnected?)"
            )
        pos = 0

        # Parse EBML Header
        eid, pos = read_element_id(data, pos)
        if eid != EBML_HEADER:
            raise ValueError(f"Not an MKV file: expected EBML header, got 0x{eid:X}")
        size, pos = read_element_size(data, pos)
        if size == UNKNOWN_SIZE:
            raise ValueError("EBML header has unknown size")
        pos += size  # Skip EBML header content

        # The EBML header may end near (or beyond) the bytes buffered so far;
        # make sure the Segment element header is available before parsing it.
        await self._ensure_bytes(source, pos + self._MIN_ELEMENT_HEADER)
        data = self._buf.get_all()

        # Parse Segment element header
        eid, pos = read_element_id(data, pos)
        if eid != SEGMENT:
            raise ValueError(f"Expected Segment, got 0x{eid:X}")
        _seg_size, pos = read_element_size(data, pos)
        header.segment_data_offset = self._buf.consumed + pos

        # Phase 2: Parse Segment children until we have Tracks
        # We need to iterate top-level Segment children: SeekHead, Info, Tracks
        # Stop when we hit the first Cluster (media data).
        tracks_found = False

        while not tracks_found:
            # Ensure we have enough for element header
            await self._ensure_bytes(source, pos + self._MIN_ELEMENT_HEADER)
            data = self._buf.get_all()

            if pos >= len(data):
                break

            try:
                eid, pos2 = read_element_id(data, pos)
                size, pos3 = read_element_size(data, pos2)
            except (ValueError, IndexError):
                # One retry with a little more data before giving up.
                await self._ensure_bytes(source, pos + 32)
                data = self._buf.get_all()
                try:
                    eid, pos2 = read_element_id(data, pos)
                    size, pos3 = read_element_size(data, pos2)
                except (ValueError, IndexError):
                    break

            if eid == CLUSTER:
                # Reached media data; header parsing is done.
                # Don't consume the Cluster -- leave it for iter_frames.
                break

            if size == UNKNOWN_SIZE:
                # Can't handle unknown-size elements in header
                logger.warning("[mkv_demuxer] Unknown-size element 0x%X in header at pos %d", eid, pos)
                break

            # Ensure we have the full element
            elem_end = pos3 + size
            await self._ensure_bytes(source, elem_end)
            data = self._buf.get_all()

            if eid == INFO:
                self._parse_info_element(data, pos3, elem_end, header)
            elif eid == TRACKS:
                header.tracks = parse_tracks(data, pos3, elem_end)
                tracks_found = True
                logger.info(
                    "[mkv_demuxer] Parsed %d tracks: %s",
                    len(header.tracks),
                    ", ".join(f"#{t.track_number}={t.codec_id}" for t in header.tracks),
                )

            pos = elem_end

        # Drop everything up to the current position (Cluster boundary).
        # skip() is used because the bytes themselves are not needed.
        self._buf.skip(pos)

        # Set timing scale
        self._scale_ms = header.timestamp_scale_ns / 1_000_000.0
        self._header = header
        return header

    async def iter_frames(self, source: AsyncIterator[bytes]) -> AsyncIterator[MKVFrame]:
        """
        Yield MKVFrame objects from Cluster/SimpleBlock data.

        Must be called after read_header(). Continues consuming bytes from
        source, parsing Clusters and yielding individual frames. Iteration
        ends when the source is exhausted, an element header cannot be
        parsed, or a non-Cluster element of unknown size is met.

        Raises:
            RuntimeError: if read_header() has not been called.
        """
        if self._header is None:
            raise RuntimeError("read_header() must be called before iter_frames()")

        while True:
            parsed = await self._next_element_header(source)
            if parsed is None:
                break
            eid, size, data_start = parsed

            if eid == CLUSTER:
                if size == UNKNOWN_SIZE:
                    # Unknown-size Cluster: parse children until we hit the next
                    # Cluster or run out of data
                    self._buf.skip(data_start)  # drop Cluster header
                    async for frame in self._parse_unknown_size_cluster(source):
                        yield frame
                else:
                    # Known-size Cluster: ensure we have all data
                    elem_end = data_start + size
                    await self._ensure_bytes(source, elem_end)
                    data = self._buf.get_all()

                    for frame in self._parse_cluster_data(data, data_start, elem_end):
                        yield frame

                    self._buf.skip(elem_end)
                continue

            # Skip non-Cluster top-level elements
            if size == UNKNOWN_SIZE:
                break
            elem_end = data_start + size
            buffered = self._buf.available
            if elem_end > buffered:
                # Need to skip bytes we don't have yet
                self._buf.skip(buffered)
                await self._skip_bytes(source, elem_end - buffered)
            else:
                self._buf.skip(elem_end)

    async def _next_element_header(self, source: AsyncIterator[bytes]) -> tuple[int, int, int] | None:
        """
        Parse the element header at the front of the buffer.

        Returns:
            (element_id, size, data_start) where data_start is the offset of
            the element payload within the buffer, or None when the source
            is exhausted or the header cannot be parsed even after one
            attempt to buffer more data.
        """
        if not await self._ensure_bytes_soft(source, self._MIN_ELEMENT_HEADER):
            return None

        data = self._buf.get_all()
        try:
            eid, after_id = read_element_id(data, 0)
            size, data_start = read_element_size(data, after_id)
        except (ValueError, IndexError):
            # Try to get more data
            if not await self._ensure_bytes_soft(source, len(data) + 4096):
                return None
            data = self._buf.get_all()
            try:
                eid, after_id = read_element_id(data, 0)
                size, data_start = read_element_size(data, after_id)
            except (ValueError, IndexError):
                return None
        return eid, size, data_start

    def _parse_info_element(self, data: bytes, start: int, end: int, header: MKVHeader) -> None:
        """
        Parse Info element children for timestamp scale and duration.

        Duration is stored in timestamp-scale ticks, so it is converted to
        milliseconds only after all children were read; this keeps the
        result correct when Duration appears before TimestampScale.
        """
        raw_duration = None
        for eid, off, size, _ in iter_elements(data, start, end):
            if eid == TIMESTAMP_SCALE:
                header.timestamp_scale_ns = read_uint(data, off, size)
            elif eid == DURATION:
                raw_duration = read_float(data, off, size)

        if raw_duration is not None:
            scale = header.timestamp_scale_ns / 1_000_000.0
            header.duration_ms = raw_duration * scale

    def _iter_simple_block_frames(self, data: bytes, offset: int, size: int, cluster_timecode: int):
        """Yield one MKVFrame per frame contained in the SimpleBlock at offset."""
        for track_num, rel_tc, flags, frame_list in extract_block_frames(data, offset, size):
            is_kf = bool(flags & 0x80)  # bit 0x80 of the flags byte marks a keyframe
            abs_ts_ms = (cluster_timecode + rel_tc) * self._scale_ms
            for frame_data in frame_list:
                yield MKVFrame(
                    track_number=track_num,
                    timestamp_ms=abs_ts_ms,
                    is_keyframe=is_kf,
                    data=frame_data,
                )

    def _parse_cluster_data(self, data: bytes, start: int, end: int) -> list[MKVFrame]:
        """Parse a known-size Cluster and return its frames."""
        cluster_timecode = 0
        frames = []

        for eid, data_off, size, _ in iter_elements(data, start, end):
            if eid == CLUSTER_TIMESTAMP:
                cluster_timecode = read_uint(data, data_off, size)
            elif eid == SIMPLE_BLOCK:
                frames.extend(self._iter_simple_block_frames(data, data_off, size, cluster_timecode))
            elif eid == BLOCK_GROUP:
                _parse_block_group(data, data_off, data_off + size, cluster_timecode, self._scale_ms, frames)

        return frames

    async def _parse_unknown_size_cluster(self, source: AsyncIterator[bytes]) -> AsyncIterator[MKVFrame]:
        """Parse an unknown-size Cluster by reading children until next Cluster."""
        cluster_timecode = 0

        while True:
            parsed = await self._next_element_header(source)
            if parsed is None:
                break
            eid, size, data_start = parsed

            # A new Cluster or Segment signals end of current Cluster
            if eid == CLUSTER or eid == SEGMENT:
                break

            if size == UNKNOWN_SIZE:
                break

            elem_end = data_start + size
            await self._ensure_bytes(source, elem_end)
            data = self._buf.get_all()

            if eid == CLUSTER_TIMESTAMP:
                cluster_timecode = read_uint(data, data_start, size)
            elif eid == SIMPLE_BLOCK:
                for frame in self._iter_simple_block_frames(data, data_start, size, cluster_timecode):
                    yield frame
            elif eid == BLOCK_GROUP:
                bg_frames = []
                _parse_block_group(data, data_start, elem_end, cluster_timecode, self._scale_ms, bg_frames)
                for frame in bg_frames:
                    yield frame

            self._buf.skip(elem_end)

    async def _ensure_bytes(self, source: AsyncIterator[bytes], needed: int) -> None:
        """
        Pull chunks from source until the buffer holds at least 'needed' bytes.

        Returns silently when the source is exhausted first, so callers must
        not assume the buffer actually reached the requested size.
        """
        while self._buf.available < needed:
            try:
                chunk = await source.__anext__()
                self._buf.append(chunk)
            except StopAsyncIteration:
                return

    async def _ensure_bytes_soft(self, source: AsyncIterator[bytes], needed: int) -> bool:
        """
        Like _ensure_bytes but reports the outcome.

        Returns True when 'needed' bytes are buffered. When the source ends
        (or yields an empty chunk) first, returns whether any bytes at all
        remain in the buffer.
        """
        while self._buf.available < needed:
            try:
                chunk = await source.__anext__()
                if not chunk:
                    return self._buf.available > 0
                self._buf.append(chunk)
            except StopAsyncIteration:
                return self._buf.available > 0
        return True

    async def _skip_bytes(self, source: AsyncIterator[bytes], count: int) -> None:
        """Skip count bytes from the source without buffering."""
        remaining = count
        while remaining > 0:
            try:
                chunk = await source.__anext__()
                if len(chunk) <= remaining:
                    remaining -= len(chunk)
                else:
                    # Put the excess back
                    self._buf.append(chunk[remaining:])
                    remaining = 0
            except StopAsyncIteration:
                break
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,834 @@
|
||||
"""
|
||||
MP4 container parser for moov atom probing.
|
||||
|
||||
Provides:
|
||||
- MP4Index: seek index extracted from MP4 moov atom (parallel to MKVCueIndex)
|
||||
- Top-level atom scanning
|
||||
- Sample table parsers (stco, co64, stss, stsz, stts, stsc)
|
||||
- Moov-to-cue-point builder
|
||||
- rewrite_moov_offsets: adjust stco/co64 in moov for file rearrangement
|
||||
|
||||
The parsers are the inverse of the builder functions in mp4_muxer.py.
|
||||
Box navigation reuses the pattern from ts_muxer.py's read_box/find_box/iter_boxes.
|
||||
"""
|
||||
|
||||
import bisect
|
||||
import logging
|
||||
import struct
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# =============================================================================
|
||||
# MP4 Box Utilities
|
||||
# =============================================================================
|
||||
|
||||
# Minimum bytes needed to read a standard box header
_BOX_HEADER_SIZE = 8

# ftyp brands that identify MP4/MOV containers.
# Every brand is exactly four bytes; short names are space-padded
# (the QuickTime brand is "qt" followed by two spaces).
_MP4_BRANDS = {
    b"isom",
    b"iso2",
    b"iso3",
    b"iso4",
    b"iso5",
    b"iso6",
    b"mp41",
    b"mp42",
    b"M4V ",
    b"M4A ",
    b"f4v ",
    b"kddi",
    b"avc1",
    b"qt  ",
    b"MSNV",
    b"dash",
    b"3gp4",
    b"3gp5",
    b"3gp6",
}


def is_mp4_header(data: bytes) -> bool:
    """
    Check if the data starts with an ftyp box (MP4 signature).

    Returns:
        False when fewer than 8 bytes are given, the first box is not
        ``ftyp``, or the declared box size is below 12. When the declared
        size exceeds the available data the box is assumed to be a valid
        but truncated ftyp and True is returned without a brand check.
        Otherwise True only if the major brand is in ``_MP4_BRANDS``.
    """
    if len(data) < 8:
        return False
    size = struct.unpack_from(">I", data, 0)[0]
    box_type = data[4:8]
    if box_type != b"ftyp":
        return False
    if size < 12:
        # Too small to even hold a major brand.
        return False
    if size > len(data):
        return True  # might be valid but truncated
    major_brand = data[8:12]
    return major_brand in _MP4_BRANDS
|
||||
|
||||
|
||||
def read_box_header(data: bytes, offset: int) -> tuple[bytes, int, int] | None:
|
||||
"""
|
||||
Read a box header at the given offset.
|
||||
|
||||
Returns:
|
||||
(box_type, header_size, total_box_size) or None if not enough data.
|
||||
"""
|
||||
if offset + 8 > len(data):
|
||||
return None
|
||||
|
||||
size, box_type = struct.unpack_from(">I4s", data, offset)
|
||||
header_size = 8
|
||||
|
||||
if size == 1: # Extended size (64-bit)
|
||||
if offset + 16 > len(data):
|
||||
return None
|
||||
size = struct.unpack_from(">Q", data, offset + 8)[0]
|
||||
header_size = 16
|
||||
elif size == 0: # Box extends to end of data
|
||||
size = len(data) - offset
|
||||
|
||||
return box_type, header_size, size
|
||||
|
||||
|
||||
def iter_top_level_boxes(data: bytes):
    """
    Walk the sequence of boxes laid out back-to-back in ``data``.

    Stops at the first header that cannot be read, or after a box whose
    total size is 0 (no forward progress would be possible).

    Yields:
        (box_type, header_size, total_size, data_offset)
    """
    cursor = 0
    limit = len(data)
    while cursor < limit:
        header = read_box_header(data, cursor)
        if header is None:
            return
        kind, hdr_len, box_len = header
        yield kind, hdr_len, box_len, cursor + hdr_len
        if box_len == 0:
            return
        cursor += box_len
|
||||
|
||||
|
||||
def find_box(data: bytes, target: bytes) -> bytes | None:
    """Return the body (bytes after the header) of the first box of type ``target``, or None."""
    for kind, hdr_len, box_len, body_start in iter_top_level_boxes(data):
        if kind != target:
            continue
        # Body ends where the box ends: box start (body_start - hdr_len) + box_len.
        body_end = body_start - hdr_len + box_len
        return data[body_start:body_end]
    return None
|
||||
|
||||
|
||||
def iter_boxes(data: bytes):
    """Walk the boxes in ``data``, yielding (box_type, box_body_bytes) for each."""
    for kind, hdr_len, box_len, body_start in iter_top_level_boxes(data):
        box_start = body_start - hdr_len
        yield kind, data[body_start : box_start + box_len]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Sample Table Parsers (inverse of mp4_muxer.py builders)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def parse_full_box_header(data: bytes) -> tuple[int, int, int]:
    """
    Decode the version byte and 24-bit flags that open a full box body.

    Returns:
        (version, flags, header_size) where header_size is 4 bytes, or
        (0, 0, 0) when fewer than 4 bytes are available.
    """
    if len(data) < 4:
        return 0, 0, 0
    # Flags are a big-endian 24-bit integer following the version byte.
    return data[0], int.from_bytes(data[1:4], "big"), 4
|
||||
|
||||
|
||||
def parse_stco(data: bytes) -> list[int]:
    """
    Parse Chunk Offset box (stco) - 32-bit offsets.

    Layout: version(1) + flags(3) + entry_count(4) + [offset(4)]...

    Returns:
        The chunk offsets, or an empty list when the body is shorter than
        its header or than the declared number of entries.
    """
    if len(data) < 8:
        return []
    # entry_count sits right after the 4-byte version/flags prefix.
    (entry_count,) = struct.unpack_from(">I", data, 4)
    if len(data) < 8 + entry_count * 4:
        return []
    # Decode the whole table with one struct call instead of one per entry.
    return list(struct.unpack_from(f">{entry_count}I", data, 8))
|
||||
|
||||
|
||||
def parse_co64(data: bytes) -> list[int]:
    """
    Parse Chunk Offset box (co64) - 64-bit offsets.

    Layout: version(1) + flags(3) + entry_count(4) + [offset(8)]...

    Returns:
        The chunk offsets, or an empty list when the body is shorter than
        its header or than the declared number of entries.
    """
    if len(data) < 8:
        return []
    # entry_count sits right after the 4-byte version/flags prefix.
    (entry_count,) = struct.unpack_from(">I", data, 4)
    if len(data) < 8 + entry_count * 8:
        return []
    # Decode the whole table with one struct call instead of one per entry.
    return list(struct.unpack_from(f">{entry_count}Q", data, 8))
|
||||
|
||||
|
||||
def parse_stss(data: bytes) -> list[int]:
    """
    Parse Sync Sample box (stss) - keyframe indices (1-based).

    Layout: version(1) + flags(3) + entry_count(4) + [sample_number(4)]...

    Returns:
        The sync sample numbers, or an empty list when the body is shorter
        than its header or than the declared number of entries.
    """
    if len(data) < 8:
        return []
    # entry_count sits right after the 4-byte version/flags prefix.
    (entry_count,) = struct.unpack_from(">I", data, 4)
    if len(data) < 8 + entry_count * 4:
        return []
    # Decode the whole table with one struct call instead of one per entry.
    return list(struct.unpack_from(f">{entry_count}I", data, 8))
|
||||
|
||||
|
||||
def parse_stsz(data: bytes) -> tuple[int, list[int]]:
    """
    Parse Sample Size box (stsz).

    Layout: version(1) + flags(3) + sample_size(4) + sample_count(4) + [size(4)]...

    Returns:
        (uniform_size, sizes_list).
        If uniform_size > 0, all samples have that size and sizes_list is empty.
        Otherwise, sizes_list contains per-sample sizes. ``(0, [])`` is
        returned when the body is shorter than its header or than the
        declared number of samples.
    """
    if len(data) < 12:
        return 0, []
    # Both counters follow the 4-byte version/flags prefix.
    sample_size, sample_count = struct.unpack_from(">II", data, 4)

    if sample_size > 0:
        return sample_size, []

    if len(data) < 12 + sample_count * 4:
        return 0, []

    # Decode the whole table with one struct call instead of one per entry.
    return 0, list(struct.unpack_from(f">{sample_count}I", data, 12))
|
||||
|
||||
|
||||
def parse_stts(data: bytes) -> list[tuple[int, int]]:
    """
    Parse Time-to-Sample box (stts) - run-length encoded durations.

    Layout: version(1) + flags(3) + entry_count(4) + [sample_count(4) + sample_delta(4)]...

    Returns:
        List of (sample_count, sample_delta) entries; empty when the body
        is shorter than its header or than the declared number of entries.
    """
    if len(data) < 8:
        return []
    # entry_count sits right after the 4-byte version/flags prefix.
    (entry_count,) = struct.unpack_from(">I", data, 4)
    table_end = 8 + entry_count * 8
    if len(data) < table_end:
        return []
    # iter_unpack decodes every fixed-size record in one pass; the
    # memoryview slice avoids copying the table.
    return list(struct.iter_unpack(">II", memoryview(data)[8:table_end]))
|
||||
|
||||
|
||||
def parse_stsc(data: bytes) -> list[tuple[int, int, int]]:
    """
    Parse Sample-to-Chunk box (stsc).

    Layout: version(1) + flags(3) + entry_count(4) +
            [first_chunk(4) + samples_per_chunk(4) + sample_desc_index(4)]...

    Returns:
        List of (first_chunk, samples_per_chunk, sample_desc_index) entries.
        first_chunk is 1-based. The list is empty when the body is shorter
        than its header or than the declared number of entries.
    """
    if len(data) < 8:
        return []
    # entry_count sits right after the 4-byte version/flags prefix.
    (entry_count,) = struct.unpack_from(">I", data, 4)
    table_end = 8 + entry_count * 12
    if len(data) < table_end:
        return []
    # iter_unpack decodes every fixed-size record in one pass; the
    # memoryview slice avoids copying the table.
    return list(struct.iter_unpack(">III", memoryview(data)[8:table_end]))
|
||||
|
||||
|
||||
def parse_mdhd(data: bytes) -> tuple[int, int]:
    """
    Read timescale and duration out of a Media Header box (mdhd) body.

    Version 1 bodies carry 64-bit creation/modification times and a 64-bit
    duration; any other version is read with the 32-bit layout.

    Returns:
        (timescale, duration) in media timescale units, or (0, 0) when the
        body is too short for its layout.
    """
    if len(data) < 4:
        return 0, 0

    if data[0] == 1:
        # 64-bit: skip version(1)+flags(3)+creation(8)+modification(8)
        layout, field_start, min_len = ">IQ", 20, 32
    else:
        # 32-bit: skip version(1)+flags(3)+creation(4)+modification(4)
        layout, field_start, min_len = ">II", 12, 20

    if len(data) < min_len:
        return 0, 0

    timescale, duration = struct.unpack_from(layout, data, field_start)
    return timescale, duration
|
||||
|
||||
|
||||
def parse_stsd_codec(data: bytes) -> str:
    """
    Extract the codec FourCC of the first entry in a Sample Description box (stsd).

    Returns the codec name as a string (e.g. "avc1", "hvc1", "mp4a"), or
    an empty string when the body is too short or the FourCC is not ASCII.
    """
    # version(1)+flags(3)+entry_count(4), then first entry: size(4)+type(4)
    if len(data) < 16:
        return ""
    fourcc = data[12:16]
    try:
        text = fourcc.decode("ascii")
    except (UnicodeDecodeError, ValueError):
        return ""
    return text.strip()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MP4 Index (parallel to MKVCueIndex)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@dataclass
class MP4Index:
    """
    Seek index extracted from an MP4 file's moov atom.

    Parallel to ``MKVCueIndex`` for MKV files. Holds keyframe cue points
    for time-based seeking together with the raw ftyp/moov bytes and the
    moov/mdat locations.
    """

    duration_ms: float = 0.0
    timescale: int = 0
    cue_points: list[tuple[float, int]] = field(default_factory=list)  # [(time_ms, byte_offset), ...]
    moov_offset: int = 0  # Absolute file offset where moov atom starts
    moov_size: int = 0  # Total size of the moov atom (header + body)
    moov_data: bytes = b""  # Raw moov atom bytes (for prepending to mdat pipe)
    ftyp_data: bytes = b""  # Raw ftyp atom bytes (for prepending before moov)
    mdat_offset: int = 0  # Absolute file offset where mdat atom starts
    mdat_size: int = 0  # Total size of the mdat atom
    video_codec: str = ""  # e.g. "avc1", "hvc1", "mp4v"
    audio_codec: str = ""  # e.g. "mp4a", "ac-3"

    def byte_offset_for_time(self, time_ms: float) -> tuple[int, float]:
        """
        Look up the last cue point whose time is at or before time_ms.

        A time earlier than the first cue point maps to the first cue
        point; an index without cue points yields ``(0, 0.0)``.

        Returns:
            (absolute_byte_offset, actual_keyframe_time_ms)
        """
        if not self.cue_points:
            return 0, 0.0

        cue_times = [when for when, _ in self.cue_points]
        # bisect_right gives the insertion point after equal times, so the
        # entry just before it is the latest cue not later than time_ms.
        pick = max(bisect.bisect_right(cue_times, time_ms) - 1, 0)

        cue_time_ms, byte_offset = self.cue_points[pick]
        return byte_offset, cue_time_ms
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Moov -> Cue Points Builder
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _find_nested_box(data: bytes, *path: bytes) -> bytes | None:
    """
    Descend through nested boxes, one ``find_box`` lookup per name in ``path``.

    Example: ``_find_nested_box(trak_body, b"mdia", b"hdlr")``.

    Returns:
        The body of the innermost box, or None as soon as any level is missing.
    """
    node = data
    for name in path:
        node = find_box(node, name)
        if node is None:
            return None
    return node
|
||||
|
||||
|
||||
def build_cue_points_from_moov(moov_body: bytes) -> tuple[list[tuple[float, int]], float, int, str, str]:
    """
    Parse a moov body to build keyframe-indexed cue points.

    Scans the top-level boxes of ``moov_body``:

    - The first ``trak`` whose handler type is ``vide`` supplies the sample
      tables (stco/co64, stss, stsz, stts, stsc), the mdhd box and the
      video codec name.
    - The first ``soun`` trak that yields a codec name supplies the audio
      codec name.
    - ``mvhd`` supplies a fallback timescale/duration; the video mdhd
      overrides both when its timescale is positive.

    Cue times are accumulated from the stts sample durations; ctts
    (composition offsets) is not consulted.

    Args:
        moov_body: Bytes of the moov box after its box header.

    Returns:
        (cue_points, duration_ms, timescale, video_codec, audio_codec).
        ``cue_points`` is a list of ``(time_ms, byte_offset)`` pairs, one
        per sync sample. It is empty when no video stbl is found, when
        there are no chunk offsets, or when the timescale is zero.
    """
    cue_points: list[tuple[float, int]] = []
    duration_ms = 0.0
    timescale = 0
    video_codec = ""
    audio_codec = ""

    video_stbl = None
    video_mdhd = None

    offset = 0
    data = moov_body
    while offset < len(data):
        result = read_box_header(data, offset)
        if result is None:
            break
        box_type, hdr_size, total_size = result

        if box_type == b"trak":
            trak_body = data[offset + hdr_size : offset + total_size]

            # Check handler type to identify video/audio
            hdlr_data = _find_nested_box(trak_body, b"mdia", b"hdlr")
            handler_type = b""
            if hdlr_data and len(hdlr_data) >= 12:
                # hdlr: version(1)+flags(3)+pre_defined(4)+handler_type(4)
                handler_type = hdlr_data[8:12]

            if handler_type == b"vide" and video_stbl is None:
                video_stbl = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl")
                video_mdhd_data = _find_nested_box(trak_body, b"mdia", b"mdhd")
                if video_mdhd_data:
                    video_mdhd = video_mdhd_data

                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if stsd_data:
                    video_codec = parse_stsd_codec(stsd_data)

            elif handler_type == b"soun" and not audio_codec:
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if stsd_data:
                    audio_codec = parse_stsd_codec(stsd_data)

        elif box_type == b"mvhd":
            # Fallback: parse mvhd for timescale/duration if no mdhd
            mvhd_body = data[offset + hdr_size : offset + total_size]
            if len(mvhd_body) >= 20:
                version = mvhd_body[0]
                if version == 1:
                    # v1 layout: version/flags(4) + creation(8) + modification(8)
                    # + timescale(4) at 20 + duration(8) at 24 -> 32 bytes needed.
                    # (A 28-byte check would let the 8-byte read overrun.)
                    if len(mvhd_body) >= 32:
                        ts = struct.unpack_from(">I", mvhd_body, 20)[0]
                        dur = struct.unpack_from(">Q", mvhd_body, 24)[0]
                        if timescale == 0:
                            timescale = ts
                            duration_ms = dur / ts * 1000.0 if ts else 0.0
                else:
                    # v0 layout: timescale(4) at 12 + duration(4) at 16
                    ts = struct.unpack_from(">I", mvhd_body, 12)[0]
                    dur = struct.unpack_from(">I", mvhd_body, 16)[0]
                    if timescale == 0:
                        timescale = ts
                        duration_ms = dur / ts * 1000.0 if ts else 0.0

        # A zero-size box would never advance the cursor.
        if total_size == 0:
            break
        offset += total_size

    # Parse mdhd for video timescale (more precise than mvhd)
    if video_mdhd:
        ts, dur = parse_mdhd(video_mdhd)
        if ts > 0:
            timescale = ts
            duration_ms = dur / ts * 1000.0

    if video_stbl is None:
        logger.warning("[mp4_parser] No video stbl found in moov")
        return cue_points, duration_ms, timescale, video_codec, audio_codec

    # Parse sample tables from video stbl
    stco_data = find_box(video_stbl, b"stco")
    co64_data = find_box(video_stbl, b"co64")
    stss_data = find_box(video_stbl, b"stss")
    stsz_data = find_box(video_stbl, b"stsz")
    stts_data = find_box(video_stbl, b"stts")
    stsc_data = find_box(video_stbl, b"stsc")

    # Chunk offsets (co64 takes precedence over stco when both exist)
    chunk_offsets = parse_co64(co64_data) if co64_data else (parse_stco(stco_data) if stco_data else [])

    # Keyframe sample numbers (1-based)
    keyframe_samples = set(parse_stss(stss_data)) if stss_data else set()
    all_are_keyframes = not stss_data  # No stss means all samples are sync

    # Sample sizes
    uniform_size, size_list = parse_stsz(stsz_data) if stsz_data else (0, [])

    # Sample durations (run-length encoded)
    stts_entries = parse_stts(stts_data) if stts_data else []

    # Sample-to-chunk mapping
    stsc_entries = parse_stsc(stsc_data) if stsc_data else []

    if not chunk_offsets or timescale == 0:
        logger.warning(
            "[mp4_parser] Missing data: chunks=%d, timescale=%d",
            len(chunk_offsets),
            timescale,
        )
        return cue_points, duration_ms, timescale, video_codec, audio_codec

    # Expand stts to per-sample durations
    sample_durations: list[int] = []
    for count, delta in stts_entries:
        sample_durations.extend([delta] * count)

    # Expand stsc into a per-chunk sample count (index = 0-based chunk).
    total_chunks = len(chunk_offsets)
    if stsc_entries:
        chunk_sample_counts: list[int] = [0] * total_chunks
        for i, (first_chunk, spc, _sdi) in enumerate(stsc_entries):
            # first_chunk is 1-based. Clamp at 0 so a malformed entry with
            # first_chunk == 0 cannot wrap around to the last chunk via
            # negative indexing.
            start = max(first_chunk - 1, 0)
            if i + 1 < len(stsc_entries):
                end = stsc_entries[i + 1][0] - 1
            else:
                end = total_chunks
            for c in range(start, min(end, total_chunks)):
                chunk_sample_counts[c] = spc
    else:
        # Default: 1 sample per chunk
        chunk_sample_counts = [1] * total_chunks

    # Count total samples
    total_samples = sum(chunk_sample_counts)

    # Get per-sample sizes
    if uniform_size > 0:
        sample_sizes = [uniform_size] * total_samples
    else:
        sample_sizes = size_list

    # Build cumulative timestamp for each sample and map keyframes to byte offsets
    current_sample = 0  # 0-based sample index
    current_time = 0  # in timescale units

    for chunk_offset, spc in zip(chunk_offsets, chunk_sample_counts):
        byte_pos = chunk_offset

        for _ in range(spc):
            sample_num = current_sample + 1  # 1-based for stss comparison
            if all_are_keyframes or sample_num in keyframe_samples:
                time_ms = current_time / timescale * 1000.0
                cue_points.append((time_ms, byte_pos))

            # Advance byte position by this sample's size
            if current_sample < len(sample_sizes):
                byte_pos += sample_sizes[current_sample]

            # Advance timestamp
            if current_sample < len(sample_durations):
                current_time += sample_durations[current_sample]

            current_sample += 1

    logger.info(
        "[mp4_parser] Built %d cue points from %d samples, duration=%.1fs, video=%s, audio=%s",
        len(cue_points),
        total_samples,
        duration_ms / 1000.0,
        video_codec,
        audio_codec,
    )

    return cue_points, duration_ms, timescale, video_codec, audio_codec
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Moov Offset Rewriting (for faststart pipe construction)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _rewrite_stco_in_place(data: bytearray, box_start: int, box_size: int, delta: int) -> int:
|
||||
"""Rewrite stco chunk offsets by adding delta. Returns number of entries fixed."""
|
||||
# FullBox header: version(1) + flags(3) = 4 bytes
|
||||
body_start = box_start + 4
|
||||
if body_start + 4 > box_start + box_size:
|
||||
return 0
|
||||
entry_count = struct.unpack_from(">I", data, body_start)[0]
|
||||
pos = body_start + 4
|
||||
for _ in range(entry_count):
|
||||
if pos + 4 > box_start + box_size:
|
||||
break
|
||||
old_val = struct.unpack_from(">I", data, pos)[0]
|
||||
struct.pack_into(">I", data, pos, old_val + delta)
|
||||
pos += 4
|
||||
return entry_count
|
||||
|
||||
|
||||
def _rewrite_co64_in_place(data: bytearray, box_start: int, box_size: int, delta: int) -> int:
|
||||
"""Rewrite co64 chunk offsets by adding delta. Returns number of entries fixed."""
|
||||
body_start = box_start + 4
|
||||
if body_start + 4 > box_start + box_size:
|
||||
return 0
|
||||
entry_count = struct.unpack_from(">I", data, body_start)[0]
|
||||
pos = body_start + 4
|
||||
for _ in range(entry_count):
|
||||
if pos + 8 > box_start + box_size:
|
||||
break
|
||||
old_val = struct.unpack_from(">Q", data, pos)[0]
|
||||
struct.pack_into(">Q", data, pos, old_val + delta)
|
||||
pos += 8
|
||||
return entry_count
|
||||
|
||||
|
||||
def _walk_and_rewrite(data: bytearray, start: int, end: int, delta: int) -> int:
|
||||
"""
|
||||
Recursively walk boxes within [start, end) looking for stco/co64 boxes
|
||||
and rewriting their offsets.
|
||||
|
||||
Returns total number of offset entries rewritten.
|
||||
"""
|
||||
total = 0
|
||||
offset = start
|
||||
while offset + 8 <= end:
|
||||
size = struct.unpack_from(">I", data, offset)[0]
|
||||
box_type = data[offset + 4 : offset + 8]
|
||||
hdr_size = 8
|
||||
|
||||
if size == 1:
|
||||
if offset + 16 > end:
|
||||
break
|
||||
size = struct.unpack_from(">Q", data, offset + 8)[0]
|
||||
hdr_size = 16
|
||||
elif size == 0:
|
||||
size = end - offset
|
||||
|
||||
if size < 8 or offset + size > end:
|
||||
break
|
||||
|
||||
body_start = offset + hdr_size
|
||||
body_end = offset + size
|
||||
|
||||
if box_type == b"stco":
|
||||
total += _rewrite_stco_in_place(data, body_start, size - hdr_size, delta)
|
||||
elif box_type == b"co64":
|
||||
total += _rewrite_co64_in_place(data, body_start, size - hdr_size, delta)
|
||||
elif box_type in (b"moov", b"trak", b"mdia", b"minf", b"stbl"):
|
||||
# Container box -- recurse into children
|
||||
total += _walk_and_rewrite(data, body_start, body_end, delta)
|
||||
|
||||
offset += size
|
||||
|
||||
return total
|
||||
|
||||
|
||||
def extract_video_track_from_moov(moov_data: bytes):
    """
    Extract video codec configuration from an MP4 moov atom.

    Walks the top-level boxes of the moov body looking for a ``trak`` whose
    handler type is ``vide`` and whose stsd holds at least 16 bytes. From
    the first sample entry it reads the pixel dimensions and the
    codec-private box (avcC or hvcC), and returns a synthetic ``MKVTrack``.

    Args:
        moov_data: Raw bytes of the complete moov box (header + body).

    Returns:
        An ``MKVTrack`` with video metadata, or ``None`` if ``moov_data``
        is shorter than a box header or no usable video track is found.
    """
    from mediaflow_proxy.remuxer.ebml_parser import (
        CODEC_ID_H264,
        CODEC_ID_H265,
        MKVTrack,
    )

    # Strip the moov box header to get the body
    if len(moov_data) < 8:
        return None
    raw_size = struct.unpack_from(">I", moov_data, 0)[0]
    hdr_size = 16 if raw_size == 1 else 8
    moov_body = moov_data[hdr_size:]

    # Walk traks looking for video handler
    offset = 0
    while offset < len(moov_body):
        result = read_box_header(moov_body, offset)
        if result is None:
            break
        box_type, box_hdr_size, total_size = result

        if box_type == b"trak":
            trak_body = moov_body[offset + box_hdr_size : offset + total_size]

            # hdlr: version(1)+flags(3)+pre_defined(4)+handler_type(4)
            hdlr_data = _find_nested_box(trak_body, b"mdia", b"hdlr")
            handler_type = b""
            if hdlr_data and len(hdlr_data) >= 12:
                handler_type = hdlr_data[8:12]

            if handler_type == b"vide":
                # Found video trak -- extract stsd for codec config
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if not stsd_data or len(stsd_data) < 16:
                    # Unusable stsd: keep looking at later traks.
                    offset += total_size
                    continue

                codec_name = parse_stsd_codec(stsd_data)

                # Map MP4 codec names to MKV codec IDs
                if codec_name in ("avc1", "avc3"):
                    mkv_codec_id = CODEC_ID_H264
                elif codec_name in ("hvc1", "hev1"):
                    mkv_codec_id = CODEC_ID_H265
                else:
                    mkv_codec_id = f"V_MP4/{codec_name}"

                # The stsd structure is:
                #   version(1) + flags(3) + entry_count(4)
                #   then entry: size(4) + type(4) + fixed fields + child boxes
                # avcC/hvcC is a child box of the sample entry.
                codec_private = b""
                width = 0
                height = 0

                entry_start = 8  # skip version+flags+entry_count
                if entry_start + 8 <= len(stsd_data):
                    entry_size = struct.unpack_from(">I", stsd_data, entry_start)[0]
                    entry_body_start = entry_start + 8  # skip size+type
                    entry_end = min(entry_start + entry_size, len(stsd_data))

                    # Entry body: 6 reserved + 2 data_ref_idx + 16 pre_defined/
                    # reserved, so width(2) + height(2) sit at offset 24.
                    vis_offset = entry_body_start + 24
                    if vis_offset + 4 <= entry_end:
                        width = struct.unpack_from(">H", stsd_data, vis_offset)[0]
                        height = struct.unpack_from(">H", stsd_data, vis_offset + 2)[0]

                    # Child boxes follow the fixed visual sample entry fields.
                    # Measured from the entry body (same origin as vis_offset):
                    # 8 (SampleEntry) + 16 + 4 (w/h) + 4 + 4 (resolution)
                    # + 4 (reserved) + 2 (frame_count) + 32 (compressorname)
                    # + 2 (depth) + 2 (pre_defined) = 78 bytes.
                    # Starting at 70 would begin the scan inside compressorname.
                    nested_start = entry_body_start + 78
                    if nested_start < entry_end:
                        nested_data = stsd_data[nested_start:entry_end]
                        for target in (b"avcC", b"hvcC"):
                            found = find_box(nested_data, target)
                            if found:
                                codec_private = found
                                break

                # mdhd timing is only used as a gate: when the track reports
                # a positive timescale and duration, a frame duration is
                # filled in.
                default_duration_ns = 0
                mdhd_data = _find_nested_box(trak_body, b"mdia", b"mdhd")
                if mdhd_data and len(mdhd_data) >= 20:
                    ts = 0
                    dur = 0
                    if mdhd_data[0] == 1:
                        # v1: timescale(4) at 20 + duration(8) at 24 -> 32 bytes.
                        # A truncated v1 box is ignored, not read as v0.
                        if len(mdhd_data) >= 32:
                            ts = struct.unpack_from(">I", mdhd_data, 20)[0]
                            dur = struct.unpack_from(">Q", mdhd_data, 24)[0]
                    else:
                        # v0: timescale(4) at 12 + duration(4) at 16
                        ts = struct.unpack_from(">I", mdhd_data, 12)[0]
                        dur = struct.unpack_from(">I", mdhd_data, 16)[0]
                    if ts > 0 and dur > 0:
                        # Rough estimate: assume 24fps if we can't determine.
                        default_duration_ns = int(1_000_000_000 / 24)

                return MKVTrack(
                    track_number=1,
                    track_type=1,  # video
                    codec_id=mkv_codec_id,
                    codec_private=codec_private,
                    pixel_width=width,
                    pixel_height=height,
                    default_duration_ns=default_duration_ns,
                )

        offset += total_size

    return None
|
||||
|
||||
|
||||
def rewrite_moov_offsets(moov_data: bytes, delta: int) -> bytes:
    """
    Rewrite all stco/co64 chunk offsets in a moov atom by adding ``delta``.

    This is needed when rearranging an MP4 file for pipe streaming: the
    original moov's chunk offsets reference positions in the original file
    layout, so they must be shifted when the moov is placed before mdat.

    Args:
        moov_data: Raw bytes of the complete moov box (header + body).
        delta: Offset adjustment to add to every chunk offset.

    Returns:
        A new ``bytes`` object with updated chunk offsets. Input shorter
        than a box header (8 bytes) contains nothing to rewrite and is
        returned unchanged.
    """
    buf = bytearray(moov_data)

    # Too short to hold a box header: reading the size field would fail.
    if len(buf) < 8:
        return bytes(buf)

    # A 32-bit size of 1 means a 64-bit size follows (16-byte header)
    raw_size = struct.unpack_from(">I", buf, 0)[0]
    hdr_size = 16 if raw_size == 1 else 8

    total = _walk_and_rewrite(buf, hdr_size, len(buf), delta)
    logger.info("[mp4_parser] Rewrote %d chunk offset entries (delta=%+d)", total, delta)

    return bytes(buf)
|
||||
@@ -0,0 +1,608 @@
|
||||
"""
|
||||
Universal PyAV-based streaming demuxer.
|
||||
|
||||
Bridges async byte streams to PyAV's synchronous I/O using an OS pipe,
|
||||
allowing on-the-fly demuxing of any container format (MKV, MP4, TS,
|
||||
FLV, WebM, etc.) from an async source.
|
||||
|
||||
Architecture:
|
||||
AsyncIterator[bytes] --> async feeder task --> queue.Queue --> writer thread (pipe)
|
||||
|
|
||||
OS pipe (kernel buffer)
|
||||
|
|
||||
demux thread: av.open + discover + demux
|
||||
|
|
||||
queue.Queue --> run_in_executor consumer
|
||||
|
||||
Performance: Uses plain threading.Queue on both sides (writer input and
|
||||
packet output) to avoid per-item ``run_coroutine_threadsafe`` overhead.
|
||||
The async/thread bridge is done via ``run_in_executor`` on the consumer
|
||||
side and a dedicated asyncio task on the producer side.
|
||||
|
||||
For MP4 inputs, the caller (transcode_handler) prepends the moov atom
|
||||
to the stream so PyAV receives a "faststart"-style MP4 through the pipe.
|
||||
This allows true on-the-fly demuxing for all container formats.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import threading
|
||||
from collections.abc import AsyncIterator
|
||||
from dataclasses import dataclass
|
||||
|
||||
import av
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Sentinel object to signal end-of-stream in queues
|
||||
_SENTINEL = object()
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class DemuxedStream:
|
||||
"""Metadata about a demuxed stream."""
|
||||
|
||||
index: int
|
||||
codec_name: str
|
||||
codec_type: str # "video" or "audio"
|
||||
# Video-specific
|
||||
width: int = 0
|
||||
height: int = 0
|
||||
fps: float = 0.0
|
||||
pixel_format: str = ""
|
||||
# Audio-specific
|
||||
sample_rate: int = 0
|
||||
channels: int = 0
|
||||
# Timing
|
||||
time_base_num: int = 1
|
||||
time_base_den: int = 1000
|
||||
duration_seconds: float = 0.0
|
||||
# Raw codec extradata (e.g. SPS/PPS for H.264, AudioSpecificConfig for AAC)
|
||||
extradata: bytes = b""
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class DemuxedPacket:
|
||||
"""A demuxed packet with timing info."""
|
||||
|
||||
stream_index: int
|
||||
codec_type: str # "video" or "audio"
|
||||
data: bytes
|
||||
pts: int # Presentation timestamp in stream time_base units
|
||||
dts: int # Decode timestamp in stream time_base units
|
||||
duration: int # Duration in stream time_base units
|
||||
is_keyframe: bool
|
||||
time_base_num: int
|
||||
time_base_den: int
|
||||
# Optional decoded frame when decode_video/decode_audio is True
|
||||
# av.VideoFrame for video, av.AudioFrame for audio
|
||||
decoded_frame: object = None
|
||||
|
||||
@property
|
||||
def pts_seconds(self) -> float:
|
||||
if self.time_base_den == 0:
|
||||
return 0.0
|
||||
return self.pts * self.time_base_num / self.time_base_den
|
||||
|
||||
@property
|
||||
def dts_seconds(self) -> float:
|
||||
if self.time_base_den == 0:
|
||||
return 0.0
|
||||
return self.dts * self.time_base_num / self.time_base_den
|
||||
|
||||
@property
|
||||
def duration_seconds(self) -> float:
|
||||
if self.time_base_den == 0:
|
||||
return 0.0
|
||||
return self.duration * self.time_base_num / self.time_base_den
|
||||
|
||||
|
||||
class PyAVDemuxer:
|
||||
"""
|
||||
Streaming demuxer using PyAV with pipe-based I/O.
|
||||
|
||||
All container I/O happens in background threads. The writer thread
|
||||
feeds source bytes into a pipe; a single demux thread opens the
|
||||
container, discovers streams, and demuxes packets -- all on the
|
||||
same file object, ensuring the pipe's read cursor is never lost.
|
||||
|
||||
Performance optimisation: both the writer-input side and the
|
||||
packet-output side use plain ``queue.Queue`` (no event-loop
|
||||
involvement per item). The async/thread bridge is done via
|
||||
``run_in_executor`` on the consumer and an asyncio task on the
|
||||
producer, eliminating ~1700 ``run_coroutine_threadsafe`` round-trips
|
||||
per 30 s of 4K content.
|
||||
|
||||
Usage:
|
||||
demuxer = PyAVDemuxer()
|
||||
await demuxer.start(source_async_iter)
|
||||
# demuxer.video_stream / audio_stream are now available
|
||||
async for packet in demuxer.iter_packets():
|
||||
if packet.codec_type == "video":
|
||||
...
|
||||
"""
|
||||
|
||||
def __init__(self, decode_video: bool = False, decode_audio: bool = False) -> None:
|
||||
"""
|
||||
Args:
|
||||
decode_video: If True, the demux thread will decode video packets
|
||||
using the container's codec context and attach decoded frames
|
||||
to DemuxedPacket.decoded_frame. This avoids format conversion
|
||||
issues with standalone decoders (HVCC vs Annex B).
|
||||
decode_audio: If True, the demux thread will decode audio packets
|
||||
using the container's codec context and attach decoded frames
|
||||
to DemuxedPacket.decoded_frame. This is needed for codecs like
|
||||
Vorbis/Opus where the standalone decoder requires codec headers
|
||||
that are only available in the container context. Can also be
|
||||
set after start() returns (before packets are consumed) via
|
||||
the ``enable_audio_decode()`` method.
|
||||
"""
|
||||
self._decode_video = decode_video
|
||||
self._decode_audio = decode_audio
|
||||
self._video_decode_decided = threading.Event()
|
||||
self._audio_decode_decided = threading.Event()
|
||||
# If decode flags were set at construction time, mark decided immediately
|
||||
if decode_video:
|
||||
self._video_decode_decided.set()
|
||||
if decode_audio:
|
||||
self._audio_decode_decided.set()
|
||||
self._container: av.InputContainer | None = None
|
||||
self._video_stream: DemuxedStream | None = None
|
||||
self._audio_stream: DemuxedStream | None = None
|
||||
# Thread-safe queues (no event-loop involvement per put/get)
|
||||
self._packet_queue: queue.Queue | None = None
|
||||
self._source_queue: queue.Queue | None = None
|
||||
self._demux_thread: threading.Thread | None = None
|
||||
self._writer_thread: threading.Thread | None = None
|
||||
self._feeder_task: asyncio.Task | None = None
|
||||
self._write_fd: int | None = None
|
||||
self._read_fd: int | None = None
|
||||
|
||||
@property
|
||||
def video_stream(self) -> DemuxedStream | None:
|
||||
return self._video_stream
|
||||
|
||||
@property
|
||||
def audio_stream(self) -> DemuxedStream | None:
|
||||
return self._audio_stream
|
||||
|
||||
def enable_video_decode(self, enable: bool = True) -> None:
|
||||
"""
|
||||
Enable or disable in-thread video decoding.
|
||||
|
||||
Call this after ``start()`` returns (stream metadata is available)
|
||||
but before consuming packets via ``iter_packets()``. The demux
|
||||
thread waits for this signal before processing video packets.
|
||||
"""
|
||||
self._decode_video = enable
|
||||
self._video_decode_decided.set()
|
||||
|
||||
def enable_audio_decode(self, enable: bool = True) -> None:
|
||||
"""
|
||||
Enable or disable in-thread audio decoding.
|
||||
|
||||
Call this after ``start()`` returns (stream metadata is available)
|
||||
but before consuming packets via ``iter_packets()``. The demux
|
||||
thread waits for this signal before processing audio packets.
|
||||
"""
|
||||
self._decode_audio = enable
|
||||
self._audio_decode_decided.set()
|
||||
|
||||
# ── Writer side ──────────────────────────────────────────────────
|
||||
|
||||
async def _async_feeder(self, source: AsyncIterator[bytes]) -> None:
    """
    Async task: pull chunks from the async source and push them into the
    plain ``queue.Queue`` read by the writer thread.

    ``queue.Queue.put()`` is a blocking call, so each chunk is handed over
    through ``run_in_executor`` to avoid blocking the event loop when the
    queue is full.

    The end-of-stream sentinel is always queued, whether the source ends
    normally, the task is cancelled, or the source raises. A source
    failure is logged instead of being dropped silently; it is not
    re-raised, so the data queued so far is still delivered.
    """
    loop = asyncio.get_running_loop()
    sq = self._source_queue
    try:
        async for chunk in source:
            await loop.run_in_executor(None, sq.put, chunk)
    except (asyncio.CancelledError, GeneratorExit):
        # Shutdown path: nothing to report, just fall through to the sentinel.
        pass
    except Exception:
        # Best effort is kept (the stream simply ends early), but the
        # cause is now visible in the logs.
        logger.warning("[pyav_demuxer] Source stream failed; ending input early", exc_info=True)
    finally:
        # NOTE(review): this is a blocking put executed in the coroutine
        # itself; it waits for a free slot if the queue is full.
        sq.put(_SENTINEL)
|
||||
|
||||
def _write_chunks_sync(self) -> None:
    """
    Writer thread: pull chunks from ``_source_queue`` and write them to
    the write end of the OS pipe. No event-loop interaction.

    The loop ends when the sentinel is received, when no chunk arrives
    within 30 seconds, or when writing fails. In every case the write end
    of the pipe is closed and ``_write_fd`` is reset to ``None``. Errors
    are logged at debug level and never propagate out of the thread.
    """
    write_fd = self._write_fd
    sq = self._source_queue
    try:
        while True:
            chunk = sq.get(timeout=30.0)
            if chunk is _SENTINEL:
                break
            # os.write() returns the number of bytes accepted, which may be
            # fewer than offered; keep writing until the chunk is flushed.
            remaining = memoryview(chunk)
            while remaining:
                written = os.write(write_fd, remaining)
                remaining = remaining[written:]
    except queue.Empty:
        logger.debug("[pyav_demuxer] Writer timed out waiting for source data")
    except OSError as exc:
        # Includes BrokenPipeError, raised when the read end is closed.
        logger.debug("[pyav_demuxer] Pipe write stopped: %s", exc)
    except Exception:
        logger.debug("[pyav_demuxer] Writer thread error", exc_info=True)
    finally:
        try:
            os.close(write_fd)
        except OSError:
            pass
        self._write_fd = None
|
||||
|
||||
# ── Demux side ───────────────────────────────────────────────────
|
||||
|
||||
async def start(self, source: AsyncIterator[bytes]) -> None:
    """
    Wire up the pipe, the feeder task and both worker threads, then wait
    until stream discovery has finished.

    Data path: ``source`` -> feeder task -> ``_source_queue`` -> writer
    thread -> OS pipe -> demux thread -> ``_packet_queue``.

    When this returns, ``video_stream`` / ``audio_stream`` hold whatever
    the demux thread discovered (both stay ``None`` if opening failed) and
    packets are being queued for ``iter_packets()``.
    """
    loop = asyncio.get_running_loop()

    # OS pipe connecting the writer thread to the demux thread
    self._read_fd, self._write_fd = os.pipe()

    # async feeder task -> writer thread
    self._source_queue = queue.Queue(maxsize=256)
    self._feeder_task = asyncio.create_task(self._async_feeder(source))

    # Writer thread drains _source_queue into the pipe
    self._writer_thread = threading.Thread(
        target=self._write_chunks_sync,
        daemon=True,
        name="pyav-writer",
    )
    self._writer_thread.start()

    # demux thread -> async consumer
    self._packet_queue = queue.Queue(maxsize=128)
    streams_ready = threading.Event()

    def _time_base(stream_obj):
        """Return (numerator, denominator) of a stream's time base, or (1, 1)."""
        if stream_obj:
            return stream_obj.time_base.numerator, stream_obj.time_base.denominator
        return 1, 1

    def _frame_packet(frame, kind, stream_index, duration, tb_num, tb_den):
        """Wrap a decoded frame; pts doubles as dts and no raw data is kept."""
        ts = int(frame.pts) if frame.pts is not None else 0
        return DemuxedPacket(
            stream_index=stream_index,
            codec_type=kind,
            data=b"",
            pts=ts,
            dts=ts,
            duration=duration,
            is_keyframe=frame.key_frame if kind == "video" else False,
            time_base_num=tb_num,
            time_base_den=tb_den,
            decoded_frame=frame,
        )

    def _open_and_demux():
        """
        Body of the single demux thread: open the container, discover
        streams, then demux (and optionally decode) every packet.

        av.open(), _discover_streams() and container.demux() all use the
        same file object in this one thread, so the pipe's read position
        is never lost between opening and demuxing.
        """
        pkt_count = 0
        pq = self._packet_queue
        try:
            # Wrap the read end of the pipe; the file object now owns the fd
            read_file = os.fdopen(self._read_fd, "rb")
            self._read_fd = None

            self._container = av.open(
                read_file,
                mode="r",
                options={
                    # Tolerate mid-stream joins / broken data in live TS
                    "err_detect": "ignore_err",
                    "fflags": "+discardcorrupt+genpts",
                },
            )
            self._discover_streams()

            # Stream metadata is available: let start() return
            streams_ready.set()

            if self._video_stream is None and self._audio_stream is None:
                logger.warning("[pyav_demuxer] No video or audio streams found")
                return

            # Streams handed to container.demux()
            streams_to_demux = [
                self._container.streams[meta.index]
                for meta in (self._video_stream, self._audio_stream)
                if meta is not None
            ]

            # Give the caller up to 10 s per flag to decide on in-thread
            # decoding (returns immediately if already decided).
            for decided in (self._video_decode_decided, self._audio_decode_decided):
                if not decided.is_set():
                    decided.wait(timeout=10.0)

            # Cache stream objects and time bases for the hot loop
            video_stream_obj = (
                self._container.streams[self._video_stream.index] if self._video_stream is not None else None
            )
            audio_stream_obj = (
                self._container.streams[self._audio_stream.index] if self._audio_stream is not None else None
            )
            video_tb_num, video_tb_den = _time_base(video_stream_obj)
            audio_tb_num, audio_tb_den = _time_base(audio_stream_obj)

            decode_video = self._decode_video
            decode_audio = self._decode_audio

            # Demux and enqueue -- plain queue.put(), no event loop
            for packet in self._container.demux(*streams_to_demux):
                if packet.size == 0:
                    continue

                stream = self._container.streams[packet.stream_index]
                is_video = stream.type == "video"
                is_audio = stream.type == "audio"
                pkt_duration = int(packet.duration) if packet.duration is not None else 0

                if decode_video and is_video and video_stream_obj is not None:
                    # In-thread video decode; a failing packet yields no frames
                    try:
                        frames = video_stream_obj.codec_context.decode(packet)
                    except Exception:
                        frames = []
                    for frame in frames:
                        pq.put(
                            _frame_packet(
                                frame, "video", packet.stream_index, pkt_duration, video_tb_num, video_tb_den
                            )
                        )
                        pkt_count += 1

                elif decode_audio and is_audio and audio_stream_obj is not None:
                    # In-thread audio decode; a failing packet yields no frames
                    try:
                        frames = audio_stream_obj.codec_context.decode(packet)
                    except Exception:
                        frames = []
                    for frame in frames:
                        pq.put(
                            _frame_packet(
                                frame, "audio", packet.stream_index, pkt_duration, audio_tb_num, audio_tb_den
                            )
                        )
                        pkt_count += 1

                else:
                    # Pass the compressed packet through untouched
                    pq.put(
                        DemuxedPacket(
                            stream_index=packet.stream_index,
                            codec_type=stream.type,
                            data=bytes(packet),
                            pts=int(packet.pts) if packet.pts is not None else 0,
                            dts=int(packet.dts) if packet.dts is not None else 0,
                            duration=pkt_duration,
                            is_keyframe=packet.is_keyframe,
                            time_base_num=video_tb_num if is_video else audio_tb_num,
                            time_base_den=video_tb_den if is_video else audio_tb_den,
                        )
                    )
                    pkt_count += 1

            # Drain frames still buffered in the video decoder
            if decode_video and video_stream_obj is not None:
                try:
                    for frame in video_stream_obj.codec_context.decode(None):
                        pq.put(
                            _frame_packet(frame, "video", video_stream_obj.index, 0, video_tb_num, video_tb_den)
                        )
                        pkt_count += 1
                except Exception:
                    pass

            # Drain frames still buffered in the audio decoder
            if decode_audio and audio_stream_obj is not None:
                try:
                    for frame in audio_stream_obj.codec_context.decode(None):
                        pq.put(
                            _frame_packet(frame, "audio", audio_stream_obj.index, 0, audio_tb_num, audio_tb_den)
                        )
                        pkt_count += 1
                except Exception:
                    pass

            logger.info("[pyav_demuxer] Demux complete: %d packets", pkt_count)

        except Exception as e:
            if "Invalid data" not in str(e):
                logger.debug("[pyav_demuxer] Demux thread error: %s", e)
            # start() must be released even when opening failed
            streams_ready.set()
        finally:
            pq.put(_SENTINEL)

    self._demux_thread = threading.Thread(target=_open_and_demux, daemon=True, name="pyav-demux")
    self._demux_thread.start()

    # Block (off the event loop) until stream discovery has finished
    await loop.run_in_executor(None, streams_ready.wait)
|
||||
|
||||
async def iter_packets(self) -> AsyncIterator[DemuxedPacket]:
    """
    Yield demuxed packets produced by the background demux thread.

    Each packet is fetched with the blocking ``queue.get()`` executed via
    ``run_in_executor``, avoiding per-packet ``run_coroutine_threadsafe``
    overhead. Iteration stops when the sentinel object is read from the
    queue. ``_cleanup()`` always runs when the generator finishes, is
    closed, or is cancelled.

    ``start()`` must be called first.

    Raises:
        RuntimeError: If the packet queue has not been created yet.
        asyncio.CancelledError: Re-raised (after cleanup) when the consuming
            task is cancelled, so cancellation keeps propagating instead of
            looking like a normal end of stream.
    """
    if self._packet_queue is None:
        raise RuntimeError("Call start() before iter_packets()")

    loop = asyncio.get_running_loop()
    pq = self._packet_queue

    try:
        while True:
            packet = await loop.run_in_executor(None, pq.get)
            if packet is _SENTINEL:
                break
            yield packet

        # Sentinel seen: the demux thread is finishing, give it a moment.
        if self._demux_thread is not None:
            self._demux_thread.join(timeout=5.0)

    except GeneratorExit:
        logger.debug("[pyav_demuxer] Generator closed")
        raise
    except asyncio.CancelledError:
        # Swallowing CancelledError would make the consumer's ``async for``
        # end normally and let a cancelled task keep running.
        logger.debug("[pyav_demuxer] Cancelled")
        raise
    finally:
        self._cleanup()
|
||||
|
||||
def _discover_streams(self) -> None:
    """Record metadata for the first video and the first audio stream.

    Does nothing when no container is open. Streams of other types, and
    any further video/audio streams after the first of each kind, are
    ignored.
    """
    container = self._container
    if container is None:
        return

    for stream in container.streams:
        take_video = stream.type == "video" and self._video_stream is None
        take_audio = not take_video and stream.type == "audio" and self._audio_stream is None
        if not (take_video or take_audio):
            continue

        ctx = stream.codec_context
        # Fields that are filled in the same way for both stream kinds.
        shared = {
            "index": stream.index,
            "codec_name": ctx.name if ctx else stream.codec.name,
            "time_base_num": stream.time_base.numerator,
            "time_base_den": stream.time_base.denominator,
            "duration_seconds": float(stream.duration * stream.time_base) if stream.duration else 0.0,
            "extradata": bytes(ctx.extradata) if ctx and ctx.extradata else b"",
        }

        if take_video:
            # Fall back to 24 fps when the container reports no average rate.
            frame_rate = float(stream.average_rate) if stream.average_rate else 24.0
            video = DemuxedStream(
                codec_type="video",
                width=ctx.width if ctx else 0,
                height=ctx.height if ctx else 0,
                fps=frame_rate,
                pixel_format=str(ctx.pix_fmt) if ctx and ctx.pix_fmt else "yuv420p",
                **shared,
            )
            self._video_stream = video
            logger.info(
                "[pyav_demuxer] Video: %s %dx%d @%.1ffps",
                video.codec_name,
                video.width,
                video.height,
                video.fps,
            )
        else:
            audio = DemuxedStream(
                codec_type="audio",
                sample_rate=ctx.sample_rate if ctx else 0,
                channels=ctx.channels if ctx else 0,
                **shared,
            )
            self._audio_stream = audio
            logger.info(
                "[pyav_demuxer] Audio: %s %dHz %dch",
                audio.codec_name,
                audio.sample_rate,
                audio.channels,
            )
|
||||
|
||||
def _cleanup(self) -> None:
    """Stop threads and release all resources safely.

    The order is critical to avoid SIGSEGV from closing the container
    while the demux thread is still calling container.demux():

    1. Cancel the feeder task (stops new bytes being queued).
    2. Put a sentinel into the source queue so the writer thread
       unblocks and exits. The writer's ``finally`` closes the pipe
       write-end, which causes the demux thread to see EOF.
    3. Join the writer thread (wait for it to drain and exit).
    4. Join the demux thread (it finishes after pipe EOF).
    5. ONLY THEN close the container (no thread is using it). If the
       demux thread did not exit within the join timeout, the container
       is deliberately left open: leaking it is preferable to closing it
       underneath a thread that may still be inside it.
    6. Close any remaining pipe FDs (read end, if still open).
    """
    # 1. Cancel feeder task
    if self._feeder_task is not None:
        self._feeder_task.cancel()
        self._feeder_task = None

    # 2. Unblock writer thread so it exits and closes the pipe
    if self._source_queue is not None:
        try:
            self._source_queue.put_nowait(_SENTINEL)
        except Exception as e:
            # Best effort (e.g. the queue is full); the join below is bounded.
            logger.debug("[pyav_demuxer] Could not queue writer sentinel: %s", e)

    # 3. Join writer thread (it closes _write_fd in its finally block)
    if self._writer_thread is not None:
        self._writer_thread.join(timeout=5.0)
        if self._writer_thread.is_alive():
            logger.warning("[pyav_demuxer] Writer thread did not exit within 5s")
        self._writer_thread = None

    # 4. Join demux thread -- must finish before we close the container
    demux_still_running = False
    if self._demux_thread is not None:
        self._demux_thread.join(timeout=5.0)
        # join() with a timeout returns silently on expiry, so check explicitly.
        demux_still_running = self._demux_thread.is_alive()
        if not demux_still_running:
            self._demux_thread = None

    # 5. Close the container only when no thread can still be using it
    if demux_still_running:
        # Keep the thread and container references so a later _cleanup()
        # call can finish the job once the thread has exited.
        logger.warning(
            "[pyav_demuxer] Demux thread still running after 5s; leaving container open"
        )
    elif self._container is not None:
        try:
            self._container.close()
        except Exception as e:
            logger.debug("[pyav_demuxer] Container close error: %s", e)
        self._container = None

    # 6. Close any remaining pipe FDs
    for fd_name in ("_read_fd", "_write_fd"):
        fd = getattr(self, fd_name, None)
        if fd is not None:
            try:
                os.close(fd)
            except OSError:
                # Already closed elsewhere (e.g. by the writer thread).
                pass
            setattr(self, fd_name, None)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,403 @@
|
||||
"""
|
||||
GPU-accelerated video transcoder with runtime detection.
|
||||
|
||||
Detects available hardware encoders/decoders at first use and selects
|
||||
the best available backend:
|
||||
- NVIDIA: h264_nvenc / hevc_cuvid (NVENC + CUDA)
|
||||
- Apple macOS: h264_videotoolbox / hevc_videotoolbox
|
||||
- Intel Linux: h264_vaapi / h264_qsv
|
||||
- Fallback: libx264 (CPU)
|
||||
|
||||
The transcoder operates at the packet/frame level via PyAV, suitable
|
||||
for integration into the streaming pipeline.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from fractions import Fraction
|
||||
|
||||
import av
|
||||
|
||||
from mediaflow_proxy.configs import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HWAccelType(Enum):
    """Hardware acceleration backends that capability detection can select."""

    NONE = "none"  # no hardware backend selected (CPU encoding)
    NVIDIA = "nvidia"  # selected when the h264_nvenc encoder is available
    VIDEOTOOLBOX = "videotoolbox"  # selected when h264_videotoolbox is available
    VAAPI = "vaapi"  # selected when h264_vaapi is available
    QSV = "qsv"  # selected when h264_qsv is available
|
||||
|
||||
|
||||
@dataclass
class HWCapability:
    """Detected hardware acceleration capability."""

    # Backend chosen by detection; NONE means CPU fallback.
    accel_type: HWAccelType = HWAccelType.NONE
    # Name of the H.264 encoder belonging to the chosen backend.
    h264_encoder: str = "libx264"
    h264_decoder: str | None = None  # None = use default software decoder
    # Hardware HEVC decoder name, or None when none was detected.
    hevc_decoder: str | None = None
    # Hardware encoder names that were successfully probed.
    available_encoders: list[str] = field(default_factory=list)
    # Hardware decoder names that were successfully probed.
    available_decoders: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
# Module-level singleton -- populated on first call to get_hw_capability()
# and returned unchanged by every later call.
_hw_capability: HWCapability | None = None
|
||||
|
||||
|
||||
def _probe_codec(name: str, mode: str = "w") -> bool:
    """
    Report whether PyAV can look up a codec with the given name.

    Args:
        name: Codec name (e.g. 'h264_videotoolbox').
        mode: 'w' for encoder, 'r' for decoder.

    Returns:
        True when ``av.Codec(name, mode)`` succeeds, False when it raises.
    """
    try:
        av.Codec(name, mode)
    except Exception:
        return False
    else:
        return True
|
||||
|
||||
|
||||
def _detect_hw_capability() -> HWCapability:
    """
    Probe which hardware encoders/decoders PyAV exposes and pick a backend.

    Backends are tried in priority order: NVIDIA NVENC, Apple VideoToolbox,
    Intel VAAPI, Intel QSV. The first one whose H.264 encoder was probed
    successfully wins; otherwise the CPU encoder libx264 is used.
    """
    encoder_candidates = (
        "h264_nvenc",
        "hevc_nvenc",
        "h264_videotoolbox",
        "hevc_videotoolbox",
        "h264_vaapi",
        "hevc_vaapi",
        "h264_qsv",
        "hevc_qsv",
    )
    decoder_candidates = (
        "h264_cuvid",
        "hevc_cuvid",
        "h264_qsv",
        "hevc_qsv",
    )

    detected = HWCapability()
    # The full lists are kept on the result so callers can log them.
    detected.available_encoders = [name for name in encoder_candidates if _probe_codec(name, "w")]
    detected.available_decoders = [name for name in decoder_candidates if _probe_codec(name, "r")]

    # (H.264 encoder, backend, H.264 decoder, HEVC decoder) in priority order.
    # A decoder of None means the backend gets no explicit hardware decoder
    # (VideoToolbox decoders are used automatically via hwaccel).
    backends = (
        ("h264_nvenc", HWAccelType.NVIDIA, "h264_cuvid", "hevc_cuvid"),
        ("h264_videotoolbox", HWAccelType.VIDEOTOOLBOX, None, None),
        ("h264_vaapi", HWAccelType.VAAPI, None, None),
        ("h264_qsv", HWAccelType.QSV, "h264_qsv", "hevc_qsv"),
    )

    for encoder, accel, h264_dec, hevc_dec in backends:
        if encoder not in detected.available_encoders:
            continue
        detected.accel_type = accel
        detected.h264_encoder = encoder
        if h264_dec in detected.available_decoders:
            detected.h264_decoder = h264_dec
        if hevc_dec in detected.available_decoders:
            detected.hevc_decoder = hevc_dec
        return detected

    # Fallback: CPU
    detected.accel_type = HWAccelType.NONE
    detected.h264_encoder = "libx264"
    return detected
|
||||
|
||||
|
||||
def get_hw_capability() -> HWCapability:
    """Get the detected hardware acceleration capability (cached singleton).

    Detection runs once, on the first call, and its outcome is logged. Later
    calls return the same ``HWCapability`` object without probing again.
    """
    global _hw_capability
    if _hw_capability is None:
        _hw_capability = _detect_hw_capability()
        if settings.transcode_prefer_gpu and _hw_capability.accel_type != HWAccelType.NONE:
            logger.info(
                "[video_transcoder] GPU acceleration: %s (encoder=%s, decoders=%s)",
                _hw_capability.accel_type.value,
                _hw_capability.h264_encoder,
                _hw_capability.available_decoders or "software",
            )
        else:
            # This branch is also taken when hardware exists but GPU use is
            # disabled in settings; h264_encoder then names the hardware
            # encoder, while VideoTranscoder encodes with libx264.
            logger.info(
                "[video_transcoder] Using CPU encoder: %s (available HW: encoders=%s, decoders=%s)",
                "libx264",
                _hw_capability.available_encoders or "none",
                _hw_capability.available_decoders or "none",
            )
    return _hw_capability
|
||||
|
||||
|
||||
class VideoTranscoder:
    """
    In-process video transcoder using PyAV.

    Decodes input video packets and re-encodes to H.264 using the best
    available hardware encoder (or CPU libx264 fallback).

    Operates at the frame level: caller provides raw video packets (from
    PyAV demuxer), transcoder returns encoded H.264 NAL data suitable
    for the fMP4 muxer.
    """

    def __init__(
        self,
        input_codec_name: str,
        width: int,
        height: int,
        fps: float = 24.0,
        pixel_format: str = "yuv420p",
        force_software: bool = False,
    ) -> None:
        """
        Create the decoder and open the H.264 encoder.

        Args:
            input_codec_name: Codec name of the incoming video stream.
            width: Source width; rounded up to an even number for encoding.
            height: Source height; rounded up to an even number for encoding.
            fps: Frame rate used for the encoder time base, frame rate and
                GOP size. Non-positive values fall back to 24.0.
            pixel_format: Accepted for interface compatibility; not read here
                (frames are converted to the encoder format when encoded).
            force_software: Use libx264 and the default decoder even when a
                hardware backend was detected.
        """
        # Safe defaults first, so close()/__del__ work even if one of the
        # codec calls below raises and leaves the object half-built.
        self._decoder = None
        self._encoder = None
        self._decoder_used = False
        self._flushed = True  # nothing to flush until the encoder is open

        hw = get_hw_capability()
        use_gpu = settings.transcode_prefer_gpu and hw.accel_type != HWAccelType.NONE and not force_software

        if not fps or fps <= 0:
            # A zero/negative rate would make the time-base denominator 0.
            logger.warning("[video_transcoder] Invalid fps %r, falling back to 24.0", fps)
            fps = 24.0

        # --- Decoder ---
        hw_decoder = None
        if use_gpu:
            if "hevc" in input_codec_name or "h265" in input_codec_name:
                hw_decoder = hw.hevc_decoder
            else:
                hw_decoder = hw.h264_decoder

        decoder_name = hw_decoder or input_codec_name
        self._decoder = av.CodecContext.create(decoder_name, "r")

        # --- Encoder ---
        encoder_name = hw.h264_encoder if use_gpu else "libx264"

        # H.264 requires even dimensions
        enc_width = width if width % 2 == 0 else width + 1
        enc_height = height if height % 2 == 0 else height + 1

        # Clamp so a very small fps cannot produce a zero denominator.
        fps_millis = max(1, int(fps * 1000))

        self._encoder = av.CodecContext.create(encoder_name, "w")
        self._encoder.width = enc_width
        self._encoder.height = enc_height
        self._encoder.pix_fmt = "yuv420p"  # H.264 requires yuv420p
        self._encoder.time_base = Fraction(1, fps_millis)
        self._encoder.framerate = Fraction(fps_millis, 1000)
        self._encoder.bit_rate = _parse_bitrate(settings.transcode_video_bitrate)
        self._encoder.gop_size = int(fps * 2)  # Keyframe every ~2 seconds

        # Encoder options based on backend
        opts = {}
        if encoder_name == "libx264":
            opts["preset"] = settings.transcode_video_preset
            opts["tune"] = "zerolatency"
            opts["profile"] = "high"
        elif "nvenc" in encoder_name:
            opts["preset"] = "p4"  # NVENC preset (p1=fastest .. p7=slowest)
            opts["tune"] = "ll"  # Low latency
            opts["rc"] = "vbr"
        elif "videotoolbox" in encoder_name:
            opts["realtime"] = "1"
            opts["allow_sw"] = "1"  # Fallback to software if HW busy
        elif "vaapi" in encoder_name:
            opts["rc_mode"] = "VBR"
        elif "qsv" in encoder_name:
            opts["preset"] = "medium"

        self._encoder.options = opts
        self._encoder.open()
        # Encoder is live now, so a flush is meaningful from here on.
        self._flushed = False  # Prevents double-flush which causes SIGSEGV

        self._input_codec = input_codec_name
        self._encoder_name = encoder_name
        self._frames_decoded = 0
        self._frames_encoded = 0
        self._width = enc_width
        self._height = enc_height

        logger.info(
            "[video_transcoder] Initialized: %s -> %s (%s), %dx%d @%.1ffps %dk",
            input_codec_name,
            encoder_name,
            # Report the backend actually in use, not merely the detected one.
            hw.accel_type.value if use_gpu else HWAccelType.NONE.value,
            enc_width,
            enc_height,
            fps,
            self._encoder.bit_rate // 1000 if self._encoder.bit_rate else 0,
        )

    @property
    def codec_private_data(self) -> bytes | None:
        """H.264 extradata (SPS/PPS) from the encoder, for the fMP4 init segment."""
        if self._encoder.extradata:
            return bytes(self._encoder.extradata)
        return None

    @property
    def width(self) -> int:
        """Encoded frame width (source width rounded up to even)."""
        return self._width

    @property
    def height(self) -> int:
        """Encoded frame height (source height rounded up to even)."""
        return self._height

    def _to_encoder_format(self, frame: av.VideoFrame) -> av.VideoFrame:
        """Return *frame* converted to the encoder's pixel format if needed."""
        if frame.format.name != self._encoder.pix_fmt:
            return frame.reformat(format=self._encoder.pix_fmt)
        return frame

    def _encode_into(self, frame: av.VideoFrame | None, output: list[tuple[bytes, bool, int, int]]) -> None:
        """Encode *frame* (``None`` drains the encoder) and append results to *output*.

        Appending in place keeps packets produced before an exception.
        """
        for packet in self._encoder.encode(frame):
            self._frames_encoded += 1
            output.append(
                (
                    bytes(packet),
                    packet.is_keyframe,
                    int(packet.pts) if packet.pts is not None else 0,
                    int(packet.dts) if packet.dts is not None else 0,
                )
            )

    def transcode_frame(self, frame: av.VideoFrame) -> list[tuple[bytes, bool, int, int]]:
        """
        Encode a decoded video frame to H.264.

        Args:
            frame: A decoded av.VideoFrame.

        Returns:
            List of (nal_data, is_keyframe, pts, dts) tuples. May be empty
            when the encoder buffers the frame or reports invalid data.
        """
        self._frames_decoded += 1
        output = []

        # Ensure correct pixel format for encoder
        frame = self._to_encoder_format(frame)

        try:
            self._encode_into(frame, output)
        except av.error.InvalidDataError as e:
            logger.debug("[video_transcoder] Encode error: %s", e)

        return output

    def decode_packet(self, packet: av.Packet) -> list[av.VideoFrame]:
        """Decode a video packet into frames (empty list on invalid data)."""
        self._decoder_used = True
        try:
            return list(self._decoder.decode(packet))
        except av.error.InvalidDataError as e:
            logger.debug("[video_transcoder] Decode error: %s", e)
            return []

    def flush(self) -> list[tuple[bytes, bool, int, int]]:
        """
        Flush encoder (and decoder, if it was used) buffers.

        When ``decode_video=True`` is used in PyAVDemuxer, the demux thread
        decodes frames using the container's codec context. In that case the
        standalone ``_decoder`` here is never fed any packets, so flushing
        it is skipped -- avoiding a stall that added ~5 s on some backends.

        Safe to call multiple times -- subsequent calls return an empty list.
        Also returns an empty list when the encoder was never opened or has
        already been released.

        Returns:
            List of (nal_data, is_keyframe, pts, dts) tuples.
        """
        if self._flushed or self._encoder is None:
            return []
        self._flushed = True

        output = []

        # Flush decoder only if it was actually used (via decode_packet)
        if self._decoder_used:
            try:
                for frame in self._decoder.decode(None):
                    self._frames_decoded += 1
                    self._encode_into(self._to_encoder_format(frame), output)
            except Exception as e:
                logger.debug("[video_transcoder] Decoder flush error: %s", e)
        else:
            logger.debug("[video_transcoder] Skipping decoder flush (decoder not used)")

        # Flush encoder
        try:
            self._encode_into(None, output)
        except Exception as e:
            logger.debug("[video_transcoder] Encoder flush error: %s", e)

        logger.info(
            "[video_transcoder] Flushed: %d decoded, %d encoded total (decoder_used=%s)",
            self._frames_decoded,
            self._frames_encoded,
            self._decoder_used,
        )
        return output

    def close(self) -> None:
        """Release codec contexts.

        Flushes the encoder (if not already flushed) before releasing to avoid
        SIGSEGV when libx264 or hardware encoders have buffered frames at
        teardown time. Double-flushing is the most common cause of SIGSEGV
        in the transcode pipeline.

        PyAV codec contexts are released via garbage collection (no explicit
        close method), so we flush first to ensure native buffers are drained
        before the C-level codec is freed.
        """
        # flush() is idempotent -- safe to call even if already flushed
        self.flush()
        # Release references -- GC will free the native codec contexts
        self._encoder = None
        self._decoder = None

    def __del__(self) -> None:
        try:
            self.close()
        except Exception:
            # A finalizer must never raise; the object may be only partly
            # constructed or the interpreter may be shutting down.
            pass
|
||||
|
||||
|
||||
def _parse_bitrate(bitrate_str: str) -> int:
|
||||
"""Parse a bitrate string like '4M', '2000k', '5000000' to int bits/s."""
|
||||
s = bitrate_str.strip().lower()
|
||||
if s.endswith("m"):
|
||||
return int(float(s[:-1]) * 1_000_000)
|
||||
if s.endswith("k"):
|
||||
return int(float(s[:-1]) * 1_000)
|
||||
return int(s)
|
||||
Reference in New Issue
Block a user