mirror of
https://github.com/UrloMythus/UnHided.git
synced 2026-04-11 11:50:51 +00:00
update
This commit is contained in:
18
mediaflow_proxy/remuxer/__init__.py
Normal file
18
mediaflow_proxy/remuxer/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""
|
||||
Media remuxer package.
|
||||
|
||||
Provides pure Python implementations for media container parsing, remuxing,
|
||||
and transcoding:
|
||||
|
||||
- ebml_parser: Minimal EBML/MKV parser for seeking and demuxing
|
||||
- ts_muxer: fMP4 -> MPEG-TS remuxer
|
||||
- mkv_demuxer: Streaming MKV demuxer
|
||||
- mp4_muxer: MP4 box builder for standard moov-first MP4
|
||||
- audio_transcoder: PyAV-based audio frame transcoding
|
||||
- video_transcoder: GPU-accelerated video transcoding via PyAV
|
||||
- pyav_demuxer: Universal PyAV-based streaming demuxer (any container)
|
||||
- codec_utils: Codec compatibility detection and decision engine
|
||||
- media_source: Abstract MediaSource protocol (Telegram, HTTP, etc.)
|
||||
- transcode_handler: Shared transcode request orchestrator
|
||||
- transcode_pipeline: MKV fast-path and universal transcode pipelines
|
||||
"""
|
||||
BIN
mediaflow_proxy/remuxer/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
mediaflow_proxy/remuxer/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
mediaflow_proxy/remuxer/__pycache__/codec_utils.cpython-313.pyc
Normal file
BIN
mediaflow_proxy/remuxer/__pycache__/codec_utils.cpython-313.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
mediaflow_proxy/remuxer/__pycache__/ebml_parser.cpython-313.pyc
Normal file
BIN
mediaflow_proxy/remuxer/__pycache__/ebml_parser.cpython-313.pyc
Normal file
Binary file not shown.
BIN
mediaflow_proxy/remuxer/__pycache__/hls_manifest.cpython-313.pyc
Normal file
BIN
mediaflow_proxy/remuxer/__pycache__/hls_manifest.cpython-313.pyc
Normal file
Binary file not shown.
BIN
mediaflow_proxy/remuxer/__pycache__/media_source.cpython-313.pyc
Normal file
BIN
mediaflow_proxy/remuxer/__pycache__/media_source.cpython-313.pyc
Normal file
Binary file not shown.
BIN
mediaflow_proxy/remuxer/__pycache__/mkv_demuxer.cpython-313.pyc
Normal file
BIN
mediaflow_proxy/remuxer/__pycache__/mkv_demuxer.cpython-313.pyc
Normal file
Binary file not shown.
BIN
mediaflow_proxy/remuxer/__pycache__/mp4_muxer.cpython-313.pyc
Normal file
BIN
mediaflow_proxy/remuxer/__pycache__/mp4_muxer.cpython-313.pyc
Normal file
Binary file not shown.
BIN
mediaflow_proxy/remuxer/__pycache__/mp4_parser.cpython-313.pyc
Normal file
BIN
mediaflow_proxy/remuxer/__pycache__/mp4_parser.cpython-313.pyc
Normal file
Binary file not shown.
BIN
mediaflow_proxy/remuxer/__pycache__/pyav_demuxer.cpython-313.pyc
Normal file
BIN
mediaflow_proxy/remuxer/__pycache__/pyav_demuxer.cpython-313.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
mediaflow_proxy/remuxer/__pycache__/ts_muxer.cpython-313.pyc
Normal file
BIN
mediaflow_proxy/remuxer/__pycache__/ts_muxer.cpython-313.pyc
Normal file
Binary file not shown.
Binary file not shown.
351
mediaflow_proxy/remuxer/audio_transcoder.py
Normal file
351
mediaflow_proxy/remuxer/audio_transcoder.py
Normal file
@@ -0,0 +1,351 @@
|
||||
"""
|
||||
PyAV-based audio transcoder for frame-level codec conversion.
|
||||
|
||||
Transcodes audio frames between codecs using PyAV's CodecContext API
|
||||
(Python bindings for FFmpeg's libavcodec). This provides in-process
|
||||
audio transcoding without subprocess management or pipe overhead.
|
||||
|
||||
Supported input codecs: EAC3, AC3, AAC, Opus, Vorbis, FLAC, MP3
|
||||
Output codec: AAC-LC (stereo, configurable bitrate)
|
||||
|
||||
Architecture:
|
||||
raw_frame_bytes -> parse() -> decode() -> resample() -> encode() -> raw_aac_bytes
|
||||
|
||||
Usage:
|
||||
transcoder = AudioTranscoder("eac3", sample_rate=48000, channels=6)
|
||||
for raw_eac3_frame in frames:
|
||||
aac_frames = transcoder.transcode(raw_eac3_frame)
|
||||
for aac_data in aac_frames:
|
||||
write(aac_data)
|
||||
# Flush remaining frames
|
||||
for aac_data in transcoder.flush():
|
||||
write(aac_data)
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
import av
|
||||
from av.audio.resampler import AudioResampler
|
||||
|
||||
from mediaflow_proxy.remuxer.ebml_parser import (
|
||||
CODEC_ID_AAC,
|
||||
CODEC_ID_AC3,
|
||||
CODEC_ID_EAC3,
|
||||
CODEC_ID_FLAC,
|
||||
CODEC_ID_OPUS,
|
||||
CODEC_ID_VORBIS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _generate_silence_aac_frame() -> bytes | None:
    """Pre-encode a single silent AAC frame (48 kHz stereo, 1024 samples).

    PyAV's AAC encoder has an intermittent ``avcodec_send_frame`` bug when
    rapidly creating/destroying codec contexts, so we retry a few times.
    This function is called once at module load; the result is cached in
    ``_SILENCE_AAC_FRAME``.

    Returns:
        Raw AAC frame bytes, or ``None`` if all 10 attempts failed.
    """
    for _attempt in range(10):
        try:
            enc = av.CodecContext.create("aac", "w")
            enc.sample_rate = 48000
            enc.layout = "stereo"
            enc.format = av.AudioFormat("fltp")
            enc.bit_rate = 192000
            enc.open()

            # NOTE(review): assumes a freshly allocated AudioFrame is
            # zero-filled (i.e. silent) -- confirm against PyAV docs.
            frame = av.AudioFrame(
                format=enc.format.name,
                layout=enc.layout.name,
                samples=enc.frame_size or 1024,
            )
            frame.sample_rate = enc.sample_rate
            frame.pts = 0

            for pkt in enc.encode(frame):
                return bytes(pkt)
            # AAC priming delay: first encode buffered; flush to retrieve
            for pkt in enc.encode(None):
                return bytes(pkt)
        except Exception:
            # Transient PyAV failure (see docstring) -- retry with a
            # brand-new codec context.
            continue
    return None
|
||||
|
||||
|
||||
# Module-level silence frame -- generated once, reused by every transcoder.
# May be None when AAC encoding failed on every attempt at import time.
_SILENCE_AAC_FRAME: bytes | None = _generate_silence_aac_frame()

# Map MKV codec IDs to PyAV/FFmpeg codec names
_MKV_TO_FFMPEG_CODEC = {
    CODEC_ID_EAC3: "eac3",
    CODEC_ID_AC3: "ac3",
    CODEC_ID_AAC: "aac",
    CODEC_ID_OPUS: "opus",
    CODEC_ID_VORBIS: "vorbis",
    CODEC_ID_FLAC: "flac",
    "A_DTS": "dts",
    "A_MP3": "mp3",
    "A_MPEG/L3": "mp3",
}

# Codecs that need transcoding to AAC for browser playback.
# AAC itself is deliberately absent: it passes through untouched.
NEEDS_TRANSCODE = frozenset(
    {
        CODEC_ID_EAC3,
        CODEC_ID_AC3,
        CODEC_ID_OPUS,
        CODEC_ID_VORBIS,
        CODEC_ID_FLAC,
        "A_DTS",
        "A_MP3",
        "A_MPEG/L3",
    }
)

# Output AAC settings
_OUTPUT_CODEC = "aac"
_OUTPUT_SAMPLE_FORMAT = "fltp"  # AAC requires float planar
_OUTPUT_LAYOUT = "stereo"

# Map channel count -> FFmpeg layout name.
# Missing counts (e.g. 5, 7) fall back to "stereo" via .get() at the
# call site in AudioTranscoder.__init__.
_CHANNEL_LAYOUT_MAP = {
    1: "mono",
    2: "stereo",
    3: "2.1",
    4: "quad",
    6: "5.1",
    8: "7.1",
}
|
||||
|
||||
|
||||
def needs_transcode(codec_id: str) -> bool:
    """Tell whether browsers cannot play *codec_id* natively.

    True means the audio stream has to be converted to AAC before it can
    be played in an HTML5 <video>/<audio> element.
    """
    transcode_required = codec_id in NEEDS_TRANSCODE
    return transcode_required
|
||||
|
||||
|
||||
def get_ffmpeg_codec_name(mkv_codec_id: str) -> str | None:
    """Translate an MKV ``CodecID`` into the matching FFmpeg codec name.

    Returns ``None`` when the codec is not in the mapping table.
    """
    try:
        return _MKV_TO_FFMPEG_CODEC[mkv_codec_id]
    except KeyError:
        return None
|
||||
|
||||
|
||||
class AudioTranscoder:
    """
    In-process audio transcoder using PyAV's CodecContext API.

    Decodes raw audio frames from one codec and encodes them to AAC-LC
    stereo, suitable for MP4 container and browser playback. No container
    I/O or subprocess involved -- operates directly on raw frame bytes.

    The transcoder handles sample format conversion and resampling
    automatically via AudioResampler.
    """

    def __init__(
        self,
        input_codec: str,
        input_sample_rate: int = 48000,
        input_channels: int = 6,
        output_sample_rate: int = 48000,
        output_channels: int = 2,
        output_bitrate: int = 192000,
    ) -> None:
        """
        Initialize the transcoder.

        Args:
            input_codec: FFmpeg codec name (e.g., "eac3", "ac3", "aac").
            input_sample_rate: Input sample rate in Hz.
            input_channels: Input channel count.
            output_sample_rate: Output sample rate in Hz (default 48000).
            output_channels: Output channel count (default 2 = stereo).
                NOTE(review): currently only used for logging -- the
                encoder layout is fixed to stereo via _OUTPUT_LAYOUT.
            output_bitrate: Output bitrate in bits/s (default 192000).

        Raises:
            Whatever PyAV raises for unknown codec names or unsupported
            parameters (propagated from CodecContext.create/open).
        """
        # Set up decoder -- use layout to configure channel count
        # (PyAV's channels property is read-only; layout drives it)
        self._decoder = av.CodecContext.create(input_codec, "r")
        self._decoder.sample_rate = input_sample_rate
        input_layout = _CHANNEL_LAYOUT_MAP.get(input_channels, "stereo")
        self._decoder.layout = input_layout

        # Set up encoder
        self._encoder = av.CodecContext.create(_OUTPUT_CODEC, "w")
        self._encoder.sample_rate = output_sample_rate
        self._encoder.layout = _OUTPUT_LAYOUT
        self._encoder.format = av.AudioFormat(_OUTPUT_SAMPLE_FORMAT)
        self._encoder.bit_rate = output_bitrate
        self._encoder.open()

        # Set up resampler for format/rate/channel conversion
        self._resampler = AudioResampler(
            format=_OUTPUT_SAMPLE_FORMAT,
            layout=_OUTPUT_LAYOUT,
            rate=output_sample_rate,
        )

        self._input_codec = input_codec
        self._frames_decoded = 0
        self._frames_encoded = 0
        self._audio_specific_config: bytes | None = None

        logger.info(
            "[audio_transcoder] Initialized: %s %dHz %dch -> aac %dHz %dch @%dk",
            input_codec,
            input_sample_rate,
            input_channels,
            output_sample_rate,
            output_channels,
            output_bitrate // 1000,
        )

    @property
    def audio_specific_config(self) -> bytes | None:
        """
        AAC AudioSpecificConfig from the encoder (available after first encode).

        This is needed for the MP4 esds box. Cached after the first
        successful read.
        """
        if self._audio_specific_config is not None:
            return self._audio_specific_config

        # PyAV exposes extradata after the encoder is opened
        if self._encoder.extradata:
            self._audio_specific_config = bytes(self._encoder.extradata)
            return self._audio_specific_config
        return None

    @property
    def output_sample_rate(self) -> int:
        """Encoder output sample rate in Hz."""
        return self._encoder.sample_rate

    @property
    def output_channels(self) -> int:
        """Encoder output channel count (driven by the stereo layout)."""
        return self._encoder.channels

    @property
    def frame_size(self) -> int:
        """AAC frame size (samples per frame), typically 1024."""
        return self._encoder.frame_size or 1024

    def transcode(self, raw_frame_data: bytes) -> list[bytes]:
        """
        Transcode a raw audio frame from the input codec to AAC.

        Args:
            raw_frame_data: Raw audio frame bytes (one codec frame, e.g.,
                one EAC3 sync frame).

        Returns:
            List of raw AAC frame bytes. May return 0, 1, or more frames
            depending on codec frame sizes and buffering.
        """
        output = []

        # Parse raw bytes into packets
        packets = self._decoder.parse(raw_frame_data)

        for packet in packets:
            # Decode to PCM frames; a corrupt frame is skipped, not fatal.
            try:
                decoded_frames = self._decoder.decode(packet)
            except av.error.InvalidDataError as e:
                logger.debug("[audio_transcoder] Decode error (skipping frame): %s", e)
                continue

            for frame in decoded_frames:
                self._frames_decoded += 1

                # Resample to match encoder format
                resampled = self._resampler.resample(frame)
                if resampled is None:
                    continue

                # resampled can be a single frame or list of frames
                if not isinstance(resampled, list):
                    resampled = [resampled]

                for rs_frame in resampled:
                    # Encode to AAC
                    try:
                        encoded_packets = self._encoder.encode(rs_frame)
                    except av.error.InvalidDataError as e:
                        logger.debug("[audio_transcoder] Encode error: %s", e)
                        continue

                    for enc_packet in encoded_packets:
                        self._frames_encoded += 1
                        output.append(bytes(enc_packet))

        return output

    def flush(self) -> list[bytes]:
        """
        Flush the decoder, resampler, and encoder buffers.

        Call this when the input stream ends to get remaining frames.

        Returns:
            List of remaining raw AAC frame bytes.
        """
        output = []

        # Flush decoder (decode(None) drains buffered frames)
        try:
            for frame in self._decoder.decode(None):
                self._frames_decoded += 1
                resampled = self._resampler.resample(frame)
                if resampled is None:
                    continue
                if not isinstance(resampled, list):
                    resampled = [resampled]
                for rs_frame in resampled:
                    for enc_packet in self._encoder.encode(rs_frame):
                        self._frames_encoded += 1
                        output.append(bytes(enc_packet))
        except Exception as e:
            logger.debug("[audio_transcoder] Decoder flush error: %s", e)

        # Flush resampler
        try:
            resampled = self._resampler.resample(None)
            if resampled is not None:
                if not isinstance(resampled, list):
                    resampled = [resampled]
                for rs_frame in resampled:
                    for enc_packet in self._encoder.encode(rs_frame):
                        self._frames_encoded += 1
                        output.append(bytes(enc_packet))
        except Exception as e:
            logger.debug("[audio_transcoder] Resampler flush error: %s", e)

        # Flush encoder
        try:
            for enc_packet in self._encoder.encode(None):
                self._frames_encoded += 1
                output.append(bytes(enc_packet))
        except Exception as e:
            logger.debug("[audio_transcoder] Encoder flush error: %s", e)

        logger.info(
            "[audio_transcoder] Flushed: %d decoded, %d encoded total",
            self._frames_decoded,
            self._frames_encoded,
        )
        return output

    def generate_silence_frame(self) -> bytes | None:
        """Return a pre-encoded silent AAC frame (module-level singleton)."""
        return _SILENCE_AAC_FRAME

    def close(self) -> None:
        """Release codec contexts (best-effort).

        Safe to call even when __init__ failed before the codec attributes
        were assigned (e.g. an unknown codec name raised in
        av.CodecContext.create): missing attributes are skipped instead of
        letting __del__ raise AttributeError.
        """
        for ctx in (getattr(self, "_decoder", None), getattr(self, "_encoder", None)):
            if ctx is None:
                continue
            try:
                # PyAV AudioCodecContext may not expose close()
                if hasattr(ctx, "close"):
                    ctx.close()
            except Exception:
                pass

    def __del__(self) -> None:
        # Best-effort cleanup; close() tolerates partially-built instances.
        self.close()
|
||||
515
mediaflow_proxy/remuxer/codec_utils.py
Normal file
515
mediaflow_proxy/remuxer/codec_utils.py
Normal file
@@ -0,0 +1,515 @@
|
||||
"""
|
||||
Codec decision engine for browser compatibility detection.
|
||||
|
||||
Determines whether video/audio streams need transcoding for browser
|
||||
playback and selects appropriate output codecs.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import struct
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────
# Browser-compatible codecs (work natively in HTML5 <video>)
# ────────────────────────────────────────────────────────────────────
BROWSER_VIDEO_CODECS = frozenset(
    {
        "V_MPEG4/ISO/AVC",  # H.264/AVC -- universal
        "h264",
        "avc1",  # FFmpeg/PyAV names
    }
)

BROWSER_AUDIO_CODECS = frozenset(
    {
        "A_AAC",  # AAC-LC -- universal
        "A_AAC/MPEG2/LC",
        "A_AAC/MPEG4/LC",
        "aac",  # FFmpeg/PyAV name
    }
)

# ────────────────────────────────────────────────────────────────────
# Video codecs that need re-encoding to H.264
# ────────────────────────────────────────────────────────────────────
VIDEO_NEEDS_REENCODE = frozenset(
    {
        # MKV CodecIDs
        "V_MPEGH/ISO/HEVC",  # H.265/HEVC (Chrome/Firefox don't support)
        "V_MPEG2",  # MPEG-2 (DVD-era)
        "V_MPEG4/ISO/SP",  # MPEG-4 Part 2 Simple Profile
        "V_MPEG4/ISO/ASP",  # MPEG-4 Part 2 Advanced Simple (DivX/Xvid)
        "V_MPEG4/ISO/AP",  # MPEG-4 Part 2 Advanced Profile
        "V_MPEG4/MS/V3",  # MS MPEG-4 v3 (WMV)
        "V_MS/VFW/FOURCC",  # Generic VFW (VC-1, etc.)
        "V_REAL/RV10",
        "V_REAL/RV20",
        "V_REAL/RV30",
        "V_REAL/RV40",
        "V_THEORA",
        "V_VP8",
        "V_VP9",  # VP9 in MKV (needs WebM container for browser)
        "V_AV1",  # AV1 (partial support, safer to reencode)
        # PyAV / FFmpeg codec names
        "hevc",
        "h265",
        "mpeg2video",
        "mpeg4",
        "vc1",
        "vp8",
        "vp9",
        "av1",
        "theora",
        "wmv3",
        "rv30",
        "rv40",
    }
)

# ────────────────────────────────────────────────────────────────────
# Audio codecs that need transcoding to AAC
# (superset of the list in audio_transcoder.py, uses both MKV and
# PyAV codec names for universal lookup)
# ────────────────────────────────────────────────────────────────────
AUDIO_NEEDS_TRANSCODE = frozenset(
    {
        # MKV CodecIDs
        "A_EAC3",
        "A_AC3",
        "A_DTS",
        "A_DTS/EXPRESS",
        "A_DTS/LOSSLESS",
        "A_OPUS",
        "A_VORBIS",
        "A_FLAC",
        "A_TRUEHD",
        "A_MLP",
        "A_PCM/INT/LIT",
        "A_PCM/INT/BIG",
        "A_PCM/FLOAT/IEEE",
        "A_REAL/28_8",
        "A_REAL/COOK",
        "A_REAL/SIPR",
        "A_REAL/ATRC",
        "A_MS/ACM",  # Generic Windows audio
        "A_MP3",
        "A_MPEG/L3",
        # PyAV / FFmpeg names
        "eac3",
        "ac3",
        "dts",
        "dca",
        "truehd",
        "mlp",
        "mp3",
        "opus",
        "vorbis",
        "flac",
        "pcm_s16le",
        "pcm_s24le",
        "pcm_f32le",
        "wmav2",
        "wmavoice",
        "wmapro",
        "cook",
        "sipr",
        "atrac3",
    }
)

# Map PyAV codec names to MKV CodecIDs (for the MKV fast-path)
_PYAV_TO_MKV_VIDEO = {
    "h264": "V_MPEG4/ISO/AVC",
    "hevc": "V_MPEGH/ISO/HEVC",
    "h265": "V_MPEGH/ISO/HEVC",
    "mpeg2video": "V_MPEG2",
    "vp8": "V_VP8",
    "vp9": "V_VP9",
    "av1": "V_AV1",
}

_PYAV_TO_MKV_AUDIO = {
    "aac": "A_AAC",
    "eac3": "A_EAC3",
    "ac3": "A_AC3",
    "dts": "A_DTS",
    "opus": "A_OPUS",
    "vorbis": "A_VORBIS",
    "flac": "A_FLAC",
    "mp3": "A_MPEG/L3",
    "truehd": "A_TRUEHD",
}
|
||||
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
# NAL unit format conversion (Annex B ↔ AVCC)
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
|
||||
# H.264 NAL types that belong in the init segment (avcC), not in samples
|
||||
_H264_PARAM_NAL_TYPES = frozenset({7, 8, 9}) # SPS, PPS, AUD
|
||||
|
||||
|
||||
def _find_annexb_nals(data: bytes) -> list[tuple[int, int]]:
|
||||
"""
|
||||
Find all NAL unit [start, end) byte ranges in Annex B formatted data.
|
||||
|
||||
Handles both 3-byte (00 00 01) and 4-byte (00 00 00 01) start codes.
|
||||
Returns a list of (start, end) tuples pointing into *data*.
|
||||
"""
|
||||
size = len(data)
|
||||
nals: list[tuple[int, int]] = []
|
||||
i = 0
|
||||
|
||||
while i < size - 2:
|
||||
# Scan for 0x000001 or 0x00000001
|
||||
if data[i] != 0:
|
||||
i += 1
|
||||
continue
|
||||
if data[i + 1] != 0:
|
||||
i += 2
|
||||
continue
|
||||
if data[i + 2] == 1:
|
||||
nal_start = i + 3
|
||||
elif data[i + 2] == 0 and i + 3 < size and data[i + 3] == 1:
|
||||
nal_start = i + 4
|
||||
else:
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Record end of previous NAL
|
||||
if nals:
|
||||
nals[-1] = (nals[-1][0], i)
|
||||
nals.append((nal_start, size))
|
||||
i = nal_start
|
||||
|
||||
return nals
|
||||
|
||||
|
||||
def is_annexb(data: bytes) -> bool:
    """
    Heuristically decide whether *data* is Annex B rather than AVCC.

    A 4-byte start code (``00 00 00 01``) is unambiguous. A 3-byte start
    code (``00 00 01``) could also be the top of an AVCC 4-byte length
    prefix with value 256..511, so in that case we test the AVCC
    interpretation: if the big-endian length fits in the buffer and the
    following byte is a valid H.264 NAL header, we call it AVCC.
    """
    if len(data) < 5:
        return False

    if data[:4] == b"\x00\x00\x00\x01":
        return True

    if data[:3] != b"\x00\x00\x01":
        return False

    # Ambiguous 3-byte start code: try the AVCC reading first.
    candidate_len = int.from_bytes(data[:4], "big")
    if 0 < candidate_len <= len(data) - 4:
        header = data[4]
        valid_nal = not (header & 0x80) and 1 <= (header & 0x1F) <= 12
        if valid_nal:
            # Plausible AVCC: length fits and NAL header is sane.
            return False

    # AVCC reading implausible -- treat as Annex B.
    return True
|
||||
|
||||
|
||||
def annexb_to_avcc(data: bytes, filter_ps: bool = True) -> bytes:
    """
    Repackage Annex B NAL units as AVCC (4-byte length prefixes) suitable
    for fMP4 samples.

    Args:
        data: One H.264 access unit with start codes.
        filter_ps: When True, drop SPS/PPS/AUD units -- those belong in
            the init segment's avcC box, not in the media samples.

    Returns:
        The same NAL units with 4-byte big-endian length prefixes;
        *data* unchanged when it is empty or already AVCC; empty bytes
        when every NAL was filtered out.
    """
    if not data or not is_annexb(data):
        return data  # Already AVCC or empty

    spans = _find_annexb_nals(data)
    if not spans:
        return data

    chunks: list[bytes] = []
    for begin, stop in spans:
        # Drop zero padding that precedes the next start code.
        while stop > begin and data[stop - 1] == 0:
            stop -= 1
        if stop <= begin:
            continue

        if filter_ps and (data[begin] & 0x1F) in _H264_PARAM_NAL_TYPES:
            continue

        chunks.append((stop - begin).to_bytes(4, "big"))
        chunks.append(data[begin:stop])

    # If every NAL was filtered out (packet contained only SPS/PPS/AUD)
    # the result is empty so callers can drop the sample -- emitting the
    # original Annex B bytes would corrupt fMP4 samples, which expect
    # AVCC length prefixes.
    return b"".join(chunks)
|
||||
|
||||
|
||||
# H.264 profiles that require the avcC High Profile extension fields
|
||||
# (chroma_format_idc, bit_depth_luma/chroma, numSpsExt).
|
||||
_HIGH_PROFILE_IDCS = frozenset({100, 110, 122, 244, 44, 83, 86, 118, 128, 138, 139, 134})
|
||||
|
||||
|
||||
def _fix_avcc_high_profile(avcc: bytes) -> bytes:
|
||||
"""
|
||||
Ensure an avcC record includes High Profile extension bytes.
|
||||
|
||||
The ISO/IEC 14496-15 spec requires additional fields after the PPS
|
||||
section when ``AVCProfileIndication`` is 100 (High), 110, 122, or 244.
|
||||
Some MKV muxers omit these, causing decoders to not know the chroma
|
||||
format or bit depth, which leads to widespread decode errors.
|
||||
|
||||
If the extensions are missing, appends the defaults for 4:2:0 / 8-bit
|
||||
with zero extended SPS sets.
|
||||
"""
|
||||
if len(avcc) < 7:
|
||||
return avcc
|
||||
if avcc[0] != 1:
|
||||
return avcc # Not an avcC record
|
||||
|
||||
profile_idc = avcc[1]
|
||||
if profile_idc not in _HIGH_PROFILE_IDCS:
|
||||
return avcc # Not a High Profile variant, no extensions needed
|
||||
|
||||
# Walk past SPS and PPS sections to find where extensions should be
|
||||
off = 5
|
||||
num_sps = avcc[off] & 0x1F
|
||||
off += 1
|
||||
for _ in range(num_sps):
|
||||
if off + 2 > len(avcc):
|
||||
return avcc
|
||||
sps_len = struct.unpack(">H", avcc[off : off + 2])[0]
|
||||
off += 2 + sps_len
|
||||
|
||||
if off >= len(avcc):
|
||||
return avcc
|
||||
num_pps = avcc[off]
|
||||
off += 1
|
||||
for _ in range(num_pps):
|
||||
if off + 2 > len(avcc):
|
||||
return avcc
|
||||
pps_len = struct.unpack(">H", avcc[off : off + 2])[0]
|
||||
off += 2 + pps_len
|
||||
|
||||
# If there are already bytes after the PPS section, extensions exist
|
||||
if off < len(avcc):
|
||||
return avcc
|
||||
|
||||
# Append default High Profile extensions:
|
||||
# chroma_format_idc = 1 (4:2:0) -> 0xFC | 0x01 = 0xFD (reserved 111111 + 01)
|
||||
# bit_depth_luma_minus8 = 0 -> 0xF8 | 0x00 = 0xF8 (reserved 11111 + 000)
|
||||
# bit_depth_chroma_minus8 = 0 -> 0xF8 | 0x00 = 0xF8 (reserved 11111 + 000)
|
||||
# numOfSequenceParameterSetExt = 0
|
||||
ext = bytearray(avcc)
|
||||
ext.append(0xFD) # 111111_01 : chroma_format_idc = 1
|
||||
ext.append(0xF8) # 11111_000 : bit_depth_luma_minus8 = 0
|
||||
ext.append(0xF8) # 11111_000 : bit_depth_chroma_minus8 = 0
|
||||
ext.append(0x00) # numOfSequenceParameterSetExt = 0
|
||||
return bytes(ext)
|
||||
|
||||
|
||||
def ensure_avcc_extradata(extradata: bytes) -> bytes:
    """
    Ensure h264 extradata is in avcC format for the fMP4 init segment.

    PyAV returns extradata in the container's native format:
    - MKV/MP4: avcC format (starts with 0x01)
    - MPEG-TS: Annex B format (starts with 0x00 0x00)

    If Annex B, parses SPS/PPS NAL units and builds proper avcC.
    If already avcC, validates and fixes High Profile extension fields.

    Args:
        extradata: Raw codec extradata bytes (may be empty).

    Returns:
        avcC-format extradata, or the input unchanged when it is too
        short or no usable SPS could be extracted.
    """
    if not extradata or len(extradata) < 4:
        return extradata

    # Already avcC format (configurationVersion == 1)
    if extradata[0] == 0x01:
        return _fix_avcc_high_profile(extradata)

    # Parse Annex B NAL units to extract SPS and PPS
    nals = _find_annexb_nals(extradata)
    if not nals:
        return extradata

    sps_list: list[bytes] = []
    pps_list: list[bytes] = []

    for start, end in nals:
        # Strip trailing zero-padding before the next start code
        while end > start and extradata[end - 1] == 0:
            end -= 1
        if end <= start:
            continue
        nal_type = extradata[start] & 0x1F
        nal_data = extradata[start:end]
        if nal_type == 7:  # SPS
            sps_list.append(nal_data)
        elif nal_type == 8:  # PPS
            pps_list.append(nal_data)

    if not sps_list:
        return extradata  # Can't build avcC without SPS

    sps = sps_list[0]
    if len(sps) < 4:
        return extradata  # SPS too short to carry profile/level bytes

    # Build avcC box content (ISO/IEC 14496-15 AVCDecoderConfigurationRecord)
    avcc = bytearray()
    avcc.append(1)  # configurationVersion
    avcc.append(sps[1])  # AVCProfileIndication
    avcc.append(sps[2])  # profile_compatibility
    avcc.append(sps[3])  # AVCLevelIndication
    avcc.append(0xFF)  # 6 bits reserved (0x3F) + lengthSizeMinusOne=3 -> 4-byte NAL lengths
    avcc.append(0xE0 | len(sps_list))  # 3 bits reserved (0x07) + numOfSPS

    for s in sps_list:
        avcc.extend(struct.pack(">H", len(s)))
        avcc.extend(s)

    avcc.append(len(pps_list))  # numOfPPS
    for p in pps_list:
        avcc.extend(struct.pack(">H", len(p)))
        avcc.extend(p)

    # High Profile records may still need their extension bytes appended
    return _fix_avcc_high_profile(bytes(avcc))
|
||||
|
||||
|
||||
def extract_sps_pps_from_annexb(data: bytes) -> bytes:
    """
    Build avcC-format extradata from in-band SPS/PPS NAL units.

    Hardware encoders like VideoToolbox embed SPS/PPS as in-band NAL
    units in their first keyframe output rather than setting extradata
    on the codec context. This scans the Annex B access unit for those
    parameter sets and assembles an avcC blob suitable for the fMP4
    init segment.

    Returns:
        avcC bytes if SPS/PPS were found, empty bytes otherwise.
    """
    if not data or not is_annexb(data):
        return b""

    spans = _find_annexb_nals(data)
    if not spans:
        return b""

    sps_units: list[bytes] = []
    pps_units: list[bytes] = []

    for begin, stop in spans:
        # Strip trailing zero-padding
        while stop > begin and data[stop - 1] == 0:
            stop -= 1
        if stop <= begin:
            continue

        kind = data[begin] & 0x1F
        if kind == 7:  # SPS
            sps_units.append(data[begin:stop])
        elif kind == 8:  # PPS
            pps_units.append(data[begin:stop])

    if not sps_units or len(sps_units[0]) < 4:
        return b""

    first_sps = sps_units[0]

    # Assemble the AVCDecoderConfigurationRecord.
    out = bytearray(
        (
            1,                       # configurationVersion
            first_sps[1],            # AVCProfileIndication
            first_sps[2],            # profile_compatibility
            first_sps[3],            # AVCLevelIndication
            0xFF,                    # 6 bits reserved + lengthSizeMinusOne=3
            0xE0 | len(sps_units),   # 3 bits reserved + numOfSPS
        )
    )
    for unit in sps_units:
        out += struct.pack(">H", len(unit)) + unit

    out.append(len(pps_units))  # numOfPPS
    for unit in pps_units:
        out += struct.pack(">H", len(unit)) + unit

    return bytes(out)
|
||||
|
||||
|
||||
def video_needs_reencode(codec_id: str) -> bool:
    """Return True when *codec_id* must be re-encoded to H.264 for the
    browser; empty/missing codec IDs never trigger a re-encode."""
    return bool(codec_id) and codec_id in VIDEO_NEEDS_REENCODE
|
||||
|
||||
|
||||
def audio_needs_transcode(codec_id: str) -> bool:
    """Return True when *codec_id* must be transcoded to AAC for the
    browser; empty/missing codec IDs never trigger a transcode."""
    return bool(codec_id) and codec_id in AUDIO_NEEDS_TRANSCODE
|
||||
|
||||
|
||||
def is_browser_compatible(video_codec: str, audio_codec: str) -> bool:
    """
    Check if a video+audio combination is fully browser-compatible.

    True only when BOTH streams can be played natively in an HTML5
    <video> element inside an MP4 container. An empty codec string
    counts as compatible, so audio-only / video-only streams pass.
    """
    if video_codec and video_codec not in BROWSER_VIDEO_CODECS:
        return False
    if audio_codec and audio_codec not in BROWSER_AUDIO_CODECS:
        return False
    return True
|
||||
|
||||
|
||||
class TranscodeDecision:
    """Outcome of codec-compatibility analysis for one stream pair."""

    __slots__ = ("transcode_video", "transcode_audio", "video_codec", "audio_codec")

    def __init__(self, video_codec: str = "", audio_codec: str = "") -> None:
        # Record the inputs and evaluate each stream independently.
        self.video_codec = video_codec
        self.audio_codec = audio_codec
        self.transcode_video = video_needs_reencode(video_codec)
        self.transcode_audio = audio_needs_transcode(audio_codec)

    @property
    def needs_transcode(self) -> bool:
        """True when at least one stream must be converted."""
        return self.transcode_video or self.transcode_audio

    @property
    def passthrough_ok(self) -> bool:
        """True when both streams can be served to the browser unchanged."""
        return not self.needs_transcode

    def __repr__(self) -> str:
        actions = [
            f"video:{self.video_codec}->h264" if self.transcode_video else None,
            f"audio:{self.audio_codec}->aac" if self.transcode_audio else None,
        ]
        summary = [a for a in actions if a] or ["passthrough"]
        return f"TranscodeDecision({', '.join(summary)})"
|
||||
614
mediaflow_proxy/remuxer/container_probe.py
Normal file
614
mediaflow_proxy/remuxer/container_probe.py
Normal file
@@ -0,0 +1,614 @@
|
||||
"""
|
||||
Container format probing -- MKV Cues and MP4 moov.
|
||||
|
||||
Pure Python probing using EBML parsing (MKV) and struct-based atom
|
||||
scanning (MP4). No FFmpeg dependency.
|
||||
|
||||
Source-agnostic: accepts any MediaSource protocol implementation
|
||||
(Telegram, HTTP, etc.) for byte-range reads.
|
||||
|
||||
Provides:
|
||||
- probe_mkv_cues: probe MKV file to extract seek index (MKVCueIndex)
|
||||
- probe_mp4_moov: probe MP4 file to extract moov atom and build seek index (MP4Index)
|
||||
"""
|
||||
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import struct
|
||||
|
||||
from mediaflow_proxy.utils import redis_utils
|
||||
from mediaflow_proxy.remuxer.ebml_parser import (
|
||||
MKVCueIndex,
|
||||
build_cue_index,
|
||||
parse_ebml_header,
|
||||
parse_seek_head,
|
||||
CUES,
|
||||
INFO,
|
||||
)
|
||||
from mediaflow_proxy.remuxer.mp4_parser import (
|
||||
MP4Index,
|
||||
build_cue_points_from_moov,
|
||||
is_mp4_header,
|
||||
rewrite_moov_offsets,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# How much of the MKV header to fetch for SeekHead + Info parsing
_HEADER_PROBE_SIZE = 64 * 1024  # 64 KB

# Max Cues element size we'll attempt to fetch
_MAX_CUES_SIZE = 2 * 1024 * 1024  # 2 MB

# Redis cache for MKV Cue indexes
_CUE_INDEX_CACHE_PREFIX = "mfp:cue_index:"
# Entries expire after an hour; probing rebuilds the index on a miss.
_CUE_INDEX_CACHE_TTL = 3600  # 1 hour
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MKV Cues probing
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def derive_cue_cache_key(
|
||||
source_key: str = "",
|
||||
*,
|
||||
chat_id: str | int | None = None,
|
||||
message_id: int | None = None,
|
||||
file_id: str | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
Derive a deterministic cache key for a file's cue index.
|
||||
|
||||
Accepts either a pre-computed source_key (from MediaSource.cache_key)
|
||||
or legacy Telegram-style parameters for backwards compatibility.
|
||||
"""
|
||||
if source_key:
|
||||
return source_key
|
||||
if file_id:
|
||||
raw = f"file_id:{file_id}"
|
||||
elif chat_id is not None and message_id is not None:
|
||||
raw = f"chat:{chat_id}:msg:{message_id}"
|
||||
else:
|
||||
return ""
|
||||
return hashlib.sha256(raw.encode()).hexdigest()[:16]
|
||||
|
||||
|
||||
async def _get_cached_cue_index(cache_key: str) -> MKVCueIndex | None:
    """Try to load a MKVCueIndex from Redis cache.

    Returns None when caching is disabled (empty key or no Redis), on a
    cache miss, or when the cached JSON payload has an unexpected shape.
    """
    if not cache_key:
        return None
    r = await redis_utils.get_redis()
    if r is None:
        # Redis not configured/reachable -> caching is silently disabled.
        return None
    redis_key = f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}"
    data = await r.get(redis_key)
    if not data:
        return None
    try:
        d = json.loads(data)
        # Binary fields are stored base64-encoded inside the JSON document
        # (see _set_cached_cue_index).
        seek_header = b""
        if d.get("seek_header_b64"):
            seek_header = base64.b64decode(d["seek_header_b64"])
        video_codec_private = b""
        if d.get("video_codec_private_b64"):
            video_codec_private = base64.b64decode(d["video_codec_private_b64"])
        index = MKVCueIndex(
            duration_ms=d["duration_ms"],
            timestamp_scale=d["timestamp_scale"],
            # JSON round-trips tuples as lists; restore (time, offset) tuples.
            cue_points=[(cp[0], cp[1]) for cp in d["cue_points"]],
            segment_data_offset=d["segment_data_offset"],
            first_cluster_offset=d.get("first_cluster_offset", 0),
            seek_header=seek_header,
            audio_codec_id=d.get("audio_codec_id", ""),
            audio_bitrate=d.get("audio_bitrate", 0),
            audio_channels=d.get("audio_channels", 0),
            audio_sample_rate=d.get("audio_sample_rate", 0.0),
            video_codec_id=d.get("video_codec_id", ""),
            video_codec_private=video_codec_private,
            video_width=d.get("video_width", 0),
            video_height=d.get("video_height", 0),
            video_fps=d.get("video_fps", 0.0),
            video_default_duration_ns=d.get("video_default_duration_ns", 0),
        )
        logger.debug("[container_probe] Loaded cue index from cache: %s", cache_key)
        return index
    except (KeyError, TypeError, json.JSONDecodeError) as e:
        # Malformed/stale cache entries are treated as a miss, not a failure.
        logger.warning("[container_probe] Invalid cached cue index: %s", e)
        return None
|
||||
|
||||
|
||||
async def _set_cached_cue_index(cache_key: str, index: MKVCueIndex) -> None:
    """Cache a MKVCueIndex in Redis.

    Serializes the index as JSON; binary fields (seek header, video codec
    private data) are base64-encoded.  The field names must stay in sync
    with _get_cached_cue_index.  No-op when caching is disabled.
    """
    if not cache_key:
        return
    r = await redis_utils.get_redis()
    if r is None:
        return
    redis_key = f"{_CUE_INDEX_CACHE_PREFIX}{cache_key}"
    data = json.dumps(
        {
            "duration_ms": index.duration_ms,
            "timestamp_scale": index.timestamp_scale,
            "cue_points": index.cue_points,
            "segment_data_offset": index.segment_data_offset,
            "first_cluster_offset": index.first_cluster_offset,
            "seek_header_b64": base64.b64encode(index.seek_header).decode() if index.seek_header else "",
            "audio_codec_id": index.audio_codec_id,
            "audio_bitrate": index.audio_bitrate,
            "audio_channels": index.audio_channels,
            "audio_sample_rate": index.audio_sample_rate,
            "video_codec_id": index.video_codec_id,
            "video_codec_private_b64": base64.b64encode(index.video_codec_private).decode()
            if index.video_codec_private
            else "",
            "video_width": index.video_width,
            "video_height": index.video_height,
            "video_fps": index.video_fps,
            "video_default_duration_ns": index.video_default_duration_ns,
        }
    )
    await r.set(redis_key, data, ex=_CUE_INDEX_CACHE_TTL)
    logger.debug("[container_probe] Cached cue index: %s", cache_key)
|
||||
|
||||
|
||||
async def probe_mkv_cues(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MKVCueIndex | None:
    """
    Probe an MKV file's EBML header and Cues to build a seek index.

    Pure Python -- parses EBML structures directly, no FFmpeg involved.

    Makes up to two small byte-range reads via the provided source:
    1. First ~64KB: EBML header + SeekHead + Info (skipped if header_data provided)
    2. Cues section: byte range from SeekHead's Cues position

    Args:
        source: A MediaSource protocol implementation, or any object with
            a ``stream(offset, limit)`` async generator method.
        file_size: Total file size in bytes. If 0, tries ``source.file_size``.
        cache_key: Optional cache key for Redis caching. If empty, tries
            ``source.cache_key``.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MKVCueIndex if successful, None if the file has no Cues or parsing fails.
    """
    # Resolve file_size and cache_key from source if not provided
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0)
    if not cache_key:
        cache_key = getattr(source, "cache_key", "")

    # Check cache first
    if cache_key:
        cached = await _get_cached_cue_index(cache_key)
        if cached:
            return cached

    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _HEADER_PROBE_SIZE
            header_data = b""
            async for chunk in source.stream(offset=0, limit=header_size):
                header_data += chunk

        # 64 bytes is far below any valid EBML header + SeekHead.
        if len(header_data) < 64:
            logger.warning("[container_probe] Header too small (%d bytes), cannot probe", len(header_data))
            return None

        # Step 2: Parse EBML header to find Segment data offset
        segment_data_offset = parse_ebml_header(header_data)

        # Step 3: Parse SeekHead to find Cues and Info positions
        seek_positions = parse_seek_head(header_data, segment_data_offset)

        if CUES not in seek_positions:
            logger.info("[container_probe] No Cues position in SeekHead, seeking not available")
            return None

        # SeekHead positions are relative to the start of the Segment data.
        cues_relative_offset = seek_positions[CUES]
        cues_absolute_offset = segment_data_offset + cues_relative_offset

        logger.info(
            "[container_probe] SeekHead: Cues at offset %d (absolute %d), Info at %s",
            cues_relative_offset,
            cues_absolute_offset,
            seek_positions.get(INFO, "not found"),
        )

        # Step 4: Fetch the Cues element (capped at _MAX_CUES_SIZE)
        cues_max = file_size - cues_absolute_offset if file_size > 0 else _MAX_CUES_SIZE
        cues_fetch_size = min(_MAX_CUES_SIZE, cues_max)
        if cues_fetch_size <= 0:
            logger.warning("[container_probe] Cues offset %d beyond file size %d", cues_absolute_offset, file_size)
            return None

        cues_data = b""
        async for chunk in source.stream(offset=cues_absolute_offset, limit=cues_fetch_size):
            cues_data += chunk

        if len(cues_data) < 16:
            logger.warning("[container_probe] Cues data too small (%d bytes)", len(cues_data))
            return None

        # Step 5: Build the cue index
        index = build_cue_index(
            header_data=header_data,
            cues_data=cues_data,
            cues_file_offset=cues_absolute_offset,
            segment_data_offset=segment_data_offset,
        )

        # Cache the result
        if cache_key:
            await _set_cached_cue_index(cache_key, index)

        return index

    except Exception as e:
        # Broad catch is deliberate: a failed probe only disables seeking,
        # it must never break the streaming request itself.
        logger.warning("[container_probe] Failed to probe MKV cues: %s", e)
        return None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MP4 Moov probing
|
||||
# =============================================================================
|
||||
|
||||
# Redis cache for MP4 indexes
_MP4_INDEX_CACHE_PREFIX = "mfp:mp4_index:"
# Entries expire after an hour; probing rebuilds the index on a miss.
_MP4_INDEX_CACHE_TTL = 3600  # 1 hour

# How much to read from the start for ftyp + initial atom scanning
_MP4_HEADER_PROBE_SIZE = 64 * 1024  # 64 KB

# Max moov size we'll accept
_MAX_MOOV_SIZE = 50 * 1024 * 1024  # 50 MB

# How much to read from the end of the file to find moov
_MP4_TAIL_PROBE_SIZE = 512 * 1024  # 512 KB
|
||||
|
||||
|
||||
async def _get_cached_mp4_index(cache_key: str) -> MP4Index | None:
    """Try to load an MP4Index from Redis cache.

    Returns None when caching is disabled, on a cache miss, or when the
    cached JSON payload has an unexpected shape.  The returned index has
    no moov_data; the caller must re-fetch and attach it.
    """
    if not cache_key:
        return None
    r = await redis_utils.get_redis()
    if r is None:
        return None
    redis_key = f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}"
    data = await r.get(redis_key)
    if not data:
        return None
    try:
        d = json.loads(data)
        ftyp_data = b""
        if d.get("ftyp_data_b64"):
            ftyp_data = base64.b64decode(d["ftyp_data_b64"])
        index = MP4Index(
            duration_ms=d["duration_ms"],
            timescale=d["timescale"],
            # JSON round-trips tuples as lists; restore (time, offset) tuples.
            cue_points=[(cp[0], cp[1]) for cp in d["cue_points"]],
            moov_offset=d["moov_offset"],
            moov_size=d["moov_size"],
            ftyp_data=ftyp_data,
            mdat_offset=d["mdat_offset"],
            mdat_size=d["mdat_size"],
            video_codec=d.get("video_codec", ""),
            audio_codec=d.get("audio_codec", ""),
            # moov_data is NOT cached (too large), it will be re-fetched
        )
        logger.debug("[container_probe] Loaded MP4 index from cache: %s", cache_key)
        return index
    except (KeyError, TypeError, json.JSONDecodeError) as e:
        # Malformed/stale cache entries are treated as a miss, not a failure.
        logger.warning("[container_probe] Invalid cached MP4 index: %s", e)
        return None
|
||||
|
||||
|
||||
async def _set_cached_mp4_index(cache_key: str, index: MP4Index) -> None:
    """Cache an MP4Index in Redis (without moov_data).

    moov can be tens of MB, so only the small metadata and ftyp bytes are
    stored; _get_cached_mp4_index's caller re-fetches moov_data.  Field
    names must stay in sync with the loader.  No-op when caching is off.
    """
    if not cache_key:
        return
    r = await redis_utils.get_redis()
    if r is None:
        return
    redis_key = f"{_MP4_INDEX_CACHE_PREFIX}{cache_key}"
    data = json.dumps(
        {
            "duration_ms": index.duration_ms,
            "timescale": index.timescale,
            "cue_points": index.cue_points,
            "moov_offset": index.moov_offset,
            "moov_size": index.moov_size,
            "ftyp_data_b64": base64.b64encode(index.ftyp_data).decode() if index.ftyp_data else "",
            "mdat_offset": index.mdat_offset,
            "mdat_size": index.mdat_size,
            "video_codec": index.video_codec,
            "audio_codec": index.audio_codec,
        }
    )
    await r.set(redis_key, data, ex=_MP4_INDEX_CACHE_TTL)
    logger.debug("[container_probe] Cached MP4 index: %s", cache_key)
|
||||
|
||||
|
||||
def _scan_top_level_atoms(data: bytes) -> list[tuple[bytes, int, int]]:
|
||||
"""
|
||||
Scan top-level atom headers from raw file bytes.
|
||||
|
||||
Returns:
|
||||
List of (box_type, absolute_offset, total_size) for each atom found.
|
||||
"""
|
||||
atoms = []
|
||||
offset = 0
|
||||
while offset + 8 <= len(data):
|
||||
size = struct.unpack_from(">I", data, offset)[0]
|
||||
box_type = data[offset + 4 : offset + 8]
|
||||
|
||||
if size == 1: # Extended size
|
||||
if offset + 16 > len(data):
|
||||
break
|
||||
size = struct.unpack_from(">Q", data, offset + 8)[0]
|
||||
elif size == 0:
|
||||
# Extends to end of file - we can't know the real size from
|
||||
# a partial read, but record what we have
|
||||
atoms.append((box_type, offset, 0))
|
||||
break
|
||||
|
||||
if size < 8:
|
||||
break
|
||||
|
||||
atoms.append((box_type, offset, size))
|
||||
offset += size
|
||||
|
||||
return atoms
|
||||
|
||||
|
||||
async def probe_mp4_moov(
    source,
    file_size: int = 0,
    cache_key: str = "",
    header_data: bytes | None = None,
) -> MP4Index | None:
    """
    Probe an MP4 file's moov atom to build a seek index.

    Pure Python -- scans MP4 box headers with struct, no FFmpeg involved.

    Strategy:
    1. Read first ~64KB to check for ftyp (MP4 signature).
    2. Scan top-level atoms to find moov and mdat.
    3. If moov is at the start (faststart), read it from the header data.
    4. If moov is not in the header, read from the tail of the file.
    5. Parse moov sample tables to build cue points.

    Args:
        source: A MediaSource protocol implementation with stream(offset, limit).
        file_size: Total file size in bytes.
        cache_key: Optional cache key for Redis caching.
        header_data: Pre-fetched header bytes (first ~64KB). If provided,
            skips the initial header fetch from source.

    Returns:
        MP4Index if successful, None if not an MP4 or parsing fails.
    """
    if file_size <= 0:
        file_size = getattr(source, "file_size", 0)
    if not cache_key:
        cache_key = getattr(source, "cache_key", "")

    # Check cache first
    if cache_key:
        cached = await _get_cached_mp4_index(cache_key)
        if cached:
            # Re-fetch moov_data (not cached due to size) and rewrite offsets
            if cached.moov_size > 0 and cached.moov_size <= _MAX_MOOV_SIZE:
                moov_data = b""
                async for chunk in source.stream(offset=cached.moov_offset, limit=cached.moov_size):
                    moov_data += chunk
                if cached.mdat_offset >= 0:
                    # Same offset rewrite as Step 8 below: the served layout
                    # is ftyp + moov + mdat, so stco/co64 must be shifted.
                    new_mdat_start = len(cached.ftyp_data) + cached.moov_size
                    offset_delta = new_mdat_start - cached.mdat_offset
                    if offset_delta != 0:
                        moov_data = rewrite_moov_offsets(moov_data, offset_delta)
                cached.moov_data = moov_data
            return cached

    try:
        # Step 1: Use pre-fetched header or fetch from source
        if header_data is None:
            header_size = min(_MP4_HEADER_PROBE_SIZE, file_size) if file_size > 0 else _MP4_HEADER_PROBE_SIZE
            header_data = b""
            async for chunk in source.stream(offset=0, limit=header_size):
                header_data += chunk

        if len(header_data) < 12:
            return None

        # Step 2: Check for ftyp
        if not is_mp4_header(header_data):
            return None

        logger.info("[container_probe] MP4 detected, scanning atoms (header=%d bytes)", len(header_data))

        # Step 3: Scan top-level atoms from header
        atoms = _scan_top_level_atoms(header_data)

        ftyp_offset = -1
        ftyp_size = 0
        moov_offset = -1
        moov_size = 0
        mdat_offset = -1
        mdat_size = 0

        for box_type, atom_offset, atom_size in atoms:
            if box_type == b"ftyp":
                ftyp_offset = atom_offset
                ftyp_size = atom_size
            elif box_type == b"moov":
                moov_offset = atom_offset
                moov_size = atom_size
            elif box_type == b"mdat":
                mdat_offset = atom_offset
                mdat_size = atom_size

        # Step 4: If moov not found in header, scan from tail
        if moov_offset < 0 and file_size > 0:
            tail_start = max(0, file_size - _MP4_TAIL_PROBE_SIZE)
            tail_data = b""
            async for chunk in source.stream(offset=tail_start, limit=file_size - tail_start):
                tail_data += chunk

            if tail_data:
                tail_atoms = _scan_top_level_atoms(tail_data)
                for box_type, rel_offset, atom_size in tail_atoms:
                    abs_offset = tail_start + rel_offset
                    if box_type == b"moov":
                        moov_offset = abs_offset
                        moov_size = atom_size
                    elif box_type == b"mdat" and mdat_offset < 0:
                        mdat_offset = abs_offset
                        mdat_size = atom_size

                # If the initial scan yielded no moov (tail_start may land
                # inside a large mdat payload producing garbage atom headers),
                # resync by scanning 8-byte aligned windows for b"moov".
                if moov_offset < 0:
                    needle = b"moov"
                    search_pos = 0
                    while search_pos + 8 <= len(tail_data):
                        idx = tail_data.find(needle, search_pos)
                        # idx < 4 leaves no room for the 4-byte size field.
                        if idx < 0 or idx < 4:
                            break
                        candidate_size = struct.unpack_from(">I", tail_data, idx - 4)[0]
                        # Plausibility check filters false positives where
                        # "moov" appears inside media payload bytes.
                        if 8 < candidate_size <= _MAX_MOOV_SIZE:
                            moov_offset = tail_start + idx - 4
                            moov_size = candidate_size
                            break
                        search_pos = idx + 4

        if moov_offset < 0:
            logger.info("[container_probe] No moov atom found in MP4")
            return None

        if moov_size <= 0 or moov_size > _MAX_MOOV_SIZE:
            logger.warning("[container_probe] moov size %d is invalid or too large", moov_size)
            return None

        logger.info(
            "[container_probe] MP4 atoms: moov at %d (%d bytes), mdat at %d (%d bytes)",
            moov_offset,
            moov_size,
            mdat_offset,
            mdat_size,
        )

        # Step 5: Fetch full moov atom
        # Check if moov is already contained in the header data we read
        if moov_offset + moov_size <= len(header_data):
            moov_data = header_data[moov_offset : moov_offset + moov_size]
        else:
            moov_data = b""
            async for chunk in source.stream(offset=moov_offset, limit=moov_size):
                moov_data += chunk

        if len(moov_data) < moov_size:
            logger.warning(
                "[container_probe] Incomplete moov: got %d of %d bytes",
                len(moov_data),
                moov_size,
            )
            return None

        # Step 6: Parse moov body (skip box header)
        # Determine header size
        raw_size = struct.unpack_from(">I", moov_data, 0)[0]
        # size == 1 means a 64-bit largesize follows -> 16-byte header.
        hdr_size = 16 if raw_size == 1 else 8
        moov_body = moov_data[hdr_size:]

        cue_points, duration_ms, timescale, video_codec, audio_codec = build_cue_points_from_moov(moov_body)

        # If mdat wasn't found via header scan, it's likely right after ftyp
        # or right after moov. Common layouts:
        # ftyp + moov + mdat (faststart) or ftyp + mdat + moov
        if mdat_offset < 0:
            # Walk atoms to find mdat by scanning just enough from the file
            # In most cases, mdat is either before or after moov
            if moov_offset < file_size // 2:
                # moov is early -> mdat likely follows
                mdat_search_offset = moov_offset + moov_size
            else:
                # moov is late -> mdat likely right after ftyp
                # NOTE: this re-reads and overwrites ftyp_size from the raw
                # header bytes rather than using the Step-3 value.
                ftyp_size = struct.unpack_from(">I", header_data, 0)[0]
                if ftyp_size == 1:
                    ftyp_size = struct.unpack_from(">Q", header_data, 8)[0]
                mdat_search_offset = ftyp_size

            # Read a small amount to find the mdat header
            mdat_header = b""
            async for chunk in source.stream(offset=mdat_search_offset, limit=16):
                mdat_header += chunk
            if len(mdat_header) >= 8:
                box_type = mdat_header[4:8]
                if box_type == b"mdat":
                    mdat_offset = mdat_search_offset
                    raw_sz = struct.unpack_from(">I", mdat_header, 0)[0]
                    if raw_sz == 1 and len(mdat_header) >= 16:
                        mdat_size = struct.unpack_from(">Q", mdat_header, 8)[0]
                    else:
                        mdat_size = raw_sz

        # Step 7: Extract ftyp data (always in the header since it's the first atom)
        ftyp_data = b""
        if ftyp_offset >= 0 and ftyp_size > 0 and ftyp_offset + ftyp_size <= len(header_data):
            ftyp_data = header_data[ftyp_offset : ftyp_offset + ftyp_size]

        # Step 8: Rewrite moov chunk offsets for faststart pipe layout.
        # The pipe stream will be: ftyp + moov + mdat. The stco/co64
        # offsets in the original moov point to positions in the original
        # file. We need to shift them to account for the new layout.
        # New mdat position = ftyp_size + moov_size
        # Delta = new_mdat_position - original_mdat_offset
        if mdat_offset >= 0:
            new_mdat_start = len(ftyp_data) + moov_size
            offset_delta = new_mdat_start - mdat_offset
            if offset_delta != 0:
                moov_data = rewrite_moov_offsets(moov_data, offset_delta)

        index = MP4Index(
            duration_ms=duration_ms,
            timescale=timescale,
            cue_points=cue_points,
            moov_offset=moov_offset,
            moov_size=moov_size,
            moov_data=moov_data,
            ftyp_data=ftyp_data,
            mdat_offset=mdat_offset,
            mdat_size=mdat_size,
            video_codec=video_codec,
            audio_codec=audio_codec,
        )

        logger.info(
            "[container_probe] MP4 index: duration=%.1fs, %d cue points, video=%s, audio=%s",
            duration_ms / 1000.0,
            len(cue_points),
            video_codec,
            audio_codec,
        )

        if cache_key:
            await _set_cached_mp4_index(cache_key, index)

        return index

    except Exception as e:
        # Broad catch is deliberate: a failed probe only disables seeking,
        # it must never break the streaming request itself.
        logger.warning("[container_probe] Failed to probe MP4 moov: %s", e)
        return None
|
||||
1228
mediaflow_proxy/remuxer/ebml_parser.py
Normal file
1228
mediaflow_proxy/remuxer/ebml_parser.py
Normal file
File diff suppressed because it is too large
Load Diff
151
mediaflow_proxy/remuxer/hls_manifest.py
Normal file
151
mediaflow_proxy/remuxer/hls_manifest.py
Normal file
@@ -0,0 +1,151 @@
|
||||
"""
|
||||
HLS VOD playlist generator for on-the-fly fMP4 transcoding.
|
||||
|
||||
Produces an M3U8 VOD playlist from an ``MKVCueIndex`` or ``MP4Index``.
|
||||
Consecutive keyframes that are closer together than the target segment
|
||||
duration are merged into a single HLS segment, matching the behaviour
|
||||
of ``ffmpeg -hls_time``.
|
||||
|
||||
The init segment is referenced via ``#EXT-X-MAP``.
|
||||
|
||||
Requires ``#EXT-X-VERSION:7`` for fMP4 (CMAF) segments.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
|
||||
|
||||
def merge_cue_points(
    cue_points: list[tuple[float, int]],
    target_duration_ms: float = 5000.0,
) -> list[tuple[float, int]]:
    """Collapse keyframe cue points into segment boundaries.

    Replicates ``ffmpeg -hls_time``: a new segment boundary opens only at
    the first keyframe that lies **at least** ``target_duration_ms`` after
    the current segment's start; closer keyframes are absorbed into the
    running segment.

    Along the way this also:
      * drops duplicate byte-offset entries, keeping the earliest
        timestamp per offset (some MKV files expose several cue times
        for one cluster; keeping a later one would desync segment start
        times from the actual bytes), and
      * suppresses very short "runt" segments.

    Args:
        cue_points: Sorted ``(time_ms, byte_offset)`` list.
        target_duration_ms: Minimum segment duration in milliseconds.

    Returns:
        Reduced ``(time_ms, byte_offset)`` list of segment boundaries.
    """
    if not cue_points:
        return []

    # Earliest time per offset wins; sorting by (time, offset) makes the
    # choice deterministic before the duplicate filter runs.
    ordered = sorted(cue_points, key=lambda cp: (cp[0], cp[1]))
    unique: list[tuple[float, int]] = []
    used_offsets: set[int] = set()
    for ts, off in ordered:
        if off in used_offsets:
            continue
        used_offsets.add(off)
        unique.append((ts, off))

    if not unique:
        return []

    boundaries: list[tuple[float, int]] = [unique[0]]
    for entry in unique[1:]:
        if entry[0] - boundaries[-1][0] >= target_duration_ms:
            boundaries.append(entry)
    return boundaries
|
||||
|
||||
|
||||
def generate_vod_playlist(
    cue_points: list[tuple[float, int]],
    duration_ms: float,
    init_url: str,
    segment_url_template: str,
    target_segment_duration_ms: float = 5000.0,
) -> str:
    """Render an HLS VOD M3U8 playlist from cue-point data.

    Keyframes closer together than *target_segment_duration_ms* are merged
    into one segment (matching ``ffmpeg -hls_time``).  Segment URLs come
    from *segment_url_template*, whose ``{seg}``, ``{start_ms}`` and
    ``{end_ms}`` placeholders are filled per segment.

    Args:
        cue_points: Sorted list of ``(time_ms, byte_offset)`` tuples.
        duration_ms: Total media duration in milliseconds.
        init_url: URL for the fMP4 init segment (``#EXT-X-MAP`` URI).
        segment_url_template: URL template with ``{seg}``, ``{start_ms}``
            and ``{end_ms}`` placeholders.
        target_segment_duration_ms: Target minimum segment duration.

    Returns:
        Complete M3U8 playlist string, or "" when there are no cue points.
    """
    if not cue_points:
        return ""

    boundaries = merge_cue_points(cue_points, target_segment_duration_ms)

    # Each segment spans from its boundary to the next one (or media end),
    # clamped to a small positive duration to avoid zero-length EXTINF.
    segments: list[tuple[float, float, float]] = []
    last = len(boundaries) - 1
    for idx, (seg_start, _offset) in enumerate(boundaries):
        seg_end = duration_ms if idx == last else boundaries[idx + 1][0]
        segments.append((seg_start, seg_end, max((seg_end - seg_start) / 1000.0, 0.001)))

    if not segments:
        return ""

    # TARGETDURATION must be >= the longest segment, rounded up, minimum 1.
    target_duration = max(math.ceil(max(seg[2] for seg in segments)), 1)

    out: list[str] = [
        "#EXTM3U",
        "#EXT-X-VERSION:7",
        f"#EXT-X-TARGETDURATION:{target_duration}",
        "#EXT-X-PLAYLIST-TYPE:VOD",
        "#EXT-X-MEDIA-SEQUENCE:0",
        f'#EXT-X-MAP:URI="{init_url}"',
    ]

    for seg_num, (seg_start, seg_end, seg_dur) in enumerate(segments):
        out.append(f"#EXTINF:{seg_dur:.3f},")
        seg_url = segment_url_template.replace("{seg}", str(seg_num))
        seg_url = seg_url.replace("{start_ms}", str(int(seg_start)))
        seg_url = seg_url.replace("{end_ms}", str(int(seg_end)))
        out.append(seg_url)

    out.append("#EXT-X-ENDLIST")
    out.append("")  # trailing newline
    return "\n".join(out)
|
||||
234
mediaflow_proxy/remuxer/media_source.py
Normal file
234
mediaflow_proxy/remuxer/media_source.py
Normal file
@@ -0,0 +1,234 @@
|
||||
"""
|
||||
Abstract media source protocol for source-agnostic transcode pipeline.
|
||||
|
||||
Decouples the transcode pipeline, MKV cue probing, and seeking logic
|
||||
from any specific transport (Telegram, HTTP, etc.). Each transport
|
||||
implements the MediaSource protocol to provide byte-range streaming.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
from collections.abc import AsyncIterator
|
||||
from typing import Protocol, runtime_checkable
|
||||
from urllib.parse import urlparse, unquote
|
||||
|
||||
from mediaflow_proxy.utils.http_client import create_aiohttp_session
|
||||
from mediaflow_proxy.utils.telegram import telegram_manager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Extensions mapped to container format hints used by transcode_handler
# (.webm is grouped with MKV since WebM is an EBML/Matroska container)
_MKV_EXTENSIONS = frozenset({".mkv", ".webm"})
_MP4_EXTENSIONS = frozenset({".mp4", ".m4v", ".mov", ".m4a", ".3gp"})
|
||||
|
||||
|
||||
def _extract_extension(path: str) -> str:
|
||||
"""Extract lowercase file extension (e.g. '.mkv') from a path or URL."""
|
||||
# Strip query/fragment first for URL paths
|
||||
dot_pos = path.rfind(".")
|
||||
if dot_pos < 0:
|
||||
return ""
|
||||
ext = path[dot_pos:].lower()
|
||||
# Trim anything after the extension (query params from raw paths)
|
||||
for ch in ("?", "#", "&"):
|
||||
idx = ext.find(ch)
|
||||
if idx > 0:
|
||||
ext = ext[:idx]
|
||||
return ext
|
||||
|
||||
|
||||
def filename_hint_from_url(url: str) -> str:
    """Derive a filename hint (e.g. '.mkv', '.mp4') from a URL's path.

    Percent-encoding is decoded first so encoded filenames still yield
    their extension.  Returns "" for anything that cannot be parsed.
    """
    try:
        return _extract_extension(unquote(urlparse(url).path))
    except Exception:
        return ""
|
||||
|
||||
|
||||
def filename_hint_from_name(filename: str) -> str:
    """Derive a filename hint (e.g. '.mkv') from a plain filename string."""
    if not filename:
        return ""
    return _extract_extension(filename)
|
||||
|
||||
|
||||
@runtime_checkable
class MediaSource(Protocol):
    """
    Protocol for streaming media byte ranges.

    Implementations must provide:
    - stream(): async iterator of bytes from offset/limit
    - file_size: total file size in bytes
    - cache_key: deterministic key for caching (cue index, etc.)
    - filename_hint: optional file extension hint (e.g. '.mkv', '.mp4')

    Note: @runtime_checkable isinstance() checks only verify member
    *presence*, not signatures or return types.
    """

    @property
    def file_size(self) -> int:
        """Total file size in bytes (0 when unknown)."""
        ...

    @property
    def cache_key(self) -> str:
        """Deterministic cache key derived from the source identity ("" disables caching)."""
        ...

    @property
    def filename_hint(self) -> str:
        """Optional file extension hint (e.g. '.mkv', '.mp4') for format detection."""
        ...

    async def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
        """
        Stream bytes from the source.

        Args:
            offset: Byte offset to start from.
            limit: Number of bytes to read. None = read to end.

        Yields:
            Chunks of bytes.
        """
        ...
|
||||
|
||||
|
||||
class TelegramMediaSource:
|
||||
"""
|
||||
MediaSource backed by Telegram MTProto downloads.
|
||||
|
||||
Supports two download modes:
|
||||
|
||||
* **parallel** (default): Uses ``ParallelTransferrer`` with multiple
|
||||
MTProtoSender connections for maximum throughput. Best for full-file
|
||||
streaming (e.g. ``/proxy/telegram/stream``).
|
||||
|
||||
* **single** (``use_single_client=True``): Uses Telethon's built-in
|
||||
``iter_download`` over the existing client connection. Avoids the
|
||||
overhead of creating/destroying extra connections for each request,
|
||||
ideal for small byte-range fetches like HLS segments and probe
|
||||
headers.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
telegram_ref,
|
||||
file_size: int,
|
||||
file_name: str = "",
|
||||
*,
|
||||
use_single_client: bool = False,
|
||||
) -> None:
|
||||
self._ref = telegram_ref
|
||||
self._file_size = file_size
|
||||
self._filename_hint = filename_hint_from_name(file_name)
|
||||
self._use_single_client = use_single_client
|
||||
|
||||
@property
|
||||
def file_size(self) -> int:
|
||||
return self._file_size
|
||||
|
||||
@property
|
||||
def cache_key(self) -> str:
|
||||
ref = self._ref
|
||||
if ref.file_id:
|
||||
raw = f"file_id:{ref.file_id}"
|
||||
elif ref.chat_id is not None and ref.message_id is not None:
|
||||
raw = f"chat:{ref.chat_id}:msg:{ref.message_id}"
|
||||
else:
|
||||
return ""
|
||||
return hashlib.sha256(raw.encode()).hexdigest()[:16]
|
||||
|
||||
@property
|
||||
def filename_hint(self) -> str:
|
||||
return self._filename_hint
|
||||
|
||||
async def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
|
||||
effective_limit = limit or self._file_size
|
||||
if self._use_single_client:
|
||||
async for chunk in telegram_manager.stream_media_single(
|
||||
self._ref,
|
||||
offset=offset,
|
||||
limit=effective_limit,
|
||||
file_size=self._file_size,
|
||||
):
|
||||
yield chunk
|
||||
else:
|
||||
async for chunk in telegram_manager.stream_media(
|
||||
self._ref,
|
||||
offset=offset,
|
||||
limit=effective_limit,
|
||||
file_size=self._file_size,
|
||||
):
|
||||
yield chunk
|
||||
|
||||
|
||||
class HTTPMediaSource:
|
||||
"""MediaSource backed by HTTP byte-range requests via aiohttp."""
|
||||
|
||||
def __init__(self, url: str, headers: dict | None = None, file_size: int = 0) -> None:
|
||||
self._url = url
|
||||
self._headers = headers or {}
|
||||
self._file_size = file_size
|
||||
self._filename_hint = filename_hint_from_url(url)
|
||||
|
||||
@property
|
||||
def file_size(self) -> int:
|
||||
return self._file_size
|
||||
|
||||
@property
|
||||
def cache_key(self) -> str:
|
||||
return hashlib.sha256(self._url.encode()).hexdigest()[:16]
|
||||
|
||||
@property
|
||||
def filename_hint(self) -> str:
|
||||
return self._filename_hint
|
||||
|
||||
async def resolve_file_size(self) -> int:
|
||||
"""Perform a HEAD request to determine file size if not already known."""
|
||||
if self._file_size > 0:
|
||||
return self._file_size
|
||||
|
||||
async with create_aiohttp_session(self._url, headers=self._headers) as (session, proxy_url):
|
||||
async with session.head(
|
||||
self._url,
|
||||
headers=self._headers,
|
||||
proxy=proxy_url,
|
||||
allow_redirects=True,
|
||||
) as resp:
|
||||
cl = resp.headers.get("content-length")
|
||||
if cl:
|
||||
self._file_size = int(cl)
|
||||
else:
|
||||
# Try GET with range to get content-range
|
||||
async with session.get(
|
||||
self._url,
|
||||
headers={**self._headers, "range": "bytes=0-0"},
|
||||
proxy=proxy_url,
|
||||
allow_redirects=True,
|
||||
) as range_resp:
|
||||
cr = range_resp.headers.get("content-range", "")
|
||||
if "/" in cr:
|
||||
try:
|
||||
self._file_size = int(cr.split("/")[-1])
|
||||
except ValueError:
|
||||
pass
|
||||
return self._file_size
|
||||
|
||||
async def stream(self, offset: int = 0, limit: int | None = None) -> AsyncIterator[bytes]:
|
||||
headers = dict(self._headers)
|
||||
|
||||
if offset > 0 or limit is not None:
|
||||
end = ""
|
||||
if limit is not None:
|
||||
end = str(offset + limit - 1)
|
||||
headers["range"] = f"bytes={offset}-{end}"
|
||||
|
||||
async with create_aiohttp_session(self._url, headers=headers) as (session, proxy_url):
|
||||
async with session.get(
|
||||
self._url,
|
||||
headers=headers,
|
||||
proxy=proxy_url,
|
||||
allow_redirects=True,
|
||||
) as resp:
|
||||
resp.raise_for_status()
|
||||
async for chunk in resp.content.iter_any():
|
||||
yield chunk
|
||||
469
mediaflow_proxy/remuxer/mkv_demuxer.py
Normal file
469
mediaflow_proxy/remuxer/mkv_demuxer.py
Normal file
@@ -0,0 +1,469 @@
|
||||
"""
|
||||
Streaming MKV demuxer.
|
||||
|
||||
Reads an MKV byte stream via an async iterator and yields individual media
|
||||
frames (MKVFrame) with absolute timestamps. Designed for on-the-fly remuxing
|
||||
without buffering the entire file.
|
||||
|
||||
Architecture:
|
||||
AsyncIterator[bytes] -> StreamBuffer -> EBML parsing -> MKVFrame yields
|
||||
|
||||
The demuxer works in two phases:
|
||||
1. read_header(): Consume bytes until Tracks is fully parsed, returning
|
||||
a list of MKVTrack with codec metadata.
|
||||
2. iter_frames(): Yield MKVFrame objects from Cluster/SimpleBlock data
|
||||
as clusters arrive.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections.abc import AsyncIterator
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from mediaflow_proxy.remuxer.ebml_parser import (
|
||||
CLUSTER,
|
||||
CLUSTER_TIMESTAMP,
|
||||
EBML_HEADER,
|
||||
INFO,
|
||||
MKVFrame,
|
||||
MKVTrack,
|
||||
SEGMENT,
|
||||
SIMPLE_BLOCK,
|
||||
BLOCK_GROUP,
|
||||
TRACKS,
|
||||
TIMESTAMP_SCALE,
|
||||
DURATION,
|
||||
UNKNOWN_SIZE,
|
||||
extract_block_frames,
|
||||
parse_tracks,
|
||||
read_element_id,
|
||||
read_element_size,
|
||||
read_float,
|
||||
read_uint,
|
||||
_parse_block_group,
|
||||
iter_elements,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StreamBuffer:
    """
    Accumulating byte buffer for streaming EBML parsing.

    Collects chunks from an async byte source and provides read-ahead
    capabilities for EBML element parsing. Supports consuming parsed
    bytes to keep memory usage bounded.
    """

    def __init__(self) -> None:
        self._chunks: list[bytes] = []  # FIFO of pending chunks
        self._total: int = 0  # Sum of len() over self._chunks
        self._consumed: int = 0  # Logical bytes consumed (for offset tracking)

    @property
    def available(self) -> int:
        """Number of buffered bytes available for reading."""
        return self._total

    @property
    def consumed(self) -> int:
        """Total bytes consumed so far (for absolute offset tracking)."""
        return self._consumed

    def append(self, data: bytes) -> None:
        """Add bytes to the buffer (empty chunks are ignored)."""
        if data:
            self._chunks.append(data)
            self._total += len(data)

    def peek(self, size: int) -> bytes:
        """Read up to `size` bytes from the front without consuming them."""
        if size <= 0:
            return b""
        result = bytearray()
        remaining = size
        for chunk in self._chunks:
            if remaining <= 0:
                break
            take = min(len(chunk), remaining)
            result.extend(chunk[:take])
            remaining -= take
        return bytes(result)

    def get_all(self) -> bytes:
        """
        Return all buffered data as one bytes object without consuming it.

        Coalesces multiple chunks into a single cached chunk so repeated
        calls are cheap. Fix: an empty buffer returns b"" directly instead
        of caching a stray b"" entry in the chunk list.
        """
        if not self._chunks:
            return b""
        if len(self._chunks) > 1:
            self._chunks = [b"".join(self._chunks)]
        return self._chunks[0]

    def consume(self, size: int) -> bytes:
        """Remove and return up to `size` bytes from the front of the buffer."""
        if size <= 0:
            return b""
        size = min(size, self._total)

        result = bytearray()
        remaining = size
        drop = 0  # Number of fully-consumed leading chunks to discard
        for chunk in self._chunks:
            if remaining <= 0:
                break
            if len(chunk) <= remaining:
                result.extend(chunk)
                remaining -= len(chunk)
                drop += 1
            else:
                result.extend(chunk[:remaining])
                # Keep only the unread tail of the partially-consumed chunk.
                self._chunks[drop] = chunk[remaining:]
                remaining = 0
        # One slice deletion instead of repeated O(n) list.pop(0) calls.
        del self._chunks[:drop]

        consumed = len(result)
        self._total -= consumed
        self._consumed += consumed
        return bytes(result)

    def skip(self, size: int) -> int:
        """Discard up to `size` bytes from the front. Returns bytes actually skipped."""
        if size <= 0:
            return 0
        actual = min(size, self._total)
        remaining = actual
        drop = 0  # Number of fully-skipped leading chunks to discard
        for chunk in self._chunks:
            if remaining <= 0:
                break
            if len(chunk) <= remaining:
                remaining -= len(chunk)
                drop += 1
            else:
                self._chunks[drop] = chunk[remaining:]
                remaining = 0
        del self._chunks[:drop]
        self._total -= actual
        self._consumed += actual
        return actual
|
||||
|
||||
|
||||
@dataclass
class MKVHeader:
    """
    Parsed MKV header metadata, produced by MKVDemuxer.read_header().

    Aggregates the Info element timing fields and the parsed Tracks list,
    plus the absolute file offset where Segment children begin.
    """

    # Track metadata (track number, codec id, ...) parsed from the Tracks element.
    tracks: list[MKVTrack] = field(default_factory=list)
    # TimestampScale from the Info element, in nanoseconds per timestamp tick.
    timestamp_scale_ns: int = 1_000_000  # Default 1ms
    # Segment duration converted to milliseconds (0.0 when Info lacks Duration).
    duration_ms: float = 0.0
    segment_data_offset: int = 0  # Absolute byte offset of Segment children
|
||||
|
||||
|
||||
class MKVDemuxer:
    """
    Streaming async MKV demuxer.

    Reads an MKV byte stream from an async iterator and provides:
    - read_header(): Parse EBML header + Segment metadata + Tracks
    - iter_frames(): Yield MKVFrame objects from Clusters

    The internal StreamBuffer holds only the data not yet consumed, so
    memory stays bounded to roughly one Cluster at a time.

    Usage:
        demuxer = MKVDemuxer()
        header = await demuxer.read_header(source)
        async for frame in demuxer.iter_frames(source):
            process(frame)
    """

    # Minimum bytes to try parsing an element header (ID + size)
    _MIN_ELEMENT_HEADER = 12

    def __init__(self) -> None:
        self._buf = StreamBuffer()
        self._header: MKVHeader | None = None
        self._scale_ms: float = 1.0  # timestamp_scale / 1_000_000

    @property
    def header(self) -> MKVHeader | None:
        """Header parsed by read_header(), or None before it has run."""
        return self._header

    async def read_header(self, source: AsyncIterator[bytes]) -> MKVHeader:
        """
        Read and parse the MKV header (EBML header, Segment, Info, Tracks).

        Consumes bytes from source until Tracks is fully parsed. Any leftover
        bytes (start of first Cluster) remain in the internal buffer for
        iter_frames().

        Returns:
            MKVHeader with track info and timing metadata.

        Raises:
            ValueError: if the stream does not start with an EBML header /
                Segment, or ends before a minimal header can be read.
        """
        header = MKVHeader()

        # Phase 1: Accumulate enough data for EBML header + Segment header
        # (64 bytes is a heuristic; the loop below re-fills as needed).
        await self._ensure_bytes(source, 64)

        data = self._buf.get_all()
        if len(data) < 4:
            raise ValueError(
                f"Source ended prematurely: got {len(data)} bytes, need at least an EBML header (source disconnected?)"
            )
        pos = 0

        # Parse EBML Header
        eid, pos = read_element_id(data, pos)
        if eid != EBML_HEADER:
            raise ValueError(f"Not an MKV file: expected EBML header, got 0x{eid:X}")
        size, pos = read_element_size(data, pos)
        if size == UNKNOWN_SIZE:
            raise ValueError("EBML header has unknown size")
        pos += size  # Skip EBML header content

        # Parse Segment element header
        eid, pos = read_element_id(data, pos)
        if eid != SEGMENT:
            raise ValueError(f"Expected Segment, got 0x{eid:X}")
        _seg_size, pos = read_element_size(data, pos)
        # Absolute offset (in the original stream) of the Segment's first child.
        header.segment_data_offset = self._buf.consumed + pos

        # Phase 2: Parse Segment children until we have Tracks
        # We need to iterate top-level Segment children: SeekHead, Info, Tracks
        # Stop when we hit the first Cluster (media data).
        tracks_found = False

        while not tracks_found:
            # Ensure we have enough for element header
            await self._ensure_bytes(source, pos + self._MIN_ELEMENT_HEADER)
            data = self._buf.get_all()

            if pos >= len(data):
                break

            try:
                eid, pos2 = read_element_id(data, pos)
                size, pos3 = read_element_size(data, pos2)
            except (ValueError, IndexError):
                # Header may straddle a chunk boundary: fetch a little more
                # and retry once before giving up.
                await self._ensure_bytes(source, pos + 32)
                data = self._buf.get_all()
                try:
                    eid, pos2 = read_element_id(data, pos)
                    size, pos3 = read_element_size(data, pos2)
                except (ValueError, IndexError):
                    break

            if eid == CLUSTER:
                # Reached media data; header parsing is done.
                # Don't consume the Cluster -- leave it for iter_frames.
                break

            if size == UNKNOWN_SIZE:
                # Can't handle unknown-size elements in header
                logger.warning("[mkv_demuxer] Unknown-size element 0x%X in header at pos %d", eid, pos)
                break

            # Ensure we have the full element
            elem_end = pos3 + size
            await self._ensure_bytes(source, elem_end)
            data = self._buf.get_all()

            if eid == INFO:
                self._parse_info_element(data, pos3, pos3 + size, header)
            elif eid == TRACKS:
                header.tracks = parse_tracks(data, pos3, pos3 + size)
                tracks_found = True
                logger.info(
                    "[mkv_demuxer] Parsed %d tracks: %s",
                    len(header.tracks),
                    ", ".join(f"#{t.track_number}={t.codec_id}" for t in header.tracks),
                )

            # SeekHead and other non-Info/Tracks children are simply skipped.
            pos = elem_end

        # Consume everything up to the current position (Cluster boundary)
        self._buf.consume(pos)

        # Set timing scale (ms per timestamp tick at default cluster scale)
        self._scale_ms = header.timestamp_scale_ns / 1_000_000.0
        self._header = header
        return header

    async def iter_frames(self, source: AsyncIterator[bytes]) -> AsyncIterator[MKVFrame]:
        """
        Yield MKVFrame objects from Cluster/SimpleBlock data.

        Must be called after read_header(). Continues consuming bytes from
        source, parsing Clusters and yielding individual frames.

        Raises:
            RuntimeError: if called before read_header().
        """
        if self._header is None:
            raise RuntimeError("read_header() must be called before iter_frames()")

        while True:
            # Try to read the next element header
            if not await self._ensure_bytes_soft(source, self._MIN_ELEMENT_HEADER):
                break

            data = self._buf.get_all()
            pos = 0

            try:
                eid, pos2 = read_element_id(data, pos)
                size, pos3 = read_element_size(data, pos2)
            except (ValueError, IndexError):
                # Try to get more data
                if not await self._ensure_bytes_soft(source, len(data) + 4096):
                    break
                data = self._buf.get_all()
                try:
                    eid, pos2 = read_element_id(data, pos)
                    size, pos3 = read_element_size(data, pos2)
                except (ValueError, IndexError):
                    break

            if eid == CLUSTER:
                if size == UNKNOWN_SIZE:
                    # Unknown-size Cluster: parse children until we hit the next
                    # Cluster or run out of data
                    self._buf.consume(pos3)  # consume Cluster header
                    async for frame in self._parse_unknown_size_cluster(source):
                        yield frame
                else:
                    # Known-size Cluster: ensure we have all data
                    elem_end = pos3 + size
                    await self._ensure_bytes(source, elem_end)
                    data = self._buf.get_all()

                    for frame in self._parse_cluster_data(data, pos3, pos3 + size):
                        yield frame

                    self._buf.consume(elem_end)
            else:
                # Skip non-Cluster top-level elements
                if size == UNKNOWN_SIZE:
                    break
                elem_end = pos3 + size
                if elem_end > len(data):
                    # Need to skip bytes we don't have yet
                    self._buf.consume(len(data))
                    skip_remaining = elem_end - len(data)
                    await self._skip_bytes(source, skip_remaining)
                else:
                    self._buf.consume(elem_end)

    def _parse_info_element(self, data: bytes, start: int, end: int, header: MKVHeader) -> None:
        """Parse Info element children for timestamp scale and duration."""
        for eid, off, size, _ in iter_elements(data, start, end):
            if eid == TIMESTAMP_SCALE:
                header.timestamp_scale_ns = read_uint(data, off, size)
            elif eid == DURATION:
                # Duration is stored in timestamp-scale ticks; convert to ms.
                # NOTE(review): assumes TimestampScale precedes Duration in
                # Info, otherwise the default scale is used -- TODO confirm.
                scale = header.timestamp_scale_ns / 1_000_000.0
                header.duration_ms = read_float(data, off, size) * scale

    def _parse_cluster_data(self, data: bytes, start: int, end: int) -> list[MKVFrame]:
        """Parse a known-size Cluster and return its frames."""
        cluster_timecode = 0
        frames: list[MKVFrame] = []

        for eid, data_off, size, _ in iter_elements(data, start, end):
            if eid == CLUSTER_TIMESTAMP:
                cluster_timecode = read_uint(data, data_off, size)
            elif eid == SIMPLE_BLOCK:
                for track_num, rel_tc, flags, frame_list in extract_block_frames(data, data_off, size):
                    # Bit 7 of the SimpleBlock flags marks a keyframe.
                    is_kf = bool(flags & 0x80)
                    abs_ts_ms = (cluster_timecode + rel_tc) * self._scale_ms
                    for frame_data in frame_list:
                        frames.append(
                            MKVFrame(
                                track_number=track_num,
                                timestamp_ms=abs_ts_ms,
                                is_keyframe=is_kf,
                                data=frame_data,
                            )
                        )
            elif eid == BLOCK_GROUP:
                _parse_block_group(data, data_off, data_off + size, cluster_timecode, self._scale_ms, frames)

        return frames

    async def _parse_unknown_size_cluster(self, source: AsyncIterator[bytes]) -> AsyncIterator[MKVFrame]:
        """Parse an unknown-size Cluster by reading children until next Cluster."""
        cluster_timecode = 0

        while True:
            if not await self._ensure_bytes_soft(source, self._MIN_ELEMENT_HEADER):
                break

            data = self._buf.get_all()
            pos = 0

            try:
                eid, pos2 = read_element_id(data, pos)
                size, pos3 = read_element_size(data, pos2)
            except (ValueError, IndexError):
                # Element header may be split across chunks; fetch more, retry once.
                if not await self._ensure_bytes_soft(source, len(data) + 4096):
                    break
                data = self._buf.get_all()
                try:
                    eid, pos2 = read_element_id(data, pos)
                    size, pos3 = read_element_size(data, pos2)
                except (ValueError, IndexError):
                    break

            # A new Cluster or top-level element signals end of current Cluster
            if eid == CLUSTER or eid == SEGMENT:
                break

            if size == UNKNOWN_SIZE:
                break

            elem_end = pos3 + size
            await self._ensure_bytes(source, elem_end)
            data = self._buf.get_all()

            if eid == CLUSTER_TIMESTAMP:
                cluster_timecode = read_uint(data, pos3, size)
            elif eid == SIMPLE_BLOCK:
                for track_num, rel_tc, flags, frame_list in extract_block_frames(data, pos3, size):
                    # Bit 7 of the SimpleBlock flags marks a keyframe.
                    is_kf = bool(flags & 0x80)
                    abs_ts_ms = (cluster_timecode + rel_tc) * self._scale_ms
                    for frame_data in frame_list:
                        yield MKVFrame(
                            track_number=track_num,
                            timestamp_ms=abs_ts_ms,
                            is_keyframe=is_kf,
                            data=frame_data,
                        )
            elif eid == BLOCK_GROUP:
                bg_frames: list[MKVFrame] = []
                _parse_block_group(data, pos3, pos3 + size, cluster_timecode, self._scale_ms, bg_frames)
                for frame in bg_frames:
                    yield frame

            self._buf.consume(elem_end)

    async def _ensure_bytes(self, source: AsyncIterator[bytes], needed: int) -> None:
        """
        Fill the buffer until at least 'needed' bytes are available.

        Returns silently (with fewer bytes buffered) if the source is
        exhausted first; callers must tolerate short data.
        """
        while self._buf.available < needed:
            try:
                chunk = await source.__anext__()
                self._buf.append(chunk)
            except StopAsyncIteration:
                return

    async def _ensure_bytes_soft(self, source: AsyncIterator[bytes], needed: int) -> bool:
        """
        Like _ensure_bytes, but reports availability instead of reading blindly.

        Returns True when 'needed' bytes are buffered. On EOF (or an empty
        chunk, treated as end-of-stream) it returns whether ANY bytes remain
        buffered -- possibly fewer than 'needed'.
        """
        while self._buf.available < needed:
            try:
                chunk = await source.__anext__()
                if not chunk:
                    return self._buf.available > 0
                self._buf.append(chunk)
            except StopAsyncIteration:
                return self._buf.available > 0
        return True

    async def _skip_bytes(self, source: AsyncIterator[bytes], count: int) -> None:
        """
        Skip count bytes from the source without buffering them.

        Any excess from the final chunk is pushed back into the buffer so
        no stream data is lost.
        """
        remaining = count
        while remaining > 0:
            try:
                chunk = await source.__anext__()
                if len(chunk) <= remaining:
                    remaining -= len(chunk)
                else:
                    # Put the excess back
                    self._buf.append(chunk[remaining:])
                    remaining = 0
            except StopAsyncIteration:
                break
|
||||
1376
mediaflow_proxy/remuxer/mp4_muxer.py
Normal file
1376
mediaflow_proxy/remuxer/mp4_muxer.py
Normal file
File diff suppressed because it is too large
Load Diff
834
mediaflow_proxy/remuxer/mp4_parser.py
Normal file
834
mediaflow_proxy/remuxer/mp4_parser.py
Normal file
@@ -0,0 +1,834 @@
|
||||
"""
|
||||
MP4 container parser for moov atom probing.
|
||||
|
||||
Provides:
|
||||
- MP4Index: seek index extracted from MP4 moov atom (parallel to MKVCueIndex)
|
||||
- Top-level atom scanning
|
||||
- Sample table parsers (stco, co64, stss, stsz, stts, stsc)
|
||||
- Moov-to-cue-point builder
|
||||
- rewrite_moov_offsets: adjust stco/co64 in moov for file rearrangement
|
||||
|
||||
The parsers are the inverse of the builder functions in mp4_muxer.py.
|
||||
Box navigation reuses the pattern from ts_muxer.py's read_box/find_box/iter_boxes.
|
||||
"""
|
||||
|
||||
import bisect
|
||||
import logging
|
||||
import struct
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# =============================================================================
|
||||
# MP4 Box Utilities
|
||||
# =============================================================================
|
||||
|
||||
# Minimum bytes needed to read a standard box header
_BOX_HEADER_SIZE = 8

# ftyp brands that identify MP4/MOV containers
_MP4_BRANDS = {
    b"isom",
    b"iso2",
    b"iso3",
    b"iso4",
    b"iso5",
    b"iso6",
    b"mp41",
    b"mp42",
    b"M4V ",
    b"M4A ",
    b"f4v ",
    b"kddi",
    b"avc1",
    b"qt  ",
    b"MSNV",
    b"dash",
    b"3gp4",
    b"3gp5",
    b"3gp6",
}


def is_mp4_header(data: bytes) -> bool:
    """Return True when *data* begins with an ftyp box identifying an MP4/MOV file."""
    if len(data) < _BOX_HEADER_SIZE:
        return False
    declared_size = struct.unpack_from(">I", data, 0)[0]
    if data[4:8] != b"ftyp":
        return False
    if declared_size < 12 or declared_size > len(data):
        # A plausible-but-truncated ftyp (declared size >= 12) still counts;
        # anything smaller cannot even hold a major brand.
        return declared_size >= 12
    return data[8:12] in _MP4_BRANDS
|
||||
|
||||
|
||||
def read_box_header(data: bytes, offset: int) -> tuple[bytes, int, int] | None:
|
||||
"""
|
||||
Read a box header at the given offset.
|
||||
|
||||
Returns:
|
||||
(box_type, header_size, total_box_size) or None if not enough data.
|
||||
"""
|
||||
if offset + 8 > len(data):
|
||||
return None
|
||||
|
||||
size, box_type = struct.unpack_from(">I4s", data, offset)
|
||||
header_size = 8
|
||||
|
||||
if size == 1: # Extended size (64-bit)
|
||||
if offset + 16 > len(data):
|
||||
return None
|
||||
size = struct.unpack_from(">Q", data, offset + 8)[0]
|
||||
header_size = 16
|
||||
elif size == 0: # Box extends to end of data
|
||||
size = len(data) - offset
|
||||
|
||||
return box_type, header_size, size
|
||||
|
||||
|
||||
def iter_top_level_boxes(data: bytes):
|
||||
"""
|
||||
Iterate over top-level box headers.
|
||||
|
||||
Yields:
|
||||
(box_type, header_size, total_size, data_offset)
|
||||
"""
|
||||
offset = 0
|
||||
while offset < len(data):
|
||||
result = read_box_header(data, offset)
|
||||
if result is None:
|
||||
break
|
||||
box_type, header_size, total_size = result
|
||||
yield box_type, header_size, total_size, offset + header_size
|
||||
if total_size == 0:
|
||||
break
|
||||
offset += total_size
|
||||
|
||||
|
||||
def find_box(data: bytes, target: bytes) -> bytes | None:
|
||||
"""Find a box by type and return its body (data after header)."""
|
||||
for box_type, header_size, total_size, data_offset in iter_top_level_boxes(data):
|
||||
if box_type == target:
|
||||
return data[data_offset : data_offset - header_size + total_size]
|
||||
return None
|
||||
|
||||
|
||||
def iter_boxes(data: bytes):
|
||||
"""Iterate over child boxes: yields (box_type, box_body_bytes)."""
|
||||
for box_type, header_size, total_size, data_offset in iter_top_level_boxes(data):
|
||||
end = data_offset - header_size + total_size
|
||||
yield box_type, data[data_offset:end]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Sample Table Parsers (inverse of mp4_muxer.py builders)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def parse_full_box_header(data: bytes) -> tuple[int, int, int]:
|
||||
"""
|
||||
Parse a full box header (version + flags).
|
||||
|
||||
Returns:
|
||||
(version, flags, header_size) where header_size is 4 bytes.
|
||||
"""
|
||||
if len(data) < 4:
|
||||
return 0, 0, 0
|
||||
version = data[0]
|
||||
flags = (data[1] << 16) | (data[2] << 8) | data[3]
|
||||
return version, flags, 4
|
||||
|
||||
|
||||
def parse_stco(data: bytes) -> list[int]:
|
||||
"""
|
||||
Parse Chunk Offset box (stco) - 32-bit offsets.
|
||||
|
||||
Layout: version(1) + flags(3) + entry_count(4) + [offset(4)]...
|
||||
"""
|
||||
if len(data) < 8:
|
||||
return []
|
||||
_, _, hdr = parse_full_box_header(data)
|
||||
pos = hdr
|
||||
entry_count = struct.unpack_from(">I", data, pos)[0]
|
||||
pos += 4
|
||||
|
||||
if len(data) < pos + entry_count * 4:
|
||||
return []
|
||||
|
||||
offsets = []
|
||||
for _ in range(entry_count):
|
||||
offsets.append(struct.unpack_from(">I", data, pos)[0])
|
||||
pos += 4
|
||||
return offsets
|
||||
|
||||
|
||||
def parse_co64(data: bytes) -> list[int]:
|
||||
"""
|
||||
Parse Chunk Offset box (co64) - 64-bit offsets.
|
||||
|
||||
Layout: version(1) + flags(3) + entry_count(4) + [offset(8)]...
|
||||
"""
|
||||
if len(data) < 8:
|
||||
return []
|
||||
_, _, hdr = parse_full_box_header(data)
|
||||
pos = hdr
|
||||
entry_count = struct.unpack_from(">I", data, pos)[0]
|
||||
pos += 4
|
||||
|
||||
if len(data) < pos + entry_count * 8:
|
||||
return []
|
||||
|
||||
offsets = []
|
||||
for _ in range(entry_count):
|
||||
offsets.append(struct.unpack_from(">Q", data, pos)[0])
|
||||
pos += 8
|
||||
return offsets
|
||||
|
||||
|
||||
def parse_stss(data: bytes) -> list[int]:
|
||||
"""
|
||||
Parse Sync Sample box (stss) - keyframe indices (1-based).
|
||||
|
||||
Layout: version(1) + flags(3) + entry_count(4) + [sample_number(4)]...
|
||||
"""
|
||||
if len(data) < 8:
|
||||
return []
|
||||
_, _, hdr = parse_full_box_header(data)
|
||||
pos = hdr
|
||||
entry_count = struct.unpack_from(">I", data, pos)[0]
|
||||
pos += 4
|
||||
|
||||
if len(data) < pos + entry_count * 4:
|
||||
return []
|
||||
|
||||
indices = []
|
||||
for _ in range(entry_count):
|
||||
indices.append(struct.unpack_from(">I", data, pos)[0])
|
||||
pos += 4
|
||||
return indices
|
||||
|
||||
|
||||
def parse_stsz(data: bytes) -> tuple[int, list[int]]:
|
||||
"""
|
||||
Parse Sample Size box (stsz).
|
||||
|
||||
Layout: version(1) + flags(3) + sample_size(4) + sample_count(4) + [size(4)]...
|
||||
|
||||
Returns:
|
||||
(uniform_size, sizes_list).
|
||||
If uniform_size > 0, all samples have that size and sizes_list is empty.
|
||||
Otherwise, sizes_list contains per-sample sizes.
|
||||
"""
|
||||
if len(data) < 12:
|
||||
return 0, []
|
||||
_, _, hdr = parse_full_box_header(data)
|
||||
pos = hdr
|
||||
sample_size = struct.unpack_from(">I", data, pos)[0]
|
||||
sample_count = struct.unpack_from(">I", data, pos + 4)[0]
|
||||
pos += 8
|
||||
|
||||
if sample_size > 0:
|
||||
return sample_size, []
|
||||
|
||||
if len(data) < pos + sample_count * 4:
|
||||
return 0, []
|
||||
|
||||
sizes = []
|
||||
for _ in range(sample_count):
|
||||
sizes.append(struct.unpack_from(">I", data, pos)[0])
|
||||
pos += 4
|
||||
return 0, sizes
|
||||
|
||||
|
||||
def parse_stts(data: bytes) -> list[tuple[int, int]]:
|
||||
"""
|
||||
Parse Time-to-Sample box (stts) - run-length encoded durations.
|
||||
|
||||
Layout: version(1) + flags(3) + entry_count(4) + [sample_count(4) + sample_delta(4)]...
|
||||
|
||||
Returns:
|
||||
List of (sample_count, sample_delta) entries.
|
||||
"""
|
||||
if len(data) < 8:
|
||||
return []
|
||||
_, _, hdr = parse_full_box_header(data)
|
||||
pos = hdr
|
||||
entry_count = struct.unpack_from(">I", data, pos)[0]
|
||||
pos += 4
|
||||
|
||||
if len(data) < pos + entry_count * 8:
|
||||
return []
|
||||
|
||||
entries = []
|
||||
for _ in range(entry_count):
|
||||
count = struct.unpack_from(">I", data, pos)[0]
|
||||
delta = struct.unpack_from(">I", data, pos + 4)[0]
|
||||
entries.append((count, delta))
|
||||
pos += 8
|
||||
return entries
|
||||
|
||||
|
||||
def parse_stsc(data: bytes) -> list[tuple[int, int, int]]:
|
||||
"""
|
||||
Parse Sample-to-Chunk box (stsc).
|
||||
|
||||
Layout: version(1) + flags(3) + entry_count(4) +
|
||||
[first_chunk(4) + samples_per_chunk(4) + sample_desc_index(4)]...
|
||||
|
||||
Returns:
|
||||
List of (first_chunk, samples_per_chunk, sample_desc_index) entries.
|
||||
first_chunk is 1-based.
|
||||
"""
|
||||
if len(data) < 8:
|
||||
return []
|
||||
_, _, hdr = parse_full_box_header(data)
|
||||
pos = hdr
|
||||
entry_count = struct.unpack_from(">I", data, pos)[0]
|
||||
pos += 4
|
||||
|
||||
if len(data) < pos + entry_count * 12:
|
||||
return []
|
||||
|
||||
entries = []
|
||||
for _ in range(entry_count):
|
||||
first_chunk = struct.unpack_from(">I", data, pos)[0]
|
||||
spc = struct.unpack_from(">I", data, pos + 4)[0]
|
||||
sdi = struct.unpack_from(">I", data, pos + 8)[0]
|
||||
entries.append((first_chunk, spc, sdi))
|
||||
pos += 12
|
||||
return entries
|
||||
|
||||
|
||||
def parse_mdhd(data: bytes) -> tuple[int, int]:
    """
    Parse a Media Header box (mdhd) for timescale and duration.

    Returns:
        (timescale, duration) in media timescale units; (0, 0) when truncated.
    """
    if len(data) < 4:
        return 0, 0
    if data[0] == 1:
        # Version 1: creation/modification times are 64-bit, so the
        # timescale sits at offset 4 + 8 + 8 = 20 and duration is 64-bit.
        if len(data) < 32:
            return 0, 0
        return struct.unpack_from(">I", data, 20)[0], struct.unpack_from(">Q", data, 24)[0]
    # Version 0: creation/modification times are 32-bit, so the
    # timescale sits at offset 4 + 4 + 4 = 12 and duration is 32-bit.
    if len(data) < 20:
        return 0, 0
    return struct.unpack_from(">I", data, 12)[0], struct.unpack_from(">I", data, 16)[0]
|
||||
|
||||
|
||||
def parse_stsd_codec(data: bytes) -> str:
    """
    Extract the codec FourCC from a Sample Description box (stsd) body.

    Returns:
        Codec name such as "avc1", "hvc1" or "mp4a"; "" when unavailable.
    """
    if len(data) < 16:
        return ""
    # Skip version(1) + flags(3) + entry_count(4); the first entry follows.
    entry_start = 8
    # First entry layout: size(4) + type(4) -- the type is the codec FourCC.
    if entry_start + 8 > len(data):
        return ""
    fourcc = data[entry_start + 4 : entry_start + 8]
    try:
        return fourcc.decode("ascii").strip()
    except (UnicodeDecodeError, ValueError):
        return ""
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MP4 Index (parallel to MKVCueIndex)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@dataclass
class MP4Index:
    """
    Seek index extracted from an MP4 file's moov atom.

    Mirrors ``MKVCueIndex`` for MKV files: keyframe-indexed cue points for
    time-based seeking, plus the raw moov/ftyp bytes needed to rebuild a
    streamable (faststart) MP4 for on-the-fly demuxing.
    """

    duration_ms: float = 0.0
    timescale: int = 0
    # Sorted list of (keyframe_time_ms, absolute_byte_offset) pairs.
    cue_points: list[tuple[float, int]] = field(default_factory=list)
    moov_offset: int = 0  # Absolute file offset where the moov atom starts
    moov_size: int = 0  # Total moov atom size (header + body)
    moov_data: bytes = b""  # Raw moov bytes (for prepending to the mdat pipe)
    ftyp_data: bytes = b""  # Raw ftyp bytes (for prepending before moov)
    mdat_offset: int = 0  # Absolute file offset where the mdat atom starts
    mdat_size: int = 0  # Total mdat atom size
    video_codec: str = ""  # e.g. "avc1", "hvc1", "mp4v"
    audio_codec: str = ""  # e.g. "mp4a", "ac-3"

    def byte_offset_for_time(self, time_ms: float) -> tuple[int, float]:
        """
        Locate the nearest keyframe at or before ``time_ms``.

        Returns:
            (absolute_byte_offset, actual_keyframe_time_ms)
        """
        if not self.cue_points:
            return 0, 0.0

        keyframe_times = [t for t, _ in self.cue_points]
        idx = max(bisect.bisect_right(keyframe_times, time_ms) - 1, 0)

        cue_time_ms, byte_offset = self.cue_points[idx]
        return byte_offset, cue_time_ms
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Moov -> Cue Points Builder
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _find_nested_box(data: bytes, *path: bytes) -> bytes | None:
    """
    Descend a box hierarchy one level per path element.

    Example: ``_find_nested_box(data, b"trak", b"mdia")`` returns the mdia
    box body nested inside trak, or ``None`` if any level is missing.
    """
    node = data
    for box_name in path:
        child = find_box(node, box_name)
        if child is None:
            return None
        node = child
    return node
|
||||
|
||||
|
||||
def build_cue_points_from_moov(moov_body: bytes) -> tuple[list[tuple[float, int]], float, int, str, str]:
    """
    Parse a moov body to build keyframe-indexed cue points.

    Walks the first video trak's stbl to extract:
      - Chunk offsets (stco/co64)
      - Keyframe sample indices (stss)
      - Sample sizes (stsz)
      - Sample durations (stts)
      - Sample-to-chunk mapping (stsc)
      - Timescale and duration from mdhd

    Returns:
        (cue_points, duration_ms, timescale, video_codec, audio_codec)
    """
    cue_points: list[tuple[float, int]] = []
    duration_ms = 0.0
    timescale = 0
    video_codec = ""
    audio_codec = ""

    # Find all traks: remember the first video trak's stbl and mdhd.
    video_stbl = None
    video_mdhd = None

    offset = 0
    data = moov_body
    while offset < len(data):
        result = read_box_header(data, offset)
        if result is None:
            break
        box_type, hdr_size, total_size = result

        if box_type == b"trak":
            trak_body = data[offset + hdr_size : offset + total_size]

            # Check handler type to identify video/audio
            hdlr_data = _find_nested_box(trak_body, b"mdia", b"hdlr")
            handler_type = b""
            if hdlr_data and len(hdlr_data) >= 12:
                # hdlr: version(1)+flags(3)+pre_defined(4)+handler_type(4)
                handler_type = hdlr_data[8:12]

            if handler_type == b"vide" and video_stbl is None:
                video_stbl = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl")
                video_mdhd_data = _find_nested_box(trak_body, b"mdia", b"mdhd")
                if video_mdhd_data:
                    video_mdhd = video_mdhd_data

                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if stsd_data:
                    video_codec = parse_stsd_codec(stsd_data)

            elif handler_type == b"soun" and not audio_codec:
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if stsd_data:
                    audio_codec = parse_stsd_codec(stsd_data)

        elif box_type == b"mvhd":
            # Fallback: parse mvhd for timescale/duration if no mdhd
            mvhd_body = data[offset + hdr_size : offset + total_size]
            if len(mvhd_body) >= 20:
                version = mvhd_body[0]
                if version == 1:
                    # Version 1: creation(8)+modification(8)+timescale(4)@20
                    # +duration(8)@24. Reading the 64-bit duration at offset
                    # 24 requires 32 bytes total; the previous `>= 28` bound
                    # let unpack_from raise struct.error on short boxes.
                    if len(mvhd_body) >= 32:
                        ts = struct.unpack_from(">I", mvhd_body, 20)[0]
                        dur = struct.unpack_from(">Q", mvhd_body, 24)[0]
                        if timescale == 0:
                            timescale = ts
                            duration_ms = dur / ts * 1000.0 if ts else 0.0
                else:
                    # Version 0: timescale(4)@12 + duration(4)@16.
                    ts = struct.unpack_from(">I", mvhd_body, 12)[0]
                    dur = struct.unpack_from(">I", mvhd_body, 16)[0]
                    if timescale == 0:
                        timescale = ts
                        duration_ms = dur / ts * 1000.0 if ts else 0.0

        if total_size == 0:
            break
        offset += total_size

    # Parse mdhd for video timescale (more precise than mvhd)
    if video_mdhd:
        ts, dur = parse_mdhd(video_mdhd)
        if ts > 0:
            timescale = ts
            duration_ms = dur / ts * 1000.0

    if video_stbl is None:
        logger.warning("[mp4_parser] No video stbl found in moov")
        return cue_points, duration_ms, timescale, video_codec, audio_codec

    # Parse sample tables from video stbl
    stco_data = find_box(video_stbl, b"stco")
    co64_data = find_box(video_stbl, b"co64")
    stss_data = find_box(video_stbl, b"stss")
    stsz_data = find_box(video_stbl, b"stsz")
    stts_data = find_box(video_stbl, b"stts")
    stsc_data = find_box(video_stbl, b"stsc")

    # Chunk offsets (co64 takes precedence when present)
    chunk_offsets = parse_co64(co64_data) if co64_data else (parse_stco(stco_data) if stco_data else [])

    # Keyframe sample numbers (1-based)
    keyframe_samples = set(parse_stss(stss_data)) if stss_data else set()
    all_are_keyframes = not stss_data  # No stss means all samples are sync

    # Sample sizes
    uniform_size, size_list = parse_stsz(stsz_data) if stsz_data else (0, [])

    # Sample durations (run-length encoded)
    stts_entries = parse_stts(stts_data) if stts_data else []

    # Sample-to-chunk mapping
    stsc_entries = parse_stsc(stsc_data) if stsc_data else []

    if not chunk_offsets or timescale == 0:
        logger.warning(
            "[mp4_parser] Missing data: chunks=%d, timescale=%d",
            len(chunk_offsets),
            timescale,
        )
        return cue_points, duration_ms, timescale, video_codec, audio_codec

    # Expand stts to per-sample durations
    sample_durations: list[int] = []
    for count, delta in stts_entries:
        sample_durations.extend([delta] * count)

    # Expand stsc to determine which samples belong to which chunk
    # Build a mapping: chunk_index (0-based) -> samples_per_chunk
    total_chunks = len(chunk_offsets)
    chunk_sample_counts: list[int] = [0] * total_chunks

    if stsc_entries:
        for i, (first_chunk, spc, _sdi) in enumerate(stsc_entries):
            # first_chunk is 1-based; each run extends to the next entry.
            start = first_chunk - 1
            if i + 1 < len(stsc_entries):
                end = stsc_entries[i + 1][0] - 1
            else:
                end = total_chunks
            for c in range(start, end):
                if c < total_chunks:
                    chunk_sample_counts[c] = spc
    else:
        # Default: 1 sample per chunk
        chunk_sample_counts = [1] * total_chunks

    # Count total samples
    total_samples = sum(chunk_sample_counts)

    # Get per-sample sizes
    if uniform_size > 0:
        sample_sizes = [uniform_size] * total_samples
    else:
        sample_sizes = size_list

    # Build cumulative timestamp for each sample and map keyframes to byte offsets
    current_sample = 0  # 0-based sample index
    current_time = 0  # in timescale units

    for chunk_idx, chunk_offset in enumerate(chunk_offsets):
        spc = chunk_sample_counts[chunk_idx] if chunk_idx < len(chunk_sample_counts) else 1
        byte_pos = chunk_offset

        for _ in range(spc):
            sample_num = current_sample + 1  # 1-based for stss comparison
            is_keyframe = all_are_keyframes or sample_num in keyframe_samples

            if is_keyframe:
                time_ms = current_time / timescale * 1000.0
                cue_points.append((time_ms, byte_pos))

            # Advance byte position by this sample's size
            if current_sample < len(sample_sizes):
                byte_pos += sample_sizes[current_sample]

            # Advance timestamp
            if current_sample < len(sample_durations):
                current_time += sample_durations[current_sample]

            current_sample += 1

    logger.info(
        "[mp4_parser] Built %d cue points from %d samples, duration=%.1fs, video=%s, audio=%s",
        len(cue_points),
        total_samples,
        duration_ms / 1000.0,
        video_codec,
        audio_codec,
    )

    return cue_points, duration_ms, timescale, video_codec, audio_codec
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Moov Offset Rewriting (for faststart pipe construction)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _rewrite_stco_in_place(data: bytearray, box_start: int, box_size: int, delta: int) -> int:
|
||||
"""Rewrite stco chunk offsets by adding delta. Returns number of entries fixed."""
|
||||
# FullBox header: version(1) + flags(3) = 4 bytes
|
||||
body_start = box_start + 4
|
||||
if body_start + 4 > box_start + box_size:
|
||||
return 0
|
||||
entry_count = struct.unpack_from(">I", data, body_start)[0]
|
||||
pos = body_start + 4
|
||||
for _ in range(entry_count):
|
||||
if pos + 4 > box_start + box_size:
|
||||
break
|
||||
old_val = struct.unpack_from(">I", data, pos)[0]
|
||||
struct.pack_into(">I", data, pos, old_val + delta)
|
||||
pos += 4
|
||||
return entry_count
|
||||
|
||||
|
||||
def _rewrite_co64_in_place(data: bytearray, box_start: int, box_size: int, delta: int) -> int:
|
||||
"""Rewrite co64 chunk offsets by adding delta. Returns number of entries fixed."""
|
||||
body_start = box_start + 4
|
||||
if body_start + 4 > box_start + box_size:
|
||||
return 0
|
||||
entry_count = struct.unpack_from(">I", data, body_start)[0]
|
||||
pos = body_start + 4
|
||||
for _ in range(entry_count):
|
||||
if pos + 8 > box_start + box_size:
|
||||
break
|
||||
old_val = struct.unpack_from(">Q", data, pos)[0]
|
||||
struct.pack_into(">Q", data, pos, old_val + delta)
|
||||
pos += 8
|
||||
return entry_count
|
||||
|
||||
|
||||
def _walk_and_rewrite(data: bytearray, start: int, end: int, delta: int) -> int:
    """
    Recursively walk boxes within ``[start, end)``, rewriting the offsets
    of every stco/co64 box found along the moov/trak/mdia/minf/stbl path.

    Returns the total number of offset entries rewritten.
    """
    total = 0
    pos = start
    while pos + 8 <= end:
        box_size = struct.unpack_from(">I", data, pos)[0]
        fourcc = data[pos + 4 : pos + 8]
        header_len = 8

        if box_size == 1:
            # 64-bit "largesize" follows the compact header.
            if pos + 16 > end:
                break
            box_size = struct.unpack_from(">Q", data, pos + 8)[0]
            header_len = 16
        elif box_size == 0:
            # Size 0 means "extends to end of enclosing region".
            box_size = end - pos

        if box_size < 8 or pos + box_size > end:
            break

        payload_start = pos + header_len
        payload_end = pos + box_size

        if fourcc == b"stco":
            total += _rewrite_stco_in_place(data, payload_start, box_size - header_len, delta)
        elif fourcc == b"co64":
            total += _rewrite_co64_in_place(data, payload_start, box_size - header_len, delta)
        elif fourcc in (b"moov", b"trak", b"mdia", b"minf", b"stbl"):
            # Container box -- recurse into children
            total += _walk_and_rewrite(data, payload_start, payload_end, delta)

        pos += box_size

    return total
|
||||
|
||||
|
||||
def extract_video_track_from_moov(moov_data: bytes):
    """
    Extract video codec configuration from an MP4 moov atom.

    Walks the moov box tree to find the first video trak, extracts its
    resolution and codec-private data (avcC/hvcC), and returns a synthetic
    ``MKVTrack`` suitable for building an fMP4 init segment.

    Returns:
        An ``MKVTrack`` with video metadata, or ``None`` if no video track
        is found.
    """
    from mediaflow_proxy.remuxer.ebml_parser import (
        CODEC_ID_H264,
        CODEC_ID_H265,
        MKVTrack,
    )

    # Strip the moov box header to get the body
    if len(moov_data) < 8:
        return None
    raw_size = struct.unpack_from(">I", moov_data, 0)[0]
    hdr_size = 16 if raw_size == 1 else 8
    moov_body = moov_data[hdr_size:]

    # Walk traks looking for video handler
    offset = 0
    while offset < len(moov_body):
        result = read_box_header(moov_body, offset)
        if result is None:
            break
        box_type, box_hdr_size, total_size = result

        if box_type == b"trak":
            trak_body = moov_body[offset + box_hdr_size : offset + total_size]

            # Check handler type
            hdlr_data = _find_nested_box(trak_body, b"mdia", b"hdlr")
            handler_type = b""
            if hdlr_data and len(hdlr_data) >= 12:
                handler_type = hdlr_data[8:12]

            if handler_type == b"vide":
                # Found video trak -- extract stsd for codec config
                stsd_data = _find_nested_box(trak_body, b"mdia", b"minf", b"stbl", b"stsd")
                if not stsd_data or len(stsd_data) < 16:
                    # Guard against zero-size boxes before skipping to the
                    # next trak (mirrors build_cue_points_from_moov).
                    if total_size == 0:
                        break
                    offset += total_size
                    continue

                codec_name = parse_stsd_codec(stsd_data)

                # Map MP4 codec names to MKV codec IDs
                if codec_name in ("avc1", "avc3"):
                    mkv_codec_id = CODEC_ID_H264
                elif codec_name in ("hvc1", "hev1"):
                    mkv_codec_id = CODEC_ID_H265
                else:
                    mkv_codec_id = f"V_MP4/{codec_name}"

                # Extract codec private (avcC or hvcC box) from inside the
                # sample entry. The stsd structure is:
                #   version(1) + flags(3) + entry_count(4)
                #   then entry: size(4) + type(4) + ... + nested boxes
                # The avcC/hvcC is a child box of the sample entry.
                codec_private = b""
                width = 0
                height = 0

                # Parse sample entry to get width/height and codec config
                entry_start = 8  # skip version+flags+entry_count
                if entry_start + 8 <= len(stsd_data):
                    entry_size = struct.unpack_from(">I", stsd_data, entry_start)[0]
                    entry_body_start = entry_start + 8  # skip size+type
                    entry_end = min(entry_start + entry_size, len(stsd_data))

                    # Visual sample entry: 6 reserved + 2 data_ref_idx + ...
                    # At offset 24 from entry body start: width(2) + height(2)
                    vis_offset = entry_body_start + 24
                    if vis_offset + 4 <= entry_end:
                        width = struct.unpack_from(">H", stsd_data, vis_offset)[0]
                        height = struct.unpack_from(">H", stsd_data, vis_offset + 2)[0]

                    # Scan nested boxes for avcC or hvcC
                    # Visual sample entry fixed fields = 70 bytes from entry body
                    nested_start = entry_body_start + 70
                    if nested_start < entry_end:
                        nested_data = stsd_data[nested_start:entry_end]
                        for target in (b"avcC", b"hvcC"):
                            found = find_box(nested_data, target)
                            if found:
                                codec_private = found
                                break

                # Get duration from mdhd if available
                default_duration_ns = 0
                mdhd_data = _find_nested_box(trak_body, b"mdia", b"mdhd")
                if mdhd_data and len(mdhd_data) >= 20:
                    version = mdhd_data[0]
                    ts = 0
                    dur = 0
                    if version == 1:
                        # Version 1 needs 32 bytes to read the 64-bit
                        # duration at offset 24; the previous `>= 28` bound
                        # let unpack_from raise struct.error on short boxes.
                        if len(mdhd_data) >= 32:
                            ts = struct.unpack_from(">I", mdhd_data, 20)[0]
                            dur = struct.unpack_from(">Q", mdhd_data, 24)[0]
                    else:
                        ts = struct.unpack_from(">I", mdhd_data, 12)[0]
                        dur = struct.unpack_from(">I", mdhd_data, 16)[0]
                    if ts > 0 and dur > 0:
                        # Rough estimate: assume 24fps if we can't determine.
                        default_duration_ns = int(1_000_000_000 / 24)

                return MKVTrack(
                    track_number=1,
                    track_type=1,  # video
                    codec_id=mkv_codec_id,
                    codec_private=codec_private,
                    pixel_width=width,
                    pixel_height=height,
                    default_duration_ns=default_duration_ns,
                )

        # Guard against zero-size boxes to avoid an infinite loop
        # (consistent with build_cue_points_from_moov's walker).
        if total_size == 0:
            break
        offset += total_size

    return None
|
||||
|
||||
|
||||
def rewrite_moov_offsets(moov_data: bytes, delta: int) -> bytes:
    """
    Return a copy of a moov atom with every stco/co64 chunk offset shifted
    by ``delta``.

    Needed when rearranging an MP4 file for pipe streaming: the original
    moov's chunk offsets reference positions in the original file layout.
    When moov is prepended before mdat, each offset must be shifted by
    ``delta = moov_size - original_mdat_offset``.

    Args:
        moov_data: Raw bytes of the complete moov box (header + body).
        delta: Offset adjustment to add to every chunk offset.

    Returns:
        Modified moov bytes with updated chunk offsets.
    """
    patched = bytearray(moov_data)

    # Skip the moov box header (16 bytes when a 64-bit largesize is used).
    declared_size = struct.unpack_from(">I", patched, 0)[0]
    header_len = 16 if declared_size == 1 else 8

    entries = _walk_and_rewrite(patched, header_len, len(patched), delta)
    logger.info("[mp4_parser] Rewrote %d chunk offset entries (delta=%+d)", entries, delta)

    return bytes(patched)
|
||||
608
mediaflow_proxy/remuxer/pyav_demuxer.py
Normal file
608
mediaflow_proxy/remuxer/pyav_demuxer.py
Normal file
@@ -0,0 +1,608 @@
|
||||
"""
|
||||
Universal PyAV-based streaming demuxer.
|
||||
|
||||
Bridges async byte streams to PyAV's synchronous I/O using an OS pipe,
|
||||
allowing on-the-fly demuxing of any container format (MKV, MP4, TS,
|
||||
FLV, WebM, etc.) from an async source.
|
||||
|
||||
Architecture:
|
||||
AsyncIterator[bytes] --> async feeder task --> queue.Queue --> writer thread (pipe)
|
||||
|
|
||||
OS pipe (kernel buffer)
|
||||
|
|
||||
demux thread: av.open + discover + demux
|
||||
|
|
||||
queue.Queue --> run_in_executor consumer
|
||||
|
||||
Performance: Uses plain threading.Queue on both sides (writer input and
|
||||
packet output) to avoid per-item ``run_coroutine_threadsafe`` overhead.
|
||||
The async/thread bridge is done via ``run_in_executor`` on the consumer
|
||||
side and a dedicated asyncio task on the producer side.
|
||||
|
||||
For MP4 inputs, the caller (transcode_handler) prepends the moov atom
|
||||
to the stream so PyAV receives a "faststart"-style MP4 through the pipe.
|
||||
This allows true on-the-fly demuxing for all container formats.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import threading
|
||||
from collections.abc import AsyncIterator
|
||||
from dataclasses import dataclass
|
||||
|
||||
import av
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Sentinel object to signal end-of-stream in queues
|
||||
_SENTINEL = object()
|
||||
|
||||
|
||||
@dataclass(slots=True)
class DemuxedStream:
    """Metadata about a demuxed stream, captured once at discovery time."""

    index: int  # Stream index within the source container
    codec_name: str  # FFmpeg codec name from the codec context (e.g. "h264", "aac")
    codec_type: str  # "video" or "audio"
    # Video-specific
    width: int = 0
    height: int = 0
    fps: float = 0.0  # Average frame rate (discovery falls back to 24.0 when unknown)
    pixel_format: str = ""
    # Audio-specific
    sample_rate: int = 0
    channels: int = 0
    # Timing
    time_base_num: int = 1  # Stream time base numerator
    time_base_den: int = 1000  # Stream time base denominator
    duration_seconds: float = 0.0  # 0.0 when the container reports no duration
    # Raw codec extradata (e.g. SPS/PPS for H.264, AudioSpecificConfig for AAC)
    extradata: bytes = b""
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class DemuxedPacket:
|
||||
"""A demuxed packet with timing info."""
|
||||
|
||||
stream_index: int
|
||||
codec_type: str # "video" or "audio"
|
||||
data: bytes
|
||||
pts: int # Presentation timestamp in stream time_base units
|
||||
dts: int # Decode timestamp in stream time_base units
|
||||
duration: int # Duration in stream time_base units
|
||||
is_keyframe: bool
|
||||
time_base_num: int
|
||||
time_base_den: int
|
||||
# Optional decoded frame when decode_video/decode_audio is True
|
||||
# av.VideoFrame for video, av.AudioFrame for audio
|
||||
decoded_frame: object = None
|
||||
|
||||
@property
|
||||
def pts_seconds(self) -> float:
|
||||
if self.time_base_den == 0:
|
||||
return 0.0
|
||||
return self.pts * self.time_base_num / self.time_base_den
|
||||
|
||||
@property
|
||||
def dts_seconds(self) -> float:
|
||||
if self.time_base_den == 0:
|
||||
return 0.0
|
||||
return self.dts * self.time_base_num / self.time_base_den
|
||||
|
||||
@property
|
||||
def duration_seconds(self) -> float:
|
||||
if self.time_base_den == 0:
|
||||
return 0.0
|
||||
return self.duration * self.time_base_num / self.time_base_den
|
||||
|
||||
|
||||
class PyAVDemuxer:
|
||||
"""
|
||||
Streaming demuxer using PyAV with pipe-based I/O.
|
||||
|
||||
All container I/O happens in background threads. The writer thread
|
||||
feeds source bytes into a pipe; a single demux thread opens the
|
||||
container, discovers streams, and demuxes packets -- all on the
|
||||
same file object, ensuring the pipe's read cursor is never lost.
|
||||
|
||||
Performance optimisation: both the writer-input side and the
|
||||
packet-output side use plain ``queue.Queue`` (no event-loop
|
||||
involvement per item). The async/thread bridge is done via
|
||||
``run_in_executor`` on the consumer and an asyncio task on the
|
||||
producer, eliminating ~1700 ``run_coroutine_threadsafe`` round-trips
|
||||
per 30 s of 4K content.
|
||||
|
||||
Usage:
|
||||
demuxer = PyAVDemuxer()
|
||||
await demuxer.start(source_async_iter)
|
||||
# demuxer.video_stream / audio_stream are now available
|
||||
async for packet in demuxer.iter_packets():
|
||||
if packet.codec_type == "video":
|
||||
...
|
||||
"""
|
||||
|
||||
def __init__(self, decode_video: bool = False, decode_audio: bool = False) -> None:
    """
    Args:
        decode_video: When True, the demux thread decodes video packets
            with the container's codec context and attaches the decoded
            frames to ``DemuxedPacket.decoded_frame``. This avoids format
            conversion issues with standalone decoders (HVCC vs Annex B).
        decode_audio: When True, the demux thread decodes audio packets
            the same way. Needed for codecs like Vorbis/Opus where the
            standalone decoder requires codec headers only available in
            the container context. May also be set after ``start()``
            returns (before packets are consumed) via
            ``enable_audio_decode()``.
    """
    self._decode_video = decode_video
    self._decode_audio = decode_audio

    # Decision events the demux thread waits on before processing
    # packets; a flag set at construction time counts as decided.
    self._video_decode_decided = threading.Event()
    if decode_video:
        self._video_decode_decided.set()
    self._audio_decode_decided = threading.Event()
    if decode_audio:
        self._audio_decode_decided.set()

    self._container: av.InputContainer | None = None
    self._video_stream: DemuxedStream | None = None
    self._audio_stream: DemuxedStream | None = None

    # Thread-safe queues (no event-loop involvement per put/get)
    self._packet_queue: queue.Queue | None = None
    self._source_queue: queue.Queue | None = None

    # Background workers and pipe file descriptors.
    self._demux_thread: threading.Thread | None = None
    self._writer_thread: threading.Thread | None = None
    self._feeder_task: asyncio.Task | None = None
    self._write_fd: int | None = None
    self._read_fd: int | None = None
|
||||
|
||||
@property
def video_stream(self) -> DemuxedStream | None:
    """Metadata of the first video stream, or ``None`` until ``start()`` discovers one."""
    return self._video_stream
|
||||
|
||||
@property
def audio_stream(self) -> DemuxedStream | None:
    """Metadata of the first audio stream, or ``None`` until ``start()`` discovers one."""
    return self._audio_stream
|
||||
|
||||
def enable_video_decode(self, enable: bool = True) -> None:
    """
    Enable or disable in-thread video decoding.

    Call this after ``start()`` returns (stream metadata is available)
    but before consuming packets via ``iter_packets()``. The demux
    thread waits for this signal before processing video packets.
    """
    self._decode_video = enable
    # Unblocks the demux thread's wait on the decode decision in start().
    self._video_decode_decided.set()
|
||||
|
||||
def enable_audio_decode(self, enable: bool = True) -> None:
    """
    Enable or disable in-thread audio decoding.

    Call this after ``start()`` returns (stream metadata is available)
    but before consuming packets via ``iter_packets()``. The demux
    thread waits for this signal before processing audio packets.
    """
    self._decode_audio = enable
    # Unblocks the demux thread's wait on the decode decision in start().
    self._audio_decode_decided.set()
|
||||
|
||||
# ── Writer side ──────────────────────────────────────────────────
|
||||
|
||||
async def _async_feeder(self, source: AsyncIterator[bytes]) -> None:
    """
    Async task: pull chunks from the async source and push them
    into a plain ``queue.Queue`` for the writer thread.

    This replaces a per-chunk ``run_coroutine_threadsafe`` pattern,
    batching the async-to-sync bridge into one task.

    ``queue.Queue.put()`` is a blocking call, so we use
    ``run_in_executor`` to avoid blocking the event loop when the
    queue is full. The sentinel is always enqueued on exit so the
    writer thread terminates.
    """
    loop = asyncio.get_running_loop()
    sq = self._source_queue
    try:
        async for chunk in source:
            await loop.run_in_executor(None, sq.put, chunk)
    except (asyncio.CancelledError, GeneratorExit):
        # Normal teardown paths -- nothing to report.
        pass
    except Exception as e:
        # Previously swallowed silently; log at debug so source failures
        # (network errors, upstream aborts) are diagnosable.
        logger.debug("[pyav_demuxer] Feeder error: %s", e)
    finally:
        sq.put(_SENTINEL)
|
||||
|
||||
def _write_chunks_sync(self) -> None:
    """
    Writer thread: pull pre-buffered chunks from ``_source_queue``
    and write to the OS pipe. No event-loop interaction.

    Exits (closing the write end, which delivers EOF to the demuxer)
    when the sentinel arrives, the 30 s queue timeout expires, or the
    pipe breaks because the reader went away.
    """
    write_fd = self._write_fd
    sq = self._source_queue
    try:
        while True:
            chunk = sq.get(timeout=30.0)
            if chunk is _SENTINEL:
                break
            # os.write may write fewer bytes than requested; loop until
            # the whole chunk is flushed into the pipe.
            view = memoryview(chunk)
            while view:
                written = os.write(write_fd, view)
                view = view[written:]
    except queue.Empty:
        # Source stalled for 30 s -- give up and EOF the reader.
        logger.debug("[pyav_demuxer] Writer timed out waiting for source data")
    except Exception as e:
        # Typically BrokenPipeError when the demuxer closed early;
        # previously swallowed silently.
        logger.debug("[pyav_demuxer] Writer error: %s", e)
    finally:
        try:
            os.close(write_fd)
        except OSError:
            pass
        self._write_fd = None
|
||||
|
||||
# ── Demux side ───────────────────────────────────────────────────
|
||||
|
||||
async def start(self, source: AsyncIterator[bytes]) -> None:
    """
    Start pipe-based streaming: writer thread feeds the pipe, a single
    demux thread opens the container, discovers streams, and begins
    enqueuing packets.

    After this returns, ``video_stream`` and ``audio_stream`` are
    populated and packets are being enqueued for ``iter_packets()``.
    """
    loop = asyncio.get_running_loop()

    # Create OS pipe
    self._read_fd, self._write_fd = os.pipe()

    # Source buffer queue (async feeder task -> writer thread)
    self._source_queue = queue.Queue(maxsize=256)

    # Kick off the async feeder task
    self._feeder_task = asyncio.create_task(self._async_feeder(source))

    # Start writer thread (drains source_queue into the pipe)
    self._writer_thread = threading.Thread(
        target=self._write_chunks_sync,
        daemon=True,
        name="pyav-writer",
    )
    self._writer_thread.start()

    # Packet queue for demux-thread -> async consumer bridge
    self._packet_queue = queue.Queue(maxsize=128)
    streams_ready = threading.Event()

    def _open_and_demux():
        """
        Single background thread: open container, discover streams,
        demux all packets.

        Critical: av.open(), _discover_streams(), and container.demux()
        all happen on the same file object in the same thread. This
        ensures the pipe read cursor is never lost between open and demux.
        """
        pkt_count = 0
        pq = self._packet_queue
        try:
            # Open container from read end of pipe
            read_file = os.fdopen(self._read_fd, "rb")
            self._read_fd = None  # ownership transferred

            self._container = av.open(
                read_file,
                mode="r",
                options={
                    # Tolerate mid-stream joins / broken data in live TS
                    "err_detect": "ignore_err",
                    "fflags": "+discardcorrupt+genpts",
                },
            )
            self._discover_streams()

            # Signal stream metadata is available
            streams_ready.set()

            if self._video_stream is None and self._audio_stream is None:
                logger.warning("[pyav_demuxer] No video or audio streams found")
                return

            # Select streams to demux
            streams_to_demux = []
            if self._video_stream is not None:
                streams_to_demux.append(self._container.streams[self._video_stream.index])
            if self._audio_stream is not None:
                streams_to_demux.append(self._container.streams[self._audio_stream.index])

            # Wait for the caller to decide on video/audio decoding
            # (if not already decided at construction time).
            # NOTE(review): after the 10 s timeout we proceed with the
            # flags as currently set -- a late enable_*_decode() call
            # would be ignored once the hot loop below starts.
            if not self._video_decode_decided.is_set():
                self._video_decode_decided.wait(timeout=10.0)
            if not self._audio_decode_decided.is_set():
                self._audio_decode_decided.wait(timeout=10.0)

            # Cache stream objects and time_base for the hot loop
            video_stream_obj = (
                self._container.streams[self._video_stream.index] if self._video_stream is not None else None
            )
            audio_stream_obj = (
                self._container.streams[self._audio_stream.index] if self._audio_stream is not None else None
            )

            video_tb_num = video_stream_obj.time_base.numerator if video_stream_obj else 1
            video_tb_den = video_stream_obj.time_base.denominator if video_stream_obj else 1
            audio_tb_num = audio_stream_obj.time_base.numerator if audio_stream_obj else 1
            audio_tb_den = audio_stream_obj.time_base.denominator if audio_stream_obj else 1

            # Snapshot the decode flags: the hot loop never re-reads them.
            decode_video = self._decode_video
            decode_audio = self._decode_audio

            # Demux and enqueue packets -- plain queue.put(), no event loop
            for packet in self._container.demux(*streams_to_demux):
                if packet.size == 0:
                    continue

                stream = self._container.streams[packet.stream_index]
                is_video = stream.type == "video"
                is_audio = stream.type == "audio"

                # Optionally decode video packets in-thread
                if decode_video and is_video and video_stream_obj is not None:
                    try:
                        frames = video_stream_obj.codec_context.decode(packet)
                    except Exception:
                        # Corrupt packet: skip it, keep demuxing.
                        frames = []
                    for frame in frames:
                        # Decoded frames carry no dts of their own;
                        # frame.pts is reused for both timestamps.
                        pq.put(
                            DemuxedPacket(
                                stream_index=packet.stream_index,
                                codec_type="video",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=int(packet.duration) if packet.duration is not None else 0,
                                is_keyframe=frame.key_frame,
                                time_base_num=video_tb_num,
                                time_base_den=video_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1

                # Optionally decode audio packets in-thread
                elif decode_audio and is_audio and audio_stream_obj is not None:
                    try:
                        frames = audio_stream_obj.codec_context.decode(packet)
                    except Exception:
                        # Corrupt packet: skip it, keep demuxing.
                        frames = []
                    for frame in frames:
                        pq.put(
                            DemuxedPacket(
                                stream_index=packet.stream_index,
                                codec_type="audio",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=int(packet.duration) if packet.duration is not None else 0,
                                is_keyframe=False,
                                time_base_num=audio_tb_num,
                                time_base_den=audio_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1

                else:
                    # Pass-through path: forward the compressed packet bytes.
                    tb_num = video_tb_num if is_video else audio_tb_num
                    tb_den = video_tb_den if is_video else audio_tb_den
                    pq.put(
                        DemuxedPacket(
                            stream_index=packet.stream_index,
                            codec_type=stream.type,
                            data=bytes(packet),
                            pts=int(packet.pts) if packet.pts is not None else 0,
                            dts=int(packet.dts) if packet.dts is not None else 0,
                            duration=int(packet.duration) if packet.duration is not None else 0,
                            is_keyframe=packet.is_keyframe,
                            time_base_num=tb_num,
                            time_base_den=tb_den,
                        )
                    )
                    pkt_count += 1

            # Flush the video decoder if we were decoding
            if decode_video and video_stream_obj is not None:
                try:
                    for frame in video_stream_obj.codec_context.decode(None):
                        pq.put(
                            DemuxedPacket(
                                stream_index=video_stream_obj.index,
                                codec_type="video",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=0,
                                is_keyframe=frame.key_frame,
                                time_base_num=video_tb_num,
                                time_base_den=video_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1
                except Exception:
                    pass

            # Flush the audio decoder if we were decoding
            if decode_audio and audio_stream_obj is not None:
                try:
                    for frame in audio_stream_obj.codec_context.decode(None):
                        pq.put(
                            DemuxedPacket(
                                stream_index=audio_stream_obj.index,
                                codec_type="audio",
                                data=b"",
                                pts=int(frame.pts) if frame.pts is not None else 0,
                                dts=int(frame.pts) if frame.pts is not None else 0,
                                duration=0,
                                is_keyframe=False,
                                time_base_num=audio_tb_num,
                                time_base_den=audio_tb_den,
                                decoded_frame=frame,
                            )
                        )
                        pkt_count += 1
                except Exception:
                    pass

            logger.info("[pyav_demuxer] Demux complete: %d packets", pkt_count)

        except Exception as e:
            # "Invalid data" is the expected failure mode for truncated or
            # mid-stream input, so it is not logged.
            if "Invalid data" not in str(e):
                logger.debug("[pyav_demuxer] Demux thread error: %s", e)
            # Ensure streams_ready is set even on error
            streams_ready.set()
        finally:
            # Always terminate iter_packets(), success or failure.
            pq.put(_SENTINEL)

    self._demux_thread = threading.Thread(target=_open_and_demux, daemon=True, name="pyav-demux")
    self._demux_thread.start()

    # Wait for stream discovery before returning.
    # Use run_in_executor to avoid blocking the event loop.
    await loop.run_in_executor(None, streams_ready.wait)
|
||||
|
||||
    async def iter_packets(self) -> AsyncIterator[DemuxedPacket]:
        """
        Yield demuxed packets from the background thread.

        Uses ``run_in_executor`` for the blocking ``queue.get()`` call,
        avoiding per-packet ``run_coroutine_threadsafe`` overhead.

        ``start()`` must be called first.

        Yields:
            DemuxedPacket: Packets produced by the background demux thread,
            in the order they were queued.

        Raises:
            RuntimeError: If ``start()`` has not been called yet.
        """
        if self._packet_queue is None:
            raise RuntimeError("Call start() before iter_packets()")

        loop = asyncio.get_running_loop()
        # Bind the queue to a local: it is touched once per packet below.
        pq = self._packet_queue

        try:
            while True:
                # Blocking queue.get() runs in the default executor so the
                # event loop stays responsive while waiting for packets.
                packet = await loop.run_in_executor(None, pq.get)
                if packet is _SENTINEL:
                    # Demux thread finished (or errored) -- end of stream.
                    break
                yield packet

            # Normal completion: give the demux thread a bounded window
            # to exit before tearing resources down in finally.
            if self._demux_thread is not None:
                self._demux_thread.join(timeout=5.0)

        except GeneratorExit:
            # Consumer closed the generator early (e.g. client disconnect).
            logger.debug("[pyav_demuxer] Generator closed")
        except asyncio.CancelledError:
            logger.debug("[pyav_demuxer] Cancelled")
        finally:
            # Always release threads/FDs/container, even on early exit.
            self._cleanup()
||||
    def _discover_streams(self) -> None:
        """Inspect the opened container and record stream metadata.

        Populates ``self._video_stream`` and ``self._audio_stream`` from the
        FIRST video/audio stream encountered; additional streams of the same
        type are ignored. No-op if the container has not been opened yet.
        """
        if self._container is None:
            return

        for stream in self._container.streams:
            if stream.type == "video" and self._video_stream is None:
                codec_ctx = stream.codec_context
                # Fall back to 24 fps when the container reports no rate.
                fps = float(stream.average_rate) if stream.average_rate else 24.0
                self._video_stream = DemuxedStream(
                    index=stream.index,
                    # codec_context may be absent; fall back to the stream codec.
                    codec_name=codec_ctx.name if codec_ctx else stream.codec.name,
                    codec_type="video",
                    width=codec_ctx.width if codec_ctx else 0,
                    height=codec_ctx.height if codec_ctx else 0,
                    fps=fps,
                    pixel_format=str(codec_ctx.pix_fmt) if codec_ctx and codec_ctx.pix_fmt else "yuv420p",
                    time_base_num=stream.time_base.numerator,
                    time_base_den=stream.time_base.denominator,
                    # stream.duration is in time_base units; 0.0 when unknown.
                    duration_seconds=float(stream.duration * stream.time_base) if stream.duration else 0.0,
                    extradata=bytes(codec_ctx.extradata) if codec_ctx and codec_ctx.extradata else b"",
                )
                logger.info(
                    "[pyav_demuxer] Video: %s %dx%d @%.1ffps",
                    self._video_stream.codec_name,
                    self._video_stream.width,
                    self._video_stream.height,
                    self._video_stream.fps,
                )

            elif stream.type == "audio" and self._audio_stream is None:
                codec_ctx = stream.codec_context
                self._audio_stream = DemuxedStream(
                    index=stream.index,
                    codec_name=codec_ctx.name if codec_ctx else stream.codec.name,
                    codec_type="audio",
                    sample_rate=codec_ctx.sample_rate if codec_ctx else 0,
                    channels=codec_ctx.channels if codec_ctx else 0,
                    time_base_num=stream.time_base.numerator,
                    time_base_den=stream.time_base.denominator,
                    duration_seconds=float(stream.duration * stream.time_base) if stream.duration else 0.0,
                    extradata=bytes(codec_ctx.extradata) if codec_ctx and codec_ctx.extradata else b"",
                )
                logger.info(
                    "[pyav_demuxer] Audio: %s %dHz %dch",
                    self._audio_stream.codec_name,
                    self._audio_stream.sample_rate,
                    self._audio_stream.channels,
                )
|
||||
    def _cleanup(self) -> None:
        """Stop threads and release all resources safely.

        The order is critical to avoid SIGSEGV from closing the container
        while the demux thread is still calling container.demux():

        1. Cancel the feeder task (stops new bytes being queued).
        2. Put a sentinel into the source queue so the writer thread
           unblocks and exits. The writer's ``finally`` closes the pipe
           write-end, which causes the demux thread to see EOF.
        3. Join the writer thread (wait for it to drain and exit).
        4. Join the demux thread (it finishes after pipe EOF).
        5. ONLY THEN close the container (no thread is using it).
        6. Close any remaining pipe FDs (read end, if still open).

        Idempotent: each step nulls out its reference, so calling this
        method again is a no-op for the already-released resources.
        """
        # 1. Cancel feeder task
        if self._feeder_task is not None:
            self._feeder_task.cancel()
            self._feeder_task = None

        # 2. Unblock writer thread so it exits and closes the pipe
        if self._source_queue is not None:
            try:
                # put_nowait: the queue may be full; failure is harmless
                # because the writer also exits on pipe errors.
                self._source_queue.put_nowait(_SENTINEL)
            except Exception:
                pass

        # 3. Join writer thread (it closes _write_fd in its finally block)
        if self._writer_thread is not None:
            self._writer_thread.join(timeout=5.0)
            self._writer_thread = None

        # 4. Join demux thread -- must finish before we close the container
        if self._demux_thread is not None:
            self._demux_thread.join(timeout=5.0)
            self._demux_thread = None

        # 5. Now safe to close the container (no thread is using it)
        if self._container is not None:
            try:
                self._container.close()
            except Exception:
                pass
            self._container = None

        # 6. Close any remaining pipe FDs
        for fd_name in ("_read_fd", "_write_fd"):
            fd = getattr(self, fd_name, None)
            if fd is not None:
                try:
                    os.close(fd)
                except OSError:
                    # Already closed elsewhere (e.g. by the writer thread).
                    pass
                setattr(self, fd_name, None)
||||
1121
mediaflow_proxy/remuxer/transcode_handler.py
Normal file
1121
mediaflow_proxy/remuxer/transcode_handler.py
Normal file
File diff suppressed because it is too large
Load Diff
1268
mediaflow_proxy/remuxer/transcode_pipeline.py
Normal file
1268
mediaflow_proxy/remuxer/transcode_pipeline.py
Normal file
File diff suppressed because it is too large
Load Diff
1728
mediaflow_proxy/remuxer/ts_muxer.py
Normal file
1728
mediaflow_proxy/remuxer/ts_muxer.py
Normal file
File diff suppressed because it is too large
Load Diff
403
mediaflow_proxy/remuxer/video_transcoder.py
Normal file
403
mediaflow_proxy/remuxer/video_transcoder.py
Normal file
@@ -0,0 +1,403 @@
|
||||
"""
|
||||
GPU-accelerated video transcoder with runtime detection.
|
||||
|
||||
Detects available hardware encoders/decoders at first use and selects
|
||||
the best available backend:
|
||||
- NVIDIA: h264_nvenc / hevc_cuvid (NVENC + CUDA)
|
||||
- Apple macOS: h264_videotoolbox / hevc_videotoolbox
|
||||
- Intel Linux: h264_vaapi / h264_qsv
|
||||
- Fallback: libx264 (CPU)
|
||||
|
||||
The transcoder operates at the packet/frame level via PyAV, suitable
|
||||
for integration into the streaming pipeline.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from fractions import Fraction
|
||||
|
||||
import av
|
||||
|
||||
from mediaflow_proxy.configs import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HWAccelType(Enum):
    """Hardware acceleration backends, in rough detection-priority order."""

    NONE = "none"  # CPU-only (libx264)
    NVIDIA = "nvidia"  # NVENC encode / CUVID decode
    VIDEOTOOLBOX = "videotoolbox"  # Apple macOS
    VAAPI = "vaapi"  # Intel Linux
    QSV = "qsv"  # Intel Quick Sync
|
||||
|
||||
@dataclass
class HWCapability:
    """Detected hardware acceleration capability."""

    # Selected backend; NONE means CPU-only encoding via libx264.
    accel_type: HWAccelType = HWAccelType.NONE
    # Encoder codec name to use for H.264 output.
    h264_encoder: str = "libx264"
    h264_decoder: str | None = None  # None = use default software decoder
    hevc_decoder: str | None = None  # None = use default software decoder
    # All hardware codecs that probed successfully (for logging/diagnostics).
    available_encoders: list[str] = field(default_factory=list)
    available_decoders: list[str] = field(default_factory=list)
||||
|
||||
|
||||
# Module-level singleton -- populated lazily on the first call to
# get_hw_capability() so codec probing only runs when actually needed.
_hw_capability: HWCapability | None = None
||||
|
||||
|
||||
def _probe_codec(name: str, mode: str = "w") -> bool:
    """
    Return True if a PyAV codec with the given name is available.

    Args:
        name: Codec name (e.g. 'h264_videotoolbox').
        mode: 'w' for encoder, 'r' for decoder.
    """
    try:
        av.Codec(name, mode)
    except Exception:
        # PyAV raises when the codec is unknown or not compiled in.
        return False
    return True
||||
|
||||
|
||||
def _detect_hw_capability() -> HWCapability:
    """
    Probe the runtime environment for hardware encoder/decoder availability.

    Checks NVIDIA, Apple VideoToolbox, Intel VAAPI/QSV in priority order.
    Falls back to libx264 CPU encoding.
    """
    capability = HWCapability()

    # Candidate codec names, probed both for backend selection and logging.
    encoder_candidates = (
        "h264_nvenc",
        "hevc_nvenc",
        "h264_videotoolbox",
        "hevc_videotoolbox",
        "h264_vaapi",
        "hevc_vaapi",
        "h264_qsv",
        "hevc_qsv",
    )
    decoder_candidates = (
        "h264_cuvid",
        "hevc_cuvid",
        "h264_qsv",
        "hevc_qsv",
    )

    capability.available_encoders = [name for name in encoder_candidates if _probe_codec(name, "w")]
    capability.available_decoders = [name for name in decoder_candidates if _probe_codec(name, "r")]

    encoders = capability.available_encoders
    decoders = capability.available_decoders

    # Priority 1: NVIDIA NVENC (+ CUVID decoders when present)
    if "h264_nvenc" in encoders:
        capability.accel_type = HWAccelType.NVIDIA
        capability.h264_encoder = "h264_nvenc"
        if "h264_cuvid" in decoders:
            capability.h264_decoder = "h264_cuvid"
        if "hevc_cuvid" in decoders:
            capability.hevc_decoder = "hevc_cuvid"
    # Priority 2: Apple VideoToolbox (its decoders engage via hwaccel, not here)
    elif "h264_videotoolbox" in encoders:
        capability.accel_type = HWAccelType.VIDEOTOOLBOX
        capability.h264_encoder = "h264_videotoolbox"
    # Priority 3: Intel VAAPI (Linux)
    elif "h264_vaapi" in encoders:
        capability.accel_type = HWAccelType.VAAPI
        capability.h264_encoder = "h264_vaapi"
    # Priority 4: Intel QSV (+ QSV decoders when present)
    elif "h264_qsv" in encoders:
        capability.accel_type = HWAccelType.QSV
        capability.h264_encoder = "h264_qsv"
        if "h264_qsv" in decoders:
            capability.h264_decoder = "h264_qsv"
        if "hevc_qsv" in decoders:
            capability.hevc_decoder = "hevc_qsv"
    else:
        # Fallback: CPU encoding
        capability.accel_type = HWAccelType.NONE
        capability.h264_encoder = "libx264"

    return capability
||||
|
||||
|
||||
def get_hw_capability() -> HWCapability:
    """Get the detected hardware acceleration capability (cached singleton).

    Probing (and a one-time log line describing the result) happens only
    on the first call; subsequent calls return the cached object.
    """
    global _hw_capability
    if _hw_capability is None:
        _hw_capability = _detect_hw_capability()
        # Log GPU details only when GPU use is both preferred and possible;
        # otherwise report the CPU encoder plus what hardware was found.
        if settings.transcode_prefer_gpu and _hw_capability.accel_type != HWAccelType.NONE:
            logger.info(
                "[video_transcoder] GPU acceleration: %s (encoder=%s, decoders=%s)",
                _hw_capability.accel_type.value,
                _hw_capability.h264_encoder,
                _hw_capability.available_decoders or "software",
            )
        else:
            logger.info(
                "[video_transcoder] Using CPU encoder: %s (available HW: encoders=%s, decoders=%s)",
                _hw_capability.h264_encoder,
                _hw_capability.available_encoders or "none",
                _hw_capability.available_decoders or "none",
            )
    return _hw_capability
||||
|
||||
|
||||
class VideoTranscoder:
    """
    In-process video transcoder using PyAV.

    Decodes input video packets and re-encodes to H.264 using the best
    available hardware encoder (or CPU libx264 fallback).

    Operates at the frame level: caller provides raw video packets (from
    PyAV demuxer), transcoder returns encoded H.264 NAL data suitable
    for the fMP4 muxer.
    """

    def __init__(
        self,
        input_codec_name: str,
        width: int,
        height: int,
        fps: float = 24.0,
        pixel_format: str = "yuv420p",
        force_software: bool = False,
    ) -> None:
        """
        Args:
            input_codec_name: Source codec name (e.g. 'hevc', 'h264').
            width: Source width in pixels (rounded up to even for H.264).
            height: Source height in pixels (rounded up to even for H.264).
            fps: Frame rate used for the encoder time base and GOP size.
            pixel_format: Source pixel format hint (frames are reformatted
                to yuv420p before encoding regardless).
            force_software: Skip GPU backends even when available.
        """
        hw = get_hw_capability()
        use_gpu = settings.transcode_prefer_gpu and hw.accel_type != HWAccelType.NONE and not force_software

        # --- Decoder ---
        # Prefer a matching hardware decoder when running on GPU.
        hw_decoder = None
        if use_gpu:
            if "hevc" in input_codec_name or "h265" in input_codec_name:
                hw_decoder = hw.hevc_decoder
            else:
                hw_decoder = hw.h264_decoder

        decoder_name = hw_decoder or input_codec_name
        self._decoder = av.CodecContext.create(decoder_name, "r")

        # --- Encoder ---
        encoder_name = hw.h264_encoder if use_gpu else "libx264"

        # H.264 requires even dimensions; round odd sizes up.
        enc_width = width if width % 2 == 0 else width + 1
        enc_height = height if height % 2 == 0 else height + 1

        self._encoder = av.CodecContext.create(encoder_name, "w")
        self._encoder.width = enc_width
        self._encoder.height = enc_height
        self._encoder.pix_fmt = "yuv420p"  # H.264 requires yuv420p
        # Millisecond-granularity time base derived from fps.
        self._encoder.time_base = Fraction(1, int(fps * 1000))
        self._encoder.framerate = Fraction(int(fps * 1000), 1000)
        self._encoder.bit_rate = _parse_bitrate(settings.transcode_video_bitrate)
        self._encoder.gop_size = int(fps * 2)  # Keyframe every ~2 seconds

        # Encoder options based on backend
        opts = {}
        if encoder_name == "libx264":
            opts["preset"] = settings.transcode_video_preset
            opts["tune"] = "zerolatency"
            opts["profile"] = "high"
        elif "nvenc" in encoder_name:
            opts["preset"] = "p4"  # NVENC preset (p1=fastest .. p7=slowest)
            opts["tune"] = "ll"  # Low latency
            opts["rc"] = "vbr"
        elif "videotoolbox" in encoder_name:
            opts["realtime"] = "1"
            opts["allow_sw"] = "1"  # Fallback to software if HW busy
        elif "vaapi" in encoder_name:
            opts["rc_mode"] = "VBR"
        elif "qsv" in encoder_name:
            opts["preset"] = "medium"

        self._encoder.options = opts
        self._encoder.open()

        width = enc_width
        height = enc_height

        self._input_codec = input_codec_name
        self._encoder_name = encoder_name
        self._frames_decoded = 0
        self._frames_encoded = 0
        self._width = width
        self._height = height
        # Tracks whether the standalone decoder was actually used (via decode_packet).
        # When the demux thread decodes frames in-thread (decode_video=True),
        # the standalone decoder is never fed packets and flushing it is wasted work.
        self._decoder_used = False
        self._flushed = False  # Prevents double-flush which causes SIGSEGV

        logger.info(
            "[video_transcoder] Initialized: %s -> %s (%s), %dx%d @%.1ffps %dk",
            input_codec_name,
            encoder_name,
            hw.accel_type.value,
            width,
            height,
            fps,
            self._encoder.bit_rate // 1000 if self._encoder.bit_rate else 0,
        )

    @property
    def codec_private_data(self) -> bytes | None:
        """H.264 extradata (SPS/PPS) from the encoder, for the fMP4 init segment."""
        if self._encoder.extradata:
            return bytes(self._encoder.extradata)
        return None

    @property
    def width(self) -> int:
        """Encoded output width (even-rounded)."""
        return self._width

    @property
    def height(self) -> int:
        """Encoded output height (even-rounded)."""
        return self._height

    def transcode_frame(self, frame: av.VideoFrame) -> list[tuple[bytes, bool, int, int]]:
        """
        Encode a decoded video frame to H.264.

        Args:
            frame: A decoded av.VideoFrame.

        Returns:
            List of (nal_data, is_keyframe, pts, dts) tuples. May be empty
            while the encoder buffers initial frames.
        """
        self._frames_decoded += 1
        output = []

        # Ensure correct pixel format for encoder
        if frame.format.name != self._encoder.pix_fmt:
            frame = frame.reformat(format=self._encoder.pix_fmt)

        try:
            for packet in self._encoder.encode(frame):
                self._frames_encoded += 1
                output.append(
                    (
                        bytes(packet),
                        packet.is_keyframe,
                        int(packet.pts) if packet.pts is not None else 0,
                        int(packet.dts) if packet.dts is not None else 0,
                    )
                )
        except av.error.InvalidDataError as e:
            # A single bad frame is dropped rather than aborting the stream.
            logger.debug("[video_transcoder] Encode error: %s", e)

        return output

    def decode_packet(self, packet: av.Packet) -> list[av.VideoFrame]:
        """Decode a video packet into frames (may be empty while buffering)."""
        self._decoder_used = True
        try:
            return list(self._decoder.decode(packet))
        except av.error.InvalidDataError as e:
            logger.debug("[video_transcoder] Decode error: %s", e)
            return []

    def flush(self) -> list[tuple[bytes, bool, int, int]]:
        """
        Flush encoder (and decoder, if it was used) buffers.

        When ``decode_video=True`` is used in PyAVDemuxer, the demux thread
        decodes frames using the container's codec context. In that case the
        standalone ``_decoder`` here is never fed any packets, so flushing
        it is skipped -- avoiding a stall that added ~5 s on some backends.

        Safe to call multiple times -- subsequent calls return an empty list.

        Returns:
            List of (nal_data, is_keyframe, pts, dts) tuples drained from
            the codec buffers.
        """
        # getattr default True: if __init__ raised before _flushed was set
        # (e.g. encoder open failed), treat the instance as already flushed
        # so close()/__del__ never raise AttributeError at teardown.
        if getattr(self, "_flushed", True):
            return []
        self._flushed = True

        output = []

        # Flush decoder only if it was actually used (via decode_packet)
        if self._decoder_used:
            try:
                for frame in self._decoder.decode(None):
                    self._frames_decoded += 1
                    if frame.format.name != self._encoder.pix_fmt:
                        frame = frame.reformat(format=self._encoder.pix_fmt)
                    for packet in self._encoder.encode(frame):
                        self._frames_encoded += 1
                        output.append(
                            (
                                bytes(packet),
                                packet.is_keyframe,
                                int(packet.pts) if packet.pts is not None else 0,
                                int(packet.dts) if packet.dts is not None else 0,
                            )
                        )
            except Exception as e:
                logger.debug("[video_transcoder] Decoder flush error: %s", e)
        else:
            logger.debug("[video_transcoder] Skipping decoder flush (decoder not used)")

        # Flush encoder
        try:
            for packet in self._encoder.encode(None):
                self._frames_encoded += 1
                output.append(
                    (
                        bytes(packet),
                        packet.is_keyframe,
                        int(packet.pts) if packet.pts is not None else 0,
                        int(packet.dts) if packet.dts is not None else 0,
                    )
                )
        except Exception as e:
            logger.debug("[video_transcoder] Encoder flush error: %s", e)

        logger.info(
            "[video_transcoder] Flushed: %d decoded, %d encoded total (decoder_used=%s)",
            self._frames_decoded,
            self._frames_encoded,
            self._decoder_used,
        )
        return output

    def close(self) -> None:
        """Release codec contexts.

        Flushes the encoder (if not already flushed) before releasing to avoid
        SIGSEGV when libx264 or hardware encoders have buffered frames at
        teardown time. Double-flushing is the most common cause of SIGSEGV
        in the transcode pipeline.

        PyAV codec contexts are released via garbage collection (no explicit
        close method), so we flush first to ensure native buffers are drained
        before the C-level codec is freed.
        """
        # flush() is idempotent -- safe to call even if already flushed,
        # and safe on a partially-constructed instance (see flush()).
        self.flush()
        # Release references -- GC will free the native codec contexts
        self._encoder = None
        self._decoder = None

    def __del__(self) -> None:
        # Best-effort teardown; close() is safe even if __init__ failed.
        self.close()
||||
|
||||
|
||||
def _parse_bitrate(bitrate_str: str) -> int:
|
||||
"""Parse a bitrate string like '4M', '2000k', '5000000' to int bits/s."""
|
||||
s = bitrate_str.strip().lower()
|
||||
if s.endswith("m"):
|
||||
return int(float(s[:-1]) * 1_000_000)
|
||||
if s.endswith("k"):
|
||||
return int(float(s[:-1]) * 1_000)
|
||||
return int(s)
|
||||
Reference in New Issue
Block a user