Files
UnHided/mediaflow_proxy/remuxer/transcode_pipeline.py
UrloMythus cfc6bbabc9 update
2026-02-19 20:15:03 +01:00

1269 lines
54 KiB
Python

"""
Streaming transcode pipelines producing fragmented MP4 on-the-fly.
Three pipelines are provided:
1. ``stream_transcode_fmp4`` -- **MKV fast path (continuous)**.
Uses the custom EBML demuxer for zero-copy video passthrough (H.264/H.265)
with audio-only transcoding. Best for MKV sources with browser-compatible
video but incompatible audio. Emits init + media fragments.
2. ``stream_segment_fmp4`` -- **MKV fast path (HLS segment)**.
Same EBML demuxer and video passthrough as above, but adapted for
per-segment HLS delivery: no init segment, ``start_decode_time_ms``
for correct tfdt placement, and frame-count bounding for precise
segment duration control.
3. ``stream_transcode_universal`` -- **Universal path via PyAV**.
Demuxes any container format (MKV, MP4, TS, etc.) using PyAV, optionally
re-encodes video (GPU-accelerated when available), and transcodes audio.
Required when the video codec needs re-encoding or the source is not MKV.
All pipelines produce on-the-fly fMP4 fragments suitable for streaming
via ``StreamingResponse``.
"""
import asyncio
import hashlib
import logging
from collections.abc import AsyncIterator
import av
from av.audio.resampler import AudioResampler
from mediaflow_proxy.remuxer.audio_transcoder import AudioTranscoder, get_ffmpeg_codec_name, needs_transcode
from mediaflow_proxy.remuxer.codec_utils import (
_PYAV_TO_MKV_AUDIO,
_PYAV_TO_MKV_VIDEO,
annexb_to_avcc,
ensure_avcc_extradata,
audio_needs_transcode as pyav_audio_needs_transcode,
video_needs_reencode as pyav_video_needs_reencode,
)
from mediaflow_proxy.remuxer.ebml_parser import (
CODEC_ID_H264,
CODEC_ID_H265,
MKVTrack,
)
from mediaflow_proxy.remuxer.mkv_demuxer import MKVDemuxer, MKVHeader
from mediaflow_proxy.remuxer.mp4_muxer import FMP4StreamMuxer
from mediaflow_proxy.remuxer.pyav_demuxer import PyAVDemuxer
from mediaflow_proxy.remuxer.video_transcoder import VideoTranscoder
# Module-level logger; every pipeline in this file logs through it.
logger = logging.getLogger(__name__)
# Video timescale in ticks per second (90 kHz is standard for MPEG transport).
# The pipelines below convert millisecond timestamps and durations into ticks
# of this timescale before handing video samples to the fMP4 muxer.
_VIDEO_TIMESCALE = 90000
def derive_mp4_cache_key(
chat_id: str | int | None,
message_id: int | None,
file_id: str | None,
) -> str:
"""Derive a deterministic cache key for a transcoded stream."""
if file_id:
raw = f"mp4:file_id:{file_id}"
elif chat_id is not None and message_id is not None:
raw = f"mp4:chat:{chat_id}:msg:{message_id}"
else:
return ""
return hashlib.sha256(raw.encode()).hexdigest()[:16]
async def stream_transcode_fmp4(
    source: AsyncIterator[bytes],
    *,
    max_duration_ms: float | None = None,
) -> AsyncIterator[bytes]:
    """
    Stream MKV-to-fMP4 transcoding as an async generator (continuous mode).

    Video samples are passed through without decoding; audio is transcoded
    to AAC when ``needs_transcode`` says the source codec requires it and
    passed through otherwise. Used for continuous single-request fMP4
    streaming. Per-segment HLS delivery is handled by the separate segment
    pipelines (e.g. ``stream_segment_fmp4``), not by this function.

    Yields:
        1. First yield: fMP4 init segment (ftyp + moov)
        2. Subsequent yields: fMP4 media fragments (moof + mdat)

    Args:
        source: Async iterator of MKV bytes (e.g., from Telegram stream).
            If it exposes ``aclose`` it is closed when the pipeline ends.
        max_duration_ms: If set, stop emitting after this many milliseconds
            of *video* have been added to the muxer.

    Yields:
        bytes chunks forming a valid fMP4 byte stream.

    Note:
        Errors are not propagated to the consumer: cancellation and
        generator close are logged and end the stream, and any other
        exception is logged and ends the stream as well.
    """
    transcoder = None
    video_frame_count = 0
    audio_frame_count = 0
    fragment_count = 0
    bytes_out = 0
    cancelled = False
    try:
        # Phase 1: Parse MKV header
        demuxer = MKVDemuxer()
        header = await demuxer.read_header(source)
        if not header.tracks:
            raise ValueError("MKV file has no tracks")
        video_track = _find_video_track(header)
        audio_track = _find_audio_track(header)
        if video_track is None:
            raise ValueError("No supported video track found (need H.264 or H.265)")
        logger.info(
            "[pipeline] MKV header: duration=%.1fs, video=%s %dx%d, audio=%s %dHz %dch",
            header.duration_ms / 1000.0,
            video_track.codec_id,
            video_track.pixel_width,
            video_track.pixel_height,
            audio_track.codec_id if audio_track else "none",
            int(audio_track.sample_rate) if audio_track else 0,
            audio_track.channels if audio_track else 0,
        )

        # Phase 2: Set up audio transcoder
        if audio_track and needs_transcode(audio_track.codec_id):
            ffmpeg_codec = get_ffmpeg_codec_name(audio_track.codec_id)
            if ffmpeg_codec:
                transcoder = AudioTranscoder(
                    input_codec=ffmpeg_codec,
                    input_sample_rate=int(audio_track.sample_rate),
                    input_channels=audio_track.channels,
                    output_sample_rate=48000,
                    output_channels=2,
                    output_bitrate=192000,
                )
                logger.info("[pipeline] Audio transcoding: %s -> AAC", audio_track.codec_id)
            else:
                # No decoder mapping for this codec: drop audio entirely
                # rather than muxing something the player cannot decode.
                logger.warning("[pipeline] No FFmpeg codec for %s, skipping audio", audio_track.codec_id)
                audio_track = None
        audio_timescale = 48000 if transcoder else (int(audio_track.effective_sample_rate) if audio_track else 48000)

        # Phase 3: Build init segment with placeholder AAC config
        # We'll use a default AAC config (48kHz stereo LC) initially.
        # If the encoder provides a different one, the decoder should still handle it
        # since the actual config is embedded in the AAC frames.
        default_asc = bytes([0x11, 0x90])  # 48kHz stereo LC
        muxer = FMP4StreamMuxer(
            video_track=video_track,
            audio_sample_rate=48000 if transcoder else (int(audio_track.sample_rate) if audio_track else 48000),
            audio_channels=2 if transcoder else (audio_track.channels if audio_track else 2),
            audio_specific_config=default_asc,
            video_timescale=_VIDEO_TIMESCALE,
            audio_timescale=audio_timescale,
            duration_ms=header.duration_ms,
            fragment_duration_ms=2000.0,
        )
        # Check if we can get a real ASC from the encoder before building init
        if transcoder and transcoder.audio_specific_config:
            muxer.update_audio_specific_config(transcoder.audio_specific_config)
        init_segment = muxer.build_init_segment()
        logger.info("[pipeline] Init segment: %d bytes", len(init_segment))
        # Account for the init segment *before* yielding, exactly like media
        # fragments below, so the byte counter is correct even if the consumer
        # closes the generator while it is suspended at this first yield.
        bytes_out = len(init_segment)
        yield init_segment

        # Phase 4: Process frames and emit fragments
        last_video_ts_ms = 0.0
        emitted_duration_ms = 0.0
        async for frame in demuxer.iter_frames(source):
            if video_track and frame.track_number == video_track.track_number:
                # Video frame (passthrough -- no decode/re-encode)
                # Duration fallback chain: frame value -> track default ->
                # delta to previous frame -> 24 fps.
                duration_ms = frame.duration_ms
                if duration_ms <= 0 and video_track.frame_duration_ms > 0:
                    duration_ms = video_track.frame_duration_ms
                elif duration_ms <= 0:
                    if video_frame_count > 0 and frame.timestamp_ms > last_video_ts_ms:
                        duration_ms = frame.timestamp_ms - last_video_ts_ms
                    else:
                        duration_ms = 1000.0 / 24.0  # Fallback 24fps
                duration_ticks = max(1, int(duration_ms * _VIDEO_TIMESCALE / 1000.0))
                # Pass absolute PTS for CTS (composition time offset).
                # MKV timestamps are display-order (PTS); the muxer
                # accumulates DTS monotonically, so the difference is
                # written as CTS in the trun sample entry.
                pts_ticks = int(frame.timestamp_ms * _VIDEO_TIMESCALE / 1000.0)
                # Ensure AVCC format and skip non-VCL NAL-only samples
                sample_data = annexb_to_avcc(frame.data, filter_ps=False)
                if not sample_data or not _has_valid_video_nal(sample_data, video_track.codec_id):
                    continue
                muxer.add_video_sample(
                    sample_data,
                    duration_ticks,
                    frame.is_keyframe,
                    pts_ticks=pts_ticks,
                )
                last_video_ts_ms = frame.timestamp_ms
                video_frame_count += 1
                emitted_duration_ms += duration_ms
            elif audio_track and frame.track_number == audio_track.track_number:
                if transcoder:
                    aac_frames = transcoder.transcode(frame.data)
                    for aac_data in aac_frames:
                        muxer.add_audio_sample(aac_data, transcoder.frame_size)
                        audio_frame_count += 1
                else:
                    # Audio passthrough
                    duration_ms = frame.duration_ms
                    if duration_ms <= 0 and audio_track.frame_duration_ms > 0:
                        duration_ms = audio_track.frame_duration_ms
                    elif duration_ms <= 0:
                        # Last resort: assume 1024 samples per frame.
                        duration_ms = 1024.0 / audio_track.sample_rate * 1000.0
                    duration_ticks = max(1, int(duration_ms * audio_timescale / 1000.0))
                    muxer.add_audio_sample(frame.data, duration_ticks)
                    audio_frame_count += 1

            # Check if we should emit a fragment
            fragment = muxer.flush_fragment()
            if fragment:
                fragment_count += 1
                bytes_out += len(fragment)
                yield fragment

            # Duration bounding (e.g. for max_duration_ms safety net)
            if max_duration_ms is not None and emitted_duration_ms >= max_duration_ms:
                logger.debug(
                    "[pipeline] Duration limit reached (%.0fms >= %.0fms), stopping",
                    emitted_duration_ms,
                    max_duration_ms,
                )
                break

        # Flush remaining audio from transcoder
        if transcoder:
            for aac_data in transcoder.flush():
                muxer.add_audio_sample(aac_data, transcoder.frame_size)
                audio_frame_count += 1

        # Emit final fragment
        final = muxer.flush_final()
        if final:
            fragment_count += 1
            bytes_out += len(final)
            yield final
    except (GeneratorExit, asyncio.CancelledError):
        cancelled = True
        logger.info("[pipeline] Client disconnected, stopping pipeline")
    except Exception as exc:
        # Source exhausted with 0 bytes during header parsing = client disconnect
        if bytes_out == 0 and "prematurely" in str(exc):
            cancelled = True
            logger.info("[pipeline] Client disconnected before streaming started")
        else:
            logger.exception("[pipeline] Pipeline error")
    finally:
        if transcoder:
            transcoder.close()
        # Close the source generator to stop the upstream download
        if hasattr(source, "aclose"):
            try:
                await source.aclose()
            except Exception:
                # Best-effort cleanup: never let a failing close mask the
                # pipeline outcome, but leave a trace for debugging.
                logger.debug("[pipeline] Error while closing source", exc_info=True)
        if cancelled:
            logger.info(
                "[pipeline] Cancelled after %d video, %d audio frames, %d fragments, %d bytes out",
                video_frame_count,
                audio_frame_count,
                fragment_count,
                bytes_out,
            )
        else:
            logger.info(
                "[pipeline] Complete: %d video, %d audio frames, %d fragments, %d bytes out",
                video_frame_count,
                audio_frame_count,
                fragment_count,
                bytes_out,
            )
# =============================================================================
# MKV fast-path HLS segment pipeline
# =============================================================================
async def stream_segment_fmp4(
    source: AsyncIterator[bytes],
    *,
    start_decode_time_ms: float = 0.0,
    max_duration_ms: float | None = None,
) -> AsyncIterator[bytes]:
    """
    MKV fast-path pipeline for a single HLS fMP4 media segment.

    Adapted from ``stream_transcode_fmp4`` (continuous mode) but designed
    for per-segment HLS delivery:

    - **No init segment** -- HLS serves init separately.
    - **start_decode_time_ms** is handed to the muxer so the segment's
      tfdt is placed on the global HLS timeline.
    - **Frame-count bounding** stops after the computed number of video
      and audio frames for the segment duration.
    - **End-of-window clamp** drops video frames whose timestamp is at or
      past ``start_decode_time_ms + max_duration_ms``.
    - **Video passthrough** using the MKV frame timestamps as PTS (no
      encoder involved).
    - **AudioTranscoder** to AAC when the source codec needs transcoding.

    Args:
        source: Async iterator of bytes (seek_header + cluster data). If it
            exposes ``aclose`` it is closed when the pipeline ends.
        start_decode_time_ms: Absolute time of segment start on the HLS
            timeline. Passed to the muxer and used to compute the segment
            end time and the AAC frame-count window.
        max_duration_ms: Segment duration in ms. Controls frame-count
            bounding for both video and audio. ``None`` disables bounding.

    Yields:
        fMP4 media fragments (moof + mdat) -- no init segment.

    Note:
        Errors are not propagated to the consumer: cancellation and
        generator close are logged and end the stream, and any other
        exception is logged and ends the stream as well.
    """
    transcoder = None
    video_frame_count = 0
    audio_frame_count = 0
    fragment_count = 0
    bytes_out = 0
    cancelled = False
    try:
        # Phase 1: Parse MKV header from seek_header + cluster bytes
        demuxer = MKVDemuxer()
        header = await demuxer.read_header(source)
        if not header.tracks:
            raise ValueError("MKV segment source has no tracks")
        video_track = _find_video_track(header)
        audio_track = _find_audio_track(header)
        if video_track is None:
            raise ValueError("No supported video track found for segment pipeline")
        logger.info(
            "[seg_fmp4] Segment %.1f-%.1fs: video=%s %dx%d, audio=%s %dHz %dch",
            start_decode_time_ms / 1000.0,
            (start_decode_time_ms + (max_duration_ms or 0)) / 1000.0,
            video_track.codec_id,
            video_track.pixel_width,
            video_track.pixel_height,
            audio_track.codec_id if audio_track else "none",
            int(audio_track.sample_rate) if audio_track else 0,
            audio_track.channels if audio_track else 0,
        )

        # Phase 2: Set up audio transcoder
        if audio_track and needs_transcode(audio_track.codec_id):
            ffmpeg_codec = get_ffmpeg_codec_name(audio_track.codec_id)
            if ffmpeg_codec:
                transcoder = AudioTranscoder(
                    input_codec=ffmpeg_codec,
                    input_sample_rate=int(audio_track.sample_rate),
                    input_channels=audio_track.channels,
                    output_sample_rate=48000,
                    output_channels=2,
                    output_bitrate=192000,
                )
                logger.info("[seg_fmp4] Audio transcoding: %s -> AAC", audio_track.codec_id)
            else:
                # No decoder mapping for this codec: drop audio entirely.
                logger.warning("[seg_fmp4] No FFmpeg codec for %s, skipping audio", audio_track.codec_id)
                audio_track = None
        audio_timescale = 48000 if transcoder else (int(audio_track.effective_sample_rate) if audio_track else 48000)
        aac_frame_size = transcoder.frame_size if transcoder else 1024
        audio_sr = 48000 if transcoder else (int(audio_track.sample_rate) if audio_track else 48000)

        # Phase 3: Build muxer (NO init segment emitted -- HLS serves it separately)
        default_asc = bytes([0x11, 0x90])  # 48kHz stereo LC
        muxer = FMP4StreamMuxer(
            video_track=video_track,
            audio_sample_rate=audio_sr,
            audio_channels=2 if transcoder else (audio_track.channels if audio_track else 2),
            audio_specific_config=default_asc,
            video_timescale=_VIDEO_TIMESCALE,
            audio_timescale=audio_timescale,
            duration_ms=max_duration_ms or 0.0,
            fragment_duration_ms=2000.0,
            start_decode_time_ms=start_decode_time_ms,
            audio_frame_size=aac_frame_size,
        )
        if transcoder and transcoder.audio_specific_config:
            muxer.update_audio_specific_config(transcoder.audio_specific_config)

        # Phase 4: Compute frame-count limits for precise segment bounding
        fps = 24.0
        if video_track.default_duration_ns > 0:
            fps = 1_000_000_000.0 / video_track.default_duration_ns
        elif video_track.frame_duration_ms > 0:
            fps = 1000.0 / video_track.frame_duration_ms
        _max_video_frames: int | None = None
        _max_audio_frames: int | None = None  # None means "no cap"
        segment_end_ms: float | None = None
        if max_duration_ms is not None:
            segment_end_ms = start_decode_time_ms + max_duration_ms
            _max_video_frames = round(max_duration_ms * fps / 1000.0)
            # Audio frame-count: tile AAC frames across the timeline and
            # count how many whole frames fall between the segment start
            # and end. Intended to match the muxer's audio decode-time
            # alignment so consecutive segments stay gapless.
            if aac_frame_size > 0 and audio_sr > 0:
                frames_before_start = int(start_decode_time_ms / 1000.0 * audio_sr / aac_frame_size)
                frames_before_end = int(segment_end_ms / 1000.0 * audio_sr / aac_frame_size)
                _max_audio_frames = frames_before_end - frames_before_start
        logger.info(
            "[seg_fmp4] Frame limits: video=%s @%.1ffps, audio=%s (frame_size=%d, sr=%d), window=%.3f-%.3fs",
            _max_video_frames,
            fps,
            _max_audio_frames,
            aac_frame_size,
            audio_sr,
            start_decode_time_ms / 1000.0,
            segment_end_ms / 1000.0 if segment_end_ms is not None else -1.0,
        )

        # Phase 5: Process frames
        last_video_ts_ms = 0.0
        _video_limit_hit = False
        _audio_limit_hit = False
        _got_keyframe = False  # Must see a keyframe before emitting any video
        async for frame in demuxer.iter_frames(source):
            # ── Video frame (passthrough) ──
            if video_track and frame.track_number == video_track.track_number:
                # Segment end clamp (critical for monotonic HLS PTS):
                # with overlapped MKV byte ranges, we may receive extra video
                # blocks from the next segment's cluster. Drop anything at or
                # past segment_end to prevent timestamp regressions at
                # segment boundaries. (Only the end bound is enforced here.)
                if segment_end_ms is not None and frame.timestamp_ms >= segment_end_ms:
                    _video_limit_hit = True
                    if _audio_limit_hit or audio_track is None:
                        break
                    continue
                # Check frame-count limit
                if _max_video_frames is not None and video_frame_count >= _max_video_frames:
                    _video_limit_hit = True
                    if _audio_limit_hit or audio_track is None:
                        break
                    continue
                # Ensure AVCC length-prefixed NAL format for fMP4.
                # Some MKV files store frames in mixed Annex B / AVCC.
                # annexb_to_avcc converts start-code NALUs to length-
                # prefixed and is a no-op for already-AVCC data.
                # filter_ps=False preserves in-band SPS/PPS updates.
                sample_data = annexb_to_avcc(frame.data, filter_ps=False)
                if not sample_data:
                    continue
                # Skip non-VCL samples (SEI-only, filler, padding).
                if not _has_valid_video_nal(sample_data, video_track.codec_id):
                    continue
                # Gate on first keyframe: fMP4 segments must start with a sync sample.
                if not _got_keyframe:
                    if not frame.is_keyframe:
                        continue
                    _got_keyframe = True
                    logger.info(
                        "[seg_fmp4] First keyframe at %.3fs",
                        frame.timestamp_ms / 1000.0,
                    )
                # Compute duration. Fallback chain: frame value -> track
                # default -> delta to previous frame -> 1/fps.
                duration_ms = frame.duration_ms
                if duration_ms <= 0 and video_track.frame_duration_ms > 0:
                    duration_ms = video_track.frame_duration_ms
                elif duration_ms <= 0:
                    if video_frame_count > 0 and frame.timestamp_ms > last_video_ts_ms:
                        duration_ms = frame.timestamp_ms - last_video_ts_ms
                    else:
                        duration_ms = 1000.0 / fps
                duration_ticks = max(1, int(duration_ms * _VIDEO_TIMESCALE / 1000.0))
                # Absolute PTS from MKV Cluster timestamps -- exact, no
                # encoder involved, no drift.
                pts_ticks = int(frame.timestamp_ms * _VIDEO_TIMESCALE / 1000.0)
                muxer.add_video_sample(
                    sample_data,
                    duration_ticks,
                    frame.is_keyframe,
                    pts_ticks=pts_ticks,
                )
                last_video_ts_ms = frame.timestamp_ms
                video_frame_count += 1
            # ── Audio frame ──
            elif audio_track and frame.track_number == audio_track.track_number:
                # Check frame-count limit. video_track is guaranteed to be
                # set here (validated in Phase 1), so only the video limit
                # decides whether we can stop reading.
                if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
                    _audio_limit_hit = True
                    if _video_limit_hit:
                        break
                    continue
                if transcoder:
                    aac_frames = transcoder.transcode(frame.data)
                    for aac_data in aac_frames:
                        # One input packet may yield several AAC frames;
                        # stop mid-batch once the cap is reached.
                        if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
                            _audio_limit_hit = True
                            break
                        muxer.add_audio_sample(aac_data, transcoder.frame_size)
                        audio_frame_count += 1
                else:
                    # Audio passthrough
                    duration_ms = frame.duration_ms
                    if duration_ms <= 0 and audio_track.frame_duration_ms > 0:
                        duration_ms = audio_track.frame_duration_ms
                    elif duration_ms <= 0:
                        # Last resort: assume 1024 samples per frame.
                        duration_ms = 1024.0 / audio_track.sample_rate * 1000.0
                    duration_ticks = max(1, int(duration_ms * audio_timescale / 1000.0))
                    muxer.add_audio_sample(frame.data, duration_ticks)
                    audio_frame_count += 1

            # Check if we should emit a fragment
            fragment = muxer.flush_fragment()
            if fragment:
                fragment_count += 1
                bytes_out += len(fragment)
                yield fragment

            # Early exit when both tracks hit their limits (or there is no
            # audio track to wait for).
            if _video_limit_hit and (_audio_limit_hit or audio_track is None):
                break

        # Flush remaining audio from transcoder
        if transcoder and not _audio_limit_hit:
            for aac_data in transcoder.flush():
                if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
                    break
                muxer.add_audio_sample(aac_data, transcoder.frame_size)
                audio_frame_count += 1

        # Emit final fragment
        final = muxer.flush_final()
        if final:
            fragment_count += 1
            bytes_out += len(final)
            yield final
    except (GeneratorExit, asyncio.CancelledError):
        cancelled = True
        logger.info("[seg_fmp4] Client disconnected, stopping segment pipeline")
    except Exception as exc:
        # Source exhausted before any output is treated as a client disconnect.
        if bytes_out == 0 and "prematurely" in str(exc):
            cancelled = True
            logger.info("[seg_fmp4] Client disconnected before segment started")
        else:
            logger.exception("[seg_fmp4] Segment pipeline error")
    finally:
        if transcoder:
            transcoder.close()
        # Close the source generator to stop the upstream download
        if hasattr(source, "aclose"):
            try:
                await source.aclose()
            except Exception:
                # Best-effort cleanup: never let a failing close mask the
                # pipeline outcome, but leave a trace for debugging.
                logger.debug("[seg_fmp4] Error while closing source", exc_info=True)
        if cancelled:
            logger.info(
                "[seg_fmp4] Cancelled: %d video, %d audio frames, %d fragments, %d bytes",
                video_frame_count,
                audio_frame_count,
                fragment_count,
                bytes_out,
            )
        else:
            logger.info(
                "[seg_fmp4] Complete: %d video, %d audio frames, %d fragments, %d bytes",
                video_frame_count,
                audio_frame_count,
                fragment_count,
                bytes_out,
            )
# =============================================================================
# Helper functions
# =============================================================================
# H.264 VCL NAL unit types (actual video slices):
# 1 = non-IDR slice, 2/3/4 = data partitions A/B/C, 5 = IDR slice.
_H264_VCL_TYPES = frozenset({1, 2, 3, 4, 5})  # Non-IDR, Part A/B/C, IDR
# HEVC NAL unit types accepted as VCL: 0-21, i.e. TRAIL_N (0) through
# CRA_NUT (21). The range also contains the BLA (16-18) and IDR (19-20)
# types, plus the reserved values 10-15.
_HEVC_VCL_TYPES = frozenset(range(0, 22))
def _has_valid_video_nal(data: bytes, codec_id: str = CODEC_ID_H264) -> bool:
    """
    Report whether a length-prefixed (AVCC/HVCC) sample holds a VCL NAL unit.

    The sample is walked as a sequence of ``[4-byte big-endian length][NAL]``
    records. A NAL qualifies when its forbidden_zero_bit is clear and its
    type is in ``_HEVC_VCL_TYPES`` (when ``codec_id`` is ``CODEC_ID_H265``)
    or in ``_H264_VCL_TYPES`` (any other ``codec_id``).

    Scanning stops at the first zero-length or out-of-bounds length field;
    everything from that point on is ignored.

    Returns:
        True as soon as one qualifying NAL is found, otherwise False
        (including for inputs shorter than 5 bytes).
    """
    total = len(data)
    if total < 5:
        return False

    hevc = codec_id == CODEC_ID_H265
    accepted = _HEVC_VCL_TYPES if hevc else _H264_VCL_TYPES

    offset = 0
    while offset + 4 < total:
        payload_len = int.from_bytes(data[offset : offset + 4], "big")
        # A zero or overrunning length means the remainder is not valid
        # length-prefixed data; give up on the sample.
        if payload_len <= 0 or payload_len > total - offset - 4:
            return False

        header_byte = data[offset + 4]
        forbidden_bit_clear = (header_byte >> 7) & 1 == 0
        # HEVC: type is bits 1-6 of the first header byte; H.264: low 5 bits.
        unit_type = (header_byte >> 1) & 0x3F if hevc else header_byte & 0x1F
        if forbidden_bit_clear and unit_type in accepted:
            return True

        offset += 4 + payload_len
    return False
def _find_video_track(header: MKVHeader) -> MKVTrack | None:
    """Return the first video track whose codec is H.264 or H.265, else None."""
    supported_codecs = (CODEC_ID_H264, CODEC_ID_H265)
    return next(
        (
            candidate
            for candidate in header.tracks
            if candidate.is_video and candidate.codec_id in supported_codecs
        ),
        None,
    )
def _find_audio_track(header: MKVHeader) -> MKVTrack | None:
    """Return the first track flagged as audio (any codec), else None."""
    return next(
        (candidate for candidate in header.tracks if candidate.is_audio),
        None,
    )
# =============================================================================
# Universal transcode pipeline (PyAV-based, any container, video re-encoding)
# =============================================================================
def _build_synthetic_mkv_track(
    codec_id: str,
    codec_private: bytes,
    *,
    width: int = 0,
    height: int = 0,
    sample_rate: float = 0.0,
    channels: int = 0,
    track_type: int = 1,
    track_number: int = 1,
    default_duration_ns: int = 0,
) -> MKVTrack:
    """
    Wrap stream metadata in an ``MKVTrack`` for the fMP4 muxer.

    The muxer is fed ``MKVTrack`` objects, so callers that obtain their
    stream information elsewhere (e.g. from PyAV) use this adapter instead
    of changing the muxer. ``width`` / ``height`` are stored as the track's
    ``pixel_width`` / ``pixel_height``; all other arguments map to the
    ``MKVTrack`` field of the same name.
    """
    track_fields = {
        "track_number": track_number,
        "track_type": track_type,
        "codec_id": codec_id,
        "codec_private": codec_private,
        "pixel_width": width,
        "pixel_height": height,
        "sample_rate": sample_rate,
        "channels": channels,
        "default_duration_ns": default_duration_ns,
    }
    return MKVTrack(**track_fields)
def _update_init_extradata(
    video_transcoder: VideoTranscoder,
    video_track: MKVTrack,
    first_nal_data: bytes,
) -> None:
    """
    Refresh ``video_track.codec_private`` with the encoder's SPS/PPS.

    The encoder's own ``codec_private_data`` is preferred. When that is
    empty (the case this helper exists for: encoders that only emit
    parameter sets in-band), SPS/PPS are extracted from ``first_nal_data``,
    the Annex B output of the first encoded frame. Whatever is found is
    normalised with ``ensure_avcc_extradata`` and stored on the track so an
    init segment built afterwards carries it.

    If neither source yields any data the track is left untouched.
    """
    from mediaflow_proxy.remuxer.codec_utils import ensure_avcc_extradata, extract_sps_pps_from_annexb

    # Encoder context first; fall back to the in-band parameter sets.
    config = video_transcoder.codec_private_data or extract_sps_pps_from_annexb(first_nal_data)
    if not config:
        return

    avcc_config = ensure_avcc_extradata(config)
    video_track.codec_private = avcc_config
    logger.info(
        "[universal] Updated init extradata from encoder: %d bytes",
        len(avcc_config),
    )
async def stream_transcode_universal(
source: AsyncIterator[bytes],
*,
force_video_reencode: bool = False,
max_duration_ms: float | None = None,
start_decode_time_ms: float = 0.0,
emit_init_segment: bool = True,
force_software_encode: bool = False,
) -> AsyncIterator[bytes]:
"""
Universal transcode pipeline using PyAV for demuxing and encoding.
Handles any container format and optionally re-encodes video using
GPU-accelerated codecs when available.
Args:
source: Async iterator of container bytes (MKV, MP4, TS, etc.).
force_video_reencode: When True, always re-encode video even if
the codec is normally browser-compatible (e.g. H.264). Useful
for live MPEG-TS sources with corrupt bitstreams.
max_duration_ms: If set, stop emitting after this many milliseconds
of media have been produced.
start_decode_time_ms: Initial decode time offset for fMP4 timestamps.
emit_init_segment: Whether to yield the fMP4 init segment (ftyp+moov).
force_software_encode: When True, force ``libx264`` software encoder
instead of hardware (VideoToolbox/NVENC). Used for HLS per-segment
transcoding to avoid SIGSEGV crashes with hardware encoders.
Yields:
bytes chunks forming a valid fMP4 byte stream.
"""
video_transcoder = None
audio_encoder = None
audio_resampler = None
video_frame_count = 0
audio_frame_count = 0
fragment_count = 0
bytes_out = 0
cancelled = False
_audio_flushed = False # Prevents double-flush SIGSEGV on teardown
# Both video and audio decode decisions are deferred until after stream
# discovery, so the demux thread only decodes what's actually needed.
# Video decoding is only required when the codec needs re-encoding;
# passthrough uses raw packets. Audio decoding is needed when the
# codec is not browser-compatible (e.g. ac3 -> aac).
demuxer = PyAVDemuxer(decode_video=False, decode_audio=False)
try:
# Phase 1: Start demuxing -- opens the container in a background thread,
# discovers streams, and starts enqueuing packets. Awaits until stream
# metadata is available.
await demuxer.start(source)
vs = demuxer.video_stream
aus = demuxer.audio_stream
if vs is None and aus is None:
demuxer.enable_video_decode(False)
demuxer.enable_audio_decode(False)
raise ValueError("No video or audio streams found in source")
# Phase 2: Determine what needs transcoding
do_video_transcode = False
do_audio_transcode = False
video_mkv_codec = ""
audio_mkv_codec = ""
if vs:
video_mkv_codec = _PYAV_TO_MKV_VIDEO.get(vs.codec_name, vs.codec_name)
do_video_transcode = (
force_video_reencode
or pyav_video_needs_reencode(vs.codec_name)
or pyav_video_needs_reencode(video_mkv_codec)
)
if aus:
audio_mkv_codec = _PYAV_TO_MKV_AUDIO.get(aus.codec_name, aus.codec_name)
do_audio_transcode = pyav_audio_needs_transcode(aus.codec_name) or pyav_audio_needs_transcode(
audio_mkv_codec
)
# Tell the demux thread whether to decode video/audio in-thread.
# This must be called before consuming packets via iter_packets().
demuxer.enable_video_decode(do_video_transcode)
demuxer.enable_audio_decode(do_audio_transcode)
logger.info(
"[universal] Streams: video=%s (reencode=%s), audio=%s (transcode=%s)",
vs.codec_name if vs else "none",
do_video_transcode,
aus.codec_name if aus else "none",
do_audio_transcode,
)
# Phase 3: Set up transcoders
if do_video_transcode and vs:
video_transcoder = VideoTranscoder(
input_codec_name=vs.codec_name,
width=vs.width,
height=vs.height,
fps=vs.fps or 24.0,
pixel_format=vs.pixel_format or "yuv420p",
force_software=force_software_encode,
)
# Audio encoding: since audio is decoded in the demux thread, we only
# need a resampler and encoder here. No standalone decoder needed.
audio_encoder = None
audio_resampler = None
if do_audio_transcode and aus:
audio_encoder = av.CodecContext.create("aac", "w")
audio_encoder.sample_rate = 48000
audio_encoder.layout = "stereo"
audio_encoder.format = av.AudioFormat("fltp")
audio_encoder.bit_rate = 192000
audio_encoder.open()
audio_resampler = AudioResampler(
format="fltp",
layout="stereo",
rate=48000,
)
logger.info(
"[universal] Audio transcoding: %s %dHz %dch -> aac 48000Hz 2ch @192k",
aus.codec_name,
aus.sample_rate or 0,
aus.channels or 0,
)
# Phase 4: Build init segment
# When transcoding video, force output codec to H.264 regardless
# of whether the encoder has produced extradata yet (libx264 emits
# SPS/PPS only after the first encode call).
if do_video_transcode and video_transcoder:
raw_extradata = video_transcoder.codec_private_data or b""
video_codec_private = ensure_avcc_extradata(raw_extradata) if raw_extradata else b""
video_track_codec = CODEC_ID_H264 # Output is always H.264
elif vs:
# Ensure extradata is in avcC format (MPEG-TS returns Annex B)
video_codec_private = ensure_avcc_extradata(vs.extradata)
video_track_codec = video_mkv_codec or CODEC_ID_H264
else:
video_codec_private = b""
video_track_codec = CODEC_ID_H264
video_track = None
if vs:
output_w = video_transcoder.width if video_transcoder else vs.width
output_h = video_transcoder.height if video_transcoder else vs.height
frame_dur_ns = int(1_000_000_000 / (vs.fps or 24.0))
video_track = _build_synthetic_mkv_track(
codec_id=video_track_codec,
codec_private=video_codec_private,
width=output_w,
height=output_h,
track_type=1,
track_number=1,
default_duration_ns=frame_dur_ns,
)
audio_sr = 48000 if audio_encoder else (aus.sample_rate if aus else 48000)
audio_ch = 2 if audio_encoder else (aus.channels if aus else 2)
default_asc = bytes([0x11, 0x90]) # 48kHz stereo LC
if not video_track:
raise ValueError("No video track available for muxing")
# AAC frame size (samples per frame), typically 1024
aac_frame_size = audio_encoder.frame_size if audio_encoder and audio_encoder.frame_size else 1024
muxer = FMP4StreamMuxer(
video_track=video_track,
audio_sample_rate=audio_sr,
audio_channels=audio_ch,
audio_specific_config=default_asc,
video_timescale=_VIDEO_TIMESCALE,
audio_timescale=audio_sr,
# Cap duration: live/unknown streams report 0 or garbage values.
# Anything over 24h is almost certainly wrong for a real file.
duration_ms=vs.duration_seconds * 1000.0
if vs and vs.duration_seconds and 0 < vs.duration_seconds < 86400
else 0.0,
fragment_duration_ms=2000.0,
start_decode_time_ms=start_decode_time_ms,
# Pass AAC frame size so the muxer can align the audio tfdt to
# exact frame boundaries, preventing DTS discontinuities at
# HLS segment borders.
audio_frame_size=aac_frame_size,
)
if audio_encoder and audio_encoder.extradata:
muxer.update_audio_specific_config(bytes(audio_encoder.extradata))
# For hardware encoders (VideoToolbox, NVENC), SPS/PPS extradata may
# not be available until the first frame is encoded. Defer the init
# segment emission until after the first encoded video packet so the
# init segment always contains valid codec configuration.
_init_emitted = False
if emit_init_segment and not do_video_transcode:
# No re-encoding: extradata comes from the source stream, so we
# can emit the init segment immediately.
init_segment = muxer.build_init_segment()
logger.info("[universal] Init segment: %d bytes", len(init_segment))
yield init_segment
bytes_out = len(init_segment)
_init_emitted = True
# Phase 5: Process packets
# For video passthrough: skip until first keyframe and rebase DTS/PTS
# so fMP4 timestamps start from 0 (live TS streams have huge absolute values).
_video_dts_base: float | None = None # first video DTS in seconds
_got_keyframe = do_video_transcode # transcoded output always starts with keyframe
_emitted_video_duration_ms = 0.0 # accumulated video duration for monitoring
# Offset (video timescale ticks) that maps rebased-to-0 encoder PTS
# onto the absolute timeline expected by the muxer. When producing
# HLS segments starting at e.g. 25 s, the muxer's tfdt is at 25 s
# but the encoder PTS starts at 0. Adding this offset realigns them.
_start_offset_ticks = int(start_decode_time_ms * _VIDEO_TIMESCALE / 1000.0)
# Pre-compute per-frame duration ticks for re-encoded video (constant
# with zerolatency / no B-frames). Used for frame-count-based PTS.
_fps = (vs.fps or 24.0) if vs else 24.0
_reencode_dur_ticks = max(1, int(_VIDEO_TIMESCALE / _fps)) if vs else 0
# Encoder timebase denominator for setting sequential frame.pts on
# decoded frames before encoding. Keeps libx264's internal rate
# control consistent.
_enc_tb_den: int = 0
_enc_frame_dur: int = 0
if video_transcoder:
_enc_tb_den = video_transcoder._encoder.time_base.denominator
_enc_frame_dur = max(1, int(_enc_tb_den / _fps))
# ── Frame-count-based segment bounding ──────────────────────────
# When producing HLS segments, each segment MUST produce exactly
# the right number of video (and audio) frames so that the next
# segment's tfdt is contiguous. Relying on source PTS is fragile
# because mid-stream MKV byte ranges may not report accurate PTS.
#
# Video: round(duration_ms * fps / 1000) frames.
#
# Audio: compute by tiling AAC frames across the timeline. The
# audio tfdt of this segment is the cumulative count of AAC frames
# from time=0 up to start_decode_time_ms. The next segment's
# audio tfdt is the cumulative count up to end_time_ms. The
# difference gives the exact number of frames this segment must
# produce to keep segment borders gapless.
_max_video_frames: int | None = None
_max_audio_frames: int | None = None
if max_duration_ms is not None:
_max_video_frames = round(max_duration_ms * _fps / 1000.0)
if aac_frame_size > 0 and audio_sr > 0:
end_time_ms = start_decode_time_ms + max_duration_ms
# Count of whole AAC frames from t=0 to start and end
frames_before_start = int(start_decode_time_ms / 1000.0 * audio_sr / aac_frame_size)
frames_before_end = int(end_time_ms / 1000.0 * audio_sr / aac_frame_size)
_max_audio_frames = frames_before_end - frames_before_start
else:
_max_audio_frames = None # no cap
async def _process_packet(packet):
# Handle one demuxed packet: route it to the video or audio path, queue the
# resulting sample(s) on the muxer, then ask the muxer for a fragment.
#
# Returns a 2-tuple ``(init_bytes, fragment)``:
#   * ``init_bytes`` is non-None only on the call that builds the deferred
#     init segment (video re-encode path with ``emit_init_segment`` set).
#   * ``fragment`` is the truthy result of ``muxer.flush_fragment()``,
#     otherwise None.
# Dropped packets (frame cap reached, still waiting for the first keyframe,
# or an empty sample after AVCC conversion in passthrough) return
# ``(None, None)`` immediately, without asking the muxer for a fragment.
# A packet that matches neither stream, or that only triggers one of the
# "without decoded frame" warnings, adds no sample but still falls through
# to the fragment check at the bottom.
#
# Counters and one-shot flags are owned by the enclosing scope and updated
# here via ``nonlocal``.
nonlocal video_frame_count, audio_frame_count, fragment_count, bytes_out
nonlocal _video_dts_base, _got_keyframe
nonlocal _emitted_video_duration_ms, _init_emitted
init_bytes: bytes | None = None # deferred init, returned alongside fragment
if vs and packet.stream_index == vs.index and packet.codec_type == "video":
# ── Frame-count limit for HLS segments ──
# Stop accepting video once we've emitted enough frames.
if _max_video_frames is not None and video_frame_count >= _max_video_frames:
return None, None
if do_video_transcode and video_transcoder and packet.decoded_frame is not None:
# Set sequential PTS on the decoded frame in encoder
# timebase *before* encoding. The demuxer's frame.pts is
# in the demuxer's timebase (e.g. 1/1000 for MKV) which
# does NOT match the encoder's timebase (1/(fps*1000)).
# Passing the raw integer through would compress the output
# timeline by roughly a factor of fps. Sequential
# PTS keeps libx264's rate control consistent.
packet.decoded_frame.pts = video_frame_count * _enc_frame_dur
# Frame already decoded by the demux thread -- re-encode
encoded = video_transcoder.transcode_frame(packet.decoded_frame)
# enc_pts / enc_dts are unpacked but not used: output timing is
# derived from video_frame_count below.
for nal_data, is_kf, enc_pts, enc_dts in encoded:
# Convert Annex B start codes to AVCC length prefixes.
# Hardware encoders (VideoToolbox, NVENC) emit Annex B.
sample_data = annexb_to_avcc(nal_data)
if not sample_data:
continue
# Deferred init segment: after the first encode, the HW
# encoder's extradata is available. Extract SPS/PPS and
# rebuild the init segment so it has correct codec config.
if emit_init_segment and not _init_emitted:
_update_init_extradata(video_transcoder, video_track, nal_data)
init_bytes = muxer.build_init_segment()
logger.info("[universal] Init segment (deferred): %d bytes", len(init_bytes))
# Counted here; the bytes themselves are handed back to the
# caller through the first element of the return tuple.
bytes_out += len(init_bytes)
_init_emitted = True
# Frame-count-based PTS: since zerolatency produces
# no B-frames (PTS == DTS), derive PTS directly from
# the output frame index. This avoids the timebase
# mismatch bug and guarantees monotonic timestamps.
pts_ticks = _start_offset_ticks + (video_frame_count * _reencode_dur_ticks)
muxer.add_video_sample(sample_data, _reencode_dur_ticks, is_kf, pts_ticks=pts_ticks)
# The counter advances per emitted sample; it also drives the
# encoder-side PTS assigned to the next decoded frame above.
video_frame_count += 1
_emitted_video_duration_ms += _reencode_dur_ticks * 1000.0 / _VIDEO_TIMESCALE
elif do_video_transcode and video_transcoder:
# Fallback: raw packet (shouldn't happen with decode_video=True)
logger.warning("[universal] Video packet without decoded frame, skipping")
else:
# Video passthrough -- wait for first keyframe before
# sending any video (browser can't decode without IDR).
if not _got_keyframe:
if not packet.is_keyframe:
return None, None
_got_keyframe = True
logger.info("[universal] First keyframe received, starting video")
# Convert Annex B start codes to AVCC length prefixes
# if needed (MPEG-TS sources).
sample_data = annexb_to_avcc(packet.data)
if not sample_data:
return None, None
# Sample duration in video timescale ticks: taken from the packet
# when it carries a positive duration, otherwise one frame period at
# the stream fps (24 fps if the stream reports none). Never below 1.
dur_ticks = (
max(1, int(packet.duration_seconds * _VIDEO_TIMESCALE))
if packet.duration > 0
else max(1, int(_VIDEO_TIMESCALE / (vs.fps or 24.0)))
)
# Always pass PTS for CTS computation so B-frames
# are properly reordered by the player.
# pts_ticks stays None when packet.pts == 0 or when PTS equals DTS.
# NOTE(review): how the muxer timestamps a sample given
# pts_ticks=None is not visible here -- confirm against FMP4StreamMuxer.
pts_ticks = None
dts_secs = packet.dts_seconds
pts_secs = packet.pts_seconds
# Latch the DTS of the first passthrough sample as the zero point
# for rebasing all later PTS values.
if _video_dts_base is None:
_video_dts_base = dts_secs
if packet.pts != 0 and pts_secs != dts_secs:
rebased_pts = pts_secs - _video_dts_base
# Clamp a negative rebased PTS to 0, then shift by the segment's
# start offset.
pts_ticks = max(0, int(rebased_pts * _VIDEO_TIMESCALE)) + _start_offset_ticks
muxer.add_video_sample(sample_data, dur_ticks, packet.is_keyframe, pts_ticks=pts_ticks)
video_frame_count += 1
_emitted_video_duration_ms += dur_ticks * 1000.0 / _VIDEO_TIMESCALE
elif aus and packet.stream_index == aus.index and packet.codec_type == "audio":
# Don't emit audio until the first video keyframe so A/V stay in sync
# NOTE(review): within this function _got_keyframe is only set in the
# video passthrough branch; confirm it is initialised so that audio is
# not held back when video is re-encoded or absent.
if not _got_keyframe:
return None, None
# ── Audio frame-count limit for HLS segments ──
if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
return None, None
if do_audio_transcode and audio_encoder and audio_resampler and packet.decoded_frame is not None:
# Audio frame decoded by demux thread -- resample and encode
resampled = audio_resampler.resample(packet.decoded_frame)
if resampled is not None:
# Accept either a single frame or a list of frames from the resampler.
if not isinstance(resampled, list):
resampled = [resampled]
for rs_frame in resampled:
# The cap is re-checked once per resampled frame, not per encoded
# packet, so every packet from a single encode() call is kept.
if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
break
for enc_pkt in audio_encoder.encode(rs_frame):
# Every encoded packet is muxed with a fixed duration of
# aac_frame_size.
muxer.add_audio_sample(bytes(enc_pkt), aac_frame_size)
audio_frame_count += 1
elif do_audio_transcode and audio_encoder:
# Fallback: raw packet (shouldn't happen with decode_audio=True)
logger.warning("[universal] Audio packet without decoded frame, skipping")
else:
# Audio passthrough
# Duration is expressed in audio_sr units; packets without a positive
# duration fall back to 1024 (presumably the AAC frame length --
# confirm against aac_frame_size).
dur_ticks = max(1, int(packet.duration_seconds * audio_sr)) if packet.duration > 0 else 1024
muxer.add_audio_sample(packet.data, dur_ticks)
audio_frame_count += 1
# Emit fragment if ready
# init_bytes is returned in both cases so a deferred init segment built
# during this call reaches the caller even when no fragment is ready yet.
fragment = muxer.flush_fragment()
if fragment:
fragment_count += 1
bytes_out += len(fragment)
return init_bytes, fragment
return init_bytes, None
# Process all packets from the demuxer
async for packet in demuxer.iter_packets():
# Frame-count-based segment bounding: stop the packet loop once
# both video and audio have emitted their target frame counts.
# Individual _process_packet calls for each track already skip
# frames beyond the limit, so this break is just an optimisation
# to avoid draining the entire byte range.
if _max_video_frames is not None:
video_done = video_frame_count >= _max_video_frames
audio_done = _max_audio_frames is None or audio_frame_count >= _max_audio_frames
if video_done and audio_done:
logger.debug(
"[universal] Segment frame limits reached: video=%d/%d, audio=%d/%s, emitted=%.0fms",
video_frame_count,
_max_video_frames,
audio_frame_count,
_max_audio_frames if _max_audio_frames is not None else "unlimited",
_emitted_video_duration_ms,
)
break
deferred_init, frag = await _process_packet(packet)
if deferred_init:
yield deferred_init
if frag:
yield frag
# Flush video encoder (decoder already flushed in the demux thread).
# Skip flush if we already reached the frame count limit for HLS
# segments -- flushed frames would exceed the target and cause
# DTS overlap with the next segment.
_video_limit_hit = _max_video_frames is not None and video_frame_count >= _max_video_frames
if video_transcoder and not _video_limit_hit:
for nal_data, is_kf, pts, dts in video_transcoder.flush():
sample_data = annexb_to_avcc(nal_data)
if not sample_data:
continue
# Use same frame-count-based PTS as the main encode path
pts_ticks = _start_offset_ticks + (video_frame_count * _reencode_dur_ticks)
muxer.add_video_sample(sample_data, _reencode_dur_ticks, is_kf, pts_ticks=pts_ticks)
video_frame_count += 1
_emitted_video_duration_ms += _reencode_dur_ticks * 1000.0 / _VIDEO_TIMESCALE
# Flush audio resampler + encoder (decoder already flushed in the demux thread).
# When audio frame limit was reached, we still need to flush the
# encoder to drain its internal state, but we discard the output
# to avoid exceeding the frame count.
_audio_limit_hit = _max_audio_frames is not None and audio_frame_count >= _max_audio_frames
if audio_encoder and audio_resampler and _audio_limit_hit:
# Drain encoder without emitting -- prevents SIGSEGV on teardown
try:
audio_resampler.resample(None)
except Exception:
pass
try:
for _ in audio_encoder.encode(None):
pass
except Exception:
pass
_audio_flushed = True
elif audio_encoder and audio_resampler:
try:
resampled = audio_resampler.resample(None)
if resampled is not None:
if not isinstance(resampled, list):
resampled = [resampled]
for rs_frame in resampled:
if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
break
for enc_pkt in audio_encoder.encode(rs_frame):
muxer.add_audio_sample(bytes(enc_pkt), aac_frame_size)
audio_frame_count += 1
except Exception:
pass
try:
for enc_pkt in audio_encoder.encode(None):
if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
break
muxer.add_audio_sample(bytes(enc_pkt), aac_frame_size)
audio_frame_count += 1
except Exception:
pass
_audio_flushed = True
# Final fragment
final = muxer.flush_final()
if final:
fragment_count += 1
bytes_out += len(final)
yield final
except (GeneratorExit, asyncio.CancelledError):
cancelled = True
logger.info("[universal] Client disconnected, stopping pipeline")
except Exception as exc:
if bytes_out == 0 and "prematurely" in str(exc):
cancelled = True
logger.info("[universal] Client disconnected before streaming started")
else:
logger.exception("[universal] Pipeline error")
finally:
if video_transcoder:
video_transcoder.close()
video_transcoder = None
# Flush audio only if the normal path didn't already do it.
# Double-flushing a PyAV codec context causes SIGSEGV.
if audio_encoder and not _audio_flushed:
try:
for _ in audio_encoder.encode(None):
pass
except Exception:
pass
audio_encoder = None
audio_resampler = None
if hasattr(source, "aclose"):
try:
await source.aclose()
except Exception:
pass
logger.debug("[universal] Cleanup: complete")
if cancelled:
logger.info(
"[universal] Cancelled after %d video, %d audio frames, %d fragments, %d bytes out",
video_frame_count,
audio_frame_count,
fragment_count,
bytes_out,
)
else:
logger.info(
"[universal] Complete: %d video, %d audio frames, %d fragments, %d bytes out",
video_frame_count,
audio_frame_count,
fragment_count,
bytes_out,
)