mirror of
https://github.com/UrloMythus/UnHided.git
synced 2026-04-09 02:40:47 +00:00
1269 lines
54 KiB
Python
1269 lines
54 KiB
Python
"""
|
|
Streaming transcode pipelines producing fragmented MP4 on-the-fly.
|
|
|
|
Three pipelines are provided:
|
|
|
|
1. ``stream_transcode_fmp4`` -- **MKV fast path (continuous)**.
|
|
Uses the custom EBML demuxer for zero-copy video passthrough (H.264/H.265)
|
|
with audio-only transcoding. Best for MKV sources with browser-compatible
|
|
video but incompatible audio. Emits init + media fragments.
|
|
|
|
2. ``stream_segment_fmp4`` -- **MKV fast path (HLS segment)**.
|
|
Same EBML demuxer and video passthrough as above, but adapted for
|
|
per-segment HLS delivery: no init segment, ``start_decode_time_ms``
|
|
for correct tfdt placement, and frame-count bounding for precise
|
|
segment duration control.
|
|
|
|
3. ``stream_transcode_universal`` -- **Universal path via PyAV**.
|
|
Demuxes any container format (MKV, MP4, TS, etc.) using PyAV, optionally
|
|
re-encodes video (GPU-accelerated when available), and transcodes audio.
|
|
Required when the video codec needs re-encoding or the source is not MKV.
|
|
|
|
All pipelines produce on-the-fly fMP4 fragments suitable for streaming
|
|
via ``StreamingResponse``.
|
|
"""
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import logging
|
|
from collections.abc import AsyncIterator
|
|
|
|
import av
|
|
from av.audio.resampler import AudioResampler
|
|
|
|
from mediaflow_proxy.remuxer.audio_transcoder import AudioTranscoder, get_ffmpeg_codec_name, needs_transcode
|
|
from mediaflow_proxy.remuxer.codec_utils import (
|
|
_PYAV_TO_MKV_AUDIO,
|
|
_PYAV_TO_MKV_VIDEO,
|
|
annexb_to_avcc,
|
|
ensure_avcc_extradata,
|
|
audio_needs_transcode as pyav_audio_needs_transcode,
|
|
video_needs_reencode as pyav_video_needs_reencode,
|
|
)
|
|
from mediaflow_proxy.remuxer.ebml_parser import (
|
|
CODEC_ID_H264,
|
|
CODEC_ID_H265,
|
|
MKVTrack,
|
|
)
|
|
from mediaflow_proxy.remuxer.mkv_demuxer import MKVDemuxer, MKVHeader
|
|
from mediaflow_proxy.remuxer.mp4_muxer import FMP4StreamMuxer
|
|
from mediaflow_proxy.remuxer.pyav_demuxer import PyAVDemuxer
|
|
from mediaflow_proxy.remuxer.video_transcoder import VideoTranscoder
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Video timescale (90kHz is standard for MPEG transport)
|
|
_VIDEO_TIMESCALE = 90000
|
|
|
|
|
|
def derive_mp4_cache_key(
|
|
chat_id: str | int | None,
|
|
message_id: int | None,
|
|
file_id: str | None,
|
|
) -> str:
|
|
"""Derive a deterministic cache key for a transcoded stream."""
|
|
if file_id:
|
|
raw = f"mp4:file_id:{file_id}"
|
|
elif chat_id is not None and message_id is not None:
|
|
raw = f"mp4:chat:{chat_id}:msg:{message_id}"
|
|
else:
|
|
return ""
|
|
return hashlib.sha256(raw.encode()).hexdigest()[:16]
|
|
|
|
|
|
async def stream_transcode_fmp4(
    source: AsyncIterator[bytes],
    *,
    max_duration_ms: float | None = None,
) -> AsyncIterator[bytes]:
    """
    Stream MKV-to-fMP4 transcoding as an async generator (continuous mode).

    Video samples are passed through without decoding.  Audio is transcoded
    to AAC (48 kHz, stereo, 192 kbit/s) when ``needs_transcode`` reports that
    the source codec requires it, and passed through otherwise.  This is the
    continuous single-request pipeline; per-segment HLS delivery is handled
    by ``stream_segment_fmp4`` and ``stream_transcode_universal``.

    Errors are never raised to the consumer: a client disconnect
    (``GeneratorExit`` / ``CancelledError``) or any other exception is logged
    and the generator simply ends.  The source iterator is closed on exit.

    Args:
        source: Async iterator of MKV bytes (e.g., from Telegram stream).
        max_duration_ms: If set, stop reading frames once this many
            milliseconds of video have been added to the muxer.

    Yields:
        First the fMP4 init segment (ftyp + moov), then fMP4 media
        fragments (moof + mdat); together they form one fMP4 byte stream.
    """
    transcoder = None
    video_frame_count = 0
    audio_frame_count = 0
    fragment_count = 0
    bytes_out = 0
    cancelled = False
    failed = False

    try:
        # Phase 1: Parse MKV header
        demuxer = MKVDemuxer()
        header = await demuxer.read_header(source)

        if not header.tracks:
            raise ValueError("MKV file has no tracks")

        video_track = _find_video_track(header)
        audio_track = _find_audio_track(header)

        if video_track is None:
            raise ValueError("No supported video track found (need H.264 or H.265)")

        logger.info(
            "[pipeline] MKV header: duration=%.1fs, video=%s %dx%d, audio=%s %dHz %dch",
            header.duration_ms / 1000.0,
            video_track.codec_id,
            video_track.pixel_width,
            video_track.pixel_height,
            audio_track.codec_id if audio_track else "none",
            int(audio_track.sample_rate) if audio_track else 0,
            audio_track.channels if audio_track else 0,
        )

        # Phase 2: Set up audio transcoder
        if audio_track and needs_transcode(audio_track.codec_id):
            ffmpeg_codec = get_ffmpeg_codec_name(audio_track.codec_id)
            if ffmpeg_codec:
                transcoder = AudioTranscoder(
                    input_codec=ffmpeg_codec,
                    input_sample_rate=int(audio_track.sample_rate),
                    input_channels=audio_track.channels,
                    output_sample_rate=48000,
                    output_channels=2,
                    output_bitrate=192000,
                )
                logger.info("[pipeline] Audio transcoding: %s -> AAC", audio_track.codec_id)
            else:
                # No decoder mapping for this codec: drop audio, keep video.
                logger.warning("[pipeline] No FFmpeg codec for %s, skipping audio", audio_track.codec_id)
                audio_track = None

        # Output audio parameters: fixed when transcoding, taken from the
        # source track on passthrough, 48 kHz stereo when there is no audio.
        if transcoder:
            audio_timescale = 48000
            audio_sample_rate = 48000
            audio_channels = 2
        elif audio_track:
            audio_timescale = int(audio_track.effective_sample_rate)
            audio_sample_rate = int(audio_track.sample_rate)
            audio_channels = audio_track.channels
        else:
            audio_timescale = 48000
            audio_sample_rate = 48000
            audio_channels = 2

        # Phase 3: Build init segment with placeholder AAC config.
        # AudioSpecificConfig 0x11 0x90 = AAC-LC, 48 kHz, 2 channels.  It is
        # replaced below when the transcoder exposes its own config.
        default_asc = bytes([0x11, 0x90])

        muxer = FMP4StreamMuxer(
            video_track=video_track,
            audio_sample_rate=audio_sample_rate,
            audio_channels=audio_channels,
            audio_specific_config=default_asc,
            video_timescale=_VIDEO_TIMESCALE,
            audio_timescale=audio_timescale,
            duration_ms=header.duration_ms,
            fragment_duration_ms=2000.0,
        )

        # Prefer the encoder's real ASC when it is already available.
        if transcoder and transcoder.audio_specific_config:
            muxer.update_audio_specific_config(transcoder.audio_specific_config)

        init_segment = muxer.build_init_segment()
        logger.info("[pipeline] Init segment: %d bytes", len(init_segment))
        # Count the bytes before yielding (as is done for fragments) so the
        # statistics are right even if the consumer disconnects at this yield.
        bytes_out = len(init_segment)
        yield init_segment

        # Phase 4: Process frames and emit fragments
        last_video_ts_ms = 0.0
        emitted_duration_ms = 0.0

        async for frame in demuxer.iter_frames(source):
            if frame.track_number == video_track.track_number:
                # Video frame (passthrough -- no decode/re-encode).
                # Duration fallback chain: frame -> track default ->
                # delta to previous frame -> 24 fps.
                duration_ms = frame.duration_ms
                if duration_ms <= 0 and video_track.frame_duration_ms > 0:
                    duration_ms = video_track.frame_duration_ms
                elif duration_ms <= 0:
                    if video_frame_count > 0 and frame.timestamp_ms > last_video_ts_ms:
                        duration_ms = frame.timestamp_ms - last_video_ts_ms
                    else:
                        duration_ms = 1000.0 / 24.0  # Fallback 24fps

                duration_ticks = max(1, int(duration_ms * _VIDEO_TIMESCALE / 1000.0))

                # Pass absolute PTS for CTS (composition time offset).
                # MKV timestamps are display-order (PTS); the muxer
                # accumulates DTS monotonically, so the difference is
                # written as CTS in the trun sample entry.
                pts_ticks = int(frame.timestamp_ms * _VIDEO_TIMESCALE / 1000.0)

                # Ensure AVCC format and skip samples without any VCL NAL.
                # Note: skipped samples also skip the fragment flush and the
                # duration check below for this iteration.
                sample_data = annexb_to_avcc(frame.data, filter_ps=False)
                if not sample_data or not _has_valid_video_nal(sample_data, video_track.codec_id):
                    continue

                muxer.add_video_sample(
                    sample_data,
                    duration_ticks,
                    frame.is_keyframe,
                    pts_ticks=pts_ticks,
                )
                last_video_ts_ms = frame.timestamp_ms
                video_frame_count += 1
                emitted_duration_ms += duration_ms

            elif audio_track and frame.track_number == audio_track.track_number:
                if transcoder:
                    # One input packet may produce zero or more AAC frames.
                    for aac_data in transcoder.transcode(frame.data):
                        muxer.add_audio_sample(aac_data, transcoder.frame_size)
                        audio_frame_count += 1
                else:
                    # Audio passthrough
                    duration_ms = frame.duration_ms
                    if duration_ms <= 0 and audio_track.frame_duration_ms > 0:
                        duration_ms = audio_track.frame_duration_ms
                    elif duration_ms <= 0:
                        # Last resort: assume 1024 samples per frame.
                        duration_ms = 1024.0 / audio_track.sample_rate * 1000.0
                    duration_ticks = max(1, int(duration_ms * audio_timescale / 1000.0))
                    muxer.add_audio_sample(frame.data, duration_ticks)
                    audio_frame_count += 1

            # Check if we should emit a fragment
            fragment = muxer.flush_fragment()
            if fragment:
                fragment_count += 1
                bytes_out += len(fragment)
                yield fragment

            # Duration bounding (e.g. for max_duration_ms safety net)
            if max_duration_ms is not None and emitted_duration_ms >= max_duration_ms:
                logger.debug(
                    "[pipeline] Duration limit reached (%.0fms >= %.0fms), stopping",
                    emitted_duration_ms,
                    max_duration_ms,
                )
                break

        # Flush remaining audio from transcoder
        if transcoder:
            for aac_data in transcoder.flush():
                muxer.add_audio_sample(aac_data, transcoder.frame_size)
                audio_frame_count += 1

        # Emit final fragment
        final = muxer.flush_final()
        if final:
            fragment_count += 1
            bytes_out += len(final)
            yield final

    except (GeneratorExit, asyncio.CancelledError):
        # NOTE(review): cancellation is deliberately absorbed here so the
        # generator ends quietly; confirm callers do not rely on
        # CancelledError propagating out of this generator.
        cancelled = True
        logger.info("[pipeline] Client disconnected, stopping pipeline")
    except Exception as exc:
        # Source exhausted with 0 bytes during header parsing = client disconnect
        if bytes_out == 0 and "prematurely" in str(exc):
            cancelled = True
            logger.info("[pipeline] Client disconnected before streaming started")
        else:
            failed = True
            logger.exception("[pipeline] Pipeline error")
    finally:
        if transcoder:
            transcoder.close()

        # Close the source generator to stop the upstream download
        if hasattr(source, "aclose"):
            try:
                await source.aclose()
            except Exception:
                # Best effort: a failing close must not mask the real outcome.
                logger.debug("[pipeline] Ignoring error while closing source", exc_info=True)

        if cancelled:
            logger.info(
                "[pipeline] Cancelled after %d video, %d audio frames, %d fragments, %d bytes out",
                video_frame_count,
                audio_frame_count,
                fragment_count,
                bytes_out,
            )
        elif failed:
            # Do not report a failed run as "Complete".
            logger.info(
                "[pipeline] Failed after %d video, %d audio frames, %d fragments, %d bytes out",
                video_frame_count,
                audio_frame_count,
                fragment_count,
                bytes_out,
            )
        else:
            logger.info(
                "[pipeline] Complete: %d video, %d audio frames, %d fragments, %d bytes out",
                video_frame_count,
                audio_frame_count,
                fragment_count,
                bytes_out,
            )
|
|
|
|
|
|
# =============================================================================
|
|
# MKV fast-path HLS segment pipeline
|
|
# =============================================================================
|
|
|
|
|
|
async def stream_segment_fmp4(
    source: AsyncIterator[bytes],
    *,
    start_decode_time_ms: float = 0.0,
    max_duration_ms: float | None = None,
) -> AsyncIterator[bytes]:
    """
    MKV fast-path pipeline for a single HLS fMP4 media segment.

    Adapted from ``stream_transcode_fmp4`` (continuous mode) but designed
    for per-segment HLS delivery:

    - **No init segment** -- HLS serves init separately.
    - **start_decode_time_ms** is handed to the muxer so the segment's
      tfdt lands on the global HLS timeline.
    - **Frame-count bounding** stops after the computed number of video
      and audio frames for the segment duration.
    - **Video passthrough** using the MKV absolute timestamps (no video
      encoder involved).
    - **AudioTranscoder** producing AAC when the source codec needs it.

    Errors are never raised to the consumer: a client disconnect or any
    other exception is logged and the generator ends.  The source iterator
    is closed on exit.

    Args:
        source: Async iterator of bytes (seek_header + cluster data).
        start_decode_time_ms: Absolute time of segment start on the HLS
            timeline.  Passed to the muxer and used to derive the segment
            end time and the audio frame budget.
        max_duration_ms: Segment duration in ms. Controls frame-count
            bounding for both video and audio.  ``None`` disables bounding.

    Yields:
        fMP4 media fragments (moof + mdat) -- no init segment.
    """
    transcoder = None
    video_frame_count = 0
    audio_frame_count = 0
    fragment_count = 0
    bytes_out = 0
    cancelled = False
    failed = False

    try:
        # Phase 1: Parse MKV header from seek_header + cluster bytes
        demuxer = MKVDemuxer()
        header = await demuxer.read_header(source)

        if not header.tracks:
            raise ValueError("MKV segment source has no tracks")

        video_track = _find_video_track(header)
        audio_track = _find_audio_track(header)

        if video_track is None:
            raise ValueError("No supported video track found for segment pipeline")

        logger.info(
            "[seg_fmp4] Segment %.1f-%.1fs: video=%s %dx%d, audio=%s %dHz %dch",
            start_decode_time_ms / 1000.0,
            (start_decode_time_ms + (max_duration_ms or 0)) / 1000.0,
            video_track.codec_id,
            video_track.pixel_width,
            video_track.pixel_height,
            audio_track.codec_id if audio_track else "none",
            int(audio_track.sample_rate) if audio_track else 0,
            audio_track.channels if audio_track else 0,
        )

        # Phase 2: Set up audio transcoder
        if audio_track and needs_transcode(audio_track.codec_id):
            ffmpeg_codec = get_ffmpeg_codec_name(audio_track.codec_id)
            if ffmpeg_codec:
                transcoder = AudioTranscoder(
                    input_codec=ffmpeg_codec,
                    input_sample_rate=int(audio_track.sample_rate),
                    input_channels=audio_track.channels,
                    output_sample_rate=48000,
                    output_channels=2,
                    output_bitrate=192000,
                )
                logger.info("[seg_fmp4] Audio transcoding: %s -> AAC", audio_track.codec_id)
            else:
                # No decoder mapping for this codec: drop audio, keep video.
                logger.warning("[seg_fmp4] No FFmpeg codec for %s, skipping audio", audio_track.codec_id)
                audio_track = None

        # Output audio parameters: fixed when transcoding, taken from the
        # source track on passthrough, 48 kHz stereo when there is no audio.
        if transcoder:
            audio_timescale = 48000
            audio_sr = 48000
            audio_channels = 2
            aac_frame_size = transcoder.frame_size
        elif audio_track:
            audio_timescale = int(audio_track.effective_sample_rate)
            audio_sr = int(audio_track.sample_rate)
            audio_channels = audio_track.channels
            aac_frame_size = 1024
        else:
            audio_timescale = 48000
            audio_sr = 48000
            audio_channels = 2
            aac_frame_size = 1024

        # Phase 3: Build muxer (NO init segment emitted -- HLS serves it separately)
        # AudioSpecificConfig 0x11 0x90 = AAC-LC, 48 kHz, 2 channels.
        default_asc = bytes([0x11, 0x90])

        muxer = FMP4StreamMuxer(
            video_track=video_track,
            audio_sample_rate=audio_sr,
            audio_channels=audio_channels,
            audio_specific_config=default_asc,
            video_timescale=_VIDEO_TIMESCALE,
            audio_timescale=audio_timescale,
            duration_ms=max_duration_ms or 0.0,
            fragment_duration_ms=2000.0,
            start_decode_time_ms=start_decode_time_ms,
            audio_frame_size=aac_frame_size,
        )

        if transcoder and transcoder.audio_specific_config:
            muxer.update_audio_specific_config(transcoder.audio_specific_config)

        # Phase 4: Compute frame-count limits for precise segment bounding
        fps = 24.0
        if video_track.default_duration_ns > 0:
            fps = 1_000_000_000.0 / video_track.default_duration_ns
        elif video_track.frame_duration_ms > 0:
            fps = 1000.0 / video_track.frame_duration_ms

        _max_video_frames: int | None = None
        _max_audio_frames: int | None = None
        segment_end_ms: float | None = None

        if max_duration_ms is not None:
            segment_end_ms = start_decode_time_ms + max_duration_ms
            _max_video_frames = round(max_duration_ms * fps / 1000.0)

            # Audio frame-count: tile AAC frames across the timeline and
            # count how many start within [start_ms, end_ms).  Per the
            # original author this is meant to match the muxer's audio
            # decode-time alignment.
            if aac_frame_size > 0 and audio_sr > 0:
                frames_before_start = int(start_decode_time_ms / 1000.0 * audio_sr / aac_frame_size)
                frames_before_end = int(segment_end_ms / 1000.0 * audio_sr / aac_frame_size)
                _max_audio_frames = frames_before_end - frames_before_start

        logger.info(
            "[seg_fmp4] Frame limits: video=%s @%.1ffps, audio=%s (frame_size=%d, sr=%d), window=%.3f-%.3fs",
            _max_video_frames,
            fps,
            _max_audio_frames,
            aac_frame_size,
            audio_sr,
            start_decode_time_ms / 1000.0,
            segment_end_ms / 1000.0 if segment_end_ms is not None else -1.0,
        )

        # Phase 5: Process frames
        last_video_ts_ms = 0.0
        _video_limit_hit = False
        _audio_limit_hit = False
        _got_keyframe = False  # Must see a keyframe before emitting any video

        async for frame in demuxer.iter_frames(source):
            # -- Video frame (passthrough) --
            if frame.track_number == video_track.track_number:
                # Segment upper-bound clamp: with overlapped MKV byte ranges
                # we may receive video blocks belonging to the next segment.
                # Frames at or after segment_end_ms are dropped to avoid
                # timestamp regressions at segment boundaries.  Only the
                # upper bound is enforced here; frames earlier than
                # start_decode_time_ms are not filtered by this function.
                past_window = segment_end_ms is not None and frame.timestamp_ms >= segment_end_ms
                over_budget = _max_video_frames is not None and video_frame_count >= _max_video_frames
                if past_window or over_budget:
                    _video_limit_hit = True
                    # Stop outright once audio is done too (or absent);
                    # otherwise keep reading so audio can reach its budget.
                    if _audio_limit_hit or audio_track is None:
                        break
                    continue

                # Ensure AVCC length-prefixed NAL format for fMP4.
                # Some MKV files store frames in mixed Annex B / AVCC.
                # filter_ps=False preserves in-band SPS/PPS updates.
                sample_data = annexb_to_avcc(frame.data, filter_ps=False)
                if not sample_data:
                    continue

                # Skip non-VCL samples (SEI-only, filler, padding).
                if not _has_valid_video_nal(sample_data, video_track.codec_id):
                    continue

                # Gate on first keyframe: fMP4 segments must start with a sync sample.
                if not _got_keyframe:
                    if not frame.is_keyframe:
                        continue
                    _got_keyframe = True
                    logger.info(
                        "[seg_fmp4] First keyframe at %.3fs",
                        frame.timestamp_ms / 1000.0,
                    )

                # Duration fallback chain: frame -> track default ->
                # delta to previous frame -> nominal frame rate.
                duration_ms = frame.duration_ms
                if duration_ms <= 0 and video_track.frame_duration_ms > 0:
                    duration_ms = video_track.frame_duration_ms
                elif duration_ms <= 0:
                    if video_frame_count > 0 and frame.timestamp_ms > last_video_ts_ms:
                        duration_ms = frame.timestamp_ms - last_video_ts_ms
                    else:
                        duration_ms = 1000.0 / fps

                duration_ticks = max(1, int(duration_ms * _VIDEO_TIMESCALE / 1000.0))

                # Absolute PTS taken straight from the demuxed MKV timestamp.
                pts_ticks = int(frame.timestamp_ms * _VIDEO_TIMESCALE / 1000.0)

                muxer.add_video_sample(
                    sample_data,
                    duration_ticks,
                    frame.is_keyframe,
                    pts_ticks=pts_ticks,
                )
                last_video_ts_ms = frame.timestamp_ms
                video_frame_count += 1

            # -- Audio frame --
            elif audio_track and frame.track_number == audio_track.track_number:
                # Check frame-count limit
                if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
                    _audio_limit_hit = True
                    if _video_limit_hit:
                        break
                    continue

                if transcoder:
                    # One input packet may yield several AAC frames; stop
                    # mid-packet as soon as the budget is reached.
                    for aac_data in transcoder.transcode(frame.data):
                        if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
                            _audio_limit_hit = True
                            break
                        muxer.add_audio_sample(aac_data, transcoder.frame_size)
                        audio_frame_count += 1
                else:
                    # Audio passthrough
                    duration_ms = frame.duration_ms
                    if duration_ms <= 0 and audio_track.frame_duration_ms > 0:
                        duration_ms = audio_track.frame_duration_ms
                    elif duration_ms <= 0:
                        # Last resort: assume 1024 samples per frame.
                        duration_ms = 1024.0 / audio_track.sample_rate * 1000.0
                    duration_ticks = max(1, int(duration_ms * audio_timescale / 1000.0))
                    muxer.add_audio_sample(frame.data, duration_ticks)
                    audio_frame_count += 1

            # Check if we should emit a fragment
            fragment = muxer.flush_fragment()
            if fragment:
                fragment_count += 1
                bytes_out += len(fragment)
                yield fragment

            # Early exit when every present track has hit its limit
            if _video_limit_hit and (_audio_limit_hit or audio_track is None):
                break

        # Flush remaining audio from transcoder (only while under budget)
        if transcoder and not _audio_limit_hit:
            for aac_data in transcoder.flush():
                if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
                    break
                muxer.add_audio_sample(aac_data, transcoder.frame_size)
                audio_frame_count += 1

        # Emit final fragment
        final = muxer.flush_final()
        if final:
            fragment_count += 1
            bytes_out += len(final)
            yield final

    except (GeneratorExit, asyncio.CancelledError):
        # NOTE(review): cancellation is deliberately absorbed here so the
        # generator ends quietly; confirm callers do not rely on
        # CancelledError propagating out of this generator.
        cancelled = True
        logger.info("[seg_fmp4] Client disconnected, stopping segment pipeline")
    except Exception as exc:
        # Source exhausted before any output = treated as a client disconnect
        if bytes_out == 0 and "prematurely" in str(exc):
            cancelled = True
            logger.info("[seg_fmp4] Client disconnected before segment started")
        else:
            failed = True
            logger.exception("[seg_fmp4] Segment pipeline error")
    finally:
        if transcoder:
            transcoder.close()

        # Close the source generator to stop the upstream download
        if hasattr(source, "aclose"):
            try:
                await source.aclose()
            except Exception:
                # Best effort: a failing close must not mask the real outcome.
                logger.debug("[seg_fmp4] Ignoring error while closing source", exc_info=True)

        if cancelled:
            logger.info(
                "[seg_fmp4] Cancelled: %d video, %d audio frames, %d fragments, %d bytes",
                video_frame_count,
                audio_frame_count,
                fragment_count,
                bytes_out,
            )
        elif failed:
            # Do not report a failed run as "Complete".
            logger.info(
                "[seg_fmp4] Failed: %d video, %d audio frames, %d fragments, %d bytes",
                video_frame_count,
                audio_frame_count,
                fragment_count,
                bytes_out,
            )
        else:
            logger.info(
                "[seg_fmp4] Complete: %d video, %d audio frames, %d fragments, %d bytes",
                video_frame_count,
                audio_frame_count,
                fragment_count,
                bytes_out,
            )
|
|
|
|
|
|
# =============================================================================
|
|
# Helper functions
|
|
# =============================================================================
|
|
|
|
|
|
# H.264 VCL NAL unit types: 1 = non-IDR slice, 2-4 = data partitions A/B/C,
# 5 = IDR slice.
_H264_VCL_TYPES = frozenset(range(1, 6))
# HEVC NAL unit types 0-21: TRAIL_N (0) through CRA_NUT (21).  The range also
# contains the reserved values 10-15.
_HEVC_VCL_TYPES = frozenset(range(22))


def _has_valid_video_nal(data: bytes, codec_id: str = CODEC_ID_H264) -> bool:
    """
    Report whether a length-prefixed (AVCC/HVCC) sample carries a VCL NAL.

    The sample is walked as a sequence of ``4-byte big-endian length +
    NAL payload`` records.  A NAL qualifies when its forbidden_zero_bit is
    clear and its type is in the VCL set for the codec: 1-5 for H.264,
    0-21 for HEVC (selected when ``codec_id`` equals ``CODEC_ID_H265``;
    every other codec id is treated as H.264).

    Scanning stops at the first record whose length is zero or runs past
    the end of ``data``.

    Args:
        data: Sample bytes with 4-byte NAL length prefixes.
        codec_id: MKV codec id of the video track.

    Returns:
        True if at least one qualifying NAL is found, otherwise False.
    """
    total = len(data)
    if total < 5:
        # Too short for a length prefix plus one header byte.
        return False

    hevc = codec_id == CODEC_ID_H265
    wanted_types = _HEVC_VCL_TYPES if hevc else _H264_VCL_TYPES

    offset = 0
    while offset + 4 < total:
        payload_len = int.from_bytes(data[offset : offset + 4], "big")
        remaining = total - offset - 4
        if not 0 < payload_len <= remaining:
            # Empty or truncated record: the rest cannot be trusted.
            break

        header_byte = data[offset + 4]
        if (header_byte & 0x80) == 0:  # forbidden_zero_bit must be clear
            # HEVC keeps the type in bits 1-6, H.264 in the low 5 bits.
            nal_type = (header_byte >> 1) & 0x3F if hevc else header_byte & 0x1F
            if nal_type in wanted_types:
                return True

        offset += 4 + payload_len

    return False
|
|
|
|
|
|
def _find_video_track(header: MKVHeader) -> MKVTrack | None:
    """
    Pick the video track used for passthrough.

    Returns the first entry of ``header.tracks`` that is a video track with
    an H.264 or H.265 codec id, or ``None`` when no such track exists.
    """
    supported_codecs = (CODEC_ID_H264, CODEC_ID_H265)
    candidates = (
        candidate
        for candidate in header.tracks
        if candidate.is_video and candidate.codec_id in supported_codecs
    )
    return next(candidates, None)
|
|
|
|
|
|
def _find_audio_track(header: MKVHeader) -> MKVTrack | None:
    """
    Pick the audio track to mux.

    Returns the first entry of ``header.tracks`` flagged as audio, whatever
    its codec, or ``None`` when the header lists no audio track.
    """
    audio_candidates = (candidate for candidate in header.tracks if candidate.is_audio)
    return next(audio_candidates, None)
|
|
|
|
|
|
# =============================================================================
|
|
# Universal transcode pipeline (PyAV-based, any container, video re-encoding)
|
|
# =============================================================================
|
|
|
|
|
|
def _build_synthetic_mkv_track(
    codec_id: str,
    codec_private: bytes,
    *,
    width: int = 0,
    height: int = 0,
    sample_rate: float = 0.0,
    channels: int = 0,
    track_type: int = 1,
    track_number: int = 1,
    default_duration_ns: int = 0,
) -> MKVTrack:
    """
    Wrap stream metadata in an ``MKVTrack`` for the fMP4 muxer.

    The muxer consumes ``MKVTrack`` objects, so streams discovered through
    PyAV are described with a synthetic track instead of changing the muxer.

    Args:
        codec_id: Codec id stored on the track.
        codec_private: Codec configuration bytes stored on the track.
        width: Stored as ``pixel_width``.
        height: Stored as ``pixel_height``.
        sample_rate: Audio sample rate.
        channels: Audio channel count.
        track_type: Track type value (defaults to 1).
        track_number: Track number (defaults to 1).
        default_duration_ns: Per-frame duration in nanoseconds.

    Returns:
        A new ``MKVTrack`` populated with exactly the values given.
    """
    track_fields = {
        "track_number": track_number,
        "track_type": track_type,
        "codec_id": codec_id,
        "codec_private": codec_private,
        "pixel_width": width,
        "pixel_height": height,
        "sample_rate": sample_rate,
        "channels": channels,
        "default_duration_ns": default_duration_ns,
    }
    return MKVTrack(**track_fields)
|
|
|
|
|
|
def _update_init_extradata(
    video_transcoder: VideoTranscoder,
    video_track: MKVTrack,
    first_nal_data: bytes,
) -> None:
    """
    Refresh ``video_track.codec_private`` with the encoder's SPS/PPS.

    The encoder's ``codec_private_data`` is preferred.  When it is empty
    (per the original author, typical of hardware encoders such as
    VideoToolbox or NVENC, which embed SPS/PPS in the first keyframe
    instead), the parameter sets are extracted from ``first_nal_data``.
    Whatever is found is normalised with ``ensure_avcc_extradata`` and
    written to the track so that an init segment built from it afterwards
    carries that configuration.  If nothing is found the track is left
    unchanged.

    Args:
        video_transcoder: Encoder wrapper queried for ``codec_private_data``.
        video_track: Track whose ``codec_private`` is updated in place.
        first_nal_data: Encoded bytes of the first keyframe, used as the
            fallback source of SPS/PPS.
    """
    from mediaflow_proxy.remuxer.codec_utils import ensure_avcc_extradata, extract_sps_pps_from_annexb

    # Encoder context first (software encoders); otherwise fall back to the
    # in-band parameter sets of the first keyframe.
    raw_config = video_transcoder.codec_private_data or extract_sps_pps_from_annexb(first_nal_data)
    if not raw_config:
        return

    avcc_config = ensure_avcc_extradata(raw_config)
    video_track.codec_private = avcc_config
    logger.info(
        "[universal] Updated init extradata from encoder: %d bytes",
        len(avcc_config),
    )
|
|
|
|
|
|
async def stream_transcode_universal(
|
|
source: AsyncIterator[bytes],
|
|
*,
|
|
force_video_reencode: bool = False,
|
|
max_duration_ms: float | None = None,
|
|
start_decode_time_ms: float = 0.0,
|
|
emit_init_segment: bool = True,
|
|
force_software_encode: bool = False,
|
|
) -> AsyncIterator[bytes]:
|
|
"""
|
|
Universal transcode pipeline using PyAV for demuxing and encoding.
|
|
|
|
Handles any container format and optionally re-encodes video using
|
|
GPU-accelerated codecs when available.
|
|
|
|
Args:
|
|
source: Async iterator of container bytes (MKV, MP4, TS, etc.).
|
|
force_video_reencode: When True, always re-encode video even if
|
|
the codec is normally browser-compatible (e.g. H.264). Useful
|
|
for live MPEG-TS sources with corrupt bitstreams.
|
|
max_duration_ms: If set, stop emitting after this many milliseconds
|
|
of media have been produced.
|
|
start_decode_time_ms: Initial decode time offset for fMP4 timestamps.
|
|
emit_init_segment: Whether to yield the fMP4 init segment (ftyp+moov).
|
|
force_software_encode: When True, force ``libx264`` software encoder
|
|
instead of hardware (VideoToolbox/NVENC). Used for HLS per-segment
|
|
transcoding to avoid SIGSEGV crashes with hardware encoders.
|
|
|
|
Yields:
|
|
bytes chunks forming a valid fMP4 byte stream.
|
|
"""
|
|
video_transcoder = None
|
|
audio_encoder = None
|
|
audio_resampler = None
|
|
video_frame_count = 0
|
|
audio_frame_count = 0
|
|
fragment_count = 0
|
|
bytes_out = 0
|
|
cancelled = False
|
|
_audio_flushed = False # Prevents double-flush SIGSEGV on teardown
|
|
|
|
# Both video and audio decode decisions are deferred until after stream
|
|
# discovery, so the demux thread only decodes what's actually needed.
|
|
# Video decoding is only required when the codec needs re-encoding;
|
|
# passthrough uses raw packets. Audio decoding is needed when the
|
|
# codec is not browser-compatible (e.g. ac3 -> aac).
|
|
demuxer = PyAVDemuxer(decode_video=False, decode_audio=False)
|
|
|
|
try:
|
|
# Phase 1: Start demuxing -- opens the container in a background thread,
|
|
# discovers streams, and starts enqueuing packets. Awaits until stream
|
|
# metadata is available.
|
|
await demuxer.start(source)
|
|
|
|
vs = demuxer.video_stream
|
|
aus = demuxer.audio_stream
|
|
|
|
if vs is None and aus is None:
|
|
demuxer.enable_video_decode(False)
|
|
demuxer.enable_audio_decode(False)
|
|
raise ValueError("No video or audio streams found in source")
|
|
|
|
# Phase 2: Determine what needs transcoding
|
|
do_video_transcode = False
|
|
do_audio_transcode = False
|
|
video_mkv_codec = ""
|
|
audio_mkv_codec = ""
|
|
|
|
if vs:
|
|
video_mkv_codec = _PYAV_TO_MKV_VIDEO.get(vs.codec_name, vs.codec_name)
|
|
do_video_transcode = (
|
|
force_video_reencode
|
|
or pyav_video_needs_reencode(vs.codec_name)
|
|
or pyav_video_needs_reencode(video_mkv_codec)
|
|
)
|
|
if aus:
|
|
audio_mkv_codec = _PYAV_TO_MKV_AUDIO.get(aus.codec_name, aus.codec_name)
|
|
do_audio_transcode = pyav_audio_needs_transcode(aus.codec_name) or pyav_audio_needs_transcode(
|
|
audio_mkv_codec
|
|
)
|
|
|
|
# Tell the demux thread whether to decode video/audio in-thread.
|
|
# This must be called before consuming packets via iter_packets().
|
|
demuxer.enable_video_decode(do_video_transcode)
|
|
demuxer.enable_audio_decode(do_audio_transcode)
|
|
|
|
logger.info(
|
|
"[universal] Streams: video=%s (reencode=%s), audio=%s (transcode=%s)",
|
|
vs.codec_name if vs else "none",
|
|
do_video_transcode,
|
|
aus.codec_name if aus else "none",
|
|
do_audio_transcode,
|
|
)
|
|
|
|
# Phase 3: Set up transcoders
|
|
if do_video_transcode and vs:
|
|
video_transcoder = VideoTranscoder(
|
|
input_codec_name=vs.codec_name,
|
|
width=vs.width,
|
|
height=vs.height,
|
|
fps=vs.fps or 24.0,
|
|
pixel_format=vs.pixel_format or "yuv420p",
|
|
force_software=force_software_encode,
|
|
)
|
|
|
|
# Audio encoding: since audio is decoded in the demux thread, we only
|
|
# need a resampler and encoder here. No standalone decoder needed.
|
|
audio_encoder = None
|
|
audio_resampler = None
|
|
if do_audio_transcode and aus:
|
|
audio_encoder = av.CodecContext.create("aac", "w")
|
|
audio_encoder.sample_rate = 48000
|
|
audio_encoder.layout = "stereo"
|
|
audio_encoder.format = av.AudioFormat("fltp")
|
|
audio_encoder.bit_rate = 192000
|
|
audio_encoder.open()
|
|
|
|
audio_resampler = AudioResampler(
|
|
format="fltp",
|
|
layout="stereo",
|
|
rate=48000,
|
|
)
|
|
|
|
logger.info(
|
|
"[universal] Audio transcoding: %s %dHz %dch -> aac 48000Hz 2ch @192k",
|
|
aus.codec_name,
|
|
aus.sample_rate or 0,
|
|
aus.channels or 0,
|
|
)
|
|
|
|
# Phase 4: Build init segment
|
|
# When transcoding video, force output codec to H.264 regardless
|
|
# of whether the encoder has produced extradata yet (libx264 emits
|
|
# SPS/PPS only after the first encode call).
|
|
if do_video_transcode and video_transcoder:
|
|
raw_extradata = video_transcoder.codec_private_data or b""
|
|
video_codec_private = ensure_avcc_extradata(raw_extradata) if raw_extradata else b""
|
|
video_track_codec = CODEC_ID_H264 # Output is always H.264
|
|
elif vs:
|
|
# Ensure extradata is in avcC format (MPEG-TS returns Annex B)
|
|
video_codec_private = ensure_avcc_extradata(vs.extradata)
|
|
video_track_codec = video_mkv_codec or CODEC_ID_H264
|
|
else:
|
|
video_codec_private = b""
|
|
video_track_codec = CODEC_ID_H264
|
|
|
|
video_track = None
|
|
if vs:
|
|
output_w = video_transcoder.width if video_transcoder else vs.width
|
|
output_h = video_transcoder.height if video_transcoder else vs.height
|
|
frame_dur_ns = int(1_000_000_000 / (vs.fps or 24.0))
|
|
video_track = _build_synthetic_mkv_track(
|
|
codec_id=video_track_codec,
|
|
codec_private=video_codec_private,
|
|
width=output_w,
|
|
height=output_h,
|
|
track_type=1,
|
|
track_number=1,
|
|
default_duration_ns=frame_dur_ns,
|
|
)
|
|
|
|
audio_sr = 48000 if audio_encoder else (aus.sample_rate if aus else 48000)
|
|
audio_ch = 2 if audio_encoder else (aus.channels if aus else 2)
|
|
default_asc = bytes([0x11, 0x90]) # 48kHz stereo LC
|
|
|
|
if not video_track:
|
|
raise ValueError("No video track available for muxing")
|
|
|
|
# AAC frame size (samples per frame), typically 1024
|
|
aac_frame_size = audio_encoder.frame_size if audio_encoder and audio_encoder.frame_size else 1024
|
|
|
|
muxer = FMP4StreamMuxer(
|
|
video_track=video_track,
|
|
audio_sample_rate=audio_sr,
|
|
audio_channels=audio_ch,
|
|
audio_specific_config=default_asc,
|
|
video_timescale=_VIDEO_TIMESCALE,
|
|
audio_timescale=audio_sr,
|
|
# Cap duration: live/unknown streams report 0 or garbage values.
|
|
# Anything over 24h is almost certainly wrong for a real file.
|
|
duration_ms=vs.duration_seconds * 1000.0
|
|
if vs and vs.duration_seconds and 0 < vs.duration_seconds < 86400
|
|
else 0.0,
|
|
fragment_duration_ms=2000.0,
|
|
start_decode_time_ms=start_decode_time_ms,
|
|
# Pass AAC frame size so the muxer can align the audio tfdt to
|
|
# exact frame boundaries, preventing DTS discontinuities at
|
|
# HLS segment borders.
|
|
audio_frame_size=aac_frame_size,
|
|
)
|
|
|
|
if audio_encoder and audio_encoder.extradata:
|
|
muxer.update_audio_specific_config(bytes(audio_encoder.extradata))
|
|
|
|
# For hardware encoders (VideoToolbox, NVENC), SPS/PPS extradata may
|
|
# not be available until the first frame is encoded. Defer the init
|
|
# segment emission until after the first encoded video packet so the
|
|
# init segment always contains valid codec configuration.
|
|
_init_emitted = False
|
|
|
|
if emit_init_segment and not do_video_transcode:
|
|
# No re-encoding: extradata comes from the source stream, so we
|
|
# can emit the init segment immediately.
|
|
init_segment = muxer.build_init_segment()
|
|
logger.info("[universal] Init segment: %d bytes", len(init_segment))
|
|
yield init_segment
|
|
bytes_out = len(init_segment)
|
|
_init_emitted = True
|
|
|
|
# Phase 5: Process packets
|
|
# For video passthrough: skip until first keyframe and rebase DTS/PTS
|
|
# so fMP4 timestamps start from 0 (live TS streams have huge absolute values).
|
|
_video_dts_base: float | None = None # first video DTS in seconds
|
|
_got_keyframe = do_video_transcode # transcoded output always starts with keyframe
|
|
_emitted_video_duration_ms = 0.0 # accumulated video duration for monitoring
|
|
|
|
# Offset (video timescale ticks) that maps rebased-to-0 encoder PTS
|
|
# onto the absolute timeline expected by the muxer. When producing
|
|
# HLS segments starting at e.g. 25 s, the muxer's tfdt is at 25 s
|
|
# but the encoder PTS starts at 0. Adding this offset realigns them.
|
|
_start_offset_ticks = int(start_decode_time_ms * _VIDEO_TIMESCALE / 1000.0)
|
|
|
|
# Pre-compute per-frame duration ticks for re-encoded video (constant
|
|
# with zerolatency / no B-frames). Used for frame-count-based PTS.
|
|
_fps = (vs.fps or 24.0) if vs else 24.0
|
|
_reencode_dur_ticks = max(1, int(_VIDEO_TIMESCALE / _fps)) if vs else 0
|
|
|
|
# Encoder timebase denominator for setting sequential frame.pts on
|
|
# decoded frames before encoding. Keeps libx264's internal rate
|
|
# control consistent.
|
|
_enc_tb_den: int = 0
|
|
_enc_frame_dur: int = 0
|
|
if video_transcoder:
|
|
_enc_tb_den = video_transcoder._encoder.time_base.denominator
|
|
_enc_frame_dur = max(1, int(_enc_tb_den / _fps))
|
|
|
|
# ── Frame-count-based segment bounding ──────────────────────────
|
|
# When producing HLS segments, each segment MUST produce exactly
|
|
# the right number of video (and audio) frames so that the next
|
|
# segment's tfdt is contiguous. Relying on source PTS is fragile
|
|
# because mid-stream MKV byte ranges may not report accurate PTS.
|
|
#
|
|
# Video: round(duration_ms * fps / 1000) frames.
|
|
#
|
|
# Audio: compute by tiling AAC frames across the timeline. The
|
|
# audio tfdt of this segment is the cumulative count of AAC frames
|
|
# from time=0 up to start_decode_time_ms. The next segment's
|
|
# audio tfdt is the cumulative count up to end_time_ms. The
|
|
# difference gives the exact number of frames this segment must
|
|
# produce to keep segment borders gapless.
|
|
_max_video_frames: int | None = None
|
|
_max_audio_frames: int | None = None
|
|
if max_duration_ms is not None:
|
|
_max_video_frames = round(max_duration_ms * _fps / 1000.0)
|
|
if aac_frame_size > 0 and audio_sr > 0:
|
|
end_time_ms = start_decode_time_ms + max_duration_ms
|
|
# Count of whole AAC frames from t=0 to start and end
|
|
frames_before_start = int(start_decode_time_ms / 1000.0 * audio_sr / aac_frame_size)
|
|
frames_before_end = int(end_time_ms / 1000.0 * audio_sr / aac_frame_size)
|
|
_max_audio_frames = frames_before_end - frames_before_start
|
|
else:
|
|
_max_audio_frames = None # no cap
|
|
|
|
async def _process_packet(packet):
|
|
nonlocal video_frame_count, audio_frame_count, fragment_count, bytes_out
|
|
nonlocal _video_dts_base, _got_keyframe
|
|
nonlocal _emitted_video_duration_ms, _init_emitted
|
|
|
|
init_bytes: bytes | None = None # deferred init, returned alongside fragment
|
|
|
|
if vs and packet.stream_index == vs.index and packet.codec_type == "video":
|
|
# ── Frame-count limit for HLS segments ──
|
|
# Stop accepting video once we've emitted enough frames.
|
|
if _max_video_frames is not None and video_frame_count >= _max_video_frames:
|
|
return None, None
|
|
|
|
if do_video_transcode and video_transcoder and packet.decoded_frame is not None:
|
|
# Set sequential PTS on the decoded frame in encoder
|
|
# timebase *before* encoding. The demuxer's frame.pts is
|
|
# in the demuxer's timebase (e.g. 1/1000 for MKV) which
|
|
# does NOT match the encoder's timebase (1/(fps*1000)).
|
|
# Passing the raw integer through causes PTS compression
|
|
# by ~fps-x, corrupting the output timeline. Sequential
|
|
# PTS keeps libx264's rate control consistent.
|
|
packet.decoded_frame.pts = video_frame_count * _enc_frame_dur
|
|
|
|
# Frame already decoded by the demux thread -- re-encode
|
|
encoded = video_transcoder.transcode_frame(packet.decoded_frame)
|
|
for nal_data, is_kf, enc_pts, enc_dts in encoded:
|
|
# Convert Annex B start codes to AVCC length prefixes.
|
|
# Hardware encoders (VideoToolbox, NVENC) emit Annex B.
|
|
sample_data = annexb_to_avcc(nal_data)
|
|
if not sample_data:
|
|
continue
|
|
|
|
# Deferred init segment: after the first encode, the HW
|
|
# encoder's extradata is available. Extract SPS/PPS and
|
|
# rebuild the init segment so it has correct codec config.
|
|
if emit_init_segment and not _init_emitted:
|
|
_update_init_extradata(video_transcoder, video_track, nal_data)
|
|
init_bytes = muxer.build_init_segment()
|
|
logger.info("[universal] Init segment (deferred): %d bytes", len(init_bytes))
|
|
bytes_out += len(init_bytes)
|
|
_init_emitted = True
|
|
|
|
# Frame-count-based PTS: since zerolatency produces
|
|
# no B-frames (PTS == DTS), derive PTS directly from
|
|
# the output frame index. This avoids the timebase
|
|
# mismatch bug and guarantees monotonic timestamps.
|
|
pts_ticks = _start_offset_ticks + (video_frame_count * _reencode_dur_ticks)
|
|
|
|
muxer.add_video_sample(sample_data, _reencode_dur_ticks, is_kf, pts_ticks=pts_ticks)
|
|
video_frame_count += 1
|
|
_emitted_video_duration_ms += _reencode_dur_ticks * 1000.0 / _VIDEO_TIMESCALE
|
|
elif do_video_transcode and video_transcoder:
|
|
# Fallback: raw packet (shouldn't happen with decode_video=True)
|
|
logger.warning("[universal] Video packet without decoded frame, skipping")
|
|
else:
|
|
# Video passthrough -- wait for first keyframe before
|
|
# sending any video (browser can't decode without IDR).
|
|
if not _got_keyframe:
|
|
if not packet.is_keyframe:
|
|
return None, None
|
|
_got_keyframe = True
|
|
logger.info("[universal] First keyframe received, starting video")
|
|
|
|
# Convert Annex B start codes to AVCC length prefixes
|
|
# if needed (MPEG-TS sources).
|
|
sample_data = annexb_to_avcc(packet.data)
|
|
if not sample_data:
|
|
return None, None
|
|
|
|
dur_ticks = (
|
|
max(1, int(packet.duration_seconds * _VIDEO_TIMESCALE))
|
|
if packet.duration > 0
|
|
else max(1, int(_VIDEO_TIMESCALE / (vs.fps or 24.0)))
|
|
)
|
|
|
|
# Always pass PTS for CTS computation so B-frames
|
|
# are properly reordered by the player.
|
|
pts_ticks = None
|
|
dts_secs = packet.dts_seconds
|
|
pts_secs = packet.pts_seconds
|
|
|
|
if _video_dts_base is None:
|
|
_video_dts_base = dts_secs
|
|
|
|
if packet.pts != 0 and pts_secs != dts_secs:
|
|
rebased_pts = pts_secs - _video_dts_base
|
|
pts_ticks = max(0, int(rebased_pts * _VIDEO_TIMESCALE)) + _start_offset_ticks
|
|
|
|
muxer.add_video_sample(sample_data, dur_ticks, packet.is_keyframe, pts_ticks=pts_ticks)
|
|
video_frame_count += 1
|
|
_emitted_video_duration_ms += dur_ticks * 1000.0 / _VIDEO_TIMESCALE
|
|
|
|
elif aus and packet.stream_index == aus.index and packet.codec_type == "audio":
|
|
# Don't emit audio until the first video keyframe so A/V stay in sync
|
|
if not _got_keyframe:
|
|
return None, None
|
|
|
|
# ── Audio frame-count limit for HLS segments ──
|
|
if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
|
|
return None, None
|
|
|
|
if do_audio_transcode and audio_encoder and audio_resampler and packet.decoded_frame is not None:
|
|
# Audio frame decoded by demux thread -- resample and encode
|
|
resampled = audio_resampler.resample(packet.decoded_frame)
|
|
if resampled is not None:
|
|
if not isinstance(resampled, list):
|
|
resampled = [resampled]
|
|
for rs_frame in resampled:
|
|
if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
|
|
break
|
|
for enc_pkt in audio_encoder.encode(rs_frame):
|
|
muxer.add_audio_sample(bytes(enc_pkt), aac_frame_size)
|
|
audio_frame_count += 1
|
|
elif do_audio_transcode and audio_encoder:
|
|
# Fallback: raw packet (shouldn't happen with decode_audio=True)
|
|
logger.warning("[universal] Audio packet without decoded frame, skipping")
|
|
else:
|
|
# Audio passthrough
|
|
dur_ticks = max(1, int(packet.duration_seconds * audio_sr)) if packet.duration > 0 else 1024
|
|
muxer.add_audio_sample(packet.data, dur_ticks)
|
|
audio_frame_count += 1
|
|
|
|
# Emit fragment if ready
|
|
fragment = muxer.flush_fragment()
|
|
if fragment:
|
|
fragment_count += 1
|
|
bytes_out += len(fragment)
|
|
return init_bytes, fragment
|
|
return init_bytes, None
|
|
|
|
# Process all packets from the demuxer
|
|
async for packet in demuxer.iter_packets():
|
|
# Frame-count-based segment bounding: stop the packet loop once
|
|
# both video and audio have emitted their target frame counts.
|
|
# Individual _process_packet calls for each track already skip
|
|
# frames beyond the limit, so this break is just an optimisation
|
|
# to avoid draining the entire byte range.
|
|
if _max_video_frames is not None:
|
|
video_done = video_frame_count >= _max_video_frames
|
|
audio_done = _max_audio_frames is None or audio_frame_count >= _max_audio_frames
|
|
if video_done and audio_done:
|
|
logger.debug(
|
|
"[universal] Segment frame limits reached: video=%d/%d, audio=%d/%s, emitted=%.0fms",
|
|
video_frame_count,
|
|
_max_video_frames,
|
|
audio_frame_count,
|
|
_max_audio_frames if _max_audio_frames is not None else "unlimited",
|
|
_emitted_video_duration_ms,
|
|
)
|
|
break
|
|
|
|
deferred_init, frag = await _process_packet(packet)
|
|
if deferred_init:
|
|
yield deferred_init
|
|
if frag:
|
|
yield frag
|
|
|
|
# Flush video encoder (decoder already flushed in the demux thread).
|
|
# Skip flush if we already reached the frame count limit for HLS
|
|
# segments -- flushed frames would exceed the target and cause
|
|
# DTS overlap with the next segment.
|
|
_video_limit_hit = _max_video_frames is not None and video_frame_count >= _max_video_frames
|
|
if video_transcoder and not _video_limit_hit:
|
|
for nal_data, is_kf, pts, dts in video_transcoder.flush():
|
|
sample_data = annexb_to_avcc(nal_data)
|
|
if not sample_data:
|
|
continue
|
|
# Use same frame-count-based PTS as the main encode path
|
|
pts_ticks = _start_offset_ticks + (video_frame_count * _reencode_dur_ticks)
|
|
muxer.add_video_sample(sample_data, _reencode_dur_ticks, is_kf, pts_ticks=pts_ticks)
|
|
video_frame_count += 1
|
|
_emitted_video_duration_ms += _reencode_dur_ticks * 1000.0 / _VIDEO_TIMESCALE
|
|
|
|
# Flush audio resampler + encoder (decoder already flushed in the demux thread).
|
|
# When audio frame limit was reached, we still need to flush the
|
|
# encoder to drain its internal state, but we discard the output
|
|
# to avoid exceeding the frame count.
|
|
_audio_limit_hit = _max_audio_frames is not None and audio_frame_count >= _max_audio_frames
|
|
if audio_encoder and audio_resampler and _audio_limit_hit:
|
|
# Drain encoder without emitting -- prevents SIGSEGV on teardown
|
|
try:
|
|
audio_resampler.resample(None)
|
|
except Exception:
|
|
pass
|
|
try:
|
|
for _ in audio_encoder.encode(None):
|
|
pass
|
|
except Exception:
|
|
pass
|
|
_audio_flushed = True
|
|
elif audio_encoder and audio_resampler:
|
|
try:
|
|
resampled = audio_resampler.resample(None)
|
|
if resampled is not None:
|
|
if not isinstance(resampled, list):
|
|
resampled = [resampled]
|
|
for rs_frame in resampled:
|
|
if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
|
|
break
|
|
for enc_pkt in audio_encoder.encode(rs_frame):
|
|
muxer.add_audio_sample(bytes(enc_pkt), aac_frame_size)
|
|
audio_frame_count += 1
|
|
except Exception:
|
|
pass
|
|
try:
|
|
for enc_pkt in audio_encoder.encode(None):
|
|
if _max_audio_frames is not None and audio_frame_count >= _max_audio_frames:
|
|
break
|
|
muxer.add_audio_sample(bytes(enc_pkt), aac_frame_size)
|
|
audio_frame_count += 1
|
|
except Exception:
|
|
pass
|
|
_audio_flushed = True
|
|
|
|
# Final fragment
|
|
final = muxer.flush_final()
|
|
if final:
|
|
fragment_count += 1
|
|
bytes_out += len(final)
|
|
yield final
|
|
|
|
except (GeneratorExit, asyncio.CancelledError):
|
|
cancelled = True
|
|
logger.info("[universal] Client disconnected, stopping pipeline")
|
|
except Exception as exc:
|
|
if bytes_out == 0 and "prematurely" in str(exc):
|
|
cancelled = True
|
|
logger.info("[universal] Client disconnected before streaming started")
|
|
else:
|
|
logger.exception("[universal] Pipeline error")
|
|
finally:
|
|
if video_transcoder:
|
|
video_transcoder.close()
|
|
video_transcoder = None
|
|
# Flush audio only if the normal path didn't already do it.
|
|
# Double-flushing a PyAV codec context causes SIGSEGV.
|
|
if audio_encoder and not _audio_flushed:
|
|
try:
|
|
for _ in audio_encoder.encode(None):
|
|
pass
|
|
except Exception:
|
|
pass
|
|
audio_encoder = None
|
|
audio_resampler = None
|
|
if hasattr(source, "aclose"):
|
|
try:
|
|
await source.aclose()
|
|
except Exception:
|
|
pass
|
|
logger.debug("[universal] Cleanup: complete")
|
|
|
|
if cancelled:
|
|
logger.info(
|
|
"[universal] Cancelled after %d video, %d audio frames, %d fragments, %d bytes out",
|
|
video_frame_count,
|
|
audio_frame_count,
|
|
fragment_count,
|
|
bytes_out,
|
|
)
|
|
else:
|
|
logger.info(
|
|
"[universal] Complete: %d video, %d audio frames, %d fragments, %d bytes out",
|
|
video_frame_count,
|
|
audio_frame_count,
|
|
fragment_count,
|
|
bytes_out,
|
|
)
|