Files
UnHided/mediaflow_proxy/remuxer/audio_transcoder.py
UrloMythus cfc6bbabc9 update
2026-02-19 20:15:03 +01:00

352 lines
11 KiB
Python

"""
PyAV-based audio transcoder for frame-level codec conversion.
Transcodes audio frames between codecs using PyAV's CodecContext API
(Python bindings for FFmpeg's libavcodec). This provides in-process
audio transcoding without subprocess management or pipe overhead.
Supported input codecs: EAC3, AC3, AAC, Opus, Vorbis, FLAC, MP3
Output codec: AAC-LC (stereo, configurable bitrate)
Architecture:
raw_frame_bytes -> parse() -> decode() -> resample() -> encode() -> raw_aac_bytes
Usage:
transcoder = AudioTranscoder("eac3", sample_rate=48000, channels=6)
for raw_eac3_frame in frames:
aac_frames = transcoder.transcode(raw_eac3_frame)
for aac_data in aac_frames:
write(aac_data)
# Flush remaining frames
for aac_data in transcoder.flush():
write(aac_data)
"""
import logging
import av
from av.audio.resampler import AudioResampler
from mediaflow_proxy.remuxer.ebml_parser import (
CODEC_ID_AAC,
CODEC_ID_AC3,
CODEC_ID_EAC3,
CODEC_ID_FLAC,
CODEC_ID_OPUS,
CODEC_ID_VORBIS,
)
logger = logging.getLogger(__name__)
def _generate_silence_aac_frame() -> bytes | None:
"""Pre-encode a single silent AAC frame (48 kHz stereo, 1024 samples).
PyAV's AAC encoder has an intermittent ``avcodec_send_frame`` bug when
rapidly creating/destroying codec contexts, so we retry a few times.
This function is called once at module load; the result is cached in
``_SILENCE_AAC_FRAME``.
"""
for _attempt in range(10):
try:
enc = av.CodecContext.create("aac", "w")
enc.sample_rate = 48000
enc.layout = "stereo"
enc.format = av.AudioFormat("fltp")
enc.bit_rate = 192000
enc.open()
frame = av.AudioFrame(
format=enc.format.name,
layout=enc.layout.name,
samples=enc.frame_size or 1024,
)
frame.sample_rate = enc.sample_rate
frame.pts = 0
for pkt in enc.encode(frame):
return bytes(pkt)
# AAC priming delay: first encode buffered; flush to retrieve
for pkt in enc.encode(None):
return bytes(pkt)
except Exception:
continue
return None
# Module-level silence frame -- generated once, reused by every transcoder.
_SILENCE_AAC_FRAME: bytes | None = _generate_silence_aac_frame()
# Map MKV codec IDs to PyAV/FFmpeg codec names
_MKV_TO_FFMPEG_CODEC = {
CODEC_ID_EAC3: "eac3",
CODEC_ID_AC3: "ac3",
CODEC_ID_AAC: "aac",
CODEC_ID_OPUS: "opus",
CODEC_ID_VORBIS: "vorbis",
CODEC_ID_FLAC: "flac",
"A_DTS": "dts",
"A_MP3": "mp3",
"A_MPEG/L3": "mp3",
}
# Codecs that need transcoding to AAC for browser playback
NEEDS_TRANSCODE = frozenset(
{
CODEC_ID_EAC3,
CODEC_ID_AC3,
CODEC_ID_OPUS,
CODEC_ID_VORBIS,
CODEC_ID_FLAC,
"A_DTS",
"A_MP3",
"A_MPEG/L3",
}
)
# Output AAC settings
_OUTPUT_CODEC = "aac"
_OUTPUT_SAMPLE_FORMAT = "fltp" # AAC requires float planar
_OUTPUT_LAYOUT = "stereo"
# Map channel count -> FFmpeg layout name
_CHANNEL_LAYOUT_MAP = {
1: "mono",
2: "stereo",
3: "2.1",
4: "quad",
6: "5.1",
8: "7.1",
}
def needs_transcode(codec_id: str) -> bool:
"""Check if an MKV audio codec needs transcoding for browser playback."""
return codec_id in NEEDS_TRANSCODE
def get_ffmpeg_codec_name(mkv_codec_id: str) -> str | None:
"""Map an MKV CodecID to an FFmpeg codec name."""
return _MKV_TO_FFMPEG_CODEC.get(mkv_codec_id)
class AudioTranscoder:
"""
In-process audio transcoder using PyAV's CodecContext API.
Decodes raw audio frames from one codec and encodes them to AAC-LC
stereo, suitable for MP4 container and browser playback. No container
I/O or subprocess involved -- operates directly on raw frame bytes.
The transcoder handles sample format conversion and resampling
automatically via AudioResampler.
"""
def __init__(
self,
input_codec: str,
input_sample_rate: int = 48000,
input_channels: int = 6,
output_sample_rate: int = 48000,
output_channels: int = 2,
output_bitrate: int = 192000,
) -> None:
"""
Initialize the transcoder.
Args:
input_codec: FFmpeg codec name (e.g., "eac3", "ac3", "aac").
input_sample_rate: Input sample rate in Hz.
input_channels: Input channel count.
output_sample_rate: Output sample rate in Hz (default 48000).
output_channels: Output channel count (default 2 = stereo).
output_bitrate: Output bitrate in bits/s (default 192000).
"""
# Set up decoder -- use layout to configure channel count
# (PyAV's channels property is read-only; layout drives it)
self._decoder = av.CodecContext.create(input_codec, "r")
self._decoder.sample_rate = input_sample_rate
input_layout = _CHANNEL_LAYOUT_MAP.get(input_channels, "stereo")
self._decoder.layout = input_layout
# Set up encoder
self._encoder = av.CodecContext.create(_OUTPUT_CODEC, "w")
self._encoder.sample_rate = output_sample_rate
self._encoder.layout = _OUTPUT_LAYOUT
self._encoder.format = av.AudioFormat(_OUTPUT_SAMPLE_FORMAT)
self._encoder.bit_rate = output_bitrate
self._encoder.open()
# Set up resampler for format/rate/channel conversion
self._resampler = AudioResampler(
format=_OUTPUT_SAMPLE_FORMAT,
layout=_OUTPUT_LAYOUT,
rate=output_sample_rate,
)
self._input_codec = input_codec
self._frames_decoded = 0
self._frames_encoded = 0
self._audio_specific_config: bytes | None = None
logger.info(
"[audio_transcoder] Initialized: %s %dHz %dch -> aac %dHz %dch @%dk",
input_codec,
input_sample_rate,
input_channels,
output_sample_rate,
output_channels,
output_bitrate // 1000,
)
@property
def audio_specific_config(self) -> bytes | None:
"""
AAC AudioSpecificConfig from the encoder (available after first encode).
This is needed for the MP4 esds box.
"""
if self._audio_specific_config is not None:
return self._audio_specific_config
# PyAV exposes extradata after the encoder is opened
if self._encoder.extradata:
self._audio_specific_config = bytes(self._encoder.extradata)
return self._audio_specific_config
return None
@property
def output_sample_rate(self) -> int:
return self._encoder.sample_rate
@property
def output_channels(self) -> int:
return self._encoder.channels
@property
def frame_size(self) -> int:
"""AAC frame size (samples per frame), typically 1024."""
return self._encoder.frame_size or 1024
def transcode(self, raw_frame_data: bytes) -> list[bytes]:
"""
Transcode a raw audio frame from the input codec to AAC.
Args:
raw_frame_data: Raw audio frame bytes (one codec frame, e.g.,
one EAC3 sync frame).
Returns:
List of raw AAC frame bytes. May return 0, 1, or more frames
depending on codec frame sizes and buffering.
"""
output = []
# Parse raw bytes into packets
packets = self._decoder.parse(raw_frame_data)
for packet in packets:
# Decode to PCM frames
try:
decoded_frames = self._decoder.decode(packet)
except av.error.InvalidDataError as e:
logger.debug("[audio_transcoder] Decode error (skipping frame): %s", e)
continue
for frame in decoded_frames:
self._frames_decoded += 1
# Resample to match encoder format
resampled = self._resampler.resample(frame)
if resampled is None:
continue
# resampled can be a single frame or list of frames
if not isinstance(resampled, list):
resampled = [resampled]
for rs_frame in resampled:
# Encode to AAC
try:
encoded_packets = self._encoder.encode(rs_frame)
except av.error.InvalidDataError as e:
logger.debug("[audio_transcoder] Encode error: %s", e)
continue
for enc_packet in encoded_packets:
self._frames_encoded += 1
output.append(bytes(enc_packet))
return output
def flush(self) -> list[bytes]:
"""
Flush the decoder and encoder buffers.
Call this when the input stream ends to get remaining frames.
Returns:
List of remaining raw AAC frame bytes.
"""
output = []
# Flush decoder
try:
for frame in self._decoder.decode(None):
self._frames_decoded += 1
resampled = self._resampler.resample(frame)
if resampled is None:
continue
if not isinstance(resampled, list):
resampled = [resampled]
for rs_frame in resampled:
for enc_packet in self._encoder.encode(rs_frame):
self._frames_encoded += 1
output.append(bytes(enc_packet))
except Exception as e:
logger.debug("[audio_transcoder] Decoder flush error: %s", e)
# Flush resampler
try:
resampled = self._resampler.resample(None)
if resampled is not None:
if not isinstance(resampled, list):
resampled = [resampled]
for rs_frame in resampled:
for enc_packet in self._encoder.encode(rs_frame):
self._frames_encoded += 1
output.append(bytes(enc_packet))
except Exception as e:
logger.debug("[audio_transcoder] Resampler flush error: %s", e)
# Flush encoder
try:
for enc_packet in self._encoder.encode(None):
self._frames_encoded += 1
output.append(bytes(enc_packet))
except Exception as e:
logger.debug("[audio_transcoder] Encoder flush error: %s", e)
logger.info(
"[audio_transcoder] Flushed: %d decoded, %d encoded total",
self._frames_decoded,
self._frames_encoded,
)
return output
def generate_silence_frame(self) -> bytes | None:
"""Return a pre-encoded silent AAC frame (module-level singleton)."""
return _SILENCE_AAC_FRAME
def close(self) -> None:
"""Release codec contexts (best-effort; PyAV AudioCodecContext may not have close())."""
for ctx in (self._decoder, self._encoder):
try:
if hasattr(ctx, "close"):
ctx.close()
except Exception:
pass
def __del__(self) -> None:
self.close()