UnHided/mediaflow_proxy/utils/stream_transformers.py

"""
Stream transformers for host-specific content manipulation.

This module provides transformer classes that can modify streaming content
on-the-fly. Each transformer handles specific content manipulation needs
for different streaming hosts (e.g., PNG wrapper stripping, TS detection).
"""

import logging
import typing

logger = logging.getLogger(__name__)


class StreamTransformer:
    """
    Base class for stream content transformers.

    Subclasses should override the transform method to implement
    specific content manipulation logic.
    """

    async def transform(self, chunk_iterator: typing.AsyncIterator[bytes]) -> typing.AsyncGenerator[bytes, None]:
        """
        Transform stream chunks.

        Args:
            chunk_iterator: Async iterator of raw bytes from upstream.

        Yields:
            Transformed bytes chunks.
        """
        async for chunk in chunk_iterator:
            yield chunk


class TSStreamTransformer(StreamTransformer):
    """
    Transformer for MPEG-TS streams with obfuscation.

    Handles streams from hosts like TurboVidPlay, StreamWish, and FileMoon
    that may have:
    - Fake PNG wrapper prepended to video data
    - 0xFF padding bytes before actual content
    - Need for TS sync byte detection
    """

    # PNG signature and IEND marker for fake PNG header detection
    _PNG_SIGNATURE = b"\x89PNG\r\n\x1a\n"
    _PNG_IEND_MARKER = b"\x49\x45\x4e\x44\xae\x42\x60\x82"

    # TS packet constants
    _TS_SYNC = 0x47
    _TS_PACKET_SIZE = 188

    # Maximum bytes to buffer before forcing passthrough
    _MAX_PREFETCH = 512 * 1024  # 512 KB

    def __init__(self):
        self.buffer = bytearray()
        self.ts_started = False
        self.bytes_stripped = 0

    @staticmethod
    def _find_ts_start(buffer: bytes) -> typing.Optional[int]:
        """
        Find MPEG-TS sync byte (0x47) aligned on 188 bytes.

        Args:
            buffer: Bytes to search for TS sync pattern.

        Returns:
            Offset where TS starts, or None if not found.
        """
        TS_SYNC = 0x47
        TS_PACKET = 188

        max_i = len(buffer) - TS_PACKET
        for i in range(max(0, max_i)):
            if buffer[i] == TS_SYNC and buffer[i + TS_PACKET] == TS_SYNC:
                return i
        return None

    def _strip_fake_png_wrapper(self, chunk: bytes) -> bytes:
        """
        Strip fake PNG wrapper from chunk data.

        Some streaming services prepend a fake PNG image to video data
        to evade detection. This method detects and removes it.

        Args:
            chunk: The raw chunk data that may contain a fake PNG header.

        Returns:
            The chunk with fake PNG wrapper removed, or original chunk if not present.
        """
        if not chunk.startswith(self._PNG_SIGNATURE):
            return chunk

        # Find the IEND marker that signals end of PNG data
        iend_pos = chunk.find(self._PNG_IEND_MARKER)
        if iend_pos == -1:
            # IEND not found in this chunk - return as-is to avoid data corruption
            logger.debug("PNG signature detected but IEND marker not found in chunk")
            return chunk

        # Calculate position after IEND marker
        content_start = iend_pos + len(self._PNG_IEND_MARKER)

        # Skip any padding bytes (null or 0xFF) between PNG and actual content
        while content_start < len(chunk) and chunk[content_start] in (0x00, 0xFF):
            content_start += 1

        self.bytes_stripped = content_start
        logger.debug(f"Stripped {content_start} bytes of fake PNG wrapper from stream")

        return chunk[content_start:]

    async def transform(self, chunk_iterator: typing.AsyncIterator[bytes]) -> typing.AsyncGenerator[bytes, None]:
        """
        Transform TS stream by stripping PNG wrapper and finding TS start.

        Args:
            chunk_iterator: Async iterator of raw bytes from upstream.

        Yields:
            Cleaned TS stream bytes.
        """
        async for chunk in chunk_iterator:
            if self.ts_started:
                # Normal streaming once TS has started
                yield chunk
                continue

            # Prebuffer phase (until we find TS or pass through)
            self.buffer += chunk

            # Fast-path: if it's an m3u8 playlist, don't do TS detection
            if len(self.buffer) >= 7 and self.buffer[:7] in (b"#EXTM3U", b"#EXT-X-"):
                yield bytes(self.buffer)
                self.buffer.clear()
                self.ts_started = True
                continue

            # Strip fake PNG wrapper if present
            if self.buffer.startswith(self._PNG_SIGNATURE):
                if self._PNG_IEND_MARKER in self.buffer:
                    self.buffer = bytearray(self._strip_fake_png_wrapper(bytes(self.buffer)))

            # Skip pure 0xFF padding bytes (TurboVid style)
            while self.buffer and self.buffer[0] == 0xFF:
                self.buffer.pop(0)

            # Re-check for m3u8 playlist after stripping PNG wrapper and padding
            # This handles cases where m3u8 content is wrapped in PNG
            if len(self.buffer) >= 7 and self.buffer[:7] in (b"#EXTM3U", b"#EXT-X-"):
                logger.debug("Found m3u8 content after stripping wrapper - passing through")
                yield bytes(self.buffer)
                self.buffer.clear()
                self.ts_started = True
                continue

            ts_offset = self._find_ts_start(bytes(self.buffer))
            if ts_offset is None:
                # Keep buffering until we find TS or hit limit
                if len(self.buffer) > self._MAX_PREFETCH:
                    logger.warning("TS sync not found after large prebuffer, forcing passthrough")
                    yield bytes(self.buffer)
                    self.buffer.clear()
                    self.ts_started = True
                continue

            # TS found: emit from ts_offset and switch to pass-through
            self.ts_started = True
            out = bytes(self.buffer[ts_offset:])
            self.buffer.clear()

            if out:
                yield out


# Registry of available transformers
TRANSFORMER_REGISTRY: dict[str, type[StreamTransformer]] = {
    "ts_stream": TSStreamTransformer,
}


def get_transformer(transformer_id: typing.Optional[str]) -> typing.Optional[StreamTransformer]:
    """
    Get a transformer instance by ID.

    Args:
        transformer_id: The transformer identifier (e.g., "ts_stream").

    Returns:
        A new transformer instance, or None if transformer_id is None or not found.
    """
    if transformer_id is None:
        return None

    transformer_class = TRANSFORMER_REGISTRY.get(transformer_id)
    if transformer_class is None:
        logger.warning(f"Unknown transformer ID: {transformer_id}")
        return None

    return transformer_class()


async def apply_transformer_to_bytes(
    data: bytes,
    transformer_id: typing.Optional[str],
) -> bytes:
    """
    Apply a transformer to already-downloaded bytes data.

    This is useful when serving cached segments that need transformation.
    Creates a single-chunk async iterator and collects the transformed output.

    Args:
        data: The raw bytes data to transform.
        transformer_id: The transformer identifier (e.g., "ts_stream").

    Returns:
        Transformed bytes, or original data if no transformer specified.
    """
    if not transformer_id:
        return data

    transformer = get_transformer(transformer_id)
    if not transformer:
        return data

    async def single_chunk_iterator():
        yield data

    # Collect all transformed chunks
    result = bytearray()
    async for chunk in transformer.transform(single_chunk_iterator()):
        result.extend(chunk)

    return bytes(result)