# UnHided/mediaflow_proxy/utils/base64_utils.py

import base64
import logging
from typing import Optional
from urllib.parse import urlparse

logger = logging.getLogger(__name__)


def is_base64_url(url: str) -> bool:
    """
    Check if a URL appears to be base64 encoded.

    This is a heuristic: the string must not start with a known URL scheme,
    must be at least 10 characters long, and must consist solely of characters
    from the base64 alphabets. Both the standard alphabet (``+`` and ``/``) and
    the URL-safe alphabet (``-`` and ``_``) are accepted, because
    ``encode_url_to_base64`` produces URL-safe output by default and
    ``decode_base64_url`` accepts either form.

    Args:
        url (str): The URL to check.

    Returns:
        bool: True if the URL appears to be base64 encoded, False otherwise.
    """
    # Anything that already starts with a plain scheme is a regular URL.
    if url.startswith(("http://", "https://", "ftp://", "ftps://")):
        return False

    # Too short to be a meaningful base64 encoded URL (also rejects "").
    if len(url) < 10:
        return False

    # Standard alphabet, URL-safe substitutes (- and _), and padding (=).
    base64_chars = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/-_="
    )

    # Any character outside the alphabet (":", ".", "?", whitespace, ...)
    # means this is not a base64 payload.
    return base64_chars.issuperset(url)


def decode_base64_url(encoded_url: str) -> Optional[str]:
    """
    Decode a base64 encoded URL.

    Accepts both the standard and the URL-safe base64 alphabets, with or
    without trailing ``=`` padding. The decoded text is only returned when it
    parses as a URL with both a scheme and a network location.

    Args:
        encoded_url (str): The base64 encoded URL string.

    Returns:
        Optional[str]: The decoded URL if successful, None if decoding fails
        or the decoded text is not a valid URL.
    """
    try:
        # Map the URL-safe alphabet (- and _) back onto the standard one (+ and /).
        standard = encoded_url.translate(str.maketrans("-_", "+/"))

        # Restore any stripped padding so the length is a multiple of four.
        standard += "=" * (-len(standard) % 4)

        decoded_url = base64.b64decode(standard).decode("utf-8")
        parsed = urlparse(decoded_url)
    except ValueError as e:
        # binascii.Error (bad base64) and UnicodeDecodeError (bad UTF-8) are
        # both ValueError subclasses, so this single clause covers them all.
        logger.debug(f"Failed to decode base64 URL '{encoded_url[:50]}...': {e}")
        return None
    else:
        # Require both a scheme and a host for the result to count as a URL.
        if not (parsed.scheme and parsed.netloc):
            logger.warning(f"Decoded string is not a valid URL: {decoded_url}")
            return None

        logger.info(f"Successfully decoded base64 URL: {encoded_url[:50]}... -> {decoded_url}")
        return decoded_url


def encode_url_to_base64(url: str, url_safe: bool = True) -> str:
    """
    Encode a URL to base64.

    With ``url_safe`` enabled the URL-safe alphabet is used and trailing ``=``
    padding is stripped; otherwise standard base64 with padding is produced.

    Args:
        url (str): The URL to encode.
        url_safe (bool): Whether to use URL-safe base64 encoding (default: True).

    Returns:
        str: The base64 encoded URL.

    Raises:
        Exception: Any error raised while encoding is logged and re-raised.
    """
    try:
        raw = url.encode("utf-8")
        if not url_safe:
            # Standard alphabet, padding kept.
            encoded = base64.b64encode(raw).decode("utf-8")
        else:
            # URL-safe alphabet (- and _), padding dropped for cleaner URLs.
            encoded = base64.urlsafe_b64encode(raw).decode("utf-8").rstrip("=")

        logger.debug(f"Encoded URL to base64: {url} -> {encoded}")
        return encoded

    except Exception as e:
        logger.error(f"Failed to encode URL to base64: {e}")
        raise


def process_potential_base64_url(url: str) -> str:
    """
    Return the decoded form of a URL that might be base64 encoded.

    The input is decoded only when it looks like base64 and the decoded text
    is a valid URL; in every other case the input is returned unchanged.

    Args:
        url (str): The URL to process.

    Returns:
        str: The processed URL (decoded if it was base64, original otherwise).
    """
    # Not base64-looking: hand the input back untouched.
    if not is_base64_url(url):
        return url

    decoded_url = decode_base64_url(url)
    if decoded_url:
        return decoded_url

    # Looked like base64 but did not yield a usable URL; fall back to the input.
    logger.warning(f"URL appears to be base64 but failed to decode: {url[:50]}...")
    return url