stef-openclaw-skills/skills/us-cpa/src/us_cpa/document_extractors.py

from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any

from pypdf import PdfReader


# Regex fragment that captures a single number in group 1: an optional leading
# minus sign, digits with optional comma-separated groups of three, and an
# optional decimal part. It is appended to label patterns in this module, and
# the captured text is read back through ``match.group(1)``.
_NUMBER = r"(-?\d+(?:,\d{3})*(?:\.\d+)?)"


def _parse_number(raw: str) -> float:
    return float(raw.replace(",", ""))


def _extract_text(path: Path) -> str:
    suffix = path.suffix.lower()
    if suffix in {".txt", ".md"}:
        return path.read_text()
    if suffix == ".pdf":
        reader = PdfReader(str(path))
        return "\n".join((page.extract_text() or "") for page in reader.pages)
    return ""


def _facts_from_text(text: str) -> dict[str, Any]:
    """Scan document text for labelled values and collect them as facts.

    The taxpayer name is taken from an ``Employee:`` line, falling back to a
    ``Recipient:`` line when no employee line is present (both labels are
    matched case-sensitively). Each numeric label is matched
    case-insensitively, must be followed by whitespace and a number, and only
    its first occurrence in the text is used.

    Args:
        text: Raw text extracted from a document.

    Returns:
        A mapping of fact keys to values; labels that do not appear in the
        text are simply absent from the result.
    """
    # (fact key, label regex) pairs; the order here fixes the key order of the
    # returned dict.
    numeric_labels = (
        ("wages", r"Box 1 Wages, tips, other compensation"),
        ("federalWithholding", r"Box 2 Federal income tax withheld"),
        ("stateWages", r"Box 16 State wages, tips, etc\."),
        ("stateWithholding", r"Box 17 State income tax"),
        ("socialSecurityWages", r"Box 3 Social security wages"),
        ("medicareWages", r"Box 5 Medicare wages and tips"),
        ("taxableInterest", r"Box 1 Interest Income"),
        ("ordinaryDividends", r"Box 1a Total ordinary dividends"),
        ("retirementDistribution", r"Box 1 Gross distribution"),
        ("otherIncome", r"Box 3 Other income"),
        ("businessIncome", r"Net profit(?: or loss)?"),
        ("priorYear.adjustedGrossIncome", r"Adjusted gross income"),
        ("priorYear.taxableIncome", r"Taxable income"),
        ("priorYear.refund", r"Refund"),
    )

    facts: dict[str, Any] = {}

    # The first name label that matches wins, so "Employee:" takes precedence
    # over "Recipient:".
    for name_pattern in (r"Employee:\s*(.+)", r"Recipient:\s*(.+)"):
        name_hit = re.search(name_pattern, text)
        if name_hit is not None:
            facts["taxpayer.fullName"] = name_hit.group(1).strip()
            break

    for fact_key, label in numeric_labels:
        number_hit = re.search(label + r"\s+" + _NUMBER, text, re.I)
        if number_hit is None:
            continue
        facts[fact_key] = _parse_number(number_hit.group(1))

    return facts


def extract_document_facts(path: Path) -> dict[str, Any]:
    """Load facts from a document file.

    A ``.json`` file is parsed and returned as-is when its top-level value is
    an object; any other JSON top-level value yields an empty dict. Every
    other file type is converted to text and scanned for labelled values.

    Args:
        path: Location of the document on disk.

    Returns:
        A mapping of fact keys to values (possibly empty).

    Raises:
        ValueError: If a ``.json`` file does not contain valid JSON
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
            subclasses).
    """
    suffix = path.suffix.lower()
    if suffix == ".json":
        # Hand json.loads the raw bytes: it detects UTF-8/16/32 (including a
        # BOM) itself, so parsing does not depend on the locale's encoding.
        payload = json.loads(path.read_bytes())
        return payload if isinstance(payload, dict) else {}
    return _facts_from_text(_extract_text(path))