fix: expand us-cpa extraction review and rendering

This commit is contained in:
Stefano Fiorini
2026-03-15 03:01:16 -05:00
parent 6c02e0b7c6
commit fb39fe76cb
19 changed files with 693 additions and 56 deletions

View File

@@ -0,0 +1,54 @@
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Any
from pypdf import PdfReader
# Regex fragment with a single capture group for a numeric amount: an optional
# leading minus sign, one or more digits, optional comma-separated groups of
# three digits, and an optional decimal fraction (e.g. "-1,234.56").
# It is appended to label patterns, so the amount is always group 1.
_NUMBER = r"(-?\d+(?:,\d{3})*(?:\.\d+)?)"
def _parse_number(raw: str) -> float:
    """Convert a captured amount such as ``"1,234.50"`` into a float.

    Comma separators are removed before the conversion; ``float`` raises
    ``ValueError`` if what remains is not a valid number.
    """
    without_commas = "".join(raw.split(","))
    return float(without_commas)
def _extract_text(path: Path) -> str:
    """Return the text content of a supported document.

    ``.txt`` and ``.md`` files are decoded as UTF-8; ``.pdf`` files are read
    with ``PdfReader`` and the text of each page is joined with newlines.
    Any other suffix (compared case-insensitively) yields an empty string.
    """
    suffix = path.suffix.lower()
    if suffix in {".txt", ".md"}:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding (non-ASCII names would otherwise be
        # mangled or fail to decode on non-UTF-8 systems).
        return path.read_text(encoding="utf-8")
    if suffix == ".pdf":
        reader = PdfReader(str(path))
        # `or ""` keeps the join safe when a page yields no text.
        return "\n".join((page.extract_text() or "") for page in reader.pages)
    return ""
def _facts_from_text(text: str) -> dict[str, Any]:
    """Scan document text for labelled values and return the facts found.

    The taxpayer name comes from an ``Employee:`` line, falling back to a
    ``Recipient:`` line. Amounts are looked up case-insensitively by their
    label and parsed with ``_parse_number``. Keys are only present for
    labels that actually matched.
    """
    facts: dict[str, Any] = {}

    # The first label that matches supplies the name, so "Employee:" takes
    # precedence over "Recipient:".
    for name_pattern in (r"Employee:\s*(.+)", r"Recipient:\s*(.+)"):
        name_hit = re.search(name_pattern, text)
        if name_hit is not None:
            facts["taxpayer.fullName"] = name_hit.group(1).strip()
            break

    # (fact key, label pattern); the shared amount pattern is appended below.
    amount_labels = (
        ("wages", r"Box 1 Wages, tips, other compensation\s+"),
        ("federalWithholding", r"Box 2 Federal income tax withheld\s+"),
        ("taxableInterest", r"Box 1 Interest Income\s+"),
        ("businessIncome", r"Net profit(?: or loss)?\s+"),
    )
    for fact_key, label_pattern in amount_labels:
        amount_hit = re.search(label_pattern + _NUMBER, text, re.IGNORECASE)
        if amount_hit is not None:
            facts[fact_key] = _parse_number(amount_hit.group(1))

    return facts
def extract_document_facts(path: Path) -> dict[str, Any]:
    """Extract known facts from a source document.

    ``.json`` files (suffix compared case-insensitively) are parsed and
    returned as-is when the top-level value is an object; any other JSON
    value yields an empty dict. Every other file is converted to text and
    scanned for labelled values.

    Raises:
        json.JSONDecodeError: if a ``.json`` file does not contain valid JSON.
    """
    if path.suffix.lower() == ".json":
        # Hand json.loads the raw bytes so it detects the encoding itself
        # (UTF-8/16/32, including a UTF-8 BOM) instead of decoding with the
        # platform's locale encoding.
        payload = json.loads(path.read_bytes())
        return payload if isinstance(payload, dict) else {}
    return _facts_from_text(_extract_text(path))