fix: expand us-cpa extraction review and rendering

2026-03-15 03:01:16 -05:00
parent 6c02e0b7c6
commit fb39fe76cb
19 changed files with 693 additions and 56 deletions
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+from typing import Any
+
+from pypdf import PdfReader
+
+
+# Regex fragment with one capture group for a numeric amount: an optional
+# leading minus sign, digits with optional comma-separated groups of three,
+# and an optional decimal fraction. It is appended to the label patterns below.
+_NUMBER = r"(-?\d+(?:,\d{3})*(?:\.\d+)?)"
+
+
+def _parse_number(raw: str) -> float:
+    """Convert a numeric string that may contain thousands commas to a float.
+
+    Raises:
+        ValueError: if the text left after removing commas is not a valid
+            float literal.
+    """
+    without_commas = raw.replace(",", "")
+    return float(without_commas)
+
+
+def _extract_text(path: Path) -> str:
+    """Return the textual content of a source document.
+
+    Args:
+        path: Location of the document; the (case-insensitive) file suffix
+            selects how it is read.
+
+    Returns:
+        The file contents for ``.txt`` / ``.md`` files, the newline-joined
+        per-page text for ``.pdf`` files, and an empty string for any other
+        suffix.
+    """
+    suffix = path.suffix.lower()
+    if suffix in {".txt", ".md"}:
+        # Decode explicitly as UTF-8 so the result does not depend on the
+        # host's locale encoding (e.g. cp1252 on Windows).
+        return path.read_text(encoding="utf-8")
+    if suffix == ".pdf":
+        reader = PdfReader(str(path))
+        # extract_text() may yield a falsy value for a page; treat it as "".
+        return "\n".join((page.extract_text() or "") for page in reader.pages)
+    return ""
+
+
+def _facts_from_text(text: str) -> dict[str, Any]:
+    """Scan document text for labelled values and return them as facts.
+
+    The taxpayer name comes from an ``Employee:`` line, falling back to a
+    ``Recipient:`` line only when no employee name was recorded. Numeric
+    facts are taken from the first case-insensitive match of each label
+    followed by whitespace and a number. Labels that do not appear are
+    simply absent from the result.
+    """
+    facts: dict[str, Any] = {}
+
+    employee = re.search(r"Employee:\s*(.+)", text)
+    if employee is not None:
+        facts["taxpayer.fullName"] = employee.group(1).strip()
+    recipient = re.search(r"Recipient:\s*(.+)", text)
+    if recipient is not None and "taxpayer.fullName" not in facts:
+        facts["taxpayer.fullName"] = recipient.group(1).strip()
+
+    # (fact key, label pattern) pairs, in the order the keys are inserted.
+    amount_labels = (
+        ("wages", r"Box 1 Wages, tips, other compensation\s+"),
+        ("federalWithholding", r"Box 2 Federal income tax withheld\s+"),
+        ("taxableInterest", r"Box 1 Interest Income\s+"),
+        ("businessIncome", r"Net profit(?: or loss)?\s+"),
+    )
+    for key, label in amount_labels:
+        hit = re.search(label + _NUMBER, text, re.I)
+        if hit is not None:
+            facts[key] = _parse_number(hit.group(1))
+
+    return facts
+
+
+def extract_document_facts(path: Path) -> dict[str, Any]:
+    """Extract facts from a single source document.
+
+    Args:
+        path: Location of the document; the (case-insensitive) file suffix
+            selects the extraction strategy.
+
+    Returns:
+        For ``.json`` files, the parsed payload when it is a JSON object and
+        an empty dict otherwise. For every other suffix, the facts parsed
+        from the document's extracted text.
+
+    Raises:
+        json.JSONDecodeError: if a ``.json`` file does not contain valid JSON.
+    """
+    suffix = path.suffix.lower()
+    if suffix == ".json":
+        # JSON interchange is UTF-8; decode explicitly rather than relying on
+        # the host's locale encoding.
+        payload = json.loads(path.read_text(encoding="utf-8"))
+        if isinstance(payload, dict):
+            return payload
+        return {}
+    return _facts_from_text(_extract_text(path))