fix: expand us-cpa extraction review and rendering
This commit is contained in:
54
skills/us-cpa/src/us_cpa/document_extractors.py
Normal file
54
skills/us-cpa/src/us_cpa/document_extractors.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
# Regex fragment with a single capture group for a numeric amount: an optional
# leading minus sign, one or more digits, any number of ",ddd" groups, and an
# optional decimal fraction.  It is appended to label patterns in
# _facts_from_text.
_NUMBER = r"(-?\d+(?:,\d{3})*(?:\.\d+)?)"
|
||||
|
||||
|
||||
def _parse_number(raw: str) -> float:
    """Convert a numeric string to ``float``, ignoring any commas in it.

    Args:
        raw: Text such as ``"1,234.50"`` or ``"-12"``.

    Returns:
        The value parsed by ``float`` after every ``","`` has been removed.

    Raises:
        ValueError: If the comma-free text is not accepted by ``float``.
    """
    without_commas = "".join(raw.split(","))
    return float(without_commas)
|
||||
|
||||
|
||||
def _extract_text(path: Path) -> str:
    """Return the plain-text content of a supported document.

    ``.txt`` and ``.md`` files are read directly.  ``.pdf`` files are opened
    with ``PdfReader`` and the text of every page is joined with newlines.
    The suffix comparison is case-insensitive.

    Args:
        path: Location of the document on disk.

    Returns:
        The document text, or ``""`` when the suffix is not one of the
        supported types.
    """
    suffix = path.suffix.lower()
    if suffix in {".txt", ".md"}:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        return path.read_text(encoding="utf-8")
    if suffix == ".pdf":
        reader = PdfReader(str(path))
        # A page whose extract_text() result is falsy contributes an empty
        # string, so the join never sees a non-string value.
        return "\n".join((page.extract_text() or "") for page in reader.pages)
    return ""
|
||||
|
||||
|
||||
def _facts_from_text(text: str) -> dict[str, Any]:
    """Scan document text for known labels and collect the values found.

    The taxpayer name is taken from an ``Employee:`` line when present,
    otherwise from a ``Recipient:`` line.  Each numeric field is filled from
    the first case-insensitive match of its label followed by whitespace and
    a number matching ``_NUMBER``.

    Args:
        text: Plain text extracted from a document.

    Returns:
        A dict containing only the keys whose label was found:
        ``taxpayer.fullName`` (str) and ``wages``, ``federalWithholding``,
        ``taxableInterest``, ``businessIncome`` (float).
    """
    # Ordered by priority: the first pattern that matches supplies the name.
    name_patterns = (
        r"Employee:\s*(.+)",
        r"Recipient:\s*(.+)",
    )
    # (output key, label prefix); the numeric capture group is appended below.
    amount_fields = (
        ("wages", r"Box 1 Wages, tips, other compensation\s+"),
        ("federalWithholding", r"Box 2 Federal income tax withheld\s+"),
        ("taxableInterest", r"Box 1 Interest Income\s+"),
        ("businessIncome", r"Net profit(?: or loss)?\s+"),
    )

    facts: dict[str, Any] = {}

    for pattern in name_patterns:
        found = re.search(pattern, text)
        if found is not None:
            # setdefault keeps a name already supplied by an earlier pattern.
            facts.setdefault("taxpayer.fullName", found.group(1).strip())

    for key, label in amount_fields:
        found = re.search(label + _NUMBER, text, re.I)
        if found is not None:
            facts[key] = _parse_number(found.group(1))

    return facts
|
||||
|
||||
|
||||
def extract_document_facts(path: Path) -> dict[str, Any]:
    """Extract facts from a document on disk.

    A ``.json`` file (suffix compared case-insensitively) is parsed and
    returned unchanged when its top-level value is an object; any other
    top-level JSON value yields ``{}``.  Every other file is converted to
    text with ``_extract_text`` and scanned with ``_facts_from_text``.

    Args:
        path: Location of the document on disk.

    Returns:
        A dict of extracted facts, possibly empty.

    Raises:
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    if path.suffix.lower() == ".json":
        # JSON interchange is UTF-8; decode explicitly instead of relying on
        # the platform's locale encoding.
        payload = json.loads(path.read_text(encoding="utf-8"))
        return payload if isinstance(payload, dict) else {}
    return _facts_from_text(_extract_text(path))
|
||||
Reference in New Issue
Block a user