75 lines
3.3 KiB
Python
75 lines
3.3 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from pypdf import PdfReader
|
|
|
|
|
|
# Regex fragment with a single capture group for a decimal number: an optional
# leading minus sign, one or more digits, optional ",ddd" thousands groups,
# and an optional fractional part. It is appended to label patterns below and
# the captured text is converted with _parse_number().
_NUMBER = r"(-?\d+(?:,\d{3})*(?:\.\d+)?)"
|
|
|
|
|
|
def _parse_number(raw: str) -> float:
    """Convert a numeric string such as ``"1,234.50"`` to a float.

    Every comma in *raw* is dropped before conversion; anything ``float()``
    cannot parse afterwards raises ``ValueError``.
    """
    without_commas = "".join(raw.split(","))
    return float(without_commas)
|
|
|
|
|
|
def _extract_text(path: Path) -> str:
    """Return the plain text content of a supported document.

    The file type is chosen from the (case-insensitive) suffix of *path*:

    * ``.txt`` / ``.md`` -- the file is decoded as UTF-8.
    * ``.pdf`` -- each page's extracted text is joined with newlines.
    * anything else -- an empty string is returned.
    """
    suffix = path.suffix.lower()
    if suffix in {".txt", ".md"}:
        # Decode explicitly instead of relying on the platform's locale
        # encoding, so the same file yields the same text everywhere.
        # Undecodable bytes become U+FFFD rather than aborting extraction.
        return path.read_text(encoding="utf-8", errors="replace")
    if suffix == ".pdf":
        reader = PdfReader(str(path))
        # A page whose extract_text() result is falsy contributes "".
        return "\n".join((page.extract_text() or "") for page in reader.pages)
    return ""
|
|
|
|
|
|
def _facts_from_text(text: str) -> dict[str, Any]:
    """Pull known tax facts out of free-form document text.

    The text is scanned with a fixed list of label patterns. The taxpayer
    name comes from an ``Employee:`` line, falling back to ``Recipient:``.
    Every other rule looks for a label followed by whitespace and a number
    (matched case-insensitively) and stores the parsed float under the
    associated key. Labels that do not occur are simply absent from the
    returned dict.
    """
    # Tried in order; the first pattern that matches supplies the name.
    name_patterns = (
        r"Employee:\s*(.+)",
        r"Recipient:\s*(.+)",
    )
    # (label pattern, fact key) pairs, in the order keys are inserted.
    numeric_rules = (
        (r"Box 1 Wages, tips, other compensation\s+", "wages"),
        (r"Box 2 Federal income tax withheld\s+", "federalWithholding"),
        (r"Box 16 State wages, tips, etc\.\s+", "stateWages"),
        (r"Box 17 State income tax\s+", "stateWithholding"),
        (r"Box 3 Social security wages\s+", "socialSecurityWages"),
        (r"Box 5 Medicare wages and tips\s+", "medicareWages"),
        (r"Box 1 Interest Income\s+", "taxableInterest"),
        (r"Box 1a Total ordinary dividends\s+", "ordinaryDividends"),
        (r"Box 1 Gross distribution\s+", "retirementDistribution"),
        (r"Box 3 Other income\s+", "otherIncome"),
        (r"Net profit(?: or loss)?\s+", "businessIncome"),
        (r"Adjusted gross income\s+", "priorYear.adjustedGrossIncome"),
        (r"Taxable income\s+", "priorYear.taxableIncome"),
        (r"Refund\s+", "priorYear.refund"),
    )

    facts: dict[str, Any] = {}

    for pattern in name_patterns:
        found = re.search(pattern, text)
        if found:
            # setdefault keeps an earlier (Employee) hit over a later one.
            facts.setdefault("taxpayer.fullName", found.group(1).strip())

    for label, key in numeric_rules:
        found = re.search(label + _NUMBER, text, re.I)
        if found:
            facts[key] = _parse_number(found.group(1))

    return facts
|
|
|
|
|
|
def extract_document_facts(path: Path) -> dict[str, Any]:
    """Extract tax facts from the document at *path*.

    ``.json`` files (suffix compared case-insensitively) are parsed and
    returned as-is when the top-level value is an object; any other JSON
    value yields an empty dict. Every other file is converted to text with
    ``_extract_text`` and scanned by ``_facts_from_text``.

    Raises:
        json.JSONDecodeError: if a ``.json`` file does not contain valid JSON.
    """
    if path.suffix.lower() == ".json":
        # Hand json.loads the raw bytes: it detects UTF-8/16/32 (including a
        # UTF-8 BOM) itself, so parsing does not depend on the platform's
        # locale encoding the way a bare read_text() does.
        payload = json.loads(path.read_bytes())
        return payload if isinstance(payload, dict) else {}
    return _facts_from_text(_extract_text(path))
|