from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any

# pypdf is only needed for ".pdf" inputs.  Guarding the import keeps the
# text and JSON paths usable when pypdf is not installed; the missing
# dependency is reported when a PDF is actually processed.
try:
    from pypdf import PdfReader
except ImportError:  # pragma: no cover - depends on the environment
    PdfReader = None  # type: ignore[assignment,misc]

# Regex fragment for one amount: an optional "$" that is skipped, then a
# captured number with optional leading "-", optional thousands separators
# and optional decimals (e.g. "1,234.56", "-42", "$52,000.50").
_NUMBER = r"\$?(-?\d+(?:,\d{3})*(?:\.\d+)?)"

# Patterns for the taxpayer name, in priority order: the first one that
# matches supplies "taxpayer.fullName".
_NAME_PATTERNS: tuple[re.Pattern[str], ...] = (
    re.compile(r"Employee:\s*(.+)"),
    re.compile(r"Recipient:\s*(.+)"),
)

# (fact key, label regex) pairs.  Each label must be followed by whitespace
# and an amount; labels are matched case-insensitively.  The order here is
# the insertion order of the keys in the returned dict.
_AMOUNT_LABELS: tuple[tuple[str, str], ...] = (
    ("wages", r"Box 1 Wages, tips, other compensation"),
    ("federalWithholding", r"Box 2 Federal income tax withheld"),
    ("stateWages", r"Box 16 State wages, tips, etc\."),
    ("stateWithholding", r"Box 17 State income tax"),
    ("socialSecurityWages", r"Box 3 Social security wages"),
    ("medicareWages", r"Box 5 Medicare wages and tips"),
    ("taxableInterest", r"Box 1 Interest Income"),
    ("ordinaryDividends", r"Box 1a Total ordinary dividends"),
    ("retirementDistribution", r"Box 1 Gross distribution"),
    ("otherIncome", r"Box 3 Other income"),
    ("businessIncome", r"Net profit(?: or loss)?"),
    ("priorYear.adjustedGrossIncome", r"Adjusted gross income"),
    ("priorYear.taxableIncome", r"Taxable income"),
    ("priorYear.refund", r"Refund"),
)

# Compiled once at import time so repeated extractions do no pattern setup.
_AMOUNT_PATTERNS: tuple[tuple[str, re.Pattern[str]], ...] = tuple(
    (key, re.compile(label + r"\s+" + _NUMBER, re.I))
    for key, label in _AMOUNT_LABELS
)


def _parse_number(raw: str) -> float:
    """Convert a captured amount such as ``"1,234.56"`` to a float."""
    return float(raw.replace(",", ""))


def _extract_text(path: Path) -> str:
    """Return the plain text of *path*, or ``""`` for unsupported suffixes.

    ``.txt`` and ``.md`` files are decoded as UTF-8 (a leading BOM is
    dropped, undecodable bytes are replaced) so the result does not depend
    on the platform's default encoding.  ``.pdf`` files are read with pypdf,
    joining the text of every page with newlines.

    Raises:
        ImportError: if *path* is a PDF and pypdf is not installed.
    """
    suffix = path.suffix.lower()
    if suffix in {".txt", ".md"}:
        # The labels and amounts searched for are ASCII, so replacing a stray
        # undecodable byte keeps extraction working instead of aborting.
        return path.read_text(encoding="utf-8-sig", errors="replace")
    if suffix == ".pdf":
        if PdfReader is None:
            raise ImportError("pypdf is required to extract text from PDF files")
        reader = PdfReader(str(path))
        # extract_text() may yield nothing for a page; treat that as "".
        return "\n".join((page.extract_text() or "") for page in reader.pages)
    return ""


def _facts_from_text(text: str) -> dict[str, Any]:
    """Pull known labelled facts out of free-form document text.

    Returns a dict containing only the facts whose label was found:
    ``"taxpayer.fullName"`` (a stripped string) and the numeric keys listed
    in ``_AMOUNT_LABELS`` (floats).  For every fact the first occurrence in
    *text* is used.
    """
    extracted: dict[str, Any] = {}

    # An "Employee:" line takes precedence over a "Recipient:" line.
    for name_pattern in _NAME_PATTERNS:
        if match := name_pattern.search(text):
            extracted["taxpayer.fullName"] = match.group(1).strip()
            break

    for key, amount_pattern in _AMOUNT_PATTERNS:
        if match := amount_pattern.search(text):
            extracted[key] = _parse_number(match.group(1))

    return extracted


def extract_document_facts(path: Path) -> dict[str, Any]:
    """Extract facts from the document at *path*.

    ``.json`` files are parsed and returned as-is when the top-level value
    is an object; any other JSON value yields ``{}``.  All other files go
    through text extraction and label matching, so unsupported suffixes
    also yield ``{}``.

    Raises:
        json.JSONDecodeError: if a ``.json`` file is not valid JSON.
        ImportError: if *path* is a PDF and pypdf is not installed.
    """
    if path.suffix.lower() == ".json":
        # "utf-8-sig" accepts plain UTF-8 and also drops a leading BOM,
        # which json.loads would otherwise reject.
        payload = json.loads(path.read_text(encoding="utf-8-sig"))
        return payload if isinstance(payload, dict) else {}
    return _facts_from_text(_extract_text(path))