fix: expand us-cpa extraction, review, and rendering

This commit is contained in:
Stefano Fiorini
2026-03-15 03:01:16 -05:00
parent 6c02e0b7c6
commit fb39fe76cb
19 changed files with 693 additions and 56 deletions

View File

@@ -0,0 +1,3 @@
Form 1099-INT
Recipient: Jane Doe
Box 1 Interest Income 1750

View File

@@ -0,0 +1,4 @@
Form W-2 Wage and Tax Statement
Employee: Jane Doe
Box 1 Wages, tips, other compensation 50000
Box 2 Federal income tax withheld 6000

View File

@@ -0,0 +1,16 @@
{
"taxYear": 2025,
"filingStatus": "single",
"requiredForms": ["f1040", "f1040sb"],
"income": {
"wages": 50000.0,
"taxableInterest": 1750.0,
"businessIncome": 0.0,
"capitalGainLoss": 0.0,
"rentalIncome": 0.0
},
"totals": {
"adjustedGrossIncome": 51750.0,
"taxableIncome": 36000.0
}
}

View File

@@ -51,6 +51,39 @@ class CaseManagerTests(unittest.TestCase):
facts = json.loads((case_dir / "extracted" / "facts.json").read_text())
self.assertEqual(facts["facts"]["filingStatus"]["value"], "single")
def test_intake_extracts_machine_usable_facts_from_text_documents(self) -> None:
    # Feed intake two plain-text documents (a W-2 and a 1099-INT) and check
    # that the persisted facts.json carries the box amounts as numeric values
    # and records the wages fact as coming from a document extract.
    w2_text = (
        "Form W-2 Wage and Tax Statement\n"
        "Employee: Jane Doe\n"
        "Box 1 Wages, tips, other compensation 50000\n"
        "Box 2 Federal income tax withheld 6000\n"
    )
    interest_text = (
        "Form 1099-INT\n"
        "Recipient: Jane Doe\n"
        "Box 1 Interest Income 1750\n"
    )
    expected_values = (
        ("wages", 50000.0),
        ("federalWithholding", 6000.0),
        ("taxableInterest", 1750.0),
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        workspace = Path(temp_dir)
        case_dir = workspace / "2025-jane-doe"

        w2_path = workspace / "w2.txt"
        w2_path.write_text(w2_text)
        interest_path = workspace / "1099-int.txt"
        interest_path.write_text(interest_text)

        manager = CaseManager(case_dir)
        manager.create_case(case_label="Jane Doe", tax_year=2025)
        outcome = manager.intake(
            tax_year=2025,
            user_facts={"filingStatus": "single"},
            document_paths=[w2_path, interest_path],
        )
        self.assertEqual(outcome["status"], "accepted")

        # The facts file lives inside the temp dir, so read it before cleanup.
        facts_file = case_dir / "extracted" / "facts.json"
        payload = json.loads(facts_file.read_text())
        for fact_name, amount in expected_values:
            self.assertEqual(payload["facts"][fact_name]["value"], amount)
        first_wage_source = payload["facts"]["wages"]["sources"][0]
        self.assertEqual(first_wage_source["sourceType"], "document_extract")
def test_conflicting_facts_raise_structured_issue(self) -> None:
with tempfile.TemporaryDirectory() as temp_dir:
case_dir = Path(temp_dir) / "2025-jane-doe"

View File

@@ -51,6 +51,36 @@ class QuestionEngineTests(unittest.TestCase):
self.assertEqual(analysis["riskLevel"], "high")
self.assertTrue(analysis["primaryLawRequired"])
self.assertIn("Internal Revenue Code", analysis["missingFacts"][0])
self.assertTrue(any(item["sourceClass"] == "internal_revenue_code" for item in analysis["authorities"]))
def test_capital_gains_question_returns_schedule_d_guidance(self) -> None:
    # A capital-gains question with a capitalGainLoss fact should resolve to
    # the Schedule D issue at medium confidence, without requiring primary
    # law, and cite the f1040sd authority.
    request = {
        "question": "Do I need Schedule D for capital gains?",
        "tax_year": 2025,
        "case_facts": {"capitalGainLoss": 400},
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        answer = self.build_engine(temp_dir).answer(**request)

        self.assertEqual(answer["issue"], "schedule_d_required")
        self.assertEqual(answer["confidence"], "medium")
        self.assertFalse(answer["primaryLawRequired"])
        cites_schedule_d = any(
            authority["slug"] == "f1040sd" for authority in answer["authorities"]
        )
        self.assertTrue(cites_schedule_d)
def test_schedule_e_question_returns_rental_guidance(self) -> None:
    # A rental-income question with a rentalIncome fact should resolve to the
    # Schedule E issue, not require primary law, and cite the f1040se authority.
    request = {
        "question": "Do I need Schedule E for rental income?",
        "tax_year": 2025,
        "case_facts": {"rentalIncome": 1200},
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        answer = self.build_engine(temp_dir).answer(**request)

        self.assertEqual(answer["issue"], "schedule_e_required")
        self.assertFalse(answer["primaryLawRequired"])
        cites_schedule_e = any(
            authority["slug"] == "f1040se" for authority in answer["authorities"]
        )
        self.assertTrue(cites_schedule_e)
def test_renderers_produce_conversation_and_memo(self) -> None:
analysis = {

View File

@@ -13,6 +13,47 @@ from us_cpa.sources import TaxYearCorpus
class RendererTests(unittest.TestCase):
def test_render_case_forms_prefers_fillable_pdf_fields_when_available(self) -> None:
    # Place a small PDF with two AcroForm text fields at the corpus location
    # for f1040, then check the first rendered artifact reports the
    # "field_fill" method and is not flagged for review.
    labelled_fields = (
        ("Name", "taxpayer_full_name", 720),
        ("Wages", "wages", 680),
    )
    return_data = {
        "taxYear": 2025,
        "requiredForms": ["f1040"],
        "taxpayer": {"fullName": "Jane Doe"},
        "filingStatus": "single",
        "income": {
            "wages": 50000.0,
            "taxableInterest": 100.0,
            "businessIncome": 0.0,
            "capitalGainLoss": 0.0,
            "rentalIncome": 0.0,
        },
        "deductions": {
            "standardDeduction": 15750.0,
            "deductionType": "standard",
            "deductionAmount": 15750.0,
        },
        "adjustments": {"hsaContribution": 0.0},
        "credits": {
            "educationCredit": 0.0,
            "foreignTaxCredit": 0.0,
            "energyCredit": 0.0,
        },
        "taxes": {
            "totalTax": 3883.5,
            "additionalMedicareTax": 0.0,
            "netInvestmentIncomeTax": 0.0,
            "alternativeMinimumTax": 0.0,
            "additionalTaxPenalty": 0.0,
        },
        "payments": {"federalWithholding": 6000.0},
        "business": {"qualifiedBusinessIncome": 0.0},
        "basis": {"traditionalIraBasis": 0.0},
        "depreciation": {"depreciationExpense": 0.0},
        "assetSales": {"section1231GainLoss": 0.0},
        "totals": {
            "adjustedGrossIncome": 50100.0,
            "taxableIncome": 34350.0,
            "refund": 2116.5,
            "balanceDue": 0.0,
        },
    }

    with tempfile.TemporaryDirectory() as temp_dir:
        scratch = Path(temp_dir)
        case_dir = scratch / "case"
        (case_dir / "output").mkdir(parents=True)
        corpus = TaxYearCorpus(cache_root=scratch / "cache")
        irs_dir = corpus.paths_for_year(2025).irs_dir
        irs_dir.mkdir(parents=True, exist_ok=True)

        # Draw each label with its text field placed 10 points lower.
        pdf_bytes = BytesIO()
        sheet = canvas.Canvas(pdf_bytes)
        acro_form = sheet.acroForm
        for label, field_name, label_y in labelled_fields:
            sheet.drawString(72, label_y, label)
            acro_form.textfield(
                name=field_name, x=120, y=label_y - 10, width=200, height=20
            )
        sheet.save()
        (irs_dir / "f1040.pdf").write_bytes(pdf_bytes.getvalue())

        rendered = render_case_forms(case_dir, corpus, return_data)

        self.assertEqual(rendered["artifacts"][0]["renderMethod"], "field_fill")
        self.assertFalse(rendered["artifacts"][0]["reviewRequired"])
def test_render_case_forms_writes_overlay_artifacts_and_flags_review(self) -> None:
with tempfile.TemporaryDirectory() as temp_dir:
case_dir = Path(temp_dir) / "case"
@@ -32,10 +73,16 @@ class RendererTests(unittest.TestCase):
"requiredForms": ["f1040"],
"taxpayer": {"fullName": "Jane Doe"},
"filingStatus": "single",
"income": {"wages": 50000.0, "taxableInterest": 100.0, "businessIncome": 0.0},
"deductions": {"standardDeduction": 15750.0},
"taxes": {"totalTax": 3883.5},
"income": {"wages": 50000.0, "taxableInterest": 100.0, "businessIncome": 0.0, "capitalGainLoss": 0.0, "rentalIncome": 0.0},
"deductions": {"standardDeduction": 15750.0, "deductionType": "standard", "deductionAmount": 15750.0},
"adjustments": {"hsaContribution": 0.0},
"credits": {"educationCredit": 0.0, "foreignTaxCredit": 0.0, "energyCredit": 0.0},
"taxes": {"totalTax": 3883.5, "additionalMedicareTax": 0.0, "netInvestmentIncomeTax": 0.0, "alternativeMinimumTax": 0.0, "additionalTaxPenalty": 0.0},
"payments": {"federalWithholding": 6000.0},
"business": {"qualifiedBusinessIncome": 0.0},
"basis": {"traditionalIraBasis": 0.0},
"depreciation": {"depreciationExpense": 0.0},
"assetSales": {"section1231GainLoss": 0.0},
"totals": {"adjustedGrossIncome": 50100.0, "taxableIncome": 34350.0, "refund": 2116.5, "balanceDue": 0.0},
}

View File

@@ -37,11 +37,11 @@ class ReturnModelTests(unittest.TestCase):
self.assertEqual(
resolve_required_forms(normalized),
["f1040", "f1040sb", "f1040sc", "f1040se", "f1040s1"],
["f1040", "f1040sb", "f1040sc", "f1040sse", "f1040s1", "f8995"],
)
def test_tax_bracket_calculation_uses_2025_single_rates(self) -> None:
self.assertEqual(tax_on_ordinary_income(34350.0, "single"), 3883.5)
self.assertEqual(tax_on_ordinary_income(34350.0, "single", 2025), 3883.5)
def test_tax_bracket_calculation_uses_selected_tax_year(self) -> None:
self.assertEqual(tax_on_ordinary_income(33650.0, "single", 2024), 3806.0)
@@ -50,6 +50,53 @@ class ReturnModelTests(unittest.TestCase):
with self.assertRaisesRegex(ValueError, "Unsupported tax year"):
normalize_case_facts({"filingStatus": "single"}, 2023)
def test_normalize_case_facts_preserves_provenance_and_expands_form_resolution(self) -> None:
    # Normalize a broad set of facts for 2025 and check that spouse and
    # dependent details survive, the wages source metadata is exposed under
    # provenance["income.wages"], and every expected form is required.
    raw_facts = {
        "taxpayer.fullName": "Jane Doe",
        "spouse.fullName": "John Doe",
        "dependents": [{"fullName": "Kid Doe", "ssnLast4": "4321"}],
        "filingStatus": "married_filing_jointly",
        "wages": 50000,
        "taxableInterest": 2001,
        "capitalGainLoss": 400,
        "rentalIncome": 1200,
        "itemizedDeductions": 40000,
        "hsaContribution": 1000,
        "educationCredit": 500,
        "foreignTaxCredit": 250,
        "qualifiedBusinessIncome": 12000,
        "traditionalIraBasis": 6000,
        "additionalMedicareTax": 100,
        "netInvestmentIncomeTax": 200,
        "alternativeMinimumTax": 300,
        "additionalTaxPenalty": 50,
        "energyCredit": 600,
        "_factMetadata": {
            "wages": {
                "sources": [
                    {"sourceType": "document_extract", "documentName": "w2.txt"}
                ]
            },
        },
    }
    # Checked in this order; the first missing slug fails the test.
    expected_forms = (
        "f1040sa",
        "f1040sd",
        "f8949",
        "f1040se",
        "f8889",
        "f8863",
        "f1116",
        "f8995",
        "f8606",
        "f8959",
        "f8960",
        "f6251",
        "f5329",
        "f5695",
    )

    result = normalize_case_facts(raw_facts, 2025)

    self.assertEqual(result["spouse"]["fullName"], "John Doe")
    self.assertEqual(result["dependents"][0]["fullName"], "Kid Doe")
    wage_sources = result["provenance"]["income.wages"]["sources"]
    self.assertEqual(wage_sources[0]["documentName"], "w2.txt")
    for form_slug in expected_forms:
        self.assertIn(form_slug, result["requiredForms"])
if __name__ == "__main__":
unittest.main()

View File

@@ -64,6 +64,44 @@ class ReviewEngineTests(unittest.TestCase):
self.assertIn("adjusted gross income", review["findings"][0]["title"].lower())
self.assertTrue(any("missing rendered artifact" in item["title"].lower() for item in review["findings"]))
def test_review_detects_reporting_omissions_from_source_facts(self) -> None:
    # Zero out taxable interest on the normalized return while the extracted
    # facts still carry 1750.0 from a document; the review should report a
    # "likely omitted taxable interest" finding.
    extracted_interest = {
        "value": 1750.0,
        "sources": [{"sourceType": "document_extract", "sourceName": "1099-int.txt"}],
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        case_dir, corpus = self.build_prepared_case(temp_dir)

        return_file = case_dir / "return" / "normalized-return.json"
        return_data = json.loads(return_file.read_text())
        return_data["income"]["taxableInterest"] = 0.0
        return_data["totals"]["adjustedGrossIncome"] = 50000.0
        return_file.write_text(json.dumps(return_data, indent=2))

        facts_file = case_dir / "extracted" / "facts.json"
        facts_data = json.loads(facts_file.read_text())
        facts_data["facts"]["taxableInterest"] = extracted_interest
        facts_file.write_text(json.dumps(facts_data, indent=2))

        outcome = ReviewEngine(corpus=corpus).review_case(case_dir)

        omission_reported = any(
            "likely omitted taxable interest" in finding["title"].lower()
            for finding in outcome["findings"]
        )
        self.assertTrue(omission_reported)
def test_review_flags_high_complexity_positions_for_specialist_follow_up(self) -> None:
    # Add form f6251 and a non-zero alternativeMinimumTax to the normalized
    # return; the review should report a "high-complexity tax position" finding.
    with tempfile.TemporaryDirectory() as temp_dir:
        case_dir, corpus = self.build_prepared_case(temp_dir)

        return_file = case_dir / "return" / "normalized-return.json"
        return_data = json.loads(return_file.read_text())
        return_data["requiredForms"].append("f6251")
        return_data["taxes"]["alternativeMinimumTax"] = 300.0
        return_file.write_text(json.dumps(return_data, indent=2))

        outcome = ReviewEngine(corpus=corpus).review_case(case_dir)

        complexity_reported = any(
            "high-complexity tax position" in finding["title"].lower()
            for finding in outcome["findings"]
        )
        self.assertTrue(complexity_reported)
def test_review_renderers_produce_summary_and_memo(self) -> None:
review = {
"status": "reviewed",

View File

@@ -12,6 +12,7 @@ from us_cpa.sources import (
authority_rank_for,
bootstrap_irs_catalog,
build_irs_prior_pdf_url,
build_primary_law_authorities,
)
@@ -42,6 +43,17 @@ class SourceCatalogTests(unittest.TestCase):
self.assertGreaterEqual(len(catalog), 5)
self.assertEqual(catalog[0].url, "https://www.irs.gov/pub/irs-prior/f1040--2025.pdf")
self.assertTrue(any(item.slug == "i1040gi" for item in catalog))
self.assertTrue(any(item.slug == "f1040sse" for item in catalog))
def test_primary_law_authorities_build_official_urls(self) -> None:
    # A question naming both a Code section and a Treasury regulation should
    # yield at least one authority of each source class, with URLs on
    # uscode.house.gov and ecfr.gov.
    question = "Does section 469 apply and what does Treas. Reg. 1.469-1 say?"
    found = build_primary_law_authorities(question)

    for source_class in ("internal_revenue_code", "treasury_regulation"):
        self.assertTrue(any(entry["sourceClass"] == source_class for entry in found))
    for host in ("uscode.house.gov", "ecfr.gov"):
        self.assertTrue(any(host in entry["url"] for entry in found))
class TaxYearCorpusTests(unittest.TestCase):