From faff555757edaabe267bfdcb9a03ce772422c200 Mon Sep 17 00:00:00 2001 From: Stefano Fiorini Date: Sun, 15 Mar 2026 00:56:07 -0500 Subject: [PATCH] feat: add us-cpa case intake workflow --- docs/us-cpa.md | 25 ++++- skills/us-cpa/SKILL.md | 2 + skills/us-cpa/src/us_cpa/cases.py | 157 ++++++++++++++++++++++++++++++ skills/us-cpa/src/us_cpa/cli.py | 41 +++++++- skills/us-cpa/tests/test_cases.py | 80 +++++++++++++++ skills/us-cpa/tests/test_cli.py | 62 ++++++++++++ 6 files changed, 365 insertions(+), 2 deletions(-) create mode 100644 skills/us-cpa/src/us_cpa/cases.py create mode 100644 skills/us-cpa/tests/test_cases.py diff --git a/docs/us-cpa.md b/docs/us-cpa.md index e76feac..e50b598 100644 --- a/docs/us-cpa.md +++ b/docs/us-cpa.md @@ -20,7 +20,7 @@ skills/us-cpa/scripts/us-cpa question --question "What is the standard deduction skills/us-cpa/scripts/us-cpa prepare --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe skills/us-cpa/scripts/us-cpa fetch-year --tax-year 2025 -skills/us-cpa/scripts/us-cpa extract-docs --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe +skills/us-cpa/scripts/us-cpa extract-docs --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe --create-case --case-label "Jane Doe" --facts-json ./facts.json skills/us-cpa/scripts/us-cpa render-forms --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe skills/us-cpa/scripts/us-cpa export-efile-ready --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe ``` @@ -71,6 +71,29 @@ Current `fetch-year` bootstrap corpus for tax year `2025` is verified against li sources/ ``` +Current implementation writes: + +- `case-manifest.json` +- `extracted/facts.json` +- `issues/open-issues.json` + +## Intake Flow + +Current `extract-docs` supports: + +- `--create-case` +- `--case-label` +- `--facts-json ` +- repeated `--input-file ` + +Behavior: + +- creates the full case directory layout when `--create-case` is used +- copies input 
documents into `input/` +- stores normalized user-statement facts in `extracted/facts.json` +- appends document registry entries to `case-manifest.json` +- stops with a structured issue and non-zero exit if a new fact conflicts with an existing stored fact + ## Output Contract - JSON by default diff --git a/skills/us-cpa/SKILL.md b/skills/us-cpa/SKILL.md index c212fe0..6b9e841 100644 --- a/skills/us-cpa/SKILL.md +++ b/skills/us-cpa/SKILL.md @@ -34,6 +34,7 @@ description: Use when answering U.S. federal individual tax questions, preparing skills/us-cpa/scripts/us-cpa question --question "What is the standard deduction?" --tax-year 2025 skills/us-cpa/scripts/us-cpa prepare --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe +skills/us-cpa/scripts/us-cpa extract-docs --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe --create-case --case-label "Jane Doe" --facts-json ./facts.json ``` ## Rules @@ -50,5 +51,6 @@ skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025- - markdown output available with `--format markdown` - `fetch-year` downloads the bootstrap IRS form/instruction corpus into `~/.cache/us-cpa` by default - override the cache root with `US_CPA_CACHE_DIR` when you need an isolated run or fixture generation +- `extract-docs` creates or opens a case, registers documents, stores facts, and stops with a structured issue if facts conflict For operator details, limitations, and the planned case structure, see `docs/us-cpa.md`. 
diff --git a/skills/us-cpa/src/us_cpa/cases.py b/skills/us-cpa/src/us_cpa/cases.py
new file mode 100644
index 0000000..9f8827d
--- /dev/null
+++ b/skills/us-cpa/src/us_cpa/cases.py
@@ -0,0 +1,202 @@
+from __future__ import annotations
+
+import hashlib
+import json
+import shutil
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+
+CASE_SUBDIRECTORIES = (
+    "input",
+    "extracted",
+    "return",
+    "output",
+    "reports",
+    "issues",
+    "sources",
+)
+
+
+def _timestamp() -> str:
+    """Return the current UTC time as an ISO-8601 string."""
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _sha256_path(path: Path) -> str:
+    """Return the hex SHA-256 digest of the file at *path*."""
+    digest = hashlib.sha256()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(65536), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def _read_json(path: Path) -> dict[str, Any]:
+    """Parse the UTF-8 JSON document stored at *path*."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _write_json(path: Path, payload: dict[str, Any]) -> None:
+    """Write *payload* to *path* as indented UTF-8 JSON."""
+    path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+
+
+class CaseConflictError(Exception):
+    """Raised when a new fact contradicts a stored one; see ``issue``."""
+
+    def __init__(self, issue: dict[str, Any]) -> None:
+        super().__init__(issue["message"])
+        self.issue = issue
+
+
+@dataclass
+class CaseManager:
+    """Create and update the on-disk files of one tax case directory."""
+
+    case_dir: Path
+
+    def __post_init__(self) -> None:
+        # Path() also accepts a plain string, so callers need not convert.
+        self.case_dir = Path(self.case_dir).expanduser().resolve()
+
+    @property
+    def manifest_path(self) -> Path:
+        return self.case_dir / "case-manifest.json"
+
+    @property
+    def facts_path(self) -> Path:
+        return self.case_dir / "extracted" / "facts.json"
+
+    @property
+    def issues_path(self) -> Path:
+        return self.case_dir / "issues" / "open-issues.json"
+
+    def create_case(self, *, case_label: str, tax_year: int) -> dict[str, Any]:
+        """Create the case layout and return the case manifest.
+
+        Missing directories and files are created. A manifest that is already
+        on disk is returned unchanged, so repeating the call cannot erase the
+        document registry or the original creation time.
+        """
+        self.case_dir.mkdir(parents=True, exist_ok=True)
+        for name in CASE_SUBDIRECTORIES:
+            (self.case_dir / name).mkdir(exist_ok=True)
+
+        if not self.facts_path.exists():
+            _write_json(self.facts_path, {"facts": {}})
+        if not self.issues_path.exists():
+            _write_json(self.issues_path, {"issues": []})
+        if self.manifest_path.exists():
+            return self.load_manifest()
+
+        created_at = _timestamp()
+        manifest = {
+            "caseLabel": case_label,
+            "taxYear": tax_year,
+            "createdAt": created_at,
+            "updatedAt": created_at,
+            "status": "open",
+            "documents": [],
+        }
+        _write_json(self.manifest_path, manifest)
+        return manifest
+
+    def load_manifest(self) -> dict[str, Any]:
+        """Return the parsed case manifest."""
+        return _read_json(self.manifest_path)
+
+    def _load_facts(self) -> dict[str, Any]:
+        return _read_json(self.facts_path)
+
+    def _write_manifest(self, manifest: dict[str, Any]) -> None:
+        manifest["updatedAt"] = _timestamp()
+        _write_json(self.manifest_path, manifest)
+
+    def _write_facts(self, facts: dict[str, Any]) -> None:
+        _write_json(self.facts_path, facts)
+
+    def _write_issue(self, issue: dict[str, Any]) -> None:
+        current = _read_json(self.issues_path)
+        current["issues"].append(issue)
+        _write_json(self.issues_path, current)
+
+    def intake(
+        self,
+        *,
+        tax_year: int,
+        user_facts: dict[str, Any],
+        document_paths: list[Path],
+    ) -> dict[str, Any]:
+        """Register documents and user-stated facts on the case.
+
+        Every check runs before a document is copied, so a rejected intake
+        leaves ``input/``, the manifest and the stored facts untouched; a
+        fact conflict only appends its issue to the open-issues file.
+
+        Raises:
+            ValueError: *tax_year* differs from the tax year of the case.
+            FileNotFoundError: an entry of *document_paths* is not a file.
+            CaseConflictError: a fact contradicts an already stored fact.
+        """
+        manifest = self.load_manifest()
+        if manifest["taxYear"] != tax_year:
+            raise ValueError(
+                f"Case tax year {manifest['taxYear']} does not match requested tax year {tax_year}."
+            )
+
+        sources = [Path(path).expanduser().resolve() for path in document_paths]
+        for source_path in sources:
+            if not source_path.is_file():
+                raise FileNotFoundError(f"Input document not found: {source_path}")
+
+        facts_payload = self._load_facts()
+        for field, value in user_facts.items():
+            existing = facts_payload["facts"].get(field)
+            if existing is not None and existing["value"] != value:
+                issue = {
+                    "status": "needs_resolution",
+                    "issueType": "fact_conflict",
+                    "field": field,
+                    "existingValue": existing["value"],
+                    "newValue": value,
+                    "message": f"Conflicting values for {field}. Resolve before continuing.",
+                    "createdAt": _timestamp(),
+                    "taxYear": tax_year,
+                }
+                self._write_issue(issue)
+                raise CaseConflictError(issue)
+
+            facts_payload["facts"][field] = {
+                "value": value,
+                "sourceType": "user_statement",
+                "capturedAt": _timestamp(),
+            }
+
+        registered_documents = []
+        for source_path in sources:
+            destination = self.case_dir / "input" / source_path.name
+            # copy2 raises SameFileError if the document is already in input/.
+            if source_path != destination:
+                shutil.copy2(source_path, destination)
+            document_entry = {
+                "name": source_path.name,
+                "sourcePath": str(source_path),
+                "storedPath": str(destination),
+                "sha256": _sha256_path(destination),
+                "registeredAt": _timestamp(),
+            }
+            manifest["documents"].append(document_entry)
+            registered_documents.append(document_entry)
+
+        self._write_manifest(manifest)
+        self._write_facts(facts_payload)
+        return {
+            "status": "accepted",
+            "caseDir": str(self.case_dir),
+            "taxYear": tax_year,
+            "registeredDocuments": registered_documents,
+            "factCount": len(facts_payload["facts"]),
+        }
diff --git a/skills/us-cpa/src/us_cpa/cli.py b/skills/us-cpa/src/us_cpa/cli.py
index 3fc8e1c..6d5b46f 100644
--- a/skills/us-cpa/src/us_cpa/cli.py
+++ b/skills/us-cpa/src/us_cpa/cli.py
@@ -6,6 +6,7 @@ import sys
 from pathlib import Path
 from typing import Any
 
+from us_cpa.cases import CaseConflictError, CaseManager
 from us_cpa.sources import TaxYearCorpus, bootstrap_irs_catalog
 
 COMMANDS = (
@@ -47,6 +48,12 @@ def _require_case_dir(args: argparse.Namespace) -> Path:
     return Path(args.case_dir).expanduser().resolve()
 
 
+def _load_json_file(path_value: str | None) -> dict[str, Any]:
+    if not path_value:
+        return {}
+    return json.loads(Path(path_value).expanduser().resolve().read_text(encoding="utf-8"))
+
+
 def build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         prog="us-cpa",
@@ -74,6 +81,10 @@ def build_parser() -> argparse.ArgumentParser:
         "extract-docs", help="Extract facts from case documents."
     )
     _add_common_arguments(extract_docs)
+    extract_docs.add_argument("--create-case", action="store_true")
+    extract_docs.add_argument("--case-label")
+    extract_docs.add_argument("--facts-json")
+    extract_docs.add_argument("--input-file", action="append", default=[])
 
     render_forms = subparsers.add_parser(
         "render-forms", help="Render compiled IRS forms."
@@ -103,7 +114,38 @@ def main(argv: list[str] | None = None) -> int:
         }
         return _emit(payload, args.format)
 
-    if args.command in {"prepare", "review", "extract-docs", "render-forms", "export-efile-ready"}:
+    if args.command == "extract-docs":
+        case_dir = _require_case_dir(args)
+        manager = CaseManager(case_dir)
+        if args.create_case:
+            if not args.case_label:
+                raise SystemExit("--case-label is required when --create-case is used.")
+            manager.create_case(case_label=args.case_label, tax_year=args.tax_year)
+        elif not manager.manifest_path.exists():
+            raise SystemExit("Case manifest not found. Use --create-case for a new case.")
+
+        try:
+            result = manager.intake(
+                tax_year=args.tax_year,
+                user_facts=_load_json_file(args.facts_json),
+                document_paths=[
+                    Path(path_value).expanduser().resolve() for path_value in args.input_file
+                ],
+            )
+        except CaseConflictError as exc:
+            print(json.dumps(exc.issue, indent=2))
+            return 1
+        except (ValueError, OSError) as exc:
+            # Year mismatch, malformed JSON or a missing file is a user error, not a crash.
+            raise SystemExit(str(exc)) from exc
+        payload = {
+            "command": args.command,
+            "format": args.format,
+            **result,
+        }
+        return _emit(payload, args.format)
+
+    if args.command in {"prepare", "review", "render-forms", "export-efile-ready"}:
         case_dir = _require_case_dir(args)
         payload = {
             "command": args.command,
diff --git a/skills/us-cpa/tests/test_cases.py b/skills/us-cpa/tests/test_cases.py
new file mode 100644
index 0000000..71fa379
--- /dev/null
+++ b/skills/us-cpa/tests/test_cases.py
@@ -0,0 +1,80 @@
+from __future__ import annotations
+
+import json
+import tempfile
+import unittest
+from pathlib import Path
+
+from us_cpa.cases import CaseConflictError, CaseManager
+
+
+class CaseManagerTests(unittest.TestCase):
+    def test_create_case_builds_expected_layout(self) -> None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            case_dir = Path(temp_dir) / "2025-jane-doe"
+            manager = CaseManager(case_dir)
+
+            manifest = manager.create_case(case_label="Jane Doe", tax_year=2025)
+
+            self.assertEqual(manifest["caseLabel"], "Jane Doe")
+            self.assertEqual(manifest["taxYear"], 2025)
+            for name in (
+                "input",
+                "extracted",
+                "return",
+                "output",
+                "reports",
+                "issues",
+                "sources",
+            ):
+                self.assertTrue((case_dir / name).is_dir())
+            self.assertTrue((case_dir / "case-manifest.json").exists())
+
+    def test_intake_registers_documents_and_user_facts(self) -> None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            root = Path(temp_dir)
+            case_dir = root / "2025-jane-doe"
+            document = root / "w2.txt"
+            document.write_text("sample w2")
+            manager = CaseManager(case_dir)
+            manager.create_case(case_label="Jane Doe", tax_year=2025)
+
+            result = manager.intake(
+                tax_year=2025,
+                user_facts={"filingStatus": "single", "taxpayer.ssnLast4": "1234"},
+                document_paths=[document],
+            )
+
+            self.assertEqual(result["status"], "accepted")
+            self.assertEqual(len(result["registeredDocuments"]), 1)
+            self.assertTrue((case_dir / "input" / "w2.txt").exists())
+            facts = json.loads((case_dir / "extracted" / "facts.json").read_text())
+            self.assertEqual(facts["facts"]["filingStatus"]["value"], "single")
+
+    def test_conflicting_facts_raise_structured_issue(self) -> None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            case_dir = Path(temp_dir) / "2025-jane-doe"
+            manager = CaseManager(case_dir)
+            manager.create_case(case_label="Jane Doe", tax_year=2025)
+            manager.intake(
+                tax_year=2025,
+                user_facts={"filingStatus": "single"},
+                document_paths=[],
+            )
+
+            with self.assertRaises(CaseConflictError) as context:
+                manager.intake(
+                    tax_year=2025,
+                    user_facts={"filingStatus": "married_filing_jointly"},
+                    document_paths=[],
+                )
+
+            issue = context.exception.issue
+            self.assertEqual(issue["status"], "needs_resolution")
+            self.assertEqual(issue["issueType"], "fact_conflict")
+            self.assertEqual(issue["field"], "filingStatus")
+            self.assertTrue((case_dir / "issues" / "open-issues.json").exists())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/skills/us-cpa/tests/test_cli.py b/skills/us-cpa/tests/test_cli.py
index db4d765..942f027 100644
--- a/skills/us-cpa/tests/test_cli.py
+++ b/skills/us-cpa/tests/test_cli.py
@@ -4,6 +4,7 @@ import json
 import os
 import subprocess
 import sys
+import tempfile
 import unittest
 from pathlib import Path
 
@@ -66,6 +67,67 @@ class UsCpaCliSmokeTests(unittest.TestCase):
         self.assertNotEqual(result.returncode, 0)
         self.assertIn("case directory", result.stderr.lower())
 
+    def test_extract_docs_can_create_case_and_register_facts(self) -> None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            case_dir = Path(temp_dir) / "2025-jane-doe"
+            facts_path = Path(temp_dir) / "facts.json"
+            facts_path.write_text(json.dumps({"filingStatus": "single"}))
+
+            result = self.run_cli(
+                "extract-docs",
+                "--tax-year",
+                "2025",
+                "--case-dir",
+                str(case_dir),
+                "--create-case",
+                "--case-label",
+                "Jane Doe",
+                "--facts-json",
+                str(facts_path),
+            )
+
+            self.assertEqual(result.returncode, 0, result.stderr)
+            payload = json.loads(result.stdout)
+            self.assertEqual(payload["status"], "accepted")
+            self.assertEqual(payload["factCount"], 1)
+            self.assertTrue((case_dir / "case-manifest.json").exists())
+
+    def test_extract_docs_stops_on_conflicts(self) -> None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            case_dir = Path(temp_dir) / "2025-jane-doe"
+            first_facts = Path(temp_dir) / "facts-1.json"
+            second_facts = Path(temp_dir) / "facts-2.json"
+            first_facts.write_text(json.dumps({"filingStatus": "single"}))
+            second_facts.write_text(json.dumps({"filingStatus": "married_filing_jointly"}))
+
+            first = self.run_cli(
+                "extract-docs",
+                "--tax-year",
+                "2025",
+                "--case-dir",
+                str(case_dir),
+                "--create-case",
+                "--case-label",
+                "Jane Doe",
+                "--facts-json",
+                str(first_facts),
+            )
+            self.assertEqual(first.returncode, 0, first.stderr)
+
+            second = self.run_cli(
+                "extract-docs",
+                "--tax-year",
+                "2025",
+                "--case-dir",
+                str(case_dir),
+                "--facts-json",
+                str(second_facts),
+            )
+            self.assertNotEqual(second.returncode, 0)
+            payload = json.loads(second.stdout)
+            self.assertEqual(payload["status"], "needs_resolution")
+            self.assertEqual(payload["issueType"], "fact_conflict")
+
 
 if __name__ == "__main__":
     unittest.main()