feat: add us-cpa question engine

This commit is contained in:
Stefano Fiorini
2026-03-15 01:17:14 -05:00
parent faff555757
commit 8f797b3a51
6 changed files with 360 additions and 2 deletions

View File

@@ -17,6 +17,7 @@ Tax logic, case workflows, rendering, and review logic are still pending.
```bash
skills/us-cpa/scripts/us-cpa question --question "What is the standard deduction?" --tax-year 2025
skills/us-cpa/scripts/us-cpa question --question "What is the standard deduction?" --tax-year 2025 --style memo --format markdown
skills/us-cpa/scripts/us-cpa prepare --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe
skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe
skills/us-cpa/scripts/us-cpa fetch-year --tax-year 2025
@@ -98,9 +99,29 @@ Behavior:
- JSON by default
- markdown available with `--format markdown`
- `question` supports:
- `--style conversation`
- `--style memo`
- `prepare`, `review`, `extract-docs`, `render-forms`, and `export-efile-ready` still emit scaffold payloads with `status: "not_implemented"`; `question` now returns `status: "answered"` with a structured analysis
- `fetch-year` emits a downloaded manifest location and source count
## Question Engine
Current `question` implementation:
- loads the cached tax-year corpus
- searches a small IRS-first topical rule set
- returns one canonical analysis object
- renders that analysis as:
- conversational output
- memo output
- marks questions outside the current topical rule set as requiring primary-law escalation
Current implemented topics:
- standard deduction
- Schedule C / sole proprietorship reporting trigger
## Scope Rules
- U.S. federal individual returns only in v1

View File

@@ -32,6 +32,7 @@ description: Use when answering U.S. federal individual tax questions, preparing
```bash
skills/us-cpa/scripts/us-cpa question --question "What is the standard deduction?" --tax-year 2025
skills/us-cpa/scripts/us-cpa question --question "What is the standard deduction?" --tax-year 2025 --style memo --format markdown
skills/us-cpa/scripts/us-cpa prepare --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe
skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe
skills/us-cpa/scripts/us-cpa extract-docs --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe --create-case --case-label "Jane Doe" --facts-json ./facts.json
@@ -49,6 +50,7 @@ skills/us-cpa/scripts/us-cpa extract-docs --tax-year 2025 --case-dir ~/tax-cases
- JSON by default
- markdown output available with `--format markdown`
- `question` supports `--style conversation|memo`
- `fetch-year` downloads the bootstrap IRS form/instruction corpus into `~/.cache/us-cpa` by default
- override the cache root with `US_CPA_CACHE_DIR` when you need an isolated run or fixture generation
- `extract-docs` creates or opens a case, registers documents, stores facts, and stops with a structured issue if facts conflict

View File

@@ -7,6 +7,7 @@ from pathlib import Path
from typing import Any
from us_cpa.cases import CaseConflictError, CaseManager
from us_cpa.questions import QuestionEngine, render_analysis, render_memo
from us_cpa.sources import TaxYearCorpus, bootstrap_irs_catalog
COMMANDS = (
@@ -64,6 +65,7 @@ def build_parser() -> argparse.ArgumentParser:
question = subparsers.add_parser("question", help="Answer a tax question.")
_add_common_arguments(question)
question.add_argument("--question", required=True)
question.add_argument("--style", choices=("conversation", "memo"), default="conversation")
prepare = subparsers.add_parser("prepare", help="Prepare a return case.")
_add_common_arguments(prepare)
@@ -104,14 +106,37 @@ def main(argv: list[str] | None = None) -> int:
args = parser.parse_args(argv)
if args.command == "question":
corpus = TaxYearCorpus()
engine = QuestionEngine(corpus=corpus)
case_facts: dict[str, Any] = {}
if args.case_dir:
manager = CaseManager(Path(args.case_dir))
if manager.facts_path.exists():
case_facts = {
key: value["value"]
for key, value in json.loads(manager.facts_path.read_text())["facts"].items()
}
analysis = engine.answer(
question=args.question,
tax_year=args.tax_year,
case_facts=case_facts,
)
payload = {
"command": "question",
"format": args.format,
"style": args.style,
"taxYear": args.tax_year,
"caseDir": args.case_dir,
"question": args.question,
"status": "not_implemented",
"status": "answered",
"analysis": analysis,
}
payload["rendered"] = (
render_memo(analysis) if args.style == "memo" else render_analysis(analysis)
)
if args.format == "markdown":
print(payload["rendered"])
return 0
return _emit(payload, args.format)
if args.command == "extract-docs":

View File

@@ -0,0 +1,172 @@
from __future__ import annotations
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from us_cpa.sources import TaxYearCorpus
# Topical rule set searched by the question engine.  Each rule carries the
# trigger phrases that fire it, the slugs of the IRS authorities that support
# it, and either a fixed answer/summary pair or a per-filing-status answer
# table plus a summary template.
TOPIC_RULES = [
    {
        "issue": "standard_deduction",
        "keywords": ("standard deduction",),
        "authority_slugs": ("i1040gi",),
        # Flat amounts keyed by normalized filing status (bootstrap values;
        # the unit tests pin these for tax year 2025).
        "answer_by_status": {
            "single": "$15,000",
            "married_filing_jointly": "$30,000",
            "head_of_household": "$22,500",
        },
        "summary_template": "{filing_status_label} filers use a {answer} standard deduction for tax year {tax_year}.",
        "confidence": "high",
    },
    {
        "issue": "schedule_c_required",
        "keywords": ("schedule c", "sole proprietor", "self-employment"),
        "authority_slugs": ("f1040sc", "i1040sc"),
        "answer": "Schedule C is generally required when a taxpayer reports sole proprietorship business income or expenses.",
        "summary": "Business income and expenses from a sole proprietorship generally belong on Schedule C.",
        "confidence": "medium",
    },
]


def _normalize_question(question: str) -> str:
    """Lower-case and trim a raw question so keyword matching is case-insensitive."""
    return question.strip().lower()


def _filing_status_label(status: str) -> str:
    """Turn a snake_case filing status (e.g. ``head_of_household``) into a title-case label."""
    return status.replace("_", " ").title()


@dataclass
class QuestionEngine:
    """Answers federal individual tax questions from the cached tax-year corpus.

    The engine matches the question against ``TOPIC_RULES`` and returns one
    canonical analysis dict; questions outside the rule set are flagged for
    primary-law escalation instead of being guessed at.
    """

    corpus: TaxYearCorpus

    def _manifest(self, tax_year: int) -> dict[str, Any]:
        """Load the downloaded corpus manifest for *tax_year*.

        Raises:
            FileNotFoundError: if ``fetch-year`` has not been run for the year.
        """
        path = self.corpus.paths_for_year(tax_year).manifest_path
        if not path.exists():
            raise FileNotFoundError(
                f"Tax year {tax_year} corpus not found at {path}. Run fetch-year first."
            )
        return json.loads(path.read_text())

    def _authorities_for(self, manifest: dict[str, Any], slugs: tuple[str, ...]) -> list[dict[str, Any]]:
        """Return citation records for the manifest sources matching *slugs*.

        Slugs absent from the manifest are silently skipped, preserving the
        requested slug order for those present.
        """
        sources = {item["slug"]: item for item in manifest["sources"]}
        found: list[dict[str, Any]] = []
        for slug in slugs:
            source = sources.get(slug)
            if source is None:
                continue
            found.append(
                {
                    "slug": source["slug"],
                    "title": source["title"],
                    "sourceClass": source["sourceClass"],
                    "url": source["url"],
                    "localPath": source["localPath"],
                    "authorityRank": source["authorityRank"],
                }
            )
        return found

    def _conclusion_for(self, rule: dict[str, Any], case_facts: dict[str, Any], tax_year: int) -> tuple[str, str]:
        """Resolve the (answer, summary) pair for a matched rule.

        The standard-deduction topic is fact-sensitive (filing status picks
        the amount, defaulting to single); other topics carry fixed text.
        """
        if rule["issue"] == "standard_deduction":
            amounts = rule["answer_by_status"]
            filing_status = case_facts.get("filingStatus", "single")
            answer = amounts.get(filing_status, amounts["single"])
            summary = rule["summary_template"].format(
                filing_status_label=_filing_status_label(filing_status),
                answer=answer,
                tax_year=tax_year,
            )
            return answer, summary
        return rule["answer"], rule["summary"]

    def answer(self, *, question: str, tax_year: int, case_facts: dict[str, Any]) -> dict[str, Any]:
        """Answer *question* for *tax_year*, using *case_facts* when relevant.

        Returns one canonical analysis dict.  When no topical rule matches,
        the dict flags ``primaryLawRequired`` with low confidence instead of
        fabricating an answer.
        """
        manifest = self._manifest(tax_year)
        normalized = _normalize_question(question)
        facts_used = [{"field": key, "value": value} for key, value in sorted(case_facts.items())]
        for rule in TOPIC_RULES:
            # A rule fires when ANY of its trigger phrases appears in the
            # question.  (Requiring every phrase — the previous behavior —
            # made multi-keyword rules such as the Schedule C topic
            # effectively unreachable.)
            if any(keyword in normalized for keyword in rule["keywords"]):
                answer, summary = self._conclusion_for(rule, case_facts, tax_year)
                return {
                    "issue": rule["issue"],
                    "taxYear": tax_year,
                    "factsUsed": facts_used,
                    "missingFacts": [],
                    "authorities": self._authorities_for(manifest, rule["authority_slugs"]),
                    "conclusion": {"answer": answer, "summary": summary},
                    "confidence": rule["confidence"],
                    "followUpQuestions": [],
                    "primaryLawRequired": False,
                }
        # No topical rule matched: escalate rather than guess.
        return {
            "issue": "requires_primary_law_escalation",
            "taxYear": tax_year,
            "factsUsed": facts_used,
            "missingFacts": [
                "Internal Revenue Code or Treasury regulation analysis is required before answering this question confidently."
            ],
            "authorities": [],
            "conclusion": {
                "answer": "Insufficient IRS-form and instruction support for a confident answer.",
                "summary": "This question needs primary-law analysis before a reliable answer can be given.",
            },
            "confidence": "low",
            "followUpQuestions": [
                "What facts drive the section-level issue?",
                "Is there an existing return position or drafted treatment to review?",
            ],
            "primaryLawRequired": True,
        }
def render_analysis(analysis: dict[str, Any]) -> str:
    """Render an analysis object as one conversational paragraph.

    The summary always leads; facts used, authorities, and open items are
    appended as short sentences only when present.
    """
    parts: list[str] = [analysis["conclusion"]["summary"]]
    facts = analysis["factsUsed"]
    if facts:
        joined_facts = ", ".join(f"{fact['field']}={fact['value']}" for fact in facts)
        parts.append(f"Facts used: {joined_facts}.")
    authorities = analysis["authorities"]
    if authorities:
        joined_titles = "; ".join(entry["title"] for entry in authorities)
        parts.append(f"Authorities: {joined_titles}.")
    open_items = analysis["missingFacts"]
    if open_items:
        parts.append(f"Open items: {' '.join(open_items)}")
    return " ".join(parts)
def render_memo(analysis: dict[str, Any]) -> str:
    """Render an analysis object as a markdown tax memo.

    Emits Issue, Facts, Authorities, Analysis, and Conclusion sections, plus
    an Open Items section only when the analysis lists missing facts.
    """
    out: list[str] = ["# Tax Memo", "", f"## Issue\n{analysis['issue']}", "", "## Facts"]
    facts = analysis["factsUsed"]
    if facts:
        out.extend(f"- {fact['field']}: {fact['value']}" for fact in facts)
    else:
        out.append("- No case-specific facts supplied.")
    out += ["", "## Authorities"]
    citations = analysis["authorities"]
    if citations:
        out.extend(f"- {citation['title']}" for citation in citations)
    else:
        out.append("- Primary-law escalation required.")
    conclusion = analysis["conclusion"]
    out += ["", "## Analysis", conclusion["summary"], "", "## Conclusion", conclusion["answer"]]
    gaps = analysis["missingFacts"]
    if gaps:
        out += ["", "## Open Items"]
        out.extend(f"- {gap}" for gap in gaps)
    return "\n".join(out)

View File

@@ -53,13 +53,40 @@ class UsCpaCliSmokeTests(unittest.TestCase):
self.assertIn(command, result.stdout)
def test_question_command_emits_json_by_default(self) -> None:
    """`question` answers from a freshly fetched corpus and emits a JSON payload."""
    # NOTE(review): removed the stale leading `result = self.run_cli(...)`
    # call left over from the pre-engine version of this test — it invoked
    # the CLI before the corpus fetch and its result was immediately
    # overwritten below.
    with tempfile.TemporaryDirectory() as temp_dir:
        env = os.environ.copy()
        env["PYTHONPATH"] = str(SRC_DIR)
        env["US_CPA_CACHE_DIR"] = temp_dir
        # The question engine requires a downloaded corpus; fetch it first.
        subprocess.run(
            [sys.executable, "-m", "us_cpa.cli", "fetch-year", "--tax-year", "2025"],
            text=True,
            capture_output=True,
            env=env,
            check=True,
        )
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "us_cpa.cli",
                "question",
                "--tax-year",
                "2025",
                "--question",
                "What is the standard deduction?",
            ],
            text=True,
            capture_output=True,
            env=env,
        )
        self.assertEqual(result.returncode, 0, result.stderr)
        payload = json.loads(result.stdout)
        self.assertEqual(payload["command"], "question")
        self.assertEqual(payload["format"], "json")
        self.assertEqual(payload["question"], "What is the standard deduction?")
        self.assertEqual(payload["status"], "answered")
        self.assertIn("analysis", payload)
def test_prepare_requires_case_dir(self) -> None:
result = self.run_cli("prepare", "--tax-year", "2025")
@@ -128,6 +155,42 @@ class UsCpaCliSmokeTests(unittest.TestCase):
self.assertEqual(payload["status"], "needs_resolution")
self.assertEqual(payload["issueType"], "fact_conflict")
def test_question_markdown_memo_mode_renders_tax_memo(self) -> None:
    """Markdown format plus memo style prints a `# Tax Memo` document to stdout."""
    with tempfile.TemporaryDirectory() as cache_root:
        cli_env = os.environ.copy()
        cli_env["PYTHONPATH"] = str(SRC_DIR)
        cli_env["US_CPA_CACHE_DIR"] = cache_root
        # Download the corpus the question engine reads from.
        subprocess.run(
            [sys.executable, "-m", "us_cpa.cli", "fetch-year", "--tax-year", "2025"],
            text=True,
            capture_output=True,
            env=cli_env,
            check=True,
        )
        question_cmd = [
            sys.executable,
            "-m",
            "us_cpa.cli",
            "question",
            "--tax-year",
            "2025",
            "--format",
            "markdown",
            "--style",
            "memo",
            "--question",
            "What is the standard deduction?",
        ]
        result = subprocess.run(
            question_cmd,
            text=True,
            capture_output=True,
            env=cli_env,
        )
        self.assertEqual(result.returncode, 0, result.stderr)
        self.assertIn("# Tax Memo", result.stdout)
        self.assertIn("## Conclusion", result.stdout)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,75 @@
from __future__ import annotations
import json
import tempfile
import unittest
from pathlib import Path
from us_cpa.questions import QuestionEngine, render_analysis, render_memo
from us_cpa.sources import TaxYearCorpus, bootstrap_irs_catalog
class QuestionEngineTests(unittest.TestCase):
    """Unit tests for QuestionEngine answers and the two renderers."""

    def build_engine(self, temp_dir: str) -> QuestionEngine:
        """Return an engine backed by a corpus downloaded into *temp_dir*."""
        corpus = TaxYearCorpus(cache_root=Path(temp_dir))

        def fake_fetch(url: str) -> bytes:
            # Deterministic offline stand-in for the real HTTP fetcher.
            return f"source for {url}".encode()

        corpus.download_catalog(2025, bootstrap_irs_catalog(2025), fetcher=fake_fetch)
        return QuestionEngine(corpus=corpus)

    def test_standard_deduction_question_returns_structured_analysis(self) -> None:
        """A matched topic yields a high-confidence structured analysis with authorities."""
        with tempfile.TemporaryDirectory() as workdir:
            engine = self.build_engine(workdir)
            result = engine.answer(
                question="What is the standard deduction for single filers?",
                tax_year=2025,
                case_facts={"filingStatus": "single"},
            )
            self.assertEqual(result["issue"], "standard_deduction")
            self.assertEqual(result["taxYear"], 2025)
            self.assertEqual(result["conclusion"]["answer"], "$15,000")
            self.assertEqual(result["confidence"], "high")
            self.assertTrue(result["authorities"])
            self.assertEqual(result["authorities"][0]["sourceClass"], "irs_instructions")

    def test_complex_question_flags_primary_law_escalation(self) -> None:
        """A question outside the topical rule set is flagged for primary-law escalation."""
        with tempfile.TemporaryDirectory() as workdir:
            engine = self.build_engine(workdir)
            result = engine.answer(
                question="Does section 469 passive activity loss limitation apply here?",
                tax_year=2025,
                case_facts={},
            )
            self.assertEqual(result["confidence"], "low")
            self.assertTrue(result["primaryLawRequired"])
            self.assertIn("Internal Revenue Code", result["missingFacts"][0])

    def test_renderers_produce_conversation_and_memo(self) -> None:
        """Both renderers accept the same canonical analysis object."""
        canonical = {
            "issue": "standard_deduction",
            "taxYear": 2025,
            "factsUsed": [{"field": "filingStatus", "value": "single"}],
            "missingFacts": [],
            "authorities": [{"title": "Instructions for Form 1040 and Schedules 1-3"}],
            "conclusion": {"answer": "$15,000", "summary": "Single filers use a $15,000 standard deduction for tax year 2025."},
            "confidence": "high",
            "followUpQuestions": [],
            "primaryLawRequired": False,
        }
        conversation = render_analysis(canonical)
        memo = render_memo(canonical)
        self.assertIn("$15,000", conversation)
        self.assertIn("Issue", memo)
        self.assertIn("Authorities", memo)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()