From b2bb07fa9053553898509eaa3d3ce037882779ee Mon Sep 17 00:00:00 2001 From: Stefano Fiorini Date: Sun, 15 Mar 2026 04:40:57 -0500 Subject: [PATCH] feat: make us-cpa questions retrieval-first --- docs/us-cpa.md | 16 +- skills/us-cpa/SKILL.md | 7 +- skills/us-cpa/src/us_cpa/cli.py | 14 ++ skills/us-cpa/src/us_cpa/questions.py | 219 ++++++++++++++++++++++++++ skills/us-cpa/src/us_cpa/sources.py | 1 + skills/us-cpa/tests/test_questions.py | 25 +++ 6 files changed, 272 insertions(+), 10 deletions(-) diff --git a/docs/us-cpa.md b/docs/us-cpa.md index 6f019d3..c931ebb 100644 --- a/docs/us-cpa.md +++ b/docs/us-cpa.md @@ -177,19 +177,17 @@ Behavior: Current `question` implementation: - loads the cached tax-year corpus -- searches a small IRS-first topical rule set -- returns one canonical analysis object +- searches the downloaded IRS corpus for relevant authorities and excerpts +- returns one canonical analysis object with: + - authorities + - excerpts + - confidence / risk + - primary-law escalation only when the IRS corpus is still insufficient - renders that analysis as: - conversational output - memo output -- marks questions outside the current topical rule set as requiring primary-law escalation -Current implemented topics: - -- standard deduction -- Schedule C / sole proprietorship reporting trigger -- Schedule D / capital gains reporting trigger -- Schedule E / rental income reporting trigger +In OpenClaw, the model should answer the user from the returned IRS excerpts when `primaryLawRequired` is `false`, rather than merely repeating the CLI summary. 
## Form Rendering diff --git a/skills/us-cpa/SKILL.md b/skills/us-cpa/SKILL.md index 8feccd7..15d288d 100644 --- a/skills/us-cpa/SKILL.md +++ b/skills/us-cpa/SKILL.md @@ -40,6 +40,11 @@ skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025- skills/us-cpa/scripts/us-cpa extract-docs --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe --create-case --case-label "Jane Doe" --facts-json ./facts.json ``` +4. For `question` mode, do not mechanically repeat the CLI fallback text. + - If the CLI returns `analysis.excerpts` with `primaryLawRequired: false`, answer the user directly from those IRS excerpts in your own words. + - Cite the specific IRS authorities returned by the CLI. + - Only tell the user the question needs deeper legal research when the CLI returns `primaryLawRequired: true` and no relevant IRS excerpts were found. + When OpenClaw is using the installed workspace copy, the entrypoint is: ```bash @@ -63,7 +68,7 @@ When OpenClaw is using the installed workspace copy, the entrypoint is: - `fetch-year` downloads the bootstrap IRS form/instruction corpus into `~/.cache/us-cpa` by default - override the cache root with `US_CPA_CACHE_DIR` when you need an isolated run or fixture generation - `extract-docs` creates or opens a case, registers documents, stores facts, extracts machine-usable facts from JSON/text/PDF sources where possible, and stops with a structured issue if facts conflict -- `question` currently has explicit IRS-first answers for standard deduction, Schedule C, Schedule D, and Schedule E questions; other questions escalate to primary-law research with official IRC/regulation URLs +- `question` now searches the downloaded IRS corpus for relevant authorities and excerpts before escalating to primary-law research - rendered form artifacts prefer fillable-field output when possible and otherwise fall back to overlay output - `prepare` computes the current supported federal 1040 package, preserves fact provenance in the 
def _ensure_question_corpus(corpus: TaxYearCorpus, tax_year: int) -> None:
    """Make sure the cached corpus for *tax_year* covers the bootstrap catalog.

    The bootstrap IRS catalog is downloaded when the year's manifest is
    missing, cannot be read or parsed, or does not list every slug in the
    catalog. When the manifest already lists every required slug the cache is
    left untouched.

    Args:
        corpus: Corpus whose ``paths_for_year`` and ``download_catalog`` are
            used to inspect and refresh the cache.
        tax_year: Tax year whose cached manifest is checked.
    """
    # Build the catalog once; it is needed both for the slug comparison and
    # for the download itself.
    catalog = bootstrap_irs_catalog(tax_year)
    required_slugs = {item.slug for item in catalog}
    manifest_path = corpus.paths_for_year(tax_year).manifest_path

    try:
        manifest = json.loads(manifest_path.read_text())
    except (OSError, ValueError):
        # Missing, unreadable, or truncated/corrupt manifest (JSONDecodeError
        # and UnicodeDecodeError are ValueError subclasses): treat the cache
        # as absent and rebuild it rather than failing the question command.
        manifest = None

    if isinstance(manifest, dict):
        sources = manifest.get("sources") or []
        existing_slugs = {
            entry.get("slug") for entry in sources if isinstance(entry, dict)
        }
        if required_slugs <= existing_slugs:
            return

    corpus.download_catalog(tax_year, catalog)
"also", + "am", + "an", + "and", + "are", + "as", + "at", + "be", + "before", + "but", + "by", + "can", + "considered", + "did", + "do", + "does", + "for", + "from", + "had", + "has", + "have", + "her", + "hers", + "his", + "i", + "if", + "in", + "is", + "it", + "its", + "my", + "of", + "or", + "our", + "she", + "should", + "that", + "the", + "their", + "them", + "they", + "this", + "to", + "was", + "we", + "went", + "what", + "worked", + "would", + "year", + "you", + "your", +} + + +SEARCH_SOURCE_BONUS = { + "irs_publication": 30, + "irs_instructions": 20, + "irs_faq": 10, + "irs_form": 0, +} + + def _normalize_question(question: str) -> str: return question.strip().lower() @@ -64,6 +132,101 @@ def _filing_status_label(status: str) -> str: return status.replace("_", " ").title() +def _question_terms(normalized_question: str) -> list[str]: + terms = [] + for token in re.findall(r"[a-z0-9]+", normalized_question): + if len(token) < 3 or token in QUESTION_STOPWORDS or token.isdigit(): + continue + terms.append(token) + + expanded = set(terms) + if any(token in expanded for token in {"dependent", "dependents", "daughter", "son", "child", "children"}): + expanded.update({"dependent", "qualifying", "child", "support", "residency"}) + if any(token in expanded for token in {"college", "school", "student", "tuition"}): + expanded.update({"student", "school", "education", "temporary", "absence"}) + + return sorted(expanded) + + +def _load_searchable_pages(path: Path) -> list[str]: + payload = path.read_bytes() + if payload.startswith(b"%PDF"): + try: + reader = PdfReader(path) + pages = [] + for page in reader.pages: + text = page.extract_text() or "" + if text.strip(): + pages.append(text) + if pages: + return pages + except Exception: + pass + + try: + decoded = payload.decode("utf-8", errors="ignore") + except Exception: + return [] + return [decoded] if decoded.strip() else [] + + +def _build_excerpt(text: str, terms: list[str], *, width: int = 420) -> str: + lowered = 
text.lower() + first_index = None + for term in terms: + idx = lowered.find(term) + if idx >= 0 and (first_index is None or idx < first_index): + first_index = idx + if first_index is None: + cleaned = " ".join(text.split()) + return cleaned[:width] + + start = max(0, first_index - 120) + end = min(len(text), first_index + width) + cleaned = " ".join(text[start:end].split()) + return cleaned + + +def _rank_research_hits(manifest: dict[str, Any], normalized_question: str) -> list[dict[str, Any]]: + terms = _question_terms(normalized_question) + if not terms: + return [] + + hits: list[dict[str, Any]] = [] + for source in manifest["sources"]: + path = Path(source["localPath"]) + if not path.exists(): + continue + pages = _load_searchable_pages(path) + for page_number, text in enumerate(pages, start=1): + lowered = text.lower() + matched_terms = [term for term in terms if term in lowered] + if not matched_terms: + continue + score = ( + len(matched_terms) * 10 + + SEARCH_SOURCE_BONUS.get(source["sourceClass"], 0) + - int(source["authorityRank"]) + ) + hits.append( + { + "slug": source["slug"], + "title": source["title"], + "sourceClass": source["sourceClass"], + "url": source["url"], + "localPath": source["localPath"], + "authorityRank": source["authorityRank"], + "page": page_number, + "score": score, + "matchedTerms": matched_terms, + "excerpt": _build_excerpt(text, matched_terms), + } + ) + + hits.sort(key=lambda item: (-item["score"], item["authorityRank"], item["slug"], item["page"])) + return hits[:5] + + FILING_STATUS_PATTERNS = ( (("qualifying surviving spouse",), "qualifying_surviving_spouse"), (("qualifying widow",), "qualifying_surviving_spouse"), @@ -151,8 +314,54 @@ class QuestionEngine: "riskLevel": RISK_BY_CONFIDENCE[rule["confidence"]], "followUpQuestions": [], "primaryLawRequired": False, + "excerpts": [], } + research_hits = _rank_research_hits(manifest, normalized) + if research_hits: + authorities = [] + seen = set() + for hit in research_hits: + 
if hit["slug"] in seen: + continue + authorities.append( + { + "slug": hit["slug"], + "title": hit["title"], + "sourceClass": hit["sourceClass"], + "url": hit["url"], + "localPath": hit["localPath"], + "authorityRank": hit["authorityRank"], + } + ) + seen.add(hit["slug"]) + + return { + "issue": "irs_corpus_research", + "taxYear": tax_year, + "factsUsed": facts_used, + "missingFacts": [], + "authorities": authorities, + "excerpts": [ + { + "slug": hit["slug"], + "title": hit["title"], + "page": hit["page"], + "matchedTerms": hit["matchedTerms"], + "excerpt": hit["excerpt"], + } + for hit in research_hits + ], + "conclusion": { + "answer": "Relevant IRS authorities were found in the downloaded tax-year corpus. Answer from those authorities directly, and only escalate further if the cited passages are still insufficient.", + "summary": "Relevant IRS materials in the cached tax-year corpus address this question. Use the cited passages below to answer it directly.", + }, + "confidence": "medium", + "riskLevel": "medium", + "followUpQuestions": [], + "primaryLawRequired": False, + } + return { "issue": "requires_primary_law_escalation", "taxYear": tax_year, @@ -172,6 +381,7 @@ class QuestionEngine: "Is there an existing return position or drafted treatment to review?", ], "primaryLawRequired": True, + "excerpts": [], } @@ -186,6 +396,11 @@ def render_analysis(analysis: dict[str, Any]) -> str: if analysis["authorities"]: titles = "; ".join(item["title"] for item in analysis["authorities"]) lines.append(f"Authorities: {titles}.") + if analysis.get("excerpts"): + excerpt_lines = [] + for item in analysis["excerpts"][:3]: + excerpt_lines.append(f"{item['title']} p.{item['page']}: {item['excerpt']}") + lines.append(f"Excerpts: {' | '.join(excerpt_lines)}") if analysis["missingFacts"]: lines.append(f"Open items: {' '.join(analysis['missingFacts'])}") return " ".join(lines) @@ -210,6 +425,10 @@ def render_memo(analysis: dict[str, Any]) -> str: lines.append(f"- 
{authority['title']}") else: lines.append("- Primary-law escalation required.") + if analysis.get("excerpts"): + lines.extend(["", "## IRS Excerpts"]) + for item in analysis["excerpts"]: + lines.append(f"- {item['title']} (page {item['page']}): {item['excerpt']}") lines.extend( [ "", diff --git a/skills/us-cpa/src/us_cpa/sources.py b/skills/us-cpa/src/us_cpa/sources.py index 852b5aa..9338fe0 100644 --- a/skills/us-cpa/src/us_cpa/sources.py +++ b/skills/us-cpa/src/us_cpa/sources.py @@ -143,6 +143,7 @@ def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]: ("i5329", "Instructions for Form 5329", "irs_instructions"), ("i5695", "Instructions for Form 5695", "irs_instructions"), ("i1116", "Instructions for Form 1116", "irs_instructions"), + ("p501", "Publication 501, Dependents, Standard Deduction, and Filing Information", "irs_publication"), ] return [ SourceDescriptor( diff --git a/skills/us-cpa/tests/test_questions.py b/skills/us-cpa/tests/test_questions.py index 6ef5f49..8172d7d 100644 --- a/skills/us-cpa/tests/test_questions.py +++ b/skills/us-cpa/tests/test_questions.py @@ -14,6 +14,12 @@ class QuestionEngineTests(unittest.TestCase): corpus = TaxYearCorpus(cache_root=Path(temp_dir)) def fake_fetch(url: str) -> bytes: + if "p501" in url: + return ( + "A qualifying child may be your dependent if the relationship, age, residency, support, and joint return tests are met. " + "Temporary absences due to education count as time lived with you. " + "To meet the support test, the child must not have provided more than half of their own support for the year." 
def test_dependency_question_uses_irs_corpus_research_before_primary_law(self) -> None:
    """A dependency question is answered from the cached Publication 501 text.

    The engine must report an ``irs_corpus_research`` result, list
    Publication 501 as the leading authority, and return an excerpt taken
    from that publication rather than escalating to primary law.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        engine = self.build_engine(temp_dir)

        analysis = engine.answer(
            question=(
                "If my daughter went to college in 2025 starting in August, but also worked before that, "
                "should she be considered as a dependent?"
            ),
            tax_year=2025,
            case_facts={},
        )

        self.assertEqual(analysis["issue"], "irs_corpus_research")
        self.assertFalse(analysis["primaryLawRequired"])
        self.assertEqual(analysis["authorities"][0]["slug"], "p501")
        # Check what the excerpt actually contains, not just that one exists.
        self.assertTrue(analysis["excerpts"])
        self.assertEqual(analysis["excerpts"][0]["slug"], "p501")
        self.assertIn("qualifying child", analysis["excerpts"][0]["excerpt"].lower())