feat: make us-cpa questions retrieval-first

This commit is contained in:
Stefano Fiorini
2026-03-15 04:40:57 -05:00
parent b4f9666560
commit b2bb07fa90
6 changed files with 272 additions and 10 deletions

View File

@@ -177,19 +177,17 @@ Behavior:
Current `question` implementation: Current `question` implementation:
- loads the cached tax-year corpus - loads the cached tax-year corpus
- searches a small IRS-first topical rule set - searches the downloaded IRS corpus for relevant authorities and excerpts
- returns one canonical analysis object - returns one canonical analysis object with:
- authorities
- excerpts
- confidence / risk
- primary-law escalation only when the IRS corpus is still insufficient
- renders that analysis as: - renders that analysis as:
- conversational output - conversational output
- memo output - memo output
- marks questions outside the current topical rule set as requiring primary-law escalation
Current implemented topics: In OpenClaw, the model should answer the user from the returned IRS excerpts when `primaryLawRequired` is `false`, rather than merely repeating the CLI summary.
- standard deduction
- Schedule C / sole proprietorship reporting trigger
- Schedule D / capital gains reporting trigger
- Schedule E / rental income reporting trigger
## Form Rendering ## Form Rendering

View File

@@ -40,6 +40,11 @@ skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025-
skills/us-cpa/scripts/us-cpa extract-docs --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe --create-case --case-label "Jane Doe" --facts-json ./facts.json skills/us-cpa/scripts/us-cpa extract-docs --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe --create-case --case-label "Jane Doe" --facts-json ./facts.json
``` ```
4. For `question` mode, do not mechanically repeat the CLI fallback text.
- If the CLI returns `analysis.excerpts` with `primaryLawRequired: false`, answer the user directly from those IRS excerpts in your own words.
- Cite the specific IRS authorities returned by the CLI.
- Only tell the user the question needs deeper legal research when the CLI returns `primaryLawRequired: true` and no relevant IRS excerpts were found.
When OpenClaw is using the installed workspace copy, the entrypoint is: When OpenClaw is using the installed workspace copy, the entrypoint is:
```bash ```bash
@@ -63,7 +68,7 @@ When OpenClaw is using the installed workspace copy, the entrypoint is:
- `fetch-year` downloads the bootstrap IRS form/instruction corpus into `~/.cache/us-cpa` by default - `fetch-year` downloads the bootstrap IRS form/instruction corpus into `~/.cache/us-cpa` by default
- override the cache root with `US_CPA_CACHE_DIR` when you need an isolated run or fixture generation - override the cache root with `US_CPA_CACHE_DIR` when you need an isolated run or fixture generation
- `extract-docs` creates or opens a case, registers documents, stores facts, extracts machine-usable facts from JSON/text/PDF sources where possible, and stops with a structured issue if facts conflict - `extract-docs` creates or opens a case, registers documents, stores facts, extracts machine-usable facts from JSON/text/PDF sources where possible, and stops with a structured issue if facts conflict
- `question` currently has explicit IRS-first answers for standard deduction, Schedule C, Schedule D, and Schedule E questions; other questions escalate to primary-law research with official IRC/regulation URLs - `question` now searches the downloaded IRS corpus for relevant authorities and excerpts before escalating to primary-law research
- rendered form artifacts prefer fillable-field output when possible and otherwise fall back to overlay output - rendered form artifacts prefer fillable-field output when possible and otherwise fall back to overlay output
- `prepare` computes the current supported federal 1040 package, preserves fact provenance in the normalized return, and writes normalized return/artifact/report files into the case directory - `prepare` computes the current supported federal 1040 package, preserves fact provenance in the normalized return, and writes normalized return/artifact/report files into the case directory
- `export-efile-ready` writes a draft transmission-ready payload without transmitting anything - `export-efile-ready` writes a draft transmission-ready payload without transmitting anything

View File

@@ -57,6 +57,19 @@ def _load_json_file(path_value: str | None) -> dict[str, Any]:
return json.loads(Path(path_value).expanduser().resolve().read_text()) return json.loads(Path(path_value).expanduser().resolve().read_text())
def _ensure_question_corpus(corpus: TaxYearCorpus, tax_year: int) -> None:
    """Ensure the cached corpus for *tax_year* contains every bootstrap IRS source.

    Downloads the full bootstrap catalog when the year's manifest is missing, or
    when any catalog slug is absent from the manifest's recorded sources.

    Args:
        corpus: Cache-backed corpus used to locate and download sources.
        tax_year: Tax year whose corpus must be present.
    """
    # Build the catalog once; the original recomputed it up to three times.
    catalog = bootstrap_irs_catalog(tax_year)
    paths = corpus.paths_for_year(tax_year)
    if not paths.manifest_path.exists():
        corpus.download_catalog(tax_year, catalog)
        return
    manifest = json.loads(paths.manifest_path.read_text())
    existing_slugs = {item["slug"] for item in manifest.get("sources", [])}
    required_slugs = {item.slug for item in catalog}
    # Re-download when any required source is missing from the cached manifest.
    if not required_slugs.issubset(existing_slugs):
        corpus.download_catalog(tax_year, catalog)
def build_parser() -> argparse.ArgumentParser: def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog="us-cpa", prog="us-cpa",
@@ -110,6 +123,7 @@ def main(argv: list[str] | None = None) -> int:
if args.command == "question": if args.command == "question":
corpus = TaxYearCorpus() corpus = TaxYearCorpus()
_ensure_question_corpus(corpus, args.tax_year)
engine = QuestionEngine(corpus=corpus) engine = QuestionEngine(corpus=corpus)
case_facts: dict[str, Any] = {} case_facts: dict[str, Any] = {}
if args.case_dir: if args.case_dir:

View File

@@ -1,10 +1,13 @@
from __future__ import annotations from __future__ import annotations
import json import json
import re
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from pypdf import PdfReader
from us_cpa.sources import TaxYearCorpus, build_primary_law_authorities from us_cpa.sources import TaxYearCorpus, build_primary_law_authorities
@@ -56,6 +59,71 @@ RISK_BY_CONFIDENCE = {
} }
# Tokens ignored when extracting search terms from a user question.
QUESTION_STOPWORDS = set(
    "a also am an and are as at be before but by can considered did do does "
    "for from had has have her hers his i if in is it its my of or our she "
    "should that the their them they this to was we went what worked would "
    "year you your".split()
)

# Per-source-class scoring bonus used when ranking corpus search hits:
# publications outrank instructions, which outrank FAQs and bare forms.
SEARCH_SOURCE_BONUS = {
    "irs_publication": 30,
    "irs_instructions": 20,
    "irs_faq": 10,
    "irs_form": 0,
}
def _normalize_question(question: str) -> str: def _normalize_question(question: str) -> str:
return question.strip().lower() return question.strip().lower()
@@ -64,6 +132,101 @@ def _filing_status_label(status: str) -> str:
return status.replace("_", " ").title() return status.replace("_", " ").title()
def _question_terms(normalized_question: str) -> list[str]:
    """Extract and expand search terms from a normalized (lowercased) question.

    Keeps alphanumeric tokens of three or more characters that are neither
    stopwords nor pure digits, then augments them with small synonym clusters
    for dependency and education questions so corpus matching is less literal.
    """
    tokens = re.findall(r"[a-z0-9]+", normalized_question)
    expanded = {
        token
        for token in tokens
        if len(token) >= 3 and token not in QUESTION_STOPWORDS and not token.isdigit()
    }
    dependency_cues = {"dependent", "dependents", "daughter", "son", "child", "children"}
    if not expanded.isdisjoint(dependency_cues):
        expanded.update({"dependent", "qualifying", "child", "support", "residency"})
    # Note: this check runs after the dependency expansion, matching the
    # original's sequential `any(... in expanded ...)` semantics.
    education_cues = {"college", "school", "student", "tuition"}
    if not expanded.isdisjoint(education_cues):
        expanded.update({"student", "school", "education", "temporary", "absence"})
    return sorted(expanded)
def _load_searchable_pages(path: Path) -> list[str]:
payload = path.read_bytes()
if payload.startswith(b"%PDF"):
try:
reader = PdfReader(path)
pages = []
for page in reader.pages:
text = page.extract_text() or ""
if text.strip():
pages.append(text)
if pages:
return pages
except Exception:
pass
try:
decoded = payload.decode("utf-8", errors="ignore")
except Exception:
return []
return [decoded] if decoded.strip() else []
def _build_excerpt(text: str, terms: list[str], *, width: int = 420) -> str:
lowered = text.lower()
first_index = None
for term in terms:
idx = lowered.find(term)
if idx >= 0 and (first_index is None or idx < first_index):
first_index = idx
if first_index is None:
cleaned = " ".join(text.split())
return cleaned[:width]
start = max(0, first_index - 120)
end = min(len(text), first_index + width)
cleaned = " ".join(text[start:end].split())
return cleaned
def _rank_research_hits(manifest: dict[str, Any], normalized_question: str) -> list[dict[str, Any]]:
    """Search every downloaded source for question terms and rank page hits.

    Pages earn 10 points per matched term plus a source-class bonus, minus the
    source's authority rank; ties break on authority rank, slug, then page.
    Returns at most the top five hits, each with a ready-made excerpt.
    """
    terms = _question_terms(normalized_question)
    if not terms:
        return []
    hits: list[dict[str, Any]] = []
    for source in manifest["sources"]:
        local_path = Path(source["localPath"])
        if not local_path.exists():
            continue
        for page_number, page_text in enumerate(_load_searchable_pages(local_path), start=1):
            haystack = page_text.lower()
            matched = [term for term in terms if term in haystack]
            if not matched:
                continue
            class_bonus = SEARCH_SOURCE_BONUS.get(source["sourceClass"], 0)
            hits.append(
                {
                    "slug": source["slug"],
                    "title": source["title"],
                    "sourceClass": source["sourceClass"],
                    "url": source["url"],
                    "localPath": source["localPath"],
                    "authorityRank": source["authorityRank"],
                    "page": page_number,
                    "score": 10 * len(matched) + class_bonus - int(source["authorityRank"]),
                    "matchedTerms": matched,
                    "excerpt": _build_excerpt(page_text, matched),
                }
            )
    hits.sort(key=lambda hit: (-hit["score"], hit["authorityRank"], hit["slug"], hit["page"]))
    return hits[:5]
FILING_STATUS_PATTERNS = ( FILING_STATUS_PATTERNS = (
(("qualifying surviving spouse",), "qualifying_surviving_spouse"), (("qualifying surviving spouse",), "qualifying_surviving_spouse"),
(("qualifying widow",), "qualifying_surviving_spouse"), (("qualifying widow",), "qualifying_surviving_spouse"),
@@ -151,6 +314,52 @@ class QuestionEngine:
"riskLevel": RISK_BY_CONFIDENCE[rule["confidence"]], "riskLevel": RISK_BY_CONFIDENCE[rule["confidence"]],
"followUpQuestions": [], "followUpQuestions": [],
"primaryLawRequired": False, "primaryLawRequired": False,
"excerpts": [],
}
research_hits = _rank_research_hits(manifest, normalized)
if research_hits:
authorities = []
seen = set()
for hit in research_hits:
if hit["slug"] in seen:
continue
authorities.append(
{
"slug": hit["slug"],
"title": hit["title"],
"sourceClass": hit["sourceClass"],
"url": hit["url"],
"localPath": hit["localPath"],
"authorityRank": hit["authorityRank"],
}
)
seen.add(hit["slug"])
return {
"issue": "irs_corpus_research",
"taxYear": tax_year,
"factsUsed": facts_used,
"missingFacts": [],
"authorities": authorities,
"excerpts": [
{
"slug": hit["slug"],
"title": hit["title"],
"page": hit["page"],
"matchedTerms": hit["matchedTerms"],
"excerpt": hit["excerpt"],
}
for hit in research_hits
],
"conclusion": {
"answer": "Relevant IRS authorities were found in the downloaded tax-year corpus. Answer from those authorities directly, and only escalate further if the cited passages are still insufficient.",
"summary": "Relevant IRS materials in the cached tax-year corpus address this question. Use the cited passages below to answer it directly.",
},
"confidence": "medium",
"riskLevel": "medium",
"followUpQuestions": [],
"primaryLawRequired": False,
} }
return { return {
@@ -172,6 +381,7 @@ class QuestionEngine:
"Is there an existing return position or drafted treatment to review?", "Is there an existing return position or drafted treatment to review?",
], ],
"primaryLawRequired": True, "primaryLawRequired": True,
"excerpts": [],
} }
@@ -186,6 +396,11 @@ def render_analysis(analysis: dict[str, Any]) -> str:
if analysis["authorities"]: if analysis["authorities"]:
titles = "; ".join(item["title"] for item in analysis["authorities"]) titles = "; ".join(item["title"] for item in analysis["authorities"])
lines.append(f"Authorities: {titles}.") lines.append(f"Authorities: {titles}.")
if analysis.get("excerpts"):
excerpt_lines = []
for item in analysis["excerpts"][:3]:
excerpt_lines.append(f"{item['title']} p.{item['page']}: {item['excerpt']}")
lines.append(f"Excerpts: {' | '.join(excerpt_lines)}")
if analysis["missingFacts"]: if analysis["missingFacts"]:
lines.append(f"Open items: {' '.join(analysis['missingFacts'])}") lines.append(f"Open items: {' '.join(analysis['missingFacts'])}")
return " ".join(lines) return " ".join(lines)
@@ -210,6 +425,10 @@ def render_memo(analysis: dict[str, Any]) -> str:
lines.append(f"- {authority['title']}") lines.append(f"- {authority['title']}")
else: else:
lines.append("- Primary-law escalation required.") lines.append("- Primary-law escalation required.")
if analysis.get("excerpts"):
lines.extend(["", "## IRS Excerpts"])
for item in analysis["excerpts"]:
lines.append(f"- {item['title']} (page {item['page']}): {item['excerpt']}")
lines.extend( lines.extend(
[ [
"", "",

View File

@@ -143,6 +143,7 @@ def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]:
("i5329", "Instructions for Form 5329", "irs_instructions"), ("i5329", "Instructions for Form 5329", "irs_instructions"),
("i5695", "Instructions for Form 5695", "irs_instructions"), ("i5695", "Instructions for Form 5695", "irs_instructions"),
("i1116", "Instructions for Form 1116", "irs_instructions"), ("i1116", "Instructions for Form 1116", "irs_instructions"),
("p501", "Publication 501, Dependents, Standard Deduction, and Filing Information", "irs_publication"),
] ]
return [ return [
SourceDescriptor( SourceDescriptor(

View File

@@ -14,6 +14,12 @@ class QuestionEngineTests(unittest.TestCase):
corpus = TaxYearCorpus(cache_root=Path(temp_dir)) corpus = TaxYearCorpus(cache_root=Path(temp_dir))
def fake_fetch(url: str) -> bytes: def fake_fetch(url: str) -> bytes:
if "p501" in url:
return (
"A qualifying child may be your dependent if the relationship, age, residency, support, and joint return tests are met. "
"Temporary absences due to education count as time lived with you. "
"To meet the support test, the child must not have provided more than half of their own support for the year."
).encode()
return f"source for {url}".encode() return f"source for {url}".encode()
corpus.download_catalog(2025, bootstrap_irs_catalog(2025), fetcher=fake_fetch) corpus.download_catalog(2025, bootstrap_irs_catalog(2025), fetcher=fake_fetch)
@@ -93,6 +99,25 @@ class QuestionEngineTests(unittest.TestCase):
self.assertEqual(analysis["conclusion"]["answer"], "$31,500") self.assertEqual(analysis["conclusion"]["answer"], "$31,500")
self.assertIn("Qualifying Surviving Spouse", analysis["conclusion"]["summary"]) self.assertIn("Qualifying Surviving Spouse", analysis["conclusion"]["summary"])
def test_dependency_question_uses_irs_corpus_research_before_primary_law(self) -> None:
    """A dependency question should be answered from the IRS corpus, not escalated."""
    question_text = (
        "If my daughter went to college in 2025 starting in August, but also worked before that, "
        "should she be considered as a dependent?"
    )
    with tempfile.TemporaryDirectory() as temp_dir:
        engine = self.build_engine(temp_dir)
        analysis = engine.answer(question=question_text, tax_year=2025, case_facts={})
    self.assertEqual(analysis["issue"], "irs_corpus_research")
    self.assertFalse(analysis["primaryLawRequired"])
    self.assertEqual(analysis["authorities"][0]["slug"], "p501")
    self.assertTrue(any(item["slug"] == "p501" for item in analysis["authorities"]))
    self.assertTrue(analysis["excerpts"])
def test_complex_question_flags_primary_law_escalation(self) -> None: def test_complex_question_flags_primary_law_escalation(self) -> None:
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
engine = self.build_engine(temp_dir) engine = self.build_engine(temp_dir)