feat: make us-cpa questions retrieval-first
This commit is contained in:
@@ -177,19 +177,17 @@ Behavior:
|
|||||||
Current `question` implementation:
|
Current `question` implementation:
|
||||||
|
|
||||||
- loads the cached tax-year corpus
|
- loads the cached tax-year corpus
|
||||||
- searches a small IRS-first topical rule set
|
- searches the downloaded IRS corpus for relevant authorities and excerpts
|
||||||
- returns one canonical analysis object
|
- returns one canonical analysis object with:
|
||||||
|
- authorities
|
||||||
|
- excerpts
|
||||||
|
- confidence / risk
|
||||||
|
- primary-law escalation only when the IRS corpus is still insufficient
|
||||||
- renders that analysis as:
|
- renders that analysis as:
|
||||||
- conversational output
|
- conversational output
|
||||||
- memo output
|
- memo output
|
||||||
- marks questions outside the current topical rule set as requiring primary-law escalation
|
|
||||||
|
|
||||||
Current implemented topics:
|
In OpenClaw, the model should answer the user from the returned IRS excerpts when `primaryLawRequired` is `false`, rather than merely repeating the CLI summary.
|
||||||
|
|
||||||
- standard deduction
|
|
||||||
- Schedule C / sole proprietorship reporting trigger
|
|
||||||
- Schedule D / capital gains reporting trigger
|
|
||||||
- Schedule E / rental income reporting trigger
|
|
||||||
|
|
||||||
## Form Rendering
|
## Form Rendering
|
||||||
|
|
||||||
|
|||||||
@@ -40,6 +40,11 @@ skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025-
|
|||||||
skills/us-cpa/scripts/us-cpa extract-docs --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe --create-case --case-label "Jane Doe" --facts-json ./facts.json
|
skills/us-cpa/scripts/us-cpa extract-docs --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe --create-case --case-label "Jane Doe" --facts-json ./facts.json
|
||||||
```
|
```
|
||||||
|
|
||||||
|
4. For `question` mode, do not mechanically repeat the CLI fallback text.
|
||||||
|
- If the CLI returns `analysis.excerpts` with `primaryLawRequired: false`, answer the user directly from those IRS excerpts in your own words.
|
||||||
|
- Cite the specific IRS authorities returned by the CLI.
|
||||||
|
- Only tell the user the question needs deeper legal research when the CLI returns `primaryLawRequired: true` and no relevant IRS excerpts were found.
|
||||||
|
|
||||||
When OpenClaw is using the installed workspace copy, the entrypoint is:
|
When OpenClaw is using the installed workspace copy, the entrypoint is:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -63,7 +68,7 @@ When OpenClaw is using the installed workspace copy, the entrypoint is:
|
|||||||
- `fetch-year` downloads the bootstrap IRS form/instruction corpus into `~/.cache/us-cpa` by default
|
- `fetch-year` downloads the bootstrap IRS form/instruction corpus into `~/.cache/us-cpa` by default
|
||||||
- override the cache root with `US_CPA_CACHE_DIR` when you need an isolated run or fixture generation
|
- override the cache root with `US_CPA_CACHE_DIR` when you need an isolated run or fixture generation
|
||||||
- `extract-docs` creates or opens a case, registers documents, stores facts, extracts machine-usable facts from JSON/text/PDF sources where possible, and stops with a structured issue if facts conflict
|
- `extract-docs` creates or opens a case, registers documents, stores facts, extracts machine-usable facts from JSON/text/PDF sources where possible, and stops with a structured issue if facts conflict
|
||||||
- `question` currently has explicit IRS-first answers for standard deduction, Schedule C, Schedule D, and Schedule E questions; other questions escalate to primary-law research with official IRC/regulation URLs
|
- `question` now searches the downloaded IRS corpus for relevant authorities and excerpts before escalating to primary-law research
|
||||||
- rendered form artifacts prefer fillable-field output when possible and otherwise fall back to overlay output
|
- rendered form artifacts prefer fillable-field output when possible and otherwise fall back to overlay output
|
||||||
- `prepare` computes the current supported federal 1040 package, preserves fact provenance in the normalized return, and writes normalized return/artifact/report files into the case directory
|
- `prepare` computes the current supported federal 1040 package, preserves fact provenance in the normalized return, and writes normalized return/artifact/report files into the case directory
|
||||||
- `export-efile-ready` writes a draft transmission-ready payload without transmitting anything
|
- `export-efile-ready` writes a draft transmission-ready payload without transmitting anything
|
||||||
|
|||||||
@@ -57,6 +57,19 @@ def _load_json_file(path_value: str | None) -> dict[str, Any]:
|
|||||||
return json.loads(Path(path_value).expanduser().resolve().read_text())
|
return json.loads(Path(path_value).expanduser().resolve().read_text())
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_question_corpus(corpus: TaxYearCorpus, tax_year: int) -> None:
    """Ensure the cached IRS corpus for *tax_year* covers the bootstrap catalog.

    Downloads the full bootstrap catalog when the year has no manifest yet, or
    when the existing manifest is missing any required source slug (e.g. after
    the catalog gained a new entry).

    Args:
        corpus: Cache-backed corpus used to locate and download sources.
        tax_year: Tax year whose corpus must be present.
    """
    # Build the catalog once; the original constructed it up to three times.
    catalog = bootstrap_irs_catalog(tax_year)
    paths = corpus.paths_for_year(tax_year)
    if not paths.manifest_path.exists():
        corpus.download_catalog(tax_year, catalog)
        return

    manifest = json.loads(paths.manifest_path.read_text())
    existing_slugs = {item["slug"] for item in manifest.get("sources", [])}
    required_slugs = {item.slug for item in catalog}
    # Re-download when any required source is absent; download_catalog is
    # assumed idempotent for already-cached entries (it is called the same
    # way in the manifest-missing branch).
    if not required_slugs.issubset(existing_slugs):
        corpus.download_catalog(tax_year, catalog)
|
||||||
|
|
||||||
|
|
||||||
def build_parser() -> argparse.ArgumentParser:
|
def build_parser() -> argparse.ArgumentParser:
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
prog="us-cpa",
|
prog="us-cpa",
|
||||||
@@ -110,6 +123,7 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
|
|
||||||
if args.command == "question":
|
if args.command == "question":
|
||||||
corpus = TaxYearCorpus()
|
corpus = TaxYearCorpus()
|
||||||
|
_ensure_question_corpus(corpus, args.tax_year)
|
||||||
engine = QuestionEngine(corpus=corpus)
|
engine = QuestionEngine(corpus=corpus)
|
||||||
case_facts: dict[str, Any] = {}
|
case_facts: dict[str, Any] = {}
|
||||||
if args.case_dir:
|
if args.case_dir:
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
from us_cpa.sources import TaxYearCorpus, build_primary_law_authorities
|
from us_cpa.sources import TaxYearCorpus, build_primary_law_authorities
|
||||||
|
|
||||||
|
|
||||||
@@ -56,6 +59,71 @@ RISK_BY_CONFIDENCE = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Tokens dropped from user questions before corpus search: common English
# function words plus conversational filler seen in tax questions ("went",
# "worked", "considered", "year") that carries no topical signal when
# matching IRS materials.
QUESTION_STOPWORDS = {
    "a",
    "also",
    "am",
    "an",
    "and",
    "are",
    "as",
    "at",
    "be",
    "before",
    "but",
    "by",
    "can",
    "considered",
    "did",
    "do",
    "does",
    "for",
    "from",
    "had",
    "has",
    "have",
    "her",
    "hers",
    "his",
    "i",
    "if",
    "in",
    "is",
    "it",
    "its",
    "my",
    "of",
    "or",
    "our",
    "she",
    "should",
    "that",
    "the",
    "their",
    "them",
    "they",
    "this",
    "to",
    "was",
    "we",
    "went",
    "what",
    "worked",
    "would",
    "year",
    "you",
    "your",
}
|
||||||
|
|
||||||
|
|
||||||
|
# Additive score bonus by source class when ranking corpus search hits.
# Publications are the most explanatory and rank highest, then form
# instructions, then FAQs; bare forms get no bonus. Unknown classes
# default to 0 via .get() at the call site.
SEARCH_SOURCE_BONUS = {
    "irs_publication": 30,
    "irs_instructions": 20,
    "irs_faq": 10,
    "irs_form": 0,
}
|
||||||
|
|
||||||
|
|
||||||
def _normalize_question(question: str) -> str:
|
def _normalize_question(question: str) -> str:
|
||||||
return question.strip().lower()
|
return question.strip().lower()
|
||||||
|
|
||||||
@@ -64,6 +132,101 @@ def _filing_status_label(status: str) -> str:
|
|||||||
return status.replace("_", " ").title()
|
return status.replace("_", " ").title()
|
||||||
|
|
||||||
|
|
||||||
|
def _question_terms(normalized_question: str) -> list[str]:
    """Tokenize a normalized question into sorted, de-duplicated search terms.

    Drops stopwords, tokens shorter than three characters, and pure numbers,
    then expands dependency- and education-related vocabulary so corpus
    search also matches the IRS phrasing for those topics.
    """
    terms = {
        token
        for token in re.findall(r"[a-z0-9]+", normalized_question)
        if len(token) >= 3 and token not in QUESTION_STOPWORDS and not token.isdigit()
    }

    dependency_triggers = {"dependent", "dependents", "daughter", "son", "child", "children"}
    if terms & dependency_triggers:
        terms |= {"dependent", "qualifying", "child", "support", "residency"}

    education_triggers = {"college", "school", "student", "tuition"}
    if terms & education_triggers:
        terms |= {"student", "school", "education", "temporary", "absence"}

    return sorted(terms)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_searchable_pages(path: Path) -> list[str]:
|
||||||
|
payload = path.read_bytes()
|
||||||
|
if payload.startswith(b"%PDF"):
|
||||||
|
try:
|
||||||
|
reader = PdfReader(path)
|
||||||
|
pages = []
|
||||||
|
for page in reader.pages:
|
||||||
|
text = page.extract_text() or ""
|
||||||
|
if text.strip():
|
||||||
|
pages.append(text)
|
||||||
|
if pages:
|
||||||
|
return pages
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
decoded = payload.decode("utf-8", errors="ignore")
|
||||||
|
except Exception:
|
||||||
|
return []
|
||||||
|
return [decoded] if decoded.strip() else []
|
||||||
|
|
||||||
|
|
||||||
|
def _build_excerpt(text: str, terms: list[str], *, width: int = 420) -> str:
|
||||||
|
lowered = text.lower()
|
||||||
|
first_index = None
|
||||||
|
for term in terms:
|
||||||
|
idx = lowered.find(term)
|
||||||
|
if idx >= 0 and (first_index is None or idx < first_index):
|
||||||
|
first_index = idx
|
||||||
|
if first_index is None:
|
||||||
|
cleaned = " ".join(text.split())
|
||||||
|
return cleaned[:width]
|
||||||
|
|
||||||
|
start = max(0, first_index - 120)
|
||||||
|
end = min(len(text), first_index + width)
|
||||||
|
cleaned = " ".join(text[start:end].split())
|
||||||
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def _rank_research_hits(manifest: dict[str, Any], normalized_question: str) -> list[dict[str, Any]]:
    """Score cached corpus pages against the question; return the top 5 hits.

    Scoring favors pages that match more distinct question terms, boosts
    authoritative source classes via SEARCH_SOURCE_BONUS, and subtracts the
    source's authority rank. Ties break by authority rank, slug, then page
    number so output is deterministic.
    """
    terms = _question_terms(normalized_question)
    if not terms:
        return []

    ranked: list[dict[str, Any]] = []
    for source in manifest["sources"]:
        local_file = Path(source["localPath"])
        if not local_file.exists():
            continue
        for page_number, page_text in enumerate(_load_searchable_pages(local_file), start=1):
            haystack = page_text.lower()
            matched = [term for term in terms if term in haystack]
            if not matched:
                continue
            page_score = (
                10 * len(matched)
                + SEARCH_SOURCE_BONUS.get(source["sourceClass"], 0)
                - int(source["authorityRank"])
            )
            ranked.append(
                {
                    "slug": source["slug"],
                    "title": source["title"],
                    "sourceClass": source["sourceClass"],
                    "url": source["url"],
                    "localPath": source["localPath"],
                    "authorityRank": source["authorityRank"],
                    "page": page_number,
                    "score": page_score,
                    "matchedTerms": matched,
                    "excerpt": _build_excerpt(page_text, matched),
                }
            )

    ranked.sort(key=lambda hit: (-hit["score"], hit["authorityRank"], hit["slug"], hit["page"]))
    return ranked[:5]
|
||||||
|
|
||||||
|
|
||||||
FILING_STATUS_PATTERNS = (
|
FILING_STATUS_PATTERNS = (
|
||||||
(("qualifying surviving spouse",), "qualifying_surviving_spouse"),
|
(("qualifying surviving spouse",), "qualifying_surviving_spouse"),
|
||||||
(("qualifying widow",), "qualifying_surviving_spouse"),
|
(("qualifying widow",), "qualifying_surviving_spouse"),
|
||||||
@@ -151,8 +314,54 @@ class QuestionEngine:
|
|||||||
"riskLevel": RISK_BY_CONFIDENCE[rule["confidence"]],
|
"riskLevel": RISK_BY_CONFIDENCE[rule["confidence"]],
|
||||||
"followUpQuestions": [],
|
"followUpQuestions": [],
|
||||||
"primaryLawRequired": False,
|
"primaryLawRequired": False,
|
||||||
|
"excerpts": [],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
research_hits = _rank_research_hits(manifest, normalized)
|
||||||
|
if research_hits:
|
||||||
|
authorities = []
|
||||||
|
seen = set()
|
||||||
|
for hit in research_hits:
|
||||||
|
if hit["slug"] in seen:
|
||||||
|
continue
|
||||||
|
authorities.append(
|
||||||
|
{
|
||||||
|
"slug": hit["slug"],
|
||||||
|
"title": hit["title"],
|
||||||
|
"sourceClass": hit["sourceClass"],
|
||||||
|
"url": hit["url"],
|
||||||
|
"localPath": hit["localPath"],
|
||||||
|
"authorityRank": hit["authorityRank"],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
seen.add(hit["slug"])
|
||||||
|
|
||||||
|
return {
|
||||||
|
"issue": "irs_corpus_research",
|
||||||
|
"taxYear": tax_year,
|
||||||
|
"factsUsed": facts_used,
|
||||||
|
"missingFacts": [],
|
||||||
|
"authorities": authorities,
|
||||||
|
"excerpts": [
|
||||||
|
{
|
||||||
|
"slug": hit["slug"],
|
||||||
|
"title": hit["title"],
|
||||||
|
"page": hit["page"],
|
||||||
|
"matchedTerms": hit["matchedTerms"],
|
||||||
|
"excerpt": hit["excerpt"],
|
||||||
|
}
|
||||||
|
for hit in research_hits
|
||||||
|
],
|
||||||
|
"conclusion": {
|
||||||
|
"answer": "Relevant IRS authorities were found in the downloaded tax-year corpus. Answer from those authorities directly, and only escalate further if the cited passages are still insufficient.",
|
||||||
|
"summary": "Relevant IRS materials in the cached tax-year corpus address this question. Use the cited passages below to answer it directly.",
|
||||||
|
},
|
||||||
|
"confidence": "medium",
|
||||||
|
"riskLevel": "medium",
|
||||||
|
"followUpQuestions": [],
|
||||||
|
"primaryLawRequired": False,
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"issue": "requires_primary_law_escalation",
|
"issue": "requires_primary_law_escalation",
|
||||||
"taxYear": tax_year,
|
"taxYear": tax_year,
|
||||||
@@ -172,6 +381,7 @@ class QuestionEngine:
|
|||||||
"Is there an existing return position or drafted treatment to review?",
|
"Is there an existing return position or drafted treatment to review?",
|
||||||
],
|
],
|
||||||
"primaryLawRequired": True,
|
"primaryLawRequired": True,
|
||||||
|
"excerpts": [],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -186,6 +396,11 @@ def render_analysis(analysis: dict[str, Any]) -> str:
|
|||||||
if analysis["authorities"]:
|
if analysis["authorities"]:
|
||||||
titles = "; ".join(item["title"] for item in analysis["authorities"])
|
titles = "; ".join(item["title"] for item in analysis["authorities"])
|
||||||
lines.append(f"Authorities: {titles}.")
|
lines.append(f"Authorities: {titles}.")
|
||||||
|
if analysis.get("excerpts"):
|
||||||
|
excerpt_lines = []
|
||||||
|
for item in analysis["excerpts"][:3]:
|
||||||
|
excerpt_lines.append(f"{item['title']} p.{item['page']}: {item['excerpt']}")
|
||||||
|
lines.append(f"Excerpts: {' | '.join(excerpt_lines)}")
|
||||||
if analysis["missingFacts"]:
|
if analysis["missingFacts"]:
|
||||||
lines.append(f"Open items: {' '.join(analysis['missingFacts'])}")
|
lines.append(f"Open items: {' '.join(analysis['missingFacts'])}")
|
||||||
return " ".join(lines)
|
return " ".join(lines)
|
||||||
@@ -210,6 +425,10 @@ def render_memo(analysis: dict[str, Any]) -> str:
|
|||||||
lines.append(f"- {authority['title']}")
|
lines.append(f"- {authority['title']}")
|
||||||
else:
|
else:
|
||||||
lines.append("- Primary-law escalation required.")
|
lines.append("- Primary-law escalation required.")
|
||||||
|
if analysis.get("excerpts"):
|
||||||
|
lines.extend(["", "## IRS Excerpts"])
|
||||||
|
for item in analysis["excerpts"]:
|
||||||
|
lines.append(f"- {item['title']} (page {item['page']}): {item['excerpt']}")
|
||||||
lines.extend(
|
lines.extend(
|
||||||
[
|
[
|
||||||
"",
|
"",
|
||||||
|
|||||||
@@ -143,6 +143,7 @@ def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]:
|
|||||||
("i5329", "Instructions for Form 5329", "irs_instructions"),
|
("i5329", "Instructions for Form 5329", "irs_instructions"),
|
||||||
("i5695", "Instructions for Form 5695", "irs_instructions"),
|
("i5695", "Instructions for Form 5695", "irs_instructions"),
|
||||||
("i1116", "Instructions for Form 1116", "irs_instructions"),
|
("i1116", "Instructions for Form 1116", "irs_instructions"),
|
||||||
|
("p501", "Publication 501, Dependents, Standard Deduction, and Filing Information", "irs_publication"),
|
||||||
]
|
]
|
||||||
return [
|
return [
|
||||||
SourceDescriptor(
|
SourceDescriptor(
|
||||||
|
|||||||
@@ -14,6 +14,12 @@ class QuestionEngineTests(unittest.TestCase):
|
|||||||
corpus = TaxYearCorpus(cache_root=Path(temp_dir))
|
corpus = TaxYearCorpus(cache_root=Path(temp_dir))
|
||||||
|
|
||||||
def fake_fetch(url: str) -> bytes:
|
def fake_fetch(url: str) -> bytes:
|
||||||
|
if "p501" in url:
|
||||||
|
return (
|
||||||
|
"A qualifying child may be your dependent if the relationship, age, residency, support, and joint return tests are met. "
|
||||||
|
"Temporary absences due to education count as time lived with you. "
|
||||||
|
"To meet the support test, the child must not have provided more than half of their own support for the year."
|
||||||
|
).encode()
|
||||||
return f"source for {url}".encode()
|
return f"source for {url}".encode()
|
||||||
|
|
||||||
corpus.download_catalog(2025, bootstrap_irs_catalog(2025), fetcher=fake_fetch)
|
corpus.download_catalog(2025, bootstrap_irs_catalog(2025), fetcher=fake_fetch)
|
||||||
@@ -93,6 +99,25 @@ class QuestionEngineTests(unittest.TestCase):
|
|||||||
self.assertEqual(analysis["conclusion"]["answer"], "$31,500")
|
self.assertEqual(analysis["conclusion"]["answer"], "$31,500")
|
||||||
self.assertIn("Qualifying Surviving Spouse", analysis["conclusion"]["summary"])
|
self.assertIn("Qualifying Surviving Spouse", analysis["conclusion"]["summary"])
|
||||||
|
|
||||||
|
def test_dependency_question_uses_irs_corpus_research_before_primary_law(self) -> None:
    """A dependency question is answered from the cached IRS corpus (Pub 501)."""
    with tempfile.TemporaryDirectory() as temp_dir:
        engine = self.build_engine(temp_dir)

        analysis = engine.answer(
            question=(
                "If my daughter went to college in 2025 starting in August, but also worked before that, "
                "should she be considered as a dependent?"
            ),
            tax_year=2025,
            case_facts={},
        )

        self.assertEqual(analysis["issue"], "irs_corpus_research")
        self.assertFalse(analysis["primaryLawRequired"])
        # Asserting the top-ranked authority positionally also covers membership,
        # so the former any(...)-based check was redundant and has been removed.
        self.assertEqual(analysis["authorities"][0]["slug"], "p501")
        self.assertTrue(analysis["excerpts"])
|
||||||
|
|
||||||
def test_complex_question_flags_primary_law_escalation(self) -> None:
|
def test_complex_question_flags_primary_law_escalation(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
engine = self.build_engine(temp_dir)
|
engine = self.build_engine(temp_dir)
|
||||||
|
|||||||
Reference in New Issue
Block a user