from __future__ import annotations

import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from pypdf import PdfReader

from us_cpa.sources import TaxYearCorpus, build_primary_law_authorities

# Question answering over a cached tax-year corpus.
#
# ``QuestionEngine.answer`` resolves a question in three stages:
#   1. canned topic rules (``TOPIC_RULES``),
#   2. keyword search over the locally cached sources listed in the manifest,
#   3. a fixed "primary-law escalation" result when neither stage matches.
# ``render_analysis`` / ``render_memo`` turn the resulting dict into text.

# Canned answers. A rule fires only when *every* entry of ``keywords`` occurs
# as a substring of the normalized question (see ``QuestionEngine.answer``).
# NOTE(review): for the multi-keyword rules this is a very narrow match (for
# example all of "schedule c", "sole proprietor" and "self-employment" must
# appear) -- confirm that "all" rather than "any" is the intended semantics.
# NOTE(review): the standard-deduction amounts are not keyed by tax year even
# though the summary interpolates ``tax_year``; they look like the
# tax-year-2025 figures -- confirm before serving other years.
TOPIC_RULES = [
    {
        "issue": "standard_deduction",
        "keywords": ("standard deduction",),
        "authority_slugs": ("i1040gi",),
        # Statuses without an entry here fall back to the "single" amount
        # in ``QuestionEngine.answer``.
        "answer_by_status": {
            "single": "$15,750",
            "married_filing_jointly": "$31,500",
            "qualifying_surviving_spouse": "$31,500",
            "head_of_household": "$23,625",
        },
        "summary_template": "{filing_status_label} filers use a {answer} standard deduction for tax year {tax_year}.",
        "confidence": "high",
    },
    {
        "issue": "schedule_c_required",
        "keywords": ("schedule c", "sole proprietor", "self-employment"),
        "authority_slugs": ("f1040sc", "i1040sc"),
        "answer": "Schedule C is generally required when a taxpayer reports sole proprietorship business income or expenses.",
        "summary": "Business income and expenses from a sole proprietorship generally belong on Schedule C.",
        "confidence": "medium",
    },
    {
        "issue": "schedule_d_required",
        "keywords": ("schedule d", "capital gains"),
        "authority_slugs": ("f1040sd", "i1040sd", "f8949", "i8949"),
        "answer": "Schedule D is generally required when a taxpayer reports capital gains or losses, often alongside Form 8949.",
        "summary": "Capital gains and losses generally flow through Schedule D, with Form 8949 supporting detail when required.",
        "confidence": "medium",
    },
    {
        "issue": "schedule_e_required",
        "keywords": ("schedule e", "rental income"),
        "authority_slugs": ("f1040se", "i1040se"),
        "answer": "Schedule E is generally required when a taxpayer reports rental real-estate income or expenses.",
        "summary": "Rental income and expenses generally belong on Schedule E.",
        "confidence": "medium",
    },
]

# Maps a rule's confidence label to the risk level reported alongside it.
RISK_BY_CONFIDENCE = {
    "high": "low",
    "medium": "medium",
    "low": "high",
}

# Tokens dropped from a question before it is used as a search query.
QUESTION_STOPWORDS = {
    "a",
    "also",
    "am",
    "an",
    "and",
    "are",
    "as",
    "at",
    "be",
    "before",
    "but",
    "by",
    "can",
    "considered",
    "did",
    "do",
    "does",
    "for",
    "from",
    "had",
    "has",
    "have",
    "her",
    "hers",
    "his",
    "i",
    "if",
    "in",
    "is",
    "it",
    "its",
    "my",
    "of",
    "or",
    "our",
    "she",
    "should",
    "that",
    "the",
    "their",
    "them",
    "they",
    "this",
    "to",
    "was",
    "we",
    "went",
    "what",
    "worked",
    "would",
    "year",
    "you",
    "your",
}

# Score bonus added per hit according to the source's ``sourceClass``;
# classes not listed here get no bonus.
SEARCH_SOURCE_BONUS = {
    "irs_publication": 30,
    "irs_instructions": 20,
    "irs_faq": 10,
    "irs_form": 0,
}

# Manifest-source fields copied into every authority record, in output order.
_AUTHORITY_FIELDS = ("slug", "title", "sourceClass", "url", "localPath", "authorityRank")


def _authority_record(source: dict[str, Any]) -> dict[str, Any]:
    """Return the authority projection (``_AUTHORITY_FIELDS``) of *source*.

    Raises ``KeyError`` if *source* lacks one of the fields.
    """
    return {field: source[field] for field in _AUTHORITY_FIELDS}


def _normalize_question(question: str) -> str:
    """Return *question* stripped of surrounding whitespace and lower-cased."""
    return question.strip().lower()


def _filing_status_label(status: str) -> str:
    """Turn a status key such as ``head_of_household`` into ``Head Of Household``."""
    return status.replace("_", " ").title()


def _question_terms(normalized_question: str) -> list[str]:
    """Extract the sorted, de-duplicated search terms for a normalized question.

    Tokens are runs of ``[a-z0-9]``; tokens shorter than three characters,
    stopwords and purely numeric tokens are dropped. Questions that mention
    dependents or schooling get a fixed set of related terms added.
    """
    expanded = {
        token
        for token in re.findall(r"[a-z0-9]+", normalized_question)
        if len(token) >= 3 and token not in QUESTION_STOPWORDS and not token.isdigit()
    }
    if expanded & {"dependent", "dependents", "daughter", "son", "child", "children"}:
        expanded.update({"dependent", "qualifying", "child", "support", "residency"})
    if expanded & {"college", "school", "student", "tuition"}:
        expanded.update({"student", "school", "education", "temporary", "absence"})
    return sorted(expanded)


def _load_searchable_pages(path: Path) -> list[str]:
    """Return the searchable text of *path* as a list of page strings.

    Files starting with the ``%PDF`` magic are parsed with ``PdfReader`` and
    yield one string per page that has non-blank extracted text. If parsing
    fails or yields no text, or the file is not a PDF, the raw bytes are
    decoded as UTF-8 (undecodable bytes dropped) and returned as one page.
    An empty list means no usable text was found.
    """
    payload = path.read_bytes()
    if payload.startswith(b"%PDF"):
        try:
            pages = []
            for page in PdfReader(path).pages:
                text = page.extract_text() or ""
                if text.strip():
                    pages.append(text)
            if pages:
                return pages
        except Exception:
            # Deliberate best effort: whatever the PDF parser raises, fall
            # through to the raw-bytes decode below instead of failing the
            # whole search.
            pass
    # ``errors="ignore"`` cannot raise a decode error, so no handler is needed.
    decoded = payload.decode("utf-8", errors="ignore")
    return [decoded] if decoded.strip() else []


def _build_excerpt(text: str, terms: list[str], *, width: int = 420) -> str:
    """Return a whitespace-collapsed excerpt of *text* around the earliest term.

    When no term occurs, the first *width* characters of the collapsed text
    are returned. Otherwise the slice runs from 120 characters before the
    earliest match to *width* characters after its start, so the result can
    be up to ``width + 120`` characters long.
    NOTE(review): confirm that exceeding *width* in the matched case is intended.
    """
    lowered = text.lower()
    positions = [index for index in (lowered.find(term) for term in terms) if index >= 0]
    if not positions:
        return " ".join(text.split())[:width]
    first_index = min(positions)
    start = max(0, first_index - 120)
    end = min(len(text), first_index + width)
    return " ".join(text[start:end].split())


def _rank_research_hits(manifest: dict[str, Any], normalized_question: str) -> list[dict[str, Any]]:
    """Search every cached source page for the question's terms; return the top 5 hits.

    A page is a hit when at least one term occurs in it as a substring.
    Score = 10 per matched term + the source-class bonus - ``authorityRank``.
    Hits are ordered by descending score, then ascending ``authorityRank``,
    slug and page number. Sources whose ``localPath`` does not exist are
    skipped.
    NOTE(review): matching is by substring, so short terms can match inside
    longer words (e.g. "son" in "person") -- confirm this is acceptable.
    """
    terms = _question_terms(normalized_question)
    if not terms:
        return []

    hits: list[dict[str, Any]] = []
    for source in manifest["sources"]:
        path = Path(source["localPath"])
        if not path.exists():
            continue
        for page_number, text in enumerate(_load_searchable_pages(path), start=1):
            lowered = text.lower()
            matched_terms = [term for term in terms if term in lowered]
            if not matched_terms:
                continue
            score = (
                len(matched_terms) * 10
                + SEARCH_SOURCE_BONUS.get(source["sourceClass"], 0)
                - int(source["authorityRank"])
            )
            hits.append(
                {
                    **_authority_record(source),
                    "page": page_number,
                    "score": score,
                    "matchedTerms": matched_terms,
                    "excerpt": _build_excerpt(text, matched_terms),
                }
            )

    hits.sort(key=lambda item: (-item["score"], item["authorityRank"], item["slug"], item["page"]))
    return hits[:5]


# Ordered (patterns, filing_status) pairs; the first pair whose patterns all
# occur in the question wins, so longer phrases are listed before the bare
# "single" pattern.
FILING_STATUS_PATTERNS = (
    (("qualifying surviving spouse",), "qualifying_surviving_spouse"),
    (("qualifying widow",), "qualifying_surviving_spouse"),
    (("qualifying widower",), "qualifying_surviving_spouse"),
    (("surviving spouse",), "qualifying_surviving_spouse"),
    (("married filing jointly",), "married_filing_jointly"),
    (("mfj",), "married_filing_jointly"),
    (("head of household",), "head_of_household"),
    (("hoh",), "head_of_household"),
    (("married filing separately",), "married_filing_separately"),
    (("mfs",), "married_filing_separately"),
    (("single",), "single"),
)


def _infer_filing_status(normalized_question: str, case_facts: dict[str, Any]) -> str:
    """Return the filing status to use for a question.

    An explicit ``filingStatus`` case fact always wins (returned unchanged).
    Otherwise the first matching entry of ``FILING_STATUS_PATTERNS`` is used,
    and ``"single"`` is the default when nothing matches.
    """
    if "filingStatus" in case_facts:
        return case_facts["filingStatus"]
    for patterns, filing_status in FILING_STATUS_PATTERNS:
        if all(pattern in normalized_question for pattern in patterns):
            return filing_status
    return "single"


@dataclass
class QuestionEngine:
    """Answers tax questions against the cached corpus for a given tax year."""

    # Provides ``paths_for_year(tax_year).manifest_path`` for locating the manifest.
    corpus: TaxYearCorpus

    def _manifest(self, tax_year: int) -> dict[str, Any]:
        """Load and return the parsed JSON manifest for *tax_year*.

        Raises:
            FileNotFoundError: if the manifest file does not exist.
        """
        path = self.corpus.paths_for_year(tax_year).manifest_path
        if not path.exists():
            raise FileNotFoundError(
                f"Tax year {tax_year} corpus not found at {path}. Run fetch-year first."
            )
        # Decode explicitly as UTF-8 rather than relying on the platform
        # default encoding.
        return json.loads(path.read_text(encoding="utf-8"))

    def _authorities_for(self, manifest: dict[str, Any], slugs: tuple[str, ...]) -> list[dict[str, Any]]:
        """Return authority records for *slugs*, in order, skipping unknown slugs."""
        sources = {item["slug"]: item for item in manifest["sources"]}
        return [_authority_record(sources[slug]) for slug in slugs if slug in sources]

    def answer(self, *, question: str, tax_year: int, case_facts: dict[str, Any]) -> dict[str, Any]:
        """Build the structured analysis dict for *question*.

        Resolution order: the first matching topic rule, then corpus research
        hits, then a primary-law escalation result. Every result carries the
        same keys (issue, taxYear, factsUsed, missingFacts, authorities,
        conclusion, confidence, riskLevel, followUpQuestions,
        primaryLawRequired, excerpts).

        Raises:
            FileNotFoundError: if the tax-year manifest is missing.
        """
        manifest = self._manifest(tax_year)
        normalized = _normalize_question(question)
        facts_used = [{"field": key, "value": value} for key, value in sorted(case_facts.items())]

        # Stage 1: canned topic rules (every keyword must be present).
        for rule in TOPIC_RULES:
            if not all(keyword in normalized for keyword in rule["keywords"]):
                continue
            authorities = self._authorities_for(manifest, rule["authority_slugs"])
            if rule["issue"] == "standard_deduction":
                filing_status = _infer_filing_status(normalized, case_facts)
                # Statuses without their own amount use the "single" amount.
                answer = rule["answer_by_status"].get(filing_status, rule["answer_by_status"]["single"])
                summary = rule["summary_template"].format(
                    filing_status_label=_filing_status_label(filing_status),
                    answer=answer,
                    tax_year=tax_year,
                )
            else:
                answer = rule["answer"]
                summary = rule["summary"]
            return {
                "issue": rule["issue"],
                "taxYear": tax_year,
                "factsUsed": facts_used,
                "missingFacts": [],
                "authorities": authorities,
                "conclusion": {"answer": answer, "summary": summary},
                "confidence": rule["confidence"],
                "riskLevel": RISK_BY_CONFIDENCE[rule["confidence"]],
                "followUpQuestions": [],
                "primaryLawRequired": False,
                "excerpts": [],
            }

        # Stage 2: keyword research over the cached corpus.
        research_hits = _rank_research_hits(manifest, normalized)
        if research_hits:
            # Several hits can come from the same source; list each source once.
            authorities = []
            seen = set()
            for hit in research_hits:
                if hit["slug"] in seen:
                    continue
                authorities.append(_authority_record(hit))
                seen.add(hit["slug"])
            return {
                "issue": "irs_corpus_research",
                "taxYear": tax_year,
                "factsUsed": facts_used,
                "missingFacts": [],
                "authorities": authorities,
                "excerpts": [
                    {
                        "slug": hit["slug"],
                        "title": hit["title"],
                        "page": hit["page"],
                        "matchedTerms": hit["matchedTerms"],
                        "excerpt": hit["excerpt"],
                    }
                    for hit in research_hits
                ],
                "conclusion": {
                    "answer": "Relevant IRS authorities were found in the downloaded tax-year corpus. Answer from those authorities directly, and only escalate further if the cited passages are still insufficient.",
                    "summary": "Relevant IRS materials in the cached tax-year corpus address this question. Use the cited passages below to answer it directly.",
                },
                "confidence": "medium",
                "riskLevel": "medium",
                "followUpQuestions": [],
                "primaryLawRequired": False,
            }

        # Stage 3: nothing matched -- escalate to primary-law analysis.
        return {
            "issue": "requires_primary_law_escalation",
            "taxYear": tax_year,
            "factsUsed": facts_used,
            "missingFacts": [
                "Internal Revenue Code or Treasury regulation analysis is required before answering this question confidently."
            ],
            "authorities": build_primary_law_authorities(question),
            "conclusion": {
                "answer": "Insufficient IRS-form and instruction support for a confident answer.",
                "summary": "This question needs primary-law analysis before a reliable answer can be given.",
            },
            "confidence": "low",
            "riskLevel": "high",
            "followUpQuestions": [
                "What facts drive the section-level issue?",
                "Is there an existing return position or drafted treatment to review?",
            ],
            "primaryLawRequired": True,
            "excerpts": [],
        }


def render_analysis(analysis: dict[str, Any]) -> str:
    """Render *analysis* as a single space-joined paragraph.

    Includes the summary, confidence/risk, and -- when present -- facts,
    authority titles, at most the first three excerpts, and open items.
    """
    lines = [analysis["conclusion"]["summary"]]
    lines.append(
        f"Confidence: {analysis['confidence']}. Risk: {analysis['riskLevel']}."
    )
    if analysis["factsUsed"]:
        facts = ", ".join(f"{item['field']}={item['value']}" for item in analysis["factsUsed"])
        lines.append(f"Facts used: {facts}.")
    if analysis["authorities"]:
        titles = "; ".join(item["title"] for item in analysis["authorities"])
        lines.append(f"Authorities: {titles}.")
    if analysis.get("excerpts"):
        excerpt_lines = [
            f"{item['title']} p.{item['page']}: {item['excerpt']}"
            for item in analysis["excerpts"][:3]
        ]
        lines.append(f"Excerpts: {' | '.join(excerpt_lines)}")
    if analysis["missingFacts"]:
        lines.append(f"Open items: {' '.join(analysis['missingFacts'])}")
    return " ".join(lines)


def render_memo(analysis: dict[str, Any]) -> str:
    """Render *analysis* as a Markdown memo with one section per heading.

    Sections: Issue, Facts, Authorities, optional IRS Excerpts, Analysis,
    Conclusion, and optional Open Items.
    """
    lines = [
        "# Tax Memo",
        "",
        f"## Issue\n{analysis['issue']}",
        "",
        "## Facts",
    ]
    if analysis["factsUsed"]:
        lines.extend(f"- {item['field']}: {item['value']}" for item in analysis["factsUsed"])
    else:
        lines.append("- No case-specific facts supplied.")

    lines.extend(["", "## Authorities"])
    if analysis["authorities"]:
        lines.extend(f"- {authority['title']}" for authority in analysis["authorities"])
    else:
        lines.append("- Primary-law escalation required.")

    if analysis.get("excerpts"):
        lines.extend(["", "## IRS Excerpts"])
        lines.extend(
            f"- {item['title']} (page {item['page']}): {item['excerpt']}"
            for item in analysis["excerpts"]
        )

    lines.extend(
        [
            "",
            "## Analysis",
            analysis["conclusion"]["summary"],
            f"Confidence: {analysis['confidence']}",
            f"Risk level: {analysis['riskLevel']}",
            "",
            "## Conclusion",
            analysis["conclusion"]["answer"],
        ]
    )
    if analysis["missingFacts"]:
        lines.extend(["", "## Open Items"])
        lines.extend(f"- {item}" for item in analysis["missingFacts"])
    return "\n".join(lines)