# Tax-year IRS source corpus.
#
# Builds a catalog of IRS prior-year PDFs (forms, instructions, publications),
# downloads them into a per-tax-year cache directory, and records a JSON
# manifest with hashes, authority ranks and lookup indexes.  Also extracts
# Internal Revenue Code and Treasury Regulation citations from free text.
from __future__ import annotations

import hashlib
import json
import os
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import IntEnum
from pathlib import Path
from typing import Callable
from urllib.request import urlopen

# Upper bound (seconds) on a single download; without it a stalled server
# would block `urlopen` indefinitely.
_HTTP_TIMEOUT_SECONDS = 30.0

# "section 162" / "sec. 199a" in lower-cased text.
#  * `\b` keeps words such as "intersection 5" from matching.
#  * `(?![a-z0-9-])` pins the match to the whole token so the engine cannot
#    backtrack to a shorter prefix.
#  * `(?!\.\d)` rejects regulation-style numbers such as "section 1.162-1".
_IRC_SECTION_PATTERN = re.compile(
    r"\b(?:section|sec\.)\s+(\d+[a-z0-9-]*)(?![a-z0-9-])(?!\.\d)"
)

# "treas. reg. 1.162-1", "treasury regulation section 1.162-1", ... in
# lower-cased text.  The optional "section"/"sec." word is tolerated between
# the prefix and the number.
_TREASURY_REGULATION_PATTERN = re.compile(
    r"\btreas(?:ury)?\.?\s+reg(?:ulation)?s?\.?\s*"
    r"(?:(?:section|sec\.)\s*)?([\d.]+-\d+)"
)


class AuthorityRank(IntEnum):
    """Numeric rank attached to each class of tax source."""

    IRS_FORM = 10
    IRS_INSTRUCTIONS = 20
    IRS_PUBLICATION = 30
    IRS_FAQ = 40
    INTERNAL_REVENUE_CODE = 100
    TREASURY_REGULATION = 110
    OTHER_PRIMARY_AUTHORITY = 120


# Maps the string source-class identifiers used in descriptors and manifests
# to their AuthorityRank.
AUTHORITY_RANKS: dict[str, AuthorityRank] = {
    "irs_form": AuthorityRank.IRS_FORM,
    "irs_instructions": AuthorityRank.IRS_INSTRUCTIONS,
    "irs_publication": AuthorityRank.IRS_PUBLICATION,
    "irs_faq": AuthorityRank.IRS_FAQ,
    "internal_revenue_code": AuthorityRank.INTERNAL_REVENUE_CODE,
    "treasury_regulation": AuthorityRank.TREASURY_REGULATION,
    "other_primary_authority": AuthorityRank.OTHER_PRIMARY_AUTHORITY,
}


def authority_rank_for(source_class: str) -> AuthorityRank:
    """Return the AuthorityRank registered for *source_class*.

    Raises:
        KeyError: if *source_class* is not a key of ``AUTHORITY_RANKS``.  The
            message names the offending value and the accepted ones.
    """
    try:
        return AUTHORITY_RANKS[source_class]
    except KeyError:
        known = ", ".join(sorted(AUTHORITY_RANKS))
        raise KeyError(
            f"unknown source class {source_class!r}; expected one of: {known}"
        ) from None


@dataclass(frozen=True)
class SourceDescriptor:
    """Immutable description of one downloadable source document."""

    slug: str  # identifier; also used as the cached file's base name
    title: str  # human-readable title
    source_class: str  # key into AUTHORITY_RANKS
    media_type: str  # MIME type recorded in the manifest
    url: str  # location the document is fetched from


@dataclass(frozen=True)
class TaxYearPaths:
    """Filesystem locations used for a single tax year."""

    year_dir: Path  # <cache_root>/tax-years/<year>
    irs_dir: Path  # <year_dir>/irs, holds the downloaded documents
    manifest_path: Path  # <year_dir>/manifest.json


def default_cache_root() -> Path:
    """Return the cache root directory as a resolved absolute path.

    Uses the ``US_CPA_CACHE_DIR`` environment variable when it is set to a
    non-empty value (with ``~`` expanded); otherwise ``~/.cache/us-cpa``.
    """
    override = os.getenv("US_CPA_CACHE_DIR")
    if override:
        return Path(override).expanduser().resolve()
    return (Path.home() / ".cache" / "us-cpa").resolve()


def build_irs_prior_pdf_url(slug: str, tax_year: int) -> str:
    """Return the irs.gov prior-year PDF URL for *slug* and *tax_year*."""
    return f"https://www.irs.gov/pub/irs-prior/{slug}--{tax_year}.pdf"


def build_primary_law_authorities(question: str) -> list[dict[str, str | int]]:
    """Extract primary-law citations mentioned in *question*.

    The text is lower-cased and scanned for Internal Revenue Code sections
    ("section 162", "sec. 199A") and Treasury Regulations
    ("Treas. Reg. 1.162-1", "Treasury Regulation section 1.162-1").

    Returns:
        One dict per distinct citation with the keys ``slug``, ``title``,
        ``sourceClass``, ``url`` and ``authorityRank``.  IRC sections come
        first, then regulations, each in order of first appearance.  A
        citation repeated in the question is reported once.
    """
    normalized = question.lower()
    authorities: list[dict[str, str | int]] = []
    seen_slugs: set[str] = set()

    for match in _IRC_SECTION_PATTERN.finditer(normalized):
        section = match.group(1)
        slug = f"irc-{section}"
        if slug in seen_slugs:
            continue
        seen_slugs.add(slug)
        authorities.append(
            {
                "slug": slug,
                "title": f"Internal Revenue Code section {section}",
                "sourceClass": "internal_revenue_code",
                "url": f"https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title26-section{section}&num=0&edition=prelim",
                "authorityRank": int(AuthorityRank.INTERNAL_REVENUE_CODE),
            }
        )

    for match in _TREASURY_REGULATION_PATTERN.finditer(normalized):
        section = match.group(1)
        slug = f"reg-{section}"
        if slug in seen_slugs:
            continue
        seen_slugs.add(slug)
        authorities.append(
            {
                "slug": slug,
                "title": f"Treasury Regulation {section}",
                "sourceClass": "treasury_regulation",
                "url": f"https://www.ecfr.gov/current/title-26/section-{section}",
                "authorityRank": int(AuthorityRank.TREASURY_REGULATION),
            }
        )

    return authorities


def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]:
    """Return the built-in catalog of IRS PDFs for *tax_year*.

    Every entry is a PDF whose URL is produced by ``build_irs_prior_pdf_url``.
    """
    entries = [
        ("f1040", "Form 1040", "irs_form"),
        ("f1040s1", "Schedule 1 (Form 1040)", "irs_form"),
        ("f1040s2", "Schedule 2 (Form 1040)", "irs_form"),
        ("f1040s3", "Schedule 3 (Form 1040)", "irs_form"),
        ("f1040sa", "Schedule A (Form 1040)", "irs_form"),
        ("f1040sb", "Schedule B (Form 1040)", "irs_form"),
        ("f1040sc", "Schedule C (Form 1040)", "irs_form"),
        ("f1040sd", "Schedule D (Form 1040)", "irs_form"),
        ("f1040se", "Schedule E (Form 1040)", "irs_form"),
        ("f1040sse", "Schedule SE (Form 1040)", "irs_form"),
        ("f1040s8", "Schedule 8812 (Form 1040)", "irs_form"),
        ("f8949", "Form 8949", "irs_form"),
        ("f4562", "Form 4562", "irs_form"),
        ("f4797", "Form 4797", "irs_form"),
        ("f6251", "Form 6251", "irs_form"),
        ("f8606", "Form 8606", "irs_form"),
        ("f8863", "Form 8863", "irs_form"),
        ("f8889", "Form 8889", "irs_form"),
        ("f8959", "Form 8959", "irs_form"),
        ("f8960", "Form 8960", "irs_form"),
        ("f8995", "Form 8995", "irs_form"),
        ("f8995a", "Form 8995-A", "irs_form"),
        ("f5329", "Form 5329", "irs_form"),
        ("f5695", "Form 5695", "irs_form"),
        ("f1116", "Form 1116", "irs_form"),
        ("i1040gi", "Instructions for Form 1040 and Schedules 1-3", "irs_instructions"),
        ("i1040sca", "Instructions for Schedule A", "irs_instructions"),
        ("i1040sc", "Instructions for Schedule C", "irs_instructions"),
        ("i1040sd", "Instructions for Schedule D", "irs_instructions"),
        ("i1040se", "Instructions for Schedule E (Form 1040)", "irs_instructions"),
        ("i1040sse", "Instructions for Schedule SE", "irs_instructions"),
        ("i1040s8", "Instructions for Schedule 8812 (Form 1040)", "irs_instructions"),
        ("i8949", "Instructions for Form 8949", "irs_instructions"),
        ("i4562", "Instructions for Form 4562", "irs_instructions"),
        ("i4797", "Instructions for Form 4797", "irs_instructions"),
        ("i6251", "Instructions for Form 6251", "irs_instructions"),
        ("i8606", "Instructions for Form 8606", "irs_instructions"),
        ("i8863", "Instructions for Form 8863", "irs_instructions"),
        ("i8889", "Instructions for Form 8889", "irs_instructions"),
        ("i8959", "Instructions for Form 8959", "irs_instructions"),
        ("i8960", "Instructions for Form 8960", "irs_instructions"),
        ("i8995", "Instructions for Form 8995", "irs_instructions"),
        ("i8995a", "Instructions for Form 8995-A", "irs_instructions"),
        ("i5329", "Instructions for Form 5329", "irs_instructions"),
        ("i5695", "Instructions for Form 5695", "irs_instructions"),
        ("i1116", "Instructions for Form 1116", "irs_instructions"),
        ("p501", "Publication 501, Dependents, Standard Deduction, and Filing Information", "irs_publication"),
    ]
    return [
        SourceDescriptor(
            slug=slug,
            title=title,
            source_class=source_class,
            media_type="application/pdf",
            url=build_irs_prior_pdf_url(slug, tax_year),
        )
        for slug, title, source_class in entries
    ]


def _sha256_bytes(payload: bytes) -> str:
    """Return the hex SHA-256 digest of *payload*."""
    return hashlib.sha256(payload).hexdigest()


def _http_fetch(url: str, timeout: float = _HTTP_TIMEOUT_SECONDS) -> bytes:
    """Download *url* and return the response body.

    *timeout* (seconds) bounds blocking network operations so an unresponsive
    server cannot hang the caller forever.
    """
    with urlopen(url, timeout=timeout) as response:
        return response.read()


def _write_bytes_atomically(destination: Path, payload: bytes) -> None:
    """Write *payload* to *destination* via a sibling temp file and rename.

    An interrupted write therefore never leaves a truncated *destination*.
    """
    staging = destination.with_name(destination.name + ".part")
    try:
        staging.write_bytes(payload)
        os.replace(staging, destination)
    except BaseException:
        # Do not leave the partial staging file behind on failure.
        if staging.exists():
            staging.unlink()
        raise


class TaxYearCorpus:
    """On-disk cache of source documents, organised by tax year."""

    def __init__(self, cache_root: Path | None = None) -> None:
        """Use *cache_root*, or ``default_cache_root()`` when it is falsy.

        A ``str`` is accepted as well and converted to ``Path``.
        """
        self.cache_root = Path(cache_root) if cache_root else default_cache_root()

    def paths_for_year(self, tax_year: int) -> TaxYearPaths:
        """Return the directory layout for *tax_year* (nothing is created)."""
        year_dir = self.cache_root / "tax-years" / str(tax_year)
        return TaxYearPaths(
            year_dir=year_dir,
            irs_dir=year_dir / "irs",
            manifest_path=year_dir / "manifest.json",
        )

    def download_catalog(
        self,
        tax_year: int,
        catalog: list[SourceDescriptor],
        *,
        fetcher: Callable[[str], bytes] = _http_fetch,
    ) -> dict:
        """Fetch every descriptor in *catalog* and write the year's manifest.

        Each payload is stored as ``<irs_dir>/<slug>.pdf`` (the ``.pdf``
        suffix is fixed regardless of ``media_type``), then ``manifest.json``
        is written as UTF-8 JSON.

        Args:
            tax_year: year whose cache directory is populated.
            catalog: descriptors to download, in order.
            fetcher: callable mapping a URL to its bytes; defaults to an
                HTTP download.

        Returns:
            The manifest dict that was written to disk.

        Raises:
            KeyError: if any descriptor has an unknown ``source_class``.  This
                is checked for the whole catalog before anything is fetched
                or written.
        """
        paths = self.paths_for_year(tax_year)

        # Resolve all ranks first so a bad descriptor fails before any
        # network or disk I/O happens.
        ranks = [
            int(authority_rank_for(descriptor.source_class))
            for descriptor in catalog
        ]

        paths.irs_dir.mkdir(parents=True, exist_ok=True)
        # One timestamp is shared by the manifest and every source entry.
        fetched_at = datetime.now(timezone.utc).isoformat()

        sources: list[dict] = []
        for descriptor, rank in zip(catalog, ranks):
            payload = fetcher(descriptor.url)
            destination = paths.irs_dir / f"{descriptor.slug}.pdf"
            _write_bytes_atomically(destination, payload)
            sources.append(
                {
                    "slug": descriptor.slug,
                    "title": descriptor.title,
                    "sourceClass": descriptor.source_class,
                    "mediaType": descriptor.media_type,
                    "url": descriptor.url,
                    "localPath": str(destination),
                    "sha256": _sha256_bytes(payload),
                    "fetchedAt": fetched_at,
                    "authorityRank": rank,
                }
            )

        manifest = {
            "taxYear": tax_year,
            "fetchedAt": fetched_at,
            "cacheRoot": str(self.cache_root),
            "sourceCount": len(sources),
            "sources": sources,
            "indexes": self.index_manifest(sources),
            "primaryLawHooks": [
                {
                    "sourceClass": "internal_revenue_code",
                    "authorityRank": int(AuthorityRank.INTERNAL_REVENUE_CODE),
                },
                {
                    "sourceClass": "treasury_regulation",
                    "authorityRank": int(AuthorityRank.TREASURY_REGULATION),
                },
            ],
        }
        _write_bytes_atomically(
            paths.manifest_path,
            json.dumps(manifest, indent=2).encode("utf-8"),
        )
        return manifest

    @staticmethod
    def index_manifest(sources: list[dict]) -> dict[str, dict[str, list[str]]]:
        """Build lookup indexes over manifest *sources*.

        Returns a dict with ``bySourceClass`` (source class -> slugs) and
        ``bySlug`` (slug -> local paths), both in input order.
        """
        by_class: dict[str, list[str]] = {}
        by_slug: dict[str, list[str]] = {}
        for source in sources:
            by_class.setdefault(source["sourceClass"], []).append(source["slug"])
            by_slug.setdefault(source["slug"], []).append(source["localPath"])
        return {"bySourceClass": by_class, "bySlug": by_slug}