feat: add us-cpa tax-year source corpus

This commit is contained in:
Stefano Fiorini
2026-03-15 00:53:18 -05:00
parent 291b729894
commit 0c2e34f2f0
5 changed files with 328 additions and 3 deletions

View File

@@ -0,0 +1,178 @@
from __future__ import annotations
import hashlib
import json
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import IntEnum
from pathlib import Path
from typing import Callable
from urllib.request import urlopen
class AuthorityRank(IntEnum):
    """Integer rank attached to each class of tax source.

    IRS-published material occupies the 10-40 band and primary-law sources
    the 100-120 band. Being an ``IntEnum``, members compare and convert as
    plain integers.

    NOTE(review): whether a lower or a higher value takes precedence is not
    established by this enum -- confirm against the code that consumes it.
    """

    # IRS-published material.
    IRS_FORM = 10
    IRS_INSTRUCTIONS = 20
    IRS_PUBLICATION = 30
    IRS_FAQ = 40

    # Primary-law sources.
    INTERNAL_REVENUE_CODE = 100
    TREASURY_REGULATION = 110
    OTHER_PRIMARY_AUTHORITY = 120
# Lookup table from the snake_case ``source_class`` string used by source
# descriptors to the matching AuthorityRank member (one entry per member).
AUTHORITY_RANKS: dict[str, AuthorityRank] = {
    # IRS-published material.
    "irs_form": AuthorityRank.IRS_FORM,
    "irs_instructions": AuthorityRank.IRS_INSTRUCTIONS,
    "irs_publication": AuthorityRank.IRS_PUBLICATION,
    "irs_faq": AuthorityRank.IRS_FAQ,
    # Primary-law sources.
    "internal_revenue_code": AuthorityRank.INTERNAL_REVENUE_CODE,
    "treasury_regulation": AuthorityRank.TREASURY_REGULATION,
    "other_primary_authority": AuthorityRank.OTHER_PRIMARY_AUTHORITY,
}
def authority_rank_for(source_class: str) -> AuthorityRank:
    """Look up the authority rank registered for *source_class*.

    Args:
        source_class: A key of ``AUTHORITY_RANKS`` such as ``"irs_form"``.

    Returns:
        The ``AuthorityRank`` member mapped to that key.

    Raises:
        KeyError: If *source_class* is not a key of ``AUTHORITY_RANKS``.
    """
    rank = AUTHORITY_RANKS[source_class]
    return rank
@dataclass(frozen=True)
class SourceDescriptor:
    """Immutable description of one downloadable source document."""

    # Short identifier for the document.
    slug: str
    # Human-readable document title.
    title: str
    # Source-class string, e.g. "irs_form" or "irs_instructions".
    source_class: str
    # MIME type of the document, e.g. "application/pdf".
    media_type: str
    # Location the document is fetched from.
    url: str
@dataclass(frozen=True)
class TaxYearPaths:
    """Immutable bundle of the filesystem locations used for one tax year."""

    # Directory holding everything cached for the year.
    year_dir: Path
    # Directory holding the downloaded IRS documents.
    irs_dir: Path
    # JSON manifest file describing the cached sources.
    manifest_path: Path
def default_cache_root() -> Path:
    """Return the resolved directory under which cached sources are stored.

    A non-empty ``US_CPA_CACHE_DIR`` environment variable wins (with ``~``
    expanded); otherwise the location is ``~/.cache/us-cpa``.
    """
    configured = os.environ.get("US_CPA_CACHE_DIR")
    if configured:
        base = Path(configured).expanduser()
    else:
        base = Path.home() / ".cache" / "us-cpa"
    return base.resolve()
def build_irs_prior_pdf_url(slug: str, tax_year: int) -> str:
    """Return the URL of the PDF for *slug* and *tax_year* under ``irs-prior``.

    >>> build_irs_prior_pdf_url("f1040", 2024)
    'https://www.irs.gov/pub/irs-prior/f1040--2024.pdf'
    """
    # The file name is the slug and the year joined by a double hyphen.
    file_name = f"{slug}--{tax_year}.pdf"
    return f"https://www.irs.gov/pub/irs-prior/{file_name}"
def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]:
    """Build the starter catalog of Form 1040 family sources for *tax_year*.

    Args:
        tax_year: Year used to build each document's prior-year PDF URL.

    Returns:
        One ``SourceDescriptor`` per catalog entry, in the order listed below.
        Every descriptor has media type ``application/pdf`` and a URL produced
        by ``build_irs_prior_pdf_url``.
    """
    # (slug, title, source_class). The slug is the IRS file-name stem, so it
    # must match the IRS naming exactly: Schedule SE is "f1040sse"/"i1040sse",
    # whereas "f1040se"/"i1040se" are Schedule E (a different document).
    entries = [
        ("f1040", "Form 1040", "irs_form"),
        ("f1040s1", "Schedule 1 (Form 1040)", "irs_form"),
        ("f1040s2", "Schedule 2 (Form 1040)", "irs_form"),
        ("f1040s3", "Schedule 3 (Form 1040)", "irs_form"),
        ("f1040sa", "Schedule A (Form 1040)", "irs_form"),
        ("f1040sb", "Schedule B (Form 1040)", "irs_form"),
        ("f1040sc", "Schedule C (Form 1040)", "irs_form"),
        ("f1040sd", "Schedule D (Form 1040)", "irs_form"),
        ("f1040sse", "Schedule SE (Form 1040)", "irs_form"),
        ("f1040s8", "Schedule 8812 (Form 1040)", "irs_form"),
        ("f8949", "Form 8949", "irs_form"),
        ("i1040gi", "Instructions for Form 1040 and Schedules 1-3", "irs_instructions"),
        ("i1040sca", "Instructions for Schedule A", "irs_instructions"),
        ("i1040sc", "Instructions for Schedule C", "irs_instructions"),
        ("i1040sd", "Instructions for Schedule D", "irs_instructions"),
        ("i1040sse", "Instructions for Schedule SE", "irs_instructions"),
        ("i1040s8", "Instructions for Schedule 8812 (Form 1040)", "irs_instructions"),
        ("i8949", "Instructions for Form 8949", "irs_instructions"),
    ]
    return [
        SourceDescriptor(
            slug=slug,
            title=title,
            source_class=source_class,
            media_type="application/pdf",
            url=build_irs_prior_pdf_url(slug, tax_year),
        )
        for slug, title, source_class in entries
    ]
def _sha256_bytes(payload: bytes) -> str:
    """Return the lowercase hexadecimal SHA-256 digest of *payload*."""
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()
def _http_fetch(url: str, timeout: float = 30.0) -> bytes:
    """Download *url* and return the complete response body.

    Args:
        url: Location to fetch.
        timeout: Seconds to wait on blocking network operations before giving
            up. Without a timeout a stalled connection would block the caller
            indefinitely.

    Returns:
        The raw response body.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for HTTP
            error statuses).
        TimeoutError: If reading the body exceeds *timeout*.
    """
    with urlopen(url, timeout=timeout) as response:
        return response.read()
class TaxYearCorpus:
    """Local cache of tax-source documents, organised by tax year.

    Layout under ``cache_root``::

        tax-years/<tax_year>/manifest.json
        tax-years/<tax_year>/irs/<slug>.pdf
    """

    def __init__(self, cache_root: Path | None = None) -> None:
        """Create a corpus rooted at *cache_root*.

        Args:
            cache_root: Directory holding the cache. When omitted,
                ``default_cache_root()`` is used.
        """
        self.cache_root = cache_root or default_cache_root()

    def paths_for_year(self, tax_year: int) -> TaxYearPaths:
        """Return the cache locations for *tax_year* without creating them."""
        year_dir = self.cache_root / "tax-years" / str(tax_year)
        return TaxYearPaths(
            year_dir=year_dir,
            irs_dir=year_dir / "irs",
            manifest_path=year_dir / "manifest.json",
        )

    def download_catalog(
        self,
        tax_year: int,
        catalog: list[SourceDescriptor],
        *,
        fetcher: Callable[[str], bytes] = _http_fetch,
    ) -> dict:
        """Fetch every source in *catalog*, cache it, and write the manifest.

        Each document is stored as ``<slug>.pdf`` in the year's ``irs``
        directory (the ``.pdf`` suffix is used regardless of the descriptor's
        media type), and a ``manifest.json`` describing all of them is written
        to the year directory.

        Args:
            tax_year: Year whose cache directory is populated.
            catalog: Descriptors of the documents to download.
            fetcher: Callable that maps a URL to the document's bytes.
                Keyword-only so callers can inject an alternative transport.

        Returns:
            The manifest dictionary that was serialised to ``manifest.json``.

        Raises:
            KeyError: If any descriptor's ``source_class`` has no authority
                rank. This is raised before anything is fetched or written.
        """
        descriptors = list(catalog)
        # Resolve every rank up front so an unknown source class fails fast,
        # instead of part-way through after some files were already written.
        ranks = [int(authority_rank_for(item.source_class)) for item in descriptors]

        paths = self.paths_for_year(tax_year)
        paths.irs_dir.mkdir(parents=True, exist_ok=True)
        # One timestamp is shared by the manifest and every source entry.
        fetched_at = datetime.now(timezone.utc).isoformat()

        sources: list[dict] = []
        for descriptor, rank in zip(descriptors, ranks):
            payload = fetcher(descriptor.url)
            destination = paths.irs_dir / f"{descriptor.slug}.pdf"
            destination.write_bytes(payload)
            sources.append(
                {
                    "slug": descriptor.slug,
                    "title": descriptor.title,
                    "sourceClass": descriptor.source_class,
                    "mediaType": descriptor.media_type,
                    "url": descriptor.url,
                    "localPath": str(destination),
                    "sha256": _sha256_bytes(payload),
                    "fetchedAt": fetched_at,
                    "authorityRank": rank,
                }
            )

        manifest = {
            "taxYear": tax_year,
            "fetchedAt": fetched_at,
            "cacheRoot": str(self.cache_root),
            "sourceCount": len(sources),
            "sources": sources,
            "indexes": self.index_manifest(sources),
            # Entries for primary-law source classes; nothing is downloaded
            # for them here.
            "primaryLawHooks": [
                {
                    "sourceClass": "internal_revenue_code",
                    "authorityRank": int(AuthorityRank.INTERNAL_REVENUE_CODE),
                },
                {
                    "sourceClass": "treasury_regulation",
                    "authorityRank": int(AuthorityRank.TREASURY_REGULATION),
                },
            ],
        }
        # Explicit encoding keeps the file independent of the platform locale.
        paths.manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
        return manifest

    @staticmethod
    def index_manifest(sources: list[dict]) -> dict[str, dict[str, list[str]]]:
        """Build lookup indexes over manifest source entries.

        Args:
            sources: Entries carrying at least ``sourceClass``, ``slug`` and
                ``localPath`` keys.

        Returns:
            A dict with ``bySourceClass`` (source class -> slugs) and
            ``bySlug`` (slug -> local paths), each preserving input order.
        """
        by_class: dict[str, list[str]] = {}
        by_slug: dict[str, list[str]] = {}
        for source in sources:
            by_class.setdefault(source["sourceClass"], []).append(source["slug"])
            by_slug.setdefault(source["slug"], []).append(source["localPath"])
        return {"bySourceClass": by_class, "bySlug": by_slug}