240 lines
8.9 KiB
Python
240 lines
8.9 KiB
Python
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from enum import IntEnum
|
|
from pathlib import Path
|
|
from typing import Callable
|
|
from urllib.request import urlopen
|
|
|
|
|
|
class AuthorityRank(IntEnum):
    """Integer rank attached to each class of tax source.

    IRS-published material occupies the 10-40 band and primary-law sources
    the 100-120 band.  As an ``IntEnum``, members compare and convert like
    plain integers.

    NOTE(review): this file does not show whether a lower or a higher value
    takes precedence -- confirm against the code that consumes the ranks.
    """

    # IRS-published material (10-40).
    IRS_FORM = 10
    IRS_INSTRUCTIONS = 20
    IRS_PUBLICATION = 30
    IRS_FAQ = 40

    # Primary-law sources (100-120).
    INTERNAL_REVENUE_CODE = 100
    TREASURY_REGULATION = 110
    OTHER_PRIMARY_AUTHORITY = 120
|
|
|
|
|
|
# Lookup from a lowercase source-class identifier to its AuthorityRank member.
# Every key is the corresponding member name in lower case, so the member is
# resolved by upper-casing the key.
AUTHORITY_RANKS: dict[str, AuthorityRank] = {
    source_class: AuthorityRank[source_class.upper()]
    for source_class in (
        "irs_form",
        "irs_instructions",
        "irs_publication",
        "irs_faq",
        "internal_revenue_code",
        "treasury_regulation",
        "other_primary_authority",
    )
}
|
|
|
|
|
|
def authority_rank_for(source_class: str) -> AuthorityRank:
    """Return the rank registered for *source_class* in ``AUTHORITY_RANKS``.

    Args:
        source_class: Lowercase source-class identifier, e.g. ``"irs_form"``.

    Raises:
        KeyError: If *source_class* is not a key of ``AUTHORITY_RANKS``.
    """
    rank = AUTHORITY_RANKS[source_class]
    return rank
|
|
|
|
|
|
@dataclass(frozen=True)
class SourceDescriptor:
    """Immutable description of one downloadable source document."""

    # Short identifier; this file uses it as the IRS file stem in the download
    # URL and as the stem of the locally cached file.
    slug: str
    # Human-readable document title.
    title: str
    # Source-class identifier; looked up in AUTHORITY_RANKS when a manifest
    # entry is written.
    source_class: str
    # Media type recorded in the manifest, e.g. "application/pdf".
    media_type: str
    # Location the document is fetched from.
    url: str
|
|
|
|
|
|
@dataclass(frozen=True)
class TaxYearPaths:
    """Immutable bundle of the cache locations used for one tax year."""

    # Directory holding everything cached for the year.
    year_dir: Path
    # Directory the downloaded IRS documents are written to.
    irs_dir: Path
    # JSON manifest describing the downloaded documents.
    manifest_path: Path
|
|
|
|
|
|
def default_cache_root() -> Path:
    """Return the resolved directory under which cached sources are stored.

    A non-empty ``US_CPA_CACHE_DIR`` environment variable takes precedence
    (with ``~`` expanded); otherwise ``~/.cache/us-cpa`` is used.
    """
    configured = os.getenv("US_CPA_CACHE_DIR")
    if configured:
        base = Path(configured).expanduser()
    else:
        base = Path.home() / ".cache" / "us-cpa"
    return base.resolve()
|
|
|
|
|
|
def build_irs_prior_pdf_url(slug: str, tax_year: int) -> str:
    """Return the irs.gov ``irs-prior`` PDF URL for *slug* in *tax_year*.

    The file name is the slug and the year joined by a double hyphen, e.g.
    ``f1040--2023.pdf``.
    """
    url = f"https://www.irs.gov/pub/irs-prior/{slug}--{tax_year}.pdf"
    return url
|
|
|
|
|
|
def build_primary_law_authorities(question: str) -> list[dict[str, str | int]]:
    """Extract the primary-law citations mentioned in *question*.

    The text is lower-cased and scanned for two citation shapes:

    * Internal Revenue Code sections -- ``section 162`` / ``sec. 199a``.
    * Treasury regulations -- ``treas. reg. 1.61-1``, optionally written
      with ``section`` / ``sec.`` before the number.

    Args:
        question: Free-form text that may cite code sections or regulations.

    Returns:
        One dict per distinct citation with the keys ``slug``, ``title``,
        ``sourceClass``, ``url`` and ``authorityRank``.  Code sections come
        first, then regulations, each in order of first appearance.  Section
        identifiers are reported lower-cased because matching runs on the
        lower-cased text.
    """
    normalized = question.lower()

    # Regulations are located first so that a "section" keyword inside a
    # regulation citation ("treas. reg. section 1.61-1") is not also read as
    # an Internal Revenue Code citation ("section 1").
    regulation_matches = list(
        re.finditer(
            r"treas(?:ury)?\.?\s+reg(?:ulation)?\.?\s*"
            r"(?:(?:section|sec\.)\s+)?([\d.]+-\d+)",
            normalized,
        )
    )
    regulation_spans = [match.span() for match in regulation_matches]

    authorities: list[dict[str, str | int]] = []
    seen_slugs: set[str] = set()

    # The leading \b keeps words that merely end in "section" (for example
    # "intersection 5") from being treated as citations.
    for match in re.finditer(r"\b(?:section|sec\.)\s+(\d+[a-z0-9-]*)", normalized):
        if any(start <= match.start() < end for start, end in regulation_spans):
            continue
        section = match.group(1)
        slug = f"irc-{section}"
        if slug in seen_slugs:
            # Same section cited again; one entry is enough.
            continue
        seen_slugs.add(slug)
        authorities.append(
            {
                "slug": slug,
                "title": f"Internal Revenue Code section {section}",
                "sourceClass": "internal_revenue_code",
                "url": f"https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title26-section{section}&num=0&edition=prelim",
                "authorityRank": int(AuthorityRank.INTERNAL_REVENUE_CODE),
            }
        )

    for match in regulation_matches:
        section = match.group(1)
        slug = f"reg-{section}"
        if slug in seen_slugs:
            continue
        seen_slugs.add(slug)
        authorities.append(
            {
                "slug": slug,
                "title": f"Treasury Regulation {section}",
                "sourceClass": "treasury_regulation",
                "url": f"https://www.ecfr.gov/current/title-26/section-{section}",
                "authorityRank": int(AuthorityRank.TREASURY_REGULATION),
            }
        )

    return authorities
|
|
|
|
|
|
def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]:
    """Return the built-in catalog of IRS PDF sources for *tax_year*.

    The catalog lists forms first, then instructions, then publications.
    Every descriptor has media type ``application/pdf`` and a URL produced by
    ``build_irs_prior_pdf_url`` from its slug and *tax_year*.
    """
    forms = (
        ("f1040", "Form 1040"),
        ("f1040s1", "Schedule 1 (Form 1040)"),
        ("f1040s2", "Schedule 2 (Form 1040)"),
        ("f1040s3", "Schedule 3 (Form 1040)"),
        ("f1040sa", "Schedule A (Form 1040)"),
        ("f1040sb", "Schedule B (Form 1040)"),
        ("f1040sc", "Schedule C (Form 1040)"),
        ("f1040sd", "Schedule D (Form 1040)"),
        ("f1040se", "Schedule E (Form 1040)"),
        ("f1040sse", "Schedule SE (Form 1040)"),
        ("f1040s8", "Schedule 8812 (Form 1040)"),
        ("f8949", "Form 8949"),
        ("f4562", "Form 4562"),
        ("f4797", "Form 4797"),
        ("f6251", "Form 6251"),
        ("f8606", "Form 8606"),
        ("f8863", "Form 8863"),
        ("f8889", "Form 8889"),
        ("f8959", "Form 8959"),
        ("f8960", "Form 8960"),
        ("f8995", "Form 8995"),
        ("f8995a", "Form 8995-A"),
        ("f5329", "Form 5329"),
        ("f5695", "Form 5695"),
        ("f1116", "Form 1116"),
    )
    instructions = (
        ("i1040gi", "Instructions for Form 1040 and Schedules 1-3"),
        ("i1040sca", "Instructions for Schedule A"),
        ("i1040sc", "Instructions for Schedule C"),
        ("i1040sd", "Instructions for Schedule D"),
        ("i1040se", "Instructions for Schedule E (Form 1040)"),
        ("i1040sse", "Instructions for Schedule SE"),
        ("i1040s8", "Instructions for Schedule 8812 (Form 1040)"),
        ("i8949", "Instructions for Form 8949"),
        ("i4562", "Instructions for Form 4562"),
        ("i4797", "Instructions for Form 4797"),
        ("i6251", "Instructions for Form 6251"),
        ("i8606", "Instructions for Form 8606"),
        ("i8863", "Instructions for Form 8863"),
        ("i8889", "Instructions for Form 8889"),
        ("i8959", "Instructions for Form 8959"),
        ("i8960", "Instructions for Form 8960"),
        ("i8995", "Instructions for Form 8995"),
        ("i8995a", "Instructions for Form 8995-A"),
        ("i5329", "Instructions for Form 5329"),
        ("i5695", "Instructions for Form 5695"),
        ("i1116", "Instructions for Form 1116"),
    )
    publications = (
        ("p501", "Publication 501, Dependents, Standard Deduction, and Filing Information"),
    )

    # Group order is the catalog order: forms, instructions, publications.
    catalog: list[SourceDescriptor] = []
    for source_class, documents in (
        ("irs_form", forms),
        ("irs_instructions", instructions),
        ("irs_publication", publications),
    ):
        for slug, title in documents:
            catalog.append(
                SourceDescriptor(
                    slug=slug,
                    title=title,
                    source_class=source_class,
                    media_type="application/pdf",
                    url=build_irs_prior_pdf_url(slug, tax_year),
                )
            )
    return catalog
|
|
|
|
|
|
def _sha256_bytes(payload: bytes) -> str:
    """Return the SHA-256 digest of *payload* as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()
|
|
|
|
|
|
def _http_fetch(url: str, *, timeout: float = 30.0) -> bytes:
    """Download *url* and return the complete response body.

    Args:
        url: Location to fetch; passed unchanged to ``urlopen``.
        timeout: Seconds to wait on blocking network operations.  Without a
            timeout ``urlopen`` waits indefinitely, so a stalled server
            would hang the caller.

    Returns:
        The raw bytes of the response body.

    Raises:
        urllib.error.URLError: If the request fails (``urlopen`` raises it,
            including when the connection attempt times out).
    """
    with urlopen(url, timeout=timeout) as response:
        return response.read()
|
|
|
|
|
|
class TaxYearCorpus:
    """Per-tax-year cache of downloaded source documents plus a manifest."""

    def __init__(self, cache_root: Path | None = None) -> None:
        """Use *cache_root*, or ``default_cache_root()`` when it is not given."""
        self.cache_root = cache_root or default_cache_root()

    def paths_for_year(self, tax_year: int) -> TaxYearPaths:
        """Return the cache locations for *tax_year*.

        The year directory is ``<cache_root>/tax-years/<tax_year>``; it holds
        an ``irs`` sub-directory and a ``manifest.json`` file.  Nothing is
        created on disk by this method.
        """
        base = self.cache_root / "tax-years" / str(tax_year)
        irs_dir = base / "irs"
        manifest_path = base / "manifest.json"
        return TaxYearPaths(year_dir=base, irs_dir=irs_dir, manifest_path=manifest_path)

    def download_catalog(
        self,
        tax_year: int,
        catalog: list[SourceDescriptor],
        *,
        fetcher: Callable[[str], bytes] = _http_fetch,
    ) -> dict:
        """Fetch every descriptor in *catalog* and write the year's manifest.

        Args:
            tax_year: Year whose cache directory receives the files.
            catalog: Documents to download, processed in order.
            fetcher: Callable mapping a URL to the document bytes.

        Returns:
            The manifest dict that was also written to ``manifest.json``.

        Each payload is stored as ``<slug>.pdf`` in the year's ``irs``
        directory, whatever the descriptor's media type.  An exception from
        *fetcher* propagates, and the manifest is not written in that case.
        """
        layout = self.paths_for_year(tax_year)
        layout.irs_dir.mkdir(parents=True, exist_ok=True)

        # One timestamp, taken before the first download, stamps every entry
        # and the manifest itself.
        timestamp = datetime.now(timezone.utc).isoformat()

        records: list[dict] = []
        for item in catalog:
            content = fetcher(item.url)
            target = layout.irs_dir / f"{item.slug}.pdf"
            target.write_bytes(content)
            record = {
                "slug": item.slug,
                "title": item.title,
                "sourceClass": item.source_class,
                "mediaType": item.media_type,
                "url": item.url,
                "localPath": str(target),
                "sha256": _sha256_bytes(content),
                "fetchedAt": timestamp,
                "authorityRank": int(authority_rank_for(item.source_class)),
            }
            records.append(record)

        indexes = self.index_manifest(records)
        # Primary-law classes are listed with their ranks only; no document
        # of these classes is downloaded by this method.
        primary_law_hooks = [
            {"sourceClass": source_class, "authorityRank": int(rank)}
            for source_class, rank in (
                ("internal_revenue_code", AuthorityRank.INTERNAL_REVENUE_CODE),
                ("treasury_regulation", AuthorityRank.TREASURY_REGULATION),
            )
        ]
        manifest = {
            "taxYear": tax_year,
            "fetchedAt": timestamp,
            "cacheRoot": str(self.cache_root),
            "sourceCount": len(records),
            "sources": records,
            "indexes": indexes,
            "primaryLawHooks": primary_law_hooks,
        }

        serialized = json.dumps(manifest, indent=2)
        layout.manifest_path.write_text(serialized)
        return manifest

    @staticmethod
    def index_manifest(sources: list[dict]) -> dict[str, dict[str, list[str]]]:
        """Build the lookup indexes stored in the manifest.

        Returns a dict with two entries: ``bySourceClass`` maps each source
        class to the slugs of its sources, and ``bySlug`` maps each slug to
        the local paths recorded for it.  Both preserve input order.
        """
        slugs_by_class: dict[str, list[str]] = {}
        paths_by_slug: dict[str, list[str]] = {}
        for entry in sources:
            source_class = entry["sourceClass"]
            slug = entry["slug"]
            slugs_by_class.setdefault(source_class, []).append(slug)
            paths_by_slug.setdefault(slug, []).append(entry["localPath"])
        return {"bySourceClass": slugs_by_class, "bySlug": paths_by_slug}
|