Files
stef-openclaw-skills/skills/us-cpa/src/us_cpa/sources.py
2026-03-15 04:40:57 -05:00

240 lines
8.9 KiB
Python

from __future__ import annotations
import hashlib
import json
import os
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import IntEnum
from pathlib import Path
from typing import Callable
from urllib.request import urlopen
class AuthorityRank(IntEnum):
    """Numeric rank assigned to each class of tax authority.

    IRS-issued material (forms, instructions, publications, FAQs) occupies
    the 10-40 band, while primary law (the Internal Revenue Code, Treasury
    regulations and other primary authority) occupies the 100-120 band.
    Being an ``IntEnum``, members compare and sort as plain integers.

    NOTE(review): whether a larger value means stronger or weaker authority
    is decided by the callers that sort on it; this file does not say.
    """

    # IRS-published guidance.
    IRS_FORM = 10
    IRS_INSTRUCTIONS = 20
    IRS_PUBLICATION = 30
    IRS_FAQ = 40

    # Primary law.
    INTERNAL_REVENUE_CODE = 100
    TREASURY_REGULATION = 110
    OTHER_PRIMARY_AUTHORITY = 120
# Lookup table from the snake_case source-class strings used in descriptors
# and manifests to their rank. Every key is the lower-cased name of the
# matching AuthorityRank member, so the table is derived from the enum in
# definition order; renaming a member therefore also renames its key.
AUTHORITY_RANKS: dict[str, AuthorityRank] = {
    rank.name.lower(): rank for rank in AuthorityRank
}
def authority_rank_for(source_class: str) -> AuthorityRank:
    """Look up the rank registered for ``source_class``.

    Args:
        source_class: A key of ``AUTHORITY_RANKS``, e.g. ``"irs_form"``.

    Returns:
        The ``AuthorityRank`` member stored under that key.

    Raises:
        KeyError: If ``source_class`` is not a key of ``AUTHORITY_RANKS``.
    """
    rank = AUTHORITY_RANKS[source_class]
    return rank
@dataclass(frozen=True)
class SourceDescriptor:
    """Immutable description of one downloadable source document."""

    # Short identifier; the download step also uses it as the cached file's
    # base name.
    slug: str
    # Human-readable document title.
    title: str
    # Source-class string, looked up in AUTHORITY_RANKS when a manifest entry
    # is built (e.g. "irs_form").
    source_class: str
    # Media type string recorded in the manifest (e.g. "application/pdf").
    media_type: str
    # Location handed to the fetcher to retrieve the document.
    url: str
@dataclass(frozen=True)
class TaxYearPaths:
    """Immutable bundle of the cache locations belonging to one tax year."""

    # Directory holding everything cached for the year.
    year_dir: Path
    # Directory that receives the downloaded IRS documents.
    irs_dir: Path
    # JSON manifest file describing the downloaded documents.
    manifest_path: Path
def default_cache_root() -> Path:
    """Return the absolute directory used as the cache root by default.

    The ``US_CPA_CACHE_DIR`` environment variable wins when it is set to a
    non-empty value (a leading ``~`` is expanded); otherwise the root is
    ``~/.cache/us-cpa``. Either way the path is resolved before returning.
    """
    configured = os.environ.get("US_CPA_CACHE_DIR")
    if configured:
        root = Path(configured).expanduser()
    else:
        root = Path.home() / ".cache" / "us-cpa"
    return root.resolve()
def build_irs_prior_pdf_url(slug: str, tax_year: int) -> str:
    """Return the irs.gov "irs-prior" PDF URL for ``slug`` and ``tax_year``.

    The file name is ``<slug>--<tax_year>.pdf`` (note the double hyphen).
    Neither argument is validated or escaped.
    """
    url = f"https://www.irs.gov/pub/irs-prior/{slug}--{tax_year}.pdf"
    return url
# Citation patterns, compiled once. Both are applied to the lower-cased
# question and both start with ``\b`` so that a cue word embedded in a longer
# word ("subsection 2", "intersection 5") is not mistaken for a citation.
_IRC_SECTION_RE = re.compile(r"\b(?:section|sec\.)\s+(\d+[a-z0-9-]*)")
_TREASURY_REG_RE = re.compile(
    r"\b(?:treas(?:ury)?\.?\s+reg(?:ulation)?\.?\s*)([\d.]+-\d+)"
)


def build_primary_law_authorities(question: str) -> list[dict[str, str | int]]:
    """Extract primary-law citations mentioned in a free-text question.

    Two citation shapes are recognised, case-insensitively:

    * ``section <n>`` / ``sec. <n>`` -> an Internal Revenue Code entry
      linking to uscode.house.gov;
    * ``Treas. Reg. <x.y-z>`` (also ``Treasury Regulation ...``) -> a
      Treasury regulation entry linking to ecfr.gov.

    Args:
        question: The user's question text.

    Returns:
        One dict per citation found, with the keys ``slug``, ``title``,
        ``sourceClass``, ``url`` and ``authorityRank``. All Code entries come
        first, followed by all regulation entries, each group in order of
        appearance. Repeated citations yield repeated entries. An empty list
        is returned when nothing matches.
    """
    # Matching runs on the lower-cased text, so captured identifiers are
    # lower case as well (e.g. "Section 199A" is reported as "199a").
    normalized = question.lower()
    authorities: list[dict[str, str | int]] = []

    for match in _IRC_SECTION_RE.finditer(normalized):
        section = match.group(1)
        authorities.append(
            {
                "slug": f"irc-{section}",
                "title": f"Internal Revenue Code section {section}",
                "sourceClass": "internal_revenue_code",
                "url": f"https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title26-section{section}&num=0&edition=prelim",
                "authorityRank": int(AuthorityRank.INTERNAL_REVENUE_CODE),
            }
        )

    for match in _TREASURY_REG_RE.finditer(normalized):
        section = match.group(1)
        authorities.append(
            {
                "slug": f"reg-{section}",
                "title": f"Treasury Regulation {section}",
                "sourceClass": "treasury_regulation",
                "url": f"https://www.ecfr.gov/current/title-26/section-{section}",
                "authorityRank": int(AuthorityRank.TREASURY_REGULATION),
            }
        )

    return authorities
def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]:
    """Build the built-in catalog of IRS documents for ``tax_year``.

    Every descriptor is a PDF (``application/pdf``) whose URL comes from
    ``build_irs_prior_pdf_url``. The result lists forms first, then
    instructions, then publications, each group in the order written below.

    Args:
        tax_year: Year embedded in each document URL.

    Returns:
        A new list of ``SourceDescriptor`` objects.
    """
    # (slug, title) pairs grouped by source class; dict insertion order
    # fixes the order of the returned catalog.
    documents_by_class: dict[str, tuple[tuple[str, str], ...]] = {
        "irs_form": (
            ("f1040", "Form 1040"),
            ("f1040s1", "Schedule 1 (Form 1040)"),
            ("f1040s2", "Schedule 2 (Form 1040)"),
            ("f1040s3", "Schedule 3 (Form 1040)"),
            ("f1040sa", "Schedule A (Form 1040)"),
            ("f1040sb", "Schedule B (Form 1040)"),
            ("f1040sc", "Schedule C (Form 1040)"),
            ("f1040sd", "Schedule D (Form 1040)"),
            ("f1040se", "Schedule E (Form 1040)"),
            ("f1040sse", "Schedule SE (Form 1040)"),
            ("f1040s8", "Schedule 8812 (Form 1040)"),
            ("f8949", "Form 8949"),
            ("f4562", "Form 4562"),
            ("f4797", "Form 4797"),
            ("f6251", "Form 6251"),
            ("f8606", "Form 8606"),
            ("f8863", "Form 8863"),
            ("f8889", "Form 8889"),
            ("f8959", "Form 8959"),
            ("f8960", "Form 8960"),
            ("f8995", "Form 8995"),
            ("f8995a", "Form 8995-A"),
            ("f5329", "Form 5329"),
            ("f5695", "Form 5695"),
            ("f1116", "Form 1116"),
        ),
        "irs_instructions": (
            ("i1040gi", "Instructions for Form 1040 and Schedules 1-3"),
            ("i1040sca", "Instructions for Schedule A"),
            ("i1040sc", "Instructions for Schedule C"),
            ("i1040sd", "Instructions for Schedule D"),
            ("i1040se", "Instructions for Schedule E (Form 1040)"),
            ("i1040sse", "Instructions for Schedule SE"),
            ("i1040s8", "Instructions for Schedule 8812 (Form 1040)"),
            ("i8949", "Instructions for Form 8949"),
            ("i4562", "Instructions for Form 4562"),
            ("i4797", "Instructions for Form 4797"),
            ("i6251", "Instructions for Form 6251"),
            ("i8606", "Instructions for Form 8606"),
            ("i8863", "Instructions for Form 8863"),
            ("i8889", "Instructions for Form 8889"),
            ("i8959", "Instructions for Form 8959"),
            ("i8960", "Instructions for Form 8960"),
            ("i8995", "Instructions for Form 8995"),
            ("i8995a", "Instructions for Form 8995-A"),
            ("i5329", "Instructions for Form 5329"),
            ("i5695", "Instructions for Form 5695"),
            ("i1116", "Instructions for Form 1116"),
        ),
        "irs_publication": (
            (
                "p501",
                "Publication 501, Dependents, Standard Deduction, and Filing Information",
            ),
        ),
    }

    catalog: list[SourceDescriptor] = []
    for source_class, documents in documents_by_class.items():
        for slug, title in documents:
            catalog.append(
                SourceDescriptor(
                    slug=slug,
                    title=title,
                    source_class=source_class,
                    media_type="application/pdf",
                    url=build_irs_prior_pdf_url(slug, tax_year),
                )
            )
    return catalog
def _sha256_bytes(payload: bytes) -> str:
    """Return the SHA-256 digest of ``payload`` as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()
def _http_fetch(url: str, timeout: float = 60.0) -> bytes:
    """Download ``url`` and return the complete response body.

    Args:
        url: Location to fetch.
        timeout: Seconds to wait on any single blocking network operation
            before giving up. Without a timeout ``urlopen`` can block
            indefinitely on a stalled connection.

    Returns:
        The raw response bytes.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for HTTP
            error statuses).
        OSError: On socket-level failures, including a timeout.
    """
    # NOTE(review): urlopen also honours non-HTTP schemes such as file://;
    # URLs are assumed to come from a trusted catalog - confirm with callers.
    with urlopen(url, timeout=timeout) as response:
        return response.read()
class TaxYearCorpus:
    """On-disk cache of source documents, organised per tax year.

    Layout beneath ``cache_root``::

        tax-years/<tax_year>/manifest.json
        tax-years/<tax_year>/irs/<slug>.pdf
    """

    def __init__(self, cache_root: Path | None = None) -> None:
        """Bind the corpus to ``cache_root``.

        A falsy ``cache_root`` (the default ``None``) selects
        ``default_cache_root()``.
        """
        self.cache_root = cache_root if cache_root else default_cache_root()

    def paths_for_year(self, tax_year: int) -> TaxYearPaths:
        """Return the cache locations for ``tax_year``.

        Only path arithmetic is performed; nothing is created on disk.
        """
        base = self.cache_root / "tax-years" / str(tax_year)
        irs = base / "irs"
        manifest_file = base / "manifest.json"
        return TaxYearPaths(year_dir=base, irs_dir=irs, manifest_path=manifest_file)

    def download_catalog(
        self,
        tax_year: int,
        catalog: list[SourceDescriptor],
        *,
        fetcher: Callable[[str], bytes] = _http_fetch,
    ) -> dict:
        """Fetch every catalog entry into the cache and write the manifest.

        Args:
            tax_year: Year whose cache directory receives the files.
            catalog: Descriptors to download, processed in order.
            fetcher: Callable that turns a URL into the document bytes.

        Returns:
            The manifest dict that was also written, as indented JSON, to the
            year's ``manifest.json``.

        Any exception raised by ``fetcher`` or by the file writes propagates;
        documents stored before the failure stay on disk and no manifest is
        written for that run.
        """
        locations = self.paths_for_year(tax_year)
        locations.irs_dir.mkdir(parents=True, exist_ok=True)

        # A single timestamp, taken before the first download, is shared by
        # the manifest and by every source entry.
        stamp = datetime.now(timezone.utc).isoformat()

        records: list[dict] = []
        for item in catalog:
            content = fetcher(item.url)
            # Files are always stored with a ".pdf" suffix, whatever the
            # descriptor's media type says.
            target = locations.irs_dir / f"{item.slug}.pdf"
            target.write_bytes(content)
            record = {
                "slug": item.slug,
                "title": item.title,
                "sourceClass": item.source_class,
                "mediaType": item.media_type,
                "url": item.url,
                "localPath": str(target),
                "sha256": _sha256_bytes(content),
                "fetchedAt": stamp,
                "authorityRank": int(authority_rank_for(item.source_class)),
            }
            records.append(record)

        # Source classes advertised in the manifest for primary law; no
        # documents are downloaded for them here.
        hooks = [
            {"sourceClass": name, "authorityRank": int(rank)}
            for name, rank in (
                ("internal_revenue_code", AuthorityRank.INTERNAL_REVENUE_CODE),
                ("treasury_regulation", AuthorityRank.TREASURY_REGULATION),
            )
        ]

        manifest = {
            "taxYear": tax_year,
            "fetchedAt": stamp,
            "cacheRoot": str(self.cache_root),
            "sourceCount": len(records),
            "sources": records,
            "indexes": self.index_manifest(records),
            "primaryLawHooks": hooks,
        }
        serialized = json.dumps(manifest, indent=2)
        locations.manifest_path.write_text(serialized)
        return manifest

    @staticmethod
    def index_manifest(sources: list[dict]) -> dict[str, dict[str, list[str]]]:
        """Build lookup indexes over manifest source entries.

        Args:
            sources: Entries carrying at least the keys ``sourceClass``,
                ``slug`` and ``localPath``.

        Returns:
            ``{"bySourceClass": {class: [slug, ...]},
            "bySlug": {slug: [localPath, ...]}}`` with values in input order.
        """
        slugs_by_class: dict[str, list[str]] = {}
        paths_by_slug: dict[str, list[str]] = {}
        for entry in sources:
            source_class = entry["sourceClass"]
            slug = entry["slug"]
            slugs_by_class.setdefault(source_class, []).append(slug)
            paths_by_slug.setdefault(slug, []).append(entry["localPath"])
        return {"bySourceClass": slugs_by_class, "bySlug": paths_by_slug}