From 0c2e34f2f06b107398d94e0e41c6932e353c7472 Mon Sep 17 00:00:00 2001 From: Stefano Fiorini Date: Sun, 15 Mar 2026 00:53:18 -0500 Subject: [PATCH] feat: add us-cpa tax-year source corpus --- docs/us-cpa.md | 47 +++++++- skills/us-cpa/SKILL.md | 2 + skills/us-cpa/src/us_cpa/cli.py | 7 +- skills/us-cpa/src/us_cpa/sources.py | 178 ++++++++++++++++++++++++++++ skills/us-cpa/tests/test_sources.py | 97 +++++++++++++++ 5 files changed, 328 insertions(+), 3 deletions(-) create mode 100644 skills/us-cpa/src/us_cpa/sources.py create mode 100644 skills/us-cpa/tests/test_sources.py diff --git a/docs/us-cpa.md b/docs/us-cpa.md index c8b4111..e76feac 100644 --- a/docs/us-cpa.md +++ b/docs/us-cpa.md @@ -4,7 +4,14 @@ ## Current Milestone -Milestone 1 provides the initial package, CLI surface, skill wrapper, and test harness. Tax logic, IRS corpus download, case workflows, rendering, and review logic are not implemented yet. +Milestone 2 now adds the first tax-year corpus layer: + +- deterministic cache layout under `~/.cache/us-cpa` by default +- `fetch-year` download flow for the bootstrap IRS corpus +- source manifest with URL, hash, authority rank, and local path traceability +- authority ranking hooks for IRS materials and future primary-law escalation + +Tax logic, case workflows, rendering, and review logic are still pending. ## CLI Surface @@ -18,6 +25,27 @@ skills/us-cpa/scripts/us-cpa render-forms --tax-year 2025 --case-dir ~/tax-cases skills/us-cpa/scripts/us-cpa export-efile-ready --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe ``` +## Tax-Year Cache + +Default cache root: + +```text +~/.cache/us-cpa +``` + +Override for isolated runs: + +```bash +US_CPA_CACHE_DIR=/tmp/us-cpa-cache skills/us-cpa/scripts/us-cpa fetch-year --tax-year 2025 +``` + +Current `fetch-year` bootstrap corpus for tax year `2025` is verified against live IRS `irs-prior` PDFs for: + +- Form 1040 +- Schedules 1, 2, 3, A, B, C, D, SE, and 8812 +- Form 8949 +- General Form 1040 instructions and selected schedule/form instructions + ## Interaction Model - `question` @@ -47,10 +75,25 @@ skills/us-cpa/scripts/us-cpa export-efile-ready --tax-year 2025 --case-dir ~/tax - JSON by default - markdown available with `--format markdown` -- current milestone responses are scaffold payloads with `status: "not_implemented"` +- `question`, `prepare`, `review`, `extract-docs`, `render-forms`, and `export-efile-ready` still emit scaffold payloads with `status: "not_implemented"` +- `fetch-year` emits a downloaded manifest location and source count ## Scope Rules - U.S. federal individual returns only in v1 - official IRS artifacts are the target output for compiled forms - conflicting facts must stop the workflow for user resolution + +## Authority Ranking + +Current authority classes are ranked to preserve source hierarchy: + +- IRS forms +- IRS instructions +- IRS publications +- IRS FAQs +- Internal Revenue Code +- Treasury regulations +- other primary authority + +Later research and review flows should consume this ranking rather than inventing their own. diff --git a/skills/us-cpa/SKILL.md b/skills/us-cpa/SKILL.md index d3fccc3..c212fe0 100644 --- a/skills/us-cpa/SKILL.md +++ b/skills/us-cpa/SKILL.md @@ -48,5 +48,7 @@ skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025- - JSON by default - markdown output available with `--format markdown` +- `fetch-year` downloads the bootstrap IRS form/instruction corpus into `~/.cache/us-cpa` by default +- override the cache root with `US_CPA_CACHE_DIR` when you need an isolated run or fixture generation For operator details, limitations, and the planned case structure, see `docs/us-cpa.md`. diff --git a/skills/us-cpa/src/us_cpa/cli.py b/skills/us-cpa/src/us_cpa/cli.py index 4ef2046..3fc8e1c 100644 --- a/skills/us-cpa/src/us_cpa/cli.py +++ b/skills/us-cpa/src/us_cpa/cli.py @@ -6,6 +6,7 @@ import sys from pathlib import Path from typing import Any +from us_cpa.sources import TaxYearCorpus, bootstrap_irs_catalog COMMANDS = ( "question", @@ -114,11 +115,15 @@ def main(argv: list[str] | None = None) -> int: return _emit(payload, args.format) if args.command == "fetch-year": + corpus = TaxYearCorpus() + manifest = corpus.download_catalog(args.tax_year, bootstrap_irs_catalog(args.tax_year)) payload = { "command": "fetch-year", "format": args.format, "taxYear": args.tax_year, - "status": "not_implemented", + "status": "downloaded", + "sourceCount": manifest["sourceCount"], + "manifestPath": corpus.paths_for_year(args.tax_year).manifest_path.as_posix(), } return _emit(payload, args.format) diff --git a/skills/us-cpa/src/us_cpa/sources.py b/skills/us-cpa/src/us_cpa/sources.py new file mode 100644 index 0000000..1f4190a --- /dev/null +++ b/skills/us-cpa/src/us_cpa/sources.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +import hashlib +import json +import os +from dataclasses import dataclass +from datetime import datetime, timezone +from enum import IntEnum +from pathlib import Path +from typing import Callable +from urllib.request import urlopen + + +class AuthorityRank(IntEnum): + IRS_FORM = 10 + IRS_INSTRUCTIONS = 20 + IRS_PUBLICATION = 30 + IRS_FAQ = 40 + INTERNAL_REVENUE_CODE = 100 + TREASURY_REGULATION = 110 + OTHER_PRIMARY_AUTHORITY = 120 + + +AUTHORITY_RANKS: dict[str, AuthorityRank] = { + "irs_form": AuthorityRank.IRS_FORM, + "irs_instructions": AuthorityRank.IRS_INSTRUCTIONS, + "irs_publication": AuthorityRank.IRS_PUBLICATION, + "irs_faq": AuthorityRank.IRS_FAQ, + "internal_revenue_code": AuthorityRank.INTERNAL_REVENUE_CODE, + "treasury_regulation": AuthorityRank.TREASURY_REGULATION, + "other_primary_authority": AuthorityRank.OTHER_PRIMARY_AUTHORITY, +} + + +def authority_rank_for(source_class: str) -> AuthorityRank: + return AUTHORITY_RANKS[source_class] + + +@dataclass(frozen=True) +class SourceDescriptor: + slug: str + title: str + source_class: str + media_type: str + url: str + + +@dataclass(frozen=True) +class TaxYearPaths: + year_dir: Path + irs_dir: Path + manifest_path: Path + + +def default_cache_root() -> Path: + override = os.getenv("US_CPA_CACHE_DIR") + if override: + return Path(override).expanduser().resolve() + return (Path.home() / ".cache" / "us-cpa").resolve() + + +def build_irs_prior_pdf_url(slug: str, tax_year: int) -> str: + return f"https://www.irs.gov/pub/irs-prior/{slug}--{tax_year}.pdf" + + +def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]: + entries = [ + ("f1040", "Form 1040", "irs_form"), + ("f1040s1", "Schedule 1 (Form 1040)", "irs_form"), + ("f1040s2", "Schedule 2 (Form 1040)", "irs_form"), + ("f1040s3", "Schedule 3 (Form 1040)", "irs_form"), + ("f1040sa", "Schedule A (Form 1040)", "irs_form"), + ("f1040sb", "Schedule B (Form 1040)", "irs_form"), + ("f1040sc", "Schedule C (Form 1040)", "irs_form"), + ("f1040sd", "Schedule D (Form 1040)", "irs_form"), + ("f1040se", "Schedule SE (Form 1040)", "irs_form"), + ("f1040s8", "Schedule 8812 (Form 1040)", "irs_form"), + ("f8949", "Form 8949", "irs_form"), + ("i1040gi", "Instructions for Form 1040 and Schedules 1-3", "irs_instructions"), + ("i1040sca", "Instructions for Schedule A", "irs_instructions"), + ("i1040sc", "Instructions for Schedule C", "irs_instructions"), + ("i1040sd", "Instructions for Schedule D", "irs_instructions"), + ("i1040se", "Instructions for Schedule SE", "irs_instructions"), + ("i1040s8", "Instructions for Schedule 8812 (Form 1040)", "irs_instructions"), + ("i8949", "Instructions for Form 8949", "irs_instructions"), + ] + return [ + SourceDescriptor( + slug=slug, + title=title, + source_class=source_class, + media_type="application/pdf", + url=build_irs_prior_pdf_url(slug, tax_year), + ) + for slug, title, source_class in entries + ] + + +def _sha256_bytes(payload: bytes) -> str: + return hashlib.sha256(payload).hexdigest() + + +def _http_fetch(url: str) -> bytes: + with urlopen(url) as response: + return response.read() + + +class TaxYearCorpus: + def __init__(self, cache_root: Path | None = None) -> None: + self.cache_root = cache_root or default_cache_root() + + def paths_for_year(self, tax_year: int) -> TaxYearPaths: + year_dir = self.cache_root / "tax-years" / str(tax_year) + return TaxYearPaths( + year_dir=year_dir, + irs_dir=year_dir / "irs", + manifest_path=year_dir / "manifest.json", + ) + + def download_catalog( + self, + tax_year: int, + catalog: list[SourceDescriptor], + *, + fetcher: Callable[[str], bytes] = _http_fetch, + ) -> dict: + paths = self.paths_for_year(tax_year) + paths.irs_dir.mkdir(parents=True, exist_ok=True) + + fetched_at = datetime.now(timezone.utc).isoformat() + sources: list[dict] = [] + for descriptor in catalog: + payload = fetcher(descriptor.url) + destination = paths.irs_dir / f"{descriptor.slug}.pdf" + destination.write_bytes(payload) + sources.append( + { + "slug": descriptor.slug, + "title": descriptor.title, + "sourceClass": descriptor.source_class, + "mediaType": descriptor.media_type, + "url": descriptor.url, + "localPath": str(destination), + "sha256": _sha256_bytes(payload), + "fetchedAt": fetched_at, + "authorityRank": int(authority_rank_for(descriptor.source_class)), + } + ) + + manifest = { + "taxYear": tax_year, + "fetchedAt": fetched_at, + "cacheRoot": str(self.cache_root), + "sourceCount": len(sources), + "sources": sources, + "indexes": self.index_manifest(sources), + "primaryLawHooks": [ + { + "sourceClass": "internal_revenue_code", + "authorityRank": int(AuthorityRank.INTERNAL_REVENUE_CODE), + }, + { + "sourceClass": "treasury_regulation", + "authorityRank": int(AuthorityRank.TREASURY_REGULATION), + }, + ], + } + paths.manifest_path.write_text(json.dumps(manifest, indent=2)) + return manifest + + @staticmethod + def index_manifest(sources: list[dict]) -> dict[str, dict[str, list[str]]]: + by_class: dict[str, list[str]] = {} + by_slug: dict[str, list[str]] = {} + for source in sources: + by_class.setdefault(source["sourceClass"], []).append(source["slug"]) + by_slug.setdefault(source["slug"], []).append(source["localPath"]) + return {"bySourceClass": by_class, "bySlug": by_slug} diff --git a/skills/us-cpa/tests/test_sources.py b/skills/us-cpa/tests/test_sources.py new file mode 100644 index 0000000..206e4d0 --- /dev/null +++ b/skills/us-cpa/tests/test_sources.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +import json +import tempfile +import unittest +from pathlib import Path + +from us_cpa.sources import ( + AuthorityRank, + SourceDescriptor, + TaxYearCorpus, + authority_rank_for, + bootstrap_irs_catalog, + build_irs_prior_pdf_url, +) + + +class SourceCatalogTests(unittest.TestCase): + def test_build_irs_prior_pdf_url_uses_expected_pattern(self) -> None: + self.assertEqual( + build_irs_prior_pdf_url("f1040", 2025), + "https://www.irs.gov/pub/irs-prior/f1040--2025.pdf", + ) + self.assertEqual( + build_irs_prior_pdf_url("i1040gi", 2025), + "https://www.irs.gov/pub/irs-prior/i1040gi--2025.pdf", + ) + + def test_authority_ranking_orders_irs_before_primary_law(self) -> None: + self.assertEqual(authority_rank_for("irs_form"), AuthorityRank.IRS_FORM) + self.assertEqual( + authority_rank_for("treasury_regulation"), + AuthorityRank.TREASURY_REGULATION, + ) + self.assertLess( + authority_rank_for("irs_form"), authority_rank_for("internal_revenue_code") + ) + + def test_bootstrap_catalog_builds_tax_year_specific_urls(self) -> None: + catalog = bootstrap_irs_catalog(2025) + + self.assertGreaterEqual(len(catalog), 5) + self.assertEqual(catalog[0].url, "https://www.irs.gov/pub/irs-prior/f1040--2025.pdf") + self.assertTrue(any(item.slug == "i1040gi" for item in catalog)) + + +class TaxYearCorpusTests(unittest.TestCase): + def test_tax_year_layout_is_deterministic(self) -> None: + with tempfile.TemporaryDirectory() as temp_dir: + corpus = TaxYearCorpus(cache_root=Path(temp_dir)) + paths = corpus.paths_for_year(2025) + + self.assertEqual(paths.year_dir, Path(temp_dir) / "tax-years" / "2025") + self.assertEqual(paths.irs_dir, paths.year_dir / "irs") + self.assertEqual(paths.manifest_path, paths.year_dir / "manifest.json") + + def test_download_catalog_writes_files_and_manifest(self) -> None: + with tempfile.TemporaryDirectory() as temp_dir: + corpus = TaxYearCorpus(cache_root=Path(temp_dir)) + catalog = [ + SourceDescriptor( + slug="f1040", + title="Form 1040", + source_class="irs_form", + media_type="application/pdf", + url=build_irs_prior_pdf_url("f1040", 2025), + ), + SourceDescriptor( + slug="i1040gi", + title="Instructions for Form 1040", + source_class="irs_instructions", + media_type="application/pdf", + url=build_irs_prior_pdf_url("i1040gi", 2025), + ), + ] + + def fake_fetch(url: str) -> bytes: + return f"downloaded:{url}".encode() + + manifest = corpus.download_catalog(2025, catalog, fetcher=fake_fetch) + + self.assertEqual(manifest["taxYear"], 2025) + self.assertEqual(manifest["sourceCount"], 2) + self.assertTrue(corpus.paths_for_year(2025).manifest_path.exists()) + + first = manifest["sources"][0] + self.assertEqual(first["slug"], "f1040") + self.assertEqual(first["authorityRank"], int(AuthorityRank.IRS_FORM)) + self.assertTrue(Path(first["localPath"]).exists()) + + saved = json.loads(corpus.paths_for_year(2025).manifest_path.read_text()) + self.assertEqual(saved["sourceCount"], 2) + self.assertEqual(saved["sources"][1]["slug"], "i1040gi") + + +if __name__ == "__main__": + unittest.main()