feat: add us-cpa tax-year source corpus

This commit is contained in:
Stefano Fiorini
2026-03-15 00:53:18 -05:00
parent 291b729894
commit 0c2e34f2f0
5 changed files with 328 additions and 3 deletions

View File

@@ -4,7 +4,14 @@
## Current Milestone
Milestone 1 provided the initial package, CLI surface, skill wrapper, and test harness; at that stage tax logic, IRS corpus download, case workflows, rendering, and review logic were not implemented yet.
Milestone 2 now adds the first tax-year corpus layer:
- deterministic cache layout under `~/.cache/us-cpa` by default
- `fetch-year` download flow for the bootstrap IRS corpus
- source manifest with URL, hash, authority rank, and local path traceability
- authority ranking hooks for IRS materials and future primary-law escalation
Tax logic, case workflows, rendering, and review logic are still pending.
## CLI Surface
@@ -18,6 +25,27 @@ skills/us-cpa/scripts/us-cpa render-forms --tax-year 2025 --case-dir ~/tax-cases
skills/us-cpa/scripts/us-cpa export-efile-ready --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe
```
## Tax-Year Cache
Default cache root:
```text
~/.cache/us-cpa
```
Override for isolated runs:
```bash
US_CPA_CACHE_DIR=/tmp/us-cpa-cache skills/us-cpa/scripts/us-cpa fetch-year --tax-year 2025
```
Current `fetch-year` bootstrap corpus for tax year `2025` is verified against live IRS `irs-prior` PDFs for:
- Form 1040
- Schedules 1, 2, 3, A, B, C, D, SE, and 8812
- Form 8949
- General Form 1040 instructions and selected schedule/form instructions
## Interaction Model
- `question`
@@ -47,10 +75,25 @@ skills/us-cpa/scripts/us-cpa export-efile-ready --tax-year 2025 --case-dir ~/tax
- JSON by default
- markdown available with `--format markdown`
- apart from `fetch-year`, current milestone responses are scaffold payloads with `status: "not_implemented"`
- `question`, `prepare`, `review`, `extract-docs`, `render-forms`, and `export-efile-ready` still emit scaffold payloads with `status: "not_implemented"`
- `fetch-year` emits a downloaded manifest location and source count
## Scope Rules
- U.S. federal individual returns only in v1
- official IRS artifacts are the target output for compiled forms
- conflicting facts must stop the workflow for user resolution
## Authority Ranking
Current authority classes are ranked to preserve source hierarchy, listed here in ascending `authorityRank` value (IRS forms are 10; other primary authority is 120):
- IRS forms
- IRS instructions
- IRS publications
- IRS FAQs
- Internal Revenue Code
- Treasury regulations
- other primary authority
Later research and review flows should consume this ranking rather than inventing their own.

View File

@@ -48,5 +48,7 @@ skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025-
- JSON by default
- markdown output available with `--format markdown`
- `fetch-year` downloads the bootstrap IRS form/instruction corpus into `~/.cache/us-cpa` by default
- override the cache root with `US_CPA_CACHE_DIR` when you need an isolated run or fixture generation
For operator details, limitations, and the planned case structure, see `docs/us-cpa.md`.

View File

@@ -6,6 +6,7 @@ import sys
from pathlib import Path
from typing import Any
from us_cpa.sources import TaxYearCorpus, bootstrap_irs_catalog
COMMANDS = (
"question",
@@ -114,11 +115,15 @@ def main(argv: list[str] | None = None) -> int:
return _emit(payload, args.format)
if args.command == "fetch-year":
corpus = TaxYearCorpus()
manifest = corpus.download_catalog(args.tax_year, bootstrap_irs_catalog(args.tax_year))
payload = {
"command": "fetch-year",
"format": args.format,
"taxYear": args.tax_year,
"status": "not_implemented",
"status": "downloaded",
"sourceCount": manifest["sourceCount"],
"manifestPath": corpus.paths_for_year(args.tax_year).manifest_path.as_posix(),
}
return _emit(payload, args.format)

View File

@@ -0,0 +1,178 @@
from __future__ import annotations
import hashlib
import json
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import IntEnum
from pathlib import Path
from typing import Callable
from urllib.request import urlopen
class AuthorityRank(IntEnum):
    """Numeric rank assigned to each class of tax source.

    Members are integers, so ranks can be compared and serialised directly.
    IRS-published material occupies 10-40; primary-law classes start at 100.
    """

    # IRS-published material.
    IRS_FORM = 10
    IRS_INSTRUCTIONS = 20
    IRS_PUBLICATION = 30
    IRS_FAQ = 40

    # Primary-law classes.
    INTERNAL_REVENUE_CODE = 100
    TREASURY_REGULATION = 110
    OTHER_PRIMARY_AUTHORITY = 120
# Lookup from the ``source_class`` string carried by source descriptors and
# manifest entries to its AuthorityRank member.
AUTHORITY_RANKS: dict[str, AuthorityRank] = {
    # IRS-published material
    "irs_form": AuthorityRank.IRS_FORM,
    "irs_instructions": AuthorityRank.IRS_INSTRUCTIONS,
    "irs_publication": AuthorityRank.IRS_PUBLICATION,
    "irs_faq": AuthorityRank.IRS_FAQ,
    # Primary-law classes
    "internal_revenue_code": AuthorityRank.INTERNAL_REVENUE_CODE,
    "treasury_regulation": AuthorityRank.TREASURY_REGULATION,
    "other_primary_authority": AuthorityRank.OTHER_PRIMARY_AUTHORITY,
}
def authority_rank_for(source_class: str) -> AuthorityRank:
    """Return the authority rank registered for ``source_class``.

    Args:
        source_class: A key of ``AUTHORITY_RANKS`` such as ``"irs_form"``.

    Returns:
        The matching ``AuthorityRank`` member.

    Raises:
        KeyError: If ``source_class`` is not a key of ``AUTHORITY_RANKS``.
    """
    rank = AUTHORITY_RANKS[source_class]
    return rank
@dataclass(frozen=True)
class SourceDescriptor:
    """Immutable description of one downloadable source document."""

    # Short identifier for the document (e.g. "f1040").
    slug: str
    # Human-readable document name.
    title: str
    # Source-class key, e.g. "irs_form" or "irs_instructions".
    source_class: str
    # MIME type of the document, e.g. "application/pdf".
    media_type: str
    # Location the document is fetched from.
    url: str
@dataclass(frozen=True)
class TaxYearPaths:
    """Immutable bundle of filesystem locations belonging to one tax year."""

    # Directory holding everything cached for the tax year.
    year_dir: Path
    # Directory for downloaded IRS documents.
    irs_dir: Path
    # Location of the JSON manifest describing the downloaded sources.
    manifest_path: Path
def default_cache_root() -> Path:
    """Return the resolved cache root directory.

    A non-empty ``US_CPA_CACHE_DIR`` environment variable takes precedence
    (with ``~`` expanded); otherwise the root is ``~/.cache/us-cpa``.

    Returns:
        The selected directory as a resolved ``Path``.
    """
    configured = os.environ.get("US_CPA_CACHE_DIR")
    # An empty value is treated the same as an unset variable.
    base = (
        Path(configured).expanduser()
        if configured
        else Path.home() / ".cache" / "us-cpa"
    )
    return base.resolve()
def build_irs_prior_pdf_url(slug: str, tax_year: int) -> str:
    """Return the ``irs-prior`` PDF URL for a document slug and tax year.

    Args:
        slug: Document identifier used in the file name (e.g. ``"f1040"``).
        tax_year: Year placed after the ``--`` separator in the file name.

    Returns:
        A URL of the form
        ``https://www.irs.gov/pub/irs-prior/<slug>--<tax_year>.pdf``.
    """
    filename = f"{slug}--{tax_year}.pdf"
    return f"https://www.irs.gov/pub/irs-prior/{filename}"
def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]:
    """Build the bootstrap catalog of IRS forms and instructions for a tax year.

    Every entry is described as a PDF whose URL comes from
    ``build_irs_prior_pdf_url`` applied to the entry's slug and ``tax_year``.

    Args:
        tax_year: Tax year embedded in each document URL.

    Returns:
        One ``SourceDescriptor`` per bootstrap document, in the order listed
        below (forms first, then instructions).
    """
    # Each row is (slug, title, source_class).
    entries = [
        ("f1040", "Form 1040", "irs_form"),
        ("f1040s1", "Schedule 1 (Form 1040)", "irs_form"),
        ("f1040s2", "Schedule 2 (Form 1040)", "irs_form"),
        ("f1040s3", "Schedule 3 (Form 1040)", "irs_form"),
        ("f1040sa", "Schedule A (Form 1040)", "irs_form"),
        ("f1040sb", "Schedule B (Form 1040)", "irs_form"),
        ("f1040sc", "Schedule C (Form 1040)", "irs_form"),
        ("f1040sd", "Schedule D (Form 1040)", "irs_form"),
        # The IRS publishes Schedule SE as "f1040sse"; "f1040se" is
        # Schedule E (Supplemental Income and Loss), a different form.
        ("f1040sse", "Schedule SE (Form 1040)", "irs_form"),
        ("f1040s8", "Schedule 8812 (Form 1040)", "irs_form"),
        ("f8949", "Form 8949", "irs_form"),
        ("i1040gi", "Instructions for Form 1040 and Schedules 1-3", "irs_instructions"),
        ("i1040sca", "Instructions for Schedule A", "irs_instructions"),
        ("i1040sc", "Instructions for Schedule C", "irs_instructions"),
        ("i1040sd", "Instructions for Schedule D", "irs_instructions"),
        # Likewise "i1040sse" is Schedule SE; "i1040se" is Schedule E.
        ("i1040sse", "Instructions for Schedule SE", "irs_instructions"),
        ("i1040s8", "Instructions for Schedule 8812 (Form 1040)", "irs_instructions"),
        ("i8949", "Instructions for Form 8949", "irs_instructions"),
    ]
    return [
        SourceDescriptor(
            slug=slug,
            title=title,
            source_class=source_class,
            media_type="application/pdf",
            url=build_irs_prior_pdf_url(slug, tax_year),
        )
        for slug, title, source_class in entries
    ]
def _sha256_bytes(payload: bytes) -> str:
    """Return the SHA-256 digest of ``payload`` as a lowercase hex string."""
    digest = hashlib.sha256()
    digest.update(payload)
    return digest.hexdigest()
def _http_fetch(url: str, *, timeout: float = 60.0) -> bytes:
    """Open ``url`` and return the complete response body.

    Args:
        url: Address passed to ``urllib.request.urlopen``.
        timeout: Seconds allowed for blocking connection operations. Without
            it a stalled connection could block the caller indefinitely.

    Returns:
        The raw bytes read from the response.

    Raises:
        Whatever ``urlopen`` or ``read`` raises (for example
        ``urllib.error.URLError``); nothing is caught here.
    """
    with urlopen(url, timeout=timeout) as response:
        return response.read()
class TaxYearCorpus:
    """Cache of source documents laid out per tax year under one root.

    Everything for a year lives in ``<cache_root>/tax-years/<year>/``:
    downloaded documents go into its ``irs`` subdirectory and the
    ``manifest.json`` describing them sits next to that subdirectory.
    """

    def __init__(self, cache_root: Path | None = None) -> None:
        """Bind the corpus to ``cache_root``, or to ``default_cache_root()`` when omitted."""
        if not cache_root:
            cache_root = default_cache_root()
        self.cache_root = cache_root

    def paths_for_year(self, tax_year: int) -> TaxYearPaths:
        """Return the directory and manifest locations for ``tax_year``.

        Only computes paths; nothing is created on disk.
        """
        root = self.cache_root / "tax-years" / str(tax_year)
        return TaxYearPaths(
            year_dir=root,
            irs_dir=root / "irs",
            manifest_path=root / "manifest.json",
        )

    def download_catalog(
        self,
        tax_year: int,
        catalog: list[SourceDescriptor],
        *,
        fetcher: Callable[[str], bytes] = _http_fetch,
    ) -> dict:
        """Fetch every catalog entry, store it, and write the year's manifest.

        Args:
            tax_year: Year whose cache directory receives the files.
            catalog: Descriptors to fetch, processed in order.
            fetcher: Callable mapping a URL to the document bytes; defaults to
                ``_http_fetch`` and can be replaced, e.g. in tests.

        Returns:
            The manifest dictionary that was also written to
            ``manifest.json`` for the year.

        Raises:
            KeyError: If a descriptor's ``source_class`` is not a key of
                ``AUTHORITY_RANKS``. This surfaces after that descriptor's
                file has already been written.
        """
        layout = self.paths_for_year(tax_year)
        layout.irs_dir.mkdir(parents=True, exist_ok=True)
        # One timestamp is shared by the manifest and every source entry.
        stamp = datetime.now(timezone.utc).isoformat()

        records: list[dict] = []
        for item in catalog:
            body = fetcher(item.url)
            # Files are stored as "<slug>.pdf" regardless of media type.
            target = layout.irs_dir / f"{item.slug}.pdf"
            target.write_bytes(body)
            record = {
                "slug": item.slug,
                "title": item.title,
                "sourceClass": item.source_class,
                "mediaType": item.media_type,
                "url": item.url,
                "localPath": str(target),
                "sha256": _sha256_bytes(body),
                "fetchedAt": stamp,
                "authorityRank": int(authority_rank_for(item.source_class)),
            }
            records.append(record)

        # Placeholder entries for primary-law classes that are not downloaded.
        primary_law_hooks = [
            {"sourceClass": name, "authorityRank": int(rank)}
            for name, rank in (
                ("internal_revenue_code", AuthorityRank.INTERNAL_REVENUE_CODE),
                ("treasury_regulation", AuthorityRank.TREASURY_REGULATION),
            )
        ]
        manifest = {
            "taxYear": tax_year,
            "fetchedAt": stamp,
            "cacheRoot": str(self.cache_root),
            "sourceCount": len(records),
            "sources": records,
            "indexes": self.index_manifest(records),
            "primaryLawHooks": primary_law_hooks,
        }
        serialized = json.dumps(manifest, indent=2)
        layout.manifest_path.write_text(serialized)
        return manifest

    @staticmethod
    def index_manifest(sources: list[dict]) -> dict[str, dict[str, list[str]]]:
        """Build lookup indexes over manifest source entries.

        Args:
            sources: Entries carrying ``sourceClass``, ``slug`` and
                ``localPath`` keys.

        Returns:
            A dict with ``bySourceClass`` (source class -> slugs) and
            ``bySlug`` (slug -> local paths), both in input order.
        """
        slugs_by_class: dict[str, list[str]] = {}
        paths_by_slug: dict[str, list[str]] = {}
        for entry in sources:
            source_class = entry["sourceClass"]
            slug = entry["slug"]
            slugs_by_class.setdefault(source_class, []).append(slug)
            paths_by_slug.setdefault(slug, []).append(entry["localPath"])
        return {"bySourceClass": slugs_by_class, "bySlug": paths_by_slug}

View File

@@ -0,0 +1,97 @@
from __future__ import annotations
import json
import tempfile
import unittest
from pathlib import Path
from us_cpa.sources import (
AuthorityRank,
SourceDescriptor,
TaxYearCorpus,
authority_rank_for,
bootstrap_irs_catalog,
build_irs_prior_pdf_url,
)
class SourceCatalogTests(unittest.TestCase):
    """Checks for the URL builder, authority ranking, and bootstrap catalog."""

    def test_build_irs_prior_pdf_url_uses_expected_pattern(self) -> None:
        # A form slug and an instructions slug must follow the same pattern.
        expected_by_slug = {
            "f1040": "https://www.irs.gov/pub/irs-prior/f1040--2025.pdf",
            "i1040gi": "https://www.irs.gov/pub/irs-prior/i1040gi--2025.pdf",
        }
        for slug, expected_url in expected_by_slug.items():
            self.assertEqual(build_irs_prior_pdf_url(slug, 2025), expected_url)

    def test_authority_ranking_orders_irs_before_primary_law(self) -> None:
        form_rank = authority_rank_for("irs_form")
        regulation_rank = authority_rank_for("treasury_regulation")
        code_rank = authority_rank_for("internal_revenue_code")

        self.assertEqual(form_rank, AuthorityRank.IRS_FORM)
        self.assertEqual(regulation_rank, AuthorityRank.TREASURY_REGULATION)
        # IRS forms carry a smaller rank value than the Internal Revenue Code.
        self.assertLess(form_rank, code_rank)

    def test_bootstrap_catalog_builds_tax_year_specific_urls(self) -> None:
        catalog = bootstrap_irs_catalog(2025)
        slugs = {item.slug for item in catalog}

        self.assertGreaterEqual(len(catalog), 5)
        self.assertEqual(
            catalog[0].url,
            "https://www.irs.gov/pub/irs-prior/f1040--2025.pdf",
        )
        self.assertIn("i1040gi", slugs)
class TaxYearCorpusTests(unittest.TestCase):
    """Checks for the cache layout and the download/manifest flow."""

    def test_tax_year_layout_is_deterministic(self) -> None:
        with tempfile.TemporaryDirectory() as temp_dir:
            root = Path(temp_dir)
            layout = TaxYearCorpus(cache_root=root).paths_for_year(2025)

            self.assertEqual(layout.year_dir, root / "tax-years" / "2025")
            self.assertEqual(layout.irs_dir, layout.year_dir / "irs")
            self.assertEqual(layout.manifest_path, layout.year_dir / "manifest.json")

    def test_download_catalog_writes_files_and_manifest(self) -> None:
        with tempfile.TemporaryDirectory() as temp_dir:
            corpus = TaxYearCorpus(cache_root=Path(temp_dir))
            catalog = [
                SourceDescriptor(
                    slug=slug,
                    title=title,
                    source_class=source_class,
                    media_type="application/pdf",
                    url=build_irs_prior_pdf_url(slug, 2025),
                )
                for slug, title, source_class in (
                    ("f1040", "Form 1040", "irs_form"),
                    ("i1040gi", "Instructions for Form 1040", "irs_instructions"),
                )
            ]

            # Stand-in for the network fetcher; no real download happens.
            def stub_fetch(url: str) -> bytes:
                return f"downloaded:{url}".encode()

            manifest = corpus.download_catalog(2025, catalog, fetcher=stub_fetch)
            manifest_path = corpus.paths_for_year(2025).manifest_path

            self.assertEqual(manifest["taxYear"], 2025)
            self.assertEqual(manifest["sourceCount"], 2)
            self.assertTrue(manifest_path.exists())

            first_source = manifest["sources"][0]
            self.assertEqual(first_source["slug"], "f1040")
            self.assertEqual(first_source["authorityRank"], int(AuthorityRank.IRS_FORM))
            self.assertTrue(Path(first_source["localPath"]).exists())

            # The manifest on disk must match what was returned.
            on_disk = json.loads(manifest_path.read_text())
            self.assertEqual(on_disk["sourceCount"], 2)
            self.assertEqual(on_disk["sources"][1]["slug"], "i1040gi")
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()