feat: add us-cpa tax-year source corpus
This commit is contained in:
@@ -4,7 +4,14 @@
|
||||
|
||||
## Current Milestone
|
||||
|
||||
Milestone 1 provides the initial package, CLI surface, skill wrapper, and test harness. Tax logic, IRS corpus download, case workflows, rendering, and review logic are not implemented yet.
|
||||
Milestone 2 now adds the first tax-year corpus layer:
|
||||
|
||||
- deterministic cache layout under `~/.cache/us-cpa` by default
|
||||
- `fetch-year` download flow for the bootstrap IRS corpus
|
||||
- source manifest with URL, hash, authority rank, and local path traceability
|
||||
- authority ranking hooks for IRS materials and future primary-law escalation
|
||||
|
||||
Tax logic, case workflows, rendering, and review logic are still pending.
|
||||
|
||||
## CLI Surface
|
||||
|
||||
@@ -18,6 +25,27 @@ skills/us-cpa/scripts/us-cpa render-forms --tax-year 2025 --case-dir ~/tax-cases
|
||||
skills/us-cpa/scripts/us-cpa export-efile-ready --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe
|
||||
```
|
||||
|
||||
## Tax-Year Cache
|
||||
|
||||
Default cache root:
|
||||
|
||||
```text
|
||||
~/.cache/us-cpa
|
||||
```
|
||||
|
||||
Override for isolated runs:
|
||||
|
||||
```bash
|
||||
US_CPA_CACHE_DIR=/tmp/us-cpa-cache skills/us-cpa/scripts/us-cpa fetch-year --tax-year 2025
|
||||
```
|
||||
|
||||
The current `fetch-year` bootstrap corpus for tax year `2025` has been verified against the live IRS `irs-prior` PDFs for:
|
||||
|
||||
- Form 1040
|
||||
- Schedules 1, 2, 3, A, B, C, D, SE, and 8812
|
||||
- Form 8949
|
||||
- General Form 1040 instructions and selected schedule/form instructions
|
||||
|
||||
## Interaction Model
|
||||
|
||||
- `question`
|
||||
@@ -47,10 +75,25 @@ skills/us-cpa/scripts/us-cpa export-efile-ready --tax-year 2025 --case-dir ~/tax
|
||||
|
||||
- JSON by default
|
||||
- markdown available with `--format markdown`
|
||||
- current milestone responses are scaffold payloads with `status: "not_implemented"`
|
||||
- `question`, `prepare`, `review`, `extract-docs`, `render-forms`, and `export-efile-ready` still emit scaffold payloads with `status: "not_implemented"`
|
||||
- `fetch-year` emits a downloaded manifest location and source count
|
||||
|
||||
## Scope Rules
|
||||
|
||||
- U.S. federal individual returns only in v1
|
||||
- official IRS artifacts are the target output for compiled forms
|
||||
- conflicting facts must stop the workflow for user resolution
|
||||
|
||||
## Authority Ranking
|
||||
|
||||
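Current authority classes are ranked to preserve source hierarchy. They are listed here in ascending numeric rank (IRS forms have the lowest rank value, other primary authority the highest):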
|
||||
|
||||
- IRS forms
|
||||
- IRS instructions
|
||||
- IRS publications
|
||||
- IRS FAQs
|
||||
- Internal Revenue Code
|
||||
- Treasury regulations
|
||||
- other primary authority
|
||||
|
||||
Later research and review flows should consume this ranking rather than inventing their own.
|
||||
|
||||
@@ -48,5 +48,7 @@ skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025-
|
||||
|
||||
- JSON by default
|
||||
- markdown output available with `--format markdown`
|
||||
- `fetch-year` downloads the bootstrap IRS form/instruction corpus into `~/.cache/us-cpa` by default
|
||||
- override the cache root with `US_CPA_CACHE_DIR` when you need an isolated run or fixture generation
|
||||
|
||||
For operator details, limitations, and the planned case structure, see `docs/us-cpa.md`.
|
||||
|
||||
@@ -6,6 +6,7 @@ import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from us_cpa.sources import TaxYearCorpus, bootstrap_irs_catalog
|
||||
|
||||
COMMANDS = (
|
||||
"question",
|
||||
@@ -114,11 +115,15 @@ def main(argv: list[str] | None = None) -> int:
|
||||
return _emit(payload, args.format)
|
||||
|
||||
if args.command == "fetch-year":
|
||||
corpus = TaxYearCorpus()
|
||||
manifest = corpus.download_catalog(args.tax_year, bootstrap_irs_catalog(args.tax_year))
|
||||
payload = {
|
||||
"command": "fetch-year",
|
||||
"format": args.format,
|
||||
"taxYear": args.tax_year,
|
||||
"status": "not_implemented",
|
||||
"status": "downloaded",
|
||||
"sourceCount": manifest["sourceCount"],
|
||||
"manifestPath": corpus.paths_for_year(args.tax_year).manifest_path.as_posix(),
|
||||
}
|
||||
return _emit(payload, args.format)
|
||||
|
||||
|
||||
178
skills/us-cpa/src/us_cpa/sources.py
Normal file
178
skills/us-cpa/src/us_cpa/sources.py
Normal file
@@ -0,0 +1,178 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from enum import IntEnum
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
from urllib.request import urlopen
|
||||
|
||||
|
||||
class AuthorityRank(IntEnum):
    """Numeric rank assigned to each class of tax-authority source.

    Members are plain integers, so ranks can be compared and sorted directly.
    IRS-published materials occupy the 10-40 band and primary-law sources the
    100+ band; the gaps leave room to insert new classes without renumbering.
    """

    IRS_FORM = 10
    IRS_INSTRUCTIONS = 20
    IRS_PUBLICATION = 30
    IRS_FAQ = 40
    INTERNAL_REVENUE_CODE = 100
    TREASURY_REGULATION = 110
    OTHER_PRIMARY_AUTHORITY = 120
|
||||
|
||||
|
||||
# Maps the string source-class identifiers stored on descriptors and in the
# manifest to their AuthorityRank member. Keys are the lower-case member names.
AUTHORITY_RANKS: dict[str, AuthorityRank] = {
    "irs_form": AuthorityRank.IRS_FORM,
    "irs_instructions": AuthorityRank.IRS_INSTRUCTIONS,
    "irs_publication": AuthorityRank.IRS_PUBLICATION,
    "irs_faq": AuthorityRank.IRS_FAQ,
    "internal_revenue_code": AuthorityRank.INTERNAL_REVENUE_CODE,
    "treasury_regulation": AuthorityRank.TREASURY_REGULATION,
    "other_primary_authority": AuthorityRank.OTHER_PRIMARY_AUTHORITY,
}
|
||||
|
||||
|
||||
def authority_rank_for(source_class: str) -> AuthorityRank:
    """Return the ``AuthorityRank`` registered for ``source_class``.

    Args:
        source_class: One of the keys of ``AUTHORITY_RANKS`` (e.g. ``"irs_form"``).

    Raises:
        KeyError: If ``source_class`` is not a known source class. The message
            names the offending value and lists the accepted ones.
    """
    try:
        return AUTHORITY_RANKS[source_class]
    except KeyError:
        known = ", ".join(sorted(AUTHORITY_RANKS))
        # Same exception type as a plain dict lookup, but with a message that
        # tells the caller what went wrong and what is accepted.
        raise KeyError(
            f"unknown source class {source_class!r}; expected one of: {known}"
        ) from None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SourceDescriptor:
    """Immutable description of one downloadable source document."""

    # Short identifier for the document (e.g. "f1040").
    slug: str
    # Human-readable name of the document.
    title: str
    # Source-class identifier, e.g. "irs_form" or "irs_instructions".
    source_class: str
    # MIME type of the document, e.g. "application/pdf".
    media_type: str
    # Address the document is fetched from.
    url: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TaxYearPaths:
    """Immutable bundle of the filesystem locations used for one tax year."""

    # Root directory for the tax year.
    year_dir: Path
    # Directory holding the downloaded IRS documents.
    irs_dir: Path
    # Location of the JSON manifest describing the downloaded sources.
    manifest_path: Path
|
||||
|
||||
|
||||
def default_cache_root() -> Path:
    """Return the resolved cache root directory.

    The ``US_CPA_CACHE_DIR`` environment variable wins when it is set to a
    non-empty value (a leading ``~`` is expanded); otherwise the root is
    ``~/.cache/us-cpa``. The directory is not created here.
    """
    override = os.getenv("US_CPA_CACHE_DIR")
    if override:
        return Path(override).expanduser().resolve()
    return (Path.home() / ".cache" / "us-cpa").resolve()
|
||||
|
||||
|
||||
def build_irs_prior_pdf_url(slug: str, tax_year: int) -> str:
    """Return the ``irs-prior`` PDF URL for ``slug`` in ``tax_year``.

    The result has the form
    ``https://www.irs.gov/pub/irs-prior/<slug>--<tax_year>.pdf``.
    """
    return f"https://www.irs.gov/pub/irs-prior/{slug}--{tax_year}.pdf"
|
||||
|
||||
|
||||
def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]:
    """Build the bootstrap catalog of IRS forms and instructions for a year.

    Every entry is described as a PDF whose URL is produced by
    ``build_irs_prior_pdf_url`` for the given ``tax_year``. Forms come first,
    followed by instructions, in the order listed below.
    """
    # (slug, title, source class) for each document in the bootstrap corpus.
    entries = [
        ("f1040", "Form 1040", "irs_form"),
        ("f1040s1", "Schedule 1 (Form 1040)", "irs_form"),
        ("f1040s2", "Schedule 2 (Form 1040)", "irs_form"),
        ("f1040s3", "Schedule 3 (Form 1040)", "irs_form"),
        ("f1040sa", "Schedule A (Form 1040)", "irs_form"),
        ("f1040sb", "Schedule B (Form 1040)", "irs_form"),
        ("f1040sc", "Schedule C (Form 1040)", "irs_form"),
        ("f1040sd", "Schedule D (Form 1040)", "irs_form"),
        ("f1040se", "Schedule SE (Form 1040)", "irs_form"),
        ("f1040s8", "Schedule 8812 (Form 1040)", "irs_form"),
        ("f8949", "Form 8949", "irs_form"),
        ("i1040gi", "Instructions for Form 1040 and Schedules 1-3", "irs_instructions"),
        ("i1040sca", "Instructions for Schedule A", "irs_instructions"),
        ("i1040sc", "Instructions for Schedule C", "irs_instructions"),
        ("i1040sd", "Instructions for Schedule D", "irs_instructions"),
        ("i1040se", "Instructions for Schedule SE", "irs_instructions"),
        ("i1040s8", "Instructions for Schedule 8812 (Form 1040)", "irs_instructions"),
        ("i8949", "Instructions for Form 8949", "irs_instructions"),
    ]
    return [
        SourceDescriptor(
            slug=slug,
            title=title,
            source_class=source_class,
            media_type="application/pdf",
            url=build_irs_prior_pdf_url(slug, tax_year),
        )
        for slug, title, source_class in entries
    ]
|
||||
|
||||
|
||||
def _sha256_bytes(payload: bytes) -> str:
|
||||
return hashlib.sha256(payload).hexdigest()
|
||||
|
||||
|
||||
def _http_fetch(url: str) -> bytes:
|
||||
with urlopen(url) as response:
|
||||
return response.read()
|
||||
|
||||
|
||||
class TaxYearCorpus:
    """On-disk cache of source documents, organized by tax year."""

    def __init__(self, cache_root: Path | None = None) -> None:
        """Use ``cache_root`` when given, otherwise ``default_cache_root()``."""
        self.cache_root = cache_root or default_cache_root()

    def paths_for_year(self, tax_year: int) -> TaxYearPaths:
        """Return the directory layout for ``tax_year``.

        The layout is ``<cache_root>/tax-years/<tax_year>/`` containing an
        ``irs`` directory and a ``manifest.json`` file. Nothing is created on
        disk by this call.
        """
        year_dir = self.cache_root / "tax-years" / str(tax_year)
        return TaxYearPaths(
            year_dir=year_dir,
            irs_dir=year_dir / "irs",
            manifest_path=year_dir / "manifest.json",
        )

    def download_catalog(
        self,
        tax_year: int,
        catalog: list[SourceDescriptor],
        *,
        fetcher: Callable[[str], bytes] = _http_fetch,
    ) -> dict:
        """Download every catalog entry and write the year's manifest.

        Each document is fetched with ``fetcher`` and stored as
        ``<irs_dir>/<slug>.pdf``. A manifest recording URL, local path,
        SHA-256, authority rank and fetch timestamp for every source is then
        written to ``manifest.json`` and returned.

        Args:
            tax_year: Tax year whose cache directory is populated.
            catalog: Descriptors of the documents to download.
            fetcher: Callable mapping a URL to its content bytes; injectable
                so tests can avoid the network.

        Returns:
            The manifest dictionary that was written to disk.

        Raises:
            KeyError: If a descriptor carries an unknown source class. This is
                detected before anything is downloaded or written.
        """
        paths = self.paths_for_year(tax_year)
        descriptors = list(catalog)

        # Resolve ranks up front so an unknown source class fails fast instead
        # of leaving a partially populated cache without a manifest.
        ranks = [
            int(authority_rank_for(descriptor.source_class))
            for descriptor in descriptors
        ]

        paths.irs_dir.mkdir(parents=True, exist_ok=True)

        # One timestamp, taken before the first download, is shared by the
        # manifest and all of its source entries.
        fetched_at = datetime.now(timezone.utc).isoformat()
        sources: list[dict] = []
        for descriptor, rank in zip(descriptors, ranks):
            payload = fetcher(descriptor.url)
            destination = paths.irs_dir / f"{descriptor.slug}.pdf"
            destination.write_bytes(payload)
            sources.append(
                {
                    "slug": descriptor.slug,
                    "title": descriptor.title,
                    "sourceClass": descriptor.source_class,
                    "mediaType": descriptor.media_type,
                    "url": descriptor.url,
                    "localPath": str(destination),
                    "sha256": _sha256_bytes(payload),
                    "fetchedAt": fetched_at,
                    "authorityRank": rank,
                }
            )

        manifest = {
            "taxYear": tax_year,
            "fetchedAt": fetched_at,
            "cacheRoot": str(self.cache_root),
            "sourceCount": len(sources),
            "sources": sources,
            "indexes": self.index_manifest(sources),
            # Placeholders for primary-law source classes that this download
            # flow does not fetch.
            "primaryLawHooks": [
                {
                    "sourceClass": "internal_revenue_code",
                    "authorityRank": int(AuthorityRank.INTERNAL_REVENUE_CODE),
                },
                {
                    "sourceClass": "treasury_regulation",
                    "authorityRank": int(AuthorityRank.TREASURY_REGULATION),
                },
            ],
        }
        # Explicit encoding keeps the manifest independent of the locale.
        paths.manifest_path.write_text(
            json.dumps(manifest, indent=2), encoding="utf-8"
        )
        return manifest

    @staticmethod
    def index_manifest(sources: list[dict]) -> dict[str, dict[str, list[str]]]:
        """Build lookup indexes over manifest source entries.

        Returns a dict with two indexes: ``bySourceClass`` maps each source
        class to the slugs in that class, and ``bySlug`` maps each slug to its
        local paths. Entry order follows the order of ``sources``.
        """
        by_class: dict[str, list[str]] = {}
        by_slug: dict[str, list[str]] = {}
        for source in sources:
            by_class.setdefault(source["sourceClass"], []).append(source["slug"])
            by_slug.setdefault(source["slug"], []).append(source["localPath"])
        return {"bySourceClass": by_class, "bySlug": by_slug}
|
||||
97
skills/us-cpa/tests/test_sources.py
Normal file
97
skills/us-cpa/tests/test_sources.py
Normal file
@@ -0,0 +1,97 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from us_cpa.sources import (
|
||||
AuthorityRank,
|
||||
SourceDescriptor,
|
||||
TaxYearCorpus,
|
||||
authority_rank_for,
|
||||
bootstrap_irs_catalog,
|
||||
build_irs_prior_pdf_url,
|
||||
)
|
||||
|
||||
|
||||
class SourceCatalogTests(unittest.TestCase):
    """Checks for URL construction, authority ranking and the bootstrap catalog."""

    def test_build_irs_prior_pdf_url_uses_expected_pattern(self) -> None:
        self.assertEqual(
            build_irs_prior_pdf_url("f1040", 2025),
            "https://www.irs.gov/pub/irs-prior/f1040--2025.pdf",
        )
        self.assertEqual(
            build_irs_prior_pdf_url("i1040gi", 2025),
            "https://www.irs.gov/pub/irs-prior/i1040gi--2025.pdf",
        )

    def test_authority_ranking_orders_irs_before_primary_law(self) -> None:
        self.assertEqual(authority_rank_for("irs_form"), AuthorityRank.IRS_FORM)
        self.assertEqual(
            authority_rank_for("treasury_regulation"),
            AuthorityRank.TREASURY_REGULATION,
        )
        self.assertLess(
            authority_rank_for("irs_form"), authority_rank_for("internal_revenue_code")
        )

    def test_bootstrap_catalog_builds_tax_year_specific_urls(self) -> None:
        catalog = bootstrap_irs_catalog(2025)

        self.assertGreaterEqual(len(catalog), 5)
        self.assertEqual(catalog[0].url, "https://www.irs.gov/pub/irs-prior/f1040--2025.pdf")
        self.assertTrue(any(item.slug == "i1040gi" for item in catalog))
|
||||
|
||||
|
||||
class TaxYearCorpusTests(unittest.TestCase):
    """Checks for the cache layout and the download/manifest flow."""

    def test_tax_year_layout_is_deterministic(self) -> None:
        with tempfile.TemporaryDirectory() as temp_dir:
            corpus = TaxYearCorpus(cache_root=Path(temp_dir))
            paths = corpus.paths_for_year(2025)

            self.assertEqual(paths.year_dir, Path(temp_dir) / "tax-years" / "2025")
            self.assertEqual(paths.irs_dir, paths.year_dir / "irs")
            self.assertEqual(paths.manifest_path, paths.year_dir / "manifest.json")

    def test_download_catalog_writes_files_and_manifest(self) -> None:
        # Everything runs inside the temporary directory context because the
        # assertions inspect files that are deleted when it exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            corpus = TaxYearCorpus(cache_root=Path(temp_dir))
            catalog = [
                SourceDescriptor(
                    slug="f1040",
                    title="Form 1040",
                    source_class="irs_form",
                    media_type="application/pdf",
                    url=build_irs_prior_pdf_url("f1040", 2025),
                ),
                SourceDescriptor(
                    slug="i1040gi",
                    title="Instructions for Form 1040",
                    source_class="irs_instructions",
                    media_type="application/pdf",
                    url=build_irs_prior_pdf_url("i1040gi", 2025),
                ),
            ]

            # Stand-in for the network: content is derived from the URL so each
            # stored file can be matched to its source.
            def fake_fetch(url: str) -> bytes:
                return f"downloaded:{url}".encode()

            manifest = corpus.download_catalog(2025, catalog, fetcher=fake_fetch)

            self.assertEqual(manifest["taxYear"], 2025)
            self.assertEqual(manifest["sourceCount"], 2)
            self.assertTrue(corpus.paths_for_year(2025).manifest_path.exists())

            first = manifest["sources"][0]
            self.assertEqual(first["slug"], "f1040")
            self.assertEqual(first["authorityRank"], int(AuthorityRank.IRS_FORM))
            self.assertTrue(Path(first["localPath"]).exists())
            self.assertEqual(first["url"], catalog[0].url)
            self.assertEqual(
                Path(first["localPath"]).read_bytes(),
                fake_fetch(catalog[0].url),
            )

            saved = json.loads(corpus.paths_for_year(2025).manifest_path.read_text())
            self.assertEqual(saved["sourceCount"], 2)
            self.assertEqual(saved["sources"][1]["slug"], "i1040gi")
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user