us-cpa: OpenClaw skill wrapper for U.S. federal individual tax work #1
@@ -4,7 +4,14 @@
|
|||||||
|
|
||||||
## Current Milestone
|
## Current Milestone
|
||||||
|
|
||||||
Milestone 1 provides the initial package, CLI surface, skill wrapper, and test harness. Tax logic, IRS corpus download, case workflows, rendering, and review logic are not implemented yet.
|
Milestone 2 now adds the first tax-year corpus layer:
|
||||||
|
|
||||||
|
- deterministic cache layout under `~/.cache/us-cpa` by default
|
||||||
|
- `fetch-year` download flow for the bootstrap IRS corpus
|
||||||
|
- source manifest with URL, hash, authority rank, and local path traceability
|
||||||
|
- authority ranking hooks for IRS materials and future primary-law escalation
|
||||||
|
|
||||||
|
Tax logic, case workflows, rendering, and review logic are still pending.
|
||||||
|
|
||||||
## CLI Surface
|
## CLI Surface
|
||||||
|
|
||||||
@@ -18,6 +25,27 @@ skills/us-cpa/scripts/us-cpa render-forms --tax-year 2025 --case-dir ~/tax-cases
|
|||||||
skills/us-cpa/scripts/us-cpa export-efile-ready --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe
|
skills/us-cpa/scripts/us-cpa export-efile-ready --tax-year 2025 --case-dir ~/tax-cases/2025-jane-doe
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Tax-Year Cache
|
||||||
|
|
||||||
|
Default cache root:
|
||||||
|
|
||||||
|
```text
|
||||||
|
~/.cache/us-cpa
|
||||||
|
```
|
||||||
|
|
||||||
|
Override for isolated runs:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
US_CPA_CACHE_DIR=/tmp/us-cpa-cache skills/us-cpa/scripts/us-cpa fetch-year --tax-year 2025
|
||||||
|
```
|
||||||
|
|
||||||
|
Current `fetch-year` bootstrap corpus for tax year `2025` is verified against live IRS `irs-prior` PDFs for:
|
||||||
|
|
||||||
|
- Form 1040
|
||||||
|
- Schedules 1, 2, 3, A, B, C, D, SE, and 8812
|
||||||
|
- Form 8949
|
||||||
|
- General Form 1040 instructions and selected schedule/form instructions
|
||||||
|
|
||||||
## Interaction Model
|
## Interaction Model
|
||||||
|
|
||||||
- `question`
|
- `question`
|
||||||
@@ -47,10 +75,25 @@ skills/us-cpa/scripts/us-cpa export-efile-ready --tax-year 2025 --case-dir ~/tax
|
|||||||
|
|
||||||
- JSON by default
|
- JSON by default
|
||||||
- markdown available with `--format markdown`
|
- markdown available with `--format markdown`
|
||||||
- current milestone responses are scaffold payloads with `status: "not_implemented"`
|
- `question`, `prepare`, `review`, `extract-docs`, `render-forms`, and `export-efile-ready` still emit scaffold payloads with `status: "not_implemented"`
|
||||||
|
- `fetch-year` emits a downloaded manifest location and source count
|
||||||
|
|
||||||
## Scope Rules
|
## Scope Rules
|
||||||
|
|
||||||
- U.S. federal individual returns only in v1
|
- U.S. federal individual returns only in v1
|
||||||
- official IRS artifacts are the target output for compiled forms
|
- official IRS artifacts are the target output for compiled forms
|
||||||
- conflicting facts must stop the workflow for user resolution
|
- conflicting facts must stop the workflow for user resolution
|
||||||
|
|
||||||
|
## Authority Ranking
|
||||||
|
|
||||||
|
Current authority classes are ranked to preserve source hierarchy, listed here from lowest to highest numeric rank value:
|
||||||
|
|
||||||
|
- IRS forms
|
||||||
|
- IRS instructions
|
||||||
|
- IRS publications
|
||||||
|
- IRS FAQs
|
||||||
|
- Internal Revenue Code
|
||||||
|
- Treasury regulations
|
||||||
|
- other primary authority
|
||||||
|
|
||||||
|
Later research and review flows should consume this ranking rather than inventing their own.
|
||||||
|
|||||||
@@ -48,5 +48,7 @@ skills/us-cpa/scripts/us-cpa review --tax-year 2025 --case-dir ~/tax-cases/2025-
|
|||||||
|
|
||||||
- JSON by default
|
- JSON by default
|
||||||
- markdown output available with `--format markdown`
|
- markdown output available with `--format markdown`
|
||||||
|
- `fetch-year` downloads the bootstrap IRS form/instruction corpus into `~/.cache/us-cpa` by default
|
||||||
|
- override the cache root with `US_CPA_CACHE_DIR` when you need an isolated run or fixture generation
|
||||||
|
|
||||||
For operator details, limitations, and the planned case structure, see `docs/us-cpa.md`.
|
For operator details, limitations, and the planned case structure, see `docs/us-cpa.md`.
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import sys
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from us_cpa.sources import TaxYearCorpus, bootstrap_irs_catalog
|
||||||
|
|
||||||
COMMANDS = (
|
COMMANDS = (
|
||||||
"question",
|
"question",
|
||||||
@@ -114,11 +115,15 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
return _emit(payload, args.format)
|
return _emit(payload, args.format)
|
||||||
|
|
||||||
if args.command == "fetch-year":
|
if args.command == "fetch-year":
|
||||||
|
corpus = TaxYearCorpus()
|
||||||
|
manifest = corpus.download_catalog(args.tax_year, bootstrap_irs_catalog(args.tax_year))
|
||||||
payload = {
|
payload = {
|
||||||
"command": "fetch-year",
|
"command": "fetch-year",
|
||||||
"format": args.format,
|
"format": args.format,
|
||||||
"taxYear": args.tax_year,
|
"taxYear": args.tax_year,
|
||||||
"status": "not_implemented",
|
"status": "downloaded",
|
||||||
|
"sourceCount": manifest["sourceCount"],
|
||||||
|
"manifestPath": corpus.paths_for_year(args.tax_year).manifest_path.as_posix(),
|
||||||
}
|
}
|
||||||
return _emit(payload, args.format)
|
return _emit(payload, args.format)
|
||||||
|
|
||||||
|
|||||||
178
skills/us-cpa/src/us_cpa/sources.py
Normal file
178
skills/us-cpa/src/us_cpa/sources.py
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from enum import IntEnum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable
|
||||||
|
from urllib.request import urlopen
|
||||||
|
|
||||||
|
|
||||||
|
class AuthorityRank(IntEnum):
    """Numeric rank assigned to each class of tax authority.

    Members are plain integers, so ranks can be compared, sorted, and
    serialized with ``int(...)``.  IRS-published material occupies the
    10-40 band; the primary-law classes start at 100.
    """

    # IRS-published material.
    IRS_FORM = 10
    IRS_INSTRUCTIONS = 20
    IRS_PUBLICATION = 30
    IRS_FAQ = 40

    # Primary-law classes.
    INTERNAL_REVENUE_CODE = 100
    TREASURY_REGULATION = 110
    OTHER_PRIMARY_AUTHORITY = 120
|
||||||
|
|
||||||
|
|
||||||
|
# Lookup from a source-class string (as stored in ``sourceClass`` manifest
# fields) to its rank.  Every key is the lower-cased name of the matching
# ``AuthorityRank`` member, so the table is derived from the enum in
# declaration order instead of being spelled out entry by entry.
AUTHORITY_RANKS: dict[str, AuthorityRank] = {
    member.name.lower(): member for member in AuthorityRank
}
|
||||||
|
|
||||||
|
|
||||||
|
def authority_rank_for(source_class: str) -> AuthorityRank:
    """Return the rank registered for *source_class*.

    Raises:
        KeyError: If *source_class* is not a key of ``AUTHORITY_RANKS``.
    """
    rank = AUTHORITY_RANKS[source_class]
    return rank
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class SourceDescriptor:
    """Immutable description of one downloadable source document."""

    # Short identifier; the downloader uses it as the cached file's base name.
    slug: str
    # Human-readable document title.
    title: str
    # Source-class key, resolved to a rank through ``authority_rank_for``.
    source_class: str
    # MIME type recorded in the manifest (e.g. "application/pdf").
    media_type: str
    # Address the document is fetched from.
    url: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class TaxYearPaths:
    """Immutable bundle of the cache locations belonging to one tax year."""

    # Root directory for the tax year.
    year_dir: Path
    # Directory that holds the downloaded IRS documents.
    irs_dir: Path
    # Location of the year's JSON manifest.
    manifest_path: Path
|
||||||
|
|
||||||
|
|
||||||
|
def default_cache_root() -> Path:
    """Return the absolute cache root directory.

    A non-empty ``US_CPA_CACHE_DIR`` environment variable takes precedence
    (with ``~`` expanded); otherwise the root is ``~/.cache/us-cpa``.
    """
    configured = os.environ.get("US_CPA_CACHE_DIR")
    if not configured:
        # Unset or empty: fall back to the per-user default location.
        fallback = Path.home() / ".cache" / "us-cpa"
        return fallback.resolve()
    return Path(configured).expanduser().resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def build_irs_prior_pdf_url(slug: str, tax_year: int) -> str:
    """Return the ``irs-prior`` PDF URL for *slug* in *tax_year*.

    The file name is the slug and the year joined by a double hyphen,
    e.g. ``f1040--2025.pdf``.
    """
    file_name = f"{slug}--{tax_year}.pdf"
    return f"https://www.irs.gov/pub/irs-prior/{file_name}"
|
||||||
|
|
||||||
|
|
||||||
|
def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]:
    """Build the bootstrap list of IRS forms and instructions for *tax_year*.

    Every entry is a PDF whose URL comes from ``build_irs_prior_pdf_url``.
    Forms are listed first, followed by instructions.
    """
    form_rows = (
        ("f1040", "Form 1040"),
        ("f1040s1", "Schedule 1 (Form 1040)"),
        ("f1040s2", "Schedule 2 (Form 1040)"),
        ("f1040s3", "Schedule 3 (Form 1040)"),
        ("f1040sa", "Schedule A (Form 1040)"),
        ("f1040sb", "Schedule B (Form 1040)"),
        ("f1040sc", "Schedule C (Form 1040)"),
        ("f1040sd", "Schedule D (Form 1040)"),
        ("f1040se", "Schedule SE (Form 1040)"),
        ("f1040s8", "Schedule 8812 (Form 1040)"),
        ("f8949", "Form 8949"),
    )
    instruction_rows = (
        ("i1040gi", "Instructions for Form 1040 and Schedules 1-3"),
        ("i1040sca", "Instructions for Schedule A"),
        ("i1040sc", "Instructions for Schedule C"),
        ("i1040sd", "Instructions for Schedule D"),
        ("i1040se", "Instructions for Schedule SE"),
        ("i1040s8", "Instructions for Schedule 8812 (Form 1040)"),
        ("i8949", "Instructions for Form 8949"),
    )

    catalog: list[SourceDescriptor] = []
    grouped = (("irs_form", form_rows), ("irs_instructions", instruction_rows))
    for source_class, rows in grouped:
        for slug, title in rows:
            descriptor = SourceDescriptor(
                slug=slug,
                title=title,
                source_class=source_class,
                media_type="application/pdf",
                url=build_irs_prior_pdf_url(slug, tax_year),
            )
            catalog.append(descriptor)
    return catalog
|
||||||
|
|
||||||
|
|
||||||
|
def _sha256_bytes(payload: bytes) -> str:
    """Return the SHA-256 digest of *payload* as a lowercase hex string."""
    digest = hashlib.sha256()
    digest.update(payload)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _http_fetch(url: str, timeout: float = 60.0) -> bytes:
    """Download *url* and return the complete response body.

    Args:
        url: Address handed to ``urllib.request.urlopen``.
        timeout: Seconds to wait on a blocking network operation before
            giving up.  Without a timeout a stalled server would hang the
            caller indefinitely.

    Returns:
        The raw bytes of the response body.

    Raises:
        OSError: If the request fails or times out (``urllib.error.URLError``
            and ``urllib.error.HTTPError`` are ``OSError`` subclasses).
    """
    with urlopen(url, timeout=timeout) as response:
        return response.read()
|
||||||
|
|
||||||
|
|
||||||
|
class TaxYearCorpus:
    """On-disk cache of source documents, organized per tax year.

    Layout beneath ``cache_root``::

        tax-years/<tax_year>/manifest.json
        tax-years/<tax_year>/irs/<slug>.pdf
    """

    def __init__(self, cache_root: Path | None = None) -> None:
        """Bind the corpus to *cache_root*, or to ``default_cache_root()`` when omitted."""
        if not cache_root:
            cache_root = default_cache_root()
        self.cache_root = cache_root

    def paths_for_year(self, tax_year: int) -> TaxYearPaths:
        """Return the cache locations for *tax_year*.

        Only computes paths; nothing is created on disk.
        """
        base = self.cache_root / "tax-years" / str(tax_year)
        irs_subdir = base / "irs"
        manifest_file = base / "manifest.json"
        return TaxYearPaths(
            year_dir=base,
            irs_dir=irs_subdir,
            manifest_path=manifest_file,
        )

    def download_catalog(
        self,
        tax_year: int,
        catalog: list[SourceDescriptor],
        *,
        fetcher: Callable[[str], bytes] = _http_fetch,
    ) -> dict:
        """Fetch every descriptor in *catalog* and write the year's manifest.

        Each payload is stored as ``<slug>.pdf`` in the year's ``irs``
        directory, and a JSON manifest describing all sources is written
        next to it.

        Args:
            tax_year: Tax year whose cache directory receives the files.
            catalog: Descriptors to download, processed in order.
            fetcher: Callable mapping a URL to its bytes; replaceable so
                callers can avoid real network access.

        Returns:
            The manifest dictionary that was serialized to disk.
        """
        layout = self.paths_for_year(tax_year)
        layout.irs_dir.mkdir(parents=True, exist_ok=True)

        # A single UTC timestamp is shared by the manifest and all entries.
        stamp = datetime.now(timezone.utc).isoformat()

        records: list[dict] = []
        for item in catalog:
            body = fetcher(item.url)
            target = layout.irs_dir / f"{item.slug}.pdf"
            target.write_bytes(body)
            record = {
                "slug": item.slug,
                "title": item.title,
                "sourceClass": item.source_class,
                "mediaType": item.media_type,
                "url": item.url,
                "localPath": str(target),
                "sha256": _sha256_bytes(body),
                "fetchedAt": stamp,
                "authorityRank": int(authority_rank_for(item.source_class)),
            }
            records.append(record)

        indexes = self.index_manifest(records)
        # Placeholder entries for primary-law classes; no documents are
        # downloaded for them here.
        hooks = [
            {"sourceClass": class_name, "authorityRank": int(rank)}
            for class_name, rank in (
                ("internal_revenue_code", AuthorityRank.INTERNAL_REVENUE_CODE),
                ("treasury_regulation", AuthorityRank.TREASURY_REGULATION),
            )
        ]
        manifest = {
            "taxYear": tax_year,
            "fetchedAt": stamp,
            "cacheRoot": str(self.cache_root),
            "sourceCount": len(records),
            "sources": records,
            "indexes": indexes,
            "primaryLawHooks": hooks,
        }
        serialized = json.dumps(manifest, indent=2)
        layout.manifest_path.write_text(serialized)
        return manifest

    @staticmethod
    def index_manifest(sources: list[dict]) -> dict[str, dict[str, list[str]]]:
        """Build lookup indexes over manifest *sources*.

        Returns:
            A dict with ``bySourceClass`` (source class -> list of slugs) and
            ``bySlug`` (slug -> list of local paths), each in input order.
        """
        slugs_by_class: dict[str, list[str]] = {}
        paths_by_slug: dict[str, list[str]] = {}
        for entry in sources:
            class_bucket = slugs_by_class.setdefault(entry["sourceClass"], [])
            class_bucket.append(entry["slug"])
            slug_bucket = paths_by_slug.setdefault(entry["slug"], [])
            slug_bucket.append(entry["localPath"])
        return {"bySourceClass": slugs_by_class, "bySlug": paths_by_slug}
|
||||||
97
skills/us-cpa/tests/test_sources.py
Normal file
97
skills/us-cpa/tests/test_sources.py
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from us_cpa.sources import (
|
||||||
|
AuthorityRank,
|
||||||
|
SourceDescriptor,
|
||||||
|
TaxYearCorpus,
|
||||||
|
authority_rank_for,
|
||||||
|
bootstrap_irs_catalog,
|
||||||
|
build_irs_prior_pdf_url,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class SourceCatalogTests(unittest.TestCase):
    """Checks URL construction, authority ranking, and the bootstrap catalog."""

    def test_build_irs_prior_pdf_url_uses_expected_pattern(self) -> None:
        expected_by_slug = {
            "f1040": "https://www.irs.gov/pub/irs-prior/f1040--2025.pdf",
            "i1040gi": "https://www.irs.gov/pub/irs-prior/i1040gi--2025.pdf",
        }
        for slug, expected_url in expected_by_slug.items():
            self.assertEqual(build_irs_prior_pdf_url(slug, 2025), expected_url)

    def test_authority_ranking_orders_irs_before_primary_law(self) -> None:
        form_rank = authority_rank_for("irs_form")
        regulation_rank = authority_rank_for("treasury_regulation")
        code_rank = authority_rank_for("internal_revenue_code")

        self.assertEqual(form_rank, AuthorityRank.IRS_FORM)
        self.assertEqual(regulation_rank, AuthorityRank.TREASURY_REGULATION)
        # IRS forms must carry a smaller rank value than the Code.
        self.assertLess(form_rank, code_rank)

    def test_bootstrap_catalog_builds_tax_year_specific_urls(self) -> None:
        catalog = bootstrap_irs_catalog(2025)
        first_entry = catalog[0]
        slugs = [item.slug for item in catalog]

        self.assertGreaterEqual(len(catalog), 5)
        self.assertEqual(
            first_entry.url, "https://www.irs.gov/pub/irs-prior/f1040--2025.pdf"
        )
        self.assertIn("i1040gi", slugs)
|
||||||
|
|
||||||
|
|
||||||
|
class TaxYearCorpusTests(unittest.TestCase):
    """Exercises cache layout and catalog download inside a temporary directory."""

    def test_tax_year_layout_is_deterministic(self) -> None:
        with tempfile.TemporaryDirectory() as temp_dir:
            root = Path(temp_dir)
            paths = TaxYearCorpus(cache_root=root).paths_for_year(2025)

            expected_year_dir = root / "tax-years" / "2025"
            self.assertEqual(paths.year_dir, expected_year_dir)
            self.assertEqual(paths.irs_dir, paths.year_dir / "irs")
            self.assertEqual(paths.manifest_path, paths.year_dir / "manifest.json")

    def test_download_catalog_writes_files_and_manifest(self) -> None:
        specs = (
            ("f1040", "Form 1040", "irs_form"),
            ("i1040gi", "Instructions for Form 1040", "irs_instructions"),
        )

        def fake_fetch(url: str) -> bytes:
            # Stand-in for the network: the payload simply echoes the URL.
            return f"downloaded:{url}".encode()

        with tempfile.TemporaryDirectory() as temp_dir:
            corpus = TaxYearCorpus(cache_root=Path(temp_dir))
            catalog = [
                SourceDescriptor(
                    slug=slug,
                    title=title,
                    source_class=source_class,
                    media_type="application/pdf",
                    url=build_irs_prior_pdf_url(slug, 2025),
                )
                for slug, title, source_class in specs
            ]

            manifest = corpus.download_catalog(2025, catalog, fetcher=fake_fetch)
            manifest_path = corpus.paths_for_year(2025).manifest_path

            # The returned manifest describes both downloads.
            self.assertEqual(manifest["taxYear"], 2025)
            self.assertEqual(manifest["sourceCount"], 2)
            self.assertTrue(manifest_path.exists())

            first = manifest["sources"][0]
            self.assertEqual(first["slug"], "f1040")
            self.assertEqual(first["authorityRank"], int(AuthorityRank.IRS_FORM))
            self.assertTrue(Path(first["localPath"]).exists())

            # The manifest persisted to disk matches what was returned.
            saved = json.loads(manifest_path.read_text())
            self.assertEqual(saved["sourceCount"], 2)
            self.assertEqual(saved["sources"][1]["slug"], "i1040gi")
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||||
Reference in New Issue
Block a user