"""Unit tests for the IRS source catalog helpers and the tax-year corpus cache."""

from __future__ import annotations

import json
import tempfile
import unittest
from pathlib import Path

from us_cpa.sources import (
    AuthorityRank,
    SourceDescriptor,
    TaxYearCorpus,
    authority_rank_for,
    bootstrap_irs_catalog,
    build_irs_prior_pdf_url,
)


class SourceCatalogTests(unittest.TestCase):
    """Checks for the URL builder, the authority ranking and the bootstrap catalog."""

    def test_build_irs_prior_pdf_url_uses_expected_pattern(self) -> None:
        # Each slug is expected under irs-prior as "<slug>--<year>.pdf".
        expected_by_slug = (
            ("f1040", "https://www.irs.gov/pub/irs-prior/f1040--2025.pdf"),
            ("i1040gi", "https://www.irs.gov/pub/irs-prior/i1040gi--2025.pdf"),
        )
        for slug, expected_url in expected_by_slug:
            self.assertEqual(build_irs_prior_pdf_url(slug, 2025), expected_url)

    def test_authority_ranking_orders_irs_before_primary_law(self) -> None:
        # Source-class names map onto the matching AuthorityRank members.
        form_rank = authority_rank_for("irs_form")
        self.assertEqual(form_rank, AuthorityRank.IRS_FORM)

        regulation_rank = authority_rank_for("treasury_regulation")
        self.assertEqual(regulation_rank, AuthorityRank.TREASURY_REGULATION)

        # An IRS form must compare strictly lower than the Internal Revenue Code.
        irs_rank = authority_rank_for("irs_form")
        code_rank = authority_rank_for("internal_revenue_code")
        self.assertLess(irs_rank, code_rank)

    def test_bootstrap_catalog_builds_tax_year_specific_urls(self) -> None:
        entries = bootstrap_irs_catalog(2025)

        self.assertGreaterEqual(len(entries), 5)
        # The first entry is expected to be Form 1040 for the requested year.
        self.assertEqual(
            entries[0].url,
            "https://www.irs.gov/pub/irs-prior/f1040--2025.pdf",
        )
        has_instructions = any(entry.slug == "i1040gi" for entry in entries)
        self.assertTrue(has_instructions)


class TaxYearCorpusTests(unittest.TestCase):
    """Checks for the TaxYearCorpus directory layout and catalog downloads."""

    def test_tax_year_layout_is_deterministic(self) -> None:
        with tempfile.TemporaryDirectory() as temp_dir:
            cache_root = Path(temp_dir)
            layout = TaxYearCorpus(cache_root=cache_root).paths_for_year(2025)

            # The year directory hangs off the cache root; the rest hang off it.
            self.assertEqual(layout.year_dir, cache_root / "tax-years" / "2025")
            self.assertEqual(layout.irs_dir, layout.year_dir / "irs")
            self.assertEqual(layout.manifest_path, layout.year_dir / "manifest.json")

    def test_download_catalog_writes_files_and_manifest(self) -> None:
        # Stand-in fetcher: returns bytes derived from the URL, no network access.
        def fake_fetch(url: str) -> bytes:
            payload = f"downloaded:{url}"
            return payload.encode()

        # (slug, title, source_class) for each descriptor, in catalog order.
        descriptor_rows = (
            ("f1040", "Form 1040", "irs_form"),
            ("i1040gi", "Instructions for Form 1040", "irs_instructions"),
        )

        with tempfile.TemporaryDirectory() as temp_dir:
            corpus = TaxYearCorpus(cache_root=Path(temp_dir))
            catalog = [
                SourceDescriptor(
                    slug=slug,
                    title=title,
                    source_class=source_class,
                    media_type="application/pdf",
                    url=build_irs_prior_pdf_url(slug, 2025),
                )
                for slug, title, source_class in descriptor_rows
            ]

            manifest = corpus.download_catalog(2025, catalog, fetcher=fake_fetch)

            # The returned manifest summarises the download.
            self.assertEqual(manifest["taxYear"], 2025)
            self.assertEqual(manifest["sourceCount"], 2)
            manifest_file = corpus.paths_for_year(2025).manifest_path
            self.assertTrue(manifest_file.exists())

            # The first manifest entry describes the first descriptor and its file.
            first_entry = manifest["sources"][0]
            self.assertEqual(first_entry["slug"], "f1040")
            self.assertEqual(
                first_entry["authorityRank"],
                int(AuthorityRank.IRS_FORM),
            )
            downloaded_file = Path(first_entry["localPath"])
            self.assertTrue(downloaded_file.exists())

            # The manifest written to disk must agree with the returned one.
            manifest_text = corpus.paths_for_year(2025).manifest_path.read_text()
            on_disk = json.loads(manifest_text)
            self.assertEqual(on_disk["sourceCount"], 2)
            self.assertEqual(on_disk["sources"][1]["slug"], "i1040gi")


if __name__ == "__main__":
    unittest.main()