from __future__ import annotations

import hashlib
import json
import os
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import IntEnum
from pathlib import Path
from typing import Callable
from urllib.request import urlopen


class AuthorityRank(IntEnum):
    """Integer rank attached to each class of tax-authority source.

    IRS-published guidance occupies the 10-40 band and primary law occupies
    the 100-120 band. Because this is an ``IntEnum``, members behave as plain
    integers and are serialized elsewhere in this module with ``int(...)``.

    NOTE(review): this module only stores and emits these values; whether a
    lower or a higher number takes precedence is decided by the consumer --
    confirm before relying on a particular ordering.
    """

    # IRS-published guidance (10-40).
    IRS_FORM = 10
    IRS_INSTRUCTIONS = 20
    IRS_PUBLICATION = 30
    IRS_FAQ = 40
    # Primary law (100-120).
    INTERNAL_REVENUE_CODE = 100
    TREASURY_REGULATION = 110
    OTHER_PRIMARY_AUTHORITY = 120


# Lookup from the lower-case ``source_class`` string to its rank. Every key is
# the lower-cased name of the matching ``AuthorityRank`` member, so the table
# is derived from the enum in definition order.
AUTHORITY_RANKS: dict[str, AuthorityRank] = {
    member.name.lower(): member for member in AuthorityRank
}


def authority_rank_for(source_class: str) -> AuthorityRank:
    """Return the rank registered for *source_class* in ``AUTHORITY_RANKS``.

    Raises:
        KeyError: If *source_class* is not a key of ``AUTHORITY_RANKS``.
    """
    rank = AUTHORITY_RANKS[source_class]
    return rank


@dataclass(frozen=True)
class SourceDescriptor:
    """Immutable description of one downloadable source document."""

    # Short identifier; used in the download URL and as the local file stem.
    slug: str
    # Human-readable document title, copied into the manifest.
    title: str
    # Key into ``AUTHORITY_RANKS`` (e.g. "irs_form").
    source_class: str
    # MIME type recorded in the manifest (e.g. "application/pdf").
    media_type: str
    # Address passed to the fetcher when the document is downloaded.
    url: str


@dataclass(frozen=True)
class TaxYearPaths:
    """Immutable bundle of cache locations for a single tax year."""

    # Directory holding everything cached for the year.
    year_dir: Path
    # Directory where downloaded IRS documents are written.
    irs_dir: Path
    # JSON manifest file describing the downloaded documents.
    manifest_path: Path


def default_cache_root() -> Path:
    """Return the resolved root directory of the document cache.

    A non-empty ``US_CPA_CACHE_DIR`` environment variable wins; ``~`` in it is
    expanded. Otherwise the root is ``~/.cache/us-cpa``.
    """
    configured = os.environ.get("US_CPA_CACHE_DIR")
    if not configured:
        # Unset or empty: fall back to the per-user cache directory.
        fallback = Path.home() / ".cache" / "us-cpa"
        return fallback.resolve()
    return Path(configured).expanduser().resolve()


def build_irs_prior_pdf_url(slug: str, tax_year: int) -> str:
    """Return the irs.gov "irs-prior" PDF URL for *slug* and *tax_year*.

    The file name is ``<slug>--<tax_year>.pdf`` (two hyphens).
    """
    file_name = f"{slug}--{tax_year}.pdf"
    return f"https://www.irs.gov/pub/irs-prior/{file_name}"


# Citation patterns, compiled once. Both run against the lower-cased question,
# so they only need to recognise lower-case text.
#
# The leading ``\b`` keeps words that merely end in "section" (for example
# "intersection 5" or "subsection 2") from being read as a citation. A hyphen
# is accepted inside a section number only when a digit follows it, so
# "1400z-2" is kept whole while "179-related" yields just "179".
_IRC_SECTION_PATTERN = re.compile(
    r"\b(?:section|sec\.)\s+(\d+[a-z0-9]*(?:-\d+[a-z0-9]*)*)"
)
_TREASURY_REGULATION_PATTERN = re.compile(
    r"\b(?:treas(?:ury)?\.?\s+reg(?:ulation)?\.?\s*)([\d.]+-\d+)"
)


def build_primary_law_authorities(question: str) -> list[dict[str, str | int]]:
    """Extract primary-law citations mentioned in *question*.

    The question is lower-cased and scanned for Internal Revenue Code
    citations ("section 162", "sec. 1400z-2") and Treasury regulation
    citations ("treas. reg. 1.61-1", "treasury regulation 1.61-1").

    Args:
        question: Free-text question that may cite tax law.

    Returns:
        One dict per distinct citation with the keys ``slug``, ``title``,
        ``sourceClass``, ``url`` and ``authorityRank``. Code sections come
        first, then regulations, each in order of first appearance. A citation
        repeated in the question is reported once. The list is empty when
        nothing is cited.
    """
    authorities: list[dict[str, str | int]] = []
    seen_slugs: set[str] = set()
    normalized = question.lower()

    for match in _IRC_SECTION_PATTERN.finditer(normalized):
        section = match.group(1)
        slug = f"irc-{section}"
        if slug in seen_slugs:
            continue
        seen_slugs.add(slug)
        authorities.append(
            {
                "slug": slug,
                "title": f"Internal Revenue Code section {section}",
                "sourceClass": "internal_revenue_code",
                "url": f"https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title26-section{section}&num=0&edition=prelim",
                "authorityRank": int(AuthorityRank.INTERNAL_REVENUE_CODE),
            }
        )

    for match in _TREASURY_REGULATION_PATTERN.finditer(normalized):
        section = match.group(1)
        slug = f"reg-{section}"
        if slug in seen_slugs:
            continue
        seen_slugs.add(slug)
        authorities.append(
            {
                "slug": slug,
                "title": f"Treasury Regulation {section}",
                "sourceClass": "treasury_regulation",
                "url": f"https://www.ecfr.gov/current/title-26/section-{section}",
                "authorityRank": int(AuthorityRank.TREASURY_REGULATION),
            }
        )

    return authorities


def bootstrap_irs_catalog(tax_year: int) -> list[SourceDescriptor]:
    """Return the built-in catalog of IRS PDFs for *tax_year*.

    The catalog lists forms first, then instructions, then publications. Every
    descriptor has media type ``application/pdf`` and a URL produced by
    ``build_irs_prior_pdf_url``.
    """
    forms = (
        ("f1040", "Form 1040"),
        ("f1040s1", "Schedule 1 (Form 1040)"),
        ("f1040s2", "Schedule 2 (Form 1040)"),
        ("f1040s3", "Schedule 3 (Form 1040)"),
        ("f1040sa", "Schedule A (Form 1040)"),
        ("f1040sb", "Schedule B (Form 1040)"),
        ("f1040sc", "Schedule C (Form 1040)"),
        ("f1040sd", "Schedule D (Form 1040)"),
        ("f1040se", "Schedule E (Form 1040)"),
        ("f1040sse", "Schedule SE (Form 1040)"),
        ("f1040s8", "Schedule 8812 (Form 1040)"),
        ("f8949", "Form 8949"),
        ("f4562", "Form 4562"),
        ("f4797", "Form 4797"),
        ("f6251", "Form 6251"),
        ("f8606", "Form 8606"),
        ("f8863", "Form 8863"),
        ("f8889", "Form 8889"),
        ("f8959", "Form 8959"),
        ("f8960", "Form 8960"),
        ("f8995", "Form 8995"),
        ("f8995a", "Form 8995-A"),
        ("f5329", "Form 5329"),
        ("f5695", "Form 5695"),
        ("f1116", "Form 1116"),
    )
    instructions = (
        ("i1040gi", "Instructions for Form 1040 and Schedules 1-3"),
        ("i1040sca", "Instructions for Schedule A"),
        ("i1040sc", "Instructions for Schedule C"),
        ("i1040sd", "Instructions for Schedule D"),
        ("i1040se", "Instructions for Schedule E (Form 1040)"),
        ("i1040sse", "Instructions for Schedule SE"),
        ("i1040s8", "Instructions for Schedule 8812 (Form 1040)"),
        ("i8949", "Instructions for Form 8949"),
        ("i4562", "Instructions for Form 4562"),
        ("i4797", "Instructions for Form 4797"),
        ("i6251", "Instructions for Form 6251"),
        ("i8606", "Instructions for Form 8606"),
        ("i8863", "Instructions for Form 8863"),
        ("i8889", "Instructions for Form 8889"),
        ("i8959", "Instructions for Form 8959"),
        ("i8960", "Instructions for Form 8960"),
        ("i8995", "Instructions for Form 8995"),
        ("i8995a", "Instructions for Form 8995-A"),
        ("i5329", "Instructions for Form 5329"),
        ("i5695", "Instructions for Form 5695"),
        ("i1116", "Instructions for Form 1116"),
    )
    publications = (
        ("p501", "Publication 501, Dependents, Standard Deduction, and Filing Information"),
    )

    # Group order here fixes the order of the returned catalog.
    groups = (
        ("irs_form", forms),
        ("irs_instructions", instructions),
        ("irs_publication", publications),
    )

    catalog: list[SourceDescriptor] = []
    for source_class, documents in groups:
        for slug, title in documents:
            catalog.append(
                SourceDescriptor(
                    slug=slug,
                    title=title,
                    source_class=source_class,
                    media_type="application/pdf",
                    url=build_irs_prior_pdf_url(slug, tax_year),
                )
            )
    return catalog


def _sha256_bytes(payload: bytes) -> str:
    return hashlib.sha256(payload).hexdigest()


def _http_fetch(url: str) -> bytes:
    with urlopen(url) as response:
        return response.read()


class TaxYearCorpus:
    """Per-tax-year cache of downloaded source documents plus a JSON manifest."""

    def __init__(self, cache_root: Path | None = None) -> None:
        """Store *cache_root*, falling back to ``default_cache_root()``."""
        self.cache_root = cache_root if cache_root else default_cache_root()

    def paths_for_year(self, tax_year: int) -> TaxYearPaths:
        """Return the cache locations for *tax_year* (nothing is created)."""
        base = self.cache_root / "tax-years" / str(tax_year)
        documents = base / "irs"
        manifest_file = base / "manifest.json"
        return TaxYearPaths(
            year_dir=base,
            irs_dir=documents,
            manifest_path=manifest_file,
        )

    def download_catalog(
        self,
        tax_year: int,
        catalog: list[SourceDescriptor],
        *,
        fetcher: Callable[[str], bytes] = _http_fetch,
    ) -> dict:
        """Fetch every document in *catalog* and write the year's manifest.

        Each descriptor is fetched with *fetcher*, saved as ``<slug>.pdf`` in
        the year's IRS directory, and recorded with its SHA-256 digest and
        authority rank. All records share a single UTC timestamp. The manifest
        is written to disk as indented JSON and also returned.

        Args:
            tax_year: Year whose cache directory receives the files.
            catalog: Documents to download, in the order they are recorded.
            fetcher: Callable mapping a URL to the document bytes.

        Returns:
            The manifest dictionary that was serialized to disk.

        Raises:
            KeyError: If a descriptor's ``source_class`` has no registered
                rank; the document bytes have already been written by then.
        """
        layout = self.paths_for_year(tax_year)
        layout.irs_dir.mkdir(parents=True, exist_ok=True)

        stamp = datetime.now(timezone.utc).isoformat()
        records: list[dict] = []
        for item in catalog:
            body = fetcher(item.url)
            target = layout.irs_dir / f"{item.slug}.pdf"
            target.write_bytes(body)
            record = {
                "slug": item.slug,
                "title": item.title,
                "sourceClass": item.source_class,
                "mediaType": item.media_type,
                "url": item.url,
                "localPath": str(target),
                "sha256": _sha256_bytes(body),
                "fetchedAt": stamp,
                "authorityRank": int(authority_rank_for(item.source_class)),
            }
            records.append(record)

        indexes = self.index_manifest(records)
        # Advertise the primary-law source classes alongside their ranks.
        law_hooks = [
            {"sourceClass": law_class, "authorityRank": int(law_rank)}
            for law_class, law_rank in (
                ("internal_revenue_code", AuthorityRank.INTERNAL_REVENUE_CODE),
                ("treasury_regulation", AuthorityRank.TREASURY_REGULATION),
            )
        ]

        manifest = {
            "taxYear": tax_year,
            "fetchedAt": stamp,
            "cacheRoot": str(self.cache_root),
            "sourceCount": len(records),
            "sources": records,
            "indexes": indexes,
            "primaryLawHooks": law_hooks,
        }
        serialized = json.dumps(manifest, indent=2)
        layout.manifest_path.write_text(serialized)
        return manifest

    @staticmethod
    def index_manifest(sources: list[dict]) -> dict[str, dict[str, list[str]]]:
        """Build lookup tables over manifest *sources*.

        Returns:
            ``{"bySourceClass": {class: [slug, ...]},
            "bySlug": {slug: [localPath, ...]}}``, with entries kept in the
            order the sources are given.
        """
        slugs_by_class: dict[str, list[str]] = {}
        paths_by_slug: dict[str, list[str]] = {}
        for entry in sources:
            source_class = entry["sourceClass"]
            slug = entry["slug"]
            if source_class not in slugs_by_class:
                slugs_by_class[source_class] = []
            slugs_by_class[source_class].append(slug)

            local_path = entry["localPath"]
            if slug not in paths_by_slug:
                paths_by_slug[slug] = []
            paths_by_slug[slug].append(local_path)
        return {"bySourceClass": slugs_by_class, "bySlug": paths_by_slug}