import { HTMLElement, parse } from "node-html-parser";
import { parseMoney, parseRating, parseReviewCount, parseStarBreakdown } from "./parsers.js";
import type { DeliverySummary, ProductSearchResult, ProductSpec } from "./types.js";

/** Text fragments that indicate inline script or buy-box boilerplate rather than product data. */
const SCRIPT_LIKE_PATTERN =
  /\(function\s*\(|window\.|P\.when|ue\.count|tracking\(\)|logShoppableMetrics|buying options|add to cart/i;

/** Bullet text that is dropped from the feature list. */
const FIT_NOTICE_PATTERN = /make sure this fits/i;

/** Attribute values (id / class / aria-label / alt) that are treated as a Prime marker. */
const PRIME_ATTRIBUTE_PATTERN =
  /a-icon-prime|prime-logo|primeExclusive|primePopover|amazon\s+prime|\bprime\b/i;

/** Spec-table row names (lower-cased) that are never reported as specs. */
const EXCLUDED_SPEC_NAMES: ReadonlySet<string> = new Set(["customer reviews"]);

/** Fields whose presence is reflected in `missingFields` after detail extraction. */
const TRACKED_FIELDS = ["price", "delivery", "rating", "reviewCount", "starBreakdown"] as const;

/** Returns the cleaned text content of `node`, or `""` when the node is absent. */
function textOf(node: HTMLElement | null | undefined): string {
  return cleanText(node?.textContent ?? "");
}

/** Returns the cleaned value of attribute `name` on `node`, or `""` when node or attribute is absent. */
function attrOf(node: HTMLElement | null | undefined, name: string): string {
  return cleanText(node?.getAttribute(name) ?? "");
}

/**
 * Normalizes scraped text: collapses whitespace runs to a single space, drops
 * everything from the first `{"` to the end of the string, and trims the result.
 */
function cleanText(text: string): string {
  return text
    .replace(/\s+/g, " ")
    .replace(/\s*\{".*$/g, "")
    .trim();
}

/** Reports whether `text` matches {@link SCRIPT_LIKE_PATTERN}. */
function isScriptLike(text: string): boolean {
  return SCRIPT_LIKE_PATTERN.test(text);
}

/**
 * Tries each selector in order and returns the first non-empty cleaned text.
 *
 * @returns The first non-empty text found, or `""` when no selector yields text.
 */
function firstText(root: HTMLElement, selectors: string[]): string {
  for (const selector of selectors) {
    const text = textOf(root.querySelector(selector));
    if (text) {
      return text;
    }
  }
  return "";
}

/**
 * Collects feature bullets under `#feature-bullets`.
 *
 * Prefers the text of `<span>` elements inside list items and falls back to the
 * list items themselves when no span yields text. Empty entries and entries
 * matching {@link FIT_NOTICE_PATTERN} are dropped.
 */
function extractBullets(root: HTMLElement): string[] {
  const collect = (selector: string): string[] =>
    root
      .querySelectorAll(selector)
      .map((node) => textOf(node))
      .filter((text) => text !== "" && !FIT_NOTICE_PATTERN.test(text));

  const spanBullets = collect("#feature-bullets li span");
  return spanBullets.length > 0 ? spanBullets : collect("#feature-bullets li");
}

/**
 * Builds name/value specs from every table row in the document.
 *
 * A row contributes when it has at least two non-empty cells: the first cell is
 * the name and the remaining cells are joined with a space to form the value.
 * Rows are skipped when the lower-cased name was already seen, is listed in
 * {@link EXCLUDED_SPEC_NAMES}, or when name or value looks like script text.
 */
function extractSpecs(root: HTMLElement): ProductSpec[] {
  const specs: ProductSpec[] = [];
  const seen = new Set<string>();

  for (const row of root.querySelectorAll("tr")) {
    const [name, ...rest] = row
      .querySelectorAll("th,td")
      .map((cell) => textOf(cell))
      .filter(Boolean);

    // Need a name cell plus at least one value cell.
    if (name === undefined || rest.length === 0) {
      continue;
    }

    const value = rest.join(" ");
    const key = name.toLowerCase();
    if (seen.has(key) || EXCLUDED_SPEC_NAMES.has(key) || isScriptLike(name) || isScriptLike(value)) {
      continue;
    }

    seen.add(key);
    specs.push({ name, value });
  }

  return specs;
}

/**
 * Flattens the rating histogram table into a single string.
 *
 * For each matching row with at least two non-empty cells, the first two cells
 * are joined with a space; the per-row strings are then joined with spaces.
 */
function extractHistogramText(root: HTMLElement): string {
  const rows = root.querySelectorAll("#histogramTable tr, [aria-label*='star'] tr");
  const parts: string[] = [];

  for (const row of rows) {
    const cells = row
      .querySelectorAll("th,td")
      .map((cell) => textOf(cell))
      .filter(Boolean);
    if (cells.length >= 2) {
      parts.push(`${cells[0]} ${cells[1]}`);
    }
  }

  return parts.join(" ");
}

/**
 * Converts delivery text into a {@link DeliverySummary}.
 *
 * @param text - Raw delivery text; whitespace is collapsed and trimmed.
 * @param primeSignal - Whether a Prime marker was detected independently of the text.
 * @returns `undefined` when the text is empty and there is no Prime signal; a
 *   placeholder summary when the text is empty but a Prime signal exists;
 *   otherwise a summary whose `free` / `prime` flags are derived from the words
 *   "free" / "prime" in the text (with `prime` also set by `primeSignal`).
 */
function deliveryFromText(text: string, primeSignal = false): DeliverySummary | undefined {
  const display = text.replace(/\s+/g, " ").trim();
  if (!display) {
    return primeSignal ? { display: "Prime delivery available", prime: true } : undefined;
  }

  return {
    display,
    free: /\bfree\b/i.test(display),
    prime: primeSignal || /\bprime\b/i.test(display)
  };
}

/**
 * Reports whether any element's `id`, `class`, `aria-label`, or `alt` attribute
 * matches {@link PRIME_ATTRIBUTE_PATTERN}.
 */
function hasPrimeSignal(root: HTMLElement): boolean {
  const attributeText = root
    .querySelectorAll("[id], [class], [aria-label], img[alt]")
    .map((node) =>
      [
        attrOf(node, "id"),
        attrOf(node, "class"),
        attrOf(node, "aria-label"),
        attrOf(node, "alt")
      ].join(" ")
    )
    .join(" ");

  return PRIME_ATTRIBUTE_PATTERN.test(attributeText);
}

/**
 * Merges two delivery summaries, preferring values from `detail`.
 *
 * @returns The other summary when one side is `undefined` (or `undefined` when
 *   both are); otherwise a summary using `detail`'s display and fastest date
 *   with `base` as fallback, and `free` / `prime` set when either side sets them.
 */
function mergeDelivery(
  detail: DeliverySummary | undefined,
  base: DeliverySummary | undefined
): DeliverySummary | undefined {
  if (!detail) {
    return base;
  }
  if (!base) {
    return detail;
  }

  return {
    display: detail.display || base.display,
    free: Boolean(detail.free || base.free),
    prime: Boolean(detail.prime || base.prime),
    fastestDate: detail.fastestDate ?? base.fastestDate
  };
}

/**
 * Enriches a search result with data parsed from a product detail page.
 *
 * Values found on the page take precedence; `title`, `price`, `rating`,
 * `reviewCount`, `delivery`, `availability`, `seller`, and `starBreakdown` fall
 * back to `base` when the page yields nothing. `bullets` and `specs` always come
 * from the page. `missingFields` and `extractionNotes` are copied so `base` is
 * not mutated.
 *
 * After extraction, `missingFields` is reconciled for {@link TRACKED_FIELDS}:
 * fields still undefined are added, and fields that now have a value are removed
 * so entries inherited from `base` do not go stale.
 *
 * @param html - Detail page markup.
 * @param base - Previously extracted result used for fallbacks.
 * @returns A new result object combining `base` with the page data.
 */
export function extractDetailPage(html: string, base: ProductSearchResult): ProductSearchResult {
  const root = parse(html);

  const title = firstText(root, ["#productTitle", "h1"]) || base.title;
  const priceText = firstText(root, [
    "#corePriceDisplay_desktop_feature_div .a-offscreen",
    ".a-price .a-offscreen",
    ".a-price"
  ]);
  const deliveryText = firstText(root, [
    "#mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE",
    "#deliveryBlockMessage",
    "[data-csa-c-delivery-price]"
  ]);
  const availability = firstText(root, ["#availability", "#availabilityInsideBuyBox_feature_div"]);
  const seller = firstText(root, ["#merchant-info", "#sellerProfileTriggerId"]);

  // Prefer the popover's title attribute; fall back to its text content.
  const ratingNode = root.querySelector("#acrPopover");
  const ratingText = attrOf(ratingNode, "title") || textOf(ratingNode);
  const reviewText = textOf(root.querySelector("#acrCustomerReviewText"));
  const histogram = parseStarBreakdown(extractHistogramText(root));

  // NOTE(review): when the page has no delivery text but a Prime marker is
  // present, the placeholder display from deliveryFromText replaces
  // base.delivery.display in the merge — confirm this is intended.
  const detailDelivery = deliveryFromText(deliveryText, hasPrimeSignal(root));

  const product: ProductSearchResult = {
    ...base,
    title,
    price: parseMoney(priceText) ?? base.price,
    rating: parseRating(ratingText) ?? base.rating,
    reviewCount: parseReviewCount(reviewText) ?? base.reviewCount,
    delivery: mergeDelivery(detailDelivery, base.delivery),
    availability: availability || base.availability,
    seller: seller || base.seller,
    bullets: extractBullets(root),
    specs: extractSpecs(root),
    starBreakdown: histogram ?? base.starBreakdown,
    missingFields: [...base.missingFields],
    extractionNotes: [...base.extractionNotes]
  };

  // Keep missingFields in sync with the merged values for the tracked fields.
  for (const field of TRACKED_FIELDS) {
    const index = product.missingFields.indexOf(field);
    if (product[field] === undefined) {
      if (index === -1) {
        product.missingFields.push(field);
      }
    } else if (index !== -1) {
      // The detail page (or base) supplied this field, so it is no longer missing.
      product.missingFields.splice(index, 1);
    }
  }

  return product;
}