161 lines
5.5 KiB
TypeScript
161 lines
5.5 KiB
TypeScript
import { HTMLElement, parse } from "node-html-parser";
|
|
|
|
import { parseMoney, parseRating, parseReviewCount, parseStarBreakdown } from "./parsers.js";
|
|
import type { DeliverySummary, ProductSearchResult, ProductSpec } from "./types.js";
|
|
|
|
/**
 * Returns the cleaned text content of a node.
 *
 * @param node - Element to read; may be `null`/`undefined` (e.g. a failed query).
 * @returns The node's `textContent` passed through `cleanText`, or `""` when
 *   the node is absent.
 */
function textOf(node: HTMLElement | null | undefined): string {
  if (node == null) {
    return cleanText("");
  }
  return cleanText(node.textContent ?? "");
}
|
|
|
|
/**
 * Returns the cleaned value of one attribute of a node.
 *
 * @param node - Element to read; may be `null`/`undefined`.
 * @param name - Attribute name to look up.
 * @returns The attribute value passed through `cleanText`, or `""` when the
 *   node or the attribute is absent.
 */
function attrOf(node: HTMLElement | null | undefined, name: string): string {
  const rawValue = node == null ? undefined : node.getAttribute(name);
  return cleanText(rawValue ?? "");
}
|
|
|
|
/**
 * Normalizes scraped text in three steps:
 *  1. every run of whitespace becomes a single space;
 *  2. everything from the first `{"` (plus any whitespace before it) to the
 *     end of the string is dropped — presumably to discard inline JSON that
 *     trails the visible text;
 *  3. leading/trailing whitespace is trimmed.
 *
 * @param text - Raw text or attribute value.
 * @returns The normalized string (possibly empty).
 */
function cleanText(text: string): string {
  const singleSpaced = text.replace(/\s+/g, " ");
  // After step 1 there are no line breaks left, so `.*$` reaches the end of the string.
  const withoutTrailingJson = singleSpaced.replace(/\s*\{".*$/g, "");
  return withoutTrailingJson.trim();
}
|
|
|
|
/**
 * Tells whether a piece of text matches any of a fixed list of markers:
 * `(function (`, `window.`, `P.when`, `ue.count`, `tracking()`,
 * `logShoppableMetrics`, "buying options" or "add to cart" (case-insensitive).
 *
 * Used by `extractSpecs` to reject table rows whose name or value matches.
 *
 * @param text - Text to inspect.
 * @returns `true` when at least one marker occurs in `text`.
 */
function isScriptLike(text: string): boolean {
  const scriptOrWidgetMarkers =
    /\(function\s*\(|window\.|P\.when|ue\.count|tracking\(\)|logShoppableMetrics|buying options|add to cart/i;
  return scriptOrWidgetMarkers.test(text);
}
|
|
|
|
/**
 * Tries each selector in order and returns the first non-empty cleaned text.
 *
 * @param root - Element to query within.
 * @param selectors - CSS selectors, highest priority first.
 * @returns The cleaned text of the first selector whose match has non-empty
 *   text, or `""` when none does.
 */
function firstText(root: HTMLElement, selectors: string[]): string {
  let found = "";
  for (const selector of selectors) {
    found = textOf(root.querySelector(selector));
    if (found.length > 0) {
      break;
    }
  }
  // Still "" here if the list was empty or every candidate was blank.
  return found;
}
|
|
|
|
/**
 * Collects the feature-bullet texts under `#feature-bullets`.
 *
 * Texts of `li span` descendants are preferred; only when that yields nothing
 * are the texts of the `li` elements themselves used. Empty texts and texts
 * containing "make sure this fits" (case-insensitive) are dropped in both cases.
 *
 * @param root - Parsed page (or fragment) to search.
 * @returns Bullet texts in document order; empty when none are found.
 */
function extractBullets(root: HTMLElement): string[] {
  // Shared pipeline for both selectors: clean each match, keep the useful ones.
  const collect = (selector: string): string[] => {
    const kept: string[] = [];
    for (const node of root.querySelectorAll(selector)) {
      const text = textOf(node);
      if (text !== "" && !/make sure this fits/i.test(text)) {
        kept.push(text);
      }
    }
    return kept;
  };

  const fromSpans = collect("#feature-bullets li span");
  return fromSpans.length > 0 ? fromSpans : collect("#feature-bullets li");
}
|
|
|
|
/**
 * Builds a name/value spec list from every table row (`tr`) under `root`.
 *
 * For each row the non-empty cleaned texts of its `th`/`td` cells are taken;
 * rows with fewer than two such cells are ignored. The first cell is the spec
 * name and the remaining cells, joined by a space, form the value.
 *
 * A row is skipped when its lower-cased name was already seen, when the name
 * is in the exclusion list ("customer reviews"), or when `isScriptLike`
 * matches the name or the value.
 *
 * @param root - Parsed page (or fragment) to search.
 * @returns Specs in document order, first occurrence winning per name.
 */
function extractSpecs(root: HTMLElement): ProductSpec[] {
  const excludedNames = new Set(["customer reviews"]);
  const seenKeys = new Set<string>();
  const collected: ProductSpec[] = [];

  root.querySelectorAll("tr").forEach((row) => {
    const cells = row
      .querySelectorAll("th,td")
      .map((cell) => textOf(cell))
      .filter((cellText) => cellText !== "");
    if (cells.length < 2) {
      return;
    }

    const name = cells[0];
    const value = cells.slice(1).join(" ");
    const key = name.toLowerCase();

    const duplicateOrExcluded = seenKeys.has(key) || excludedNames.has(key);
    if (duplicateOrExcluded || isScriptLike(name) || isScriptLike(value)) {
      return;
    }

    seenKeys.add(key);
    collected.push({ name, value });
  });

  return collected;
}
|
|
|
|
/**
 * Flattens the star-rating histogram table into one string.
 *
 * Rows are taken from `#histogramTable` and from any element whose
 * `aria-label` contains "star". For every row with at least two non-empty
 * cells, the first two cell texts are joined by a space; the per-row strings
 * are then joined by a space.
 *
 * @param root - Parsed page (or fragment) to search.
 * @returns The flattened text, or `""` when no qualifying rows exist.
 */
function extractHistogramText(root: HTMLElement): string {
  const rowSummaries = root
    .querySelectorAll("#histogramTable tr, [aria-label*='star'] tr")
    .map((row) =>
      row
        .querySelectorAll("th,td")
        .map((cell) => textOf(cell))
        .filter((cellText) => cellText !== "")
    )
    .filter((cells) => cells.length >= 2)
    .map((cells) => `${cells[0]} ${cells[1]}`);

  return rowSummaries.join(" ");
}
|
|
|
|
/**
 * Builds a delivery summary from a delivery message.
 *
 * @param text - Raw delivery message; whitespace is collapsed and trimmed.
 * @param primeSignal - Whether the caller found a Prime marker elsewhere
 *   (defaults to `false`).
 * @returns
 *  - for a non-blank message: `{ display, free, prime }`, where `free`/`prime`
 *    reflect the whole words "free"/"prime" (case-insensitive) in the message
 *    and `prime` is also true when `primeSignal` is set;
 *  - for a blank message with `primeSignal`: a placeholder summary with
 *    display "Prime delivery available";
 *  - otherwise `undefined`.
 */
function deliveryFromText(text: string, primeSignal = false): DeliverySummary | undefined {
  const display = text.replace(/\s+/g, " ").trim();

  if (display.length > 0) {
    const free = /\bfree\b/i.test(display);
    const prime = primeSignal || /\bprime\b/i.test(display);
    return { display, free, prime };
  }

  // No usable message: only a Prime marker justifies emitting a summary at all.
  if (!primeSignal) {
    return undefined;
  }
  return { display: "Prime delivery available", prime: true };
}
|
|
|
|
/**
 * Looks for a Prime marker in element attributes.
 *
 * The cleaned `id`, `class`, `aria-label` and `alt` values of every element
 * matching `[id], [class], [aria-label], img[alt]` are joined with spaces into
 * one string, which is then tested against a case-insensitive pattern
 * (`a-icon-prime`, `prime-logo`, `primeExclusive`, `primePopover`,
 * "amazon prime", or the whole word "prime").
 *
 * @param root - Parsed page (or fragment) to search.
 * @returns `true` when the pattern matches anywhere in the joined text.
 */
function hasPrimeSignal(root: HTMLElement): boolean {
  const primePattern = /a-icon-prime|prime-logo|primeExclusive|primePopover|amazon\s+prime|\bprime\b/i;
  const inspectedAttributes = ["id", "class", "aria-label", "alt"];

  const attributeValues: string[] = [];
  for (const node of root.querySelectorAll("[id], [class], [aria-label], img[alt]")) {
    for (const attributeName of inspectedAttributes) {
      // Missing attributes contribute "" so the joined text keeps its spacing.
      attributeValues.push(attrOf(node, attributeName));
    }
  }

  return primePattern.test(attributeValues.join(" "));
}
|
|
|
|
/**
 * Combines the delivery summary from the detail page with the one already on
 * the base product.
 *
 * @param detail - Summary derived from the detail page, if any.
 * @param base - Summary carried by the base product, if any.
 * @returns
 *  - when both exist: a new summary whose `display` and `fastestDate` prefer
 *    `detail` and whose `free`/`prime` flags are true if either side is truthy;
 *  - when only one exists: that one, unchanged (same object);
 *  - when neither exists: `undefined`.
 */
function mergeDelivery(detail: DeliverySummary | undefined, base: DeliverySummary | undefined): DeliverySummary | undefined {
  if (detail && base) {
    return {
      display: detail.display || base.display,
      free: Boolean(detail.free) || Boolean(base.free),
      prime: Boolean(detail.prime) || Boolean(base.prime),
      fastestDate: detail.fastestDate ?? base.fastestDate
    };
  }

  // At most one side is present here; hand it back as-is.
  return detail ?? base;
}
|
|
|
|
/**
 * Enriches a product with data scraped from its detail-page HTML.
 *
 * Title, price, rating, review count, delivery, availability, seller and star
 * breakdown are read from the page and fall back to the value on `base` when
 * the page yields nothing (empty text, or a parser result of
 * `null`/`undefined`). `bullets` and `specs` always come from the page.
 * `base` itself is not mutated: `missingFields` and `extractionNotes` are
 * copied before use.
 *
 * `missingFields` is reconciled with the merged result for the tracked fields
 * (`price`, `delivery`, `rating`, `reviewCount`, `starBreakdown`): a tracked
 * field that is still `undefined` is listed exactly as before, and a tracked
 * field that now has a value is removed from the list, so a field the detail
 * page filled in is no longer reported as missing. Entries for other fields
 * are left untouched.
 *
 * @param html - Raw HTML of the product detail page.
 * @param base - Product as known so far (e.g. from a search listing).
 * @returns A new product object combining `base` with the detail-page data.
 */
export function extractDetailPage(html: string, base: ProductSearchResult): ProductSearchResult {
  const root = parse(html);

  const title = firstText(root, ["#productTitle", "h1"]) || base.title;
  const priceText = firstText(root, [
    "#corePriceDisplay_desktop_feature_div .a-offscreen",
    ".a-price .a-offscreen",
    ".a-price"
  ]);
  const deliveryText = firstText(root, [
    "#mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE",
    "#deliveryBlockMessage",
    "[data-csa-c-delivery-price]"
  ]);
  const availability = firstText(root, ["#availability", "#availabilityInsideBuyBox_feature_div"]);
  const seller = firstText(root, ["#merchant-info", "#sellerProfileTriggerId"]);

  // Prefer the popover's title attribute; fall back to its visible text.
  const ratingNode = root.querySelector("#acrPopover");
  const ratingText = attrOf(ratingNode, "title") || textOf(ratingNode);
  const reviewText = textOf(root.querySelector("#acrCustomerReviewText"));
  const histogram = parseStarBreakdown(extractHistogramText(root));

  const product: ProductSearchResult = {
    ...base,
    title,
    price: parseMoney(priceText) ?? base.price,
    rating: parseRating(ratingText) ?? base.rating,
    reviewCount: parseReviewCount(reviewText) ?? base.reviewCount,
    delivery: mergeDelivery(deliveryFromText(deliveryText, hasPrimeSignal(root)), base.delivery),
    availability: availability || base.availability,
    seller: seller || base.seller,
    bullets: extractBullets(root),
    specs: extractSpecs(root),
    starBreakdown: histogram ?? base.starBreakdown,
    // Copies, so the reconciliation below never touches the caller's arrays.
    missingFields: [...base.missingFields],
    extractionNotes: [...base.extractionNotes]
  };

  // Keep missingFields in sync with the merged product in BOTH directions:
  // add tracked fields that are still absent, drop ones that are now present.
  for (const field of ["price", "delivery", "rating", "reviewCount", "starBreakdown"] as const) {
    if (product[field] === undefined) {
      if (!product.missingFields.includes(field)) {
        product.missingFields.push(field);
      }
      continue;
    }
    // Walk backwards so splicing does not shift entries still to be checked.
    for (let index = product.missingFields.length - 1; index >= 0; index -= 1) {
      if (product.missingFields[index] === field) {
        product.missingFields.splice(index, 1);
      }
    }
  }

  return product;
}
|