feat(amazon-shopping): scrape and filter amazon product results

This commit is contained in:
2026-04-15 18:48:51 -05:00
parent ef326896f4
commit 1e0e265f1e
15 changed files with 844 additions and 6 deletions
+133
View File
@@ -0,0 +1,133 @@
import { HTMLElement, parse } from "node-html-parser";
import { parseMoney, parseRating, parseReviewCount, parseStarBreakdown } from "./parsers.js";
import type { DeliverySummary, ProductSearchResult, ProductSpec } from "./types.js";
/**
 * Returns the normalized text content of `node`.
 *
 * A missing node (`null` / `undefined`) or a missing text content is treated
 * as the empty string before normalization, so callers always get a string.
 */
function textOf(node: HTMLElement | null | undefined): string {
  const raw = node == null ? undefined : node.textContent;
  return cleanText(raw ?? "");
}
/**
 * Returns the normalized value of attribute `name` on `node`.
 *
 * A missing node or a missing attribute is treated as the empty string before
 * normalization, so callers always get a string.
 */
function attrOf(node: HTMLElement | null | undefined, name: string): string {
  const raw = node == null ? undefined : node.getAttribute(name);
  return cleanText(raw ?? "");
}
/**
 * Normalizes scraped text for display.
 *
 * Steps, in order:
 *  1. collapse every run of whitespace (including newlines) to a single space;
 *  2. drop everything from the first `{"` to the end of the string, together
 *     with any whitespace directly before it (presumably inline JSON payloads
 *     that leak into text nodes — confirm against real pages);
 *  3. trim leading and trailing whitespace.
 *
 * Step 1 must run before step 2: the `.` in the second pattern does not match
 * line breaks, so the tail is only stripped reliably once the text is on a
 * single line.
 */
function cleanText(text: string): string {
  const singleLine = text.replace(/\s+/g, " ");
  const withoutJsonTail = singleLine.replace(/\s*\{".*$/g, "");
  return withoutJsonTail.trim();
}
/**
 * Reports whether `text` contains any of a fixed set of markers, matched
 * case-insensitively: inline-script fragments (`(function (`, `window.`,
 * `P.when`, `ue.count`, `tracking()`, `logShoppableMetrics`) or the phrases
 * "buying options" / "add to cart".
 *
 * @returns `true` when at least one marker occurs anywhere in `text`.
 */
function isScriptLike(text: string): boolean {
  const scriptMarkers = /\(function\s*\(|window\.|P\.when|ue\.count|tracking\(\)|logShoppableMetrics|buying options|add to cart/i;
  return scriptMarkers.test(text);
}
/**
 * Tries each selector in order and returns the first non-empty normalized
 * text found under `root`.
 *
 * @param root - element to search within
 * @param selectors - CSS selectors, highest priority first
 * @returns the first non-empty text, or `""` when no selector yields any
 */
function firstText(root: HTMLElement, selectors: string[]): string {
  for (const candidateSelector of selectors) {
    const candidate = textOf(root.querySelector(candidateSelector));
    if (candidate === "") {
      continue;
    }
    return candidate;
  }
  return "";
}
/**
 * Collects the feature bullet texts found under `#feature-bullets`.
 *
 * Texts of `<span>` elements inside the list items are preferred; only when
 * none of those survive filtering does it fall back to the text of the list
 * items themselves. Empty entries and entries matching "make sure this fits"
 * (case-insensitive) are dropped in both passes.
 */
function extractBullets(root: HTMLElement): string[] {
  const fitNotice = /make sure this fits/i;
  const isRealBullet = (text: string): boolean => Boolean(text) && !fitNotice.test(text);
  const collect = (selector: string): string[] =>
    root.querySelectorAll(selector)
      .map((element) => textOf(element))
      .filter(isRealBullet);

  const fromSpans = collect("#feature-bullets li span");
  return fromSpans.length > 0 ? fromSpans : collect("#feature-bullets li");
}
/**
 * Builds a name/value spec list from every table row (`<tr>`) under `root`.
 *
 * For each row the non-empty cell texts are taken in order: the first becomes
 * the spec name and the remaining ones are joined with a space as the value.
 * A row is skipped when it has fewer than two non-empty cells, when its
 * lower-cased name was already emitted or is in the exclusion list
 * ("customer reviews"), or when the name or value looks like script residue.
 */
function extractSpecs(root: HTMLElement): ProductSpec[] {
  const excludedNames = new Set(["customer reviews"]);
  const emittedKeys = new Set<string>();
  const collected: ProductSpec[] = [];

  for (const row of root.querySelectorAll("tr")) {
    const cellTexts = row.querySelectorAll("th,td").map((cell) => textOf(cell)).filter(Boolean);
    if (cellTexts.length < 2) {
      continue;
    }

    const [name, ...valueParts] = cellTexts;
    const value = valueParts.join(" ");
    const key = name.toLowerCase();

    const keep =
      !emittedKeys.has(key) &&
      !excludedNames.has(key) &&
      !isScriptLike(name) &&
      !isScriptLike(value);
    if (keep) {
      // Only the first row seen for a given (case-insensitive) name is kept.
      emittedKeys.add(key);
      collected.push({ name, value });
    }
  }

  return collected;
}
/**
 * Flattens the rating histogram rows into one space-separated string.
 *
 * Rows are looked up under `#histogramTable` and under any element whose
 * `aria-label` contains "star". For each row with at least two non-empty
 * cells, the first two cell texts are emitted as "<first> <second>"; rows
 * with fewer cells are ignored.
 *
 * @returns the joined row fragments, or `""` when no row qualifies
 */
function extractHistogramText(root: HTMLElement): string {
  const fragments: string[] = [];
  for (const row of root.querySelectorAll("#histogramTable tr, [aria-label*='star'] tr")) {
    const cellTexts = row.querySelectorAll("th,td").map((cell) => textOf(cell)).filter(Boolean);
    if (cellTexts.length < 2) {
      continue;
    }
    fragments.push(cellTexts.slice(0, 2).join(" "));
  }
  return fragments.join(" ");
}
/**
 * Turns a raw delivery message into a delivery summary.
 *
 * Whitespace is collapsed and trimmed to produce `display`. `free` and
 * `prime` record whether the whole words "free" / "prime" occur in it,
 * case-insensitively.
 *
 * @returns the summary, or `undefined` when the message is empty after
 *          normalization
 */
function deliveryFromText(text: string): DeliverySummary | undefined {
  const display = text.replace(/\s+/g, " ").trim();
  const mentions = (word: RegExp): boolean => word.test(display);
  return display
    ? { display, free: mentions(/\bfree\b/i), prime: mentions(/\bprime\b/i) }
    : undefined;
}
/**
 * Enriches a search result with data scraped from its product detail page.
 *
 * The HTML is parsed and each field is looked up through a prioritized list
 * of selectors. Merge rules:
 *  - `title`, `availability`, `seller`: page text, falling back to `base`
 *    when the page yields an empty string;
 *  - `price`, `rating`, `reviewCount`, `delivery`, `starBreakdown`: parsed
 *    page value, falling back to `base` when parsing yields a nullish value;
 *  - `bullets`, `specs`: always taken from the page;
 *  - `missingFields`, `extractionNotes`: copied from `base` so the input is
 *    never mutated.
 *
 * `missingFields` is then reconciled for the tracked fields in both
 * directions: a field that is still `undefined` is added (once), and a field
 * that `base` listed as missing but that is now populated is removed, so the
 * list never contradicts the returned values.
 *
 * @param html - raw HTML of the product detail page
 * @param base - result previously built for the same product
 * @returns a new result object; `base` is left untouched
 */
export function extractDetailPage(html: string, base: ProductSearchResult): ProductSearchResult {
  const root = parse(html);
  const title = firstText(root, ["#productTitle", "h1"]) || base.title;
  const priceText = firstText(root, [
    "#corePriceDisplay_desktop_feature_div .a-offscreen",
    ".a-price .a-offscreen",
    ".a-price"
  ]);
  const deliveryText = firstText(root, [
    "#mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE",
    "#deliveryBlockMessage",
    "[data-csa-c-delivery-price]"
  ]);
  const availability = firstText(root, ["#availability", "#availabilityInsideBuyBox_feature_div"]);
  const seller = firstText(root, ["#merchant-info", "#sellerProfileTriggerId"]);
  // Prefer the popover's `title` attribute; fall back to its visible text.
  const ratingText = attrOf(root.querySelector("#acrPopover"), "title") || textOf(root.querySelector("#acrPopover"));
  const reviewText = textOf(root.querySelector("#acrCustomerReviewText"));
  const histogram = parseStarBreakdown(extractHistogramText(root));

  const product: ProductSearchResult = {
    ...base,
    title,
    price: parseMoney(priceText) ?? base.price,
    rating: parseRating(ratingText) ?? base.rating,
    reviewCount: parseReviewCount(reviewText) ?? base.reviewCount,
    delivery: deliveryFromText(deliveryText) ?? base.delivery,
    availability: availability || base.availability,
    seller: seller || base.seller,
    bullets: extractBullets(root),
    specs: extractSpecs(root),
    starBreakdown: histogram ?? base.starBreakdown,
    // Fresh copies: the reconciliation below must not mutate `base`.
    missingFields: [...base.missingFields],
    extractionNotes: [...base.extractionNotes]
  };

  for (const field of ["price", "delivery", "rating", "reviewCount", "starBreakdown"] as const) {
    const listedAt = product.missingFields.indexOf(field);
    if (product[field] === undefined) {
      if (listedAt === -1) {
        product.missingFields.push(field);
      }
    } else if (listedAt !== -1) {
      // The detail page (or base) now supplies this field, so the entry
      // inherited from `base` is stale and would misreport it as missing.
      product.missingFields.splice(listedAt, 1);
    }
  }

  return product;
}