feat(amazon-shopping): scrape and filter amazon product results
This commit is contained in:
@@ -0,0 +1,133 @@
|
||||
import { HTMLElement, parse } from "node-html-parser";
|
||||
|
||||
import { parseMoney, parseRating, parseReviewCount, parseStarBreakdown } from "./parsers.js";
|
||||
import type { DeliverySummary, ProductSearchResult, ProductSpec } from "./types.js";
|
||||
|
||||
/**
 * Reads a node's text content and normalises it with `cleanText`.
 *
 * @param node - Element to read; `null` and `undefined` are tolerated.
 * @returns The cleaned text, or `""` when the node or its text content is absent.
 */
function textOf(node: HTMLElement | null | undefined): string {
  const raw = node?.textContent ?? "";
  return cleanText(raw);
}
|
||||
|
||||
/**
 * Reads one attribute from a node and normalises its value with `cleanText`.
 *
 * @param node - Element to read; `null` and `undefined` are tolerated.
 * @param name - Attribute name to look up.
 * @returns The cleaned attribute value, or `""` when the node or attribute is absent.
 */
function attrOf(node: HTMLElement | null | undefined, name: string): string {
  const raw = node?.getAttribute(name) ?? "";
  return cleanText(raw);
}
|
||||
|
||||
/**
 * Normalises scraped text.
 *
 * Applied in order:
 * 1. every run of whitespace becomes a single space;
 * 2. everything from the first `{"` (together with any whitespace directly
 *    before it) to the end of the string is removed — presumably to drop
 *    inline JSON that leaks into text nodes;
 * 3. leading and trailing whitespace is trimmed.
 *
 * @param text - Raw text to normalise.
 * @returns The normalised text; may be empty.
 */
function cleanText(text: string): string {
  const singleSpaced = text.replace(/\s+/g, " ");
  // Runs after the whitespace collapse, so no line breaks remain and `.*$`
  // reaches the end of the whole string.
  const withoutBraceTail = singleSpaced.replace(/\s*\{".*$/g, "");
  return withoutBraceTail.trim();
}
|
||||
|
||||
/**
 * Tells whether a piece of text matches one of a fixed list of markers.
 *
 * The markers are matched case-insensitively anywhere in the text: inline
 * script fragments (`(function (`, `window.`, `P.when`, `ue.count`,
 * `tracking()`, `logShoppableMetrics`) and the phrases "buying options" and
 * "add to cart".
 *
 * @param text - Text to inspect.
 * @returns `true` when at least one marker occurs in `text`.
 */
function isScriptLike(text: string): boolean {
  const markers =
    /\(function\s*\(|window\.|P\.when|ue\.count|tracking\(\)|logShoppableMetrics|buying options|add to cart/i;
  return markers.test(text);
}
|
||||
|
||||
/**
 * Returns the cleaned text of the first selector that yields any.
 *
 * Selectors are tried in the order given; for each, only the first matching
 * element is read. Later selectors are not queried once one produces text.
 *
 * @param root - Element to search within.
 * @param selectors - CSS selectors, highest priority first.
 * @returns The first non-empty cleaned text, or `""` when none is found.
 */
function firstText(root: HTMLElement, selectors: string[]): string {
  for (const selector of selectors) {
    const match = root.querySelector(selector);
    const text = textOf(match);
    if (text.length === 0) {
      continue;
    }
    return text;
  }
  return "";
}
|
||||
|
||||
/**
 * Collects the feature bullet texts under `#feature-bullets`.
 *
 * Text is read from the `span` elements inside each list item first. Only
 * when that yields nothing is the text of the list items themselves used.
 * Empty entries and entries containing "make sure this fits"
 * (case-insensitive) are dropped in both cases.
 *
 * @param root - Parsed page (or fragment) to search.
 * @returns The bullet texts in document order; empty when none are found.
 */
function extractBullets(root: HTMLElement): string[] {
  const bulletsFor = (selector: string): string[] =>
    root
      .querySelectorAll(selector)
      .map((node) => textOf(node))
      .filter((text) => text && !/make sure this fits/i.test(text));

  const fromSpans = bulletsFor("#feature-bullets li span");
  // The list-item query only runs when the span query produced nothing.
  return fromSpans.length > 0 ? fromSpans : bulletsFor("#feature-bullets li");
}
|
||||
|
||||
/**
 * Builds name/value specs from every table row under `root`.
 *
 * For each `tr`, the non-empty cleaned texts of its `th`/`td` cells are
 * taken; the first becomes the spec name and the rest, joined by spaces,
 * the value. A row is skipped when it has fewer than two non-empty cells,
 * when its lower-cased name was already emitted or is "customer reviews",
 * or when the name or value matches `isScriptLike`.
 *
 * @param root - Parsed page (or fragment) to search.
 * @returns Specs in document order, with the first occurrence of each name kept.
 */
function extractSpecs(root: HTMLElement): ProductSpec[] {
  const excludedNames = new Set(["customer reviews"]);
  const seen = new Set<string>();
  const specs: ProductSpec[] = [];

  root.querySelectorAll("tr").forEach((row) => {
    const cells = row
      .querySelectorAll("th,td")
      .map((cell) => textOf(cell))
      .filter(Boolean);
    if (cells.length < 2) {
      return;
    }

    const [name, ...rest] = cells;
    const value = rest.join(" ");
    // Names are compared case-insensitively for both de-duplication and exclusion.
    const key = name.toLowerCase();

    const rejected =
      seen.has(key) || excludedNames.has(key) || isScriptLike(name) || isScriptLike(value);
    if (rejected) {
      return;
    }

    seen.add(key);
    specs.push({ name, value });
  });

  return specs;
}
|
||||
|
||||
/**
 * Flattens the star-rating histogram rows into one space-separated string.
 *
 * Rows are taken from `#histogramTable tr` and from `tr` elements under any
 * element whose `aria-label` contains "star". Each row with at least two
 * non-empty cells contributes "<first cell> <second cell>"; other rows are
 * ignored.
 *
 * @param root - Parsed page (or fragment) to search.
 * @returns The joined row texts, or `""` when no row qualifies.
 */
function extractHistogramText(root: HTMLElement): string {
  const rowTexts = root
    .querySelectorAll("#histogramTable tr, [aria-label*='star'] tr")
    .map((row) =>
      row
        .querySelectorAll("th,td")
        .map((cell) => textOf(cell))
        .filter(Boolean)
    )
    .filter((cells) => cells.length >= 2)
    // Only the first two non-empty cells of a row are used.
    .map((cells) => `${cells[0]} ${cells[1]}`);

  return rowTexts.join(" ");
}
|
||||
|
||||
/**
 * Turns a delivery message into a summary object.
 *
 * @param text - Raw delivery message.
 * @returns `undefined` when the message is empty after whitespace is
 *   collapsed and trimmed. Otherwise an object holding the collapsed message
 *   as `display`, plus `free` / `prime` flags that are set when the whole
 *   word "free" / "prime" occurs in it (case-insensitive).
 */
function deliveryFromText(text: string): DeliverySummary | undefined {
  const display = text.replace(/\s+/g, " ").trim();
  if (display.length === 0) {
    return undefined;
  }

  const free = /\bfree\b/i.test(display);
  const prime = /\bprime\b/i.test(display);
  return { display, free, prime };
}
|
||||
|
||||
/**
 * Enriches a search result with data scraped from its product detail page.
 *
 * Each scalar field is read from the first selector that yields text and
 * falls back to the value already on `base` when the page provides nothing
 * usable. `bullets` and `specs` always come from the page. `base` itself is
 * not mutated: `missingFields` and `extractionNotes` are copied.
 *
 * `missingFields` is reconciled for `price`, `delivery`, `rating`,
 * `reviewCount` and `starBreakdown`: a field that is still absent is listed
 * (once), and a field that now has a value is removed even if `base` had
 * flagged it as missing. Entries for other fields are passed through as-is.
 *
 * @param html - Raw HTML of the product detail page.
 * @param base - Search result to enrich.
 * @returns A new product combining `base` with the detail-page data.
 */
export function extractDetailPage(html: string, base: ProductSearchResult): ProductSearchResult {
  const root = parse(html);

  const title = firstText(root, ["#productTitle", "h1"]) || base.title;
  const priceText = firstText(root, [
    "#corePriceDisplay_desktop_feature_div .a-offscreen",
    ".a-price .a-offscreen",
    ".a-price"
  ]);
  const deliveryText = firstText(root, [
    "#mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE",
    "#deliveryBlockMessage",
    "[data-csa-c-delivery-price]"
  ]);
  const availability = firstText(root, ["#availability", "#availabilityInsideBuyBox_feature_div"]);
  const seller = firstText(root, ["#merchant-info", "#sellerProfileTriggerId"]);

  // Prefer the popover's `title` attribute; fall back to its visible text.
  const ratingNode = root.querySelector("#acrPopover");
  const ratingText = attrOf(ratingNode, "title") || textOf(ratingNode);
  const reviewText = textOf(root.querySelector("#acrCustomerReviewText"));
  const histogram = parseStarBreakdown(extractHistogramText(root));

  const product: ProductSearchResult = {
    ...base,
    title,
    price: parseMoney(priceText) ?? base.price,
    rating: parseRating(ratingText) ?? base.rating,
    reviewCount: parseReviewCount(reviewText) ?? base.reviewCount,
    delivery: deliveryFromText(deliveryText) ?? base.delivery,
    availability: availability || base.availability,
    seller: seller || base.seller,
    bullets: extractBullets(root),
    specs: extractSpecs(root),
    starBreakdown: histogram ?? base.starBreakdown,
    // Copies, so the reconciliation below never touches `base`.
    missingFields: [...base.missingFields],
    extractionNotes: [...base.extractionNotes]
  };

  for (const field of ["price", "delivery", "rating", "reviewCount", "starBreakdown"] as const) {
    // `== null` treats null like undefined, matching the `??` fallbacks above.
    if (product[field] == null) {
      if (!product.missingFields.includes(field)) {
        product.missingFields.push(field);
      }
      continue;
    }

    // The field now has a value: drop any stale "missing" marker inherited
    // from `base`. Iterating backwards keeps indices valid while splicing.
    for (let index = product.missingFields.length - 1; index >= 0; index -= 1) {
      if (product.missingFields[index] === field) {
        product.missingFields.splice(index, 1);
      }
    }
  }

  return product;
}
|
||||
Reference in New Issue
Block a user