feat(amazon-shopping): scrape and filter amazon product results
This commit is contained in:
@@ -0,0 +1,43 @@
|
||||
import assert from "node:assert/strict";
|
||||
import { describe, it } from "node:test";
|
||||
|
||||
import { classifyHttpStatus, isPathAllowedByRobots, plannedAmazonPaths } from "../src/browser.js";
|
||||
|
||||
// Exercises the three helpers imported from ../src/browser.js:
// plannedAmazonPaths, isPathAllowedByRobots and classifyHttpStatus.
describe("browser compliance helpers", () => {
  it("plans only search and product-detail paths", () => {
    const planned = plannedAmazonPaths(["B0TEST0001"]);
    const expected = ["/s", "/dp/B0TEST0001", "/gp/product/B0TEST0001"];

    assert.deepEqual(planned, expected);
  });

  it("honors robots disallow rules for planned paths", () => {
    const robotsTxt = `
User-agent: *
Disallow: /cart
Disallow: /product-reviews
Disallow: /dp/private
`;

    // Each entry is [requested path, expected allow decision] for the "*" agent.
    const expectations: Array<[string, boolean]> = [
      ["/s", true],
      ["/product-reviews/B0TEST0001", false],
      ["/dp/private/B0TEST0001", false]
    ];

    for (const [path, allowed] of expectations) {
      assert.equal(isPathAllowedByRobots(robotsTxt, "*", path), allowed);
    }
  });

  it("does not leak disallow rules from other user-agent groups", () => {
    const robotsTxt = `
User-agent: specialbot
Disallow: /dp

User-agent: *
Disallow: /cart
`;

    // Same path, different agents: only the "specialbot" group disallows /dp.
    const expectations: Array<[string, boolean]> = [
      ["*", true],
      ["specialbot", false]
    ];

    for (const [agent, allowed] of expectations) {
      assert.equal(isPathAllowedByRobots(robotsTxt, agent, "/dp/B0TEST0001"), allowed);
    }
  });

  it("classifies retryable and challenge statuses", () => {
    // Each entry is [HTTP status code, expected classification label].
    const expectations: Array<[number, string]> = [
      [429, "retryable"],
      [503, "retryable"],
      [403, "challenge"],
      [200, "ok"]
    ];

    for (const [status, label] of expectations) {
      assert.equal(classifyHttpStatus(status), label);
    }
  });
});
|
||||
@@ -0,0 +1,77 @@
|
||||
import assert from "node:assert/strict";
|
||||
import { readFile } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import { describe, it } from "node:test";
|
||||
|
||||
import { extractDetailPage } from "../src/detail-page.js";
|
||||
|
||||
// Location of the HTML fixture, built from this module's own directory.
const fixturePath = join(import.meta.dirname, "fixtures", "product-detail.html");

/**
 * Builds the record passed as the second argument to extractDetailPage.
 * Every test uses the same shape: an ASIN, a title, the /dp/ URL for that
 * ASIN, and empty collection fields.
 * NOTE(review): presumably this is the search-result record that the detail
 * page enriches; confirm against ../src/detail-page.js.
 */
function searchSeed(asin: string, title: string) {
  return {
    asin,
    title,
    url: `https://www.amazon.com/dp/${asin}`,
    specs: [],
    bullets: [],
    matchedFilters: [],
    missingFields: [],
    extractionNotes: []
  };
}

describe("extractDetailPage", () => {
  it("extracts visible product detail fields from sanitized HTML", async () => {
    const fixtureHtml = await readFile(fixturePath, "utf8");
    const extracted = extractDetailPage(fixtureHtml, searchSeed("B0TESTLED1", "Search title"));

    // The title from the page replaces the seed title.
    assert.equal(extracted.title, "Bright Daylight LED Bulbs 100W Equivalent, 50 Count");
    assert.equal(extracted.price?.amount, 18.99);
    assert.equal(extracted.delivery?.free, true);
    assert.equal(extracted.availability, "In Stock");
    assert.equal(extracted.seller, "Ships from Amazon.com");
    assert.equal(extracted.bullets.length, 2);
    assert.deepEqual(extracted.specs[0], { name: "Brand", value: "BrightCo" });
    assert.equal(extracted.rating, 4.6);
    assert.equal(extracted.reviewCount, 1234);
    assert.equal(extracted.starBreakdown?.five, 72);
  });

  it("records missing detail-only fields", () => {
    // A page that carries nothing except the product title.
    const sparseHtml = "<html><body><h1 id=\"productTitle\">Sparse Product</h1></body></html>";
    const extracted = extractDetailPage(sparseHtml, searchSeed("B0SPARSE01", "Sparse"));

    assert.equal(extracted.price, undefined);
    for (const field of ["price", "starBreakdown"]) {
      assert.ok(extracted.missingFields.includes(field));
    }
  });

  it("drops script-like spec rows and trims availability metadata", () => {
    // Availability carries a JSON tail and three of the four table rows carry
    // script or widget text; only the Wattage row is expected to survive.
    const messyHtml = `
<h1 id="productTitle">Messy Product</h1>
<div id="availability">In Stock {"merchantId":"secretish"}</div>
<table>
<tr><td>Special Feature</td><td>(function(P) { tracking(); }) Real feature text</td></tr>
<tr><td>A19 Add to Cart logShoppableMetrics("x", true)</td><td>Buying Options</td></tr>
<tr><td>Wattage</td><td>15 watts</td></tr>
<tr><td>Customer Reviews</td><td>4.7 out of 5 stars tracking payload</td></tr>
</table>
`;
    const extracted = extractDetailPage(messyHtml, searchSeed("B0MESSY001", "Messy"));

    assert.equal(extracted.availability, "In Stock");
    assert.deepEqual(extracted.specs, [{ name: "Wattage", value: "15 watts" }]);
  });
});
|
||||
@@ -0,0 +1,75 @@
|
||||
import assert from "node:assert/strict";
|
||||
import { describe, it } from "node:test";
|
||||
|
||||
import { applyFiltersAndLimit } from "../src/filters.js";
|
||||
import type { ProductSearchResult } from "../src/types.js";
|
||||
|
||||
/**
 * Test factory for a ProductSearchResult.
 *
 * @param overrides Fields that replace the baseline values below.
 * @returns A new object holding the baseline fields with `overrides` applied on top.
 */
function product(overrides: Partial<ProductSearchResult>): ProductSearchResult {
  const baseline: ProductSearchResult = {
    asin: "B0BASE0001",
    title: "Base Product",
    url: "https://www.amazon.com/dp/B0BASE0001",
    specs: [],
    bullets: [],
    matchedFilters: [],
    missingFields: [],
    extractionNotes: []
  };

  // Overrides are spread last so they win over the baseline.
  return { ...baseline, ...overrides };
}
|
||||
|
||||
// Covers filtering, ordering and de-duplication as exposed by applyFiltersAndLimit.
describe("applyFiltersAndLimit", () => {
  // Projects a result list down to its ASINs so expectations stay short.
  const asinsOf = (items: ReadonlyArray<{ asin: string }>) => items.map((item) => item.asin);

  it("applies strict rating, review, and unit-price filters", () => {
    const candidates = [
      // Clears every threshold.
      product({
        asin: "B0PASS0001",
        rating: 4.6,
        reviewCount: 201,
        unitPrice: { amount: 3.99, currency: "USD", display: "$3.99/Count" }
      }),
      // Sits exactly on the rating and review thresholds, which "gt" rejects.
      product({
        asin: "B0FAIL0001",
        rating: 4.5,
        reviewCount: 200,
        unitPrice: { amount: 3.99, currency: "USD", display: "$3.99/Count" }
      }),
      // Carries no unit price at all.
      product({
        asin: "B0UNKNOWN1",
        rating: 4.7,
        reviewCount: 300
      })
    ];

    const outcome = applyFiltersAndLimit(candidates, {
      includeKeywords: [],
      excludeKeywords: [],
      minRating: 4.5,
      ratingComparison: "gt",
      minReviews: 200,
      reviewCountComparison: "gt",
      maxUnitPrice: 4
    }, 10);

    const unknownReasons = outcome.filteredOutReasons["B0UNKNOWN1"]?.join(" ") ?? "";

    assert.deepEqual(asinsOf(outcome.results), ["B0PASS0001"]);
    assert.equal(outcome.filteredOutCount, 2);
    assert.match(unknownReasons, /unit price unknown/i);
  });

  it("sorts by rating, reviews, then price", () => {
    const candidates = [
      product({ asin: "B0LOWPRICE", rating: 4.7, reviewCount: 1000, price: { amount: 15, currency: "USD", display: "$15.00" } }),
      product({ asin: "B0HIGHRATE", rating: 4.9, reviewCount: 100, price: { amount: 40, currency: "USD", display: "$40.00" } }),
      product({ asin: "B0MOREREV", rating: 4.7, reviewCount: 2000, price: { amount: 20, currency: "USD", display: "$20.00" } })
    ];

    // A limit of 2 keeps only the top two entries after ordering.
    const outcome = applyFiltersAndLimit(candidates, { includeKeywords: [], excludeKeywords: [] }, 2);

    assert.deepEqual(asinsOf(outcome.results), ["B0HIGHRATE", "B0MOREREV"]);
  });

  it("deduplicates repeated ASINs before limiting", () => {
    const candidates = [
      product({ asin: "B0DUP0001", rating: 4.8, reviewCount: 1000 }),
      product({ asin: "B0DUP0001", rating: 4.8, reviewCount: 1000 }),
      product({ asin: "B0UNIQUE1", rating: 4.7, reviewCount: 900 })
    ];

    const outcome = applyFiltersAndLimit(candidates, { includeKeywords: [], excludeKeywords: [] }, 10);

    assert.deepEqual(asinsOf(outcome.results), ["B0DUP0001", "B0UNIQUE1"]);
  });
});
|
||||
@@ -0,0 +1,30 @@
|
||||
<!-- Hand-crafted sanitized fixture. Not a live Amazon snapshot. -->
|
||||
<html>
|
||||
<body>
|
||||
<h1 id="productTitle">Bright Daylight LED Bulbs 100W Equivalent, 50 Count</h1>
|
||||
<span id="productTitle_feature_div"></span>
|
||||
<span class="a-price"><span class="a-offscreen">$18.99</span></span>
|
||||
<div id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE">FREE delivery Tomorrow</div>
|
||||
<div id="availability">In Stock</div>
|
||||
<div id="merchant-info">Ships from Amazon.com</div>
|
||||
<div id="feature-bullets">
|
||||
<ul>
|
||||
<li><span>Energy efficient 100W equivalent bulbs.</span></li>
|
||||
<li><span>Daylight color temperature for kitchens and garages.</span></li>
|
||||
</ul>
|
||||
</div>
|
||||
<table id="productOverview_feature_div">
|
||||
<tr><td>Brand</td><td>BrightCo</td></tr>
|
||||
<tr><td>Light Type</td><td>LED</td></tr>
|
||||
</table>
|
||||
<span id="acrPopover" title="4.6 out of 5 stars"></span>
|
||||
<span id="acrCustomerReviewText">1,234 ratings</span>
|
||||
<table id="histogramTable">
|
||||
<tr><td>5 star</td><td>72%</td></tr>
|
||||
<tr><td>4 star</td><td>15%</td></tr>
|
||||
<tr><td>3 star</td><td>7%</td></tr>
|
||||
<tr><td>2 star</td><td>3%</td></tr>
|
||||
<tr><td>1 star</td><td>3%</td></tr>
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
@@ -68,6 +68,14 @@ describe("parsers", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("prefers the unit price when product price appears first", () => {
  // The input leads with the product price; the parenthesised part holds the per-count price.
  const parsed = parseUnitPrice("$9.99 ($5.00$5.00/count)");

  assert.deepEqual(parsed, { amount: 5, currency: "USD", display: "$5.00/count" });
});
|
||||
|
||||
it("parses whole-dollar and one-decimal prices", () => {
|
||||
assert.deepEqual(parseMoney("$20"), { amount: 20, currency: "USD", display: "$20" });
|
||||
assert.deepEqual(parseMoney("$19.9"), { amount: 19.9, currency: "USD", display: "$19.9" });
|
||||
|
||||
@@ -27,6 +27,14 @@ describe("parseNaturalLanguageRequest", () => {
|
||||
assert.equal(parsed.filters.ratingComparison, "gte");
|
||||
});
|
||||
|
||||
it("cleans rating filter phrases from search query text", () => {
  const { query, filters } = parseNaturalLanguageRequest(
    "usb c cable with over 1000 reviews and rating over 4 stars"
  );

  // The filter phrases are lifted into `filters` and removed from the query text.
  assert.equal(query, "usb c cable");
  assert.equal(filters.minReviews, 1000);
  assert.equal(filters.minRating, 4);
});
|
||||
|
||||
it("extracts limit and max product price phrases", () => {
|
||||
const parsed = parseNaturalLanguageRequest("return 5 wireless mouse under $30");
|
||||
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
import assert from "node:assert/strict";
|
||||
import { describe, it } from "node:test";
|
||||
|
||||
import { createMarkdownReport, createResponse } from "../src/report.js";
|
||||
|
||||
// Covers the JSON response builder and the markdown renderer from ../src/report.js.
describe("report", () => {
  // Fixed clock injected through the `now` option so the timestamp is predictable.
  const fixedNow = () => new Date("2026-04-15T00:00:00.000Z");

  it("creates a structured JSON response", () => {
    const built = createResponse({
      query: "usb c cable",
      filters: { includeKeywords: [], excludeKeywords: [], minReviews: 1000 },
      limit: 1,
      maxSearchPages: 2,
      results: [],
      filteredOutCount: 4,
      warnings: ["partial extraction"],
      now: fixedNow
    });

    assert.equal(built.source.site, "amazon.com");
    assert.equal(built.filteredOutCount, 4);
    assert.equal(built.source.scrapedAt, "2026-04-15T00:00:00.000Z");
  });

  it("creates concise markdown with product details and warnings", () => {
    const response = createResponse({
      query: "usb c cable",
      filters: { includeKeywords: [], excludeKeywords: [] },
      limit: 1,
      maxSearchPages: 2,
      filteredOutCount: 0,
      warnings: ["price missing for one item"],
      now: fixedNow,
      results: [{
        asin: "B0TEST0001",
        title: "USB-C Cable",
        url: "https://www.amazon.com/dp/B0TEST0001",
        price: { amount: 9.99, currency: "USD", display: "$9.99" },
        rating: 4.7,
        reviewCount: 1234,
        delivery: { display: "FREE delivery Tomorrow", free: true },
        specs: [{ name: "Length", value: "6 ft" }],
        bullets: ["Braided cable"],
        matchedFilters: [],
        missingFields: ["starBreakdown"],
        extractionNotes: []
      }]
    });
    const markdown = createMarkdownReport(response);

    // Title, price, rating, warning text and product URL must all appear in the output.
    const expectedFragments = [
      /USB-C Cable/,
      /\$9\.99/,
      /4\.7 stars/,
      /price missing/,
      /https:\/\/www\.amazon\.com\/dp\/B0TEST0001/
    ];

    for (const fragment of expectedFragments) {
      assert.match(markdown, fragment);
    }
  });
});
|
||||
Reference in New Issue
Block a user