feat(amazon-shopping): parse filters and extract search candidates
This commit is contained in:
Generated
+134
-1
@@ -8,7 +8,8 @@
|
||||
"name": "amazon-shopping-scripts",
|
||||
"version": "1.0.0",
|
||||
"dependencies": {
|
||||
"minimist": "^1.2.8"
|
||||
"minimist": "^1.2.8",
|
||||
"node-html-parser": "^7.1.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/minimist": "^1.2.5",
|
||||
@@ -476,6 +477,107 @@
|
||||
"undici-types": "~7.16.0"
|
||||
}
|
||||
},
|
||||
"node_modules/boolbase": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
|
||||
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==",
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/css-select": {
|
||||
"version": "5.2.2",
|
||||
"resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz",
|
||||
"integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==",
|
||||
"license": "BSD-2-Clause",
|
||||
"dependencies": {
|
||||
"boolbase": "^1.0.0",
|
||||
"css-what": "^6.1.0",
|
||||
"domhandler": "^5.0.2",
|
||||
"domutils": "^3.0.1",
|
||||
"nth-check": "^2.0.1"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/fb55"
|
||||
}
|
||||
},
|
||||
"node_modules/css-what": {
|
||||
"version": "6.2.2",
|
||||
"resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz",
|
||||
"integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==",
|
||||
"license": "BSD-2-Clause",
|
||||
"engines": {
|
||||
"node": ">= 6"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/fb55"
|
||||
}
|
||||
},
|
||||
"node_modules/dom-serializer": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
|
||||
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"domelementtype": "^2.3.0",
|
||||
"domhandler": "^5.0.2",
|
||||
"entities": "^4.2.0"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/domelementtype": {
|
||||
"version": "2.3.0",
|
||||
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
|
||||
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/fb55"
|
||||
}
|
||||
],
|
||||
"license": "BSD-2-Clause"
|
||||
},
|
||||
"node_modules/domhandler": {
|
||||
"version": "5.0.3",
|
||||
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
|
||||
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
|
||||
"license": "BSD-2-Clause",
|
||||
"dependencies": {
|
||||
"domelementtype": "^2.3.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 4"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/domhandler?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/domutils": {
|
||||
"version": "3.2.2",
|
||||
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz",
|
||||
"integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==",
|
||||
"license": "BSD-2-Clause",
|
||||
"dependencies": {
|
||||
"dom-serializer": "^2.0.0",
|
||||
"domelementtype": "^2.3.0",
|
||||
"domhandler": "^5.0.3"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/domutils?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/entities": {
|
||||
"version": "4.5.0",
|
||||
"resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
|
||||
"integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
|
||||
"license": "BSD-2-Clause",
|
||||
"engines": {
|
||||
"node": ">=0.12"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/entities?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/esbuild": {
|
||||
"version": "0.27.7",
|
||||
"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.7.tgz",
|
||||
@@ -546,6 +648,15 @@
|
||||
"url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/he": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz",
|
||||
"integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==",
|
||||
"license": "MIT",
|
||||
"bin": {
|
||||
"he": "bin/he"
|
||||
}
|
||||
},
|
||||
"node_modules/minimist": {
|
||||
"version": "1.2.8",
|
||||
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
|
||||
@@ -555,6 +666,28 @@
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/node-html-parser": {
|
||||
"version": "7.1.0",
|
||||
"resolved": "https://registry.npmjs.org/node-html-parser/-/node-html-parser-7.1.0.tgz",
|
||||
"integrity": "sha512-iJo8b2uYGT40Y8BTyy5ufL6IVbN8rbm/1QK2xffXU/1a/v3AAa0d1YAoqBNYqaS4R/HajkWIpIfdE6KcyFh1AQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"css-select": "^5.1.0",
|
||||
"he": "1.2.0"
|
||||
}
|
||||
},
|
||||
"node_modules/nth-check": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
|
||||
"integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
|
||||
"license": "BSD-2-Clause",
|
||||
"dependencies": {
|
||||
"boolbase": "^1.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/nth-check?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/resolve-pkg-maps": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz",
|
||||
|
||||
@@ -10,7 +10,8 @@
|
||||
"typecheck": "tsc --noEmit"
|
||||
},
|
||||
"dependencies": {
|
||||
"minimist": "^1.2.8"
|
||||
"minimist": "^1.2.8",
|
||||
"node-html-parser": "^7.1.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/minimist": "^1.2.5",
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
import minimist from "minimist";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
import { parseNaturalLanguageRequest } from "./query-parser.js";
|
||||
import type { ProductFilters, SearchProductsRequest, SearchProductsResponse } from "./types.js";
|
||||
|
||||
export interface CliDeps {
|
||||
@@ -76,12 +77,13 @@ export function parseCliRequest(argv: string[]): SearchProductsRequest {
|
||||
alias: { h: "help" }
|
||||
});
|
||||
|
||||
const query = String(args.query ?? args._.join(" ")).trim();
|
||||
if (!query) {
|
||||
const rawQuery = String(args.query ?? args._.join(" ")).trim();
|
||||
if (!rawQuery) {
|
||||
throw new Error("A product query is required");
|
||||
}
|
||||
|
||||
const limit = parsePositiveInteger(args.limit, "limit") ?? 15;
|
||||
const natural = parseNaturalLanguageRequest(rawQuery);
|
||||
const limit = parsePositiveInteger(args.limit, "limit") ?? natural.limit ?? 15;
|
||||
if (limit > 30 && !args["allow-large-limit"]) {
|
||||
throw new Error("Requested limits above 30 require --allow-large-limit or a batched run");
|
||||
}
|
||||
@@ -91,10 +93,7 @@ export function parseCliRequest(argv: string[]): SearchProductsRequest {
|
||||
throw new Error("max-search-pages must be 5 or less");
|
||||
}
|
||||
|
||||
const filters: ProductFilters = {
|
||||
includeKeywords: [],
|
||||
excludeKeywords: []
|
||||
};
|
||||
const filters: ProductFilters = { ...natural.filters };
|
||||
const minRating = parseNumber(args["min-rating"], "min-rating");
|
||||
const minReviews = parsePositiveInteger(args["min-reviews"], "min-reviews");
|
||||
const maxPrice = parseNumber(args["max-price"], "max-price");
|
||||
@@ -108,7 +107,7 @@ export function parseCliRequest(argv: string[]): SearchProductsRequest {
|
||||
const markdown = Boolean(args.markdown);
|
||||
|
||||
return {
|
||||
query,
|
||||
query: natural.query || rawQuery,
|
||||
filters,
|
||||
limit,
|
||||
maxSearchPages,
|
||||
|
||||
@@ -0,0 +1,112 @@
|
||||
import type { MoneyValue, StarBreakdown, UnitCountExtraction } from "./types.js";
|
||||
|
||||
/**
 * Extracts the first US-dollar amount found in a piece of text.
 *
 * Whitespace runs are collapsed to single spaces before matching, and the
 * collapsed text (not just the matched amount) is reported as `display`.
 *
 * @param text Raw text that may contain a `$`-prefixed amount.
 * @returns The parsed amount in USD, or `undefined` when the text is empty,
 *          contains no `$` amount, or the amount is not a finite number.
 */
export function parseMoney(text: string | undefined | null): MoneyValue | undefined {
  if (!text) {
    return undefined;
  }

  const display = text.replace(/\s+/g, " ").trim();
  // Digits with optional thousands separators and up to two decimals.
  const digits = /\$\s*([0-9][0-9,]*(?:\.[0-9]{1,2})?)/.exec(display)?.[1];
  if (digits === undefined) {
    return undefined;
  }

  const amount = Number(digits.replace(/,/g, ""));
  return Number.isFinite(amount) ? { amount, currency: "USD", display } : undefined;
}
|
||||
|
||||
/**
 * Parses a per-unit price such as "$0.38/Count" or "$4 each".
 *
 * The text must contain a price-like token and a unit marker (`/`, "per",
 * "each" or "count"). When several dollar amounts are present, the amount
 * that is directly followed by a unit marker wins, so
 * "$18.99 ($0.38/Count)" yields 0.38 rather than the pack price 18.99.
 * If no amount is adjacent to a marker, the first dollar amount is used.
 *
 * @param text Raw text that may describe a unit price.
 * @returns The unit price in USD (with the whitespace-collapsed text as
 *          `display`), or `undefined` when the text does not look like one.
 */
export function parseUnitPrice(text: string | undefined | null): MoneyValue | undefined {
  if (!text || !(/[/]\s*\d|\$\s*\d/.test(text))) {
    return undefined;
  }
  if (!/(\/|\bper\b|\beach\b|\bcount\b)/i.test(text)) {
    return undefined;
  }

  const compact = text.replace(/\s+/g, " ").trim();
  // Prefer the amount that is immediately qualified by a unit marker.
  const adjacent = compact.match(/\$\s*([0-9][0-9,]*(?:\.[0-9]{1,2})?)\s*(?:\/|per\b|each\b)/i);
  if (adjacent) {
    const amount = Number(adjacent[1].replace(/,/g, ""));
    if (Number.isFinite(amount)) {
      return { amount, currency: "USD", display: compact };
    }
  }

  // No amount sits next to a marker: fall back to the first dollar amount.
  return parseMoney(text);
}
|
||||
|
||||
/**
 * Reads a star rating out of text such as "4.6 out of 5 stars" or
 * "rated 4.3".
 *
 * The "N out of 5 stars" form is tried first; the "rated N" form is only
 * consulted when the first form is absent.
 *
 * @param text Raw text (visible text or aria-label content).
 * @returns The rating as a number, or `undefined` when none is found.
 */
export function parseRating(text: string | undefined | null): number | undefined {
  if (!text) {
    return undefined;
  }

  const ratingPatterns = [
    /([0-5](?:\.[0-9])?)\s*(?:out of\s*)?5\s*stars?/i,
    /\brated\s+([0-5](?:\.[0-9])?)/i
  ];

  for (const pattern of ratingPatterns) {
    const hit = pattern.exec(text);
    if (hit) {
      // The first pattern that matches decides the outcome.
      const value = Number(hit[1]);
      return Number.isFinite(value) ? value : undefined;
    }
  }
  return undefined;
}
|
||||
|
||||
/**
 * Reads a review/rating count out of text such as "1,234 ratings".
 *
 * @param text Raw text (visible text or aria-label content).
 * @returns The count with thousands separators removed, or `undefined`
 *          when no "<number> ratings/reviews" phrase is present.
 */
export function parseReviewCount(text: string | undefined | null): number | undefined {
  if (!text) {
    return undefined;
  }

  const digits = /([0-9][0-9,]*)\s*(?:ratings?|reviews?)/i.exec(text)?.[1];
  if (digits === undefined) {
    return undefined;
  }

  const total = Number(digits.replace(/,/g, ""));
  if (!Number.isInteger(total)) {
    return undefined;
  }
  return total;
}
|
||||
|
||||
/**
 * Parses a visible star histogram such as
 * "5 star 72% 4 star 15% 3 star 7% 2 star 3% 1 star 3%".
 *
 * Each "<1-5> star <n>%" pair fills the matching bucket; when a star level
 * appears more than once, the last occurrence wins.
 *
 * @param text Raw histogram text.
 * @returns The buckets that were found plus `basis: "percent"`, or
 *          `undefined` when the text is empty or holds no such pair.
 */
export function parseStarBreakdown(text: string | undefined | null): StarBreakdown | undefined {
  if (!text) {
    return undefined;
  }

  const bucketByDigit = new Map<string, keyof Omit<StarBreakdown, "basis">>([
    ["5", "five"],
    ["4", "four"],
    ["3", "three"],
    ["2", "two"],
    ["1", "one"]
  ]);

  const hits = Array.from(text.matchAll(/([1-5])\s*star\s*([0-9]{1,3})\s*%/gi));
  if (hits.length === 0) {
    return undefined;
  }

  const percentages = hits.reduce<Partial<Omit<StarBreakdown, "basis">>>((buckets, hit) => {
    const bucket = bucketByDigit.get(hit[1]);
    if (bucket) {
      buckets[bucket] = Number(hit[2]);
    }
    return buckets;
  }, {});

  return { ...percentages, basis: "percent" };
}
|
||||
|
||||
/**
 * Extracts how many units a listing contains from title-like text.
 *
 * Patterns are tried in order of decreasing confidence: "<n> count/ct",
 * "pack of <n>" and "<n>-pack" are high, "set of <n>" is medium, and
 * "<n> bulbs/cables/pieces/pcs" is low. The first pattern that yields a
 * positive integer wins.
 *
 * Numbers may be plain (up to four digits) or comma-grouped ("1,200").
 * A number is never read starting in the middle of a longer number, so
 * "1,200 Count" is 1200 (not 200) and "12345 count" is not reported as
 * 2345.
 *
 * @param text Raw title or description text.
 * @returns The count, its confidence and the matched phrase, or
 *          `undefined` when nothing usable is found.
 */
export function extractUnitCount(text: string | undefined | null): UnitCountExtraction | undefined {
  if (!text) {
    return undefined;
  }

  // `(?<!\d[.,]?)` rejects a start position that is preceded by a digit or
  // by a digit plus a decimal/thousands separator. A separator that follows
  // a word ("Bulbs,6 Pack") still allows a match.
  const patterns = [
    { pattern: /(?<!\d[.,]?)(\d{1,3}(?:,\d{3})+|\d{1,4})\s*[- ]?(?:count|ct)\b/i, confidence: "high" as const },
    { pattern: /\bpack\s+of\s+(\d{1,3}(?:,\d{3})+|\d{1,4})\b/i, confidence: "high" as const },
    { pattern: /(?<!\d[.,]?)\b(\d{1,3}(?:,\d{3})+|\d{1,4})\s*[- ]?pack\b/i, confidence: "high" as const },
    { pattern: /\bset\s+of\s+(\d{1,3}(?:,\d{3})+|\d{1,4})\b/i, confidence: "medium" as const },
    { pattern: /(?<!\d[.,]?)\b(\d{1,3}(?:,\d{3})+|\d{1,4})\s+(?:bulbs?|cables?|pieces?|pcs)\b/i, confidence: "low" as const }
  ];

  for (const { pattern, confidence } of patterns) {
    const match = text.match(pattern);
    if (!match) {
      continue;
    }
    const count = Number(match[1].replace(/,/g, ""));
    if (Number.isInteger(count) && count > 0) {
      return {
        count,
        confidence,
        source: match[0]
      };
    }
  }

  return undefined;
}
|
||||
@@ -0,0 +1,68 @@
|
||||
import type { ParsedNaturalLanguageRequest, ProductFilters } from "./types.js";
|
||||
|
||||
/**
 * Tidies what is left of a request after filter phrases have been cut out.
 *
 * Removed phrases leave gaps of two or more whitespace characters (the
 * original separators plus the single space inserted in their place).
 * Connector words ("that", "and", "with", "have", "or") are dropped only
 * where they touch such a gap or the end of the text, because there they
 * merely joined filter phrases. Connectors inside the product phrase
 * itself are kept, so "salt and pepper grinder" and "phone case with
 * stand" survive intact. A dangling trailing "a" is dropped as well.
 *
 * @param text Remaining request text; expected to have been trimmed before
 *             any phrase was removed, so leading whitespace signals a
 *             removal at the very start.
 * @returns The cleaned search query with single spaces between words.
 */
function cleanQuery(text: string): string {
  const trailingConnectors = /(?:(?:^|\s+)(?:that|and|with|have|or|a))+$/i;
  const leadingConnectors = /^(?:(?:that|and|with|have|or)(?:\s+|$))+/i;

  return text
    .split(/\s{2,}/)
    .map((segment, index) => {
      // A segment follows a removed phrase when it is not the first one, or
      // when the text itself starts with whitespace.
      const followsGap = index > 0 || /^\s/.test(segment);
      const core = segment.trim().replace(trailingConnectors, "");
      return followsGap ? core.replace(leadingConnectors, "") : core;
    })
    .filter((segment) => segment !== "")
    .join(" ")
    .replace(/\s+/g, " ")
    .trim();
}
|
||||
|
||||
/**
 * Replaces the matched phrase in `text` with a single space.
 *
 * The phrase is cut at the position the regex actually matched
 * (`match.index`). Replacing by substring would hit the first textual
 * occurrence instead, which can be an earlier, unrelated one: in
 * "laptop 5 stand top 5" the phrase "top 5" also occurs inside
 * "laptop 5".
 *
 * @param text  Text the match was produced from.
 * @param match Result of `text.match(...)`, or `null` for "no match".
 * @returns `text` unchanged when `match` is `null`, otherwise `text` with
 *          the matched span replaced by one space.
 */
function removeMatched(text: string, match: RegExpMatchArray | null): string {
  if (!match) {
    return text;
  }

  const phrase = match[0];
  const start = match.index;
  // Fall back to substring replacement if the match carries no usable
  // position for this text (e.g. it came from a global match).
  if (start === undefined || text.slice(start, start + phrase.length) !== phrase) {
    return text.replace(phrase, " ");
  }

  return `${text.slice(0, start)} ${text.slice(start + phrase.length)}`;
}
|
||||
|
||||
/**
 * Splits a free-text shopping request into a search query, structured
 * filters and an optional result limit.
 *
 * Recognised phrases are removed from the text one kind at a time, in this
 * order: result limit ("return/limit/top N"), maximum unit price
 * ("under $4 each"), maximum price ("under $30"), minimum review count
 * ("over 200 reviews", "at least 500 reviews") and minimum rating
 * ("more than 4.5 stars", "4.3 stars or better"). "over/more than/above"
 * phrasings are recorded as exclusive (`"gt"`), "at least/minimum/or
 * better" as inclusive (`"gte"`). Whatever text remains becomes the query.
 *
 * Dollar amounts may use thousands separators ("under $1,200"). A limit
 * phrase with a value below one is removed from the query but does not set
 * `limit`, so callers never receive a non-positive limit.
 *
 * @param input Raw request text.
 * @returns The cleaned query, the extracted filters and the limit (if any).
 */
export function parseNaturalLanguageRequest(input: string): ParsedNaturalLanguageRequest {
  let remaining = input.trim();
  const filters: ProductFilters = {
    includeKeywords: [],
    excludeKeywords: []
  };
  let limit: number | undefined;

  const limitMatch = remaining.match(/\b(?:return|limit|top)\s+(\d{1,3})\b/i);
  if (limitMatch) {
    const requested = Number(limitMatch[1]);
    if (requested >= 1) {
      limit = requested;
    }
    remaining = removeMatched(remaining, limitMatch);
  }

  // Unit price is checked before plain price so that "under $4 each" is not
  // consumed as a whole-product price.
  const unitPriceMatch = remaining.match(/\b(?:cost\s+)?(?:less than|under|below)\s+\$((?:[0-9]{1,3}(?:,[0-9]{3})+|[0-9]+)(?:\.[0-9]{1,2})?)\s*(?:each|per\b|\/\s*(?:count|unit|item))\b/i);
  if (unitPriceMatch) {
    filters.maxUnitPrice = Number(unitPriceMatch[1].replace(/,/g, ""));
    remaining = removeMatched(remaining, unitPriceMatch);
  }

  const maxPriceMatch = remaining.match(/\b(?:cost\s+)?(?:less than|under|below)\s+\$((?:[0-9]{1,3}(?:,[0-9]{3})+|[0-9]+)(?:\.[0-9]{1,2})?)\b/i);
  if (maxPriceMatch) {
    filters.maxPrice = Number(maxPriceMatch[1].replace(/,/g, ""));
    remaining = removeMatched(remaining, maxPriceMatch);
  }

  // Reviews are handled before ratings so "over 200 reviews" is gone by the
  // time the looser rating patterns run.
  const exclusiveReviews = remaining.match(/\b(?:over|more than|above)\s+([0-9][0-9,]*)\s*(?:reviews?|ratings?)\b/i);
  const inclusiveReviews = remaining.match(/\b(?:at least|minimum|min\.?)\s+([0-9][0-9,]*)\s*(?:reviews?|ratings?)\b/i);
  const reviewMatch = exclusiveReviews ?? inclusiveReviews;
  if (reviewMatch) {
    filters.minReviews = Number(reviewMatch[1].replace(/,/g, ""));
    filters.reviewCountComparison = exclusiveReviews ? "gt" : "gte";
    remaining = removeMatched(remaining, reviewMatch);
  }

  const exclusiveRating = remaining.match(/\b(?:a\s+)?(?:review score of\s+)?(?:more than|over|above|rated above)\s+([0-5](?:\.[0-9])?)\s*(?:stars?)?\b/i);
  const inclusiveRating = remaining.match(/\b([0-5](?:\.[0-9])?)\s*stars?\s+or\s+better\b/i)
    ?? remaining.match(/\b(?:at least|minimum|min\.?)\s+([0-5](?:\.[0-9])?)\s*(?:stars?|rating)?\b/i);
  const ratingMatch = exclusiveRating ?? inclusiveRating;
  if (ratingMatch) {
    filters.minRating = Number(ratingMatch[1]);
    filters.ratingComparison = exclusiveRating ? "gt" : "gte";
    remaining = removeMatched(remaining, ratingMatch);
  }

  return {
    query: cleanQuery(remaining),
    filters,
    limit
  };
}
|
||||
@@ -0,0 +1,144 @@
|
||||
import { HTMLElement, parse } from "node-html-parser";
|
||||
|
||||
import { parseMoney, parseRating, parseReviewCount, parseUnitPrice } from "./parsers.js";
|
||||
import type { DeliverySummary, ProductSearchResult, SearchPageExtraction } from "./types.js";
|
||||
|
||||
/**
 * Returns a node's text content with whitespace runs collapsed to single
 * spaces and the ends trimmed; a missing node yields an empty string.
 */
function textOf(node: HTMLElement | null | undefined): string {
  if (node === null || node === undefined) {
    return "";
  }
  const collapsed = node.textContent.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
||||
|
||||
/**
 * Reads an attribute from a node, mapping "no node" and "no such
 * attribute" (null or undefined) to `undefined`. An empty attribute value
 * is returned as an empty string.
 */
function attrOf(node: HTMLElement | null | undefined, name: string): string | undefined {
  if (node === null || node === undefined) {
    return undefined;
  }
  const value = node.getAttribute(name);
  return value === null || value === undefined ? undefined : value;
}
|
||||
|
||||
/**
 * Resolves a link found on an Amazon page to an absolute URL, accepting it
 * only when it really points at www.amazon.com.
 *
 * The decision is made on the parsed hostname, never on a string prefix:
 * "https://www.amazon.com.evil.example/..." and
 * "https://www.amazon.com@evil.example/..." both begin with the Amazon
 * origin text but lead to a different host, so they are rejected.
 *
 * @param href       Link target, absolute or relative to `currentUrl`.
 * @param currentUrl URL of the page the link was found on.
 * @returns The normalised absolute URL, or `undefined` when `href` is
 *          empty, unparseable, not http(s), or on another host.
 */
function absoluteAmazonUrl(href: string | undefined, currentUrl = "https://www.amazon.com/"): string | undefined {
  if (!href) {
    return undefined;
  }

  let parsed: URL;
  try {
    parsed = new URL(href, currentUrl);
  } catch {
    // Malformed href or base URL: treat as "no usable link".
    return undefined;
  }

  const isWebUrl = parsed.protocol === "https:" || parsed.protocol === "http:";
  if (!isWebUrl || parsed.hostname !== "www.amazon.com") {
    return undefined;
  }
  return parsed.toString();
}
|
||||
|
||||
/**
 * Builds the canonical "https://www.amazon.com/dp/<ASIN>" URL for a card.
 *
 * When the card's link resolves to an Amazon URL whose path contains a
 * "/dp/<id>" or "/gp/product/<id>" segment, that id (upper-cased) is used.
 * In every other case the card's own `asin` is used.
 *
 * @param asin       ASIN taken from the card.
 * @param href       Link target found in the card, if any.
 * @param currentUrl URL of the page, used to resolve relative links.
 */
function normalizeProductUrl(asin: string, href: string | undefined, currentUrl: string): string {
  const fallback = `https://www.amazon.com/dp/${asin}`;

  const absolute = absoluteAmazonUrl(href, currentUrl);
  if (!absolute) {
    return fallback;
  }

  let pathname: string;
  try {
    pathname = new URL(absolute).pathname;
  } catch {
    return fallback;
  }

  const linkedId = /\/(?:dp|gp\/product)\/([A-Z0-9]{8,14})/i.exec(pathname)?.[1];
  return linkedId ? `https://www.amazon.com/dp/${linkedId.toUpperCase()}` : fallback;
}
|
||||
|
||||
/**
 * Reports whether the raw HTML contains any phrase associated with a
 * robot check, CAPTCHA or blocked-access page (case-insensitive).
 */
function detectChallenge(html: string): boolean {
  const challengeMarkers = /robot check|enter the characters you see|captcha|automated access|access denied/i;
  return html.search(challengeMarkers) !== -1;
}
|
||||
|
||||
/**
 * Summarises the delivery promise mentioned in a card's text.
 *
 * The summary starts at the first "delivery" (with a preceding "FREE" if
 * present) and, when a day word ("Today", "Tomorrow" or a weekday) follows
 * within the same sentence and within 60 characters, extends through that
 * day. The day has to be part of an optional group together with the text
 * leading up to it; a lazy run followed by a separate optional day never
 * captures the day.
 *
 * @param text Visible text of a product card.
 * @returns `display` (the matched phrase), `free` (the phrase mentions
 *          "free") and `prime` (the whole text mentions "prime"), or
 *          `undefined` when no delivery phrase is present.
 */
function deliveryFromText(text: string): DeliverySummary | undefined {
  const compact = text.replace(/\s+/g, " ").trim();
  const deliveryMatch = compact.match(
    /((?:FREE\s+)?delivery(?:[^.]{0,60}?\b(?:Tomorrow|Today|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b)?)/i
  );
  if (!deliveryMatch) {
    return undefined;
  }

  const display = deliveryMatch[1].trim();
  return {
    display,
    free: /\bfree\b/i.test(display),
    prime: /\bprime\b/i.test(compact)
  };
}
|
||||
|
||||
/**
 * Returns the text of the first selector (in the given order) that
 * matches an element with non-empty text inside `card`, or an empty
 * string when none does.
 */
function firstText(card: HTMLElement, selectors: string[]): string {
  // Once a non-empty text is found, `||` short-circuits and later selectors
  // are not queried.
  return selectors.reduce(
    (found, selector) => found || textOf(card.querySelector(selector)),
    ""
  );
}
|
||||
|
||||
/**
 * Finds the first secondary-text element or span in `card` whose text
 * parses as a unit price and returns that text; returns an empty string
 * when no element qualifies.
 */
function firstUnitPriceText(card: HTMLElement): string {
  const candidateTexts = card
    .querySelectorAll(".a-color-secondary, .a-size-base, span")
    .map((node) => textOf(node));
  const unitPriceText = candidateTexts.find((candidate) => parseUnitPrice(candidate) !== undefined);
  return unitPriceText ?? "";
}
|
||||
|
||||
/**
 * Extracts product candidates and the next-page link from the HTML of an
 * Amazon search results page.
 *
 * A page that looks like a robot check or block page yields status
 * "challenge" with no products. Otherwise every element carrying a
 * plausible `data-asin` (8-14 alphanumeric characters) is treated as a
 * card. Cards without a title are skipped, and an ASIN is reported only
 * once even if several (for example nested) elements carry it.
 *
 * Rating and review count are read from the card's aria-labels first and
 * from its visible text when the aria-labels do not contain them.
 *
 * @param html       Raw HTML of the search results page.
 * @param currentUrl URL the HTML was loaded from; used to resolve relative
 *                   links.
 * @returns The extraction status, the products found, any warnings and the
 *          absolute next-page URL when one is present.
 */
export function extractSearchPage(html: string, currentUrl: string): SearchPageExtraction {
  if (detectChallenge(html)) {
    return {
      status: "challenge",
      products: [],
      warnings: ["Amazon returned a challenge or blocked page; stopping without bypass."]
    };
  }

  const root = parse(html);
  const cards = root.querySelectorAll("[data-asin]")
    .filter((card) => /^[A-Z0-9]{8,14}$/i.test(card.getAttribute("data-asin") ?? ""));
  const products: ProductSearchResult[] = [];
  const seenAsins = new Set<string>();

  for (const card of cards) {
    const asin = (card.getAttribute("data-asin") ?? "").toUpperCase();
    if (seenAsins.has(asin)) {
      continue;
    }

    const link = card.querySelector("h2 a") ?? card.querySelector("a[href*='/dp/']") ?? card.querySelector("a[href*='/gp/product/']");
    const title = textOf(link) || firstText(card, ["h2", "[data-cy='title-recipe']"]);
    if (!title) {
      continue;
    }

    const priceText = firstText(card, [".a-price .a-offscreen", ".a-price"]);
    const allText = textOf(card);
    const unitPriceText = firstUnitPriceText(card);
    const ariaText = card.querySelectorAll("[aria-label]")
      .map((node) => attrOf(node, "aria-label") ?? "")
      .join(" ");
    const delivery = deliveryFromText(allText);

    // Only mark the ASIN as seen once a product is actually emitted, so a
    // title-less first occurrence does not hide a later complete card.
    seenAsins.add(asin);
    products.push({
      asin,
      title,
      url: normalizeProductUrl(asin, attrOf(link, "href"), currentUrl),
      imageUrl: attrOf(card.querySelector("img"), "src"),
      price: parseMoney(priceText),
      unitPrice: parseUnitPrice(unitPriceText),
      // Fall back per field: unrelated aria-labels must not hide a rating
      // or review count that is present in the visible text.
      rating: parseRating(ariaText) ?? parseRating(allText),
      reviewCount: parseReviewCount(ariaText) ?? parseReviewCount(allText),
      delivery,
      specs: [],
      bullets: [],
      isSponsored: /\bsponsored\b/i.test(allText),
      matchedFilters: [],
      missingFields: [],
      extractionNotes: []
    });
  }

  const nextHref = attrOf(root.querySelector(".s-pagination-next[href]"), "href");

  return {
    status: "ok",
    products,
    warnings: [],
    nextPageUrl: absoluteAmazonUrl(nextHref, currentUrl)
  };
}
|
||||
@@ -10,7 +10,9 @@ export interface SearchProductsRequest {
|
||||
|
||||
export interface ProductFilters {
|
||||
minRating?: number;
|
||||
ratingComparison?: "gt" | "gte";
|
||||
minReviews?: number;
|
||||
reviewCountComparison?: "gt" | "gte";
|
||||
maxPrice?: number;
|
||||
maxUnitPrice?: number;
|
||||
includeKeywords: string[];
|
||||
@@ -82,3 +84,22 @@ export interface SearchProductsResponse {
|
||||
automation: "web-automation/CloakBrowser";
|
||||
};
|
||||
}
|
||||
|
||||
export interface ParsedNaturalLanguageRequest {
|
||||
query: string;
|
||||
filters: ProductFilters;
|
||||
limit?: number;
|
||||
}
|
||||
|
||||
export interface UnitCountExtraction {
|
||||
count: number;
|
||||
confidence: "high" | "medium" | "low";
|
||||
source: string;
|
||||
}
|
||||
|
||||
export interface SearchPageExtraction {
|
||||
status: "ok" | "challenge";
|
||||
products: ProductSearchResult[];
|
||||
warnings: string[];
|
||||
nextPageUrl?: string;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
import { access } from "node:fs/promises";
|
||||
import { constants } from "node:fs";
|
||||
import { dirname, join, resolve } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
export interface RuntimeResolverOptions {
|
||||
env?: NodeJS.ProcessEnv;
|
||||
homeDir?: string;
|
||||
skillDir?: string;
|
||||
}
|
||||
|
||||
export interface WebAutomationRuntime {
|
||||
scriptsDir: string;
|
||||
checkInstall: {
|
||||
cwd: string;
|
||||
command: string;
|
||||
args: string[];
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Resolves when `path` exists; otherwise rejects with an error naming the
 * missing runtime component (`label`) and its path.
 */
async function assertFile(path: string, label: string): Promise<void> {
  const exists = await access(path, constants.F_OK).then(
    () => true,
    () => false
  );
  if (!exists) {
    throw new Error(`web-automation runtime is missing ${label}: ${path}`);
  }
}
|
||||
|
||||
/**
 * Resolves when `path` is executable. If the executable check fails, the
 * path only has to exist; if it does not, the rejection from `assertFile`
 * propagates.
 */
async function assertExecutableOrFile(path: string, label: string): Promise<void> {
  const executable = await access(path, constants.X_OK).then(
    () => true,
    () => false
  );
  if (executable) {
    return;
  }
  await assertFile(path, label);
}
|
||||
|
||||
/**
 * Returns the parent of the directory that contains this module file.
 */
function defaultSkillDir(): string {
  const thisFile = fileURLToPath(import.meta.url);
  const moduleDir = dirname(thisFile);
  return resolve(moduleDir, "..");
}
|
||||
|
||||
/**
 * Locates a usable web-automation scripts directory.
 *
 * Candidates are checked in this order and the first complete one wins:
 *   1. `AMAZON_SHOPPING_WEB_AUTOMATION_DIR` from the environment,
 *   2. `<home>/.openclaw/workspace/skills/web-automation/scripts`,
 *   3. `<skillDir>/../web-automation/scripts`.
 * A candidate is complete when it contains `check-install.js`,
 * `package.json` and `node_modules/.bin/tsx`.
 *
 * The home directory defaults to `HOME` (or `USERPROFILE`) taken from the
 * same `env` object that supplies the override variable, so an injected
 * environment is honoured consistently instead of being mixed with the
 * real process environment.
 *
 * @param options Optional overrides for the environment, home directory
 *                and skill directory.
 * @returns The scripts directory and the command that checks its install.
 * @throws Error listing every missing file when no candidate is complete.
 */
export async function resolveWebAutomationRuntime(options: RuntimeResolverOptions = {}): Promise<WebAutomationRuntime> {
  const env = options.env ?? process.env;
  const homeDir = options.homeDir ?? env.HOME ?? env.USERPROFILE ?? "";
  const skillDir = options.skillDir ?? defaultSkillDir();
  const candidates = [
    env.AMAZON_SHOPPING_WEB_AUTOMATION_DIR,
    homeDir ? join(homeDir, ".openclaw", "workspace", "skills", "web-automation", "scripts") : undefined,
    resolve(skillDir, "..", "web-automation", "scripts")
  ].filter((candidate): candidate is string => Boolean(candidate));

  const errors: string[] = [];
  for (const scriptsDir of candidates) {
    try {
      // Checked one after another so the reported error is always the first
      // missing file of each candidate.
      await assertFile(join(scriptsDir, "check-install.js"), "check-install.js");
      await assertFile(join(scriptsDir, "package.json"), "package.json");
      await assertExecutableOrFile(join(scriptsDir, "node_modules", ".bin", "tsx"), "node_modules/.bin/tsx");
      return {
        scriptsDir,
        checkInstall: {
          cwd: scriptsDir,
          command: "node",
          args: ["check-install.js"]
        }
      };
    } catch (error: unknown) {
      // Remember why this candidate failed and try the next one.
      errors.push(error instanceof Error ? error.message : String(error));
    }
  }

  throw new Error(`Unable to locate usable web-automation runtime.\n${errors.join("\n")}`);
}
|
||||
@@ -64,6 +64,20 @@ describe("amazon-shopping CLI", () => {
|
||||
assert.equal(parseCliRequest(["usb c cable", "--json", "--markdown"]).output, "both");
|
||||
});
|
||||
|
||||
it("normalizes natural-language filters for the target request", () => {
|
||||
const request = parseCliRequest([
|
||||
"100w led bulbs that cost less than $4 each and have over 200 reviews with a review score of more than 4.5 stars",
|
||||
"--dry-run"
|
||||
]);
|
||||
|
||||
assert.equal(request.query, "100w led bulbs");
|
||||
assert.equal(request.filters.maxUnitPrice, 4);
|
||||
assert.equal(request.filters.minReviews, 200);
|
||||
assert.equal(request.filters.reviewCountComparison, "gt");
|
||||
assert.equal(request.filters.minRating, 4.5);
|
||||
assert.equal(request.filters.ratingComparison, "gt");
|
||||
});
|
||||
|
||||
it("rejects limits below one", () => {
|
||||
assert.throws(
|
||||
() => parseCliRequest(["usb c cable", "--limit", "0"]),
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
# Fixtures
|
||||
|
||||
Fixtures in this directory are hand-crafted sanitized HTML snippets. They are not live Amazon snapshots and contain no cookies, account details, delivery location, scripts, tracking identifiers, or browser profile data.
|
||||
@@ -0,0 +1,23 @@
|
||||
<!-- Hand-crafted sanitized fixture. Not a live Amazon snapshot. -->
|
||||
<html>
|
||||
<body>
|
||||
<div data-component-type="s-search-result" data-asin="B0TESTLED1">
|
||||
<h2><a class="a-link-normal s-line-clamp-2" href="/Bright-Daylight-Equivalent/dp/B0TESTLED1/ref=sr_1_1">Bright Daylight 100W Equivalent LED Bulbs, 50 Count</a></h2>
|
||||
<span class="a-price"><span class="a-offscreen">$18.99</span></span>
|
||||
<span class="a-size-base a-color-secondary">$0.38/Count</span>
|
||||
<span aria-label="4.6 out of 5 stars"></span>
|
||||
<a aria-label="1,234 ratings"></a>
|
||||
<div class="a-row a-size-base a-color-secondary">FREE delivery Tomorrow</div>
|
||||
<img class="s-image" src="https://m.media-amazon.com/images/I/test-led.jpg" />
|
||||
</div>
|
||||
<div data-component-type="s-search-result" data-asin="B0TESTLED2">
|
||||
<span>Sponsored</span>
|
||||
<h2><a href="https://www.amazon.com/gp/product/B0TESTLED2">Value LED Bulbs Soft White, Pack of 24</a></h2>
|
||||
<span class="a-price"><span class="a-offscreen">$21.99</span></span>
|
||||
<span aria-label="4.3 out of 5 stars"></span>
|
||||
<a aria-label="543 ratings"></a>
|
||||
<div>Delivery Friday</div>
|
||||
</div>
|
||||
<a class="s-pagination-next" href="/s?k=led+bulbs&page=2">Next</a>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,75 @@
|
||||
import assert from "node:assert/strict";
|
||||
import { describe, it } from "node:test";
|
||||
|
||||
import {
|
||||
extractUnitCount,
|
||||
parseMoney,
|
||||
parseRating,
|
||||
parseReviewCount,
|
||||
parseStarBreakdown,
|
||||
parseUnitPrice
|
||||
} from "../src/parsers.js";
|
||||
|
||||
describe("parsers", () => {
|
||||
it("parses USD money", () => {
|
||||
assert.deepEqual(parseMoney("$19.99"), { amount: 19.99, currency: "USD", display: "$19.99" });
|
||||
});
|
||||
|
||||
it("parses rating text", () => {
|
||||
assert.equal(parseRating("4.6 out of 5 stars"), 4.6);
|
||||
});
|
||||
|
||||
it("parses review count text", () => {
|
||||
assert.equal(parseReviewCount("1,234 ratings"), 1234);
|
||||
});
|
||||
|
||||
it("parses visible star histogram percentages", () => {
|
||||
assert.deepEqual(parseStarBreakdown("5 star 72% 4 star 15% 3 star 7% 2 star 3% 1 star 3%"), {
|
||||
five: 72,
|
||||
four: 15,
|
||||
three: 7,
|
||||
two: 3,
|
||||
one: 3,
|
||||
basis: "percent"
|
||||
});
|
||||
});
|
||||
|
||||
it("extracts high-confidence unit counts", () => {
|
||||
assert.deepEqual(extractUnitCount("LED bulbs, 100 Count, daylight"), {
|
||||
count: 100,
|
||||
confidence: "high",
|
||||
source: "100 Count"
|
||||
});
|
||||
assert.deepEqual(extractUnitCount("Pack of 6 USB-C cables"), {
|
||||
count: 6,
|
||||
confidence: "high",
|
||||
source: "Pack of 6"
|
||||
});
|
||||
});
|
||||
|
||||
it("distinguishes lower-confidence unit count phrases", () => {
|
||||
assert.deepEqual(extractUnitCount("Set of 8 replacement filters"), {
|
||||
count: 8,
|
||||
confidence: "medium",
|
||||
source: "Set of 8"
|
||||
});
|
||||
assert.deepEqual(extractUnitCount("6 bulbs soft white"), {
|
||||
count: 6,
|
||||
confidence: "low",
|
||||
source: "6 bulbs"
|
||||
});
|
||||
});
|
||||
|
||||
it("parses visible unit prices", () => {
|
||||
assert.deepEqual(parseUnitPrice("$0.33/Count"), {
|
||||
amount: 0.33,
|
||||
currency: "USD",
|
||||
display: "$0.33/Count"
|
||||
});
|
||||
});
|
||||
|
||||
it("parses whole-dollar and one-decimal prices", () => {
|
||||
assert.deepEqual(parseMoney("$20"), { amount: 20, currency: "USD", display: "$20" });
|
||||
assert.deepEqual(parseMoney("$19.9"), { amount: 19.9, currency: "USD", display: "$19.9" });
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,37 @@
|
||||
import assert from "node:assert/strict";
|
||||
import { describe, it } from "node:test";
|
||||
|
||||
import { parseNaturalLanguageRequest } from "../src/query-parser.js";
|
||||
|
||||
describe("parseNaturalLanguageRequest", () => {
|
||||
it("extracts the target LED bulb filters from natural language", () => {
|
||||
const parsed = parseNaturalLanguageRequest(
|
||||
"100w led bulbs that cost less than $4 each and have over 200 reviews with a review score of more than 4.5 stars"
|
||||
);
|
||||
|
||||
assert.equal(parsed.query, "100w led bulbs");
|
||||
assert.equal(parsed.filters.maxUnitPrice, 4);
|
||||
assert.equal(parsed.filters.minReviews, 200);
|
||||
assert.equal(parsed.filters.reviewCountComparison, "gt");
|
||||
assert.equal(parsed.filters.minRating, 4.5);
|
||||
assert.equal(parsed.filters.ratingComparison, "gt");
|
||||
});
|
||||
|
||||
it("distinguishes inclusive review and rating phrasing", () => {
|
||||
const parsed = parseNaturalLanguageRequest("usb c charger at least 500 reviews and 4.3 stars or better");
|
||||
|
||||
assert.equal(parsed.query, "usb c charger");
|
||||
assert.equal(parsed.filters.minReviews, 500);
|
||||
assert.equal(parsed.filters.reviewCountComparison, "gte");
|
||||
assert.equal(parsed.filters.minRating, 4.3);
|
||||
assert.equal(parsed.filters.ratingComparison, "gte");
|
||||
});
|
||||
|
||||
it("extracts limit and max product price phrases", () => {
|
||||
const parsed = parseNaturalLanguageRequest("return 5 wireless mouse under $30");
|
||||
|
||||
assert.equal(parsed.query, "wireless mouse");
|
||||
assert.equal(parsed.limit, 5);
|
||||
assert.equal(parsed.filters.maxPrice, 30);
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,65 @@
|
||||
import assert from "node:assert/strict";
|
||||
import { readFile } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import { describe, it } from "node:test";
|
||||
|
||||
import { extractSearchPage } from "../src/search-page.js";
|
||||
|
||||
const fixturePath = join(import.meta.dirname, "fixtures", "search-results.html");
|
||||
|
||||
describe("extractSearchPage", () => {
|
||||
it("extracts normalized product candidates from sanitized search HTML", async () => {
|
||||
const html = await readFile(fixturePath, "utf8");
|
||||
const extracted = extractSearchPage(html, "https://www.amazon.com/s?k=led+bulbs");
|
||||
|
||||
assert.equal(extracted.status, "ok");
|
||||
assert.equal(extracted.products.length, 2);
|
||||
assert.equal(extracted.products[0]?.asin, "B0TESTLED1");
|
||||
assert.equal(extracted.products[0]?.url, "https://www.amazon.com/dp/B0TESTLED1");
|
||||
assert.equal(extracted.products[0]?.price?.amount, 18.99);
|
||||
assert.equal(extracted.products[0]?.unitPrice?.amount, 0.38);
|
||||
assert.equal(extracted.products[0]?.rating, 4.6);
|
||||
assert.equal(extracted.products[0]?.reviewCount, 1234);
|
||||
assert.equal(extracted.products[0]?.delivery?.free, true);
|
||||
assert.equal(extracted.products[0]?.isSponsored, false);
|
||||
assert.equal(extracted.products[1]?.isSponsored, true);
|
||||
assert.equal(extracted.nextPageUrl, "https://www.amazon.com/s?k=led+bulbs&page=2");
|
||||
});
|
||||
|
||||
it("detects Amazon challenge pages", () => {
|
||||
const extracted = extractSearchPage("<html><title>Robot Check</title><body>Enter the characters you see below</body></html>", "https://www.amazon.com/s?k=x");
|
||||
|
||||
assert.equal(extracted.status, "challenge");
|
||||
assert.match(extracted.warnings[0] ?? "", /challenge/i);
|
||||
assert.equal(extracted.products.length, 0);
|
||||
});
|
||||
|
||||
it("returns ok with no products for empty or cardless pages", () => {
|
||||
const extracted = extractSearchPage("<html><body>No results</body></html>", "https://www.amazon.com/s?k=x");
|
||||
|
||||
assert.equal(extracted.status, "ok");
|
||||
assert.deepEqual(extracted.products, []);
|
||||
assert.equal(extracted.nextPageUrl, undefined);
|
||||
});
|
||||
|
||||
it("skips malformed ASINs and cards without titles", () => {
|
||||
const extracted = extractSearchPage(`
|
||||
<div data-asin="bad"><h2><a href="/dp/bad">Bad ASIN</a></h2></div>
|
||||
<div data-asin="B0VALID1234"></div>
|
||||
`, "https://www.amazon.com/s?k=x");
|
||||
|
||||
assert.equal(extracted.status, "ok");
|
||||
assert.equal(extracted.products.length, 0);
|
||||
});
|
||||
|
||||
it("keeps candidates with missing price and records missing price later", () => {
|
||||
const extracted = extractSearchPage(`
|
||||
<div data-asin="B0NOPRICE1">
|
||||
<h2><a href="/dp/B0NOPRICE1">No Price Product</a></h2>
|
||||
</div>
|
||||
`, "https://www.amazon.com/s?k=x");
|
||||
|
||||
assert.equal(extracted.products.length, 1);
|
||||
assert.equal(extracted.products[0]?.price, undefined);
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,46 @@
|
||||
import assert from "node:assert/strict";
|
||||
import { mkdtemp, mkdir, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { describe, it } from "node:test";
|
||||
|
||||
import { resolveWebAutomationRuntime } from "../src/web-automation-runtime.js";
|
||||
|
||||
async function createRuntime() {
|
||||
const dir = await mkdtemp(join(tmpdir(), "amazon-shopping-runtime-"));
|
||||
await writeFile(join(dir, "check-install.js"), "console.log('ok');\n");
|
||||
await writeFile(join(dir, "package.json"), "{\"type\":\"module\"}\n");
|
||||
await mkdir(join(dir, "node_modules", ".bin"), { recursive: true });
|
||||
await writeFile(join(dir, "node_modules", ".bin", "tsx"), "#!/usr/bin/env node\n");
|
||||
return dir;
|
||||
}
|
||||
|
||||
describe("resolveWebAutomationRuntime", () => {
|
||||
it("uses AMAZON_SHOPPING_WEB_AUTOMATION_DIR first", async () => {
|
||||
const runtimeDir = await createRuntime();
|
||||
const resolved = await resolveWebAutomationRuntime({
|
||||
env: { AMAZON_SHOPPING_WEB_AUTOMATION_DIR: runtimeDir },
|
||||
homeDir: "/missing-home",
|
||||
skillDir: "/missing-skill"
|
||||
});
|
||||
|
||||
assert.equal(resolved.scriptsDir, runtimeDir);
|
||||
assert.deepEqual(resolved.checkInstall, {
|
||||
cwd: runtimeDir,
|
||||
command: "node",
|
||||
args: ["check-install.js"]
|
||||
});
|
||||
});
|
||||
|
||||
it("returns a clear error when required files are missing", async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), "amazon-shopping-runtime-missing-"));
|
||||
await assert.rejects(
|
||||
() => resolveWebAutomationRuntime({
|
||||
env: { AMAZON_SHOPPING_WEB_AUTOMATION_DIR: dir },
|
||||
homeDir: "/missing-home",
|
||||
skillDir: "/missing-skill"
|
||||
}),
|
||||
/check-install.js/
|
||||
);
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user