feat(amazon-shopping): parse filters and extract search candidates

This commit is contained in:
2026-04-15 18:31:44 -05:00
parent 8ad532545d
commit ef326896f4
15 changed files with 822 additions and 10 deletions
+7 -8
View File
@@ -3,6 +3,7 @@
import minimist from "minimist";
import { fileURLToPath } from "node:url";
import { parseNaturalLanguageRequest } from "./query-parser.js";
import type { ProductFilters, SearchProductsRequest, SearchProductsResponse } from "./types.js";
export interface CliDeps {
@@ -76,12 +77,13 @@ export function parseCliRequest(argv: string[]): SearchProductsRequest {
alias: { h: "help" }
});
const query = String(args.query ?? args._.join(" ")).trim();
if (!query) {
const rawQuery = String(args.query ?? args._.join(" ")).trim();
if (!rawQuery) {
throw new Error("A product query is required");
}
const limit = parsePositiveInteger(args.limit, "limit") ?? 15;
const natural = parseNaturalLanguageRequest(rawQuery);
const limit = parsePositiveInteger(args.limit, "limit") ?? natural.limit ?? 15;
if (limit > 30 && !args["allow-large-limit"]) {
throw new Error("Requested limits above 30 require --allow-large-limit or a batched run");
}
@@ -91,10 +93,7 @@ export function parseCliRequest(argv: string[]): SearchProductsRequest {
throw new Error("max-search-pages must be 5 or less");
}
const filters: ProductFilters = {
includeKeywords: [],
excludeKeywords: []
};
const filters: ProductFilters = { ...natural.filters };
const minRating = parseNumber(args["min-rating"], "min-rating");
const minReviews = parsePositiveInteger(args["min-reviews"], "min-reviews");
const maxPrice = parseNumber(args["max-price"], "max-price");
@@ -108,7 +107,7 @@ export function parseCliRequest(argv: string[]): SearchProductsRequest {
const markdown = Boolean(args.markdown);
return {
query,
query: natural.query || rawQuery,
filters,
limit,
maxSearchPages,
+112
View File
@@ -0,0 +1,112 @@
import type { MoneyValue, StarBreakdown, UnitCountExtraction } from "./types.js";
/**
 * Extracts the first US-dollar amount found in a piece of text.
 *
 * Whitespace runs are collapsed to single spaces before matching, and thousands
 * separators are stripped from the captured number.
 *
 * @param text - Raw text that may contain a `$` amount.
 * @returns The parsed amount with currency `"USD"` and the whitespace-normalized
 *   input as `display`, or `undefined` when the text is empty or contains no
 *   parsable dollar amount.
 */
export function parseMoney(text: string | undefined | null): MoneyValue | undefined {
  const normalized = text ? text.replace(/\s+/g, " ").trim() : "";
  const dollars = /\$\s*([0-9][0-9,]*(?:\.[0-9]{1,2})?)/.exec(normalized);
  if (dollars === null) {
    return undefined;
  }

  const amount = Number(dollars[1].replace(/,/g, ""));
  return Number.isFinite(amount)
    ? { amount, currency: "USD", display: normalized }
    : undefined;
}
/**
 * Parses a per-unit price such as `$0.54/Count`, `$2.10 per ounce` or `$3 each`.
 *
 * The text must contain a `$` amount or a `/` followed by a digit, plus a unit
 * marker (`/`, `per`, `each`, `count`). When several dollar amounts are present
 * (for example `"$12.99 ($0.54/Count)"`), the amount that is directly followed
 * by `/`, `per` or `each` is used; previously the first amount in the text won,
 * which reported the item price as the unit price. If no amount is adjacent to
 * a unit marker, the first dollar amount is returned via `parseMoney`.
 *
 * @param text - Raw text that may describe a unit price.
 * @returns The unit price with the whitespace-normalized input as `display`,
 *   or `undefined` when the text does not look like a unit price.
 */
export function parseUnitPrice(text: string | undefined | null): MoneyValue | undefined {
  if (!text || !(/[/]\s*\d|\$\s*\d/.test(text))) {
    return undefined;
  }
  if (!/(\/|\bper\b|\beach\b|\bcount\b)/i.test(text)) {
    return undefined;
  }

  // Prefer the dollar amount that is immediately tied to a unit marker.
  const compact = text.replace(/\s+/g, " ").trim();
  const adjacent = compact.match(/\$\s*([0-9][0-9,]*(?:\.[0-9]{1,2})?)\s*(?:\/|per\b|each\b)/i);
  if (adjacent) {
    const amount = Number(adjacent[1].replace(/,/g, ""));
    if (Number.isFinite(amount)) {
      return {
        amount,
        currency: "USD",
        display: compact
      };
    }
  }

  // No amount sits next to a unit marker: keep the original first-amount behavior.
  return parseMoney(text);
}
/**
 * Reads a star rating out of text such as `"4.5 out of 5 stars"` or `"rated 4.2"`.
 *
 * The "out of 5 stars" form is tried first; the "rated N" form is only used
 * when the first form is absent.
 *
 * @param text - Raw text (for example an aria-label) that may contain a rating.
 * @returns The rating as a number, or `undefined` when no rating is found.
 */
export function parseRating(text: string | undefined | null): number | undefined {
  if (!text) {
    return undefined;
  }

  const ratingPatterns = [
    /([0-5](?:\.[0-9])?)\s*(?:out of\s*)?5\s*stars?/i,
    /\brated\s+([0-5](?:\.[0-9])?)/i
  ];
  for (const pattern of ratingPatterns) {
    const found = pattern.exec(text);
    if (found) {
      const value = Number(found[1]);
      return Number.isFinite(value) ? value : undefined;
    }
  }
  return undefined;
}
/**
 * Reads a review/rating count such as `"1,234 ratings"` or `"87 reviews"`.
 *
 * @param text - Raw text that may contain a count followed by "rating(s)" or
 *   "review(s)".
 * @returns The count with thousands separators removed, or `undefined` when no
 *   count is present.
 */
export function parseReviewCount(text: string | undefined | null): number | undefined {
  const found = text ? /([0-9][0-9,]*)\s*(?:ratings?|reviews?)/i.exec(text) : null;
  if (!found) {
    return undefined;
  }

  const total = Number(found[1].split(",").join(""));
  return Number.isInteger(total) ? total : undefined;
}
/**
 * Parses a star histogram written as `"<1-5> star <n>%"` rows.
 *
 * Every row found is stored under the matching word key (`five` … `one`); when
 * the same star level appears more than once, the last occurrence wins.
 *
 * @param text - Raw text that may contain percentage rows.
 * @returns The collected percentages with `basis: "percent"`, or `undefined`
 *   when the text is empty or contains no percentage rows.
 */
export function parseStarBreakdown(text: string | undefined | null): StarBreakdown | undefined {
  if (!text) {
    return undefined;
  }

  const rows = Array.from(text.matchAll(/([1-5])\s*star\s*([0-9]{1,3})\s*%/gi));
  if (rows.length === 0) {
    return undefined;
  }

  const fieldByStar: Record<string, keyof Omit<StarBreakdown, "basis">> = {
    "5": "five",
    "4": "four",
    "3": "three",
    "2": "two",
    "1": "one"
  };
  const percentages: Partial<Omit<StarBreakdown, "basis">> = {};
  rows.forEach((row) => {
    const field = fieldByStar[row[1]];
    if (field) {
      percentages[field] = Number(row[2]);
    }
  });

  return {
    ...percentages,
    basis: "percent"
  };
}
/**
 * Extracts how many units a listing contains from phrases such as
 * `"48 Count"`, `"pack of 6"`, `"12-pack"`, `"set of 4"` or `"3 cables"`.
 *
 * Patterns are tried in order of decreasing confidence and the first one that
 * yields a positive integer wins.
 *
 * The `count`/`ct` pattern refuses to start inside a longer number: previously
 * `"12345 count"` produced 2345 and `"1.5 ct"` produced 5, because that pattern
 * had no leading boundary while its siblings did. A lookbehind is used instead
 * of `\b` so forms like `"2x100ct"` keep working.
 *
 * @param text - Product title or description text.
 * @returns The count, the confidence of the pattern that matched, and the exact
 *   matched substring as `source`; `undefined` when nothing usable is found.
 */
export function extractUnitCount(text: string | undefined | null): UnitCountExtraction | undefined {
  if (!text) {
    return undefined;
  }

  const patterns = [
    // Not preceded by a digit, nor by a digit plus "." or "," (fragment of 1.5 / 1,000).
    { pattern: /(?<!\d)(?<!\d[.,])(\d{1,4})\s*[- ]?(?:count|ct)\b/i, confidence: "high" as const },
    { pattern: /\bpack\s+of\s+(\d{1,4})\b/i, confidence: "high" as const },
    { pattern: /\b(\d{1,4})\s*[- ]?pack\b/i, confidence: "high" as const },
    { pattern: /\bset\s+of\s+(\d{1,4})\b/i, confidence: "medium" as const },
    { pattern: /\b(\d{1,4})\s+(?:bulbs?|cables?|pieces?|pcs)\b/i, confidence: "low" as const }
  ];

  for (const { pattern, confidence } of patterns) {
    const match = text.match(pattern);
    if (!match) {
      continue;
    }
    const count = Number(match[1]);
    // A zero count (e.g. "0 pack") is not usable; fall through to the next pattern.
    if (Number.isInteger(count) && count > 0) {
      return {
        count,
        confidence,
        source: match[0]
      };
    }
  }
  return undefined;
}
@@ -0,0 +1,68 @@
import type { ParsedNaturalLanguageRequest, ProductFilters } from "./types.js";
/**
 * Tidies what is left of a request after filter phrases have been removed.
 *
 * Filler words (`that`, `and`, `with`, `have`) are dropped, whitespace is
 * collapsed, and a dangling trailing connector (`and`, `or`, `a`) is stripped.
 *
 * The text is trimmed BEFORE the trailing connector is stripped. The connector
 * pattern is anchored with `$`, so with the previous order any trailing space
 * (the normal case after a removed phrase) kept it from ever matching.
 *
 * @param text - Leftover request text.
 * @returns The cleaned search query, without leading or trailing whitespace.
 */
function cleanQuery(text: string): string {
  return text
    .replace(/\b(?:that|and|with|have)\b/gi, " ")
    .replace(/\s+/g, " ")
    .trim()
    .replace(/\s+(and|or|a)$/i, "")
    .trim();
}
/**
 * Replaces the matched span of `text` with a single space.
 *
 * The span is located through `match.index`, so exactly the occurrence the
 * regular expression matched is removed. Replacing by matched string alone
 * removes the first textual occurrence, which can be a different one (e.g. the
 * `top 10` inside `laptop 10` when the pattern matched a later ` top 10`).
 *
 * @param text - The string the match was produced from.
 * @param match - Result of `String.prototype.match`, or `null` for no match.
 * @returns `text` unchanged when `match` is `null`, otherwise `text` with the
 *   matched span replaced by one space.
 */
function removeMatched(text: string, match: RegExpMatchArray | null): string {
  if (!match) {
    return text;
  }

  const matched = match[0];
  const start = match.index;
  // No usable index (e.g. a global-flag match) or a match taken from another
  // string: fall back to replacing the first textual occurrence.
  if (start === undefined || !text.startsWith(matched, start)) {
    return text.replace(matched, " ");
  }
  return `${text.slice(0, start)} ${text.slice(start + matched.length)}`;
}
/**
 * Splits a free-form shopping request into a search query, filters and an
 * optional result limit.
 *
 * Phrases are recognized and removed in this order: result limit
 * (`return|limit|top N`), unit price cap (`under $X each` / `per` / `/count`),
 * total price cap (`under $X`), minimum review count, minimum rating. Whatever
 * text remains is cleaned up and returned as the query.
 *
 * Price amounts accept well-formed thousands separators (`$1,000`, `$12,500.50`).
 * Previously only bare digits were accepted, so `under $1,000` was read as a
 * cap of 1 and `,000` leaked into the query, unlike the review-count patterns,
 * which already accepted commas.
 *
 * @param input - The raw request text.
 * @returns The cleaned query, the extracted filters (keyword lists start
 *   empty), and the limit when one was stated.
 */
export function parseNaturalLanguageRequest(input: string): ParsedNaturalLanguageRequest {
  let remaining = input.trim();
  const filters: ProductFilters = {
    includeKeywords: [],
    excludeKeywords: []
  };
  let limit: number | undefined;

  // Numbers may carry thousands separators; strip them before conversion.
  const toNumber = (raw: string): number => Number(raw.replace(/,/g, ""));

  const limitMatch = remaining.match(/\b(?:return|limit|top)\s+(\d{1,3})\b/i);
  if (limitMatch) {
    limit = Number(limitMatch[1]);
    remaining = removeMatched(remaining, limitMatch);
  }

  // Unit price must be checked before the plain price cap, which would otherwise
  // consume the "under $X" prefix and leave "each"/"per" behind.
  const unitPriceMatch = remaining.match(/\b(?:cost\s+)?(?:less than|under|below)\s+\$((?:[0-9]{1,3}(?:,[0-9]{3})+|[0-9]+)(?:\.[0-9]{1,2})?)\s*(?:each|per\b|\/\s*(?:count|unit|item))\b/i);
  if (unitPriceMatch) {
    filters.maxUnitPrice = toNumber(unitPriceMatch[1]);
    remaining = removeMatched(remaining, unitPriceMatch);
  }

  const maxPriceMatch = remaining.match(/\b(?:cost\s+)?(?:less than|under|below)\s+\$((?:[0-9]{1,3}(?:,[0-9]{3})+|[0-9]+)(?:\.[0-9]{1,2})?)\b/i);
  if (maxPriceMatch) {
    filters.maxPrice = toNumber(maxPriceMatch[1]);
    remaining = removeMatched(remaining, maxPriceMatch);
  }

  // "over N reviews" is exclusive (gt); "at least N reviews" is inclusive (gte).
  const exclusiveReviews = remaining.match(/\b(?:over|more than|above)\s+([0-9][0-9,]*)\s*(?:reviews?|ratings?)\b/i);
  const inclusiveReviews = remaining.match(/\b(?:at least|minimum|min\.?)\s+([0-9][0-9,]*)\s*(?:reviews?|ratings?)\b/i);
  const reviewMatch = exclusiveReviews ?? inclusiveReviews;
  if (reviewMatch) {
    filters.minReviews = toNumber(reviewMatch[1]);
    filters.reviewCountComparison = exclusiveReviews ? "gt" : "gte";
    remaining = removeMatched(remaining, reviewMatch);
  }

  // Ratings are parsed after review counts so "over 500 reviews" is already gone.
  const exclusiveRating = remaining.match(/\b(?:a\s+)?(?:review score of\s+)?(?:more than|over|above|rated above)\s+([0-5](?:\.[0-9])?)\s*(?:stars?)?\b/i);
  const inclusiveRating = remaining.match(/\b([0-5](?:\.[0-9])?)\s*stars?\s+or\s+better\b/i)
    ?? remaining.match(/\b(?:at least|minimum|min\.?)\s+([0-5](?:\.[0-9])?)\s*(?:stars?|rating)?\b/i);
  const ratingMatch = exclusiveRating ?? inclusiveRating;
  if (ratingMatch) {
    filters.minRating = Number(ratingMatch[1]);
    filters.ratingComparison = exclusiveRating ? "gt" : "gte";
    remaining = removeMatched(remaining, ratingMatch);
  }

  return {
    query: cleanQuery(remaining),
    filters,
    limit
  };
}
+144
View File
@@ -0,0 +1,144 @@
import { HTMLElement, parse } from "node-html-parser";
import { parseMoney, parseRating, parseReviewCount, parseUnitPrice } from "./parsers.js";
import type { DeliverySummary, ProductSearchResult, SearchPageExtraction } from "./types.js";
/**
 * Returns a node's text content with whitespace runs collapsed and the ends
 * trimmed.
 *
 * @param node - Element to read, or `null`/`undefined`.
 * @returns The normalized text, or an empty string when there is no node.
 */
function textOf(node: HTMLElement | null | undefined): string {
  if (!node) {
    return "";
  }
  return node.textContent.replace(/\s+/g, " ").trim();
}
/**
 * Reads an attribute from an element, reporting a missing node or a missing
 * attribute as `undefined`.
 *
 * @param node - Element to read, or `null`/`undefined`.
 * @param name - Attribute name.
 * @returns The attribute value, or `undefined` when it is not available.
 */
function attrOf(node: HTMLElement | null | undefined, name: string): string | undefined {
  if (!node) {
    return undefined;
  }
  const value = node.getAttribute(name);
  return value === null || value === undefined ? undefined : value;
}
/**
 * Resolves a link against the current page and returns it only when it points
 * at `www.amazon.com`.
 *
 * The host is always checked on the parsed URL. A plain string-prefix test is
 * not sufficient: `https://www.amazon.com.evil.example/` and
 * `https://www.amazon.com@evil.example/` both start with
 * `https://www.amazon.com` yet resolve to a different host.
 *
 * @param href - Absolute or relative link, possibly missing.
 * @param currentUrl - Base URL used to resolve relative links.
 * @returns The absolute URL string, or `undefined` when `href` is missing,
 *   cannot be parsed, or resolves to another host.
 */
function absoluteAmazonUrl(href: string | undefined, currentUrl = "https://www.amazon.com/"): string | undefined {
  if (!href) {
    return undefined;
  }

  let parsed: URL;
  try {
    parsed = new URL(href, currentUrl);
  } catch {
    // Malformed href or base URL: treat as "no usable link".
    return undefined;
  }

  if (parsed.hostname !== "www.amazon.com") {
    return undefined;
  }
  return parsed.toString();
}
/**
 * Builds a canonical `https://www.amazon.com/dp/<ASIN>` product URL.
 *
 * When the link resolves to an Amazon URL whose path contains `/dp/<id>` or
 * `/gp/product/<id>`, that id (upper-cased) is used. In every other case
 * (missing link, foreign host, unparsable URL, no id in the path) the URL is
 * built from the `asin` argument.
 *
 * @param asin - ASIN to fall back to.
 * @param href - Link taken from the result card, possibly missing.
 * @param currentUrl - URL of the page the link was found on.
 * @returns The canonical product URL.
 */
function normalizeProductUrl(asin: string, href: string | undefined, currentUrl: string): string {
  const fallback = `https://www.amazon.com/dp/${asin}`;

  const absolute = absoluteAmazonUrl(href, currentUrl);
  if (!absolute) {
    return fallback;
  }

  let pathname: string;
  try {
    pathname = new URL(absolute).pathname;
  } catch {
    return fallback;
  }

  const found = /\/(?:dp|gp\/product)\/([A-Z0-9]{8,14})/i.exec(pathname);
  return found ? `https://www.amazon.com/dp/${found[1].toUpperCase()}` : fallback;
}
/**
 * Tells whether a page looks like a bot check or block page by searching the
 * raw HTML, case-insensitively, for known challenge phrases.
 *
 * @param html - Raw page HTML.
 * @returns `true` when any challenge phrase occurs in the HTML.
 */
function detectChallenge(html: string): boolean {
  const challengeMarkers = /robot check|enter the characters you see|captcha|automated access|access denied/i;
  return html.search(challengeMarkers) !== -1;
}
/**
 * Summarizes the delivery promise mentioned in a result card's text.
 *
 * The summary starts at the first `delivery` (optionally preceded by `FREE`)
 * and, when a day word (`Today`, `Tomorrow`, `Monday` … `Sunday`) follows in
 * the same sentence, extends through that day.
 *
 * The "text up to the day" part is an optional group that is attempted as a
 * whole. The earlier pattern combined a lazy `[^.]*?` with a separately
 * optional day group, which always matched empty, so the day never made it
 * into `display`.
 *
 * @param text - Full text of a result card.
 * @returns `display` (the matched phrase), `free` (whether the phrase contains
 *   "free") and `prime` (whether "prime" appears anywhere in the text), or
 *   `undefined` when the text does not mention delivery.
 */
function deliveryFromText(text: string): DeliverySummary | undefined {
  const compact = text.replace(/\s+/g, " ").trim();
  const deliveryMatch = compact.match(
    /(?:FREE\s+)?delivery(?:[^.]*?(?:Tomorrow|Today|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))?/i
  );
  if (!deliveryMatch) {
    return undefined;
  }

  const display = deliveryMatch[0].trim();
  return {
    display,
    free: /\bfree\b/i.test(display),
    prime: /\bprime\b/i.test(compact)
  };
}
/**
 * Returns the text of the first selector that yields non-empty text.
 *
 * Selectors are tried in the given order and evaluation stops at the first hit.
 *
 * @param card - Element to search within.
 * @param selectors - CSS selectors in priority order.
 * @returns The first non-empty normalized text, or an empty string when no
 *   selector produces text.
 */
function firstText(card: HTMLElement, selectors: string[]): string {
  let found = "";
  // `some` stops at the first selector whose text is non-empty.
  selectors.some((selector) => {
    found = textOf(card.querySelector(selector));
    return found !== "";
  });
  return found;
}
/**
 * Finds the first element in a card whose text parses as a unit price.
 *
 * Elements matching `.a-color-secondary`, `.a-size-base` or `span` are checked
 * in the order the query returns them.
 *
 * @param card - Result card to search.
 * @returns The normalized text of the first matching element, or an empty
 *   string when none parses as a unit price.
 */
function firstUnitPriceText(card: HTMLElement): string {
  const candidates = card.querySelectorAll(".a-color-secondary, .a-size-base, span");
  for (const candidate of candidates) {
    const candidateText = textOf(candidate);
    if (parseUnitPrice(candidateText) !== undefined) {
      return candidateText;
    }
  }
  return "";
}
/**
 * Extracts product candidates and the next-page link from a search results page.
 *
 * If the HTML looks like a challenge/block page, extraction stops immediately
 * and a `"challenge"` result with a warning is returned. Otherwise every
 * element whose `data-asin` is 8–14 alphanumeric characters and that has a
 * title becomes one product. Rating and review count are read from the card's
 * aria-labels, falling back to the card text when there are none.
 *
 * @param html - Raw HTML of the search results page.
 * @param currentUrl - URL the page was loaded from; used to resolve links.
 * @returns The extraction status, the products found, warnings, and the
 *   absolute URL of the next page when a pagination link is present.
 */
export function extractSearchPage(html: string, currentUrl: string): SearchPageExtraction {
  if (detectChallenge(html)) {
    return {
      status: "challenge",
      products: [],
      warnings: ["Amazon returned a challenge or blocked page; stopping without bypass."]
    };
  }

  const root = parse(html);
  const asinPattern = /^[A-Z0-9]{8,14}$/i;
  const products: ProductSearchResult[] = [];

  for (const card of root.querySelectorAll("[data-asin]")) {
    const rawAsin = card.getAttribute("data-asin") ?? "";
    if (!asinPattern.test(rawAsin)) {
      continue;
    }

    const link = card.querySelector("h2 a")
      ?? card.querySelector("a[href*='/dp/']")
      ?? card.querySelector("a[href*='/gp/product/']");
    const title = textOf(link) || firstText(card, ["h2", "[data-cy='title-recipe']"]);
    if (!title) {
      // Cards without any title text are not treated as products.
      continue;
    }

    const asin = rawAsin.toUpperCase();
    const allText = textOf(card);
    const ariaText = card.querySelectorAll("[aria-label]")
      .map((node) => attrOf(node, "aria-label") ?? "")
      .join(" ");
    // Prefer aria-labels for rating/review data; use the card text otherwise.
    const ratingSource = ariaText || allText;

    products.push({
      asin,
      title,
      url: normalizeProductUrl(asin, attrOf(link, "href"), currentUrl),
      imageUrl: attrOf(card.querySelector("img"), "src"),
      price: parseMoney(firstText(card, [".a-price .a-offscreen", ".a-price"])),
      unitPrice: parseUnitPrice(firstUnitPriceText(card)),
      rating: parseRating(ratingSource),
      reviewCount: parseReviewCount(ratingSource),
      delivery: deliveryFromText(allText),
      specs: [],
      bullets: [],
      isSponsored: /\bsponsored\b/i.test(allText),
      matchedFilters: [],
      missingFields: [],
      extractionNotes: []
    });
  }

  const nextLink = root.querySelector(".s-pagination-next[href]");
  const nextPageUrl = absoluteAmazonUrl(attrOf(nextLink, "href"), currentUrl);

  return {
    status: "ok",
    products,
    warnings: [],
    nextPageUrl: nextPageUrl ?? undefined
  };
}
+21
View File
@@ -10,7 +10,9 @@ export interface SearchProductsRequest {
export interface ProductFilters {
minRating?: number;
ratingComparison?: "gt" | "gte";
minReviews?: number;
reviewCountComparison?: "gt" | "gte";
maxPrice?: number;
maxUnitPrice?: number;
includeKeywords: string[];
@@ -82,3 +84,22 @@ export interface SearchProductsResponse {
automation: "web-automation/CloakBrowser";
};
}
/**
 * Result of parsing a free-form shopping request into its parts.
 */
export interface ParsedNaturalLanguageRequest {
/** Request text that remains after recognized limit/price/review/rating phrases are removed and cleaned up. */
query: string;
/** Filters extracted from the request text. */
filters: ProductFilters;
/** Result limit stated in the request (e.g. "top 10"); absent when none was stated. */
limit?: number;
}
/**
 * A unit count found in product text, with how much the matching phrase can be trusted.
 */
export interface UnitCountExtraction {
/** Number of units; the extractor only reports positive integers. */
count: number;
/** Confidence assigned to the phrase pattern that produced the count. */
confidence: "high" | "medium" | "low";
/** The exact substring of the input that matched. */
source: string;
}
/**
 * Outcome of extracting one search results page.
 */
export interface SearchPageExtraction {
/** `"challenge"` when the page looked like a bot check or block page, otherwise `"ok"`. */
status: "ok" | "challenge";
/** Products found on the page; empty for a challenge page. */
products: ProductSearchResult[];
/** Human-readable notes about problems encountered during extraction. */
warnings: string[];
/** Absolute URL of the next results page, when a pagination link was found. */
nextPageUrl?: string;
}
@@ -0,0 +1,71 @@
import { access } from "node:fs/promises";
import { constants } from "node:fs";
import { dirname, join, resolve } from "node:path";
import { fileURLToPath } from "node:url";
/**
 * Overrides for locating the web-automation runtime; every field is optional.
 */
export interface RuntimeResolverOptions {
/** Environment to read `AMAZON_SHOPPING_WEB_AUTOMATION_DIR` from; the resolver uses `process.env` when omitted. */
env?: NodeJS.ProcessEnv;
/** Home directory used for the `.openclaw/workspace/skills/web-automation/scripts` candidate; taken from the environment's `HOME` when omitted. */
homeDir?: string;
/** Directory of this skill; the resolver uses the parent of its own module directory when omitted. */
skillDir?: string;
}
/**
 * A located web-automation runtime and the command that checks its installation.
 */
export interface WebAutomationRuntime {
/** Scripts directory that passed the resolver's file checks. */
scriptsDir: string;
/** Description of the install-check command; the resolver only describes it, it does not run it. */
checkInstall: {
/** Working directory for the command. */
cwd: string;
/** Executable to invoke. */
command: string;
/** Arguments passed to the executable. */
args: string[];
};
}
/**
 * Verifies that a path exists.
 *
 * Any failure of the existence check is reported as the path being missing.
 *
 * @param path - Path to check.
 * @param label - Short name of the expected file, used in the error message.
 * @throws Error when the path cannot be accessed.
 */
async function assertFile(path: string, label: string): Promise<void> {
  const exists = await access(path, constants.F_OK).then(
    () => true,
    () => false
  );
  if (!exists) {
    throw new Error(`web-automation runtime is missing ${label}: ${path}`);
  }
}
/**
 * Verifies that a path is executable or, failing that, at least exists.
 *
 * @param path - Path to check.
 * @param label - Short name of the expected file, used in the error message.
 * @throws Error (from `assertFile`) when the path does not exist at all.
 */
async function assertExecutableOrFile(path: string, label: string): Promise<void> {
  const executable = await access(path, constants.X_OK).then(
    () => true,
    () => false
  );
  if (!executable) {
    // Not executable: accept the path as long as it exists.
    await assertFile(path, label);
  }
}
/**
 * Returns the parent of the directory that contains this module.
 *
 * @returns An absolute directory path.
 */
function defaultSkillDir(): string {
  const moduleDir = dirname(fileURLToPath(import.meta.url));
  return resolve(moduleDir, "..");
}
/**
 * Locates a usable web-automation scripts directory.
 *
 * Candidates are tried in this order and the first one that passes wins:
 * 1. the directory named by `AMAZON_SHOPPING_WEB_AUTOMATION_DIR`,
 * 2. `<home>/.openclaw/workspace/skills/web-automation/scripts` (skipped when
 *    no home directory is known),
 * 3. `<skillDir>/../web-automation/scripts`.
 *
 * A candidate passes when it contains `check-install.js` and `package.json`,
 * and `node_modules/.bin/tsx` is executable or at least present.
 *
 * The home directory is taken from `options.homeDir`, then from `HOME` in the
 * supplied environment, then from the process environment. Previously an
 * injected `env` was consulted for the override variable but ignored for `HOME`.
 *
 * @param options - Optional overrides for the environment, home directory and
 *   skill directory.
 * @returns The scripts directory and a description of its install-check command.
 * @throws Error listing the failure of every candidate when none is usable.
 */
export async function resolveWebAutomationRuntime(options: RuntimeResolverOptions = {}): Promise<WebAutomationRuntime> {
  const env = options.env ?? process.env;
  // Honor an injected HOME first; keep the process-level value as a fallback.
  const homeDir = options.homeDir ?? env.HOME ?? process.env.HOME ?? "";
  const skillDir = options.skillDir ?? defaultSkillDir();

  const candidates = [
    env.AMAZON_SHOPPING_WEB_AUTOMATION_DIR,
    homeDir ? join(homeDir, ".openclaw", "workspace", "skills", "web-automation", "scripts") : undefined,
    resolve(skillDir, "..", "web-automation", "scripts")
  ].filter((candidate): candidate is string => Boolean(candidate));

  const errors: string[] = [];
  for (const scriptsDir of candidates) {
    try {
      await assertFile(join(scriptsDir, "check-install.js"), "check-install.js");
      await assertFile(join(scriptsDir, "package.json"), "package.json");
      await assertExecutableOrFile(join(scriptsDir, "node_modules", ".bin", "tsx"), "node_modules/.bin/tsx");
      return {
        scriptsDir,
        checkInstall: {
          cwd: scriptsDir,
          command: "node",
          args: ["check-install.js"]
        }
      };
    } catch (error: unknown) {
      // Remember why this candidate failed and move on to the next one.
      errors.push(error instanceof Error ? error.message : String(error));
    }
  }

  throw new Error(`Unable to locate usable web-automation runtime.\n${errors.join("\n")}`);
}