feat(amazon-shopping): parse filters and extract search candidates
This commit is contained in:
@@ -3,6 +3,7 @@
|
||||
import minimist from "minimist";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
import { parseNaturalLanguageRequest } from "./query-parser.js";
|
||||
import type { ProductFilters, SearchProductsRequest, SearchProductsResponse } from "./types.js";
|
||||
|
||||
export interface CliDeps {
|
||||
@@ -76,12 +77,13 @@ export function parseCliRequest(argv: string[]): SearchProductsRequest {
|
||||
alias: { h: "help" }
|
||||
});
|
||||
|
||||
const query = String(args.query ?? args._.join(" ")).trim();
|
||||
if (!query) {
|
||||
const rawQuery = String(args.query ?? args._.join(" ")).trim();
|
||||
if (!rawQuery) {
|
||||
throw new Error("A product query is required");
|
||||
}
|
||||
|
||||
const limit = parsePositiveInteger(args.limit, "limit") ?? 15;
|
||||
const natural = parseNaturalLanguageRequest(rawQuery);
|
||||
const limit = parsePositiveInteger(args.limit, "limit") ?? natural.limit ?? 15;
|
||||
if (limit > 30 && !args["allow-large-limit"]) {
|
||||
throw new Error("Requested limits above 30 require --allow-large-limit or a batched run");
|
||||
}
|
||||
@@ -91,10 +93,7 @@ export function parseCliRequest(argv: string[]): SearchProductsRequest {
|
||||
throw new Error("max-search-pages must be 5 or less");
|
||||
}
|
||||
|
||||
const filters: ProductFilters = {
|
||||
includeKeywords: [],
|
||||
excludeKeywords: []
|
||||
};
|
||||
const filters: ProductFilters = { ...natural.filters };
|
||||
const minRating = parseNumber(args["min-rating"], "min-rating");
|
||||
const minReviews = parsePositiveInteger(args["min-reviews"], "min-reviews");
|
||||
const maxPrice = parseNumber(args["max-price"], "max-price");
|
||||
@@ -108,7 +107,7 @@ export function parseCliRequest(argv: string[]): SearchProductsRequest {
|
||||
const markdown = Boolean(args.markdown);
|
||||
|
||||
return {
|
||||
query,
|
||||
query: natural.query || rawQuery,
|
||||
filters,
|
||||
limit,
|
||||
maxSearchPages,
|
||||
|
||||
@@ -0,0 +1,112 @@
|
||||
import type { MoneyValue, StarBreakdown, UnitCountExtraction } from "./types.js";
|
||||
|
||||
/**
 * Pulls the first US-dollar amount out of a piece of text.
 *
 * Whitespace runs are collapsed before matching. The amount may carry
 * thousands separators and up to two decimal places.
 *
 * @param text - Raw text that may contain a price; nullish or empty input yields `undefined`.
 * @returns The numeric amount, the fixed currency `"USD"` and the
 *   whitespace-normalised input as `display`, or `undefined` when no
 *   `$`-prefixed number is present.
 */
export function parseMoney(text: string | undefined | null): MoneyValue | undefined {
  if (!text) {
    return undefined;
  }

  const normalized = text.replace(/\s+/g, " ").trim();
  const found = /\$\s*([0-9][0-9,]*(?:\.[0-9]{1,2})?)/.exec(normalized);
  if (found === null) {
    return undefined;
  }

  // Drop thousands separators before converting to a number.
  const amount = Number(found[1].replace(/,/g, ""));
  return Number.isFinite(amount)
    ? { amount, currency: "USD", display: normalized }
    : undefined;
}
|
||||
|
||||
/**
 * Parses a per-unit price such as `$0.54/count` or `$1.25 each`.
 *
 * The text must contain a dollar amount (or a `/<digit>` fragment) AND a unit
 * marker (`/`, `per`, `each` or `count`); otherwise `undefined` is returned.
 *
 * When the text holds several dollar amounts — for example a total price
 * followed by `($0.54/count)` — the amount that is directly followed by `/`,
 * `per` or `each` is used, so the total price is not mistaken for the unit
 * price. If no amount is tied to a marker, the first dollar amount is used.
 *
 * @param text - Candidate text; nullish or empty input yields `undefined`.
 * @returns The value produced by `parseMoney` for the text, with `amount`
 *   replaced by the marker-tied amount when one exists, or `undefined`.
 */
export function parseUnitPrice(text: string | undefined | null): MoneyValue | undefined {
  if (!text || !(/[/]\s*\d|\$\s*\d/.test(text))) {
    return undefined;
  }
  if (!/(\/|\bper\b|\beach\b|\bcount\b)/i.test(text)) {
    return undefined;
  }

  const parsed = parseMoney(text);
  if (!parsed) {
    return undefined;
  }

  // Prefer the amount that sits immediately before the unit marker.
  const tied = text.match(/\$\s*([0-9][0-9,]*(?:\.[0-9]{1,2})?)\s*(?:\/|per\b|each\b)/i);
  if (!tied) {
    return parsed;
  }

  const amount = Number(tied[1].replace(/,/g, ""));
  return Number.isFinite(amount) ? { ...parsed, amount } : parsed;
}
|
||||
|
||||
/**
 * Extracts a star rating from text such as `"4.5 out of 5 stars"` or
 * `"rated 4.2"`.
 *
 * The rating token must stand on its own: it may not be the tail of a longer
 * number (`"14.5 out of 5 stars"`) nor be glued to a following digit
 * (`"45 stars"`). Values above 5 are rejected.
 *
 * @param text - Text to scan; nullish or empty input yields `undefined`.
 * @returns The rating in the range 0–5, or `undefined` when none is found.
 */
export function parseRating(text: string | undefined | null): number | undefined {
  if (!text) {
    return undefined;
  }

  // The lookbehind/lookahead keep the rating from being carved out of a longer number.
  const match = text.match(/(?<![\d.])([0-5](?:\.[0-9])?)(?!\d)\s*(?:out of\s*)?5\s*stars?/i)
    ?? text.match(/\brated\s+([0-5](?:\.[0-9])?)(?!\d)/i);
  if (!match) {
    return undefined;
  }

  const rating = Number(match[1]);
  // `[0-5](\.[0-9])?` still admits 5.1–5.9, which is not a valid star rating.
  return Number.isFinite(rating) && rating <= 5 ? rating : undefined;
}
|
||||
|
||||
/**
 * Reads a review/rating count such as `"1,234 ratings"` from text.
 *
 * @param text - Text to scan; nullish or empty input yields `undefined`.
 * @returns The integer count that precedes the word "rating(s)" or
 *   "review(s)", or `undefined` when no such count is present.
 */
export function parseReviewCount(text: string | undefined | null): number | undefined {
  const found = text ? /([0-9][0-9,]*)\s*(?:ratings?|reviews?)/i.exec(text) : null;
  if (found === null) {
    return undefined;
  }

  // Remove thousands separators before converting.
  const digits = found[1].split(",").join("");
  const total = Number(digits);
  if (!Number.isInteger(total)) {
    return undefined;
  }
  return total;
}
|
||||
|
||||
/**
 * Parses a per-star percentage histogram such as
 * `"5 star 70% 4 star 20% ..."`.
 *
 * @param text - Text to scan; nullish or empty input yields `undefined`.
 * @returns An object with one entry (`five` … `one`) per star level found and
 *   `basis: "percent"`, or `undefined` when the text contains no
 *   `<n> star <p>%` pair. A later pair for the same star level overwrites an
 *   earlier one.
 */
export function parseStarBreakdown(text: string | undefined | null): StarBreakdown | undefined {
  if (!text) {
    return undefined;
  }

  const fieldByDigit: Record<string, keyof Omit<StarBreakdown, "basis">> = {
    "1": "one",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five"
  };
  const percents: Partial<Omit<StarBreakdown, "basis">> = {};
  const pairPattern = /([1-5])\s*star\s*([0-9]{1,3})\s*%/gi;

  let pairsSeen = 0;
  for (let hit = pairPattern.exec(text); hit !== null; hit = pairPattern.exec(text)) {
    pairsSeen += 1;
    const field = fieldByDigit[hit[1]];
    if (field) {
      percents[field] = Number(hit[2]);
    }
  }

  if (pairsSeen === 0) {
    return undefined;
  }
  return { ...percents, basis: "percent" };
}
|
||||
|
||||
/**
 * Finds how many units a listing contains from phrases such as
 * `"24 Count"`, `"Pack of 6"`, `"3-pack"`, `"set of 4"` or `"12 bulbs"`.
 *
 * Patterns are tried from most to least reliable; the first one that yields a
 * positive integer wins. A pattern that matches a zero count is skipped so a
 * later pattern can still succeed.
 *
 * Every pattern is anchored on a word boundary so a count is never carved out
 * of the tail of a longer number (`"12345 count"` is not read as 2345).
 *
 * @param text - Title or description text; nullish or empty input yields `undefined`.
 * @returns The count, a confidence level for the pattern that matched and the
 *   matched text as `source`, or `undefined` when nothing usable is found.
 */
export function extractUnitCount(text: string | undefined | null): UnitCountExtraction | undefined {
  if (!text) {
    return undefined;
  }

  const patterns = [
    { pattern: /\b(\d{1,4})\s*[- ]?(?:count|ct)\b/i, confidence: "high" as const },
    { pattern: /\bpack\s+of\s+(\d{1,4})\b/i, confidence: "high" as const },
    { pattern: /\b(\d{1,4})\s*[- ]?pack\b/i, confidence: "high" as const },
    { pattern: /\bset\s+of\s+(\d{1,4})\b/i, confidence: "medium" as const },
    { pattern: /\b(\d{1,4})\s+(?:bulbs?|cables?|pieces?|pcs)\b/i, confidence: "low" as const }
  ];

  for (const { pattern, confidence } of patterns) {
    const match = text.match(pattern);
    if (!match) {
      continue;
    }
    const count = Number(match[1]);
    if (Number.isInteger(count) && count > 0) {
      return { count, confidence, source: match[0] };
    }
  }

  return undefined;
}
|
||||
@@ -0,0 +1,68 @@
|
||||
import type { ParsedNaturalLanguageRequest, ProductFilters } from "./types.js";
|
||||
|
||||
/**
 * Tidies the text left over after filter phrases have been removed so it can
 * be used as a search query.
 *
 * Steps: drop the connective words `that`, `and`, `with`, `have` wherever
 * they occur, collapse whitespace, trim, then strip any dangling `and`, `or`
 * or `a` left at the end.
 *
 * Trimming happens BEFORE the trailing-word strip: removed filter phrases
 * leave trailing spaces behind, and a trailing space would otherwise stop the
 * end-anchored pattern from matching.
 *
 * @param text - Remaining request text.
 * @returns The cleaned query; may be empty.
 */
function cleanQuery(text: string): string {
  return text
    .replace(/\b(?:that|and|with|have)\b/gi, " ")
    .replace(/\s+/g, " ")
    .trim()
    // Repeat group so "cables or a" loses both dangling words.
    .replace(/(?:\s+(?:and|or|a))+$/i, "")
    .trim();
}
|
||||
|
||||
/**
 * Replaces the span covered by a regex match with a single space.
 *
 * The span is located through `match.index`, not by searching for the matched
 * text again: an identical substring can occur earlier in a position the
 * pattern itself rejected (e.g. `"top 10"` inside `"laptop 10"`), and a plain
 * text search would remove that earlier occurrence instead.
 *
 * @param text - The string the match was obtained from.
 * @param match - Result of `text.match(...)`; `null` leaves the text untouched.
 * @returns The text with the matched span replaced by `" "`.
 */
function removeMatched(text: string, match: RegExpMatchArray | null): string {
  if (!match) {
    return text;
  }
  const start = match.index;
  if (start === undefined) {
    // No position available (e.g. result of a global match): fall back to a text search.
    return text.replace(match[0], " ");
  }
  return `${text.slice(0, start)} ${text.slice(start + match[0].length)}`;
}
|
||||
|
||||
/**
 * Splits a free-form shopping request into a search query, structured filters
 * and an optional result limit.
 *
 * Recognised phrases are removed from the text in this order, and whatever is
 * left becomes the query:
 *  1. `return|limit|top <n>` → `limit` (a zero count is ignored).
 *  2. `[cost] less than|under|below $<amount> each|per|/count|/unit|/item` → `maxUnitPrice`.
 *  3. `[cost] less than|under|below $<amount>` → `maxPrice`.
 *  4. `over|more than|above <n> reviews|ratings` → `minReviews` with `"gt"`;
 *     `at least|minimum|min <n> reviews|ratings` → `minReviews` with `"gte"`.
 *  5. `more than|over|above|rated above <r> [stars]` → `minRating` with `"gt"`;
 *     `<r> stars or better` / `at least|minimum|min <r>` → `minRating` with `"gte"`.
 *
 * Dollar amounts may use thousands separators (`$1,000`).
 *
 * @param input - The raw request text.
 * @returns The cleaned query, the filters that were recognised (keyword lists
 *   start empty) and the limit when one was requested.
 */
export function parseNaturalLanguageRequest(input: string): ParsedNaturalLanguageRequest {
  let remaining = input.trim();
  const filters: ProductFilters = {
    includeKeywords: [],
    excludeKeywords: []
  };
  let limit: number | undefined;

  const limitMatch = remaining.match(/\b(?:return|limit|top)\s+(\d{1,3})\b/i);
  if (limitMatch) {
    const requested = Number(limitMatch[1]);
    // "top 0" is not a usable limit; leave it unset so the caller's default applies.
    if (requested > 0) {
      limit = requested;
    }
    remaining = removeMatched(remaining, limitMatch);
  }

  // Unit price is checked before total price because its phrase is a superset of the latter.
  const unitPriceMatch = remaining.match(/\b(?:cost\s+)?(?:less than|under|below)\s+\$((?:[0-9]{1,3}(?:,[0-9]{3})+|[0-9]+)(?:\.[0-9]{1,2})?)\s*(?:each|per\b|\/\s*(?:count|unit|item))\b/i);
  if (unitPriceMatch) {
    filters.maxUnitPrice = Number(unitPriceMatch[1].replace(/,/g, ""));
    remaining = removeMatched(remaining, unitPriceMatch);
  }

  const maxPriceMatch = remaining.match(/\b(?:cost\s+)?(?:less than|under|below)\s+\$((?:[0-9]{1,3}(?:,[0-9]{3})+|[0-9]+)(?:\.[0-9]{1,2})?)\b/i);
  if (maxPriceMatch) {
    filters.maxPrice = Number(maxPriceMatch[1].replace(/,/g, ""));
    remaining = removeMatched(remaining, maxPriceMatch);
  }

  // Review counts are consumed before ratings so "over 4000 reviews" is not read as a rating.
  const exclusiveReviews = remaining.match(/\b(?:over|more than|above)\s+([0-9][0-9,]*)\s*(?:reviews?|ratings?)\b/i);
  const inclusiveReviews = remaining.match(/\b(?:at least|minimum|min\.?)\s+([0-9][0-9,]*)\s*(?:reviews?|ratings?)\b/i);
  const reviewMatch = exclusiveReviews ?? inclusiveReviews;
  if (reviewMatch) {
    filters.minReviews = Number(reviewMatch[1].replace(/,/g, ""));
    filters.reviewCountComparison = exclusiveReviews ? "gt" : "gte";
    remaining = removeMatched(remaining, reviewMatch);
  }

  const exclusiveRating = remaining.match(/\b(?:a\s+)?(?:review score of\s+)?(?:more than|over|above|rated above)\s+([0-5](?:\.[0-9])?)\s*(?:stars?)?\b/i);
  const inclusiveRating = remaining.match(/\b([0-5](?:\.[0-9])?)\s*stars?\s+or\s+better\b/i)
    ?? remaining.match(/\b(?:at least|minimum|min\.?)\s+([0-5](?:\.[0-9])?)\s*(?:stars?|rating)?\b/i);
  const ratingMatch = exclusiveRating ?? inclusiveRating;
  if (ratingMatch) {
    filters.minRating = Number(ratingMatch[1]);
    filters.ratingComparison = exclusiveRating ? "gt" : "gte";
    remaining = removeMatched(remaining, ratingMatch);
  }

  return {
    query: cleanQuery(remaining),
    filters,
    limit
  };
}
|
||||
@@ -0,0 +1,144 @@
|
||||
import { HTMLElement, parse } from "node-html-parser";
|
||||
|
||||
import { parseMoney, parseRating, parseReviewCount, parseUnitPrice } from "./parsers.js";
|
||||
import type { DeliverySummary, ProductSearchResult, SearchPageExtraction } from "./types.js";
|
||||
|
||||
/**
 * Returns a node's text with whitespace runs collapsed to single spaces and
 * the ends trimmed.
 *
 * @param node - Element to read; `null`/`undefined` yields an empty string.
 */
function textOf(node: HTMLElement | null | undefined): string {
  if (node == null) {
    return "";
  }
  const collapsed = node.textContent.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
||||
|
||||
/**
 * Reads an attribute from a node, normalising every "absent" outcome to
 * `undefined`.
 *
 * @param node - Element to read; may be `null`/`undefined`.
 * @param name - Attribute name.
 * @returns The attribute value, or `undefined` when the node or attribute is missing.
 */
function attrOf(node: HTMLElement | null | undefined, name: string): string | undefined {
  if (node == null) {
    return undefined;
  }
  const value = node.getAttribute(name);
  return value == null ? undefined : value;
}
|
||||
|
||||
/**
 * Resolves a link to an absolute URL and accepts it only when it points at
 * `www.amazon.com`.
 *
 * The host is always checked on the PARSED URL. A string-prefix test is not
 * sufficient: `https://www.amazon.com.evil.example/...` starts with
 * `https://www.amazon.com` but belongs to a different host.
 *
 * @param href - Absolute or relative link; empty/undefined yields `undefined`.
 * @param currentUrl - Base used to resolve relative links.
 * @returns The normalised absolute URL, or `undefined` when the link is
 *   missing, unparsable or on another host.
 */
function absoluteAmazonUrl(href: string | undefined, currentUrl = "https://www.amazon.com/"): string | undefined {
  if (!href) {
    return undefined;
  }

  let parsed: URL;
  try {
    // Absolute links are parsed on their own so a bad base cannot invalidate them.
    parsed = new URL(href);
  } catch {
    try {
      parsed = new URL(href, currentUrl);
    } catch {
      return undefined;
    }
  }

  if (parsed.hostname !== "www.amazon.com") {
    return undefined;
  }
  return parsed.toString();
}
|
||||
|
||||
/**
 * Builds the canonical `https://www.amazon.com/dp/<ASIN>` URL for a product.
 *
 * When the link resolves to an Amazon URL whose path contains `/dp/<id>` or
 * `/gp/product/<id>`, that id (upper-cased) is used; in every other case the
 * supplied `asin` is used.
 *
 * @param asin - ASIN to fall back on.
 * @param href - Product link taken from the page, if any.
 * @param currentUrl - Base used to resolve a relative `href`.
 */
function normalizeProductUrl(asin: string, href: string | undefined, currentUrl: string): string {
  const fallback = `https://www.amazon.com/dp/${asin}`;

  const resolved = absoluteAmazonUrl(href, currentUrl);
  if (!resolved) {
    return fallback;
  }

  let pathname: string;
  try {
    pathname = new URL(resolved).pathname;
  } catch {
    return fallback;
  }

  const idInPath = /\/(?:dp|gp\/product)\/([A-Z0-9]{8,14})/i.exec(pathname);
  return idInPath ? `https://www.amazon.com/dp/${idInPath[1].toUpperCase()}` : fallback;
}
|
||||
|
||||
/**
 * Reports whether the HTML contains any phrase associated with a bot
 * challenge or blocked page (case-insensitive).
 *
 * @param html - Raw page markup.
 */
function detectChallenge(html: string): boolean {
  const challengeMarkers = /robot check|enter the characters you see|captcha|automated access|access denied/i;
  return html.search(challengeMarkers) !== -1;
}
|
||||
|
||||
/**
 * Summarises delivery information found in a card's text.
 *
 * `display` starts at `delivery` (or `FREE delivery`) and, when a day word
 * (`Today`, `Tomorrow` or a weekday name) follows within the same sentence,
 * runs through that day word.
 *
 * The day part lives inside ONE optional group together with its lazy filler.
 * With the filler outside the group, the lazy quantifier matches nothing and
 * the optional day is skipped, so the day could never be captured.
 *
 * @param text - Text of the product card.
 * @returns The delivery phrase, whether that phrase mentions "free", and
 *   whether the whole text mentions "prime"; `undefined` when the text does
 *   not mention delivery.
 */
function deliveryFromText(text: string): DeliverySummary | undefined {
  const compact = text.replace(/\s+/g, " ").trim();
  const deliveryMatch = compact.match(
    /((?:FREE\s+)?delivery(?:[^.]*?\b(?:Tomorrow|Today|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b)?)/i
  );
  if (!deliveryMatch) {
    return undefined;
  }

  const display = deliveryMatch[1].trim();
  return {
    display,
    free: /\bfree\b/i.test(display),
    prime: /\bprime\b/i.test(compact)
  };
}
|
||||
|
||||
/**
 * Returns the text of the first selector that yields non-empty text.
 *
 * @param card - Element to search within.
 * @param selectors - CSS selectors, tried in order.
 * @returns The normalised text of the first hit, or `""` when none produces text.
 */
function firstText(card: HTMLElement, selectors: string[]): string {
  let found = "";
  // `some` stops at the first selector that produces text.
  selectors.some((selector) => {
    found = textOf(card.querySelector(selector));
    return found !== "";
  });
  return found;
}
|
||||
|
||||
/**
 * Finds the first element in the card whose text parses as a unit price.
 *
 * Candidates are `.a-color-secondary`, `.a-size-base` and any `span`, in the
 * order the query returns them.
 *
 * @param card - Product card element.
 * @returns The normalised text of the first matching element, or `""`.
 */
function firstUnitPriceText(card: HTMLElement): string {
  const candidates = card.querySelectorAll(".a-color-secondary, .a-size-base, span");
  const hit = candidates.find((candidate) => Boolean(parseUnitPrice(textOf(candidate))));
  return hit ? textOf(hit) : "";
}
|
||||
|
||||
/**
 * Extracts product candidates and the next-page link from a search results
 * page.
 *
 * If the markup looks like a challenge/blocked page, nothing is parsed and a
 * `"challenge"` result with a warning is returned. Otherwise every element
 * carrying a `data-asin` of 8–14 alphanumeric characters is treated as a
 * product card; cards without a title are skipped.
 *
 * @param html - Raw page markup.
 * @param currentUrl - URL the page was loaded from, used to resolve relative links.
 * @returns Extraction status, the products found, warnings and the absolute
 *   next-page URL when a `.s-pagination-next` link is present.
 */
export function extractSearchPage(html: string, currentUrl: string): SearchPageExtraction {
  if (detectChallenge(html)) {
    return {
      status: "challenge",
      products: [],
      warnings: ["Amazon returned a challenge or blocked page; stopping without bypass."]
    };
  }

  const root = parse(html);
  const asinPattern = /^[A-Z0-9]{8,14}$/i;
  const linkSelectors = ["h2 a", "a[href*='/dp/']", "a[href*='/gp/product/']"];
  const products: ProductSearchResult[] = [];

  root.querySelectorAll("[data-asin]").forEach((card) => {
    const rawAsin = card.getAttribute("data-asin") ?? "";
    if (!asinPattern.test(rawAsin)) {
      return;
    }
    const asin = rawAsin.toUpperCase();

    // First link selector that matches wins; null when none does.
    let link: HTMLElement | null = null;
    for (const selector of linkSelectors) {
      link = card.querySelector(selector);
      if (link) {
        break;
      }
    }

    const title = textOf(link) || firstText(card, ["h2", "[data-cy='title-recipe']"]);
    if (!title) {
      return;
    }

    const priceText = firstText(card, [".a-price .a-offscreen", ".a-price"]);
    const cardText = textOf(card);
    const unitPriceText = firstUnitPriceText(card);
    const ariaText = card
      .querySelectorAll("[aria-label]")
      .map((node) => attrOf(node, "aria-label") ?? "")
      .join(" ");
    // Rating and review count prefer the aria labels, falling back to the card text.
    const ratingSource = ariaText || cardText;

    products.push({
      asin,
      title,
      url: normalizeProductUrl(asin, attrOf(link, "href"), currentUrl),
      imageUrl: attrOf(card.querySelector("img"), "src"),
      price: parseMoney(priceText),
      unitPrice: parseUnitPrice(unitPriceText),
      rating: parseRating(ratingSource),
      reviewCount: parseReviewCount(ratingSource),
      delivery: deliveryFromText(cardText),
      specs: [],
      bullets: [],
      isSponsored: /\bsponsored\b/i.test(cardText),
      matchedFilters: [],
      missingFields: [],
      extractionNotes: []
    });
  });

  const nextHref = attrOf(root.querySelector(".s-pagination-next[href]"), "href");

  return {
    status: "ok",
    products,
    warnings: [],
    nextPageUrl: absoluteAmazonUrl(nextHref, currentUrl)
  };
}
|
||||
@@ -10,7 +10,9 @@ export interface SearchProductsRequest {
|
||||
|
||||
export interface ProductFilters {
|
||||
minRating?: number;
|
||||
ratingComparison?: "gt" | "gte";
|
||||
minReviews?: number;
|
||||
reviewCountComparison?: "gt" | "gte";
|
||||
maxPrice?: number;
|
||||
maxUnitPrice?: number;
|
||||
includeKeywords: string[];
|
||||
@@ -82,3 +84,22 @@ export interface SearchProductsResponse {
|
||||
automation: "web-automation/CloakBrowser";
|
||||
};
|
||||
}
|
||||
|
||||
/** Outcome of parsing a free-form shopping request. */
export interface ParsedNaturalLanguageRequest {
  /** Request text that remains after recognised filter phrases were removed. */
  query: string;
  /** Filters recognised in the request text. */
  filters: ProductFilters;
  /** Number of results asked for in the text, when one was stated. */
  limit?: number;
}
|
||||
|
||||
/** A unit count read from listing text, with provenance. */
export interface UnitCountExtraction {
  /** Number of units found. */
  count: number;
  /** How reliable the pattern that produced the count is. */
  confidence: "high" | "medium" | "low";
  /** The text fragment the count was read from. */
  source: string;
}
|
||||
|
||||
/** Result of extracting one search results page. */
export interface SearchPageExtraction {
  /** `"challenge"` when the page was a challenge/blocked page, otherwise `"ok"`. */
  status: "ok" | "challenge";
  /** Product candidates found on the page. */
  products: ProductSearchResult[];
  /** Human-readable notes about problems met during extraction. */
  warnings: string[];
  /** Absolute URL of the next results page, when the page links to one. */
  nextPageUrl?: string;
}
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
import { access } from "node:fs/promises";
|
||||
import { constants } from "node:fs";
|
||||
import { dirname, join, resolve } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
/** Overrides for locating the web-automation runtime; every field is optional. */
export interface RuntimeResolverOptions {
  /** Environment consulted for `AMAZON_SHOPPING_WEB_AUTOMATION_DIR`; defaults to `process.env`. */
  env?: NodeJS.ProcessEnv;
  /** Home directory under which `.openclaw/workspace/skills/web-automation/scripts` is looked up; falls back to the `HOME` environment variable. */
  homeDir?: string;
  /** Directory of this skill; its sibling `web-automation/scripts` is a lookup candidate. */
  skillDir?: string;
}
|
||||
|
||||
/** A located web-automation runtime. */
export interface WebAutomationRuntime {
  /** Directory that holds the web-automation scripts. */
  scriptsDir: string;
  /** Command description for the runtime's install check. */
  checkInstall: {
    /** Working directory for the command. */
    cwd: string;
    /** Executable to run. */
    command: string;
    /** Arguments passed to the executable. */
    args: string[];
  };
}
|
||||
|
||||
/**
 * Verifies that a path exists.
 *
 * @param path - Path to check.
 * @param label - Short name of the file, used in the error message.
 * @throws Error naming the label and path when the path cannot be accessed.
 */
async function assertFile(path: string, label: string): Promise<void> {
  let present = true;
  try {
    await access(path, constants.F_OK);
  } catch {
    present = false;
  }
  if (!present) {
    throw new Error(`web-automation runtime is missing ${label}: ${path}`);
  }
}
|
||||
|
||||
/**
 * Verifies that a path is executable or, failing that, at least exists.
 *
 * @param path - Path to check.
 * @param label - Short name of the file, used in the error message.
 * @throws Error from the existence check when the path is absent.
 */
async function assertExecutableOrFile(path: string, label: string): Promise<void> {
  let executable = true;
  try {
    await access(path, constants.X_OK);
  } catch {
    executable = false;
  }
  if (executable) {
    return;
  }
  // Not executable: settle for plain existence.
  await assertFile(path, label);
}
|
||||
|
||||
/**
 * Returns the parent of the directory that contains this module.
 */
function defaultSkillDir(): string {
  const modulePath = fileURLToPath(import.meta.url);
  const moduleDir = dirname(modulePath);
  return resolve(moduleDir, "..");
}
|
||||
|
||||
/**
 * Locates a usable web-automation scripts directory.
 *
 * Candidates, in priority order:
 *  1. `AMAZON_SHOPPING_WEB_AUTOMATION_DIR` from the environment;
 *  2. `<home>/.openclaw/workspace/skills/web-automation/scripts`;
 *  3. `<skillDir>/../web-automation/scripts`.
 *
 * A candidate is usable when it contains `check-install.js`, `package.json`
 * and `node_modules/.bin/tsx`. The first usable candidate wins.
 *
 * The home directory is read from the SAME environment object as the
 * directory override, so a caller that injects `options.env` is isolated
 * from the real process environment.
 *
 * @param options - Optional overrides for the environment, home directory and skill directory.
 * @returns The scripts directory and the command that runs its install check.
 * @throws Error listing why each candidate was rejected when none is usable.
 */
export async function resolveWebAutomationRuntime(options: RuntimeResolverOptions = {}): Promise<WebAutomationRuntime> {
  const env = options.env ?? process.env;
  const homeDir = options.homeDir ?? env.HOME ?? "";
  const skillDir = options.skillDir ?? defaultSkillDir();

  const candidates = [
    env.AMAZON_SHOPPING_WEB_AUTOMATION_DIR,
    homeDir ? join(homeDir, ".openclaw", "workspace", "skills", "web-automation", "scripts") : undefined,
    resolve(skillDir, "..", "web-automation", "scripts")
  ].filter((candidate): candidate is string => Boolean(candidate));

  const errors: string[] = [];
  // Checked one at a time on purpose: the first usable candidate must win.
  for (const scriptsDir of candidates) {
    try {
      await assertFile(join(scriptsDir, "check-install.js"), "check-install.js");
      await assertFile(join(scriptsDir, "package.json"), "package.json");
      await assertExecutableOrFile(join(scriptsDir, "node_modules", ".bin", "tsx"), "node_modules/.bin/tsx");
      return {
        scriptsDir,
        checkInstall: {
          cwd: scriptsDir,
          command: "node",
          args: ["check-install.js"]
        }
      };
    } catch (error: unknown) {
      errors.push(error instanceof Error ? error.message : String(error));
    }
  }

  throw new Error(`Unable to locate usable web-automation runtime.\n${errors.join("\n")}`);
}
|
||||
Reference in New Issue
Block a user