321 lines
8.6 KiB
JavaScript
321 lines
8.6 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
/** Default settle delay (ms) used by gotoListing and waitForPhotoExperience. */
const DEFAULT_WAIT_MS = 4000;

/** Timeout (ms) for page navigation and URL-change waits. */
const NAV_TIMEOUT_MS = 45000;

/** Timeout (ms) for clicks; also installed as the page's default action timeout. */
const CLICK_TIMEOUT_MS = 15000;

/** Default upper bound on scroll-to-bottom passes in scrollUntilSettled. */
const MAX_SCROLL_PASSES = 12;

/** Pause (ms) after each scroll-to-bottom pass. */
const SCROLL_PAUSE_MS = 900;

/** Default minimum width (px) for a candidate with a known width to be kept. */
const LARGE_IMAGE_MIN_WIDTH = 300;

/** Default minimum height (px) for a candidate with a known height to be kept. */
const LARGE_IMAGE_MIN_HEIGHT = 200;

/**
 * Parse a millisecond setting, falling back when it is not a usable duration.
 *
 * Anything that does not convert to a finite number greater than zero (unset,
 * empty, non-numeric, zero, negative) yields the fallback. Without this guard
 * a malformed value would reach setTimeout as NaN/0 and fire almost at once.
 *
 * @param {unknown} rawValue - Raw setting, typically an environment string.
 * @param {number} fallbackMs - Value returned when rawValue is unusable.
 * @returns {number} A finite, positive number of milliseconds.
 */
function resolvePositiveMs(rawValue, fallbackMs) {
  const parsed = Number(rawValue);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallbackMs;
}

/**
 * Default per-operation timeout (ms) for runWithOperationTimeout.
 * Overridable through REAL_ESTATE_OPERATION_TIMEOUT_MS; invalid values are ignored.
 */
const OPERATION_TIMEOUT_MS = resolvePositiveMs(process.env.REAL_ESTATE_OPERATION_TIMEOUT_MS, 25000);
|
|
|
|
/**
 * Report a fatal error as a single JSON line on stderr, then exit with code 1.
 *
 * @param {string} message - Stored under the `error` key of the report.
 * @param {*} [details] - Stored under `details`, only when truthy.
 */
export function fail(message, details) {
  const report = details ? { error: message, details } : { error: message };
  const line = JSON.stringify(report);
  process.stderr.write(`${line}\n`);
  process.exit(1);
}
|
|
|
|
/**
 * Validate a user-supplied URL and return its serialized form.
 *
 * Each validation failure is reported through fail(): a missing value, a
 * string the URL constructor rejects, or a protocol other than http/https.
 *
 * @param {string} rawUrl - URL text to validate.
 * @returns {string} The result of `URL#toString()` for the parsed URL.
 */
export function parseTarget(rawUrl) {
  if (!rawUrl) fail("Missing URL.");

  let target;
  try {
    target = new URL(rawUrl);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    fail("Invalid URL.", reason);
  }

  const isWebProtocol = target.protocol === "http:" || target.protocol === "https:";
  if (!isWebProtocol) fail("Only http and https URLs are allowed.");

  return target.toString();
}
|
|
|
|
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
export function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
|
|
/**
 * Run an operation, rejecting if it does not settle within a time limit.
 *
 * When the limit elapses, `onTimeout` (if given) is awaited first and any error
 * it raises is ignored; only then is the timeout rejection issued. The timer is
 * always cleared once the race settles.
 *
 * @param {string} operationName - Used in the timeout error message.
 * @param {() => *} operation - Invoked once; its result is raced against the timer.
 * @param {object} [options]
 * @param {number} [options.timeoutMs=OPERATION_TIMEOUT_MS] - Time limit in ms.
 * @param {() => *} [options.onTimeout] - Cleanup hook run when the limit elapses.
 * @returns {Promise<*>} The operation's result.
 * @throws {Error} `<operationName> timed out after <timeoutMs>ms`, or whatever the operation throws.
 */
export async function runWithOperationTimeout(
  operationName,
  operation,
  { timeoutMs = OPERATION_TIMEOUT_MS, onTimeout } = {}
) {
  let timeoutHandle;

  try {
    // Start the work before arming the timer, then race the two.
    const work = operation();

    const deadline = new Promise((_resolve, reject) => {
      const expire = async () => {
        try {
          await onTimeout?.();
        } catch {
          // Ignore cleanup errors; the timeout is the primary failure.
        }
        reject(new Error(`${operationName} timed out after ${timeoutMs}ms`));
      };
      timeoutHandle = setTimeout(expire, timeoutMs);
    });

    return await Promise.race([work, deadline]);
  } finally {
    if (timeoutHandle) {
      clearTimeout(timeoutHandle);
    }
  }
}
|
|
|
|
/**
 * Dynamically import the `cloakbrowser` package.
 *
 * If the import fails, the failure is reported through fail() together with
 * the underlying error message.
 *
 * @returns {Promise<object>} The imported module namespace.
 */
export async function loadCloakBrowser() {
  let cloakModule;

  try {
    cloakModule = await import("cloakbrowser");
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    fail(
      "CloakBrowser is not installed for this skill. Run pnpm install in skills/web-automation/scripts first.",
      reason
    );
  }

  return cloakModule;
}
|
|
|
|
/**
 * Run `fn` while console.log and console.error are redirected to stderr.
 *
 * During the call both methods write their arguments, joined by single spaces
 * and followed by a newline, to process.stderr. The original methods are
 * restored afterwards whether `fn` succeeds or throws.
 *
 * @param {() => *} fn - Function to run; its result is awaited.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 */
export async function runWithStderrLogs(fn) {
  const saved = { log: console.log, error: console.error };
  const makeStderrWriter = () => (...args) => process.stderr.write(`${args.join(" ")}\n`);

  console.log = makeStderrWriter();
  console.error = makeStderrWriter();

  try {
    return await fn();
  } finally {
    console.log = saved.log;
    console.error = saved.error;
  }
}
|
|
|
|
/**
 * Launch a browser context through cloakbrowser and open one configured page.
 *
 * The binary check and the launch both run inside runWithStderrLogs so any
 * console output they produce goes to stderr. The page gets CLICK_TIMEOUT_MS as
 * its default timeout and NAV_TIMEOUT_MS as its default navigation timeout.
 *
 * If opening or configuring the page fails, the already-launched context is
 * closed (best-effort) before the error is rethrown, so a failed setup does not
 * leave a browser running.
 *
 * @param {object} [options]
 * @param {boolean} [options.headless=true] - Forwarded to launchContext.
 * @returns {Promise<{context: object, page: object}>} The context and its page.
 */
export async function createPageSession({ headless = true } = {}) {
  const { ensureBinary, launchContext } = await loadCloakBrowser();
  await runWithStderrLogs(() => ensureBinary());

  const context = await runWithStderrLogs(() =>
    launchContext({
      headless,
      humanize: true,
      locale: "en-US",
      viewport: { width: 1440, height: 900 },
    })
  );

  try {
    const page = await context.newPage();
    page.setDefaultTimeout(CLICK_TIMEOUT_MS);
    page.setDefaultNavigationTimeout(NAV_TIMEOUT_MS);
    return { context, page };
  } catch (error) {
    // Assumes the context exposes a Playwright-style close(); if it does not,
    // or closing fails, the original setup error is still the one reported.
    try {
      await context.close();
    } catch {
      // Best-effort cleanup only.
    }
    throw error;
  }
}
|
|
|
|
/**
 * Navigate to a listing URL and give the page time to settle.
 *
 * Waits for `domcontentloaded`, then for network idle (up to 15 s, failures
 * ignored), then sleeps for `waitMs`.
 *
 * @param {object} page - Page object exposing goto() and waitForLoadState().
 * @param {string} url - Address to open.
 * @param {number} [waitMs=DEFAULT_WAIT_MS] - Extra settle delay in ms.
 * @returns {Promise<void>}
 */
export async function gotoListing(page, url, waitMs = DEFAULT_WAIT_MS) {
  const navigation = { waitUntil: "domcontentloaded", timeout: NAV_TIMEOUT_MS };
  await page.goto(url, navigation);

  // Network idle is a nice-to-have: a page that never goes idle is not an error.
  const networkIdle = page.waitForLoadState("networkidle", { timeout: 15000 });
  await networkIdle.catch(() => {});

  await sleep(waitMs);
}
|
|
|
|
/**
 * Best-effort dismissal of consent banners and similar overlays.
 *
 * For each known label pattern, the first matching button and the first
 * matching link are clicked if present (2.5 s click timeout, 300 ms pause after
 * a successful click). Any error while probing or clicking is ignored.
 *
 * @param {object} page - Page object exposing getByRole().
 * @returns {Promise<void>}
 */
export async function dismissCommonOverlays(page) {
  const dismissPatterns = [
    /accept/i,
    /agree/i,
    /close/i,
    /got it/i,
    /continue/i,
    /dismiss/i,
    /not now/i,
  ];
  const clickableRoles = ["button", "link"];

  for (const pattern of dismissPatterns) {
    const candidates = clickableRoles.map((role) =>
      page.getByRole(role, { name: pattern }).first()
    );

    for (const candidate of candidates) {
      try {
        const matches = await candidate.count();
        if (!matches) continue;

        await candidate.click({ timeout: 2500 });
        await sleep(300);
      } catch {
        // Best-effort overlay dismissal only.
      }
    }
  }
}
|
|
|
|
/**
 * Click the first element that looks like an entry point to the photo view.
 *
 * Labels are tried in order; for each label the first matching button, then
 * link, then text node is tried. A candidate that exists is scrolled into view
 * (errors ignored) and clicked; if the click fails the search continues.
 *
 * @param {object} page - Page object exposing getByRole() and getByText().
 * @param {Iterable<string|RegExp>} labels - Accessible names/text to look for.
 * @returns {Promise<string>} `toString()` of the label whose element was clicked.
 * @throws {Error} "Could not find a photo entry point." when nothing could be clicked.
 */
export async function clickPhotoEntryPoint(page, labels) {
  for (const label of labels) {
    const byButton = page.getByRole("button", { name: label }).first();
    const byLink = page.getByRole("link", { name: label }).first();
    const byText = page.getByText(label).first();

    for (const candidate of [byButton, byLink, byText]) {
      try {
        const matches = await candidate.count();
        if (!matches) continue;

        await candidate.scrollIntoViewIfNeeded().catch(() => {});
        await candidate.click({ timeout: CLICK_TIMEOUT_MS });
        return label.toString();
      } catch {
        // Keep trying the next candidate.
      }
    }
  }

  throw new Error("Could not find a photo entry point.");
}
|
|
|
|
/**
 * Wait for the photo view to appear after its entry point was clicked.
 *
 * Proceeds as soon as any of these settles: the URL differs from
 * `previousUrl`, the network goes idle, or `waitMs` elapses. Errors from the
 * first two are ignored. A further `waitMs` pause follows.
 *
 * @param {object} page - Page object exposing waitForURL() and waitForLoadState().
 * @param {string} previousUrl - URL before the click, compared against `url.toString()`.
 * @param {number} [waitMs=DEFAULT_WAIT_MS] - Used for both the race cap and the final pause.
 * @returns {Promise<void>}
 */
export async function waitForPhotoExperience(page, previousUrl, waitMs = DEFAULT_WAIT_MS) {
  const ignore = () => {};
  const hasNavigated = (url) => url.toString() !== previousUrl;

  const urlChanged = page.waitForURL(hasNavigated, { timeout: NAV_TIMEOUT_MS }).catch(ignore);
  const networkIdle = page.waitForLoadState("networkidle", { timeout: 15000 }).catch(ignore);
  const waitCap = sleep(waitMs);

  await Promise.race([urlChanged, networkIdle, waitCap]);
  await sleep(waitMs);
}
|
|
|
|
/**
 * Scroll to the bottom repeatedly until the document height stops changing.
 *
 * Each pass reads the scroll height, jumps to the bottom and pauses for
 * SCROLL_PAUSE_MS. The loop ends when a pass reads the same height as the
 * previous pass, or after `passes` passes. The page is then scrolled back to
 * the top, followed by a 250 ms pause.
 *
 * @param {object} page - Page object exposing evaluate().
 * @param {number} [passes=MAX_SCROLL_PASSES] - Maximum number of passes.
 * @returns {Promise<void>}
 */
export async function scrollUntilSettled(page, passes = MAX_SCROLL_PASSES) {
  // These three run inside the page via evaluate(), so each is self-contained
  // and must not reference anything from this module's scope.
  const readHeight = () => {
    const root = document.scrollingElement || document.documentElement || document.body;
    return root ? root.scrollHeight : 0;
  };
  const jumpToBottom = () => {
    const root = document.scrollingElement || document.documentElement || document.body;
    if (root) root.scrollTo({ top: root.scrollHeight, behavior: "instant" });
  };
  const jumpToTop = () => {
    const root = document.scrollingElement || document.documentElement || document.body;
    if (root) root.scrollTo({ top: 0, behavior: "instant" });
  };

  let lastHeight = 0;
  let pass = 0;
  while (pass < passes) {
    const height = await page.evaluate(readHeight);
    await page.evaluate(jumpToBottom);
    await sleep(SCROLL_PAUSE_MS);

    if (height === lastHeight) break;

    lastHeight = height;
    pass += 1;
  }

  await page.evaluate(jumpToTop);
  await sleep(250);
}
|
|
|
|
/**
 * Filter, canonicalize and de-duplicate raw image candidates.
 *
 * A candidate is dropped when its URL is missing, is a `data:` URL, or cannot
 * be parsed as an absolute URL; when it fails the host/pathname filters; or
 * when a known (non-zero) width/height is below the minimum. Surviving URLs
 * have their query string and fragment removed, and only the first candidate
 * per resulting URL is kept.
 *
 * Host and pathname filters are substring matches and are case-insensitive:
 * both the URL parts and the filter parts are lowercased before comparison.
 *
 * @param {Iterable<object>|null|undefined} candidates - Objects with `url` and
 *   optionally `width`/`height` (or `naturalWidth`/`naturalHeight`).
 * @param {object} [options]
 * @param {string[]} [options.hostIncludes=[]] - Keep only hosts containing one of these (when non-empty).
 * @param {string[]} [options.hostExcludes=[]] - Drop hosts containing any of these.
 * @param {string[]} [options.pathnameIncludes=[]] - Keep only paths containing one of these (when non-empty).
 * @param {number} [options.minWidth=LARGE_IMAGE_MIN_WIDTH] - Minimum width for candidates with a known width.
 * @param {number} [options.minHeight=LARGE_IMAGE_MIN_HEIGHT] - Minimum height for candidates with a known height.
 * @returns {Array<{url: string, width: number, height: number, host: string, pathname: string}>}
 *   Normalized candidates in input order; `host` and `pathname` are lowercased,
 *   and unknown or non-numeric dimensions are reported as 0.
 */
export function normalizeImageCandidates(candidates, options = {}) {
  const {
    hostIncludes = [],
    hostExcludes = [],
    pathnameIncludes = [],
    minWidth = LARGE_IMAGE_MIN_WIDTH,
    minHeight = LARGE_IMAGE_MIN_HEIGHT,
  } = options;

  // host/pathname are lowercased below, so the filter parts must be too;
  // otherwise a filter such as "/Photos/" could never match anything.
  const toNeedles = (parts) => parts.map((part) => String(part).toLowerCase());
  const wantedHosts = toNeedles(hostIncludes);
  const blockedHosts = toNeedles(hostExcludes);
  const wantedPaths = toNeedles(pathnameIncludes);

  // Non-numeric dimensions count as "unknown" (0) rather than leaking NaN.
  const toDimension = (primary, fallback) => {
    const value = Number(primary || fallback || 0);
    return Number.isFinite(value) ? value : 0;
  };

  const seen = new Set();
  const normalized = [];

  for (const candidate of candidates || []) {
    const rawUrl = typeof candidate?.url === "string" ? candidate.url.trim() : "";
    if (!rawUrl || rawUrl.startsWith("data:")) continue;

    let parsed;
    try {
      parsed = new URL(rawUrl);
    } catch {
      continue;
    }

    const host = parsed.hostname.toLowerCase();
    const pathname = parsed.pathname.toLowerCase();
    const width = toDimension(candidate.width, candidate.naturalWidth);
    const height = toDimension(candidate.height, candidate.naturalHeight);

    if (wantedHosts.length && !wantedHosts.some((part) => host.includes(part))) continue;
    if (blockedHosts.some((part) => host.includes(part))) continue;
    if (wantedPaths.length && !wantedPaths.some((part) => pathname.includes(part))) continue;
    // A dimension of 0 means "unknown", so only known sizes are checked.
    if (width && width < minWidth) continue;
    if (height && height < minHeight) continue;

    // Query and fragment are dropped so variants of one image collapse together.
    parsed.hash = "";
    parsed.search = "";
    const canonical = parsed.toString();
    if (seen.has(canonical)) continue;
    seen.add(canonical);

    normalized.push({ url: canonical, width, height, host, pathname });
  }

  return normalized;
}
|
|
|
|
/**
 * Collect image URL candidates from the rendered DOM.
 *
 * Runs in the page via evaluate() and gathers, in document order:
 * - for each `<img>`: its `currentSrc` (or `src`) and every `srcset` URL, with
 *   the image's natural size (falling back to its client size);
 * - for each `<source>`: every `srcset` URL, with size 0x0;
 * - then, for each `<a href>` whose href ends in .jpg/.jpeg/.png/.webp
 *   (optionally followed by a query string): the href, with size 0x0.
 *
 * Every URL is resolved against `document.baseURI`, so relative `srcset` and
 * `href` values come back absolute instead of being unusable to consumers
 * that parse them with `new URL(url)`.
 *
 * @param {object} page - Page object exposing evaluate().
 * @returns {Promise<Array<{url: string, width: number, height: number}>>}
 *   Candidates, possibly containing duplicates.
 */
export async function collectRenderedImageCandidates(page) {
  // The callback executes in the browser and must not close over module scope.
  return page.evaluate(() => {
    const out = [];

    // Resolve relative references; keep the raw value if it cannot be parsed.
    const toAbsolute = (url) => {
      try {
        return new URL(url, document.baseURI).href;
      } catch {
        return url;
      }
    };

    const addUrl = (url, width, height) => {
      if (!url) return;
      out.push({ url: toAbsolute(url), width: Number(width || 0), height: Number(height || 0) });
    };

    // NOTE: splits on every comma, so a candidate URL that itself contains a
    // comma is not handled; only the URL part of each entry is kept.
    const parseSrcset = (srcset) => {
      if (!srcset) return [];
      return srcset
        .split(",")
        .map((entry) => entry.trim().split(/\s+/)[0])
        .filter(Boolean);
    };

    for (const node of Array.from(document.querySelectorAll("img, source"))) {
      if (node instanceof HTMLImageElement) {
        const width = node.naturalWidth || node.clientWidth;
        const height = node.naturalHeight || node.clientHeight;
        addUrl(node.currentSrc || node.src, width, height);
        for (const url of parseSrcset(node.srcset)) {
          addUrl(url, width, height);
        }
      } else if (node instanceof HTMLSourceElement) {
        for (const url of parseSrcset(node.srcset)) {
          addUrl(url, 0, 0);
        }
      }
    }

    for (const anchor of Array.from(document.querySelectorAll("a[href]"))) {
      const href = anchor.getAttribute("href") || "";
      if (/\.(?:jpg|jpeg|png|webp)(?:$|\?)/i.test(href)) {
        addUrl(href, 0, 0);
      }
    }

    return out;
  });
}
|
|
|
|
/**
 * Assemble the result object for a photo-extraction run.
 *
 * `finalUrl` is read from `page.url()` and `photoCount` is the length of
 * `imageUrls`. `title` is always set to null here.
 *
 * @param {object} params
 * @param {string} params.requestedUrl - URL the caller asked for.
 * @param {object} params.page - Page object exposing url().
 * @param {*} params.clickedLabel - Copied through unchanged.
 * @param {Array} params.imageUrls - Copied through unchanged; its length becomes photoCount.
 * @param {*} params.source - Copied through unchanged.
 * @param {Array} [params.notes=[]] - Copied through unchanged.
 * @returns {object} Object with keys source, requestedUrl, finalUrl, title,
 *   clickedLabel, photoCount, imageUrls and notes, in that order.
 */
export function buildResult({ requestedUrl, page, clickedLabel, imageUrls, source, notes = [] }) {
  const finalUrl = page.url();
  const photoCount = imageUrls.length;

  return {
    source,
    requestedUrl,
    finalUrl,
    title: null,
    clickedLabel,
    photoCount,
    imageUrls,
    notes,
  };
}
|