Files
stef-openclaw-skills/skills/web-automation/scripts/real-estate-photo-common.js

321 lines
8.6 KiB
JavaScript

#!/usr/bin/env node
// Default settle delay (ms) used by gotoListing and waitForPhotoExperience.
const DEFAULT_WAIT_MS = 4000;
// Navigation timeout (ms) for page.goto, waitForURL and the page-level navigation default.
const NAV_TIMEOUT_MS = 45000;
// Timeout (ms) for photo entry-point clicks and the page-level default action timeout.
const CLICK_TIMEOUT_MS = 15000;
// Default upper bound on scroll-to-bottom passes in scrollUntilSettled.
const MAX_SCROLL_PASSES = 12;
// Pause (ms) after each scroll-to-bottom pass.
const SCROLL_PAUSE_MS = 900;
// Default minimum dimensions (px) below which normalizeImageCandidates drops a candidate.
const LARGE_IMAGE_MIN_WIDTH = 300;
const LARGE_IMAGE_MIN_HEIGHT = 200;
// Fallback overall operation timeout (ms) when the environment override is absent or unusable.
const DEFAULT_OPERATION_TIMEOUT_MS = 25000;

/**
 * Turn the raw REAL_ESTATE_OPERATION_TIMEOUT_MS value into a usable timeout.
 *
 * A non-numeric, zero, negative or non-finite value would be coerced by
 * setTimeout into an (almost) immediate timer, making every wrapped operation
 * "time out" at once, so such values fall back to the default instead.
 *
 * @param {string | undefined} rawValue - Raw environment variable value.
 * @returns {number} A finite, positive timeout in milliseconds.
 */
function resolveOperationTimeoutMs(rawValue) {
  const parsed = Number(rawValue);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_OPERATION_TIMEOUT_MS;
}

// Default timeout (ms) for runWithOperationTimeout; overridable through the environment.
const OPERATION_TIMEOUT_MS = resolveOperationTimeoutMs(process.env.REAL_ESTATE_OPERATION_TIMEOUT_MS);
/**
 * Report a fatal error as one JSON line on stderr, then exit with status 1.
 *
 * @param {string} message - Summary stored under the `error` key.
 * @param {*} [details] - Extra context stored under `details`; left out when falsy.
 */
export function fail(message, details) {
  const report = details ? { error: message, details } : { error: message };
  const line = `${JSON.stringify(report)}\n`;
  process.stderr.write(line);
  process.exit(1);
}
/**
 * Validate a user-supplied listing URL and return it in normalized form.
 *
 * Reports through `fail` when the value is missing, cannot be parsed as an
 * absolute URL, or uses a protocol other than http/https.
 *
 * @param {string} rawUrl - URL text to validate.
 * @returns {string} The serialized form produced by the URL parser.
 */
export function parseTarget(rawUrl) {
  if (!rawUrl) fail("Missing URL.");

  let target;
  try {
    target = new URL(rawUrl);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    fail("Invalid URL.", reason);
  }

  const isWebProtocol = target.protocol === "http:" || target.protocol === "https:";
  if (!isWebProtocol) fail("Only http and https URLs are allowed.");

  return target.href;
}
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds handed to setTimeout.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
export function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Run an async operation, rejecting if it does not settle within a deadline.
 *
 * When the deadline fires, the optional `onTimeout` hook is awaited first
 * (its own errors are ignored) and only then is the timeout error raised.
 * The timer is always cleared once the race settles.
 *
 * @param {string} operationName - Label used in the timeout error message.
 * @param {() => Promise<*>} operation - Work to execute.
 * @param {{ timeoutMs?: number, onTimeout?: () => (void | Promise<void>) }} [options]
 * @returns {Promise<*>} Whatever `operation` resolves to.
 * @throws {Error} "<operationName> timed out after <timeoutMs>ms" on deadline.
 */
export async function runWithOperationTimeout(
  operationName,
  operation,
  { timeoutMs = OPERATION_TIMEOUT_MS, onTimeout } = {}
) {
  let timer;

  // Runs the cleanup hook, then reports the timeout as the primary failure.
  const expire = async (reject) => {
    try {
      await onTimeout?.();
    } catch {
      // Ignore cleanup errors; the timeout is the primary failure.
    }
    reject(new Error(`${operationName} timed out after ${timeoutMs}ms`));
  };

  try {
    // Start the work before arming the timer, matching the original ordering.
    const work = operation();
    const deadline = new Promise((_resolve, reject) => {
      timer = setTimeout(() => expire(reject), timeoutMs);
    });
    return await Promise.race([work, deadline]);
  } finally {
    if (timer) {
      clearTimeout(timer);
    }
  }
}
/**
 * Dynamically import the `cloakbrowser` package.
 *
 * If the import fails, the failure is reported through `fail` together with
 * an installation hint.
 *
 * @returns {Promise<object>} The imported module namespace.
 */
export async function loadCloakBrowser() {
  let cloakModule;
  try {
    cloakModule = await import("cloakbrowser");
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    fail(
      "CloakBrowser is not installed for this skill. Run pnpm install in skills/web-automation/scripts first.",
      reason
    );
  }
  return cloakModule;
}
/**
 * Run `fn` while console.log and console.error are redirected to stderr.
 *
 * Arguments are joined with single spaces and newline-terminated. The
 * original console methods are restored afterwards, even when `fn` throws.
 *
 * @param {() => *} fn - Callback to run (may be async).
 * @returns {Promise<*>} The value `fn` resolves to.
 */
export async function runWithStderrLogs(fn) {
  const saved = { log: console.log, error: console.error };
  const toStderr = (...args) => process.stderr.write(`${args.join(" ")}\n`);

  console.log = toStderr;
  console.error = toStderr;
  try {
    return await fn();
  } finally {
    console.log = saved.log;
    console.error = saved.error;
  }
}
/**
 * Launch a CloakBrowser context and open one page with this module's
 * default action and navigation timeouts applied.
 *
 * Console output produced while ensuring the binary and launching the
 * context is routed to stderr via runWithStderrLogs.
 *
 * @param {{ headless?: boolean }} [options] - `headless` defaults to true.
 * @returns {Promise<{ context: object, page: object }>} The launched context and its new page.
 */
export async function createPageSession({ headless = true } = {}) {
  const { ensureBinary, launchContext } = await loadCloakBrowser();

  const launchOptions = {
    headless,
    humanize: true,
    locale: "en-US",
    viewport: { width: 1440, height: 900 },
  };

  await runWithStderrLogs(() => ensureBinary());
  const context = await runWithStderrLogs(() => launchContext(launchOptions));

  const page = await context.newPage();
  page.setDefaultTimeout(CLICK_TIMEOUT_MS);
  page.setDefaultNavigationTimeout(NAV_TIMEOUT_MS);

  return { context, page };
}
/**
 * Navigate to a listing URL and give the page time to settle.
 *
 * Waits for DOMContentLoaded, then for network idle on a best-effort basis
 * (a failure of that wait is ignored), then sleeps for `waitMs`.
 *
 * @param {object} page - Page object exposing goto and waitForLoadState.
 * @param {string} url - Address to open.
 * @param {number} [waitMs] - Extra settle delay in milliseconds.
 * @returns {Promise<void>}
 */
export async function gotoListing(page, url, waitMs = DEFAULT_WAIT_MS) {
  const navigation = { waitUntil: "domcontentloaded", timeout: NAV_TIMEOUT_MS };
  const ignoreFailure = () => {};

  await page.goto(url, navigation);
  // Network idle may never happen on busy pages; do not treat that as fatal.
  await page.waitForLoadState("networkidle", { timeout: 15000 }).catch(ignoreFailure);
  await sleep(waitMs);
}
/**
 * Best-effort dismissal of consent banners and similar overlays.
 *
 * For each known label pattern, the first matching button and the first
 * matching link are clicked if present. Any error is swallowed so that a
 * stubborn overlay never aborts the caller.
 *
 * @param {object} page - Page object exposing getByRole.
 * @returns {Promise<void>}
 */
export async function dismissCommonOverlays(page) {
  const dismissPatterns = [
    /accept/i,
    /agree/i,
    /close/i,
    /got it/i,
    /continue/i,
    /dismiss/i,
    /not now/i,
  ];
  const roles = ["button", "link"];

  // Clicks the locator when it matches something; never throws.
  const tryDismiss = async (locator) => {
    try {
      const matches = await locator.count();
      if (!matches) return;
      await locator.click({ timeout: 2500 });
      await sleep(300);
    } catch {
      // Best-effort overlay dismissal only.
    }
  };

  for (const pattern of dismissPatterns) {
    const locators = roles.map((role) => page.getByRole(role, { name: pattern }).first());
    for (const locator of locators) {
      await tryDismiss(locator);
    }
  }
}
/**
 * Click the first element that looks like a way into the photo gallery.
 *
 * Each label is tried in order as a button name, a link name, and plain
 * text. Candidates that are absent or fail to click are skipped.
 *
 * @param {object} page - Page object exposing getByRole and getByText.
 * @param {Array<string | RegExp>} labels - Labels to try, in priority order.
 * @returns {Promise<string>} String form of the label whose element was clicked.
 * @throws {Error} When no candidate could be clicked.
 */
export async function clickPhotoEntryPoint(page, labels) {
  // Resolves to { clicked } on success, or null when the candidate is absent or fails.
  const attempt = async (locator, label) => {
    try {
      const present = await locator.count();
      if (!present) return null;
      await locator.scrollIntoViewIfNeeded().catch(() => {});
      await locator.click({ timeout: CLICK_TIMEOUT_MS });
      return { clicked: label.toString() };
    } catch {
      // Keep trying the next candidate.
      return null;
    }
  };

  for (const label of labels) {
    const candidates = [
      page.getByRole("button", { name: label }).first(),
      page.getByRole("link", { name: label }).first(),
      page.getByText(label).first(),
    ];
    for (const candidate of candidates) {
      const hit = await attempt(candidate, label);
      if (hit !== null) return hit.clicked;
    }
  }

  throw new Error("Could not find a photo entry point.");
}
/**
 * Wait for the photo view to appear after an entry-point click.
 *
 * Proceeds as soon as any of these settles: the URL differs from
 * `previousUrl`, the network goes idle, or `waitMs` elapses. A further
 * `waitMs` pause is then applied unconditionally.
 *
 * @param {object} page - Page object exposing waitForURL and waitForLoadState.
 * @param {string} previousUrl - URL recorded before the click.
 * @param {number} [waitMs] - Delay in milliseconds for both waits.
 * @returns {Promise<void>}
 */
export async function waitForPhotoExperience(page, previousUrl, waitMs = DEFAULT_WAIT_MS) {
  const ignoreFailure = () => {};

  const urlChanged = page
    .waitForURL((url) => url.toString() !== previousUrl, { timeout: NAV_TIMEOUT_MS })
    .catch(ignoreFailure);
  const networkQuiet = page
    .waitForLoadState("networkidle", { timeout: 15000 })
    .catch(ignoreFailure);
  const minimumWait = sleep(waitMs);

  await Promise.race([urlChanged, networkQuiet, minimumWait]);
  await sleep(waitMs);
}
/**
 * Scroll to the bottom repeatedly until the scroll height stops changing,
 * then return to the top.
 *
 * Each pass records the height before scrolling, jumps to the bottom and
 * pauses. The loop ends when that height equals the previous pass's height
 * or after `passes` iterations.
 *
 * @param {object} page - Page object exposing evaluate.
 * @param {number} [passes] - Maximum number of scroll passes.
 * @returns {Promise<void>}
 */
export async function scrollUntilSettled(page, passes = MAX_SCROLL_PASSES) {
  // These callbacks are handed to page.evaluate, so they only touch `document`
  // and must not close over any variable from this function.
  const readScrollHeight = () => {
    const root = document.scrollingElement || document.documentElement || document.body;
    return root ? root.scrollHeight : 0;
  };
  const jumpToBottom = () => {
    const root = document.scrollingElement || document.documentElement || document.body;
    if (root) root.scrollTo({ top: root.scrollHeight, behavior: "instant" });
  };
  const jumpToTop = () => {
    const root = document.scrollingElement || document.documentElement || document.body;
    if (root) root.scrollTo({ top: 0, behavior: "instant" });
  };

  let lastHeight = 0;
  let pass = 0;
  while (pass < passes) {
    const heightBeforeScroll = await page.evaluate(readScrollHeight);
    await page.evaluate(jumpToBottom);
    await sleep(SCROLL_PAUSE_MS);
    if (heightBeforeScroll === lastHeight) {
      break;
    }
    lastHeight = heightBeforeScroll;
    pass += 1;
  }

  await page.evaluate(jumpToTop);
  await sleep(250);
}
/**
 * Filter, canonicalize and de-duplicate raw image candidates.
 *
 * A candidate is kept when its URL is an absolute, non-`data:` URL that
 * passes the host and pathname filters and whose known dimensions meet the
 * minimums (a dimension of 0 / unknown is not filtered). Query string and
 * fragment are stripped before de-duplication; the first occurrence wins.
 *
 * @param {Array<object>} candidates - Items with `url` and optional
 *   `width`/`height` or `naturalWidth`/`naturalHeight`.
 * @param {object} [options]
 * @param {string[]} [options.hostIncludes] - If non-empty, the host must contain one of these.
 * @param {string[]} [options.hostExcludes] - The host must contain none of these.
 * @param {string[]} [options.pathnameIncludes] - If non-empty, the path must contain one of these.
 * @param {number} [options.minWidth] - Minimum known width.
 * @param {number} [options.minHeight] - Minimum known height.
 * @returns {Array<{url: string, width: number, height: number, host: string, pathname: string}>}
 */
export function normalizeImageCandidates(candidates, options = {}) {
  const {
    hostIncludes = [],
    hostExcludes = [],
    pathnameIncludes = [],
    minWidth = LARGE_IMAGE_MIN_WIDTH,
    minHeight = LARGE_IMAGE_MIN_HEIGHT,
  } = options;

  const containsAny = (text, parts) => parts.some((part) => text.includes(part));
  // A falsy size means "unknown" and is never treated as too small.
  const isTooSmall = (size, minimum) => Boolean(size) && size < minimum;

  // Map preserves insertion order, so the first candidate per canonical URL wins.
  const byCanonicalUrl = new Map();

  for (const candidate of candidates || []) {
    const rawUrl = typeof candidate?.url === "string" ? candidate.url.trim() : "";
    if (rawUrl === "" || rawUrl.startsWith("data:")) continue;

    let location;
    try {
      location = new URL(rawUrl);
    } catch {
      continue;
    }

    const host = location.hostname.toLowerCase();
    const pathname = location.pathname.toLowerCase();
    const width = Number(candidate.width || candidate.naturalWidth || 0);
    const height = Number(candidate.height || candidate.naturalHeight || 0);

    const hostAllowed = !hostIncludes.length || containsAny(host, hostIncludes);
    if (!hostAllowed) continue;
    if (containsAny(host, hostExcludes)) continue;
    const pathAllowed = !pathnameIncludes.length || containsAny(pathname, pathnameIncludes);
    if (!pathAllowed) continue;
    if (isTooSmall(width, minWidth) || isTooSmall(height, minHeight)) continue;

    location.hash = "";
    location.search = "";
    const canonical = location.toString();
    if (byCanonicalUrl.has(canonical)) continue;

    byCanonicalUrl.set(canonical, { url: canonical, width, height, host, pathname });
  }

  return Array.from(byCanonicalUrl.values());
}
/**
 * Collect image URL candidates from the rendered DOM.
 *
 * Sources, in order: every <img> (its current source plus each srcset
 * entry), every <source> srcset entry, and every anchor whose href ends in a
 * common image extension. Dimensions are reported for <img> elements and are
 * 0 elsewhere.
 *
 * Every URL is resolved against `document.baseURI`. Raw srcset entries and
 * href attributes may be relative or protocol-relative, and downstream
 * parsing with `new URL(url)` (no base) would silently discard those.
 *
 * @param {object} page - Page object exposing evaluate.
 * @returns {Promise<Array<{url: string, width: number, height: number}>>}
 */
export async function collectRenderedImageCandidates(page) {
  // The callback is handed to page.evaluate; it only uses DOM globals and
  // does not close over anything from this module.
  return page.evaluate(() => {
    const out = [];

    // Resolve relative and protocol-relative URLs; keep the raw text if it is unparseable.
    const toAbsolute = (url) => {
      try {
        return new URL(url, document.baseURI).href;
      } catch {
        return url;
      }
    };

    const addUrl = (url, width, height) => {
      if (!url) return;
      out.push({ url: toAbsolute(url), width: Number(width || 0), height: Number(height || 0) });
    };

    // Takes the URL part of each comma-separated srcset entry, dropping descriptors.
    const parseSrcset = (srcset) => {
      if (!srcset) return [];
      return srcset
        .split(",")
        .map((entry) => entry.trim().split(/\s+/)[0])
        .filter(Boolean);
    };

    for (const node of Array.from(document.querySelectorAll("img, source"))) {
      if (node instanceof HTMLImageElement) {
        // Prefer intrinsic size; fall back to layout size when not yet decoded.
        const width = node.naturalWidth || node.clientWidth;
        const height = node.naturalHeight || node.clientHeight;
        addUrl(node.currentSrc || node.src, width, height);
        for (const url of parseSrcset(node.srcset)) {
          addUrl(url, width, height);
        }
      } else if (node instanceof HTMLSourceElement) {
        for (const url of parseSrcset(node.srcset)) {
          addUrl(url, 0, 0);
        }
      }
    }

    for (const anchor of Array.from(document.querySelectorAll("a[href]"))) {
      const href = anchor.getAttribute("href") || "";
      // The extension may be followed by a query string or a fragment.
      if (/\.(?:jpg|jpeg|png|webp)(?:$|[?#])/i.test(href)) {
        addUrl(href, 0, 0);
      }
    }

    return out;
  });
}
/**
 * Assemble the result record for a photo-extraction run.
 *
 * @param {object} params
 * @param {string} params.requestedUrl - URL the caller asked for.
 * @param {object} params.page - Page object; its `url()` supplies `finalUrl`.
 * @param {*} params.clickedLabel - Label of the entry point that was clicked.
 * @param {string[]} params.imageUrls - Extracted image URLs.
 * @param {string} params.source - Identifier of the originating site/extractor.
 * @param {string[]} [params.notes] - Free-form notes; defaults to an empty array.
 * @returns {object} Record with `title` always set to null and `photoCount`
 *   equal to `imageUrls.length`.
 */
export function buildResult({ requestedUrl, page, clickedLabel, imageUrls, source, notes = [] }) {
  const finalUrl = page.url();
  const photoCount = imageUrls.length;
  return {
    source,
    requestedUrl,
    finalUrl,
    title: null,
    clickedLabel,
    photoCount,
    imageUrls,
    notes,
  };
}