Add Zillow and HAR photo extractors
This commit is contained in:
292
skills/web-automation/scripts/real-estate-photo-common.js
Normal file
292
skills/web-automation/scripts/real-estate-photo-common.js
Normal file
@@ -0,0 +1,292 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
// Default settle delay in milliseconds; used as the default `waitMs` by
// gotoListing and waitForPhotoExperience.
const DEFAULT_WAIT_MS = 4000;
|
||||
// Navigation timeout in milliseconds; applied to page.goto, the page's default
// navigation timeout, and the URL-change wait in waitForPhotoExperience.
const NAV_TIMEOUT_MS = 45000;
|
||||
// Timeout in milliseconds set as the page's default action timeout and passed
// to the click performed by clickPhotoEntryPoint.
const CLICK_TIMEOUT_MS = 15000;
|
||||
// Default upper bound on scroll-to-bottom passes in scrollUntilSettled.
const MAX_SCROLL_PASSES = 12;
|
||||
// Pause in milliseconds after each scroll-to-bottom pass in scrollUntilSettled.
const SCROLL_PAUSE_MS = 900;
|
||||
// Default `minWidth` for normalizeImageCandidates: candidates reporting a
// non-zero width below this are dropped.
const LARGE_IMAGE_MIN_WIDTH = 300;
|
||||
// Default `minHeight` for normalizeImageCandidates: candidates reporting a
// non-zero height below this are dropped.
const LARGE_IMAGE_MIN_HEIGHT = 200;
|
||||
|
||||
/**
 * Report a fatal error as a single JSON line on stderr and terminate the
 * process with exit code 1.
 *
 * @param {string} message - Value written under the `error` key.
 * @param {*} [details] - Extra context; written under `details` only when truthy.
 */
export function fail(message, details) {
  // Keep `error` first so the serialized key order stays stable.
  const report = details ? { error: message, details } : { error: message };
  const line = JSON.stringify(report);
  process.stderr.write(`${line}\n`);
  process.exit(1);
}
|
||||
|
||||
/**
 * Validate a user-supplied listing URL and return its serialized form.
 *
 * Any validation failure is reported through `fail`, which writes a JSON
 * error to stderr and exits the process.
 *
 * @param {string} rawUrl - URL text to validate.
 * @returns {string} The URL as serialized by the WHATWG `URL` parser.
 */
export function parseTarget(rawUrl) {
  if (!rawUrl) {
    fail("Missing URL.");
  }

  let target;
  try {
    target = new URL(rawUrl);
  } catch (cause) {
    const reason = cause instanceof Error ? cause.message : String(cause);
    fail("Invalid URL.", reason);
  }

  // Only web schemes are accepted; everything else is rejected.
  const isWebScheme = target.protocol === "http:" || target.protocol === "https:";
  if (!isWebScheme) {
    fail("Only http and https URLs are allowed.");
  }

  return target.toString();
}
|
||||
|
||||
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay in milliseconds, handed to `setTimeout`.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
export function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * Dynamically import the `cloakbrowser` package.
 *
 * When the import fails, the error is reported through `fail` together with
 * an install hint.
 *
 * @returns {Promise<object>} The imported module namespace.
 */
export async function loadCloakBrowser() {
  let cloakModule;
  try {
    cloakModule = await import("cloakbrowser");
  } catch (cause) {
    const reason = cause instanceof Error ? cause.message : String(cause);
    fail(
      "CloakBrowser is not installed for this skill. Run pnpm install in skills/web-automation/scripts first.",
      reason
    );
  }
  return cloakModule;
}
|
||||
|
||||
/**
 * Run `fn` while `console.log` and `console.error` are temporarily replaced
 * by writers that send the space-joined arguments, plus a newline, to stderr.
 * The original console methods are restored whether `fn` succeeds or throws.
 *
 * @param {Function} fn - Callback to run; its result is awaited.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 */
export async function runWithStderrLogs(fn) {
  const saved = { log: console.log, error: console.error };
  const makeStderrWriter = () => (...parts) => process.stderr.write(`${parts.join(" ")}\n`);

  console.log = makeStderrWriter();
  console.error = makeStderrWriter();

  try {
    const outcome = await fn();
    return outcome;
  } finally {
    console.log = saved.log;
    console.error = saved.error;
  }
}
|
||||
|
||||
/**
 * Launch a CloakBrowser context and open one page configured with the
 * module's default action and navigation timeouts.
 *
 * Console output produced while ensuring the browser binary and launching the
 * context is redirected to stderr via `runWithStderrLogs`.
 *
 * @param {object} [options]
 * @param {boolean} [options.headless=true] - Forwarded to `launchContext`.
 * @returns {Promise<{context: object, page: object}>} The launched context and
 *   its new page. The caller owns both and is responsible for closing them.
 * @throws Rethrows any error from page creation or setup, after closing the
 *   context that was already launched.
 */
export async function createPageSession({ headless = true } = {}) {
  const { ensureBinary, launchContext } = await loadCloakBrowser();
  await runWithStderrLogs(() => ensureBinary());
  const context = await runWithStderrLogs(() =>
    launchContext({
      headless,
      humanize: true,
      locale: "en-US",
      viewport: { width: 1440, height: 900 },
    })
  );

  try {
    const page = await context.newPage();
    page.setDefaultTimeout(CLICK_TIMEOUT_MS);
    page.setDefaultNavigationTimeout(NAV_TIMEOUT_MS);
    return { context, page };
  } catch (error) {
    // The caller never receives the context on this path, so close it here
    // rather than leaking it. Cleanup is best-effort; the original error is
    // the one worth reporting.
    await context.close().catch(() => {});
    throw error;
  }
}
|
||||
|
||||
/**
 * Navigate to a listing URL and give the page time to settle.
 *
 * Waits for `domcontentloaded`, then makes a best-effort wait for network
 * idle (a timeout there is ignored), then sleeps for `waitMs`.
 *
 * @param {object} page - Page object exposing `goto` and `waitForLoadState`.
 * @param {string} url - Address to open.
 * @param {number} [waitMs=DEFAULT_WAIT_MS] - Extra settle delay in milliseconds.
 * @returns {Promise<void>}
 */
export async function gotoListing(page, url, waitMs = DEFAULT_WAIT_MS) {
  const navigation = { waitUntil: "domcontentloaded", timeout: NAV_TIMEOUT_MS };
  await page.goto(url, navigation);

  // Pages that never go idle should not fail the whole run.
  const networkIdle = page.waitForLoadState("networkidle", { timeout: 15000 });
  await networkIdle.catch(() => {});

  await sleep(waitMs);
}
|
||||
|
||||
/**
 * Best-effort dismissal of consent banners and similar overlays.
 *
 * For each label pattern, the first matching button and then the first
 * matching link are clicked if present. Every failure is ignored.
 *
 * @param {object} page - Page object exposing `getByRole`.
 * @returns {Promise<void>}
 */
export async function dismissCommonOverlays(page) {
  const patterns = [
    /accept/i,
    /agree/i,
    /close/i,
    /got it/i,
    /continue/i,
    /dismiss/i,
    /not now/i,
  ];
  const roles = ["button", "link"];

  for (const pattern of patterns) {
    const locators = roles.map((role) => page.getByRole(role, { name: pattern }).first());

    for (const locator of locators) {
      try {
        if (!(await locator.count())) continue;
        await locator.click({ timeout: 2500 });
        await sleep(300);
      } catch {
        // Best-effort overlay dismissal only.
      }
    }
  }
}
|
||||
|
||||
/**
 * Click the first element that opens the photo gallery.
 *
 * Labels are tried in order. For each label the first matching button, then
 * link, then text node is attempted. A candidate that fails to click is
 * skipped and the search continues.
 *
 * @param {object} page - Page object exposing `getByRole` and `getByText`.
 * @param {Iterable<string|RegExp>} labels - Accessible names or text to look for.
 * @returns {Promise<string>} `toString()` of the label that was clicked.
 * @throws {Error} When no candidate could be clicked.
 */
export async function clickPhotoEntryPoint(page, labels) {
  const locatorFactories = [
    (label) => page.getByRole("button", { name: label }).first(),
    (label) => page.getByRole("link", { name: label }).first(),
    (label) => page.getByText(label).first(),
  ];

  for (const label of labels) {
    const candidates = locatorFactories.map((locate) => locate(label));

    for (const candidate of candidates) {
      try {
        const matches = await candidate.count();
        if (!matches) continue;

        await candidate.scrollIntoViewIfNeeded().catch(() => {});
        await candidate.click({ timeout: CLICK_TIMEOUT_MS });
        return label.toString();
      } catch {
        // Keep trying the next candidate.
      }
    }
  }

  throw new Error("Could not find a photo entry point.");
}
|
||||
|
||||
/**
 * Wait for the photo view to appear after the entry point was clicked.
 *
 * Resolves the first of: the URL differing from `previousUrl`, the network
 * going idle, or `waitMs` elapsing. It then always sleeps a further `waitMs`.
 * Timeouts of the URL and network waits are ignored.
 *
 * @param {object} page - Page object exposing `waitForURL` and `waitForLoadState`.
 * @param {string} previousUrl - URL string captured before the click.
 * @param {number} [waitMs=DEFAULT_WAIT_MS] - Delay in milliseconds.
 * @returns {Promise<void>}
 */
export async function waitForPhotoExperience(page, previousUrl, waitMs = DEFAULT_WAIT_MS) {
  const urlChanged = page
    .waitForURL((current) => current.toString() !== previousUrl, { timeout: NAV_TIMEOUT_MS })
    .catch(() => {});
  const networkIdle = page.waitForLoadState("networkidle", { timeout: 15000 }).catch(() => {});
  const minimumWait = sleep(waitMs);

  await Promise.race([urlChanged, networkIdle, minimumWait]);
  await sleep(waitMs);
}
|
||||
|
||||
/**
 * Repeatedly scroll to the bottom of the page until the scroll height
 * measured before a pass equals the one measured before the previous pass
 * (or `passes` is reached), then scroll back to the top.
 *
 * @param {object} page - Page object exposing `evaluate`.
 * @param {number} [passes=MAX_SCROLL_PASSES] - Maximum number of scroll passes.
 * @returns {Promise<void>}
 */
export async function scrollUntilSettled(page, passes = MAX_SCROLL_PASSES) {
  // Callbacks handed to page.evaluate must be self-contained, so each one
  // looks up the scroll root on its own.
  const measureHeight = () =>
    page.evaluate(() => {
      const root = document.scrollingElement || document.documentElement || document.body;
      return root ? root.scrollHeight : 0;
    });
  const jump = (toBottom) =>
    page.evaluate((goToBottom) => {
      const root = document.scrollingElement || document.documentElement || document.body;
      if (root) root.scrollTo({ top: goToBottom ? root.scrollHeight : 0, behavior: "instant" });
    }, toBottom);

  let lastHeight = 0;
  let pass = 0;
  while (pass < passes) {
    const heightBeforeScroll = await measureHeight();
    await jump(true);
    await sleep(SCROLL_PAUSE_MS);

    if (heightBeforeScroll === lastHeight) {
      break;
    }
    lastHeight = heightBeforeScroll;
    pass += 1;
  }

  await jump(false);
  await sleep(250);
}
|
||||
|
||||
/**
 * Filter, canonicalize and de-duplicate raw image candidates.
 *
 * A candidate is kept when its URL is an absolute http(s) URL, passes the
 * host and pathname filters, and does not report a known dimension below the
 * minimum. Query string and fragment are stripped before de-duplication, and
 * the stripped URL is the one returned.
 *
 * Host and pathname are compared in lower case, so the filter fragments are
 * lower-cased as well. Without that, a fragment containing an uppercase
 * letter could never match.
 *
 * @param {Array<{url: string, width?: number, height?: number,
 *   naturalWidth?: number, naturalHeight?: number}>} candidates - Raw
 *   candidates. `null`/`undefined` is treated as an empty list, and entries
 *   without a string `url` are skipped.
 * @param {object} [options]
 * @param {string[]} [options.hostIncludes=[]] - Keep only hosts containing at
 *   least one of these fragments (case-insensitive). Empty means no restriction.
 * @param {string[]} [options.hostExcludes=[]] - Drop hosts containing any of
 *   these fragments (case-insensitive).
 * @param {string[]} [options.pathnameIncludes=[]] - Keep only pathnames
 *   containing at least one of these fragments (case-insensitive).
 * @param {number} [options.minWidth=LARGE_IMAGE_MIN_WIDTH] - Minimum width; a
 *   width of 0 (unknown) is not filtered.
 * @param {number} [options.minHeight=LARGE_IMAGE_MIN_HEIGHT] - Minimum height;
 *   a height of 0 (unknown) is not filtered.
 * @returns {Array<{url: string, width: number, height: number, host: string,
 *   pathname: string}>} Unique candidates in first-seen order. `host` and
 *   `pathname` are lower-cased.
 */
export function normalizeImageCandidates(candidates, options = {}) {
  const {
    hostIncludes = [],
    hostExcludes = [],
    pathnameIncludes = [],
    minWidth = LARGE_IMAGE_MIN_WIDTH,
    minHeight = LARGE_IMAGE_MIN_HEIGHT,
  } = options;

  const toLowerParts = (parts) => parts.map((part) => String(part).toLowerCase());
  const wantedHosts = toLowerParts(hostIncludes);
  const blockedHosts = toLowerParts(hostExcludes);
  const wantedPaths = toLowerParts(pathnameIncludes);

  // Non-numeric dimensions count as "unknown" (0) instead of leaking NaN.
  const toDimension = (primary, fallback) => {
    const value = Number(primary || fallback || 0);
    return Number.isFinite(value) ? value : 0;
  };

  const seen = new Set();
  const normalized = [];

  for (const candidate of candidates || []) {
    const rawUrl = typeof candidate?.url === "string" ? candidate.url.trim() : "";
    // Cheap pre-check so potentially huge data: URLs are never parsed.
    if (!rawUrl || /^data:/i.test(rawUrl)) continue;

    let parsed;
    try {
      parsed = new URL(rawUrl);
    } catch {
      // Relative or malformed URLs cannot be canonicalized here.
      continue;
    }

    // blob:, javascript:, file: and similar are not fetchable photo URLs.
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") continue;

    const host = parsed.hostname.toLowerCase();
    const pathname = parsed.pathname.toLowerCase();
    const width = toDimension(candidate.width, candidate.naturalWidth);
    const height = toDimension(candidate.height, candidate.naturalHeight);

    if (wantedHosts.length && !wantedHosts.some((part) => host.includes(part))) continue;
    if (blockedHosts.some((part) => host.includes(part))) continue;
    if (wantedPaths.length && !wantedPaths.some((part) => pathname.includes(part))) continue;
    if (width && width < minWidth) continue;
    if (height && height < minHeight) continue;

    parsed.hash = "";
    parsed.search = "";
    const canonical = parsed.toString();
    if (seen.has(canonical)) continue;
    seen.add(canonical);

    normalized.push({ url: canonical, width, height, host, pathname });
  }

  return normalized;
}
|
||||
|
||||
/**
 * Collect image URL candidates from the rendered DOM of `page`.
 *
 * Sources scanned inside the page:
 *  - `<img>`: `currentSrc`/`src` plus every `srcset` entry, tagged with the
 *    image's natural size (falling back to its client size);
 *  - `<source>`: every `srcset` entry, with unknown (0) size;
 *  - `<a href>`: hrefs whose path ends in .jpg/.jpeg/.png/.webp, optionally
 *    followed by a query string or fragment, with unknown (0) size.
 *
 * Every URL is resolved against `document.baseURI`, so relative `srcset` and
 * `href` values come back absolute. `new URL(relative)` with no base throws,
 * so a consumer parsing these without a base would otherwise lose them.
 *
 * @param {object} page - Page object exposing `evaluate(fn)`, which runs `fn`
 *   in the document.
 * @returns {Promise<Array<{url: string, width: number, height: number}>>}
 *   Candidates in DOM order (images and sources first, then anchors).
 *   Duplicates are not removed.
 */
export async function collectRenderedImageCandidates(page) {
  return page.evaluate(() => {
    const out = [];

    const toAbsolute = (url) => {
      try {
        return new URL(url, document.baseURI).href;
      } catch {
        // Leave unparseable values untouched.
        return url;
      }
    };

    const addUrl = (url, width, height) => {
      if (!url) return;
      out.push({ url: toAbsolute(url), width: Number(width || 0), height: Number(height || 0) });
    };

    // Keeps only the URL part of each comma-separated srcset entry.
    const parseSrcset = (srcset) => {
      if (!srcset) return [];
      return srcset
        .split(",")
        .map((entry) => entry.trim().split(/\s+/)[0])
        .filter(Boolean);
    };

    const all = Array.from(document.querySelectorAll("img, source"));
    for (const node of all) {
      if (node instanceof HTMLImageElement) {
        const width = node.naturalWidth || node.clientWidth;
        const height = node.naturalHeight || node.clientHeight;
        addUrl(node.currentSrc || node.src, width, height);
        for (const url of parseSrcset(node.srcset)) {
          addUrl(url, width, height);
        }
      } else if (node instanceof HTMLSourceElement) {
        for (const url of parseSrcset(node.srcset)) {
          addUrl(url, 0, 0);
        }
      }
    }

    for (const anchor of Array.from(document.querySelectorAll("a[href]"))) {
      const href = anchor.getAttribute("href") || "";
      // Accept a trailing query string or fragment after the extension.
      if (/\.(?:jpg|jpeg|png|webp)(?:$|[?#])/i.test(href)) {
        addUrl(href, 0, 0);
      }
    }

    return out;
  });
}
|
||||
|
||||
/**
 * Assemble the result object reported by an extractor.
 *
 * @param {object} params
 * @param {string} params.requestedUrl - URL the caller asked for.
 * @param {object} params.page - Page object; `page.url()` supplies `finalUrl`.
 * @param {string} params.clickedLabel - Label of the entry point that was clicked.
 * @param {string[]} params.imageUrls - Extracted image URLs; its length becomes
 *   `photoCount`.
 * @param {string} params.source - Identifier of the source being scraped.
 * @param {Array} [params.notes=[]] - Optional notes to include in the result.
 * @returns {object} Result with keys `source`, `requestedUrl`, `finalUrl`,
 *   `title` (always `null` here), `clickedLabel`, `photoCount`, `imageUrls`,
 *   `notes`, in that order.
 */
export function buildResult({ requestedUrl, page, clickedLabel, imageUrls, source, notes = [] }) {
  const finalUrl = page.url();
  const photoCount = imageUrls.length;

  return {
    source,
    requestedUrl,
    finalUrl,
    title: null,
    clickedLabel,
    photoCount,
    imageUrls,
    notes,
  };
}
|
||||
|
||||
Reference in New Issue
Block a user