Add Zillow and HAR photo extractors

This commit is contained in:
2026-03-27 17:35:46 -05:00
parent e7c56fe760
commit eeea0c8ef1
11 changed files with 873 additions and 8 deletions

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env node
import fs from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";
// ES modules do not provide __filename/__dirname, so derive them from this
// module's URL. __dirname is used below to locate browse.ts next to this script.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * Report a fatal error as a single JSON line on stderr and exit with status 1.
 *
 * @param {string} message - Short summary, emitted under the `error` key.
 * @param {string} [details] - Extra context; the `details` key is only emitted when this is truthy.
 */
function fail(message, details) {
  const report = details ? { error: message, details } : { error: message };
  process.stderr.write(`${JSON.stringify(report)}\n`);
  process.exit(1);
}
/**
 * Verify the install: both runtime packages must be importable, and browse.ts
 * (located next to this script) must reference CloakBrowser's persistent
 * context launcher. Prints two "OK" lines on success; otherwise reports the
 * problem through fail().
 */
async function main() {
  try {
    await import("cloakbrowser");
    await import("playwright-core");
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    fail(
      "Missing dependency/config: web-automation requires cloakbrowser and playwright-core.",
      reason
    );
  }

  const browseSource = fs.readFileSync(path.join(__dirname, "browse.ts"), "utf8");
  const launchesPersistentContext = /launchPersistentContext/.test(browseSource);
  const importsCloakBrowser = /from ['"]cloakbrowser['"]/.test(browseSource);
  if (!(launchesPersistentContext && importsCloakBrowser)) {
    fail("browse.ts is not configured for CloakBrowser.");
  }

  const okLines = [
    "OK: cloakbrowser + playwright-core installed\n",
    "OK: CloakBrowser integration detected in browse.ts\n",
  ];
  for (const line of okLines) {
    process.stdout.write(line);
  }
}
// Run the check; anything main() did not report itself (for example browse.ts
// being unreadable) is surfaced as a generic install-check failure.
main().catch((error) => {
  const reason = error instanceof Error ? error.message : String(error);
  fail("Install check failed.", reason);
});

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env node
import {
clickPhotoEntryPoint,
collectRenderedImageCandidates,
createPageSession,
dismissCommonOverlays,
fail,
gotoListing,
normalizeImageCandidates,
parseTarget,
scrollUntilSettled,
sleep,
waitForPhotoExperience,
} from "./real-estate-photo-common.js";
// Patterns for the control that opens the full photo view. They are handed to
// clickPhotoEntryPoint(), which tries them in array order, so the list runs
// from the most specific wording to the bare "Photos" label. Every pattern is
// anchored (^...$) and case-insensitive.
const HAR_LABELS = [
/^Show all photos$/i,
/^View all photos$/i,
/^All photos$/i,
/^Photos$/i,
];
/**
 * Read the photo count the page announces in its visible text.
 *
 * Looks for the first "<digits> photos" occurrence (case-insensitive) in the
 * body's innerText.
 *
 * @param {object} page - Page object exposing `evaluate(fn)`.
 * @returns {Promise<number|null>} The announced count, or null when no such text exists.
 */
async function getAnnouncedPhotoCount(page) {
  return page.evaluate(() => {
    const bodyText = document.body?.innerText || "";
    const found = /(\d+)\s+photos/i.exec(bodyText);
    if (!found) {
      return null;
    }
    return Number(found[1]);
  });
}
/**
 * Extract photo URLs from a HAR listing and print a JSON report on stdout.
 *
 * Flow: validate the URL given as the first CLI argument, open it in a browser
 * session (headless unless HEADLESS=false), dismiss overlays, click into the
 * all-photos view, scroll until the page height stops growing, then collect
 * and filter rendered image URLs hosted on the HAR static photo hosts.
 *
 * Any failure inside the try block closes the browser context and is then
 * reported through fail(), which writes a JSON error to stderr and exits 1.
 */
async function main() {
  const requestedUrl = parseTarget(process.argv[2]);
  const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
  try {
    await gotoListing(page, requestedUrl);
    await dismissCommonOverlays(page);
    // Read the announced count before navigating into the photo view.
    const expectedPhotoCount = await getAnnouncedPhotoCount(page);
    const beforeUrl = page.url();
    const clickedLabel = await clickPhotoEntryPoint(page, HAR_LABELS);
    await waitForPhotoExperience(page, beforeUrl);
    await scrollUntilSettled(page);
    await sleep(1200);
    const candidates = await collectRenderedImageCandidates(page);
    const photos = normalizeImageCandidates(candidates, {
      hostIncludes: ["pics.harstatic.com", "photos.harstatic.com"],
      minWidth: 240,
      minHeight: 180,
    });
    if (!photos.length) {
      // Throw rather than calling fail() here: fail() exits the process
      // immediately, which would skip closing the browser context. The catch
      // below closes the context and reports this same message as the details.
      throw new Error("No large image URLs were found after opening the HAR all-photos view.");
    }
    const result = {
      source: "har",
      requestedUrl,
      finalUrl: page.url(),
      title: await page.title(),
      clickedLabel,
      expectedPhotoCount,
      // Without an announced count there is nothing to compare against.
      complete: expectedPhotoCount ? photos.length >= expectedPhotoCount : true,
      photoCount: photos.length,
      imageUrls: photos.map((photo) => photo.url),
      notes: ["Opened HAR all-photos flow and extracted large rendered image URLs from the photo page."],
    };
    process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
    await context.close();
  } catch (error) {
    try {
      await context.close();
    } catch {
      // Ignore close errors after the primary failure.
    }
    fail("HAR photo extraction failed.", error instanceof Error ? error.message : String(error));
  }
}
// createPageSession() runs before main()'s try block, so a browser launch
// failure rejects this promise. Report it in the same JSON error format as
// every other failure instead of leaving an unhandled rejection.
main().catch((error) => {
  fail("HAR photo extraction failed.", error instanceof Error ? error.message : String(error));
});

View File

@@ -4,15 +4,19 @@
"description": "Web browsing and scraping scripts using CloakBrowser",
"type": "module",
"scripts": {
"check-install": "node check-install.js",
"extract": "node extract.js",
"har-photos": "node har-photos.js",
"browse": "tsx browse.ts",
"scrape": "tsx scrape.ts",
"test:photos": "node --test real-estate-photo-common.test.mjs zillow-photo-data.test.mjs",
"zillow-photos": "node zillow-photos.js",
"fetch-browser": "npx cloakbrowser install"
},
"dependencies": {
"@mozilla/readability": "^0.5.0",
"better-sqlite3": "^12.6.2",
"cloakbrowser": "^0.3.14",
"cloakbrowser": "^0.3.18",
"jsdom": "^24.0.0",
"minimist": "^1.2.8",
"playwright-core": "^1.58.2",

View File

@@ -15,8 +15,8 @@ importers:
specifier: ^12.6.2
version: 12.6.2
cloakbrowser:
specifier: ^0.3.14
version: 0.3.14(mmdb-lib@3.0.1)(playwright-core@1.58.2)
specifier: ^0.3.18
version: 0.3.18(mmdb-lib@3.0.1)(playwright-core@1.58.2)
jsdom:
specifier: ^24.0.0
version: 24.1.3
@@ -301,8 +301,8 @@ packages:
resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==}
engines: {node: '>=18'}
cloakbrowser@0.3.14:
resolution: {integrity: sha512-8mcEVxfiNbAMHNa0B2IZKPtMDQ2peZlrScfQDJW+C9tjKG/P5Bg9wCweI0hnbaWR2ulG1MrxiEvTMMjz/SgmLw==}
cloakbrowser@0.3.18:
resolution: {integrity: sha512-As8boW2iodPh35XbHk2LSK7fIIgfcPmBpMpBiDv9yMmp6cuHLa7Td2scg8ZgzWJKhhak1uvL5RXQhE+lYxFFqQ==}
engines: {node: '>=18.0.0'}
hasBin: true
peerDependencies:
@@ -873,7 +873,7 @@ snapshots:
chownr@3.0.0: {}
cloakbrowser@0.3.14(mmdb-lib@3.0.1)(playwright-core@1.58.2):
cloakbrowser@0.3.18(mmdb-lib@3.0.1)(playwright-core@1.58.2):
dependencies:
tar: 7.5.11
optionalDependencies:

View File

@@ -0,0 +1,292 @@
#!/usr/bin/env node
// Default settle delay (ms) used by gotoListing() and waitForPhotoExperience().
const DEFAULT_WAIT_MS = 4000;
// Navigation timeout (ms): page.goto(), the page's default navigation timeout, and waitForURL().
const NAV_TIMEOUT_MS = 45000;
// Default action timeout (ms) for the page and for the photo entry-point click.
const CLICK_TIMEOUT_MS = 15000;
// Default upper bound on scroll-to-bottom passes in scrollUntilSettled().
const MAX_SCROLL_PASSES = 12;
// Pause (ms) after each scroll pass in scrollUntilSettled().
const SCROLL_PAUSE_MS = 900;
// Default minimum dimensions (px) applied by normalizeImageCandidates() when
// the caller does not pass minWidth/minHeight.
const LARGE_IMAGE_MIN_WIDTH = 300;
const LARGE_IMAGE_MIN_HEIGHT = 200;
/**
 * Report a fatal error as a single JSON line on stderr and exit with status 1.
 *
 * @param {string} message - Short summary, emitted under the `error` key.
 * @param {string} [details] - Extra context; the `details` key is only emitted when this is truthy.
 */
export function fail(message, details) {
  const report = details ? { error: message, details } : { error: message };
  process.stderr.write(`${JSON.stringify(report)}\n`);
  process.exit(1);
}
/**
 * Validate a user-supplied listing URL.
 *
 * Reports through fail() when the value is missing, cannot be parsed as a
 * URL, or uses a scheme other than http/https.
 *
 * @param {string} rawUrl - URL text, typically taken from the command line.
 * @returns {string} The URL in its serialized (normalized) form.
 */
export function parseTarget(rawUrl) {
  if (!rawUrl) {
    fail("Missing URL.");
  }

  let target;
  try {
    target = new URL(rawUrl);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    fail("Invalid URL.", reason);
  }

  const isWebScheme = ["http:", "https:"].includes(target.protocol);
  if (!isWebScheme) {
    fail("Only http and https URLs are allowed.");
  }
  return target.toString();
}
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
export function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Dynamically import the `cloakbrowser` package.
 *
 * @returns {Promise<object>} The imported module namespace. When the import
 *   fails, the problem is reported through fail() with install instructions.
 */
export async function loadCloakBrowser() {
  let cloakModule;
  try {
    cloakModule = await import("cloakbrowser");
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    fail(
      "CloakBrowser is not installed for this skill. Run pnpm install in skills/web-automation/scripts first.",
      reason
    );
  }
  return cloakModule;
}
/**
 * Run `fn` while console.log and console.error are redirected to stderr.
 *
 * Arguments are joined with single spaces and newline-terminated. The original
 * console methods are restored afterwards, whether `fn` resolves or throws.
 *
 * @param {() => any} fn - Callback to run; may be sync or async.
 * @returns {Promise<any>} Whatever `fn` resolves to.
 */
export async function runWithStderrLogs(fn) {
  const saved = { log: console.log, error: console.error };
  const writeToStderr = (...args) => process.stderr.write(`${args.join(" ")}\n`);
  console.log = writeToStderr;
  console.error = writeToStderr;
  try {
    return await fn();
  } finally {
    console.log = saved.log;
    console.error = saved.error;
  }
}
/**
 * Launch a CloakBrowser context and open one page in it.
 *
 * `ensureBinary` and `launchContext` both run with console output redirected
 * to stderr. The page gets the module's click and navigation timeouts as its
 * defaults.
 *
 * @param {{ headless?: boolean }} [options] - `headless` defaults to true.
 * @returns {Promise<{ context: object, page: object }>} The launched context and its new page.
 */
export async function createPageSession({ headless = true } = {}) {
  const { ensureBinary, launchContext } = await loadCloakBrowser();
  await runWithStderrLogs(() => ensureBinary());

  const launchOptions = {
    headless,
    humanize: true,
    locale: "en-US",
    viewport: { width: 1440, height: 900 },
  };
  const context = await runWithStderrLogs(() => launchContext(launchOptions));

  const page = await context.newPage();
  page.setDefaultTimeout(CLICK_TIMEOUT_MS);
  page.setDefaultNavigationTimeout(NAV_TIMEOUT_MS);
  return { context, page };
}
/**
 * Navigate to a listing and give it time to settle.
 *
 * Waits for DOMContentLoaded, then for network idle (at most 15 s; a timeout
 * there is ignored), then for a further fixed delay.
 *
 * @param {object} page - Page to navigate.
 * @param {string} url - Listing URL.
 * @param {number} [waitMs] - Extra settle delay in ms; defaults to DEFAULT_WAIT_MS.
 */
export async function gotoListing(page, url, waitMs = DEFAULT_WAIT_MS) {
  await page.goto(url, { waitUntil: "domcontentloaded", timeout: NAV_TIMEOUT_MS });
  try {
    await page.waitForLoadState("networkidle", { timeout: 15000 });
  } catch {
    // The page never going idle is not treated as a failure.
  }
  await sleep(waitMs);
}
/**
 * Best-effort dismissal of consent banners and similar overlays.
 *
 * For each label pattern, clicks the first matching button and the first
 * matching link when present. Click failures are ignored.
 *
 * @param {object} page - Page exposing `getByRole`.
 */
export async function dismissCommonOverlays(page) {
  const patterns = [
    /accept/i,
    /agree/i,
    /close/i,
    /got it/i,
    /continue/i,
    /dismiss/i,
    /not now/i,
  ];
  const roles = ["button", "link"];

  for (const pattern of patterns) {
    // Build both locators up front, then attempt them in role order.
    const locators = roles.map((role) => page.getByRole(role, { name: pattern }).first());
    for (const locator of locators) {
      try {
        const present = await locator.count();
        if (!present) continue;
        await locator.click({ timeout: 2500 });
        await sleep(300);
      } catch {
        // Best-effort overlay dismissal only.
      }
    }
  }
}
/**
 * Click the first control whose name or text matches one of `labels`.
 *
 * Labels are tried in order; for each label a button, then a link, then any
 * text match is attempted. A failing candidate falls through to the next one.
 *
 * @param {object} page - Page exposing `getByRole` and `getByText`.
 * @param {RegExp[]} labels - Patterns to try, most preferred first.
 * @returns {Promise<string>} String form of the label that was clicked.
 * @throws {Error} When no candidate could be clicked.
 */
export async function clickPhotoEntryPoint(page, labels) {
  for (const label of labels) {
    const locators = [
      page.getByRole("button", { name: label }).first(),
      page.getByRole("link", { name: label }).first(),
      page.getByText(label).first(),
    ];
    for (const locator of locators) {
      try {
        const present = await locator.count();
        if (!present) continue;
        // Scrolling is a convenience only; the click is still attempted if it fails.
        await locator.scrollIntoViewIfNeeded().catch(() => {});
        await locator.click({ timeout: CLICK_TIMEOUT_MS });
        return label.toString();
      } catch {
        // Keep trying the next candidate.
      }
    }
  }
  throw new Error("Could not find a photo entry point.");
}
/**
 * Wait for the photo view to appear after the entry point was clicked.
 *
 * Proceeds as soon as any of these settles: the URL differs from
 * `previousUrl`, the network goes idle, or `waitMs` elapses. Errors from the
 * first two are ignored. A further `waitMs` delay is then always applied.
 *
 * @param {object} page - Page being observed.
 * @param {string} previousUrl - URL captured before the click.
 * @param {number} [waitMs] - Delay in ms; defaults to DEFAULT_WAIT_MS.
 */
export async function waitForPhotoExperience(page, previousUrl, waitMs = DEFAULT_WAIT_MS) {
  const ignore = () => {};
  const urlChanged = page
    .waitForURL((url) => url.toString() !== previousUrl, { timeout: NAV_TIMEOUT_MS })
    .catch(ignore);
  const networkIdle = page.waitForLoadState("networkidle", { timeout: 15000 }).catch(ignore);
  const waitCap = sleep(waitMs);

  await Promise.race([urlChanged, networkIdle, waitCap]);
  await sleep(waitMs);
}
/**
 * Repeatedly scroll to the bottom of the page, then return to the top.
 *
 * Each pass records the scroll height, scrolls to the bottom and pauses. The
 * loop stops once the recorded height equals the one from the previous pass,
 * or after `passes` iterations.
 *
 * @param {object} page - Page exposing `evaluate(fn)`.
 * @param {number} [passes] - Maximum number of passes; defaults to MAX_SCROLL_PASSES.
 */
export async function scrollUntilSettled(page, passes = MAX_SCROLL_PASSES) {
  // These callbacks run inside the page, so each one is self-contained.
  const measureHeight = () => {
    const root = document.scrollingElement || document.documentElement || document.body;
    return root ? root.scrollHeight : 0;
  };
  const jumpToBottom = () => {
    const root = document.scrollingElement || document.documentElement || document.body;
    if (root) root.scrollTo({ top: root.scrollHeight, behavior: "instant" });
  };
  const jumpToTop = () => {
    const root = document.scrollingElement || document.documentElement || document.body;
    if (root) root.scrollTo({ top: 0, behavior: "instant" });
  };

  let lastHeight = 0;
  let pass = 0;
  while (pass < passes) {
    const height = await page.evaluate(measureHeight);
    await page.evaluate(jumpToBottom);
    await sleep(SCROLL_PAUSE_MS);
    if (height === lastHeight) {
      break;
    }
    lastHeight = height;
    pass += 1;
  }

  await page.evaluate(jumpToTop);
  await sleep(250);
}
/**
 * Filter and de-duplicate raw image candidates.
 *
 * A candidate is kept when its URL parses as an absolute, non-data URL, its
 * host and path pass the include/exclude substring filters, and its known
 * dimensions are at least the minimums (a dimension of 0 counts as unknown
 * and is not filtered). Query string and fragment are removed before
 * de-duplication; the first occurrence of each canonical URL wins.
 *
 * @param {Array<{url: string, width?: number, height?: number, naturalWidth?: number, naturalHeight?: number}>} candidates
 * @param {object} [options]
 * @param {string[]} [options.hostIncludes] - Keep only hosts containing one of these (when non-empty).
 * @param {string[]} [options.hostExcludes] - Drop hosts containing any of these.
 * @param {string[]} [options.pathnameIncludes] - Keep only paths containing one of these (when non-empty).
 * @param {number} [options.minWidth] - Defaults to LARGE_IMAGE_MIN_WIDTH.
 * @param {number} [options.minHeight] - Defaults to LARGE_IMAGE_MIN_HEIGHT.
 * @returns {Array<{url: string, width: number, height: number, host: string, pathname: string}>}
 *   Kept candidates in input order; `host` and `pathname` are lower-cased.
 */
export function normalizeImageCandidates(candidates, options = {}) {
  const {
    hostIncludes = [],
    hostExcludes = [],
    pathnameIncludes = [],
    minWidth = LARGE_IMAGE_MIN_WIDTH,
    minHeight = LARGE_IMAGE_MIN_HEIGHT,
  } = options;

  const containsAny = (text, parts) => parts.some((part) => text.includes(part));
  const byCanonicalUrl = new Map();

  for (const candidate of candidates || []) {
    const rawUrl = typeof candidate?.url === "string" ? candidate.url.trim() : "";
    if (rawUrl === "" || rawUrl.startsWith("data:")) continue;

    let parsed;
    try {
      parsed = new URL(rawUrl);
    } catch {
      continue;
    }

    const host = parsed.hostname.toLowerCase();
    const pathname = parsed.pathname.toLowerCase();
    const width = Number(candidate.width || candidate.naturalWidth || 0);
    const height = Number(candidate.height || candidate.naturalHeight || 0);

    const rejected =
      (hostIncludes.length && !containsAny(host, hostIncludes)) ||
      containsAny(host, hostExcludes) ||
      (pathnameIncludes.length && !containsAny(pathname, pathnameIncludes)) ||
      (width && width < minWidth) ||
      (height && height < minHeight);
    if (rejected) continue;

    parsed.hash = "";
    parsed.search = "";
    const canonical = parsed.toString();
    if (byCanonicalUrl.has(canonical)) continue;
    byCanonicalUrl.set(canonical, { url: canonical, width, height, host, pathname });
  }

  return [...byCanonicalUrl.values()];
}
/**
 * Collect image URL candidates from the rendered page.
 *
 * Sources, in order: every <img> (its current source plus each srcset entry,
 * tagged with the image's natural size, falling back to its client size),
 * every <source> (srcset entries, size unknown), and every <a href> whose raw
 * href ends in .jpg/.jpeg/.png/.webp, optionally followed by a query string.
 *
 * All URLs are resolved against the document base URI. Raw srcset entries and
 * href attributes may be relative or protocol-relative, and
 * normalizeImageCandidates() discards anything `new URL()` cannot parse
 * without a base, so unresolved values would be silently lost.
 *
 * @param {object} page - Page exposing `evaluate(fn)`.
 * @returns {Promise<Array<{url: string, width: number, height: number}>>}
 *   Candidates in document order; a width/height of 0 means unknown.
 */
export async function collectRenderedImageCandidates(page) {
  // Everything inside this callback runs in the page, so it must not reference
  // anything from module scope.
  return page.evaluate(() => {
    const out = [];
    const toAbsolute = (url) => {
      try {
        return new URL(url, document.baseURI).href;
      } catch {
        // Leave unparseable values untouched; downstream filtering drops them.
        return url;
      }
    };
    const addUrl = (url, width, height) => {
      if (!url) return;
      out.push({ url: toAbsolute(url), width: Number(width || 0), height: Number(height || 0) });
    };
    const parseSrcset = (srcset) => {
      if (!srcset) return [];
      return srcset
        .split(",")
        .map((entry) => entry.trim().split(/\s+/)[0])
        .filter(Boolean);
    };
    const all = Array.from(document.querySelectorAll("img, source"));
    for (const node of all) {
      if (node instanceof HTMLImageElement) {
        const width = node.naturalWidth || node.clientWidth;
        const height = node.naturalHeight || node.clientHeight;
        addUrl(node.currentSrc || node.src, width, height);
        for (const url of parseSrcset(node.srcset)) {
          addUrl(url, width, height);
        }
      } else if (node instanceof HTMLSourceElement) {
        for (const url of parseSrcset(node.srcset)) {
          addUrl(url, 0, 0);
        }
      }
    }
    for (const anchor of Array.from(document.querySelectorAll("a[href]"))) {
      const href = anchor.getAttribute("href") || "";
      if (/\.(?:jpg|jpeg|png|webp)(?:$|\?)/i.test(href)) {
        addUrl(href, 0, 0);
      }
    }
    return out;
  });
}
/**
 * Assemble the JSON-serializable extraction report.
 *
 * @param {object} params
 * @param {string} params.requestedUrl - URL the caller asked for.
 * @param {object} params.page - Page exposing `url()`, read for `finalUrl`.
 * @param {string|null} params.clickedLabel - Label of the entry point that was clicked, if any.
 * @param {string[]} params.imageUrls - Extracted photo URLs.
 * @param {string} params.source - Identifier of the listing source.
 * @param {string[]} [params.notes] - Free-form notes; defaults to an empty list.
 * @returns {object} Report object; `title` is always null here.
 */
export function buildResult({ requestedUrl, page, clickedLabel, imageUrls, source, notes = [] }) {
  const finalUrl = page.url();
  const photoCount = imageUrls.length;
  return {
    source,
    requestedUrl,
    finalUrl,
    title: null,
    clickedLabel,
    photoCount,
    imageUrls,
    notes,
  };
}

View File

@@ -0,0 +1,66 @@
import test from "node:test";
import assert from "node:assert/strict";
import { normalizeImageCandidates } from "./real-estate-photo-common.js";
// The same Zillow photo requested with two different query strings collapses
// to one canonical URL; the small off-host logo is filtered out.
test("normalizeImageCandidates keeps distinct Zillow photo URLs and strips query strings", () => {
  const firstVariant = {
    url: "https://photos.zillowstatic.com/fp/abc123-p_e.jpg?set=1",
    width: 1024,
    height: 768,
  };
  const secondVariant = {
    url: "https://photos.zillowstatic.com/fp/abc123-p_e.jpg?set=2",
    width: 1024,
    height: 768,
  };
  const siteLogo = {
    url: "https://www.zillow.com/static/logo.png",
    width: 120,
    height: 40,
  };
  const filters = {
    hostIncludes: ["photos.zillowstatic.com"],
    minWidth: 240,
    minHeight: 180,
  };

  const urls = normalizeImageCandidates([firstVariant, secondVariant, siteLogo], filters).map(
    (item) => item.url
  );

  assert.deepEqual(urls, ["https://photos.zillowstatic.com/fp/abc123-p_e.jpg"]);
});
// A large photo survives, while a 24x24 icon and a data: URL are dropped.
test("normalizeImageCandidates filters tiny HAR page assets and keeps large photos", () => {
  const largePhoto = {
    url: "https://photos.har.com/123/main.jpg?size=large",
    width: 1600,
    height: 1200,
  };
  const closeIcon = {
    url: "https://cdn.har.com/icons/close.svg",
    width: 24,
    height: 24,
  };
  const inlineImage = {
    url: "data:image/png;base64,deadbeef",
    width: 800,
    height: 600,
  };
  const filters = {
    hostExcludes: ["doubleclick", "gstatic"],
    minWidth: 240,
    minHeight: 180,
  };

  const urls = normalizeImageCandidates([largePhoto, closeIcon, inlineImage], filters).map(
    (item) => item.url
  );

  assert.deepEqual(urls, ["https://photos.har.com/123/main.jpg"]);
});

View File

@@ -0,0 +1,59 @@
/**
 * Pick the widest usable variant from a photo's `mixedSources` map.
 *
 * @param {object} mixedSources - Map of format name to an array of `{ url, width }` variants.
 * @returns {{url: string, width: number}|null} The widest variant (the first one wins ties), or null when none has a URL.
 */
function pickWidestZillowVariant(mixedSources) {
  let best = null;
  for (const variants of Object.values(mixedSources)) {
    if (!Array.isArray(variants)) continue;
    for (const variant of variants) {
      if (typeof variant?.url !== "string" || !variant.url) continue;
      const width = Number(variant.width || 0);
      if (!best || width > best.width) {
        best = { url: variant.url, width };
      }
    }
  }
  return best;
}

/**
 * Extract photo candidates from the text of a `__NEXT_DATA__` script.
 *
 * Reads `props.pageProps.componentProps.gdpClientCache`, which may be a JSON
 * string or an already-parsed object, and walks every cache entry's
 * `property.responsivePhotos`. A photo with a direct `url` yields `{ url }`;
 * otherwise the widest `mixedSources` variant yields `{ url, width }`.
 *
 * Never throws on malformed input: missing or invalid JSON, or a cache that
 * is not an object (including a parsed `null`), all yield [].
 *
 * @param {string} scriptText - Raw text content of the script element.
 * @returns {Array<{url: string, width?: number}>} Candidates in cache order.
 */
export function extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText) {
  if (typeof scriptText !== "string" || !scriptText.trim()) {
    return [];
  }
  let nextData;
  try {
    nextData = JSON.parse(scriptText);
  } catch {
    return [];
  }

  let cache = nextData?.props?.pageProps?.componentProps?.gdpClientCache;
  if (typeof cache === "string") {
    if (!cache.trim()) {
      return [];
    }
    try {
      cache = JSON.parse(cache);
    } catch {
      return [];
    }
  }
  // JSON.parse can yield null or a primitive, and Object.values(null) would throw.
  if (cache === null || typeof cache !== "object") {
    return [];
  }

  const out = [];
  for (const entry of Object.values(cache)) {
    const photos = entry?.property?.responsivePhotos;
    if (!Array.isArray(photos)) continue;
    for (const photo of photos) {
      if (typeof photo?.url === "string" && photo.url) {
        out.push({ url: photo.url });
        continue;
      }
      const mixedSources = photo?.mixedSources;
      if (!mixedSources || typeof mixedSources !== "object") continue;
      const best = pickWidestZillowVariant(mixedSources);
      if (best) {
        out.push(best);
      }
    }
  }
  return out;
}

View File

@@ -0,0 +1,69 @@
import test from "node:test";
import assert from "node:assert/strict";
import { extractZillowStructuredPhotoCandidatesFromNextDataScript } from "./zillow-photo-data.js";
// A photo's direct `url` is used even when `mixedSources` is also present.
test("extractZillowStructuredPhotoCandidatesFromNextDataScript reads responsivePhotos", () => {
  const responsivePhotos = [
    {
      url: "https://photos.zillowstatic.com/fp/photo-one-p_d.jpg",
      mixedSources: {
        jpeg: [{ url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_384.jpg", width: 384 }],
      },
    },
    {
      url: "https://photos.zillowstatic.com/fp/photo-two-p_d.jpg",
    },
  ];
  // The cache is itself JSON-encoded inside the outer script payload.
  const gdpClientCache = JSON.stringify({ SomeQuery: { property: { responsivePhotos } } });
  const scriptText = JSON.stringify({
    props: { pageProps: { componentProps: { gdpClientCache } } },
  });

  const extracted = extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText);

  assert.deepEqual(extracted, [
    { url: "https://photos.zillowstatic.com/fp/photo-one-p_d.jpg" },
    { url: "https://photos.zillowstatic.com/fp/photo-two-p_d.jpg" },
  ]);
});
// Without a direct `url`, the widest variant across all formats is chosen.
test("extractZillowStructuredPhotoCandidatesFromNextDataScript falls back to mixedSources", () => {
  const mixedSources = {
    jpeg: [
      { url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_384.jpg", width: 384 },
      { url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_1536.jpg", width: 1536 },
    ],
    webp: [{ url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_1152.webp", width: 1152 }],
  };
  // The cache is itself JSON-encoded inside the outer script payload.
  const gdpClientCache = JSON.stringify({
    SomeQuery: { property: { responsivePhotos: [{ mixedSources }] } },
  });
  const scriptText = JSON.stringify({
    props: { pageProps: { componentProps: { gdpClientCache } } },
  });

  const extracted = extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText);

  assert.deepEqual(extracted, [
    { url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_1536.jpg", width: 1536 },
  ]);
});

View File

@@ -0,0 +1,182 @@
#!/usr/bin/env node
import {
clickPhotoEntryPoint,
createPageSession,
dismissCommonOverlays,
fail,
gotoListing,
normalizeImageCandidates,
parseTarget,
scrollUntilSettled,
sleep,
waitForPhotoExperience,
} from "./real-estate-photo-common.js";
import { extractZillowStructuredPhotoCandidatesFromNextDataScript } from "./zillow-photo-data.js";
// Patterns for the control that opens Zillow's full photo view. They are handed
// to clickPhotoEntryPoint(), which tries them in array order. Every pattern is
// anchored (^...$) and case-insensitive.
// NOTE(review): the first pattern's count group is optional, so it already
// matches plain "See all photos"; the second entry is redundant but harmless.
const ZILLOW_LABELS = [
/^See all(?: \d+)? photos$/i,
/^See all photos$/i,
/^Photos$/i,
];
/**
 * Read the photo count from a "See all <N> photos" phrase in the page text.
 *
 * Uses the first case-insensitive occurrence in the body's innerText.
 *
 * @param {object} page - Page object exposing `evaluate(fn)`.
 * @returns {Promise<number|null>} The announced count, or null when the phrase is absent.
 */
async function getAnnouncedPhotoCount(page) {
  return page.evaluate(() => {
    const bodyText = document.body?.innerText || "";
    const found = /See all\s+(\d+)\s+photos/i.exec(bodyText);
    if (!found) {
      return null;
    }
    return Number(found[1]);
  });
}
/**
 * Keep one candidate per photo, choosing the highest-scoring variant.
 *
 * Candidates are grouped by the part of the file name before the first "-".
 * A candidate's score is a variant preference (10,000,000 for a `-p_d` file,
 * otherwise the number in `-cc_ft_<n>.`, or 0) plus a size score (width times
 * height, falling back to whichever single dimension is known). On a tie the
 * earlier candidate is kept.
 *
 * @param {Array<{url: string, pathname: string, width?: number, height?: number}>} candidates
 * @returns {Array<object>} One candidate per group, sorted by URL.
 */
function collapseZillowPhotos(candidates) {
  const scoreOf = (photo) => {
    const area = (photo.width || 0) * (photo.height || 0);
    const sizeScore = area || photo.width || photo.height || 0;
    const isFullSize = /-p_d\.(?:jpe?g|webp)$/i.test(photo.url);
    const preference = isFullSize
      ? 10_000_000
      : Number(photo.url.match(/-cc_ft_(\d+)\./i)?.[1] || 0);
    return preference + sizeScore;
  };

  const bestById = new Map();
  for (const candidate of candidates) {
    const fileName = candidate.pathname.split("/").pop() || "";
    const [baseId] = fileName.split("-");
    const score = scoreOf(candidate);
    const incumbent = bestById.get(baseId);
    if (!incumbent || score > scoreOf(incumbent)) {
      bestById.set(baseId, candidate);
    }
  }

  return [...bestById.values()].sort((a, b) => a.url.localeCompare(b.url));
}
/**
 * Collect image URL candidates from Zillow's rendered photo stream.
 *
 * Queries a fixed list of selectors in order. Overlapping selectors can return
 * the same node more than once, and no de-duplication happens here. For each
 * <img> the current source and every srcset entry are recorded with the
 * image's natural size (falling back to its client size); for each <source>
 * the srcset entries are recorded with size 0.
 *
 * @param {object} page - Page exposing `evaluate(fn)`.
 * @returns {Promise<Array<{url: string, width: number, height: number}>>}
 */
async function collectZillowPhotoCandidates(page) {
  // Everything inside this callback runs in the page, so it must be self-contained.
  return page.evaluate(() => {
    const found = [];
    const record = (url, width, height) => {
      if (!url) return;
      found.push({ url, width: Number(width || 0), height: Number(height || 0) });
    };
    const srcsetUrls = (srcset) => {
      const urls = [];
      for (const entry of (srcset || "").split(",")) {
        const [firstToken] = entry.trim().split(/\s+/);
        if (firstToken) urls.push(firstToken);
      }
      return urls;
    };

    const selectors = [
      ".media-stream-tile img",
      ".media-stream-tile source",
      '[class*="media-stream"] img',
      '[class*="media-stream"] source',
      'img[alt*="image of "]',
      'img[alt*="image of this home"]',
    ];
    const nodes = [];
    for (const selector of selectors) {
      nodes.push(...document.querySelectorAll(selector));
    }

    for (const node of nodes) {
      if (node instanceof HTMLImageElement) {
        const width = node.naturalWidth || node.clientWidth;
        const height = node.naturalHeight || node.clientHeight;
        record(node.currentSrc || node.src, width, height);
        for (const url of srcsetUrls(node.srcset)) {
          record(url, width, height);
        }
      } else if (node instanceof HTMLSourceElement) {
        for (const url of srcsetUrls(node.srcset)) {
          record(url, 0, 0);
        }
      }
    }
    return found;
  });
}
/**
 * Extract photo candidates from the page's `#__NEXT_DATA__` script element.
 *
 * If reading the script text fails, it is treated as empty, which yields no
 * candidates.
 *
 * @param {object} page - Page exposing `locator(selector)`.
 * @returns {Promise<Array<{url: string, width?: number}>>}
 */
async function collectZillowStructuredPhotoCandidates(page) {
  const nextDataScript = page.locator("#__NEXT_DATA__");
  let scriptText;
  try {
    scriptText = await nextDataScript.textContent();
  } catch {
    scriptText = null;
  }
  return extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText || "");
}
/**
 * Extract photo URLs from a Zillow listing and print a JSON report on stdout.
 *
 * Flow: validate the URL given as the first CLI argument, open it in a browser
 * session (headless unless HEADLESS=false), dismiss overlays, and try to open
 * the all-photos view. A failure on that click path is recorded, not fatal,
 * because candidates are also gathered from the page's structured
 * `__NEXT_DATA__` payload and from whatever photo stream is rendered. The
 * combined candidates are filtered to the Zillow photo host and collapsed to
 * one URL per photo.
 *
 * Any failure inside the outer try block closes the browser context and is
 * then reported through fail(), which writes a JSON error to stderr and exits 1.
 */
async function main() {
  const requestedUrl = parseTarget(process.argv[2]);
  const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
  try {
    await gotoListing(page, requestedUrl);
    await dismissCommonOverlays(page);
    // Read the announced count before navigating into the photo view.
    const expectedPhotoCount = await getAnnouncedPhotoCount(page);
    const beforeUrl = page.url();
    let clickedLabel = null;
    let clickError = null;
    try {
      clickedLabel = await clickPhotoEntryPoint(page, ZILLOW_LABELS);
      await waitForPhotoExperience(page, beforeUrl);
      await scrollUntilSettled(page);
      await sleep(1200);
    } catch (error) {
      // Extraction may still succeed from the listing page itself.
      clickError = error instanceof Error ? error.message : String(error);
    }
    const [structuredCandidates, renderedCandidates] = await Promise.all([
      collectZillowStructuredPhotoCandidates(page),
      collectZillowPhotoCandidates(page),
    ]);
    const candidates = [...structuredCandidates, ...renderedCandidates];
    const normalized = normalizeImageCandidates(candidates, {
      hostIncludes: ["photos.zillowstatic.com"],
      minWidth: 240,
      minHeight: 180,
    });
    const photos = collapseZillowPhotos(normalized);
    if (!photos.length) {
      // Throw rather than calling fail() here: fail() exits the process
      // immediately, which would skip closing the browser context. The catch
      // below closes the context and reports this same message as the details.
      throw new Error(clickError || "No Zillow image URLs were found on the rendered listing page.");
    }
    // Without an announced count there is nothing to compare against.
    const complete = expectedPhotoCount ? photos.length >= expectedPhotoCount : true;
    const notes = [];
    if (clickedLabel) {
      notes.push("Opened Zillow all-photos flow and extracted direct Zillow image URLs.");
    } else {
      notes.push("The rendered Zillow listing shell already exposed the Zillow photo stream, so extraction completed without relying on the all-photos click path.");
    }
    if (clickError) {
      notes.push(`All-photos click path was not required: ${clickError}`);
    }
    const result = {
      source: "zillow",
      requestedUrl,
      finalUrl: page.url(),
      title: await page.title(),
      clickedLabel,
      expectedPhotoCount,
      complete,
      photoCount: photos.length,
      imageUrls: photos.map((photo) => photo.url),
      notes,
    };
    process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
    await context.close();
  } catch (error) {
    try {
      await context.close();
    } catch {
      // Ignore close errors after the primary failure.
    }
    fail("Zillow photo extraction failed.", error instanceof Error ? error.message : String(error));
  }
}
// createPageSession() runs before main()'s try block, so a browser launch
// failure rejects this promise. Report it in the same JSON error format as
// every other failure instead of leaving an unhandled rejection.
main().catch((error) => {
  fail("Zillow photo extraction failed.", error instanceof Error ? error.message : String(error));
});