Add Zillow and HAR photo extractors
This commit is contained in:
40
skills/web-automation/scripts/check-install.js
Normal file
40
skills/web-automation/scripts/check-install.js
Normal file
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
/**
 * Report a fatal error as a single line of JSON on stderr, then exit with status 1.
 *
 * @param {string} message - Short description, stored under the `error` key.
 * @param {*} [details] - Extra context; stored under `details` only when truthy.
 */
function fail(message, details) {
  const report = details ? { error: message, details } : { error: message };
  process.stderr.write(JSON.stringify(report) + "\n");
  process.exit(1);
}
|
||||
|
||||
/**
 * Check that the skill's runtime dependencies resolve and that browse.ts is
 * wired to CloakBrowser, printing one "OK" line per successful check.
 *
 * Any failure is reported through fail(), which exits the process with status 1.
 */
async function main() {
  // Import the packages one after the other so a missing one surfaces its own error text.
  const requiredPackages = ["cloakbrowser", "playwright-core"];
  try {
    for (const packageName of requiredPackages) {
      await import(packageName);
    }
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    fail("Missing dependency/config: web-automation requires cloakbrowser and playwright-core.", reason);
  }

  // browse.ts sits next to this script; it must both import cloakbrowser and
  // call launchPersistentContext to count as configured.
  const browseSource = fs.readFileSync(path.join(__dirname, "browse.ts"), "utf8");
  const usesPersistentContext = /launchPersistentContext/.test(browseSource);
  const importsCloakBrowser = /from ['"]cloakbrowser['"]/.test(browseSource);
  if (!(usesPersistentContext && importsCloakBrowser)) {
    fail("browse.ts is not configured for CloakBrowser.");
  }

  const successLines = [
    "OK: cloakbrowser + playwright-core installed\n",
    "OK: CloakBrowser integration detected in browse.ts\n",
  ];
  for (const line of successLines) {
    process.stdout.write(line);
  }
}

// Anything main() did not report itself (e.g. browse.ts being unreadable) lands here.
main().catch((error) => {
  const reason = error instanceof Error ? error.message : String(error);
  fail("Install check failed.", reason);
});
|
||||
83
skills/web-automation/scripts/har-photos.js
Normal file
83
skills/web-automation/scripts/har-photos.js
Normal file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import {
|
||||
clickPhotoEntryPoint,
|
||||
collectRenderedImageCandidates,
|
||||
createPageSession,
|
||||
dismissCommonOverlays,
|
||||
fail,
|
||||
gotoListing,
|
||||
normalizeImageCandidates,
|
||||
parseTarget,
|
||||
scrollUntilSettled,
|
||||
sleep,
|
||||
waitForPhotoExperience,
|
||||
} from "./real-estate-photo-common.js";
|
||||
|
||||
// Accessible-name / text patterns for the control that opens HAR's full photo view.
// main() passes this list to clickPhotoEntryPoint, which tries the patterns in
// array order. Every pattern is anchored (^...$) and case-insensitive.
const HAR_LABELS = [
|
||||
/^Show all photos$/i,
|
||||
/^View all photos$/i,
|
||||
/^All photos$/i,
|
||||
/^Photos$/i,
|
||||
];
|
||||
|
||||
/**
 * Read the photo count announced in the page's visible text.
 *
 * Runs in the page and looks for the first "<digits> photos" occurrence
 * (case-insensitive) in `document.body.innerText`.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn)`.
 * @returns {Promise<number|null>} The announced count, or null when no match is found.
 */
async function getAnnouncedPhotoCount(page) {
  return page.evaluate(() => {
    const bodyText = document.body?.innerText || "";
    const found = /(\d+)\s+photos/i.exec(bodyText);
    if (!found) return null;
    return Number(found[1]);
  });
}
|
||||
|
||||
/**
 * Open a HAR listing, enter its all-photos view, and print the extracted
 * photo URLs as pretty-printed JSON on stdout.
 *
 * The listing URL is read from process.argv[2]. Setting HEADLESS=false in the
 * environment launches a visible browser. Every failure is reported through
 * fail(), i.e. as one JSON line on stderr with exit status 1.
 */
async function main() {
  const requestedUrl = parseTarget(process.argv[2]);
  const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });

  try {
    await gotoListing(page, requestedUrl);
    await dismissCommonOverlays(page);

    // Read the announced count before navigating into the gallery view.
    const expectedPhotoCount = await getAnnouncedPhotoCount(page);
    const beforeUrl = page.url();
    const clickedLabel = await clickPhotoEntryPoint(page, HAR_LABELS);
    await waitForPhotoExperience(page, beforeUrl);
    await scrollUntilSettled(page);
    await sleep(1200);

    const candidates = await collectRenderedImageCandidates(page);
    const photos = normalizeImageCandidates(candidates, {
      hostIncludes: ["pics.harstatic.com", "photos.harstatic.com"],
      minWidth: 240,
      minHeight: 180,
    });

    if (!photos.length) {
      // Throw instead of calling fail() here: fail() exits immediately, which
      // would skip closing the browser context. The catch below closes it and
      // then reports the same message.
      throw new Error("No large image URLs were found after opening the HAR all-photos view.");
    }

    const result = {
      source: "har",
      requestedUrl,
      finalUrl: page.url(),
      title: await page.title(),
      clickedLabel,
      expectedPhotoCount,
      // Without an announced count there is nothing to compare against.
      complete: expectedPhotoCount ? photos.length >= expectedPhotoCount : true,
      photoCount: photos.length,
      imageUrls: photos.map((photo) => photo.url),
      notes: ["Opened HAR all-photos flow and extracted large rendered image URLs from the photo page."],
    };

    process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
    await context.close();
  } catch (error) {
    try {
      await context.close();
    } catch {
      // Ignore close errors after the primary failure.
    }
    fail("HAR photo extraction failed.", error instanceof Error ? error.message : String(error));
  }
}

// parseTarget/createPageSession run before the try block above; route any
// rejection from them through fail() so stderr keeps the JSON error format.
main().catch((error) => {
  fail("HAR photo extraction failed.", error instanceof Error ? error.message : String(error));
});
|
||||
@@ -4,15 +4,19 @@
|
||||
"description": "Web browsing and scraping scripts using CloakBrowser",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"check-install": "node check-install.js",
|
||||
"extract": "node extract.js",
|
||||
"har-photos": "node har-photos.js",
|
||||
"browse": "tsx browse.ts",
|
||||
"scrape": "tsx scrape.ts",
|
||||
"test:photos": "node --test real-estate-photo-common.test.mjs zillow-photo-data.test.mjs",
|
||||
"zillow-photos": "node zillow-photos.js",
|
||||
"fetch-browser": "npx cloakbrowser install"
|
||||
},
|
||||
"dependencies": {
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"better-sqlite3": "^12.6.2",
|
||||
"cloakbrowser": "^0.3.14",
|
||||
"cloakbrowser": "^0.3.18",
|
||||
"jsdom": "^24.0.0",
|
||||
"minimist": "^1.2.8",
|
||||
"playwright-core": "^1.58.2",
|
||||
|
||||
10
skills/web-automation/scripts/pnpm-lock.yaml
generated
10
skills/web-automation/scripts/pnpm-lock.yaml
generated
@@ -15,8 +15,8 @@ importers:
|
||||
specifier: ^12.6.2
|
||||
version: 12.6.2
|
||||
cloakbrowser:
|
||||
specifier: ^0.3.14
|
||||
version: 0.3.14(mmdb-lib@3.0.1)(playwright-core@1.58.2)
|
||||
specifier: ^0.3.18
|
||||
version: 0.3.18(mmdb-lib@3.0.1)(playwright-core@1.58.2)
|
||||
jsdom:
|
||||
specifier: ^24.0.0
|
||||
version: 24.1.3
|
||||
@@ -301,8 +301,8 @@ packages:
|
||||
resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
cloakbrowser@0.3.14:
|
||||
resolution: {integrity: sha512-8mcEVxfiNbAMHNa0B2IZKPtMDQ2peZlrScfQDJW+C9tjKG/P5Bg9wCweI0hnbaWR2ulG1MrxiEvTMMjz/SgmLw==}
|
||||
cloakbrowser@0.3.18:
|
||||
resolution: {integrity: sha512-As8boW2iodPh35XbHk2LSK7fIIgfcPmBpMpBiDv9yMmp6cuHLa7Td2scg8ZgzWJKhhak1uvL5RXQhE+lYxFFqQ==}
|
||||
engines: {node: '>=18.0.0'}
|
||||
hasBin: true
|
||||
peerDependencies:
|
||||
@@ -873,7 +873,7 @@ snapshots:
|
||||
|
||||
chownr@3.0.0: {}
|
||||
|
||||
cloakbrowser@0.3.14(mmdb-lib@3.0.1)(playwright-core@1.58.2):
|
||||
cloakbrowser@0.3.18(mmdb-lib@3.0.1)(playwright-core@1.58.2):
|
||||
dependencies:
|
||||
tar: 7.5.11
|
||||
optionalDependencies:
|
||||
|
||||
292
skills/web-automation/scripts/real-estate-photo-common.js
Normal file
292
skills/web-automation/scripts/real-estate-photo-common.js
Normal file
@@ -0,0 +1,292 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
// Shared timing and size defaults for the helpers in this file (times in milliseconds).
// Default settle pause used by gotoListing and waitForPhotoExperience.
const DEFAULT_WAIT_MS = 4000;
|
||||
// Navigation timeout: page.goto, the page's default navigation timeout, and waitForURL.
const NAV_TIMEOUT_MS = 45000;
|
||||
// Default action timeout for the page and for the photo entry-point click.
const CLICK_TIMEOUT_MS = 15000;
|
||||
// Default upper bound on scroll-to-bottom passes in scrollUntilSettled.
const MAX_SCROLL_PASSES = 12;
|
||||
// Pause after each scroll pass in scrollUntilSettled.
const SCROLL_PAUSE_MS = 900;
|
||||
// Default minimum dimensions applied by normalizeImageCandidates.
const LARGE_IMAGE_MIN_WIDTH = 300;
|
||||
const LARGE_IMAGE_MIN_HEIGHT = 200;
|
||||
|
||||
/**
 * Emit a JSON error record on stderr and terminate the process with status 1.
 *
 * @param {string} message - Value of the `error` field.
 * @param {*} [details] - Value of the `details` field; the field is omitted when falsy.
 */
export function fail(message, details) {
  const record = { error: message, ...(details ? { details } : {}) };
  const line = `${JSON.stringify(record)}\n`;
  process.stderr.write(line);
  process.exit(1);
}
|
||||
|
||||
/**
 * Validate the listing URL given on the command line.
 *
 * Calls fail() (which exits the process) when the value is missing, is not a
 * parseable absolute URL, or uses a protocol other than http/https.
 *
 * @param {string} rawUrl - URL text as supplied by the caller.
 * @returns {string} The URL as serialized by the URL parser.
 */
export function parseTarget(rawUrl) {
  if (!rawUrl) fail("Missing URL.");

  let target;
  try {
    target = new URL(rawUrl);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    fail("Invalid URL.", reason);
  }

  const isWebProtocol = target.protocol === "http:" || target.protocol === "https:";
  if (!isWebProtocol) fail("Only http and https URLs are allowed.");

  return target.toString();
}
|
||||
|
||||
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
export function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Dynamically import the `cloakbrowser` package.
 *
 * When the import fails, the failure is reported through fail() together with
 * installation instructions.
 *
 * @returns {Promise<object>} The imported module namespace.
 */
export async function loadCloakBrowser() {
  let cloakModule;
  try {
    cloakModule = await import("cloakbrowser");
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    fail(
      "CloakBrowser is not installed for this skill. Run pnpm install in skills/web-automation/scripts first.",
      reason
    );
  }
  return cloakModule;
}
|
||||
|
||||
/**
 * Run `fn` while console output that would normally reach stdout is diverted
 * to stderr, so stdout stays reserved for the script's JSON result.
 *
 * console.log, console.info and console.debug (all stdout-bound in Node) plus
 * console.error are routed through a Console instance bound to process.stderr.
 * That keeps Node's normal formatting (objects are inspected and `%s`-style
 * format specifiers are honoured) rather than joining arguments with spaces.
 * The original methods are restored when `fn` settles, whether it resolves or
 * rejects.
 *
 * @param {Function} fn - Callback to run; may be async.
 * @returns {Promise<*>} Whatever `fn` returns or resolves to.
 */
export async function runWithStderrLogs(fn) {
  const stderrConsole = new console.Console({ stdout: process.stderr, stderr: process.stderr });
  const redirectedMethods = ["log", "info", "debug", "error"];
  const originals = redirectedMethods.map((name) => [name, console[name]]);

  for (const name of redirectedMethods) {
    console[name] = (...args) => stderrConsole[name](...args);
  }

  try {
    return await fn();
  } finally {
    for (const [name, original] of originals) {
      console[name] = original;
    }
  }
}
|
||||
|
||||
/**
 * Launch a CloakBrowser context and open one page with the default timeouts applied.
 *
 * Both the binary check and the launch run inside runWithStderrLogs.
 *
 * @param {object} [options]
 * @param {boolean} [options.headless=true] - Forwarded to launchContext.
 * @returns {Promise<{context: object, page: object}>} The launched context and its new page.
 */
export async function createPageSession({ headless = true } = {}) {
  const { ensureBinary: prepareBinary, launchContext: openContext } = await loadCloakBrowser();

  const launchOptions = {
    headless,
    humanize: true,
    locale: "en-US",
    viewport: { width: 1440, height: 900 },
  };

  await runWithStderrLogs(() => prepareBinary());
  const context = await runWithStderrLogs(() => openContext(launchOptions));

  const page = await context.newPage();
  page.setDefaultTimeout(CLICK_TIMEOUT_MS);
  page.setDefaultNavigationTimeout(NAV_TIMEOUT_MS);

  return { context, page };
}
|
||||
|
||||
/**
 * Navigate to a listing and give the page time to settle.
 *
 * @param {object} page - Page to navigate.
 * @param {string} url - Listing URL.
 * @param {number} [waitMs=DEFAULT_WAIT_MS] - Fixed pause after loading, in milliseconds.
 * @returns {Promise<void>}
 */
export async function gotoListing(page, url, waitMs = DEFAULT_WAIT_MS) {
  const navigationOptions = { waitUntil: "domcontentloaded", timeout: NAV_TIMEOUT_MS };
  await page.goto(url, navigationOptions);

  // A page that never reaches network idle is tolerated: the rejection is discarded.
  const networkIdle = page.waitForLoadState("networkidle", { timeout: 15000 }).catch(() => {});
  await networkIdle;

  await sleep(waitMs);
}
|
||||
|
||||
/**
 * Best-effort dismissal of consent banners and similar overlays.
 *
 * For each label pattern, the first matching button and the first matching
 * link are clicked if present. Click failures are ignored on purpose: an
 * overlay that cannot be dismissed must not abort the extraction.
 *
 * @param {object} page - Page exposing `getByRole(role, { name })`.
 * @returns {Promise<void>}
 */
export async function dismissCommonOverlays(page) {
  // Whole-word patterns. A bare /close/i also matches "Foreclosures" and
  // /agree/i matches "Disagree", so substring matching could click an
  // unrelated control and navigate away from the listing.
  const dismissLabels = [
    /\baccept\b/i,
    /\bagree\b/i,
    /\bclose\b/i,
    /\bgot it\b/i,
    /\bcontinue\b/i,
    /\bdismiss\b/i,
    /\bnot now\b/i,
  ];

  for (const label of dismissLabels) {
    const targets = [
      page.getByRole("button", { name: label }).first(),
      page.getByRole("link", { name: label }).first(),
    ];

    for (const target of targets) {
      try {
        if (await target.count()) {
          await target.click({ timeout: 2500 });
          // Short pause after each successful click before trying the next target.
          await sleep(300);
        }
      } catch {
        // Best-effort overlay dismissal only.
      }
    }
  }
}
|
||||
|
||||
/**
 * Click the first control that opens the listing's photo gallery.
 *
 * Labels are tried in the given order; for each one a button, then a link,
 * then any text match is attempted. A candidate whose click fails is skipped.
 *
 * @param {object} page - Page exposing `getByRole` and `getByText`.
 * @param {Array} labels - Label patterns to try, in priority order.
 * @returns {Promise<string>} String form of the label whose target was clicked.
 * @throws {Error} When no candidate could be clicked.
 */
export async function clickPhotoEntryPoint(page, labels) {
  for (const pattern of labels) {
    const locators = [
      page.getByRole("button", { name: pattern }),
      page.getByRole("link", { name: pattern }),
      page.getByText(pattern),
    ].map((locator) => locator.first());

    for (const locator of locators) {
      try {
        const matches = await locator.count();
        if (!matches) continue;

        // Scrolling is a courtesy; its failure must not block the click attempt.
        await locator.scrollIntoViewIfNeeded().catch(() => {});
        await locator.click({ timeout: CLICK_TIMEOUT_MS });
        return pattern.toString();
      } catch {
        // Keep trying the next candidate.
      }
    }
  }

  throw new Error("Could not find a photo entry point.");
}
|
||||
|
||||
/**
 * Wait for the photo view to appear after its entry point was clicked.
 *
 * Proceeds as soon as the first of these settles: the URL differs from
 * `previousUrl`, the network goes idle, or `waitMs` elapses. Then pauses for a
 * further `waitMs`. Failures of the two page waits are ignored.
 *
 * @param {object} page - Page exposing `waitForURL` and `waitForLoadState`.
 * @param {string} previousUrl - URL recorded before the click.
 * @param {number} [waitMs=DEFAULT_WAIT_MS] - Pause length in milliseconds.
 * @returns {Promise<void>}
 */
export async function waitForPhotoExperience(page, previousUrl, waitMs = DEFAULT_WAIT_MS) {
  const ignore = () => {};

  const urlChanged = page
    .waitForURL((url) => url.toString() !== previousUrl, { timeout: NAV_TIMEOUT_MS })
    .catch(ignore);
  const networkQuiet = page.waitForLoadState("networkidle", { timeout: 15000 }).catch(ignore);
  const minimumPause = sleep(waitMs);

  await Promise.race([urlChanged, networkQuiet, minimumPause]);
  await sleep(waitMs);
}
|
||||
|
||||
/**
 * Scroll the page's root scroller to the bottom repeatedly, then return to the top.
 *
 * Each pass measures the scroll height, jumps to the bottom and pauses. The
 * loop stops once a pass measures the same height as the previous pass, or
 * after `passes` passes.
 *
 * @param {object} page - Page exposing `evaluate(fn)`.
 * @param {number} [passes=MAX_SCROLL_PASSES] - Maximum number of scroll passes.
 * @returns {Promise<void>}
 */
export async function scrollUntilSettled(page, passes = MAX_SCROLL_PASSES) {
  // These callbacks run inside the page, so each one must be self-contained.
  const measureHeight = () =>
    page.evaluate(() => {
      const root = document.scrollingElement || document.documentElement || document.body;
      return root ? root.scrollHeight : 0;
    });
  const jumpToBottom = () =>
    page.evaluate(() => {
      const root = document.scrollingElement || document.documentElement || document.body;
      if (root) root.scrollTo({ top: root.scrollHeight, behavior: "instant" });
    });
  const jumpToTop = () =>
    page.evaluate(() => {
      const root = document.scrollingElement || document.documentElement || document.body;
      if (root) root.scrollTo({ top: 0, behavior: "instant" });
    });

  let lastSeenHeight = 0;
  let pass = 0;
  while (pass < passes) {
    const heightBeforeScroll = await measureHeight();
    await jumpToBottom();
    await sleep(SCROLL_PAUSE_MS);

    // No growth since the previous pass: treat the page as settled.
    if (heightBeforeScroll === lastSeenHeight) break;

    lastSeenHeight = heightBeforeScroll;
    pass += 1;
  }

  await jumpToTop();
  await sleep(250);
}
|
||||
|
||||
/**
 * Filter, canonicalize, and de-duplicate raw image candidates.
 *
 * A candidate is dropped when its URL is empty, unparseable, a `data:` URL, or
 * not http/https (for example `blob:` URLs, which only resolve inside the page
 * that created them); when its host or path fails the include/exclude filters;
 * or when a known dimension is below the minimum. A dimension of 0 means
 * "unknown" and is never used to reject a candidate. Query strings and
 * fragments are stripped before de-duplication, and the first occurrence of
 * each canonical URL wins.
 *
 * @param {Array<object>|null|undefined} candidates - Objects with `url` and optional
 *   `width`/`height` (or `naturalWidth`/`naturalHeight`).
 * @param {object} [options]
 * @param {string[]} [options.hostIncludes=[]] - When non-empty, the host must contain one of these substrings.
 * @param {string[]} [options.hostExcludes=[]] - The host must contain none of these substrings.
 * @param {string[]} [options.pathnameIncludes=[]] - When non-empty, the path must contain one of these substrings.
 * @param {number} [options.minWidth=LARGE_IMAGE_MIN_WIDTH] - Minimum known width.
 * @param {number} [options.minHeight=LARGE_IMAGE_MIN_HEIGHT] - Minimum known height.
 * @returns {Array<{url: string, width: number, height: number, host: string, pathname: string}>}
 */
export function normalizeImageCandidates(candidates, options = {}) {
  const {
    hostIncludes = [],
    hostExcludes = [],
    pathnameIncludes = [],
    minWidth = LARGE_IMAGE_MIN_WIDTH,
    minHeight = LARGE_IMAGE_MIN_HEIGHT,
  } = options;

  // Coerce a reported dimension to a finite positive number; anything else is "unknown" (0).
  const toDimension = (primary, fallback) => {
    const value = Number(primary || fallback || 0);
    return Number.isFinite(value) && value > 0 ? value : 0;
  };

  const seen = new Set();
  const normalized = [];

  for (const candidate of candidates || []) {
    const rawUrl = typeof candidate?.url === "string" ? candidate.url.trim() : "";
    if (!rawUrl || rawUrl.startsWith("data:")) continue;

    let parsed;
    try {
      parsed = new URL(rawUrl);
    } catch {
      continue;
    }

    // Only fetchable web URLs are useful to the caller.
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") continue;

    const host = parsed.hostname.toLowerCase();
    const pathname = parsed.pathname.toLowerCase();
    const width = toDimension(candidate.width, candidate.naturalWidth);
    const height = toDimension(candidate.height, candidate.naturalHeight);

    if (hostIncludes.length && !hostIncludes.some((part) => host.includes(part))) continue;
    if (hostExcludes.some((part) => host.includes(part))) continue;
    if (pathnameIncludes.length && !pathnameIncludes.some((part) => pathname.includes(part))) continue;
    if (width && width < minWidth) continue;
    if (height && height < minHeight) continue;

    // Variants that differ only by query string or fragment count as the same photo.
    parsed.hash = "";
    parsed.search = "";
    const canonical = parsed.toString();
    if (seen.has(canonical)) continue;
    seen.add(canonical);

    normalized.push({ url: canonical, width, height, host, pathname });
  }

  return normalized;
}
|
||||
|
||||
/**
 * Collect image URL candidates from the rendered DOM.
 *
 * Sources, in order: every `<img>` (current source plus each `srcset` entry,
 * tagged with the image's natural or client size), every `<source>` (each
 * `srcset` entry, size unknown), and every `<a href>` whose href ends in
 * .jpg/.jpeg/.png/.webp, optionally followed by a query string.
 *
 * `srcset` and `href` attribute values can be relative or protocol-relative,
 * so every URL is resolved against `document.baseURI`. Left raw, such values
 * fail absolute-URL parsing downstream and would be discarded.
 *
 * @param {object} page - Page exposing `evaluate(fn)`.
 * @returns {Promise<Array<{url: string, width: number, height: number}>>}
 */
export async function collectRenderedImageCandidates(page) {
  // The callback runs inside the page and must not reference anything outside itself.
  return page.evaluate(() => {
    const out = [];

    const toAbsolute = (url) => {
      try {
        return new URL(url, document.baseURI).href;
      } catch {
        // Keep the raw value; later normalization decides what to do with it.
        return url;
      }
    };

    const addUrl = (url, width, height) => {
      if (!url) return;
      out.push({ url: toAbsolute(url), width: Number(width || 0), height: Number(height || 0) });
    };

    // Keep only the URL part of each srcset entry, dropping its descriptor.
    const parseSrcset = (srcset) => {
      if (!srcset) return [];
      return srcset
        .split(",")
        .map((entry) => entry.trim().split(/\s+/)[0])
        .filter(Boolean);
    };

    const all = Array.from(document.querySelectorAll("img, source"));
    for (const node of all) {
      if (node instanceof HTMLImageElement) {
        const width = node.naturalWidth || node.clientWidth;
        const height = node.naturalHeight || node.clientHeight;
        addUrl(node.currentSrc || node.src, width, height);
        for (const url of parseSrcset(node.srcset)) {
          addUrl(url, width, height);
        }
      } else if (node instanceof HTMLSourceElement) {
        for (const url of parseSrcset(node.srcset)) {
          addUrl(url, 0, 0);
        }
      }
    }

    for (const anchor of Array.from(document.querySelectorAll("a[href]"))) {
      const href = anchor.getAttribute("href") || "";
      if (/\.(?:jpg|jpeg|png|webp)(?:$|\?)/i.test(href)) {
        addUrl(href, 0, 0);
      }
    }

    return out;
  });
}
|
||||
|
||||
/**
 * Assemble the JSON-serializable result record for a photo extraction run.
 *
 * @param {object} params
 * @param {string} params.requestedUrl - URL the caller asked for.
 * @param {object} params.page - Page exposing a synchronous `url()`.
 * @param {string|null} params.clickedLabel - Label of the entry point that was clicked.
 * @param {string[]} params.imageUrls - Extracted image URLs.
 * @param {string} params.source - Name of the listing source.
 * @param {string[]} [params.notes=[]] - Free-form notes.
 * @param {string|null} [params.title=null] - Page title. Obtaining it from a page is
 *   asynchronous, so the caller passes the awaited value; stays null when omitted.
 * @returns {object} Result record with a `photoCount` derived from `imageUrls`.
 */
export function buildResult({
  requestedUrl,
  page,
  clickedLabel,
  imageUrls,
  source,
  notes = [],
  title = null,
}) {
  return {
    source,
    requestedUrl,
    finalUrl: page.url(),
    title,
    clickedLabel,
    photoCount: imageUrls.length,
    imageUrls,
    notes,
  };
}
|
||||
|
||||
@@ -0,0 +1,66 @@
|
||||
import test from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
|
||||
import { normalizeImageCandidates } from "./real-estate-photo-common.js";
|
||||
|
||||
// Two query-string variants of one Zillow photo collapse to a single canonical
// URL, and the small logo on a non-photo host is filtered out.
test("normalizeImageCandidates keeps distinct Zillow photo URLs and strips query strings", () => {
  const photoVariantOne = {
    url: "https://photos.zillowstatic.com/fp/abc123-p_e.jpg?set=1",
    width: 1024,
    height: 768,
  };
  const photoVariantTwo = {
    url: "https://photos.zillowstatic.com/fp/abc123-p_e.jpg?set=2",
    width: 1024,
    height: 768,
  };
  const siteLogo = {
    url: "https://www.zillow.com/static/logo.png",
    width: 120,
    height: 40,
  };
  const filters = {
    hostIncludes: ["photos.zillowstatic.com"],
    minWidth: 240,
    minHeight: 180,
  };

  const result = normalizeImageCandidates([photoVariantOne, photoVariantTwo, siteLogo], filters);
  const urls = result.map((item) => item.url);

  assert.deepEqual(urls, ["https://photos.zillowstatic.com/fp/abc123-p_e.jpg"]);
});
|
||||
|
||||
// A large photo survives with its query string stripped, while a 24x24 icon
// and a data: URL are both rejected.
test("normalizeImageCandidates filters tiny HAR page assets and keeps large photos", () => {
  const largePhoto = {
    url: "https://photos.har.com/123/main.jpg?size=large",
    width: 1600,
    height: 1200,
  };
  const closeIcon = {
    url: "https://cdn.har.com/icons/close.svg",
    width: 24,
    height: 24,
  };
  const inlineImage = {
    url: "data:image/png;base64,deadbeef",
    width: 800,
    height: 600,
  };
  const filters = {
    hostExcludes: ["doubleclick", "gstatic"],
    minWidth: 240,
    minHeight: 180,
  };

  const result = normalizeImageCandidates([largePhoto, closeIcon, inlineImage], filters);
  const urls = result.map((item) => item.url);

  assert.deepEqual(urls, ["https://photos.har.com/123/main.jpg"]);
});
|
||||
59
skills/web-automation/scripts/zillow-photo-data.js
Normal file
59
skills/web-automation/scripts/zillow-photo-data.js
Normal file
@@ -0,0 +1,59 @@
|
||||
/**
 * Pick the widest variant with a usable URL across all formats of a
 * `mixedSources` object. Earlier variants win ties.
 *
 * @param {object} mixedSources - Map of format name to an array of `{ url, width }` variants.
 * @returns {{url: string, width: number}|null} The widest variant, or null when none is usable.
 */
function pickWidestZillowVariant(mixedSources) {
  let best = null;
  for (const variants of Object.values(mixedSources)) {
    if (!Array.isArray(variants)) continue;
    for (const variant of variants) {
      if (typeof variant?.url !== "string" || !variant.url) continue;
      const width = Number(variant.width || 0);
      if (!best || width > best.width) {
        best = { url: variant.url, width };
      }
    }
  }
  return best;
}

/**
 * Extract photo candidates from the text of Zillow's `__NEXT_DATA__` script.
 *
 * Reads `props.pageProps.componentProps.gdpClientCache`, which may be a JSON
 * string (parsed here) or an already-parsed object, and walks every cache
 * entry's `property.responsivePhotos`. A photo's own `url` is used when
 * present; otherwise the widest `mixedSources` variant is used.
 *
 * Never throws on malformed input: anything that cannot be parsed, or a cache
 * that is not an object (for example JSON `null`), yields an empty array.
 *
 * @param {string} scriptText - Raw text content of the script element.
 * @returns {Array<{url: string, width?: number}>} Candidates in document order.
 */
export function extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText) {
  if (typeof scriptText !== "string" || !scriptText.trim()) {
    return [];
  }

  let nextData;
  try {
    nextData = JSON.parse(scriptText);
  } catch {
    return [];
  }

  let cache = nextData?.props?.pageProps?.componentProps?.gdpClientCache;
  if (typeof cache === "string") {
    if (!cache.trim()) return [];
    try {
      cache = JSON.parse(cache);
    } catch {
      return [];
    }
  }

  // JSON.parse can legally produce null or a primitive, and Object.values(null) throws.
  if (!cache || typeof cache !== "object") {
    return [];
  }

  const out = [];

  for (const entry of Object.values(cache)) {
    const photos = entry?.property?.responsivePhotos;
    if (!Array.isArray(photos)) continue;

    for (const photo of photos) {
      if (typeof photo?.url === "string" && photo.url) {
        out.push({ url: photo.url });
        continue;
      }

      const mixedSources = photo?.mixedSources;
      if (!mixedSources || typeof mixedSources !== "object") continue;

      const best = pickWidestZillowVariant(mixedSources);
      if (best) {
        out.push(best);
      }
    }
  }

  return out;
}
|
||||
69
skills/web-automation/scripts/zillow-photo-data.test.mjs
Normal file
69
skills/web-automation/scripts/zillow-photo-data.test.mjs
Normal file
@@ -0,0 +1,69 @@
|
||||
import test from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
|
||||
import { extractZillowStructuredPhotoCandidatesFromNextDataScript } from "./zillow-photo-data.js";
|
||||
|
||||
// A photo's own `url` takes precedence over its mixedSources variants.
test("extractZillowStructuredPhotoCandidatesFromNextDataScript reads responsivePhotos", () => {
  const responsivePhotos = [
    {
      url: "https://photos.zillowstatic.com/fp/photo-one-p_d.jpg",
      mixedSources: {
        jpeg: [{ url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_384.jpg", width: 384 }],
      },
    },
    {
      url: "https://photos.zillowstatic.com/fp/photo-two-p_d.jpg",
    },
  ];
  // The cache is itself a JSON string embedded in the outer JSON document.
  const gdpClientCache = JSON.stringify({
    SomeQuery: {
      property: { responsivePhotos },
    },
  });
  const scriptText = JSON.stringify({
    props: {
      pageProps: {
        componentProps: { gdpClientCache },
      },
    },
  });

  const extracted = extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText);

  assert.deepEqual(extracted, [
    { url: "https://photos.zillowstatic.com/fp/photo-one-p_d.jpg" },
    { url: "https://photos.zillowstatic.com/fp/photo-two-p_d.jpg" },
  ]);
});
|
||||
|
||||
// Without a direct `url`, the widest variant across all formats is chosen.
test("extractZillowStructuredPhotoCandidatesFromNextDataScript falls back to mixedSources", () => {
  const mixedSources = {
    jpeg: [
      { url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_384.jpg", width: 384 },
      { url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_1536.jpg", width: 1536 },
    ],
    webp: [{ url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_1152.webp", width: 1152 }],
  };
  // The cache is itself a JSON string embedded in the outer JSON document.
  const gdpClientCache = JSON.stringify({
    SomeQuery: {
      property: {
        responsivePhotos: [{ mixedSources }],
      },
    },
  });
  const scriptText = JSON.stringify({
    props: {
      pageProps: {
        componentProps: { gdpClientCache },
      },
    },
  });

  const extracted = extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText);

  assert.deepEqual(extracted, [
    { url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_1536.jpg", width: 1536 },
  ]);
});
|
||||
182
skills/web-automation/scripts/zillow-photos.js
Normal file
182
skills/web-automation/scripts/zillow-photos.js
Normal file
@@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import {
|
||||
clickPhotoEntryPoint,
|
||||
createPageSession,
|
||||
dismissCommonOverlays,
|
||||
fail,
|
||||
gotoListing,
|
||||
normalizeImageCandidates,
|
||||
parseTarget,
|
||||
scrollUntilSettled,
|
||||
sleep,
|
||||
waitForPhotoExperience,
|
||||
} from "./real-estate-photo-common.js";
|
||||
import { extractZillowStructuredPhotoCandidatesFromNextDataScript } from "./zillow-photo-data.js";
|
||||
|
||||
// Accessible-name / text patterns for the control that opens Zillow's all-photos
// view. main() passes this list to clickPhotoEntryPoint, which tries the patterns
// in array order. The first pattern makes the photo count optional, so it already
// matches everything the second pattern matches.
const ZILLOW_LABELS = [
|
||||
/^See all(?: \d+)? photos$/i,
|
||||
/^See all photos$/i,
|
||||
/^Photos$/i,
|
||||
];
|
||||
|
||||
/**
 * Read the photo count from the first "See all <N> photos" text on the page.
 *
 * Runs in the page against `document.body.innerText`; matching is case-insensitive.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn)`.
 * @returns {Promise<number|null>} The announced count, or null when the text is absent.
 */
async function getAnnouncedPhotoCount(page) {
  return page.evaluate(() => {
    const bodyText = document.body?.innerText || "";
    const found = /See all\s+(\d+)\s+photos/i.exec(bodyText);
    if (!found) return null;
    return Number(found[1]);
  });
}
|
||||
|
||||
/**
 * Compute the ranking key of one Zillow photo variant.
 *
 * @param {{url: string, width?: number, height?: number}} photo
 * @returns {{preferred: boolean, size: number}} `preferred` is true for `-p_d` variants.
 *   `size` is the known pixel area (or a single known dimension), plus the
 *   `-cc_ft_<N>` width for non-preferred variants.
 */
function rankZillowPhoto(photo) {
  const area = (photo.width || 0) * (photo.height || 0) || photo.width || photo.height || 0;
  const preferred = /-p_d\.(?:jpe?g|webp)$/i.test(photo.url);
  const variantWidth = preferred ? 0 : Number(photo.url.match(/-cc_ft_(\d+)\./i)?.[1] || 0);
  return { preferred, size: variantWidth + area };
}

/**
 * Collapse size variants of the same Zillow photo down to one URL each.
 *
 * Variants are grouped by the filename part before the first "-". Within a
 * group a `-p_d` variant always beats any other variant; otherwise the larger
 * size key wins, and the earlier candidate wins ties. The preference is
 * compared as a separate tier rather than as an additive bias, so a very
 * large rendered area on a `-cc_ft_*` variant cannot outrank `-p_d`.
 *
 * @param {Array<{url: string, pathname: string, width?: number, height?: number}>} candidates
 * @returns {Array<object>} One candidate per photo, sorted by URL.
 */
function collapseZillowPhotos(candidates) {
  const byBaseId = new Map();

  for (const candidate of candidates) {
    const filename = candidate.pathname.split("/").pop() || "";
    const baseId = filename.split("-")[0];
    const rank = rankZillowPhoto(candidate);
    const existing = byBaseId.get(baseId);

    const wins =
      !existing ||
      (rank.preferred !== existing.rank.preferred ? rank.preferred : rank.size > existing.rank.size);

    if (wins) {
      byBaseId.set(baseId, { candidate, rank });
    }
  }

  return Array.from(byBaseId.values(), (entry) => entry.candidate).sort((a, b) => a.url.localeCompare(b.url));
}
|
||||
|
||||
/**
 * Collect image URL candidates from Zillow's rendered photo stream.
 *
 * Each selector is queried in turn. `<img>` nodes contribute their current
 * source and every `srcset` entry, tagged with the natural or client size;
 * `<source>` nodes contribute every `srcset` entry with unknown size. A node
 * matched by several selectors is reported once per match.
 *
 * @param {object} page - Page exposing `evaluate(fn)`.
 * @returns {Promise<Array<{url: string, width: number, height: number}>>}
 */
async function collectZillowPhotoCandidates(page) {
  // The callback runs inside the page and must not reference anything outside itself.
  return page.evaluate(() => {
    const selectors = [
      ".media-stream-tile img",
      ".media-stream-tile source",
      '[class*="media-stream"] img',
      '[class*="media-stream"] source',
      'img[alt*="image of "]',
      'img[alt*="image of this home"]',
    ];

    const found = [];
    const record = (url, width, height) => {
      if (!url) return;
      found.push({ url, width: Number(width || 0), height: Number(height || 0) });
    };

    // Keep only the URL part of each srcset entry, dropping its descriptor.
    const srcsetUrls = (srcset) => {
      const urls = [];
      for (const entry of (srcset || "").split(",")) {
        const url = entry.trim().split(/\s+/)[0];
        if (url) urls.push(url);
      }
      return urls;
    };

    const matched = [];
    for (const selector of selectors) {
      matched.push(...Array.from(document.querySelectorAll(selector)));
    }

    for (const element of matched) {
      if (element instanceof HTMLImageElement) {
        record(
          element.currentSrc || element.src,
          element.naturalWidth || element.clientWidth,
          element.naturalHeight || element.clientHeight
        );
        for (const url of srcsetUrls(element.srcset)) {
          record(url, element.naturalWidth || element.clientWidth, element.naturalHeight || element.clientHeight);
        }
        continue;
      }

      if (element instanceof HTMLSourceElement) {
        for (const url of srcsetUrls(element.srcset)) {
          record(url, 0, 0);
        }
      }
    }

    return found;
  });
}
|
||||
|
||||
/**
 * Extract photo candidates from the page's `#__NEXT_DATA__` script.
 *
 * A failure to read the script text is treated the same as an empty script.
 *
 * @param {object} page - Page exposing `locator(selector)`.
 * @returns {Promise<Array<object>>} Whatever the structured-data extractor returns.
 */
async function collectZillowStructuredPhotoCandidates(page) {
  const nextDataNode = page.locator("#__NEXT_DATA__");
  const rawJson = await nextDataNode.textContent().catch(() => null);
  const scriptText = rawJson || "";
  return extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText);
}
|
||||
|
||||
/**
 * Open a Zillow listing, try its all-photos view, and print the extracted
 * photo URLs as pretty-printed JSON on stdout.
 *
 * The listing URL is read from process.argv[2]. Setting HEADLESS=false in the
 * environment launches a visible browser. A failed all-photos click is
 * tolerated: extraction still runs against whatever the page exposes, and the
 * click error is recorded in the result notes. Every fatal failure is reported
 * through fail(), i.e. as one JSON line on stderr with exit status 1.
 */
async function main() {
  const requestedUrl = parseTarget(process.argv[2]);
  const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });

  try {
    await gotoListing(page, requestedUrl);
    await dismissCommonOverlays(page);

    // Read the announced count before any navigation into the gallery view.
    const expectedPhotoCount = await getAnnouncedPhotoCount(page);
    const beforeUrl = page.url();
    let clickedLabel = null;
    let clickError = null;

    try {
      clickedLabel = await clickPhotoEntryPoint(page, ZILLOW_LABELS);
      await waitForPhotoExperience(page, beforeUrl);
      await scrollUntilSettled(page);
      await sleep(1200);
    } catch (error) {
      clickError = error instanceof Error ? error.message : String(error);
    }

    const [structuredCandidates, renderedCandidates] = await Promise.all([
      collectZillowStructuredPhotoCandidates(page),
      collectZillowPhotoCandidates(page),
    ]);
    const candidates = [...structuredCandidates, ...renderedCandidates];
    const normalized = normalizeImageCandidates(candidates, {
      hostIncludes: ["photos.zillowstatic.com"],
      minWidth: 240,
      minHeight: 180,
    });
    const photos = collapseZillowPhotos(normalized);

    if (!photos.length) {
      // Throw instead of calling fail() here: fail() exits immediately, which
      // would skip closing the browser context. The catch below closes it and
      // then reports the same message.
      throw new Error(clickError || "No Zillow image URLs were found on the rendered listing page.");
    }

    // Without an announced count there is nothing to compare against.
    const complete = expectedPhotoCount ? photos.length >= expectedPhotoCount : true;
    const notes = [];
    if (clickedLabel) {
      notes.push("Opened Zillow all-photos flow and extracted direct Zillow image URLs.");
    } else {
      notes.push("The rendered Zillow listing shell already exposed the Zillow photo stream, so extraction completed without relying on the all-photos click path.");
    }
    if (clickError) {
      notes.push(`All-photos click path was not required: ${clickError}`);
    }

    const result = {
      source: "zillow",
      requestedUrl,
      finalUrl: page.url(),
      title: await page.title(),
      clickedLabel,
      expectedPhotoCount,
      complete,
      photoCount: photos.length,
      imageUrls: photos.map((photo) => photo.url),
      notes,
    };

    process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
    await context.close();
  } catch (error) {
    try {
      await context.close();
    } catch {
      // Ignore close errors after the primary failure.
    }
    fail("Zillow photo extraction failed.", error instanceof Error ? error.message : String(error));
  }
}

// parseTarget/createPageSession run before the try block above; route any
// rejection from them through fail() so stderr keeps the JSON error format.
main().catch((error) => {
  fail("Zillow photo extraction failed.", error instanceof Error ? error.message : String(error));
});
|
||||
Reference in New Issue
Block a user