#!/usr/bin/env node

import {
  clickPhotoEntryPoint,
  createPageSession,
  dismissCommonOverlays,
  fail,
  gotoListing,
  normalizeImageCandidates,
  parseTarget,
  scrollUntilSettled,
  sleep,
  waitForPhotoExperience,
} from "./real-estate-photo-common.js";
import { extractZillowStructuredPhotoCandidatesFromNextDataScript } from "./zillow-photo-data.js";

/**
 * Label patterns handed to `clickPhotoEntryPoint` to locate the control that
 * opens the all-photos view, in priority order.
 *
 * The first pattern covers both "See all photos" and "See all 42 photos"
 * (the count is optional), so no separate count-less pattern is needed.
 * Whitespace is matched with `\s+` to stay consistent with the regex used by
 * `getAnnouncedPhotoCount`.
 */
const ZILLOW_LABELS = [
  /^See all(?:\s+\d+)?\s+photos$/i,
  /^Photos$/i,
];

/**
 * Reads the photo count announced by the page's "See all N photos" text.
 *
 * @param {object} page - Page handle exposing `evaluate(fn)`; the callback
 *   reads `document`, so it must be run where the DOM is available.
 * @returns {Promise<number|null>} The announced count, or `null` when the
 *   body text contains no "See all N photos" phrase.
 */
async function getAnnouncedPhotoCount(page) {
  return page.evaluate(() => {
    // `document.body` may be missing; fall back to an empty string so the
    // match below simply fails.
    const text = document.body?.innerText || "";
    const match = text.match(/See all\s+(\d+)\s+photos/i);
    return match ? Number(match[1]) : null;
  });
}

/**
 * Ranks one image candidate; a higher score wins within a photo group.
 *
 * The score is the sum of:
 *  - a size term: `width * height`, falling back to whichever single
 *    dimension is known, else 0;
 *  - a URL preference: 10,000,000 for a `-p_d.<jpg|jpeg|webp>` URL, otherwise
 *    the number embedded in a `-cc_ft_<n>.` URL (0 when neither matches).
 *
 * @param {{url: string, width?: number, height?: number}} candidate
 * @returns {number}
 */
function scoreZillowPhoto(candidate) {
  const sizeScore =
    (candidate.width || 0) * (candidate.height || 0) || candidate.width || candidate.height || 0;
  const preference = /-p_d\.(?:jpe?g|webp)$/i.test(candidate.url)
    ? 10_000_000
    : Number(candidate.url.match(/-cc_ft_(\d+)\./i)?.[1] || 0);
  return preference + sizeScore;
}

/**
 * Collapses multiple renditions of the same photo down to one candidate each.
 *
 * Candidates are grouped by the part of their filename before the first `-`.
 * Within a group the highest-scoring candidate (see `scoreZillowPhoto`) is
 * kept; on a tie the candidate seen first wins.
 *
 * @param {Array<{url: string, pathname: string, width?: number, height?: number}>} candidates
 * @returns {Array<object>} One candidate object per group (the same object
 *   references that were passed in), sorted by URL via `localeCompare`.
 */
function collapseZillowPhotos(candidates) {
  // baseId -> { candidate, score }; caching the score avoids re-scoring the
  // current winner for every later candidate in the same group.
  const bestByBaseId = new Map();

  for (const candidate of candidates) {
    const filename = candidate.pathname.split("/").pop() || "";
    const baseId = filename.split("-")[0];
    const score = scoreZillowPhoto(candidate);
    const best = bestByBaseId.get(baseId);

    // Strict `>` keeps the earliest candidate when scores are equal.
    if (!best || score > best.score) {
      bestByBaseId.set(baseId, { candidate, score });
    }
  }

  return Array.from(bestByBaseId.values(), (entry) => entry.candidate).sort((a, b) =>
    a.url.localeCompare(b.url)
  );
}

/**
 * Collects raw image URL candidates from the rendered page's photo elements.
 *
 * For each matched `<img>` it records `currentSrc`/`src` plus every URL in
 * its `srcset`, all tagged with the image's natural (or, failing that,
 * client) dimensions. For each matched `<source>` it records every `srcset`
 * URL with dimensions 0x0.
 *
 * @param {object} page - Page handle exposing `evaluate(fn)`.
 * @returns {Promise<Array<{url: string, width: number, height: number}>>}
 *   Unfiltered candidates; the same URL may appear more than once (e.g. as
 *   both `currentSrc` and a `srcset` entry).
 */
async function collectZillowPhotoCandidates(page) {
  // The callback relies on DOM globals (`document`, `HTMLImageElement`), so
  // it is kept fully self-contained with no references to outer scope.
  return page.evaluate(() => {
    const out = [];
    const add = (url, width, height) => {
      if (url) out.push({ url, width: Number(width || 0), height: Number(height || 0) });
    };
    // Keeps only the URL part of each comma-separated srcset entry.
    const parseSrcset = (srcset) =>
      (srcset || "")
        .split(",")
        .map((entry) => entry.trim().split(/\s+/)[0])
        .filter(Boolean);

    const selectors = [
      ".media-stream-tile img",
      ".media-stream-tile source",
      '[class*="media-stream"] img',
      '[class*="media-stream"] source',
      'img[alt*="image of "]',
      'img[alt*="image of this home"]',
    ];

    // The selectors overlap, so one element can match several of them. A Set
    // processes each element once while preserving first-seen order.
    const nodes = new Set(
      selectors.flatMap((selector) => Array.from(document.querySelectorAll(selector)))
    );

    for (const node of nodes) {
      if (node instanceof HTMLImageElement) {
        const width = node.naturalWidth || node.clientWidth;
        const height = node.naturalHeight || node.clientHeight;
        add(node.currentSrc || node.src, width, height);
        for (const url of parseSrcset(node.srcset)) {
          add(url, width, height);
        }
      } else if (node instanceof HTMLSourceElement) {
        for (const url of parseSrcset(node.srcset)) {
          add(url, 0, 0);
        }
      }
    }

    return out;
  });
}

/**
 * Extracts photo candidates from the page's `#__NEXT_DATA__` script payload.
 *
 * This is best-effort: if the element is absent or cannot be read, the
 * extractor is called with an empty string instead of raising.
 *
 * @param {object} page - Page handle exposing `locator(selector)`.
 * @returns {Promise<*>} Whatever
 *   `extractZillowStructuredPhotoCandidatesFromNextDataScript` returns for the
 *   script text (or for `""` when no text could be read).
 */
async function collectZillowStructuredPhotoCandidates(page) {
  let scriptText = null;

  try {
    const nextData = page.locator("#__NEXT_DATA__");
    // `textContent()` waits for the element to appear; checking `count()`
    // first avoids stalling until the timeout when the page has no such node.
    if ((await nextData.count()) > 0) {
      scriptText = await nextData.textContent();
    }
  } catch {
    // Deliberately best-effort: fall through with no structured data.
    scriptText = null;
  }

  return extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText || "");
}

/**
 * CLI entry point: loads the listing given in `process.argv[2]`, tries to
 * open the all-photos view, gathers image URLs from both the structured page
 * data and the rendered DOM, and prints a JSON summary to stdout.
 *
 * Set `HEADLESS=false` to run with a visible browser.
 *
 * Any failure inside the session is reported through `fail(...)` after the
 * browser context has been closed.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const requestedUrl = parseTarget(process.argv[2]);
  const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });

  try {
    await gotoListing(page, requestedUrl);
    await dismissCommonOverlays(page);

    // Read the announced count before clicking, while the listing text is
    // still the page that is showing.
    const expectedPhotoCount = await getAnnouncedPhotoCount(page);
    const beforeUrl = page.url();
    let clickedLabel = null;
    let clickError = null;

    // Opening the all-photos view is optional: a failure is recorded and
    // extraction still proceeds against whatever the page currently shows.
    try {
      clickedLabel = await clickPhotoEntryPoint(page, ZILLOW_LABELS);
      await waitForPhotoExperience(page, beforeUrl);
      await scrollUntilSettled(page);
      await sleep(1200);
    } catch (error) {
      clickError = error instanceof Error ? error.message : String(error);
    }

    const [structuredCandidates, renderedCandidates] = await Promise.all([
      collectZillowStructuredPhotoCandidates(page),
      collectZillowPhotoCandidates(page),
    ]);
    const candidates = [...structuredCandidates, ...renderedCandidates];
    const normalized = normalizeImageCandidates(candidates, {
      hostIncludes: ["photos.zillowstatic.com"],
      minWidth: 240,
      minHeight: 180,
    });
    const photos = collapseZillowPhotos(normalized);

    if (!photos.length) {
      // Throw rather than calling fail() here so the catch below closes the
      // browser context before the failure is reported; fail() receives the
      // same two arguments as before.
      throw new Error(clickError || "No Zillow image URLs were found on the rendered listing page.");
    }

    // With no announced count there is nothing to compare against, so the
    // result is treated as complete.
    const complete = expectedPhotoCount ? photos.length >= expectedPhotoCount : true;
    const notes = [];
    if (clickedLabel) {
      notes.push("Opened Zillow all-photos flow and extracted direct Zillow image URLs.");
    } else {
      notes.push("The rendered Zillow listing shell already exposed the Zillow photo stream, so extraction completed without relying on the all-photos click path.");
    }
    if (clickError) {
      notes.push(`All-photos click path was not required: ${clickError}`);
    }

    const result = {
      source: "zillow",
      requestedUrl,
      finalUrl: page.url(),
      title: await page.title(),
      clickedLabel,
      expectedPhotoCount,
      complete,
      photoCount: photos.length,
      imageUrls: photos.map((photo) => photo.url),
      notes,
    };

    process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
    await context.close();
  } catch (error) {
    try {
      await context.close();
    } catch {
      // Ignore close errors after the primary failure.
    }
    fail("Zillow photo extraction failed.", error instanceof Error ? error.message : String(error));
  }
}

// `parseTarget` and `createPageSession` run before main()'s own try block, so
// a failure there rejects the returned promise. Report it through the same
// fail() path instead of leaving the promise floating.
main().catch((error) => {
  fail("Zillow photo extraction failed.", error instanceof Error ? error.message : String(error));
});