Prefer structured Zillow photo data before click path
This commit is contained in:
@@ -181,8 +181,8 @@ node zillow-photos.js "https://www.zillow.com/homedetails/4141-Whiteley-Dr-Corpu
|
|||||||
|
|
||||||
What it does:
|
What it does:
|
||||||
- opens the listing page with CloakBrowser
|
- opens the listing page with CloakBrowser
|
||||||
- tries the `See all photos` / `See all X photos` entry point
|
- first checks whether the rendered listing shell already exposes a complete photo set in Zillow's embedded `__NEXT_DATA__` payload
|
||||||
- if Zillow keeps the click path flaky, falls back to the listing's embedded `__NEXT_DATA__` payload
|
- only tries the `See all photos` / `See all X photos` entry point when the initial structured data is incomplete
|
||||||
- returns direct `photos.zillowstatic.com` image URLs as JSON
|
- returns direct `photos.zillowstatic.com` image URLs as JSON
|
||||||
- fails fast with a timeout if the browser-backed extraction stalls
|
- fails fast with a timeout if the browser-backed extraction stalls
|
||||||
|
|
||||||
|
|||||||
@@ -157,8 +157,9 @@ The discovery scripts are purpose-built for the common address-to-listing workfl
|
|||||||
|
|
||||||
The photo scripts are purpose-built for the common `See all photos` / `Show all photos` workflow:
|
The photo scripts are purpose-built for the common `See all photos` / `Show all photos` workflow:
|
||||||
- open the listing page
|
- open the listing page
|
||||||
- click the all-photos entry point
|
- on Zillow, first inspect the rendered listing shell for a complete structured `__NEXT_DATA__` photo set
|
||||||
- wait for the resulting photo page or scroller view
|
- only force the all-photos click path when the initial Zillow page data is incomplete
|
||||||
|
- wait for the resulting photo page or scroller view when the click path is actually needed
|
||||||
- extract direct image URLs from the rendered page
|
- extract direct image URLs from the rendered page
|
||||||
- fail fast with a timeout instead of hanging indefinitely when the browser-backed extraction stalls
|
- fail fast with a timeout instead of hanging indefinitely when the browser-backed extraction stalls
|
||||||
- support longer source-specific timeouts when a caller such as `property-assessor` imports them for slower exact-unit Zillow renders
|
- support longer source-specific timeouts when a caller such as `property-assessor` imports them for slower exact-unit Zillow renders
|
||||||
|
|||||||
@@ -57,3 +57,12 @@ export function extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptT
|
|||||||
|
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Decide whether a set of structured photo candidates can be treated as the
 * complete photo set for a listing.
 *
 * @param {unknown} candidates - Structured photo candidates. Any value that is
 *   not an array is counted as zero photos.
 * @param {number} expectedPhotoCount - The photo count the caller expects the
 *   listing to have.
 * @returns {boolean} `true` only when `expectedPhotoCount` is a finite number
 *   greater than zero and at least that many candidates are present.
 */
export function shouldUseStructuredZillowPhotos(candidates, expectedPhotoCount) {
  const count = Array.isArray(candidates) ? candidates.length : 0;

  // Without a usable expected count there is nothing to compare against, so
  // completeness cannot be confirmed. Number.isFinite does not coerce, so
  // null, undefined, NaN and numeric strings all land here.
  if (!Number.isFinite(expectedPhotoCount) || expectedPhotoCount <= 0) {
    return false;
  }

  return count >= expectedPhotoCount;
}
|
||||||
|
|||||||
@@ -1,7 +1,10 @@
|
|||||||
import test from "node:test";
|
import test from "node:test";
|
||||||
import assert from "node:assert/strict";
|
import assert from "node:assert/strict";
|
||||||
|
|
||||||
import { extractZillowStructuredPhotoCandidatesFromNextDataScript } from "./zillow-photo-data.js";
|
import {
|
||||||
|
extractZillowStructuredPhotoCandidatesFromNextDataScript,
|
||||||
|
shouldUseStructuredZillowPhotos,
|
||||||
|
} from "./zillow-photo-data.js";
|
||||||
|
|
||||||
test("extractZillowStructuredPhotoCandidatesFromNextDataScript reads responsivePhotos", () => {
|
test("extractZillowStructuredPhotoCandidatesFromNextDataScript reads responsivePhotos", () => {
|
||||||
const scriptText = JSON.stringify({
|
const scriptText = JSON.stringify({
|
||||||
@@ -67,3 +70,19 @@ test("extractZillowStructuredPhotoCandidatesFromNextDataScript falls back to mix
|
|||||||
{ url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_1536.jpg", width: 1536 },
|
{ url: "https://photos.zillowstatic.com/fp/photo-one-cc_ft_1536.jpg", width: 1536 },
|
||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Positive case: the structured candidate count equals the announced count,
// so the structured set should be accepted as complete.
test("shouldUseStructuredZillowPhotos returns true when structured photos already match the announced count", () => {
  const candidates = Array.from({ length: 29 }, (_, index) => ({
    url: `https://photos.zillowstatic.com/fp/photo-${index + 1}-p_d.jpg`,
  }));

  assert.equal(shouldUseStructuredZillowPhotos(candidates, 29), true);
});
|
||||||
|
|
||||||
|
// Negative case: fewer structured candidates than the announced count means
// the structured set must not be treated as complete.
test("shouldUseStructuredZillowPhotos returns false when structured photos are incomplete for the announced count", () => {
  const candidates = Array.from({ length: 12 }, (_, index) => ({
    url: `https://photos.zillowstatic.com/fp/photo-${index + 1}-p_d.jpg`,
  }));

  assert.equal(shouldUseStructuredZillowPhotos(candidates, 29), false);
});
|
||||||
|
|||||||
@@ -16,7 +16,10 @@ import {
|
|||||||
sleep,
|
sleep,
|
||||||
waitForPhotoExperience,
|
waitForPhotoExperience,
|
||||||
} from "./real-estate-photo-common.js";
|
} from "./real-estate-photo-common.js";
|
||||||
import { extractZillowStructuredPhotoCandidatesFromNextDataScript } from "./zillow-photo-data.js";
|
import {
|
||||||
|
extractZillowStructuredPhotoCandidatesFromNextDataScript,
|
||||||
|
shouldUseStructuredZillowPhotos,
|
||||||
|
} from "./zillow-photo-data.js";
|
||||||
|
|
||||||
const ZILLOW_LABELS = [
|
const ZILLOW_LABELS = [
|
||||||
/^See all(?: \d+)? photos$/i,
|
/^See all(?: \d+)? photos$/i,
|
||||||
@@ -106,6 +109,15 @@ async function collectZillowStructuredPhotoCandidates(page) {
|
|||||||
return extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText || "");
|
return extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText || "");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalize raw photo candidates and collapse the result into the final
 * photo list.
 *
 * Forwards the candidates to `normalizeImageCandidates` with the
 * `photos.zillowstatic.com` host option and 240x180 minimum-size options, then
 * hands the normalized list to `collapseZillowPhotos`.
 * NOTE(review): the exact filtering and collapsing rules live in those two
 * helpers, which are not visible here — confirm against their definitions.
 *
 * @param {Array} candidates - Raw photo candidates (structured and/or rendered).
 * @returns {*} Whatever `collapseZillowPhotos` returns for the normalized
 *   candidates; callers in this file read `.length` and `.url` on its entries.
 */
function normalizeZillowPhotos(candidates) {
  const normalized = normalizeImageCandidates(candidates, {
    hostIncludes: ["photos.zillowstatic.com"],
    minWidth: 240,
    minHeight: 180,
  });
  return collapseZillowPhotos(normalized);
}
|
||||||
|
|
||||||
export async function extractZillowPhotos(rawUrl, options = {}) {
|
export async function extractZillowPhotos(rawUrl, options = {}) {
|
||||||
const requestedUrl = parseTarget(rawUrl);
|
const requestedUrl = parseTarget(rawUrl);
|
||||||
const maxAttempts = 2;
|
const maxAttempts = 2;
|
||||||
@@ -125,6 +137,33 @@ export async function extractZillowPhotos(rawUrl, options = {}) {
|
|||||||
await dismissCommonOverlays(page);
|
await dismissCommonOverlays(page);
|
||||||
|
|
||||||
const expectedPhotoCount = await getAnnouncedPhotoCount(page);
|
const expectedPhotoCount = await getAnnouncedPhotoCount(page);
|
||||||
|
const initialStructuredPhotos = normalizeZillowPhotos(
|
||||||
|
await collectZillowStructuredPhotoCandidates(page)
|
||||||
|
);
|
||||||
|
|
||||||
|
if (shouldUseStructuredZillowPhotos(initialStructuredPhotos, expectedPhotoCount)) {
|
||||||
|
const notes = [
|
||||||
|
"The rendered Zillow listing shell already exposed a complete structured photo set, so extraction completed without relying on the all-photos click path.",
|
||||||
|
];
|
||||||
|
if (attempt > 1) {
|
||||||
|
notes.push(
|
||||||
|
"Recovered after retrying Zillow photo extraction once because the first browser session closed unexpectedly."
|
||||||
|
);
|
||||||
|
}
|
||||||
|
return {
|
||||||
|
source: "zillow",
|
||||||
|
requestedUrl,
|
||||||
|
finalUrl: page.url(),
|
||||||
|
title: await page.title(),
|
||||||
|
clickedLabel: null,
|
||||||
|
expectedPhotoCount,
|
||||||
|
complete: true,
|
||||||
|
photoCount: initialStructuredPhotos.length,
|
||||||
|
imageUrls: initialStructuredPhotos.map((photo) => photo.url),
|
||||||
|
notes,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
const beforeUrl = page.url();
|
const beforeUrl = page.url();
|
||||||
let clickedLabel = null;
|
let clickedLabel = null;
|
||||||
let clickError = null;
|
let clickError = null;
|
||||||
@@ -142,13 +181,7 @@ export async function extractZillowPhotos(rawUrl, options = {}) {
|
|||||||
collectZillowStructuredPhotoCandidates(page),
|
collectZillowStructuredPhotoCandidates(page),
|
||||||
collectZillowPhotoCandidates(page),
|
collectZillowPhotoCandidates(page),
|
||||||
]);
|
]);
|
||||||
const candidates = [...structuredCandidates, ...renderedCandidates];
|
const photos = normalizeZillowPhotos([...structuredCandidates, ...renderedCandidates]);
|
||||||
const normalized = normalizeImageCandidates(candidates, {
|
|
||||||
hostIncludes: ["photos.zillowstatic.com"],
|
|
||||||
minWidth: 240,
|
|
||||||
minHeight: 180,
|
|
||||||
});
|
|
||||||
const photos = collapseZillowPhotos(normalized);
|
|
||||||
|
|
||||||
if (!photos.length) {
|
if (!photos.length) {
|
||||||
fail(
|
fail(
|
||||||
|
|||||||
Reference in New Issue
Block a user