Trust embedded Zillow photo sets without visible count
This commit is contained in:
@@ -182,6 +182,7 @@ node zillow-photos.js "https://www.zillow.com/homedetails/4141-Whiteley-Dr-Corpu
|
|||||||
What it does:
|
What it does:
|
||||||
- opens the listing page with CloakBrowser
|
- opens the listing page with CloakBrowser
|
||||||
- first checks whether the rendered listing shell already exposes a complete photo set in Zillow's embedded `__NEXT_DATA__` payload
|
- first checks whether the rendered listing shell already exposes a complete photo set in Zillow's embedded `__NEXT_DATA__` payload
|
||||||
|
- if the visible `See all X photos` count is missing, still trusts the embedded set when the page metadata confirms the count or when the embedded set is already clearly substantial
|
||||||
- only tries the `See all photos` / `See all X photos` entry point when the initial structured data is incomplete
|
- only tries the `See all photos` / `See all X photos` entry point when the initial structured data is incomplete
|
||||||
- returns direct `photos.zillowstatic.com` image URLs as JSON
|
- returns direct `photos.zillowstatic.com` image URLs as JSON
|
||||||
- fails fast with a timeout if the browser-backed extraction stalls
|
- fails fast with a timeout if the browser-backed extraction stalls
|
||||||
|
|||||||
@@ -158,6 +158,7 @@ The discovery scripts are purpose-built for the common address-to-listing workfl
|
|||||||
The photo scripts are purpose-built for the common `See all photos` / `Show all photos` workflow:
|
The photo scripts are purpose-built for the common `See all photos` / `Show all photos` workflow:
|
||||||
- open the listing page
|
- open the listing page
|
||||||
- on Zillow, first inspect the rendered listing shell for a complete structured `__NEXT_DATA__` photo set
|
- on Zillow, first inspect the rendered listing shell for a complete structured `__NEXT_DATA__` photo set
|
||||||
|
- if the visible page count is missing, trust the structured Zillow photo set when page metadata confirms the count or when the embedded set is already clearly substantial
|
||||||
- only force the all-photos click path when the initial Zillow page data is incomplete
|
- only force the all-photos click path when the initial Zillow page data is incomplete
|
||||||
- wait for the resulting photo page or scroller view when the click path is actually needed
|
- wait for the resulting photo page or scroller view when the click path is actually needed
|
||||||
- extract direct image URLs from the rendered page
|
- extract direct image URLs from the rendered page
|
||||||
|
|||||||
@@ -58,11 +58,29 @@ export function extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptT
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Threshold used when neither an announced nor a fallback photo count is
// available and the caller did not supply a usable `minimumTrustCount`.
const DEFAULT_MINIMUM_TRUSTED_STRUCTURED_PHOTO_COUNT = 12;

/**
 * Decide whether a list of structured photo candidates is large enough to be
 * used as-is.
 *
 * Signals are consulted in priority order; the first usable one decides:
 *   1. `expectedPhotoCount`  - trusted when `candidates.length` reaches it.
 *   2. `fallbackPhotoCount`  - same comparison, used only if (1) is unusable.
 *   3. `minimumTrustCount`   - plain size threshold when no count is known.
 *
 * A count is "usable" when it coerces to a finite number greater than zero.
 *
 * @param {Array|*} candidates - Candidate list; any non-array counts as empty.
 * @param {number|object} [options] - Either the expected photo count (legacy
 *   positional form) or an options object.
 * @param {number|null} [options.expectedPhotoCount]
 * @param {number|null} [options.fallbackPhotoCount]
 * @param {number} [options.minimumTrustCount] - Defaults to
 *   DEFAULT_MINIMUM_TRUSTED_STRUCTURED_PHOTO_COUNT; values that are not a
 *   positive number are ignored in favor of that default.
 * @returns {boolean} True when the candidate list satisfies the deciding signal.
 */
export function shouldUseStructuredZillowPhotos(candidates, options = {}) {
  const count = Array.isArray(candidates) ? candidates.length : 0;

  // Keep the original `(candidates, expectedPhotoCount)` call shape working.
  let settings = {};
  if (typeof options === "number") {
    settings = { expectedPhotoCount: options };
  } else if (options && typeof options === "object") {
    settings = options;
  }

  const isUsableCount = (value) => Number.isFinite(value) && value > 0;

  const expectedPhotoCount = Number(settings.expectedPhotoCount || 0);
  if (isUsableCount(expectedPhotoCount)) {
    return count >= expectedPhotoCount;
  }

  const fallbackPhotoCount = Number(settings.fallbackPhotoCount || 0);
  if (isUsableCount(fallbackPhotoCount)) {
    return count >= fallbackPhotoCount;
  }

  // A NaN threshold would reject every set and a negative one would accept an
  // empty set, so anything that is not a positive number falls back to the
  // default. (`NaN > 0` is false, which routes NaN to the default as well.)
  const requestedMinimum = Number(settings.minimumTrustCount);
  const minimumTrustCount =
    requestedMinimum > 0
      ? requestedMinimum
      : DEFAULT_MINIMUM_TRUSTED_STRUCTURED_PHOTO_COUNT;

  return count >= minimumTrustCount;
}
|
||||||
|
|||||||
@@ -84,5 +84,38 @@ test("shouldUseStructuredZillowPhotos returns false when structured photos are i
|
|||||||
url: `https://photos.zillowstatic.com/fp/photo-${index + 1}-p_d.jpg`,
|
url: `https://photos.zillowstatic.com/fp/photo-${index + 1}-p_d.jpg`,
|
||||||
}));
|
}));
|
||||||
|
|
||||||
assert.equal(shouldUseStructuredZillowPhotos(candidates, 29), false);
|
assert.equal(shouldUseStructuredZillowPhotos(candidates, { expectedPhotoCount: 29 }), false);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Table of cases for shouldUseStructuredZillowPhotos when the visible page
// count is unavailable: each row gives the number of candidates to build, the
// options passed in, and the verdict the function must return.
for (const { name, length, options, expected } of [
  {
    name: "shouldUseStructuredZillowPhotos returns true when meta-description count matches even without visible page count",
    length: 29,
    options: { expectedPhotoCount: null, fallbackPhotoCount: 29 },
    expected: true,
  },
  {
    name: "shouldUseStructuredZillowPhotos returns true for a substantial structured set when no count signal is available",
    length: 18,
    options: { expectedPhotoCount: null, fallbackPhotoCount: null },
    expected: true,
  },
  {
    name: "shouldUseStructuredZillowPhotos returns false for a tiny structured set when no count signal is available",
    length: 5,
    options: { expectedPhotoCount: null, fallbackPhotoCount: null },
    expected: false,
  },
]) {
  test(name, () => {
    const candidates = [];
    for (let index = 0; index < length; index += 1) {
      candidates.push({
        url: `https://photos.zillowstatic.com/fp/photo-${index + 1}-p_d.jpg`,
      });
    }

    assert.equal(shouldUseStructuredZillowPhotos(candidates, options), expected);
  });
}
|
||||||
|
|||||||
@@ -35,6 +35,17 @@ async function getAnnouncedPhotoCount(page) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Read a photo count out of the page's description metadata.
 *
 * Looks at `<meta name="description">` first and `<meta property="og:description">`
 * second, returning the number from the first tag whose content contains
 * "<digits> photo" or "<digits> photos" (case-insensitive).
 *
 * Each tag is examined on its own so that a description without a count does
 * not hide a count that is present in the Open Graph description.
 *
 * @param {object} page - Object exposing `evaluate(fn)`; the callback reads the
 *   global `document`, so it is written to depend only on its own locals.
 * @returns {Promise<number|null>} The parsed count, or null when neither tag
 *   mentions one.
 */
async function getMetaDescriptionPhotoCount(page) {
  return page.evaluate(() => {
    const selectors = [
      'meta[name="description"]',
      'meta[property="og:description"]',
    ];

    for (const selector of selectors) {
      const content = document.querySelector(selector)?.getAttribute("content") || "";
      // `photos?` also accepts the singular form used for one-photo listings.
      const match = content.match(/\b(\d+)\s+photos?\b/i);
      if (match) {
        return Number(match[1]);
      }
    }

    return null;
  });
}
|
||||||
|
|
||||||
function collapseZillowPhotos(candidates) {
|
function collapseZillowPhotos(candidates) {
|
||||||
const byBaseId = new Map();
|
const byBaseId = new Map();
|
||||||
|
|
||||||
@@ -137,14 +148,29 @@ export async function extractZillowPhotos(rawUrl, options = {}) {
|
|||||||
await dismissCommonOverlays(page);
|
await dismissCommonOverlays(page);
|
||||||
|
|
||||||
const expectedPhotoCount = await getAnnouncedPhotoCount(page);
|
const expectedPhotoCount = await getAnnouncedPhotoCount(page);
|
||||||
|
const fallbackPhotoCount = await getMetaDescriptionPhotoCount(page);
|
||||||
const initialStructuredPhotos = normalizeZillowPhotos(
|
const initialStructuredPhotos = normalizeZillowPhotos(
|
||||||
await collectZillowStructuredPhotoCandidates(page)
|
await collectZillowStructuredPhotoCandidates(page)
|
||||||
);
|
);
|
||||||
|
|
||||||
if (shouldUseStructuredZillowPhotos(initialStructuredPhotos, expectedPhotoCount)) {
|
if (
|
||||||
|
shouldUseStructuredZillowPhotos(initialStructuredPhotos, {
|
||||||
|
expectedPhotoCount,
|
||||||
|
fallbackPhotoCount,
|
||||||
|
})
|
||||||
|
) {
|
||||||
const notes = [
|
const notes = [
|
||||||
"The rendered Zillow listing shell already exposed a complete structured photo set, so extraction completed without relying on the all-photos click path.",
|
"The rendered Zillow listing shell already exposed a complete structured photo set, so extraction completed without relying on the all-photos click path.",
|
||||||
];
|
];
|
||||||
|
if (!expectedPhotoCount && fallbackPhotoCount) {
|
||||||
|
notes.push(
|
||||||
|
`Trusted the embedded Zillow photo set using the page metadata count of ${fallbackPhotoCount} photos.`
|
||||||
|
);
|
||||||
|
} else if (!expectedPhotoCount && !fallbackPhotoCount) {
|
||||||
|
notes.push(
|
||||||
|
"Trusted a substantial embedded Zillow photo set even though the page did not expose an explicit photo count."
|
||||||
|
);
|
||||||
|
}
|
||||||
if (attempt > 1) {
|
if (attempt > 1) {
|
||||||
notes.push(
|
notes.push(
|
||||||
"Recovered after retrying Zillow photo extraction once because the first browser session closed unexpectedly."
|
"Recovered after retrying Zillow photo extraction once because the first browser session closed unexpectedly."
|
||||||
@@ -156,7 +182,7 @@ export async function extractZillowPhotos(rawUrl, options = {}) {
|
|||||||
finalUrl: page.url(),
|
finalUrl: page.url(),
|
||||||
title: await page.title(),
|
title: await page.title(),
|
||||||
clickedLabel: null,
|
clickedLabel: null,
|
||||||
expectedPhotoCount,
|
expectedPhotoCount: expectedPhotoCount || fallbackPhotoCount || null,
|
||||||
complete: true,
|
complete: true,
|
||||||
photoCount: initialStructuredPhotos.length,
|
photoCount: initialStructuredPhotos.length,
|
||||||
imageUrls: initialStructuredPhotos.map((photo) => photo.url),
|
imageUrls: initialStructuredPhotos.map((photo) => photo.url),
|
||||||
|
|||||||
Reference in New Issue
Block a user