fix: make property-assessor safer for whatsapp runs

This commit is contained in:
2026-03-28 01:28:59 -05:00
parent 2deeb31369
commit 3d7ce7617c
15 changed files with 640 additions and 217 deletions

View File

@@ -7,6 +7,7 @@ import {
dismissCommonOverlays,
fail,
gotoListing,
runWithOperationTimeout,
sleep,
} from "./real-estate-photo-common.js";
import { parseAddressIdentity, scoreAddressCandidate } from "./real-estate-address.js";
@@ -64,63 +65,69 @@ export async function discoverHarListing(rawAddress) {
const identity = parseAddressIdentity(address);
const searchUrl = buildSearchUrl(address);
const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
const closeContext = async () => {
await context.close().catch(() => {});
};
try {
const attempts = [`Opened HAR search URL: ${searchUrl}`];
await gotoListing(page, searchUrl, 2500);
await dismissCommonOverlays(page);
await sleep(1500);
return await runWithOperationTimeout(
"HAR discovery",
async () => {
const attempts = [`Opened HAR search URL: ${searchUrl}`];
await gotoListing(page, searchUrl, 2500);
await dismissCommonOverlays(page);
await sleep(1500);
let listingUrl = null;
if (page.url().includes("/homedetail/")) {
const directScore = scoreAddressCandidate(
identity,
`${page.url()} ${(await page.title()) || ""}`
);
if (directScore.matched) {
listingUrl = normalizeListingUrl(page.url());
attempts.push("HAR search URL resolved directly to a matching property page.");
} else {
attempts.push("HAR redirected to a property page, but it did not match the requested address closely enough.");
}
} else {
const discovered = await collectListingUrl(page);
const scored = discovered
.map((candidate) => {
const match = scoreAddressCandidate(
let listingUrl = null;
if (page.url().includes("/homedetail/")) {
const directScore = scoreAddressCandidate(
identity,
`${candidate.url} ${candidate.text} ${candidate.parentText}`
`${page.url()} ${(await page.title()) || ""}`
);
return { ...candidate, match };
})
.sort((a, b) => b.match.score - a.match.score);
if (directScore.matched) {
listingUrl = normalizeListingUrl(page.url());
attempts.push("HAR search URL resolved directly to a matching property page.");
} else {
attempts.push("HAR redirected to a property page, but it did not match the requested address closely enough.");
}
} else {
const discovered = await collectListingUrl(page);
const scored = discovered
.map((candidate) => {
const match = scoreAddressCandidate(
identity,
`${candidate.url} ${candidate.text} ${candidate.parentText}`
);
return { ...candidate, match };
})
.sort((a, b) => b.match.score - a.match.score);
if (scored[0]?.match.matched) {
listingUrl = normalizeListingUrl(scored[0].url);
attempts.push(`HAR search results exposed a matching homedetail link with score ${scored[0].match.score}.`);
} else {
attempts.push("HAR discovery did not expose a confident homedetail match for this address.");
if (scored[0]?.match.matched) {
listingUrl = normalizeListingUrl(scored[0].url);
attempts.push(`HAR search results exposed a matching homedetail link with score ${scored[0].match.score}.`);
} else {
attempts.push("HAR discovery did not expose a confident homedetail match for this address.");
}
}
return {
source: "har",
address,
searchUrl,
finalUrl: page.url(),
title: await page.title(),
listingUrl,
attempts,
};
},
{
onTimeout: closeContext
}
}
const result = {
source: "har",
address,
searchUrl,
finalUrl: page.url(),
title: await page.title(),
listingUrl,
attempts,
};
await context.close();
return result;
);
} catch (error) {
try {
await context.close();
} catch {
// Ignore close errors after the primary failure.
}
throw new Error(`HAR discovery failed: ${error instanceof Error ? error.message : String(error)}`);
} finally {
await closeContext();
}
}

View File

@@ -11,6 +11,7 @@ import {
gotoListing,
normalizeImageCandidates,
parseTarget,
runWithOperationTimeout,
scrollUntilSettled,
sleep,
waitForPhotoExperience,
@@ -34,51 +35,56 @@ async function getAnnouncedPhotoCount(page) {
/**
 * Opens a HAR listing in a browser session, enters the photo view, and
 * collects the large rendered image URLs served from the HAR photo hosts.
 *
 * NOTE(review): this listing appears to come from a rendered commit diff with
 * the +/- markers stripped, so statements of the previous inline
 * implementation and of the new `runWithOperationTimeout`-wrapped
 * implementation both appear below. Read duplicated statements as
 * "before"/"after" pairs rather than as code that runs twice.
 *
 * @param rawUrl Listing target; it is passed through `parseTarget` before
 *   navigation (accepted forms are defined there — confirm against that helper).
 * @returns An object with `source: "har"`, `requestedUrl`, `finalUrl`,
 *   `title`, `clickedLabel`, `expectedPhotoCount`, `complete`, `photoCount`,
 *   `imageUrls` and `notes`.
 * @throws {Error} Any failure is re-thrown as a new `Error` carrying only the
 *   original message (or `String(error)` for non-Error values).
 */
export async function extractHarPhotos(rawUrl) {
const requestedUrl = parseTarget(rawUrl);
// The session is headless unless the HEADLESS env var is exactly "false".
const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
// Best-effort close: a rejection from context.close() is swallowed so cleanup
// never masks the primary result or error.
const closeContext = async () => {
await context.close().catch(() => {});
};
try {
await gotoListing(page, requestedUrl);
await dismissCommonOverlays(page);
// The whole browser interaction runs under the shared operation timeout;
// on timeout the context is closed via the onTimeout hook passed below.
return await runWithOperationTimeout(
"HAR photo extraction",
async () => {
await gotoListing(page, requestedUrl);
await dismissCommonOverlays(page);
const expectedPhotoCount = await getAnnouncedPhotoCount(page);
const beforeUrl = page.url();
const clickedLabel = await clickPhotoEntryPoint(page, HAR_LABELS);
await waitForPhotoExperience(page, beforeUrl);
await scrollUntilSettled(page);
await sleep(1200);
const expectedPhotoCount = await getAnnouncedPhotoCount(page);
const beforeUrl = page.url();
const clickedLabel = await clickPhotoEntryPoint(page, HAR_LABELS);
await waitForPhotoExperience(page, beforeUrl);
await scrollUntilSettled(page);
await sleep(1200);
const candidates = await collectRenderedImageCandidates(page);
const photos = normalizeImageCandidates(candidates, {
hostIncludes: ["pics.harstatic.com", "photos.harstatic.com"],
minWidth: 240,
minHeight: 180,
});
const candidates = await collectRenderedImageCandidates(page);
const photos = normalizeImageCandidates(candidates, {
hostIncludes: ["pics.harstatic.com", "photos.harstatic.com"],
minWidth: 240,
minHeight: 180,
});
if (!photos.length) {
fail("HAR photo extraction failed.", "No large image URLs were found after opening the HAR all-photos view.");
}
if (!photos.length) {
fail("HAR photo extraction failed.", "No large image URLs were found after opening the HAR all-photos view.");
}
const result = {
source: "har",
requestedUrl,
finalUrl: page.url(),
title: await page.title(),
clickedLabel,
expectedPhotoCount,
complete: expectedPhotoCount ? photos.length >= expectedPhotoCount : true,
photoCount: photos.length,
imageUrls: photos.map((photo) => photo.url),
notes: ["Opened HAR all-photos flow and extracted large rendered image URLs from the photo page."],
};
await context.close();
return result;
return {
source: "har",
requestedUrl,
finalUrl: page.url(),
title: await page.title(),
clickedLabel,
expectedPhotoCount,
// With no announced count (falsy), the extraction is reported as complete.
complete: expectedPhotoCount ? photos.length >= expectedPhotoCount : true,
photoCount: photos.length,
imageUrls: photos.map((photo) => photo.url),
notes: ["Opened HAR all-photos flow and extracted large rendered image URLs from the photo page."],
};
},
{
onTimeout: closeContext
}
);
} catch (error) {
try {
await context.close();
} catch {
// Ignore close errors after the primary failure.
}
// Only the message is preserved; the original stack and error type are dropped.
throw new Error(error instanceof Error ? error.message : String(error));
} finally {
// Runs on success, failure and timeout alike.
await closeContext();
}
}

View File

@@ -7,6 +7,7 @@ const MAX_SCROLL_PASSES = 12;
const SCROLL_PAUSE_MS = 900;
const LARGE_IMAGE_MIN_WIDTH = 300;
const LARGE_IMAGE_MIN_HEIGHT = 200;
const OPERATION_TIMEOUT_MS = Number(process.env.REAL_ESTATE_OPERATION_TIMEOUT_MS || 25000);
export function fail(message, details) {
const payload = { error: message };
@@ -38,6 +39,34 @@ export function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
/**
 * Runs `operation` and rejects if it has not settled within `timeoutMs`.
 *
 * When the deadline passes, the optional `onTimeout` hook is invoked (callers
 * use it to tear down the stalled work) and the returned promise then rejects
 * with `"<operationName> timed out after <timeoutMs>ms"`.
 *
 * Two guarantees hold once the deadline has passed:
 *  - The timeout error always wins. Tearing down the stalled work usually
 *    makes the operation itself reject (for example because its resources
 *    were closed); that late settlement is ignored so callers see the timeout
 *    rather than a misleading teardown error.
 *  - The wait for `onTimeout` is capped at `cleanupTimeoutMs`, so a hook that
 *    hangs cannot block the rejection forever.
 *
 * @param {string} operationName Label used in the timeout error message.
 * @param {Function} operation Zero-argument function that starts the work; it
 *   may return a promise or a plain value.
 * @param {object} [options]
 * @param {number} [options.timeoutMs] Deadline in milliseconds; defaults to
 *   the module-level `OPERATION_TIMEOUT_MS`.
 * @param {Function} [options.onTimeout] Cleanup hook run once on timeout; its
 *   errors are ignored.
 * @param {number} [options.cleanupTimeoutMs=5000] Maximum time to wait for
 *   `onTimeout` before rejecting anyway.
 * @returns {Promise<*>} The value produced by `operation`.
 * @throws {Error} The timeout error, or whatever `operation` throws/rejects
 *   with before the deadline.
 */
export async function runWithOperationTimeout(
  operationName,
  operation,
  { timeoutMs = OPERATION_TIMEOUT_MS, onTimeout, cleanupTimeoutMs = 5000 } = {}
) {
  let timer;
  let cleanupTimer;
  let timedOut = false;

  // Runs the hook and waits for it, but never longer than cleanupTimeoutMs.
  const runBoundedCleanup = async () => {
    try {
      await Promise.race([
        // The async wrapper calls the hook immediately and converts a
        // synchronous throw into a rejection handled by the catch below.
        (async () => onTimeout?.())(),
        new Promise((resolve) => {
          cleanupTimer = setTimeout(resolve, cleanupTimeoutMs);
        }),
      ]);
    } catch {
      // Ignore cleanup errors; the timeout is the primary failure.
    } finally {
      clearTimeout(cleanupTimer);
    }
  };

  try {
    // A promise that never settles: used to park the operation's outcome once
    // the deadline has passed so it cannot pre-empt the timeout rejection.
    const parked = new Promise(() => {});
    const work = Promise.resolve(operation()).then(
      (value) => (timedOut ? parked : value),
      (error) => {
        if (timedOut) {
          return parked;
        }
        throw error;
      }
    );

    return await Promise.race([
      work,
      new Promise((_, reject) => {
        timer = setTimeout(async () => {
          timedOut = true;
          await runBoundedCleanup();
          reject(new Error(`${operationName} timed out after ${timeoutMs}ms`));
        }, timeoutMs);
      }),
    ]);
  } finally {
    // clearTimeout ignores undefined handles, so no guards are needed.
    clearTimeout(timer);
    clearTimeout(cleanupTimer);
  }
}
export async function loadCloakBrowser() {
try {
return await import("cloakbrowser");
@@ -289,4 +318,3 @@ export function buildResult({
notes,
};
}

View File

@@ -1,7 +1,7 @@
import test from "node:test";
import assert from "node:assert/strict";
import { normalizeImageCandidates } from "./real-estate-photo-common.js";
import { normalizeImageCandidates, runWithOperationTimeout } from "./real-estate-photo-common.js";
test("normalizeImageCandidates keeps distinct Zillow photo URLs and strips query strings", () => {
const result = normalizeImageCandidates(
@@ -64,3 +64,24 @@ test("normalizeImageCandidates filters tiny HAR page assets and keeps large phot
"https://photos.har.com/123/main.jpg",
]);
});
// A never-settling operation must be rejected with a timeout error, and the
// onTimeout hook must have run by the time that rejection is observed.
test("runWithOperationTimeout rejects stalled work and runs timeout cleanup", async () => {
  const cleanup = { ran: false };
  const neverSettles = () => new Promise(() => {});

  const stalledRun = runWithOperationTimeout("HAR photo extraction", neverSettles, {
    timeoutMs: 20,
    onTimeout: async () => {
      cleanup.ran = true;
    },
  });

  await assert.rejects(stalledRun, /timed out/i);
  assert.equal(cleanup.ran, true);
});

View File

@@ -7,6 +7,7 @@ import {
dismissCommonOverlays,
fail,
gotoListing,
runWithOperationTimeout,
sleep,
} from "./real-estate-photo-common.js";
import {
@@ -68,63 +69,69 @@ export async function discoverZillowListing(rawAddress) {
const identity = parseAddressIdentity(address);
const searchUrl = `https://www.zillow.com/homes/${encodeURIComponent(buildZillowAddressSlug(address))}_rb/`;
const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
const closeContext = async () => {
await context.close().catch(() => {});
};
try {
const attempts = [`Opened Zillow address search URL: ${searchUrl}`];
await gotoListing(page, searchUrl, 2500);
await dismissCommonOverlays(page);
await sleep(1500);
return await runWithOperationTimeout(
"Zillow discovery",
async () => {
const attempts = [`Opened Zillow address search URL: ${searchUrl}`];
await gotoListing(page, searchUrl, 2500);
await dismissCommonOverlays(page);
await sleep(1500);
let listingUrl = null;
if (page.url().includes("/homedetails/")) {
const directScore = scoreAddressCandidate(
identity,
`${page.url()} ${(await page.title()) || ""}`
);
if (directScore.matched) {
listingUrl = normalizeListingUrl(page.url());
attempts.push("Zillow search URL resolved directly to a matching property page.");
} else {
attempts.push("Zillow redirected to a property page, but it did not match the requested address closely enough.");
}
} else {
const discovered = await collectListingUrl(page);
const scored = discovered
.map((candidate) => ({
...candidate,
match: scoreAddressCandidate(
let listingUrl = null;
if (page.url().includes("/homedetails/")) {
const directScore = scoreAddressCandidate(
identity,
`${candidate.url} ${candidate.text} ${candidate.aria} ${candidate.title} ${candidate.parentText}`
)
}))
.sort((a, b) => b.match.score - a.match.score);
`${page.url()} ${(await page.title()) || ""}`
);
if (directScore.matched) {
listingUrl = normalizeListingUrl(page.url());
attempts.push("Zillow search URL resolved directly to a matching property page.");
} else {
attempts.push("Zillow redirected to a property page, but it did not match the requested address closely enough.");
}
} else {
const discovered = await collectListingUrl(page);
const scored = discovered
.map((candidate) => ({
...candidate,
match: scoreAddressCandidate(
identity,
`${candidate.url} ${candidate.text} ${candidate.aria} ${candidate.title} ${candidate.parentText}`
)
}))
.sort((a, b) => b.match.score - a.match.score);
if (scored[0]?.match.matched) {
listingUrl = normalizeListingUrl(scored[0].url);
attempts.push(`Zillow search results exposed a matching homedetails link with score ${scored[0].match.score}.`);
} else {
attempts.push("Zillow discovery did not expose a confident homedetails match for this address.");
if (scored[0]?.match.matched) {
listingUrl = normalizeListingUrl(scored[0].url);
attempts.push(`Zillow search results exposed a matching homedetails link with score ${scored[0].match.score}.`);
} else {
attempts.push("Zillow discovery did not expose a confident homedetails match for this address.");
}
}
return {
source: "zillow",
address,
searchUrl,
finalUrl: page.url(),
title: await page.title(),
listingUrl,
attempts,
};
},
{
onTimeout: closeContext
}
}
const result = {
source: "zillow",
address,
searchUrl,
finalUrl: page.url(),
title: await page.title(),
listingUrl,
attempts,
};
await context.close();
return result;
);
} catch (error) {
try {
await context.close();
} catch {
// Ignore close errors after the primary failure.
}
throw new Error(`Zillow discovery failed: ${error instanceof Error ? error.message : String(error)}`);
} finally {
await closeContext();
}
}

View File

@@ -10,6 +10,7 @@ import {
gotoListing,
normalizeImageCandidates,
parseTarget,
runWithOperationTimeout,
scrollUntilSettled,
sleep,
waitForPhotoExperience,
@@ -107,77 +108,82 @@ async function collectZillowStructuredPhotoCandidates(page) {
/**
 * Opens a Zillow listing in a browser session and collects Zillow-hosted
 * photo URLs from both structured and rendered candidates.
 *
 * The all-photos click path is optional here: if it throws, the error message
 * is kept in `clickError` and extraction continues with whatever the page
 * already exposes.
 *
 * NOTE(review): this listing appears to come from a rendered commit diff with
 * the +/- markers stripped, so statements of the previous inline
 * implementation and of the new `runWithOperationTimeout`-wrapped
 * implementation both appear below. Read duplicated statements as
 * "before"/"after" pairs rather than as code that runs twice.
 *
 * @param rawUrl Listing target; it is passed through `parseTarget` before
 *   navigation (accepted forms are defined there — confirm against that helper).
 * @returns An object with `source: "zillow"`, `requestedUrl`, `finalUrl`,
 *   `title`, `clickedLabel`, `expectedPhotoCount`, `complete`, `photoCount`,
 *   `imageUrls` and `notes`.
 * @throws {Error} Any failure is re-thrown as a new `Error` carrying only the
 *   original message (or `String(error)` for non-Error values).
 */
export async function extractZillowPhotos(rawUrl) {
const requestedUrl = parseTarget(rawUrl);
// The session is headless unless the HEADLESS env var is exactly "false".
const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
// Best-effort close: a rejection from context.close() is swallowed so cleanup
// never masks the primary result or error.
const closeContext = async () => {
await context.close().catch(() => {});
};
try {
await gotoListing(page, requestedUrl);
await dismissCommonOverlays(page);
// The whole browser interaction runs under the shared operation timeout;
// on timeout the context is closed via the onTimeout hook passed below.
return await runWithOperationTimeout(
"Zillow photo extraction",
async () => {
await gotoListing(page, requestedUrl);
await dismissCommonOverlays(page);
const expectedPhotoCount = await getAnnouncedPhotoCount(page);
const beforeUrl = page.url();
let clickedLabel = null;
let clickError = null;
const expectedPhotoCount = await getAnnouncedPhotoCount(page);
const beforeUrl = page.url();
let clickedLabel = null;
let clickError = null;
try {
clickedLabel = await clickPhotoEntryPoint(page, ZILLOW_LABELS);
await waitForPhotoExperience(page, beforeUrl);
await scrollUntilSettled(page);
await sleep(1200);
} catch (error) {
clickError = error instanceof Error ? error.message : String(error);
}
// A failed click path is recorded, not fatal: candidates are still collected below.
try {
clickedLabel = await clickPhotoEntryPoint(page, ZILLOW_LABELS);
await waitForPhotoExperience(page, beforeUrl);
await scrollUntilSettled(page);
await sleep(1200);
} catch (error) {
clickError = error instanceof Error ? error.message : String(error);
}
const [structuredCandidates, renderedCandidates] = await Promise.all([
collectZillowStructuredPhotoCandidates(page),
collectZillowPhotoCandidates(page),
]);
const candidates = [...structuredCandidates, ...renderedCandidates];
const normalized = normalizeImageCandidates(candidates, {
hostIncludes: ["photos.zillowstatic.com"],
minWidth: 240,
minHeight: 180,
});
const photos = collapseZillowPhotos(normalized);
// Both candidate sources are gathered concurrently; structured candidates
// are placed ahead of rendered ones in the merged list.
const [structuredCandidates, renderedCandidates] = await Promise.all([
collectZillowStructuredPhotoCandidates(page),
collectZillowPhotoCandidates(page),
]);
const candidates = [...structuredCandidates, ...renderedCandidates];
const normalized = normalizeImageCandidates(candidates, {
hostIncludes: ["photos.zillowstatic.com"],
minWidth: 240,
minHeight: 180,
});
const photos = collapseZillowPhotos(normalized);
if (!photos.length) {
fail(
"Zillow photo extraction failed.",
clickError || "No Zillow image URLs were found on the rendered listing page."
);
}
// When nothing was found, the recorded click error (if any) is reported as the detail.
if (!photos.length) {
fail(
"Zillow photo extraction failed.",
clickError || "No Zillow image URLs were found on the rendered listing page."
);
}
const complete = expectedPhotoCount ? photos.length >= expectedPhotoCount : true;
const notes = [];
if (clickedLabel) {
notes.push("Opened Zillow all-photos flow and extracted direct Zillow image URLs.");
} else {
notes.push("The rendered Zillow listing shell already exposed the Zillow photo stream, so extraction completed without relying on the all-photos click path.");
}
if (clickError) {
notes.push(`All-photos click path was not required: ${clickError}`);
}
// With no announced count (falsy), the extraction is reported as complete.
const complete = expectedPhotoCount ? photos.length >= expectedPhotoCount : true;
const notes = [];
if (clickedLabel) {
notes.push("Opened Zillow all-photos flow and extracted direct Zillow image URLs.");
} else {
notes.push("The rendered Zillow listing shell already exposed the Zillow photo stream, so extraction completed without relying on the all-photos click path.");
}
if (clickError) {
notes.push(`All-photos click path was not required: ${clickError}`);
}
const result = {
source: "zillow",
requestedUrl,
finalUrl: page.url(),
title: await page.title(),
clickedLabel,
expectedPhotoCount,
complete,
photoCount: photos.length,
imageUrls: photos.map((photo) => photo.url),
notes,
};
await context.close();
return result;
return {
source: "zillow",
requestedUrl,
finalUrl: page.url(),
title: await page.title(),
clickedLabel,
expectedPhotoCount,
complete,
photoCount: photos.length,
imageUrls: photos.map((photo) => photo.url),
notes,
};
},
{
onTimeout: closeContext
}
);
} catch (error) {
try {
await context.close();
} catch {
// Ignore close errors after the primary failure.
}
// Only the message is preserved; the original stack and error type are dropped.
throw new Error(error instanceof Error ? error.message : String(error));
} finally {
// Runs on success, failure and timeout alike.
await closeContext();
}
}