Fix slower Zillow unit photo discovery path

This commit is contained in:
2026-03-28 02:28:30 -05:00
parent 7690dc259b
commit 8fe451e8d0
11 changed files with 167 additions and 49 deletions

View File

@@ -95,7 +95,9 @@ Current behavior:
- keeps CAD-site selection address-driven and jurisdiction-specific; it does not hardcode one county's CAD as the global source
- when a supported official CAD detail host is found, captures direct property facts such as property ID/account, owner, legal description, assessed value, exemptions, and the official property-detail URL
- automatically tries to discover Zillow and HAR listing URLs from the address when no listing URL is provided
- starts Zillow and HAR listing discovery in parallel so HAR can already be in flight if Zillow misses or stalls
- runs Zillow photo extraction first, then HAR as fallback when available
- gives Zillow a longer source-specific discovery/photo window than the generic fallback path, because some exact-unit Zillow pages resolve more slowly than HAR or public-record lookups
- reuses the OpenClaw web-automation logic in-process instead of spawning nested helper commands
- fails fast when Zillow/HAR discovery or photo extraction stalls instead of hanging indefinitely
- returns a structured preliminary report payload

View File

@@ -155,6 +155,9 @@ What it does:
- returns the discovered listing URL as JSON
- fails fast with a timeout if the browser-backed discovery stalls
Operational note:
- when imported by `property-assessor`, Zillow discovery is allowed a longer source-specific timeout than the generic helper default, because some exact-unit Zillow pages resolve more slowly than the basic search/listing flow
### HAR discovery
```bash
@@ -183,6 +186,9 @@ What it does:
- returns direct `photos.zillowstatic.com` image URLs as JSON
- fails fast with a timeout if the browser-backed extraction stalls
Operational note:
- when imported by `property-assessor`, Zillow photo extraction is allowed a longer source-specific timeout than the generic helper default, because some exact-unit Zillow listings expose the correct photo set only after a slower render path
Expected success shape:
- `complete: true`
- `expectedPhotoCount` matches `photoCount`

View File

@@ -115,7 +115,9 @@ scripts/property-assessor render-report --input "<report-payload-json>" --output
- resolve official public-record jurisdiction automatically from the address
- keep CAD discovery jurisdiction-specific from the address; do not hardcode one county CAD for every property
- try to discover Zillow and HAR listing URLs from the address when no listing URL is provided
- start Zillow and HAR discovery in parallel, while still preferring Zillow first for the photo-review path
- run the approval-safe Zillow/HAR photo extractor chain automatically
- allow slower exact-unit Zillow pages a longer source-specific discovery/photo window before giving up and falling back
- build a purpose-aware report payload
- complete the analysis without requiring recipient email(s)
- only stop and ask for recipient email(s) when the user is explicitly rendering or sending the PDF

View File

@@ -10,6 +10,8 @@ export interface ListingDiscoveryResult {
/**
 * Optional overrides accepted by `discoverListingSources`: timeouts in
 * milliseconds and replacement discovery functions (the tests pass stubs here).
 */
interface ListingDiscoveryDeps {
/** Shared timeout; when set it also applies to any source whose own timeout below is omitted. */
timeoutMs?: number;
/** Timeout for Zillow discovery only; takes precedence over `timeoutMs`. */
zillowTimeoutMs?: number;
/** Timeout for HAR discovery only; takes precedence over `timeoutMs`. */
harTimeoutMs?: number;
/** Replacement for `discoverZillowListing`; the real function is used when omitted. */
discoverZillowListingFn?: typeof discoverZillowListing;
/** Replacement for `discoverHarListing`; the real function is used when omitted. */
discoverHarListingFn?: typeof discoverHarListing;
}
@@ -17,61 +19,82 @@ interface ListingDiscoveryDeps {
/**
 * Reads a millisecond timeout from the environment variable `name`.
 *
 * Returns `fallbackMs` when the variable is unset, blank, or does not parse to
 * a finite, non-negative number. Without this guard a malformed value such as
 * "20s" becomes `NaN` and is handed to the timeout wrapper as-is.
 *
 * @param name Environment variable to read.
 * @param fallbackMs Value used when the variable is missing or invalid.
 * @returns The parsed timeout, or `fallbackMs`.
 */
function readDiscoveryTimeoutMs(name: string, fallbackMs: number): number {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") {
    return fallbackMs;
  }
  const parsed = Number(raw);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallbackMs;
}

/** Generic discovery timeout, used when no source-specific value applies. */
const DEFAULT_DISCOVERY_TIMEOUT_MS = readDiscoveryTimeoutMs(
  "PROPERTY_ASSESSOR_DISCOVERY_TIMEOUT_MS",
  20_000
);
/** Zillow gets a longer default window than the generic discovery timeout. */
const DEFAULT_ZILLOW_DISCOVERY_TIMEOUT_MS = readDiscoveryTimeoutMs(
  "PROPERTY_ASSESSOR_ZILLOW_DISCOVERY_TIMEOUT_MS",
  60_000
);
/** HAR defaults to the generic discovery timeout unless its own variable is set. */
const DEFAULT_HAR_DISCOVERY_TIMEOUT_MS = readDiscoveryTimeoutMs(
  "PROPERTY_ASSESSOR_HAR_DISCOVERY_TIMEOUT_MS",
  DEFAULT_DISCOVERY_TIMEOUT_MS
);
/** Result of one source's discovery attempt, as produced by `runSource` in `discoverListingSources`. */
interface SourceDiscoveryOutcome {
/** Which listing site this outcome belongs to. */
source: "zillow" | "har";
/** Discovered listing URL, or `null` when nothing was found or the attempt timed out or failed. */
url: string | null;
/** Notes from the attempt; on timeout or error this holds a single message describing it. */
attempts: string[];
}
/**
 * Looks up Zillow and HAR listing URLs for `address`.
 *
 * Both lookups are started before either is awaited and are joined with
 * `Promise.all`, each under its own timeout. A timeout or thrown error from
 * one source becomes a message in `attempts` with a `null` URL for that
 * source instead of rejecting the returned promise.
 *
 * Timeout precedence per source: `deps.zillowTimeoutMs` / `deps.harTimeoutMs`,
 * then `deps.timeoutMs` when provided, then that source's module default.
 *
 * NOTE(review): this listing interleaves the removed sequential implementation
 * with the added parallel one (for example `attempts` is declared twice); the
 * description above follows the `runSource` / `Promise.all` lines. Confirm
 * against the checked-in file.
 *
 * @param address Address to search for on both sites.
 * @param deps Optional timeout overrides and replacement discovery functions.
 * @returns The discovered URLs (or `null`) and the combined attempt notes, Zillow's first.
 */
export async function discoverListingSources(
address: string,
deps: ListingDiscoveryDeps = {}
): Promise<ListingDiscoveryResult> {
const attempts: string[] = [];
let zillowUrl: string | null = null;
let harUrl: string | null = null;
const timeoutMs = deps.timeoutMs ?? DEFAULT_DISCOVERY_TIMEOUT_MS;
const zillowTimeoutMs =
deps.zillowTimeoutMs ??
(deps.timeoutMs != null ? timeoutMs : DEFAULT_ZILLOW_DISCOVERY_TIMEOUT_MS);
const harTimeoutMs =
deps.harTimeoutMs ??
(deps.timeoutMs != null ? timeoutMs : DEFAULT_HAR_DISCOVERY_TIMEOUT_MS);
const discoverZillowListingFn = deps.discoverZillowListingFn || discoverZillowListing;
const discoverHarListingFn = deps.discoverHarListingFn || discoverHarListing;
try {
const result = await withTimeout(
() => discoverZillowListingFn(address),
{
operationName: "Zillow discovery",
timeoutMs
const runSource = async (
source: "zillow" | "har",
timeoutForSourceMs: number,
operation: () => Promise<{ listingUrl: string | null; attempts: string[] }>
): Promise<SourceDiscoveryOutcome> => {
try {
const result = await withTimeout(operation, {
operationName: `${source === "zillow" ? "Zillow" : "HAR"} discovery`,
timeoutMs: timeoutForSourceMs
});
return {
source,
url: result.listingUrl,
attempts: result.attempts
};
} catch (error) {
if (error instanceof TimeoutError) {
return {
source,
url: null,
attempts: [
`${source === "zillow" ? "Zillow" : "HAR"} discovery timed out after ${timeoutForSourceMs}ms.`
]
};
}
);
zillowUrl = result.listingUrl;
attempts.push(...result.attempts);
} catch (error) {
if (error instanceof TimeoutError) {
attempts.push(`Zillow discovery timed out after ${timeoutMs}ms.`);
} else {
attempts.push(
`Zillow discovery failed: ${error instanceof Error ? error.message : String(error)}`
);
}
}
try {
const result = await withTimeout(
() => discoverHarListingFn(address),
{
operationName: "HAR discovery",
timeoutMs
}
);
harUrl = result.listingUrl;
attempts.push(...result.attempts);
} catch (error) {
if (error instanceof TimeoutError) {
attempts.push(`HAR discovery timed out after ${timeoutMs}ms.`);
} else {
attempts.push(
`HAR discovery failed: ${error instanceof Error ? error.message : String(error)}`
);
return {
source,
url: null,
attempts: [
`${source === "zillow" ? "Zillow" : "HAR"} discovery failed: ${error instanceof Error ? error.message : String(error)}`
]
};
}
}
};
const zillowPromise = runSource("zillow", zillowTimeoutMs, () =>
discoverZillowListingFn(address, { timeoutMs: zillowTimeoutMs })
);
const harPromise = runSource("har", harTimeoutMs, () =>
discoverHarListingFn(address, { timeoutMs: harTimeoutMs })
);
const [zillowResult, harResult] = await Promise.all([zillowPromise, harPromise]);
const attempts = [...zillowResult.attempts, ...harResult.attempts];
return {
attempts,
zillowUrl,
harUrl
zillowUrl: zillowResult.url,
harUrl: harResult.url
};
}

View File

@@ -22,6 +22,8 @@ export interface PhotoReviewResolution {
/**
 * Optional overrides accepted by `extractPhotoData`: timeouts in milliseconds
 * and replacement extractor functions (the tests pass stubs here).
 */
interface PhotoReviewDeps {
/** Shared timeout; when set it also applies to any source whose own timeout below is omitted. */
timeoutMs?: number;
/** Timeout for Zillow photo extraction only; takes precedence over `timeoutMs`. */
zillowTimeoutMs?: number;
/** Timeout for HAR photo extraction only; takes precedence over `timeoutMs`. */
harTimeoutMs?: number;
/** Replacement for `extractZillowPhotos`; the real function is used when omitted. */
extractZillowPhotosFn?: typeof extractZillowPhotos;
/** Replacement for `extractHarPhotos`; the real function is used when omitted. */
extractHarPhotosFn?: typeof extractHarPhotos;
}
@@ -29,6 +31,12 @@ interface PhotoReviewDeps {
/**
 * Reads a millisecond timeout from the environment variable `name`.
 *
 * Returns `fallbackMs` when the variable is unset, blank, or does not parse to
 * a finite, non-negative number. Without this guard a malformed value such as
 * "25s" becomes `NaN` and is handed to the timeout wrapper as-is.
 *
 * @param name Environment variable to read.
 * @param fallbackMs Value used when the variable is missing or invalid.
 * @returns The parsed timeout, or `fallbackMs`.
 */
function readPhotoTimeoutMs(name: string, fallbackMs: number): number {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") {
    return fallbackMs;
  }
  const parsed = Number(raw);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallbackMs;
}

/** Generic photo-extraction timeout, used when no source-specific value applies. */
const DEFAULT_PHOTO_EXTRACTION_TIMEOUT_MS = readPhotoTimeoutMs(
  "PROPERTY_ASSESSOR_PHOTO_TIMEOUT_MS",
  25_000
);
/** Zillow gets a longer default window than the generic photo timeout. */
const DEFAULT_ZILLOW_PHOTO_EXTRACTION_TIMEOUT_MS = readPhotoTimeoutMs(
  "PROPERTY_ASSESSOR_ZILLOW_PHOTO_TIMEOUT_MS",
  60_000
);
/** HAR defaults to the generic photo timeout unless its own variable is set. */
const DEFAULT_HAR_PHOTO_EXTRACTION_TIMEOUT_MS = readPhotoTimeoutMs(
  "PROPERTY_ASSESSOR_HAR_PHOTO_TIMEOUT_MS",
  DEFAULT_PHOTO_EXTRACTION_TIMEOUT_MS
);
export async function extractPhotoData(
source: PhotoSource,
@@ -36,15 +44,21 @@ export async function extractPhotoData(
deps: PhotoReviewDeps = {}
): Promise<PhotoExtractionResult> {
const timeoutMs = deps.timeoutMs ?? DEFAULT_PHOTO_EXTRACTION_TIMEOUT_MS;
const zillowTimeoutMs =
deps.zillowTimeoutMs ??
(deps.timeoutMs != null ? timeoutMs : DEFAULT_ZILLOW_PHOTO_EXTRACTION_TIMEOUT_MS);
const harTimeoutMs =
deps.harTimeoutMs ??
(deps.timeoutMs != null ? timeoutMs : DEFAULT_HAR_PHOTO_EXTRACTION_TIMEOUT_MS);
const extractZillowPhotosFn = deps.extractZillowPhotosFn || extractZillowPhotos;
const extractHarPhotosFn = deps.extractHarPhotosFn || extractHarPhotos;
if (source === "zillow") {
const payload = await withTimeout(
() => extractZillowPhotosFn(url),
() => extractZillowPhotosFn(url, { timeoutMs: zillowTimeoutMs }),
{
operationName: "Zillow photo extraction",
timeoutMs
timeoutMs: zillowTimeoutMs
}
);
return {
@@ -60,10 +74,10 @@ export async function extractPhotoData(
}
const payload = await withTimeout(
() => extractHarPhotosFn(url),
() => extractHarPhotosFn(url, { timeoutMs: harTimeoutMs }),
{
operationName: "HAR photo extraction",
timeoutMs
timeoutMs: harTimeoutMs
}
);
return {

View File

@@ -20,6 +20,71 @@ test("discoverListingSources times out stalled Zillow and HAR discovery calls",
assert.match(result.attempts.join(" "), /har discovery timed out/i);
});
// Verifies that HAR discovery is already running while Zillow is still pending:
// the Zillow stub takes 50ms, yet both "started" flags must be set after 10ms.
test("discoverListingSources starts Zillow and HAR discovery in parallel", async () => {
let zillowStarted = false;
let harStarted = false;
// Not awaited yet, so the flags can be inspected while discovery is in flight.
const discoveryPromise = discoverListingSources("1011 Ennis Joslin Rd APT 235, Corpus Christi, TX 78412", {
timeoutMs: 100,
// Slow Zillow stub that finds no listing.
discoverZillowListingFn: async () => {
zillowStarted = true;
await new Promise((resolve) => setTimeout(resolve, 50));
return {
source: "zillow",
address: "1011 Ennis Joslin Rd APT 235, Corpus Christi, TX 78412",
searchUrl: "https://www.zillow.com/example-search",
finalUrl: "https://www.zillow.com/example-search",
title: "Example Zillow Search",
listingUrl: null,
attempts: ["Zillow did not find a confident match."]
};
},
// HAR stub that returns a match without any delay.
discoverHarListingFn: async () => {
harStarted = true;
return {
source: "har",
address: "1011 Ennis Joslin Rd APT 235, Corpus Christi, TX 78412",
searchUrl: "https://www.har.com/example-search",
finalUrl: "https://www.har.com/example-search",
title: "Example HAR Search",
listingUrl: "https://www.har.com/homedetail/example/123",
attempts: ["HAR found a matching listing quickly."]
};
}
});
// 10ms is well inside the Zillow stub's 50ms delay; a sequential
// implementation would not have invoked the HAR stub by now.
await new Promise((resolve) => setTimeout(resolve, 10));
assert.equal(zillowStarted, true);
assert.equal(harStarted, true);
const result = await discoveryPromise;
assert.equal(result.harUrl, "https://www.har.com/homedetail/example/123");
});
// Verifies that `zillowTimeoutMs` wins over the shared `timeoutMs`: the stub
// takes 40ms, which exceeds the 20ms shared timeout but fits the 80ms Zillow one.
test("extractPhotoData honors a longer Zillow timeout override", async () => {
const result = await extractPhotoData("zillow", "https://www.zillow.com/example", {
timeoutMs: 20,
zillowTimeoutMs: 80,
// Stub extractor that resolves after 40ms with a single photo.
extractZillowPhotosFn: async () => {
await new Promise((resolve) => setTimeout(resolve, 40));
return {
source: "zillow",
requestedUrl: "https://www.zillow.com/example",
finalUrl: "https://www.zillow.com/example",
expectedPhotoCount: 1,
complete: true,
photoCount: 1,
imageUrls: ["https://photos.example/1.jpg"],
notes: ["Zillow extractor succeeded after a slow page load."]
};
}
});
// Reaching these assertions means the call was not cut off at 20ms.
assert.equal(result.source, "zillow");
assert.equal(result.photoCount, 1);
});
test("extractPhotoData times out a stalled photo extraction instead of hanging forever", async () => {
await assert.rejects(
async () =>

View File

@@ -153,6 +153,7 @@ The discovery scripts are purpose-built for the common address-to-listing workfl
- reject a mismatched unit when the requested address includes one
- still work normally for single-family / no-unit addresses
- return the direct listing URL as JSON
- accept a longer source-specific timeout when imported by a caller such as `property-assessor`, to accommodate slower exact-unit Zillow pages
The photo scripts are purpose-built for the common `See all photos` / `Show all photos` workflow:
- open the listing page
@@ -160,6 +161,7 @@ The photo scripts are purpose-built for the common `See all photos` / `Show all
- wait for the resulting photo page or scroller view
- extract direct image URLs from the rendered page
- fail fast with a timeout instead of hanging indefinitely when the browser-backed extraction stalls
- accept a longer source-specific timeout when imported by a caller such as `property-assessor`, to accommodate slower exact-unit Zillow renders
Output is JSON with:
- `requestedUrl`

View File

@@ -60,7 +60,7 @@ async function collectListingUrl(page) {
});
}
export async function discoverHarListing(rawAddress) {
export async function discoverHarListing(rawAddress, options = {}) {
const address = String(rawAddress || "").trim();
const identity = parseAddressIdentity(address);
const searchUrl = buildSearchUrl(address);
@@ -121,6 +121,7 @@ export async function discoverHarListing(rawAddress) {
};
},
{
timeoutMs: Number(options.timeoutMs || 0) || undefined,
onTimeout: closeContext
}
);

View File

@@ -32,7 +32,7 @@ async function getAnnouncedPhotoCount(page) {
});
}
export async function extractHarPhotos(rawUrl) {
export async function extractHarPhotos(rawUrl, options = {}) {
const requestedUrl = parseTarget(rawUrl);
const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
const closeContext = async () => {
@@ -78,6 +78,7 @@ export async function extractHarPhotos(rawUrl) {
};
},
{
timeoutMs: Number(options.timeoutMs || 0) || undefined,
onTimeout: closeContext
}
);

View File

@@ -64,7 +64,7 @@ async function collectListingUrl(page) {
});
}
export async function discoverZillowListing(rawAddress) {
export async function discoverZillowListing(rawAddress, options = {}) {
const address = String(rawAddress || "").trim();
const identity = parseAddressIdentity(address);
const searchUrl = `https://www.zillow.com/homes/${encodeURIComponent(buildZillowAddressSlug(address))}_rb/`;
@@ -125,6 +125,7 @@ export async function discoverZillowListing(rawAddress) {
};
},
{
timeoutMs: Number(options.timeoutMs || 0) || undefined,
onTimeout: closeContext
}
);

View File

@@ -105,7 +105,7 @@ async function collectZillowStructuredPhotoCandidates(page) {
return extractZillowStructuredPhotoCandidatesFromNextDataScript(scriptText || "");
}
export async function extractZillowPhotos(rawUrl) {
export async function extractZillowPhotos(rawUrl, options = {}) {
const requestedUrl = parseTarget(rawUrl);
const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
const closeContext = async () => {
@@ -177,6 +177,7 @@ export async function extractZillowPhotos(rawUrl) {
};
},
{
timeoutMs: Number(options.timeoutMs || 0) || undefined,
onTimeout: closeContext
}
);