Make listing discovery unit-aware

This commit is contained in:
2026-03-27 23:11:10 -05:00
parent 301986fb25
commit f8c998d579
9 changed files with 259 additions and 59 deletions

View File

@@ -6,6 +6,8 @@ Decision-grade residential property assessment skill for OpenClaw, with official
`property-assessor` is for evaluating a condo, townhouse, house, or similar residential property from an address or listing URL and ending with a practical recommendation such as `buy`, `pass`, or `only below X`.
If the subject property has an apartment / unit / suite number, include it. Discovery is now unit-aware for Zillow and HAR when unit data is present, while still supporting plain single-family addresses that have no unit.
The skill is intended to:
- normalize the property across listing sources

View File

@@ -134,6 +134,7 @@ npx tsx flow.ts --instruction 'go to https://search.fiorinis.com then type "pipp
## Real-estate listing discovery and photo extraction
Use the dedicated Zillow and HAR discovery/photo commands before trying a free-form gallery flow.
Discovery is unit-aware when the address includes an apartment / unit / suite identifier, and still supports plain no-unit addresses for single-family homes.
### Zillow discovery

View File

@@ -17,6 +17,9 @@ Accept any of:
The assessment purpose is required for a decision-grade result.
If the user does not say why they want the property assessed, stop and ask before finalizing the analysis.
If the property has a unit / apartment / suite number, include it.
Do not drop the unit when discovering listing sources. Unit-qualified condo/townhome addresses must be matched as the exact unit, while single-family addresses with no unit should still work normally.
## Core workflow
1. Normalize the address and property type.

View File

@@ -143,7 +143,10 @@ Use the dedicated extractors before trying a free-form gallery flow.
The discovery scripts are purpose-built for the common address-to-listing workflow:
- open the site search or address URL
- keep apartment / unit identifiers when the address includes them
- resolve or identify a matching listing page when possible
- reject a mismatched unit when the requested address includes one
- still work normally for single-family / no-unit addresses
- return the direct listing URL as JSON
The photo scripts are purpose-built for the common `See all photos` / `Show all photos` workflow:

View File

@@ -7,28 +7,12 @@ import {
gotoListing, gotoListing,
sleep, sleep,
} from "./real-estate-photo-common.js"; } from "./real-estate-photo-common.js";
import { parseAddressIdentity, scoreAddressCandidate } from "./real-estate-address.js";
function parseAddress(rawAddress) {
const address = String(rawAddress || "").trim();
if (!address) {
fail("Missing address.");
}
return address;
}
function buildSearchUrl(address) { function buildSearchUrl(address) {
return `https://www.har.com/search/?q=${encodeURIComponent(address)}`; return `https://www.har.com/search/?q=${encodeURIComponent(address)}`;
} }
function buildAddressTokens(address) {
return address
.toLowerCase()
.replace(/[^a-z0-9\s]/g, " ")
.split(/\s+/)
.filter(Boolean)
.filter((token) => !new Set(["tx", "dr", "st", "rd", "ave", "blvd", "ct", "ln", "cir"]).has(token));
}
function normalizeListingUrl(url) { function normalizeListingUrl(url) {
try { try {
const parsed = new URL(url); const parsed = new URL(url);
@@ -74,7 +58,8 @@ async function collectListingUrl(page) {
} }
async function main() { async function main() {
const address = parseAddress(process.argv[2]); const address = String(process.argv[2] || "").trim();
const identity = parseAddressIdentity(address);
const searchUrl = buildSearchUrl(address); const searchUrl = buildSearchUrl(address);
const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" }); const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
@@ -85,26 +70,32 @@ async function main() {
await sleep(1500); await sleep(1500);
let listingUrl = null; let listingUrl = null;
const addressTokens = buildAddressTokens(address);
if (page.url().includes("/homedetail/")) { if (page.url().includes("/homedetail/")) {
const directScore = scoreAddressCandidate(
identity,
`${page.url()} ${(await page.title()) || ""}`
);
if (directScore.matched) {
listingUrl = normalizeListingUrl(page.url()); listingUrl = normalizeListingUrl(page.url());
attempts.push("HAR search URL resolved directly to a property page."); attempts.push("HAR search URL resolved directly to a matching property page.");
} else {
attempts.push("HAR redirected to a property page, but it did not match the requested address closely enough.");
}
} else { } else {
const discovered = await collectListingUrl(page); const discovered = await collectListingUrl(page);
const scored = discovered const scored = discovered
.map((candidate) => { .map((candidate) => {
const haystack = `${candidate.url} ${candidate.text} ${candidate.parentText}`.toLowerCase(); const match = scoreAddressCandidate(
const score = addressTokens.reduce( identity,
(total, token) => total + (haystack.includes(token) ? 1 : 0), `${candidate.url} ${candidate.text} ${candidate.parentText}`
0
); );
return { ...candidate, score }; return { ...candidate, match };
}) })
.sort((a, b) => b.score - a.score); .sort((a, b) => b.match.score - a.match.score);
if (scored[0] && scored[0].score >= Math.min(3, addressTokens.length)) { if (scored[0]?.match.matched) {
listingUrl = normalizeListingUrl(scored[0].url); listingUrl = normalizeListingUrl(scored[0].url);
attempts.push(`HAR search results exposed a matching homedetail link with score ${scored[0].score}.`); attempts.push(`HAR search results exposed a matching homedetail link with score ${scored[0].match.score}.`);
} else { } else {
attempts.push("HAR discovery did not expose a confident homedetail match for this address."); attempts.push("HAR discovery did not expose a confident homedetail match for this address.");
} }

View File

@@ -10,7 +10,7 @@
"har-photos": "node har-photos.js", "har-photos": "node har-photos.js",
"browse": "tsx browse.ts", "browse": "tsx browse.ts",
"scrape": "tsx scrape.ts", "scrape": "tsx scrape.ts",
"test:photos": "node --test real-estate-photo-common.test.mjs zillow-photo-data.test.mjs", "test:photos": "node --test real-estate-address.test.mjs real-estate-photo-common.test.mjs zillow-photo-data.test.mjs",
"zillow-discover": "node zillow-discover.js", "zillow-discover": "node zillow-discover.js",
"zillow-photos": "node zillow-photos.js", "zillow-photos": "node zillow-photos.js",
"fetch-browser": "npx cloakbrowser install" "fetch-browser": "npx cloakbrowser install"

View File

@@ -0,0 +1,125 @@
#!/usr/bin/env node
// Tokens that are too generic to help tell two addresses apart. Each row pairs
// an abbreviation with its spelled-out form where one exists; "tx" is
// presumably the Texas state abbreviation (confirm before reusing elsewhere).
// The address parser in this module filters these out of its token lists.
const STREET_STOP_WORDS = new Set(
  [
    ["st", "street"],
    ["rd", "road"],
    ["dr", "drive"],
    ["ave", "avenue"],
    ["blvd", "boulevard"],
    ["ln", "lane"],
    ["ct", "court"],
    ["cir", "circle"],
    ["way"],
    ["trl", "trail"],
    ["pkwy", "parkway"],
    ["tx"],
  ].flat()
);

// Regex source (not a RegExp) listing the labels that introduce a unit
// identifier; it is interpolated into larger patterns by the functions below.
const UNIT_LABEL_PATTERN = "(?:apt|apartment|unit|suite|ste|#)";
/**
 * Break free-form text into lowercase comparison tokens.
 *
 * Everything except ASCII letters, digits, "#", whitespace and "-" is turned
 * into a space; the text is then split on runs of whitespace and/or hyphens.
 * Nullish or otherwise falsy input yields an empty array.
 *
 * @param {*} value - Text to tokenize (coerced with String()).
 * @returns {string[]} Non-empty lowercase tokens in their original order.
 */
function tokenize(value) {
  const lowered = String(value || "").toLowerCase();
  const cleaned = lowered.replace(/[^a-z0-9#\s-]/g, " ");
  const tokens = [];
  for (const piece of cleaned.split(/[\s-]+/)) {
    const token = piece.trim();
    if (token) {
      tokens.push(token);
    }
  }
  return tokens;
}
/**
 * Squash every run of whitespace into a single space and trim both ends.
 *
 * @param {*} value - Input text; falsy values are treated as the empty string.
 * @returns {string} The normalised text.
 */
function collapseWhitespace(value) {
  const text = String(value || "");
  const singleSpaced = text.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
/**
 * Backslash-escape regular-expression metacharacters so the text can be
 * embedded literally inside a pattern built with `new RegExp(...)`.
 *
 * @param {string} value - Literal text (must be a string; not coerced).
 * @returns {string} The text with each metacharacter preceded by a backslash.
 */
function escapeRegex(value) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  const escaped = value.replace(metacharacters, "\\$&");
  return escaped;
}
/**
 * Parse a free-form address into the pieces used to match listing candidates.
 *
 * The text before the first comma is the street part. A unit is recognised
 * when the street part ends with a unit label and a value ("Apt 204",
 * "Apt. 204", "Apt204", "#204", "# B"), or when the segment right after the
 * street part consists solely of such a unit ("123 Main St, Apt 4, Houston");
 * in the latter case that segment is folded into the street part instead of
 * the locality.
 *
 * A word label must be followed by whitespace (optionally after a period) or
 * sit directly against a digit, so a street name that merely starts with a
 * label ("123 Stella", "9 Unity") is not mistaken for a unit.
 *
 * @param {*} rawAddress - Address text, e.g. "4141 Whiteley Dr Apt 204, Corpus Christi, TX 78418".
 * @returns {{
 *   raw: string,
 *   streetPart: string,
 *   streetWithoutUnit: string,
 *   locality: string,
 *   streetTokens: string[],
 *   localityTokens: string[],
 *   streetNumber: (string|null),
 *   unitValue: (string|null),
 *   hasUnit: boolean
 * }} Normalised identity; `unitValue` is lowercased, token arrays exclude stop words,
 *   and `streetNumber` is simply the first street token (or null when there is none).
 * @throws {Error} "Missing address." when the input is empty after whitespace normalisation.
 */
export function parseAddressIdentity(rawAddress) {
  const address = collapseWhitespace(rawAddress);
  if (!address) {
    throw new Error("Missing address.");
  }

  // Label + separator that may precede a unit value. Word labels need
  // whitespace or an adjacent digit after them; "#" may touch any value.
  const unitLead = `(?:(?:${UNIT_LABEL_PATTERN})\\.?(?:\\s+|(?=\\d))|#\\s*)`;
  const unitValuePattern = "([a-z0-9-]+)";
  const trailingUnitRegex = new RegExp(`^(.*?)\\s+${unitLead}${unitValuePattern}$`, "i");
  const standaloneUnitRegex = new RegExp(`^${unitLead}${unitValuePattern}$`, "i");

  const [firstSegment, ...otherSegments] = address
    .split(",")
    .map((segment) => collapseWhitespace(segment));

  // "123 Main St, Apt 4, City": the unit sits in its own comma segment. Keep it
  // with the street so it is not silently treated as part of the locality.
  let streetPart = firstSegment;
  if (
    otherSegments.length > 0 &&
    !trailingUnitRegex.test(streetPart) &&
    standaloneUnitRegex.test(otherSegments[0])
  ) {
    streetPart = collapseWhitespace(`${streetPart} ${otherSegments.shift()}`);
  }
  const locality = collapseWhitespace(otherSegments.join(" "));

  const unitMatch = streetPart.match(trailingUnitRegex);
  const streetWithoutUnit = collapseWhitespace(unitMatch ? unitMatch[1] : streetPart);
  const unitValue = unitMatch ? unitMatch[2].toLowerCase() : null;

  const isMeaningful = (token) => !STREET_STOP_WORDS.has(token);
  const streetTokens = tokenize(streetWithoutUnit).filter(isMeaningful);
  const localityTokens = tokenize(locality).filter(isMeaningful);

  return {
    raw: address,
    streetPart,
    streetWithoutUnit,
    locality,
    streetTokens,
    localityTokens,
    streetNumber: streetTokens[0] || null,
    unitValue,
    hasUnit: Boolean(unitValue)
  };
}
/**
 * Build the hyphen-separated slug used in a Zillow address search URL.
 *
 * Commas and "#" are treated as word separators rather than simply deleted,
 * so input without a space after the punctuation ("Dr,Corpus Christi",
 * "Dr#204") no longer glues neighbouring words together. Case is preserved
 * and unit identifiers stay in the slug.
 *
 * @param {*} rawAddress - Address text; falsy values produce an empty slug.
 * @returns {string} The address words joined with "-".
 */
export function buildZillowAddressSlug(rawAddress) {
  return String(rawAddress || "")
    .replace(/[,#]/g, " ")
    .trim()
    .split(/\s+/)
    .join("-");
}
/**
 * Score how well a candidate's text (URL, link text, title, ...) matches a
 * parsed address identity.
 *
 * A candidate is `matched` only when all of the following hold:
 *  - enough distinct street/locality tokens appear as whole tokens
 *    (3 for unit addresses, 2 otherwise, capped at the number available);
 *  - the street number appears as a whole token, when the identity has a
 *    numeric one, so a different house on the same street is rejected;
 *  - the requested unit, if any, is present.
 *
 * Unit detection accepts a labelled unit ("Apt 204", "apt-204", "Apt. 204"),
 * a hash form ("#204"), or a bare token ("...-dr-204-..."). The bare form is
 * ignored when the unit value is identical to one of the address's own tokens
 * (e.g. unit "204" at street number 204, or unit "E" on "E Iberian Ct"),
 * because that token would be present for every unit at the address.
 *
 * @param {object} identity - Result of `parseAddressIdentity`.
 * @param {*} candidateText - Free-form text describing the candidate.
 * @returns {{matched: boolean, score: number, matchedTokens: string[], unitMatched: boolean}}
 *   `score` is the matched-token count plus 2 when a requested unit matched.
 */
export function scoreAddressCandidate(identity, candidateText) {
  const normalized = collapseWhitespace(candidateText).toLowerCase();
  // Punctuation-free, space-padded form so whole tokens can be located with " token ".
  const normalizedPadded = ` ${normalized.replace(/[^a-z0-9#]+/g, " ")} `;
  const hasToken = (token) => normalizedPadded.includes(` ${token} `);

  const uniqueTokens = Array.from(
    new Set([...identity.streetTokens, ...identity.localityTokens])
  );
  const matchedTokens = uniqueTokens.filter(hasToken);

  // Only enforce the house number when it looks like one; identities without a
  // leading number keep the token-count behaviour.
  const { streetNumber } = identity;
  const requiresStreetNumber = Boolean(streetNumber) && /^\d/.test(streetNumber);
  const streetNumberMatched = !requiresStreetNumber || hasToken(streetNumber);

  let unitMatched = true;
  if (identity.hasUnit && identity.unitValue) {
    const unitValue = String(identity.unitValue).toLowerCase();
    const unitLiteral = escapeRegex(unitValue);
    // Same normalisation as the padded text, so "12-b" can match "...-12-b-..." in a URL.
    const unitSpaced = unitValue.replace(/[^a-z0-9#]+/g, " ").trim();

    const labelledRegex = (unitSource) =>
      new RegExp(
        `(?:^|\\s)(?:${UNIT_LABEL_PATTERN})\\.?\\s*${unitSource}(?:\\s|$)`,
        "i"
      );
    const hashRegex = new RegExp(`#\\s*${unitLiteral}(?:\\b|$)`, "i");

    const labelledMatch =
      labelledRegex(unitLiteral).test(normalized) ||
      (unitSpaced !== "" && labelledRegex(escapeRegex(unitSpaced)).test(normalizedPadded));
    const hashMatch = hashRegex.test(normalized);
    // A bare token proves nothing when the address itself already contains it.
    const looseAllowed = unitSpaced !== "" && !uniqueTokens.includes(unitValue);
    const looseMatch = looseAllowed && hasToken(unitSpaced);

    unitMatched = labelledMatch || hashMatch || looseMatch;
  }

  const minimumCoreMatches = identity.hasUnit ? 3 : 2;
  const enoughTokens =
    matchedTokens.length >= Math.min(minimumCoreMatches, uniqueTokens.length);
  const matched = enoughTokens && streetNumberMatched && unitMatched;

  return {
    matched,
    score: matchedTokens.length + (unitMatched && identity.hasUnit ? 2 : 0),
    matchedTokens,
    unitMatched
  };
}

View File

@@ -0,0 +1,59 @@
import test from "node:test";
import assert from "node:assert/strict";
import {
buildZillowAddressSlug,
parseAddressIdentity,
scoreAddressCandidate,
} from "./real-estate-address.js";
// Unit-qualified addresses: the unit is split off the street and reported separately.
test("parseAddressIdentity detects unit numbers when present", () => {
  const { streetWithoutUnit, unitValue, hasUnit, streetTokens } = parseAddressIdentity(
    "4141 Whiteley Dr Apt 204, Corpus Christi, TX 78418"
  );

  assert.equal(streetWithoutUnit, "4141 Whiteley Dr");
  assert.equal(unitValue, "204");
  assert.equal(hasUnit, true);
  assert.deepEqual(streetTokens, ["4141", "whiteley"]);
});

// Addresses without a unit must parse cleanly and report no unit.
test("parseAddressIdentity supports plain single-family style addresses with no unit", () => {
  const { unitValue, hasUnit, streetTokens } = parseAddressIdentity(
    "1201 E Iberian Ct, Granbury, TX 76048"
  );

  assert.equal(unitValue, null);
  assert.equal(hasUnit, false);
  assert.deepEqual(streetTokens, ["1201", "e", "iberian"]);
});

// The Zillow slug keeps the unit label and value.
test("buildZillowAddressSlug keeps unit identifiers in the slug", () => {
  const slug = buildZillowAddressSlug("4141 Whiteley Dr Apt 204, Corpus Christi, TX 78418");

  assert.equal(slug, "4141-Whiteley-Dr-Apt-204-Corpus-Christi-TX-78418");
});

// Same building, different unit: only the requested unit may match.
test("scoreAddressCandidate rejects a wrong unit on the same street", () => {
  const requested = parseAddressIdentity("4141 Whiteley Dr Apt 204, Corpus Christi, TX 78418");
  const [sameUnit, otherUnit] = [
    "https://www.zillow.com/homedetails/4141-Whiteley-Dr-Apt-204-Corpus-Christi-TX-78418/123_zpid/",
    "https://www.zillow.com/homedetails/4141-Whiteley-Dr-Apt-305-Corpus-Christi-TX-78418/456_zpid/",
  ].map((url) => scoreAddressCandidate(requested, url));

  assert.equal(sameUnit.matched, true);
  assert.equal(sameUnit.unitMatched, true);
  assert.equal(otherUnit.matched, false);
  assert.equal(otherUnit.unitMatched, false);
});

// No-unit addresses are matched on their street/locality tokens alone.
test("scoreAddressCandidate still matches plain addresses with no unit", () => {
  const requested = parseAddressIdentity("1201 E Iberian Ct, Granbury, TX 76048");
  const { matched } = scoreAddressCandidate(
    requested,
    "1201 E Iberian Ct Granbury, TX 76048 For Sale, Residential"
  );

  assert.equal(matched, true);
});

View File

@@ -7,24 +7,11 @@ import {
gotoListing, gotoListing,
sleep, sleep,
} from "./real-estate-photo-common.js"; } from "./real-estate-photo-common.js";
import {
function parseAddress(rawAddress) { buildZillowAddressSlug,
const address = String(rawAddress || "").trim(); parseAddressIdentity,
if (!address) { scoreAddressCandidate,
fail("Missing address."); } from "./real-estate-address.js";
}
return address;
}
function buildSearchUrl(address) {
const slug = address
.replace(/,/g, "")
.replace(/#/g, "")
.trim()
.split(/\s+/)
.join("-");
return `https://www.zillow.com/homes/${encodeURIComponent(slug)}_rb/`;
}
function normalizeListingUrl(url) { function normalizeListingUrl(url) {
try { try {
@@ -53,20 +40,31 @@ async function collectListingUrl(page) {
if (!href) continue; if (!href) continue;
const absolute = toAbsolute(href); const absolute = toAbsolute(href);
if (!absolute) continue; if (!absolute) continue;
candidates.push(absolute); const text = (anchor.textContent || "").replace(/\s+/g, " ").trim();
const aria = anchor.getAttribute("aria-label") || "";
const title = anchor.getAttribute("title") || "";
const parentText = (anchor.parentElement?.textContent || "").replace(/\s+/g, " ").trim();
candidates.push({
url: absolute,
text,
aria,
title,
parentText,
});
} }
const unique = []; const unique = [];
for (const candidate of candidates) { for (const candidate of candidates) {
if (!unique.includes(candidate)) unique.push(candidate); if (!unique.some((item) => item.url === candidate.url)) unique.push(candidate);
} }
return unique[0] || null; return unique;
}); });
} }
async function main() { async function main() {
const address = parseAddress(process.argv[2]); const address = String(process.argv[2] || "").trim();
const searchUrl = buildSearchUrl(address); const identity = parseAddressIdentity(address);
const searchUrl = `https://www.zillow.com/homes/${encodeURIComponent(buildZillowAddressSlug(address))}_rb/`;
const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" }); const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
try { try {
@@ -77,15 +75,33 @@ async function main() {
let listingUrl = null; let listingUrl = null;
if (page.url().includes("/homedetails/")) { if (page.url().includes("/homedetails/")) {
const directScore = scoreAddressCandidate(
identity,
`${page.url()} ${(await page.title()) || ""}`
);
if (directScore.matched) {
listingUrl = normalizeListingUrl(page.url()); listingUrl = normalizeListingUrl(page.url());
attempts.push("Zillow search URL resolved directly to a property page."); attempts.push("Zillow search URL resolved directly to a matching property page.");
} else {
attempts.push("Zillow redirected to a property page, but it did not match the requested address closely enough.");
}
} else { } else {
const discovered = await collectListingUrl(page); const discovered = await collectListingUrl(page);
if (discovered) { const scored = discovered
listingUrl = normalizeListingUrl(discovered); .map((candidate) => ({
attempts.push("Zillow search results exposed a homedetails link."); ...candidate,
match: scoreAddressCandidate(
identity,
`${candidate.url} ${candidate.text} ${candidate.aria} ${candidate.title} ${candidate.parentText}`
)
}))
.sort((a, b) => b.match.score - a.match.score);
if (scored[0]?.match.matched) {
listingUrl = normalizeListingUrl(scored[0].url);
attempts.push(`Zillow search results exposed a matching homedetails link with score ${scored[0].match.score}.`);
} else { } else {
attempts.push("Zillow discovery did not expose a homedetails link for this address."); attempts.push("Zillow discovery did not expose a confident homedetails match for this address.");
} }
} }