Make listing discovery unit-aware

This commit is contained in:
2026-03-27 23:11:10 -05:00
parent 301986fb25
commit f8c998d579
9 changed files with 259 additions and 59 deletions

View File

@@ -6,6 +6,8 @@ Decision-grade residential property assessment skill for OpenClaw, with official
`property-assessor` is for evaluating a condo, townhouse, house, or similar residential property from an address or listing URL and ending with a practical recommendation such as `buy`, `pass`, or `only below X`.
If the subject property has an apartment / unit / suite number, include it. Discovery is now unit-aware for Zillow and HAR when unit data is present, while still supporting plain single-family addresses that have no unit.
The skill is intended to:
- normalize the property across listing sources

View File

@@ -134,6 +134,7 @@ npx tsx flow.ts --instruction 'go to https://search.fiorinis.com then type "pipp
## Real-estate listing discovery and photo extraction
Use the dedicated Zillow and HAR discovery/photo commands before trying a free-form gallery flow.
Discovery is unit-aware when the address includes an apartment / unit / suite identifier, and still supports plain no-unit addresses for single-family homes.
### Zillow discovery

View File

@@ -17,6 +17,9 @@ Accept any of:
The assessment purpose is required for a decision-grade result.
If the user does not say why they want the property assessed, stop and ask before finalizing the analysis.
If the property has a unit / apartment / suite number, include it.
Do not drop the unit when discovering listing sources. Unit-qualified condo/townhome addresses must be matched as the exact unit, while single-family addresses with no unit should still work normally.
## Core workflow
1. Normalize the address and property type.

View File

@@ -143,7 +143,10 @@ Use the dedicated extractors before trying a free-form gallery flow.
The discovery scripts are purpose-built for the common address-to-listing workflow:
- open the site search or address URL
- keep apartment / unit identifiers when the address includes them
- resolve or identify a matching listing page when possible
- reject a mismatched unit when the requested address includes one
- still work normally for single-family / no-unit addresses
- return the direct listing URL as JSON
The photo scripts are purpose-built for the common `See all photos` / `Show all photos` workflow:

View File

@@ -7,28 +7,12 @@ import {
gotoListing,
sleep,
} from "./real-estate-photo-common.js";
/**
 * Coerce the raw CLI argument to a trimmed address string.
 *
 * A blank result is reported through `fail("Missing address.")`.
 * NOTE(review): `fail` is defined outside this view; presumably it aborts the run.
 *
 * @param {unknown} rawAddress - Value taken from the command line.
 * @returns {string} The trimmed address.
 */
function parseAddress(rawAddress) {
  const trimmed = String(rawAddress || "").trim();
  if (trimmed.length === 0) {
    fail("Missing address.");
  }
  return trimmed;
}
import { parseAddressIdentity, scoreAddressCandidate } from "./real-estate-address.js";
/**
 * Build the HAR site-search URL for an address.
 *
 * @param {string} address - Free-form address text; percent-encoded into `q`.
 * @returns {string} Absolute search URL.
 */
function buildSearchUrl(address) {
  const query = encodeURIComponent(address);
  return `https://www.har.com/search/?q=${query}`;
}
/**
 * Split an address into lowercase alphanumeric tokens, dropping the
 * abbreviations listed in `ignoredTokens` (street suffixes and "tx").
 *
 * @param {string} address - Free-form address text.
 * @returns {string[]} Tokens in their original order.
 */
function buildAddressTokens(address) {
  // Built once per call; the previous version rebuilt this Set for every token.
  const ignoredTokens = new Set(["tx", "dr", "st", "rd", "ave", "blvd", "ct", "ln", "cir"]);
  return address
    .toLowerCase()
    .replace(/[^a-z0-9\s]/g, " ")
    .split(/\s+/)
    .filter((token) => token !== "" && !ignoredTokens.has(token));
}
function normalizeListingUrl(url) {
try {
const parsed = new URL(url);
@@ -74,7 +58,8 @@ async function collectListingUrl(page) {
}
async function main() {
const address = parseAddress(process.argv[2]);
const address = String(process.argv[2] || "").trim();
const identity = parseAddressIdentity(address);
const searchUrl = buildSearchUrl(address);
const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
@@ -85,26 +70,32 @@ async function main() {
await sleep(1500);
let listingUrl = null;
const addressTokens = buildAddressTokens(address);
if (page.url().includes("/homedetail/")) {
listingUrl = normalizeListingUrl(page.url());
attempts.push("HAR search URL resolved directly to a property page.");
const directScore = scoreAddressCandidate(
identity,
`${page.url()} ${(await page.title()) || ""}`
);
if (directScore.matched) {
listingUrl = normalizeListingUrl(page.url());
attempts.push("HAR search URL resolved directly to a matching property page.");
} else {
attempts.push("HAR redirected to a property page, but it did not match the requested address closely enough.");
}
} else {
const discovered = await collectListingUrl(page);
const scored = discovered
.map((candidate) => {
const haystack = `${candidate.url} ${candidate.text} ${candidate.parentText}`.toLowerCase();
const score = addressTokens.reduce(
(total, token) => total + (haystack.includes(token) ? 1 : 0),
0
const match = scoreAddressCandidate(
identity,
`${candidate.url} ${candidate.text} ${candidate.parentText}`
);
return { ...candidate, score };
return { ...candidate, match };
})
.sort((a, b) => b.score - a.score);
.sort((a, b) => b.match.score - a.match.score);
if (scored[0] && scored[0].score >= Math.min(3, addressTokens.length)) {
if (scored[0]?.match.matched) {
listingUrl = normalizeListingUrl(scored[0].url);
attempts.push(`HAR search results exposed a matching homedetail link with score ${scored[0].score}.`);
attempts.push(`HAR search results exposed a matching homedetail link with score ${scored[0].match.score}.`);
} else {
attempts.push("HAR discovery did not expose a confident homedetail match for this address.");
}

View File

@@ -10,7 +10,7 @@
"har-photos": "node har-photos.js",
"browse": "tsx browse.ts",
"scrape": "tsx scrape.ts",
"test:photos": "node --test real-estate-photo-common.test.mjs zillow-photo-data.test.mjs",
"test:photos": "node --test real-estate-address.test.mjs real-estate-photo-common.test.mjs zillow-photo-data.test.mjs",
"zillow-discover": "node zillow-discover.js",
"zillow-photos": "node zillow-photos.js",
"fetch-browser": "npx cloakbrowser install"

View File

@@ -0,0 +1,125 @@
#!/usr/bin/env node
/**
 * Tokens that parseAddressIdentity filters out of its street and locality
 * token lists: street-type words in short and long form, plus "tx".
 */
const STREET_STOP_WORDS = new Set([
  "st", "street",
  "rd", "road",
  "dr", "drive",
  "ave", "avenue",
  "blvd", "boulevard",
  "ln", "lane",
  "ct", "court",
  "cir", "circle",
  "way",
  "trl", "trail",
  "pkwy", "parkway",
  "tx",
]);

/** Regex source fragment: the labels that may introduce a unit value. */
const UNIT_LABEL_PATTERN = "(?:apt|apartment|unit|suite|ste|#)";
/**
 * Break free-form text into lowercase tokens.
 *
 * Characters other than a-z, 0-9, "#", whitespace and "-" are turned into
 * spaces; the text is then split on runs of whitespace and hyphens.
 *
 * @param {unknown} value - Text to tokenize; falsy values yield no tokens.
 * @returns {string[]} Non-empty tokens in order of appearance.
 */
function tokenize(value) {
  const lowered = String(value || "").toLowerCase();
  const cleaned = lowered.replace(/[^a-z0-9#\s-]/g, " ");
  const tokens = [];
  for (const piece of cleaned.split(/[\s-]+/)) {
    const token = piece.trim();
    if (token) {
      tokens.push(token);
    }
  }
  return tokens;
}
/**
 * Reduce every run of whitespace to one space and strip both ends.
 *
 * @param {unknown} value - Text to normalize; falsy values become "".
 * @returns {string} The normalized text.
 */
function collapseWhitespace(value) {
  const text = String(value || "");
  const singleSpaced = text.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
/**
 * Backslash-escape regular-expression metacharacters so the text can be
 * embedded in a `new RegExp(...)` source as a literal.
 *
 * @param {string} value - Literal text.
 * @returns {string} The text with each metacharacter prefixed by "\".
 */
function escapeRegex(value) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metacharacters, "\\$&");
}
/**
 * Parse a free-form address into the pieces used for listing matching.
 *
 * The text before the first comma is the street part; the remaining comma
 * segments form the locality. A unit is recognised in two places:
 *   - at the end of the street part: "4141 Whiteley Dr Apt 204"
 *   - as its own segment right after the street: "4141 Whiteley Dr, Apt 204, ..."
 * The unit label may be followed by a period ("Apt. 204").
 *
 * @param {unknown} rawAddress - Address text.
 * @returns {{
 *   raw: string,
 *   streetPart: string,
 *   streetWithoutUnit: string,
 *   locality: string,
 *   streetTokens: string[],
 *   localityTokens: string[],
 *   streetNumber: string | null,
 *   unitValue: string | null,
 *   hasUnit: boolean
 * }} Parsed identity; `unitValue` is lowercased, `streetNumber` is the first street token.
 * @throws {Error} "Missing address." when the input is blank.
 */
export function parseAddressIdentity(rawAddress) {
  const address = collapseWhitespace(rawAddress);
  if (!address) {
    throw new Error("Missing address.");
  }

  const [streetSegment, ...otherSegments] = address
    .split(",")
    .map((segment) => collapseWhitespace(segment));

  // Unit label with an optional trailing period, captured so "#" can be told apart.
  const unitLabel = `((?:${UNIT_LABEL_PATTERN})\\.?)`;

  let streetPart = streetSegment;
  let streetWithoutUnit = streetSegment;
  let localitySegments = otherSegments;
  let unitValue = null;

  const trailingUnit = streetSegment.match(
    new RegExp(`^(.*?)\\s+${unitLabel}\\s*([a-z0-9-]+)$`, "i")
  );
  if (trailingUnit) {
    streetWithoutUnit = collapseWhitespace(trailingUnit[1]);
    unitValue = trailingUnit[3].toLowerCase();
  } else if (otherSegments.length > 0) {
    const standaloneUnit = otherSegments[0].match(
      new RegExp(`^${unitLabel}(\\s*)([a-z0-9-]+)$`, "i")
    );
    if (standaloneUnit) {
      const [, label, gap, value] = standaloneUnit;
      // Guard against locality words that merely start with a label
      // ("Unity", "Ste Genevieve"): accept values containing a digit, or
      // one/two-letter values that are clearly separated from the label.
      const isSeparated = gap.length > 0 || label.startsWith("#");
      if (/\d/.test(value) || (isSeparated && value.length <= 2)) {
        unitValue = value.toLowerCase();
        streetPart = collapseWhitespace(`${streetSegment} ${otherSegments[0]}`);
        localitySegments = otherSegments.slice(1);
      }
    }
  }

  const locality = collapseWhitespace(localitySegments.join(" "));
  const isNotStopWord = (token) => !STREET_STOP_WORDS.has(token);
  const streetTokens = tokenize(streetWithoutUnit).filter(isNotStopWord);
  const localityTokens = tokenize(locality).filter(isNotStopWord);

  return {
    raw: address,
    streetPart,
    streetWithoutUnit,
    locality,
    streetTokens,
    localityTokens,
    streetNumber: streetTokens[0] || null,
    unitValue,
    hasUnit: Boolean(unitValue),
  };
}
/**
 * Turn an address into a hyphen-joined slug: whitespace is collapsed, commas
 * and "#" are removed, and the remaining words are joined with "-".
 * Letter case and unit labels such as "Apt" are left as written.
 *
 * @param {unknown} rawAddress - Address text.
 * @returns {string} The slug ("" for blank input).
 */
export function buildZillowAddressSlug(rawAddress) {
  const collapsed = collapseWhitespace(rawAddress);
  const stripped = collapsed.replace(/,/g, "").replace(/#/g, "");
  const words = stripped.trim().split(/\s+/);
  return words.join("-");
}
/**
 * Score how well a candidate listing's text matches a parsed address.
 *
 * Tokens are compared as whole words after punctuation in the candidate is
 * replaced by spaces. A candidate is `matched` only when all of these hold:
 *   - enough distinct address tokens appear (3 with a unit, 2 without,
 *     capped at the number of tokens available);
 *   - the street number appears, when the identity has one containing a digit.
 *     Without this, any listing sharing city and zip reached the threshold;
 *   - the unit appears, when the identity has one. A unit counts as present
 *     if it follows a unit label, follows "#", or occurs as a bare token
 *     anywhere in the candidate text.
 *
 * @param {{
 *   streetTokens: string[],
 *   localityTokens: string[],
 *   streetNumber?: string | null,
 *   unitValue?: string | null,
 *   hasUnit?: boolean
 * }} identity - Result of parseAddressIdentity.
 * @param {unknown} candidateText - URL and/or visible text describing the candidate.
 * @returns {{
 *   matched: boolean,
 *   score: number,
 *   matchedTokens: string[],
 *   unitMatched: boolean,
 *   streetNumberMatched: boolean
 * }} `score` is the matched-token count, plus 2 when a requested unit matched.
 */
export function scoreAddressCandidate(identity, candidateText) {
  const normalized = collapseWhitespace(candidateText).toLowerCase();
  const normalizedPadded = ` ${normalized.replace(/[^a-z0-9#]+/g, " ")} `;
  const hasToken = (token) => normalizedPadded.includes(` ${token} `);

  const uniqueTokens = [...new Set([...identity.streetTokens, ...identity.localityTokens])];
  const matchedTokens = uniqueTokens.filter(hasToken);

  // Only enforce the leading street token when it looks like a house number.
  const { streetNumber } = identity;
  const requiresStreetNumber = Boolean(streetNumber) && /\d/.test(streetNumber);
  const streetNumberMatched = !requiresStreetNumber || hasToken(streetNumber);

  let unitMatched = true;
  if (identity.hasUnit && identity.unitValue) {
    const unit = escapeRegex(identity.unitValue);
    const unitRegex = new RegExp(`(?:^|\\s)(?:${UNIT_LABEL_PATTERN})\\s*${unit}(?:\\s|$)`, "i");
    const hashRegex = new RegExp(`#\\s*${unit}(?:\\b|$)`, "i");
    const looseTokenRegex = new RegExp(`(?:^|\\s)${unit}(?:\\s|$)`, "i");
    unitMatched =
      unitRegex.test(normalized) ||
      hashRegex.test(normalized) ||
      looseTokenRegex.test(normalizedPadded);
  }

  const minimumCoreMatches = identity.hasUnit ? 3 : 2;
  const hasEnoughTokens =
    matchedTokens.length >= Math.min(minimumCoreMatches, uniqueTokens.length);
  const matched = hasEnoughTokens && streetNumberMatched && unitMatched;

  return {
    matched,
    score: matchedTokens.length + (unitMatched && identity.hasUnit ? 2 : 0),
    matchedTokens,
    unitMatched,
    streetNumberMatched,
  };
}

View File

@@ -0,0 +1,59 @@
import test from "node:test";
import assert from "node:assert/strict";
import {
buildZillowAddressSlug,
parseAddressIdentity,
scoreAddressCandidate,
} from "./real-estate-address.js";
// An "Apt 204" suffix on the street part is split off into unitValue, and the
// street tokens keep only the number and street name ("Dr" is not among them).
test("parseAddressIdentity detects unit numbers when present", () => {
const identity = parseAddressIdentity("4141 Whiteley Dr Apt 204, Corpus Christi, TX 78418");
assert.equal(identity.streetWithoutUnit, "4141 Whiteley Dr");
assert.equal(identity.unitValue, "204");
assert.equal(identity.hasUnit, true);
assert.deepEqual(identity.streetTokens, ["4141", "whiteley"]);
});
// An address without a unit label yields unitValue null / hasUnit false, and
// the directional "E" stays in the street tokens while "Ct" does not.
test("parseAddressIdentity supports plain single-family style addresses with no unit", () => {
const identity = parseAddressIdentity("1201 E Iberian Ct, Granbury, TX 76048");
assert.equal(identity.unitValue, null);
assert.equal(identity.hasUnit, false);
assert.deepEqual(identity.streetTokens, ["1201", "e", "iberian"]);
});
// The slug drops commas and joins words with "-", keeping "Apt 204" and the
// original letter case.
test("buildZillowAddressSlug keeps unit identifiers in the slug", () => {
assert.equal(
buildZillowAddressSlug("4141 Whiteley Dr Apt 204, Corpus Christi, TX 78418"),
"4141-Whiteley-Dr-Apt-204-Corpus-Christi-TX-78418"
);
});
// Two Zillow-style URLs that differ only in the unit (204 vs 305): the one with
// the requested unit must match, the other must be rejected on the unit check.
test("scoreAddressCandidate rejects a wrong unit on the same street", () => {
const identity = parseAddressIdentity("4141 Whiteley Dr Apt 204, Corpus Christi, TX 78418");
const good = scoreAddressCandidate(
identity,
"https://www.zillow.com/homedetails/4141-Whiteley-Dr-Apt-204-Corpus-Christi-TX-78418/123_zpid/"
);
const bad = scoreAddressCandidate(
identity,
"https://www.zillow.com/homedetails/4141-Whiteley-Dr-Apt-305-Corpus-Christi-TX-78418/456_zpid/"
);
assert.equal(good.matched, true);
assert.equal(good.unitMatched, true);
assert.equal(bad.matched, false);
assert.equal(bad.unitMatched, false);
});
// A no-unit address must still match listing text that repeats the address
// followed by unrelated words.
test("scoreAddressCandidate still matches plain addresses with no unit", () => {
const identity = parseAddressIdentity("1201 E Iberian Ct, Granbury, TX 76048");
const result = scoreAddressCandidate(
identity,
"1201 E Iberian Ct Granbury, TX 76048 For Sale, Residential"
);
assert.equal(result.matched, true);
});

View File

@@ -7,24 +7,11 @@ import {
gotoListing,
sleep,
} from "./real-estate-photo-common.js";
/**
 * Normalize the address argument: stringify, trim, and report a blank value
 * via `fail("Missing address.")`.
 * NOTE(review): `fail` is not defined in this view; confirm that it stops execution.
 *
 * @param {unknown} rawAddress - Raw command-line value.
 * @returns {string} Trimmed address text.
 */
function parseAddress(rawAddress) {
  const text = String(rawAddress || "");
  const address = text.trim();
  if (address === "") {
    fail("Missing address.");
  }
  return address;
}
/**
 * Build the Zillow `_rb` search URL for an address. Commas and "#" are
 * removed, the remaining words are hyphen-joined, and the slug is
 * percent-encoded.
 *
 * @param {string} address - Address text.
 * @returns {string} Absolute search URL.
 */
function buildSearchUrl(address) {
  const withoutPunctuation = address.replace(/,/g, "").replace(/#/g, "");
  const words = withoutPunctuation.trim().split(/\s+/);
  const slug = words.join("-");
  return `https://www.zillow.com/homes/${encodeURIComponent(slug)}_rb/`;
}
import {
buildZillowAddressSlug,
parseAddressIdentity,
scoreAddressCandidate,
} from "./real-estate-address.js";
function normalizeListingUrl(url) {
try {
@@ -53,20 +40,31 @@ async function collectListingUrl(page) {
if (!href) continue;
const absolute = toAbsolute(href);
if (!absolute) continue;
candidates.push(absolute);
const text = (anchor.textContent || "").replace(/\s+/g, " ").trim();
const aria = anchor.getAttribute("aria-label") || "";
const title = anchor.getAttribute("title") || "";
const parentText = (anchor.parentElement?.textContent || "").replace(/\s+/g, " ").trim();
candidates.push({
url: absolute,
text,
aria,
title,
parentText,
});
}
const unique = [];
for (const candidate of candidates) {
if (!unique.includes(candidate)) unique.push(candidate);
if (!unique.some((item) => item.url === candidate.url)) unique.push(candidate);
}
return unique[0] || null;
return unique;
});
}
async function main() {
const address = parseAddress(process.argv[2]);
const searchUrl = buildSearchUrl(address);
const address = String(process.argv[2] || "").trim();
const identity = parseAddressIdentity(address);
const searchUrl = `https://www.zillow.com/homes/${encodeURIComponent(buildZillowAddressSlug(address))}_rb/`;
const { context, page } = await createPageSession({ headless: process.env.HEADLESS !== "false" });
try {
@@ -77,15 +75,33 @@ async function main() {
let listingUrl = null;
if (page.url().includes("/homedetails/")) {
listingUrl = normalizeListingUrl(page.url());
attempts.push("Zillow search URL resolved directly to a property page.");
const directScore = scoreAddressCandidate(
identity,
`${page.url()} ${(await page.title()) || ""}`
);
if (directScore.matched) {
listingUrl = normalizeListingUrl(page.url());
attempts.push("Zillow search URL resolved directly to a matching property page.");
} else {
attempts.push("Zillow redirected to a property page, but it did not match the requested address closely enough.");
}
} else {
const discovered = await collectListingUrl(page);
if (discovered) {
listingUrl = normalizeListingUrl(discovered);
attempts.push("Zillow search results exposed a homedetails link.");
const scored = discovered
.map((candidate) => ({
...candidate,
match: scoreAddressCandidate(
identity,
`${candidate.url} ${candidate.text} ${candidate.aria} ${candidate.title} ${candidate.parentText}`
)
}))
.sort((a, b) => b.match.score - a.match.score);
if (scored[0]?.match.matched) {
listingUrl = normalizeListingUrl(scored[0].url);
attempts.push(`Zillow search results exposed a matching homedetails link with score ${scored[0].match.score}.`);
} else {
attempts.push("Zillow discovery did not expose a homedetails link for this address.");
attempts.push("Zillow discovery did not expose a confident homedetails match for this address.");
}
}