Make listing discovery unit-aware

This commit is contained in:
2026-03-27 23:11:10 -05:00
parent 301986fb25
commit f8c998d579
9 changed files with 259 additions and 59 deletions

View File

@@ -0,0 +1,125 @@
#!/usr/bin/env node
// Tokens that are filtered out of the parsed street and locality token lists.
// Each row pairs a street-suffix abbreviation with its long form; "way" and the
// state token "tx" have no counterpart in the list.
const STREET_STOP_WORDS = new Set([
  "st", "street",
  "rd", "road",
  "dr", "drive",
  "ave", "avenue",
  "blvd", "boulevard",
  "ln", "lane",
  "ct", "court",
  "cir", "circle",
  "way",
  "trl", "trail",
  "pkwy", "parkway",
  "tx"
]);
// Regex source (a non-capturing alternation) of the unit designators that are
// interpolated into the unit-matching regular expressions in this module.
const UNIT_LABEL_PATTERN = `(?:${["apt", "apartment", "unit", "suite", "ste", "#"].join("|")})`;
/**
 * Lower-cases a value and breaks it into word tokens.
 *
 * Every character other than a-z, 0-9, "#", whitespace and "-" is treated as a
 * separator; the text is then cut on runs of whitespace and hyphens.
 *
 * @param {*} value - Value to tokenize; falsy values produce no tokens.
 * @returns {string[]} The non-empty tokens, in their original order.
 */
function tokenize(value) {
  const text = value ? String(value) : "";
  const cleaned = text.toLowerCase().replace(/[^a-z0-9#\s-]/g, " ");
  // A token is a maximal run of characters that are neither whitespace nor "-".
  return cleaned.match(/[^\s-]+/g) || [];
}
/**
 * Converts a value to a string with leading/trailing whitespace removed and
 * every internal whitespace run reduced to a single space.
 *
 * @param {*} value - Value to normalise; falsy values become "".
 * @returns {string} The whitespace-normalised text.
 */
function collapseWhitespace(value) {
  const text = value ? String(value) : "";
  return text.trim().split(/\s+/).join(" ");
}
/**
 * Backslash-escapes the regular-expression metacharacters in a string so it can
 * be embedded in a RegExp source and matched literally.
 *
 * @param {string} value - Text to escape.
 * @returns {string} The text with . * + ? ^ $ { } ( ) | [ ] \ each prefixed by "\".
 */
function escapeRegex(value) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metaCharacters, (character) => `\\${character}`);
}
/**
 * Splits a free-form address into the pieces used to recognise it in listing text.
 *
 * The text before the first comma is the street part; the remaining comma
 * segments form the locality. A trailing unit designator on the street part
 * ("Apt 4", "Apt. 4B", "Apt #4", "Unit12", "#B") is separated from the street.
 * A unit written as its own comma segment right after the street
 * ("123 Main St, Unit 12, Austin") is folded into the street part instead of
 * being left in the locality.
 *
 * @param {string} rawAddress - Address text, e.g. "123 Main St Apt 4, Austin, TX".
 * @returns {{
 *   raw: string,
 *   streetPart: string,
 *   streetWithoutUnit: string,
 *   locality: string,
 *   streetTokens: string[],
 *   localityTokens: string[],
 *   streetNumber: (string|null),
 *   unitValue: (string|null),
 *   hasUnit: boolean
 * }} The parsed identity; token lists exclude STREET_STOP_WORDS, `streetNumber`
 *   is simply the first street token, and `unitValue` is lower-cased.
 * @throws {Error} "Missing address." when the input is empty after trimming.
 */
export function parseAddressIdentity(rawAddress) {
  const address = collapseWhitespace(rawAddress);
  if (!address) {
    throw new Error("Missing address.");
  }

  // One unit designator followed by its captured value. A word label must be
  // separated from the value by whitespace and/or "#", or be followed directly
  // by a digit, so that street names which merely start with a label
  // ("Unity", "Apartment") are not read as label + value. "#" may touch any value.
  const unitSource =
    `(?:#\\s*|(?:${UNIT_LABEL_PATTERN})\\.?(?:\\s*#\\s*|\\s+|(?=\\d)))([a-z0-9-]+)`;
  const trailingUnitRegex = new RegExp(`^(.*?)\\s+${unitSource}$`, "i");
  const unitOnlyRegex = new RegExp(`^${unitSource}$`, "i");

  const [firstSegment, ...otherSegments] = address
    .split(",")
    .map((segment) => collapseWhitespace(segment));

  let streetPart = firstSegment;
  let localitySegments = otherSegments;
  if (streetPart && otherSegments.length > 0 && !trailingUnitRegex.test(streetPart)) {
    const segmentMatch = otherSegments[0].match(unitOnlyRegex);
    const segmentValue = segmentMatch ? segmentMatch[1] : "";
    // Only fold values that look like unit identifiers (contain a digit or are a
    // single letter) so a locality such as "Ste. Genevieve" stays in the locality.
    if (/\d/.test(segmentValue) || segmentValue.length === 1) {
      streetPart = `${streetPart} ${otherSegments[0]}`;
      localitySegments = otherSegments.slice(1);
    }
  }

  const locality = collapseWhitespace(localitySegments.join(" "));
  const unitMatch = streetPart.match(trailingUnitRegex);
  const streetWithoutUnit = collapseWhitespace(unitMatch ? unitMatch[1] : streetPart);
  const unitValue = unitMatch ? unitMatch[2].toLowerCase() : null;

  const dropStopWords = (tokens) => tokens.filter((token) => !STREET_STOP_WORDS.has(token));
  const streetTokens = dropStopWords(tokenize(streetWithoutUnit));
  const localityTokens = dropStopWords(tokenize(locality));

  return {
    raw: address,
    streetPart,
    streetWithoutUnit,
    locality,
    streetTokens,
    localityTokens,
    streetNumber: streetTokens[0] || null,
    unitValue,
    hasUnit: Boolean(unitValue)
  };
}
/**
 * Turns an address into a hyphen-separated slug: commas and "#" characters are
 * dropped and the remaining words are joined with "-". Letter case and any
 * other punctuation are left untouched.
 *
 * @param {string} rawAddress - Address text; falsy input yields "".
 * @returns {string} The hyphen-joined slug.
 */
export function buildZillowAddressSlug(rawAddress) {
  const withoutSeparators = collapseWhitespace(rawAddress).replace(/[,#]/g, "");
  // Removing characters can leave doubled or edge spaces, so trim and split on runs.
  const words = withoutSeparators.trim().split(/\s+/);
  return words.join("-");
}
/**
 * Scores how well a piece of candidate text matches a parsed address identity.
 *
 * Street and locality tokens are looked up as whole words in the candidate.
 * When the identity has a unit, the candidate must also mention that unit:
 * after a unit label ("Apt 4", "Apt. 4-B", "Unit #4"), after "#", or as a bare
 * token that appears more often than the address tokens alone explain. The
 * last rule stops the street number (or another address token equal to the
 * unit value) from standing in for the unit.
 *
 * NOTE(review): no particular token (e.g. the street number) is required to
 * match; only the number of matching tokens is checked.
 *
 * @param {{streetTokens: string[], localityTokens: string[], hasUnit: boolean,
 *   unitValue: (string|null)}} identity - Parsed address identity.
 * @param {string} candidateText - Listing text to test against the identity.
 * @returns {{matched: boolean, score: number, matchedTokens: string[],
 *   unitMatched: boolean}} `matched` needs at least 2 distinct tokens (3 when the
 *   identity has a unit, fewer if the identity has fewer tokens, but never zero)
 *   plus a unit match; `score` is the matched-token count, plus 2 for a matched unit.
 */
export function scoreAddressCandidate(identity, candidateText) {
  const normalized = collapseWhitespace(candidateText).toLowerCase();
  const normalizedPadded = ` ${normalized.replace(/[^a-z0-9#]+/g, " ")} `;
  const allTokens = [...identity.streetTokens, ...identity.localityTokens];
  const uniqueTokens = Array.from(new Set(allTokens));
  const matchedTokens = uniqueTokens.filter((token) =>
    normalizedPadded.includes(` ${token} `)
  );

  let unitMatched = true;
  if (identity.hasUnit && identity.unitValue) {
    const unitValue = String(identity.unitValue).toLowerCase();
    const unitSource = escapeRegex(unitValue);
    // Label, optional period, optional "#", then the unit value. The value must not
    // run on into more unit characters, but may be followed by punctuation.
    const labelledRegex = new RegExp(
      `(?:^|[^a-z0-9])(?:${UNIT_LABEL_PATTERN})\\.?\\s*#?\\s*${unitSource}(?![a-z0-9-])`,
      "i"
    );
    const hashRegex = new RegExp(`#\\s*${unitSource}(?:\\b|$)`, "i");
    // A bare token counts as the unit only if the candidate contains it more often
    // than the identity's own street/locality tokens do.
    const occurrences = (tokens) => tokens.filter((token) => token === unitValue).length;
    const looseMatched = occurrences(normalizedPadded.split(" ")) > occurrences(allTokens);
    unitMatched =
      labelledRegex.test(normalized) || hashRegex.test(normalized) || looseMatched;
  }

  const minimumCoreMatches = identity.hasUnit ? 3 : 2;
  // An identity without any tokens cannot identify a listing, so it never matches.
  const enoughTokens =
    uniqueTokens.length > 0 &&
    matchedTokens.length >= Math.min(minimumCoreMatches, uniqueTokens.length);
  const matched = enoughTokens && unitMatched;

  return {
    matched,
    score: matchedTokens.length + (unitMatched && identity.hasUnit ? 2 : 0),
    matchedTokens,
    unitMatched
  };
}