Make listing discovery unit-aware
This commit is contained in:
125
skills/web-automation/scripts/real-estate-address.js
Normal file
125
skills/web-automation/scripts/real-estate-address.js
Normal file
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
 * Lowercase tokens that are discarded from both the street and the locality
 * token lists before candidates are compared.
 *
 * Contains common street-type suffixes in abbreviated and spelled-out form,
 * plus "tx" (presumably the Texas state abbreviation - confirm before adding
 * support for other states).
 */
const STREET_STOP_WORDS = new Set([
  "st",
  "street",
  "rd",
  "road",
  "dr",
  "drive",
  "ave",
  "avenue",
  "blvd",
  "boulevard",
  "ln",
  "lane",
  "ct",
  "court",
  "cir",
  "circle",
  "way",
  "trl",
  "trail",
  "pkwy",
  "parkway",
  "tx"
]);
|
||||
|
||||
// Regex source fragment (a non-capturing alternation) listing the labels that
// may introduce a unit designator. It is interpolated into larger patterns
// built with `new RegExp(...)`, so it is kept as a string rather than a RegExp.
const UNIT_LABEL_PATTERN = `(?:${["apt", "apartment", "unit", "suite", "ste", "#"].join("|")})`;
|
||||
|
||||
/**
 * Break free-form text into lowercase comparison tokens.
 *
 * A token is a maximal run of the characters a-z, 0-9 and "#". Whitespace,
 * hyphens and every other character act as separators, so "Main-St." yields
 * ["main", "st"] and "#4B" stays together as "#4b".
 *
 * @param {*} value Text to tokenize; any falsy value is treated as "".
 * @returns {string[]} Tokens in order of appearance (empty array when none).
 */
function tokenize(value) {
  // `match` with the g flag returns null when nothing matches, hence the fallback.
  return String(value || "").toLowerCase().match(/[a-z0-9#]+/g) || [];
}
|
||||
|
||||
/**
 * Normalize spacing: every run of whitespace becomes a single space and
 * leading/trailing whitespace is removed.
 *
 * @param {*} value Text to normalize; any falsy value is treated as "".
 * @returns {string} The normalized text (possibly "").
 */
function collapseWhitespace(value) {
  const text = String(value || "");
  return text.replace(/\s+/g, " ").trim();
}
|
||||
|
||||
/**
 * Backslash-escape the regular-expression metacharacters
 * . * + ? ^ $ { } ( ) | [ ] \ so the text can be interpolated into a
 * `new RegExp(...)` source and match literally.
 *
 * Hyphens are left alone, so the result is only safe outside a character class.
 *
 * @param {string} value Literal text to escape.
 * @returns {string} The escaped text.
 */
function escapeRegex(value) {
  // "$&" in the replacement re-inserts the matched metacharacter after the backslash.
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
|
||||
|
||||
/**
 * Split a free-form address into the pieces used to recognise listings for it.
 *
 * The text before the first comma is treated as the street part and everything
 * after it as the locality. A unit designator at the end of the street part
 * (e.g. "Apt 4B", "Apt. 4B", "Apt4B", "Suite 200", "#12") is peeled off and
 * reported separately.
 *
 * @param {string} rawAddress Address text, e.g. "123 Main St Apt 4B, Dallas, TX".
 * @returns {{
 *   raw: string,
 *   streetPart: string,
 *   streetWithoutUnit: string,
 *   locality: string,
 *   streetTokens: string[],
 *   localityTokens: string[],
 *   streetNumber: (string|null),
 *   unitValue: (string|null),
 *   hasUnit: boolean
 * }} `raw` is the whitespace-collapsed input; the token arrays are lowercase
 *   with STREET_STOP_WORDS removed; `streetNumber` is simply the first street
 *   token (or null); `unitValue` is the lowercased unit identifier (or null).
 * @throws {Error} "Missing address." when the input is empty after collapsing
 *   whitespace.
 */
export function parseAddressIdentity(rawAddress) {
  const address = collapseWhitespace(rawAddress);
  if (!address) {
    throw new Error("Missing address.");
  }

  const [streetPartRaw, ...restParts] = address.split(",");
  const streetPart = collapseWhitespace(streetPartRaw);
  const locality = collapseWhitespace(restParts.join(" "));

  // After the label, one of three separators must follow before the value:
  //   (?<=#)\s*  - the label was "#", which may touch the value ("#12", "# 12")
  //   \.?\s+     - whitespace, optionally preceded by a period ("Apt 4B", "Apt. 4B")
  //   (?=\d)     - a word label glued to a value that starts with a digit ("Apt4B")
  // Without this, a label could match as the prefix of an ordinary street word:
  // "12 Stevens" would parse as unit "vens" and "500 Unity" as unit "y".
  const unitPattern = new RegExp(
    `^(.*?)\\s+(?:${UNIT_LABEL_PATTERN})(?:(?<=#)\\s*|\\.?\\s+|(?=\\d))([a-z0-9-]+)$`,
    "i"
  );
  const unitMatch = streetPart.match(unitPattern);

  const streetWithoutUnit = collapseWhitespace(unitMatch ? unitMatch[1] : streetPart);
  const unitValue = unitMatch ? unitMatch[2].toLowerCase() : null;

  const isMeaningful = (token) => !STREET_STOP_WORDS.has(token);
  const streetTokens = tokenize(streetWithoutUnit).filter(isMeaningful);
  const localityTokens = tokenize(locality).filter(isMeaningful);

  return {
    raw: address,
    streetPart,
    streetWithoutUnit,
    locality,
    streetTokens,
    localityTokens,
    streetNumber: streetTokens[0] || null,
    unitValue,
    hasUnit: Boolean(unitValue)
  };
}
|
||||
|
||||
/**
 * Turn an address into a hyphen-joined slug: "#" characters are dropped,
 * commas and whitespace separate words, and the words are joined with "-".
 * Letter case is preserved. (The name suggests the slug is meant for Zillow
 * URLs; the exact URL format is not visible from this function.)
 *
 * @param {string} rawAddress Address text, e.g. "123 Main St #4, Dallas, TX".
 * @returns {string} e.g. "123-Main-St-4-Dallas-TX"; "" for empty input.
 */
export function buildZillowAddressSlug(rawAddress) {
  return collapseWhitespace(rawAddress)
    .replace(/#/g, "")
    // A comma is a word separator: deleting it outright would fuse
    // "St,Dallas" into the single word "StDallas".
    .replace(/,/g, " ")
    .split(/\s+/)
    .filter(Boolean)
    .join("-");
}
|
||||
|
||||
/**
 * Score how well a piece of candidate text (for example a listing title or
 * link text) matches a parsed address identity.
 *
 * Street and locality tokens are matched as whole tokens against the
 * lowercased candidate. When the identity carries a unit, the candidate must
 * also mention that unit - after a unit label ("apt 4b", "unit4b"), after a
 * "#" ("#4b"), or as a free-standing token ("4b").
 *
 * @param {{streetTokens: string[], localityTokens: string[], hasUnit: boolean,
 *   unitValue: (string|null)}} identity Result of parseAddressIdentity (only
 *   these four fields are read).
 * @param {string} candidateText Text to evaluate; falsy values count as "".
 * @returns {{matched: boolean, score: number, matchedTokens: string[],
 *   unitMatched: boolean}} `matched` requires at least 2 matching tokens (3
 *   when a unit is present), capped at the number of distinct identity tokens,
 *   plus a unit match. `score` is the matched-token count, +2 when a required
 *   unit matched. `unitMatched` is true when no unit is required.
 */
export function scoreAddressCandidate(identity, candidateText) {
  const normalized = collapseWhitespace(candidateText).toLowerCase();
  // Every run of characters outside [a-z0-9#] becomes one space, and the text
  // is space-padded, so a whole-token test is a plain substring lookup.
  const normalizedPadded = ` ${normalized.replace(/[^a-z0-9#]+/g, " ")} `;

  const uniqueTokens = Array.from(
    new Set([...identity.streetTokens, ...identity.localityTokens])
  );
  const matchedTokens = uniqueTokens.filter((token) =>
    normalizedPadded.includes(` ${token} `)
  );

  let unitMatched = true;
  if (identity.hasUnit && identity.unitValue) {
    const unit = escapeRegex(identity.unitValue);
    // All three checks run on `normalized` and delimit the value with
    // "not alphanumeric" lookarounds instead of whitespace. That way a unit
    // followed by punctuation ("Apt 4-B, Dallas") or containing a hyphen
    // ("4-b") is still recognised, and a null/undefined candidate is never
    // stringified into the text "null"/"undefined".
    const labelledUnit = new RegExp(
      `(?<![a-z0-9])(?:${UNIT_LABEL_PATTERN})\\s*${unit}(?![a-z0-9])`,
      "i"
    );
    const hashUnit = new RegExp(`#\\s*${unit}(?![a-z0-9])`, "i");
    // "#" also counts as a token character here, so "#14b" is not a bare "14b"
    // and "4b" is not found inside "14b".
    const bareUnit = new RegExp(`(?<![a-z0-9#])${unit}(?![a-z0-9#])`, "i");

    unitMatched =
      labelledUnit.test(normalized) ||
      hashUnit.test(normalized) ||
      bareUnit.test(normalized);
  }

  const minimumCoreMatches = identity.hasUnit ? 3 : 2;
  const requiredMatches = Math.min(minimumCoreMatches, uniqueTokens.length);
  // An identity without any tokens carries no evidence; without this guard
  // requiredMatches would be 0 and every candidate would "match".
  const matched =
    uniqueTokens.length > 0 && matchedTokens.length >= requiredMatches && unitMatched;

  return {
    matched,
    score: matchedTokens.length + (unitMatched && identity.hasUnit ? 2 : 0),
    matchedTokens,
    unitMatched
  };
}
|
||||
Reference in New Issue
Block a user