126 lines
3.2 KiB
JavaScript
126 lines
3.2 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
// Tokens that carry no identifying information for an address: street-type
// suffixes in both abbreviated and spelled-out form, plus the "tx" state code.
// parseAddressIdentity filters these out of the street tokens and the
// locality tokens alike.
const STREET_STOP_WORDS = new Set([
  "st", "street",
  "rd", "road",
  "dr", "drive",
  "ave", "avenue",
  "blvd", "boulevard",
  "ln", "lane",
  "ct", "court",
  "cir", "circle",
  "way",
  "trl", "trail",
  "pkwy", "parkway",
  "tx"
]);
|
|
|
|
// Regex source (a non-capturing alternation) for the labels that introduce a
// unit designator. It is a string rather than a RegExp because it is
// interpolated into larger patterns built with `new RegExp(..., "i")`.
const UNIT_LABEL_PATTERN = "(?:apt|apartment|unit|suite|ste|#)";
|
|
|
|
/**
 * Splits text into lowercase tokens made only of the characters a-z, 0-9
 * and "#".
 *
 * Every other character (whitespace, hyphens, punctuation, non-ASCII
 * letters) acts as a separator, so a token is simply a maximal run of the
 * allowed characters.
 *
 * @param {*} value - Text to tokenize; any falsy value is treated as "".
 * @returns {string[]} Tokens in order of appearance (empty array if none).
 */
function tokenize(value) {
  const lowered = (value ? String(value) : "").toLowerCase();
  const runs = lowered.match(/[a-z0-9#]+/g);
  return runs === null ? [] : Array.from(runs);
}
|
|
|
|
/**
 * Normalizes whitespace: strips it from both ends and squeezes every
 * internal run of whitespace down to a single space.
 *
 * @param {*} value - Text to normalize; any falsy value is treated as "".
 * @returns {string} The normalized text ("" when nothing but whitespace).
 */
function collapseWhitespace(value) {
  const text = value ? String(value) : "";
  // After trimming, splitting on whitespace runs yields only the words.
  return text.trim().split(/\s+/).join(" ");
}
|
|
|
|
/**
 * Backslash-escapes regular-expression metacharacters so the text can be
 * embedded in a `new RegExp(...)` source and matched literally.
 *
 * @param {string} value - Literal text (must be a string; no coercion is done).
 * @returns {string} The text with each of . * + ? ^ $ { } ( ) | [ ] \ escaped.
 */
function escapeRegex(value) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metacharacters, (metachar) => `\\${metachar}`);
}
|
|
|
|
/**
 * Breaks a free-form address into the pieces used for candidate matching.
 *
 * The text before the first comma is the street part; everything after it is
 * joined with spaces to form the locality. A trailing unit designator on the
 * street part ("Apt 4B", "# 12", "#12", "Ste200") is split off into
 * `unitValue`.
 *
 * NOTE(review): only the street part is searched for a unit, so a unit
 * written as its own comma-separated segment ("123 Main St, Apt 4B, Austin")
 * ends up in `locality` and `hasUnit` stays false.
 *
 * @param {string} rawAddress - Address text, e.g. "123 Main St Apt 4B, Austin, TX 78701".
 * @returns {{
 *   raw: string,
 *   streetPart: string,
 *   streetWithoutUnit: string,
 *   locality: string,
 *   streetTokens: string[],
 *   localityTokens: string[],
 *   streetNumber: (string|null),
 *   unitValue: (string|null),
 *   hasUnit: boolean
 * }} `streetNumber` is simply the first street token (it is not checked for
 *   being numeric); `unitValue` is lowercased.
 * @throws {Error} "Missing address." when the input is empty after whitespace
 *   normalization.
 */
export function parseAddressIdentity(rawAddress) {
  const address = collapseWhitespace(rawAddress);
  if (!address) {
    throw new Error("Missing address.");
  }

  const [streetPartRaw, ...restParts] = address.split(",");
  const streetPart = collapseWhitespace(streetPartRaw);
  const locality = collapseWhitespace(restParts.join(" "));

  // A unit label only counts when it is a separate word. After the label we
  // require one of:
  //   \s+      whitespace before the value      ("Apt 4B", "# 4B")
  //   (?<=#)   the label itself was "#"         ("#4B")
  //   (?=\d)   a digit glued to the label       ("Apt4B", "Ste200")
  // Without this, a final word that merely starts with a label was split
  // into a bogus unit: "12 Unity" -> unit "y", "9 Oak Dr Stephenville" ->
  // unit "phenville".
  const unitRegex = new RegExp(
    `^(.*?)\\s+(?:${UNIT_LABEL_PATTERN})(?:\\s+|(?<=#)|(?=\\d))([a-z0-9-]+)$`,
    "i"
  );
  const unitMatch = streetPart.match(unitRegex);

  const streetWithoutUnit = collapseWhitespace(unitMatch ? unitMatch[1] : streetPart);
  const unitValue = unitMatch ? unitMatch[2].toLowerCase() : null;

  const isMeaningful = (token) => !STREET_STOP_WORDS.has(token);
  const streetTokens = tokenize(streetWithoutUnit).filter(isMeaningful);
  const localityTokens = tokenize(locality).filter(isMeaningful);

  return {
    raw: address,
    streetPart,
    streetWithoutUnit,
    locality,
    streetTokens,
    localityTokens,
    streetNumber: streetTokens[0] || null,
    unitValue,
    hasUnit: Boolean(unitValue)
  };
}
|
|
|
|
/**
 * Turns an address into a hyphen-separated slug: commas and "#" are dropped
 * and the remaining whitespace-separated words are joined with "-".
 * Letter case and all other characters are kept as given.
 *
 * NOTE(review): presumably used as the address segment of a Zillow URL, as
 * the name suggests — confirm against the caller.
 *
 * @param {*} rawAddress - Address text; any falsy value is treated as "".
 * @returns {string} The slug, or "" when the address has no words.
 */
export function buildZillowAddressSlug(rawAddress) {
  return String(rawAddress || "")
    // Commas separate address parts, so they must become word breaks.
    // Deleting them outright fused "St,Austin" into the single word "StAustin".
    .replace(/,/g, " ")
    .replace(/#/g, "")
    .trim()
    // Splitting on whitespace runs also collapses any repeated spaces.
    .split(/\s+/)
    .join("-");
}
|
|
|
|
/**
 * Scores how well a piece of candidate text matches a parsed address.
 *
 * The candidate is lowercased and every run of characters other than a-z,
 * 0-9 and "#" is turned into a space; each distinct street/locality token of
 * the identity is then looked up as a whole space-delimited word.
 *
 * When the identity has a unit, the candidate must also mention it in one of
 * three forms: after a unit label ("apt 4b"), after "#", or as a bare
 * standalone token. The last form is deliberately loose: any standalone
 * occurrence of the unit value counts, even without a label.
 *
 * @param {{streetTokens: string[], localityTokens: string[], hasUnit: boolean,
 *   unitValue: (string|null)}} identity - Result of parseAddressIdentity.
 * @param {*} candidateText - Text to test; any falsy value is treated as "".
 * @returns {{matched: boolean, score: number, matchedTokens: string[],
 *   unitMatched: boolean}} `score` is the number of matched tokens plus a
 *   bonus of 2 when the identity has a unit and it was found. `unitMatched`
 *   is true when the identity has no unit.
 */
export function scoreAddressCandidate(identity, candidateText) {
  const normalized = collapseWhitespace(candidateText).toLowerCase();
  // Padding with spaces lets ` ${token} ` match the first and last words too.
  const normalizedPadded = ` ${normalized.replace(/[^a-z0-9#]+/g, " ")} `;

  const uniqueTokens = [...new Set([...identity.streetTokens, ...identity.localityTokens])];
  const matchedTokens = uniqueTokens.filter((token) =>
    normalizedPadded.includes(` ${token} `)
  );

  let unitMatched = true;
  if (identity.hasUnit && identity.unitValue) {
    const unitSource = escapeRegex(identity.unitValue);
    const unitRegex = new RegExp(
      `(?:^|\\s)(?:${UNIT_LABEL_PATTERN})\\s*${unitSource}(?:\\s|$)`,
      "i"
    );
    const hashRegex = new RegExp(`#\\s*${unitSource}(?:\\b|$)`, "i");
    const looseTokenRegex = new RegExp(`(?:^|\\s)${unitSource}(?:\\s|$)`, "i");

    unitMatched =
      unitRegex.test(normalized) ||
      // Tested against the raw text so punctuation right after the unit
      // ("#4B,") still ends the match at a word boundary.
      hashRegex.test(candidateText) ||
      looseTokenRegex.test(normalizedPadded);
  }

  // Identities with fewer tokens than the usual minimum only need all of
  // their tokens to match.
  const minimumCoreMatches = identity.hasUnit ? 3 : 2;
  const requiredMatches = Math.min(minimumCoreMatches, uniqueTokens.length);

  // An identity with no tokens at all has nothing to compare. Without this
  // guard the requirement drops to 0 and every candidate would "match".
  const hasTokens = uniqueTokens.length > 0;
  const matched = hasTokens && matchedTokens.length >= requiredMatches && unitMatched;

  return {
    matched,
    score: matchedTokens.length + (unitMatched && identity.hasUnit ? 2 : 0),
    matchedTokens,
    unitMatched
  };
}
|