Matching algorithm
This commit is contained in:
parent 2f8c798304
commit 1d0aff07fe
2 changed files with 97 additions and 91 deletions
@@ -16,12 +16,9 @@ function load () {

// MATCHING

const mapMax = (iteratee, list) => _.max(_.map(iteratee, list))

// similarity algorithm

const allPairs = _.flow(
  (aList, bList) => _.map(a => _.map(b => [a, b], bList), aList),
  _.flatten
)
const stringSimilarity = _.curry(jaroWinkler)

// birth date
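As a side note (separate from the diff), a minimal runnable sketch of what the two helpers above compute, assuming lodash/fp is bound to _ as in the rest of this module:

const _ = require('lodash/fp')

// mapMax: map an iteratee over a list and return the largest result.
const mapMax = (iteratee, list) => _.max(_.map(iteratee, list))

// allPairs: every [a, b] combination of two lists (a flattened cartesian product).
const allPairs = _.flow(
  (aList, bList) => _.map(a => _.map(b => [a, b], bList), aList),
  _.flatten
)

console.log(mapMax(_.size, ['a', 'bbb', 'cc']))  // 3
console.log(allPairs([1, 2], ['x', 'y']))        // [[1, 'x'], [1, 'y'], [2, 'x'], [2, 'y']]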
@@ -37,72 +34,14 @@ function isDateWithinSomeDaysOfPeriod (period, date, days) {
  return (startDate < date && date < endDate)
}

function isBornTooLongSince (individual, dateObject, days) {
const isBornTooLongSince = _.curry((days, dateObject, individual) => {
  if (_.isEmpty(individual.birthDatePeriods)) return false
  const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
  return !_.some(isWithinSomeYears, individual.birthDatePeriods)
}

// similarity algorithm

const stringSimilarity = _.curry(jaroWinkler)

const wordSimilarity = (a, b) => {
  const phoneticPairs = allPairs(a.phonetic, b.phonetic)
  const phoneticMatch = _.map(_.spread(_.isEqual), phoneticPairs)
  if (_.some(_.identity, phoneticMatch)) return 1
  return stringSimilarity(a.value, b.value)
}
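A standalone sketch (not taken from the diff) of how wordSimilarity behaves, assuming each word object carries a value string and a phonetic list of codes, and using a placeholder in place of the jaroWinkler metric this module imports:

const _ = require('lodash/fp')

// Placeholder stand-ins, assumed equivalent to this module's imports and helpers.
const jaroWinkler = (a, b) => (a === b ? 1 : 0)
const stringSimilarity = _.curry(jaroWinkler)
const allPairs = _.flow(
  (aList, bList) => _.map(a => _.map(b => [a, b], bList), aList),
  _.flatten
)

const wordSimilarity = (a, b) => {
  const phoneticPairs = allPairs(a.phonetic, b.phonetic)
  if (_.some(_.spread(_.isEqual), phoneticPairs)) return 1  // any shared phonetic code is a perfect score
  return stringSimilarity(a.value, b.value)                 // otherwise compare the raw strings
}

console.log(wordSimilarity(
  {value: 'jon', phonetic: ['JN']},
  {value: 'john', phonetic: ['JN', 'AN']}
))  // 1, because the phonetic code 'JN' appears on both sides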
const similarity = _.curry((candidate, individual) => {
  // Calculate whether the individual's birth date is within two years of the
  // given date. If an individual has multiple birth-date periods, return
  // whether any are within two years. Reject individuals who don't match this
  // criterion.
  const twoYears = 365 * 2
  if (isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0

  debug_log(individual)

  // Calculate the Jaro-Winkler similarity of the full name.
  // If an individual has multiple aliases, use the maximum score.
  const scoreCandidateFullName = _.flow(
    _.get('fullName'),
    stringSimilarity(candidate.fullName)
  )
  const stringMatchScore = mapMax(scoreCandidateFullName, individual.aliases)

  //

  const candidateWords = candidate.fullNameWords
  const numCandidateWords = candidateWords.length

  const scoreCandidateWords = alias => {
    const tooManyWords = _.flow(
      _.get(['words', 'length']),
      _.lt(numCandidateWords)
    )
    const parts = _.reject(tooManyWords, alias.parts)

    const scorePartAt = _.curry((part, offset) => {
      const words = _.slice(offset, offset + part.words.length, candidateWords)
      return _.min(_.map(_.spread(wordSimilarity), _.zip(words, part.words)))
    })
    const scorePart = part => {
      const offsets = _.range(0, (numCandidateWords - part.words.length) + 1)
      return mapMax(scorePartAt(part), offsets)
    }
    const scores = _.orderBy([], 'desc', _.map(scorePart, parts))
    const thresholdIndex = _.min([2, scores.length]) - 1
    return scores[thresholdIndex]
  }
  const wordMatchScore = mapMax(scoreCandidateWords, individual.aliases)

  console.log(stringMatchScore, wordMatchScore)

  return _.max([stringMatchScore, wordMatchScore])
})

// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}

function makeCompatible (nameParts) {
  const partNames = _.keys(nameParts)
  const values = _.values(nameParts)

@@ -110,15 +49,22 @@ function makeCompatible (nameParts) {
  return _.map(_.zipObject(['partName', 'value']), props)
}
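The comment above documents the expected input shape. A hedged reconstruction of the whole transformation, assuming the elided lines of this hunk build props by zipping part names with their values:

const _ = require('lodash/fp')

// Hypothetical reconstruction of makeCompatible's effect; the zip of keys and
// values is an assumption about the lines not shown in this hunk.
const makeCompatible = nameParts => {
  const props = _.zip(_.keys(nameParts), _.values(nameParts))
  return _.map(_.zipObject(['partName', 'value']), props)
}

console.log(makeCompatible({firstName: 'John', lastName: 'Doe'}))
// [{partName: 'firstName', value: 'John'}, {partName: 'lastName', value: 'Doe'}]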
function match (nameParts, birthDateString) {
// algorithm

function match (nameParts, birthDateString, threshold) {
  if (!structs) {
    const message = 'The OFAC data sources have not been loaded yet.'
    return Promise.reject(new Error(message))
  }

  // Prepare the input data

  const parts = makeCompatible(nameParts)
  const fullName = nameUtils.makeFullName(parts)
  const fullNameWords = nameUtils.makeWords(fullName)
  const words = nameUtils.makeWords(fullName)

  const wordValues = _.map(_.get('value'), words)
  const wordPhonetics = _.flatten(_.map(_.get('phonetics'), words))

  // birthDateString is in YYYYMMDD format
  const year = parseInt(birthDateString.slice(0, 4))
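For the YYYYMMDD convention noted above, a small illustration of the slicing; the month, day and date lines sit in the elided part of the hunk, so their exact form here is an assumption:

// Assumed continuation of the slicing shown above for a YYYYMMDD string.
const birthDateString = '19800229'
const year = parseInt(birthDateString.slice(0, 4))   // 1980
const month = parseInt(birthDateString.slice(4, 6))  // 2
const day = parseInt(birthDateString.slice(6, 8))    // 29
console.log({year, month, day})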
@@ -128,12 +74,73 @@ function match (nameParts, birthDateString) {

  const birthDate = {year, month, day, date}

  const candidate = {parts, fullName, fullNameWords, birthDate}
  debug_log(candidate)
  debug_log({parts, fullName, wordValues, wordPhonetics, birthDate})

  const similarToCandidate = similarity(candidate)
  const result = mapMax(similarToCandidate, structs.individuals)
  console.log(result)
  // Start matching

  // Accept aliases whose full name matches.
  const doesNameMatch = _.flow(
    _.get('fullName'),
    stringSimilarity(fullName),
    _.lte(threshold)
  )
  const aliases = _.flatMap(_.get('aliases'), structs.individuals)
  const aliasIdsFromFullName = _.flow(
    _.filter(doesNameMatch),
    _.map(_.get('id'))
  )(aliases)
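A brief note on the threshold test above: lodash/fp leaves lte's argument order alone, so _.lte(threshold) evaluates threshold <= score, keeping aliases whose full-name similarity is at least the threshold. A tiny standalone check:

const _ = require('lodash/fp')

// _.lte(t)(x) means t <= x in lodash/fp (lte is not one of the rearranged methods).
const atLeast = _.lte(0.85)
console.log(atLeast(0.9))  // true, the similarity passes the threshold
console.log(atLeast(0.7))  // false, the similarity falls below the threshold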
  // Gather aliases whose name-parts match phonetically.
  const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
  const phoneticMatches = _.flow(
    _.map(getPhoneticMatches),
    _.compact,
    _.flatten
  )(wordPhonetics)

  // Gather aliases whose name-parts match alphabetically.
  const getStringMatches = value => {
    const entryMatches = entry => (jaroWinkler(value, entry.value) >= threshold)
    return _.filter(entryMatches, structs.wordList)
  }
  const getSingleEntries = wordEntry => {
    const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
    return _.map(makeEntry, wordEntry.aliasIds)
  }
  const stringMatches = _.flow(
    _.map(getStringMatches),
    _.flatten,
    _.map(getSingleEntries),
    _.flatten
  )(wordValues)

  // At least two name-parts must match per alias
  const aliasIdsFromNamePart = _.flow(
    _.uniqWith(_.isEqual),
    _.map(_.get('aliasId')),
    _.countBy(_.identity),
    _.toPairs,
    _.filter(_.flow(_.last, _.lte(2))),
    _.map(_.first)
  )([...phoneticMatches, ...stringMatches])
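A standalone sketch (not from the diff) of the counting step above, using made-up entries in the {value, aliasId} shape that the phonetic and alphabetic gathering steps appear to produce:

const _ = require('lodash/fp')

// Hypothetical matches: alias 'a1' matched on two different name-parts, 'a2' on one.
const matches = [
  {value: 'john', aliasId: 'a1'},
  {value: 'doe', aliasId: 'a1'},
  {value: 'john', aliasId: 'a1'},  // exact duplicate, dropped by uniqWith
  {value: 'john', aliasId: 'a2'}
]

const aliasIdsWithTwoParts = _.flow(
  _.uniqWith(_.isEqual),               // de-duplicate identical entries
  _.map(_.get('aliasId')),
  _.countBy(_.identity),               // {a1: 2, a2: 1}
  _.toPairs,
  _.filter(_.flow(_.last, _.lte(2))),  // keep aliases with at least two matching parts
  _.map(_.first)
)(matches)

console.log(aliasIdsWithTwoParts)      // ['a1']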
  // Get the full record for each matched id
  const getIndividual = aliasId => {
    const individualId = structs.aliasToIndividual.get(aliasId)
    return structs.individualsMap.get(individualId)
  }
  const suspects = _.uniq(_.map(getIndividual, [
    ...aliasIdsFromFullName,
    ...aliasIdsFromNamePart
  ]))

  // Reject everyone whose birth date is more than two years away from the given one.
  const twoYears = 365 * 2
  const unqualified = isBornTooLongSince(twoYears, birthDate)
  const result = _.reject(unqualified, suspects)

  debug_log(result)
  return result
}
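Hypothetical usage of the new match signature, assuming load() has already populated structs and that threshold is a Jaro-Winkler score between 0 and 1 (the names, date and value below are illustrative only):

// Illustrative call only: the nameParts keys, date string and threshold are made up.
const suspects = match({firstName: 'John', lastName: 'Doe'}, '19800101', 0.85)
console.log(suspects)  // individuals whose aliases matched and whose birth dates qualify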