Name matching logic
This commit is contained in:
parent
910d7e200f
commit
620863d703
3 changed files with 83 additions and 66 deletions
|
|
@ -18,6 +18,11 @@ function load () {
|
|||
|
||||
// Run `iteratee` over every element of `list` and return the largest result.
const mapMax = (iteratee, list) => {
  const mapped = _.map(iteratee, list)
  return _.max(mapped)
}
|
||||
|
||||
// Cartesian product of two lists: every [a, b] with `a` taken from `aList`
// and `b` from `bList`, grouped by `a` in list order.
const allPairs = (aList, bList) => {
  // One inner list of pairs per element of aList ...
  const pairsPerA = _.map(a => _.map(b => [a, b], bList), aList)
  // ... joined into a single flat list of pairs (one level only, so the
  // pairs themselves stay intact).
  return _.flatten(pairsPerA)
}
|
||||
|
||||
// birth date
|
||||
|
||||
function isDateWithinSomeDaysOfPeriod (period, date, days) {
|
||||
|
|
@ -38,18 +43,16 @@ function isBornTooLongSince (individual, dateObject, days) {
|
|||
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
|
||||
}
|
||||
|
||||
// string similarity
|
||||
// similarity algorithm
|
||||
|
||||
// Curried jaroWinkler, so one string can be fixed first (e.g. stringMatch(name)).
const stringMatch = _.curry(jaroWinkler)
|
||||
// Curried jaroWinkler, so it can be partially applied
// (e.g. stringSimilarity(candidate.fullName)).
const stringSimilarity = _.curry(jaroWinkler)
|
||||
|
||||
// Highest stringMatch score of `name` against any entry of `list`.
// Curried with the list first: bestMatchInList(list)(name).
const bestMatchInList = _.curry((list, name) => {
  const scoreAgainstName = stringMatch(name)
  return mapMax(scoreAgainstName, list)
})
|
||||
|
||||
// Highest string-match score between any of the candidate's full names and
// any of the alias's full names.
// Curried with the candidate first: aliasStringMatch(candidate)(alias).
const aliasStringMatch = _.curry((candidate, alias) => {
  // Scorer for one alias name: its best match among the candidate's names.
  const matchWithCandidate = bestMatchInList(candidate.fullNames)
  return mapMax(matchWithCandidate, alias.fullNames)
})
|
||||
|
||||
// algorithm
|
||||
// Similarity of two name words `a` and `b`, each read as {value, phonetic}.
// Returns 1 when any entry of a.phonetic equals any entry of b.phonetic;
// otherwise returns stringSimilarity of the two raw values.
// NOTE(review): assumes `phonetic` is a list of phonetic codes — confirm
// against the code that builds the word objects.
const wordSimilarity = (a, b) => {
  const phoneticPairs = allPairs(a.phonetic, b.phonetic)
  // _.some stops at the first equal pair instead of comparing every pair.
  const hasPhoneticMatch = _.some(_.spread(_.isEqual), phoneticPairs)
  if (hasPhoneticMatch) return 1
  return stringSimilarity(a.value, b.value)
}
|
||||
|
||||
const similarity = _.curry((candidate, individual) => {
|
||||
// Calculate whether the individual's birth date is within two years of the given date.
|
||||
|
|
@ -62,31 +65,60 @@ const similarity = _.curry((candidate, individual) => {
|
|||
|
||||
// Calculate the Jaro-Winkler similarity of the full name.
|
||||
// If an individual has multiple aliases, use the maximum score.
|
||||
const scoreAgainstCandidate = aliasStringMatch(candidate)
|
||||
const stringMatchScore = mapMax(scoreAgainstCandidate, individual.aliases)
|
||||
const scoreCandidateFullName = _.flow(
|
||||
_.get('fullName'),
|
||||
stringSimilarity(candidate.fullName)
|
||||
)
|
||||
const stringMatchScore = mapMax(scoreCandidateFullName, individual.aliases)
|
||||
|
||||
// // Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
|
||||
// // This should approximate the phonetic similarity of the two names.
|
||||
// // If an individual has multiple aliases, use the maximum score.
|
||||
// const phoneticMatchScore = mapMax(calcPhoneticMatchScore(candidate.phoneticFullName))(individual.aliases)
|
||||
//
|
||||
|
||||
console.log(stringMatchScore)
|
||||
const candidateWords = candidate.fullNameWords
|
||||
const numCandidateWords = candidateWords.length
|
||||
|
||||
return _.max([stringMatchScore])
|
||||
const scoreCandidateWords = alias => {
|
||||
const tooManyWords = _.flow(
|
||||
_.get(['words', 'length']),
|
||||
_.lt(numCandidateWords)
|
||||
)
|
||||
const parts = _.reject(tooManyWords, alias.parts)
|
||||
|
||||
const scorePartAt = _.curry((part, offset) => {
|
||||
const words = _.slice(offset, offset + part.words.length, candidateWords)
|
||||
return _.min(_.map(_.spread(wordSimilarity), _.zip(words, part.words)))
|
||||
})
|
||||
const scorePart = part => {
|
||||
const offsets = _.range(0, (numCandidateWords - part.words.length) + 1)
|
||||
return mapMax(scorePartAt(part), offsets)
|
||||
}
|
||||
const scores = _.orderBy([], 'desc', _.map(scorePart, parts))
|
||||
const thresholdIndex = _.min([2, scores.length]) - 1
|
||||
return scores[thresholdIndex]
|
||||
}
|
||||
const wordMatchScore = mapMax(scoreCandidateWords, individual.aliases)
|
||||
|
||||
console.log(stringMatchScore, wordMatchScore)
|
||||
|
||||
return _.max([stringMatchScore, wordMatchScore])
|
||||
})
|
||||
|
||||
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
|
||||
// Turn a name-parts object into a list of {partName, value} records, one per
// key of `nameParts`, in key order.
function makeCompatible (nameParts) {
  // Build each record straight from its key; no intermediate key/value
  // arrays or zip round trip needed.
  const toPart = partName => ({partName, value: nameParts[partName]})
  return _.map(toPart, _.keys(nameParts))
}
|
||||
|
||||
function match (nameParts, birthDateString) {
|
||||
if (!individuals) {
|
||||
const message = 'The OFAC data sources have not been loaded yet.'
|
||||
return Promise.reject(new Error(message))
|
||||
}
|
||||
|
||||
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
|
||||
const parts = _.mapValues(_.lowerCase, nameParts)
|
||||
const fullNames = nameUtils.makeFullNames(parts)
|
||||
|
||||
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
|
||||
const phoneticFullNames = _.map(nameUtils.phonetic, fullNames)
|
||||
const parts = makeCompatible(nameParts)
|
||||
const fullName = nameUtils.makeFullName(parts)
|
||||
const fullNameWords = nameUtils.makeWords(fullName)
|
||||
|
||||
// birthDateString is in YYYYMMDD format
|
||||
const year = parseInt(birthDateString.slice(0, 4))
|
||||
|
|
@ -96,11 +128,12 @@ function match (nameParts, birthDateString) {
|
|||
|
||||
const birthDate = {year, month, day, date}
|
||||
|
||||
const candidate = {parts, fullNames, phoneticParts, phoneticFullNames, birthDate}
|
||||
const candidate = {parts, fullName, fullNameWords, birthDate}
|
||||
debug_log(candidate)
|
||||
|
||||
const similarToCandidate = similarity(candidate)
|
||||
const result = mapMax(similarToCandidate, individuals)
|
||||
debug_log(candidate)
|
||||
console.log(result)
|
||||
return result
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue