Name matching logic

This commit is contained in:
Konstantin Mamalakis 2018-02-27 02:59:21 +02:00 committed by Josh Harvey
parent 910d7e200f
commit 620863d703
3 changed files with 83 additions and 66 deletions

View file

@ -18,6 +18,11 @@ function load () {
// Applies `iteratee` to every element of `list` and returns the largest of
// the results (iteratee-first argument order, as in the other helpers here).
const mapMax = (iteratee, list) => {
  const mapped = _.map(iteratee, list)
  return _.max(mapped)
}
// Cartesian product of two lists: one [a, b] pair for every `a` in `aList`
// combined with every `b` in `bList`, grouped by `a` in list order.
const allPairs = (aList, bList) => {
  const pairsPerA = _.map(a => _.map(b => [a, b], bList), aList)
  return _.flatten(pairsPerA)
}
// birth date
function isDateWithinSomeDaysOfPeriod (period, date, days) {
@ -38,18 +43,16 @@ function isBornTooLongSince (individual, dateObject, days) {
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
}
// string similarity
// similarity algorithm
// Curried form of `jaroWinkler` (defined outside this view), so the first
// string can be fixed ahead of time, e.g. `stringMatch(name)`.
const stringMatch = _.curry(jaroWinkler)
// The same curried `jaroWinkler`, under the name used by `wordSimilarity`
// and `similarity`.
const stringSimilarity = _.curry(jaroWinkler)
// Highest `stringMatch` score between `name` and any entry of `list`.
// Curried, so `bestMatchInList(list)` yields a reusable scorer for names.
const bestMatchInList = _.curry((list, name) => {
  const scoreAgainstName = stringMatch(name)
  return mapMax(scoreAgainstName, list)
})
// Best score over every pairing of one of the alias's full names with one of
// the candidate's full names. Curried: `aliasStringMatch(candidate)` returns
// a function that scores a single alias.
const aliasStringMatch = _.curry((candidate, alias) =>
  mapMax(bestMatchInList(candidate.fullNames), alias.fullNames)
)
// algorithm
// Similarity of two word records, each carrying a `value` string and a list
// of `phonetic` codes. Any phonetic code shared between the two words counts
// as a perfect match (1); otherwise the score is the string similarity of
// the two values.
const wordSimilarity = (a, b) => {
  const sharesPhoneticCode = _.some(
    _.spread(_.isEqual),
    allPairs(a.phonetic, b.phonetic)
  )
  return sharesPhoneticCode ? 1 : stringSimilarity(a.value, b.value)
}
const similarity = _.curry((candidate, individual) => {
// Calculate if his birth date is within two years of the given date.
@ -62,31 +65,60 @@ const similarity = _.curry((candidate, individual) => {
// Calculate the Jaro-Winkler similarity of the full name.
// If an individual has multiple aliases, use the maximum score.
const scoreAgainstCandidate = aliasStringMatch(candidate)
const stringMatchScore = mapMax(scoreAgainstCandidate, individual.aliases)
const scoreCandidateFullName = _.flow(
_.get('fullName'),
stringSimilarity(candidate.fullName)
)
const stringMatchScore = mapMax(scoreCandidateFullName, individual.aliases)
// // Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
// // This should approximate the phonetic similarity of the two names.
// // If an individual has multiple aliases, use the maximum score.
// const phoneticMatchScore = mapMax(calcPhoneticMatchScore(candidate.phoneticFullName))(individual.aliases)
//
console.log(stringMatchScore)
const candidateWords = candidate.fullNameWords
const numCandidateWords = candidateWords.length
return _.max([stringMatchScore])
// Scores how well the candidate's words line up with the parts of one alias.
// Returns the second-highest part score when at least two parts are scored,
// the only score when exactly one is, and undefined when none are.
const scoreCandidateWords = alias => {
// Predicate over a part, built from its `words.length` and numCandidateWords.
// NOTE(review): presumably true when the part has more words than the
// candidate (assumes `_.lt(a)(b)` evaluates `a < b`) — confirm.
const tooManyWords = _.flow(
_.get(['words', 'length']),
_.lt(numCandidateWords)
)
// Parts flagged by `tooManyWords` are dropped before scoring.
const parts = _.reject(tooManyWords, alias.parts)
// Score of `part` against the candidate words starting at `offset`: the words
// are paired positionally and the weakest pair's wordSimilarity is the score.
const scorePartAt = _.curry((part, offset) => {
const words = _.slice(offset, offset + part.words.length, candidateWords)
return _.min(_.map(_.spread(wordSimilarity), _.zip(words, part.words)))
})
// Best score of `part` over every offset from 0 up to
// numCandidateWords - part.words.length.
const scorePart = part => {
const offsets = _.range(0, (numCandidateWords - part.words.length) + 1)
return mapMax(scorePartAt(part), offsets)
}
// Part scores in descending order.
const scores = _.orderBy([], 'desc', _.map(scorePart, parts))
// Index 1 (second best) when there are two or more scores, index 0 for a
// single score, and -1 (which yields undefined) when `scores` is empty.
const thresholdIndex = _.min([2, scores.length]) - 1
return scores[thresholdIndex]
}
const wordMatchScore = mapMax(scoreCandidateWords, individual.aliases)
console.log(stringMatchScore, wordMatchScore)
return _.max([stringMatchScore, wordMatchScore])
})
// Converts a name-parts object such as {firstName: "John", lastName: "Doe", ...}
// into a list of {partName, value} records, one per key, in key order.
function makeCompatible (nameParts) {
  const toRecord = partName => ({partName, value: nameParts[partName]})
  return _.map(toRecord, _.keys(nameParts))
}
function match (nameParts, birthDateString) {
if (!individuals) {
const message = 'The OFAC data sources have not been loaded yet.'
return Promise.reject(new Error(message))
}
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
const parts = _.mapValues(_.lowerCase, nameParts)
const fullNames = nameUtils.makeFullNames(parts)
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
const phoneticFullNames = _.map(nameUtils.phonetic, fullNames)
const parts = makeCompatible(nameParts)
const fullName = nameUtils.makeFullName(parts)
const fullNameWords = nameUtils.makeWords(fullName)
// birthDateString is in YYYYMMDD format
const year = parseInt(birthDateString.slice(0, 4))
@ -96,11 +128,12 @@ function match (nameParts, birthDateString) {
const birthDate = {year, month, day, date}
const candidate = {parts, fullNames, phoneticParts, phoneticFullNames, birthDate}
const candidate = {parts, fullName, fullNameWords, birthDate}
debug_log(candidate)
const similarToCandidate = similarity(candidate)
const result = mapMax(similarToCandidate, individuals)
debug_log(candidate)
console.log(result)
return result
}