Name matching logic

This commit is contained in:
Konstantin Mamalakis 2018-02-27 02:59:21 +02:00 committed by Josh Harvey
parent 910d7e200f
commit 620863d703
3 changed files with 83 additions and 66 deletions

View file

@ -50,7 +50,8 @@ const processDocumentedNamePart = _.curry((groupTypes, namePartNode) => {
const typeId = groupTypes.get(groupId)
const partName = partNames.get(typeId)
const value = _.lowerCase(valueNode.$text)
return {[partName]: value}
const words = nameUtils.makeWords(value)
return {partName, value, words}
})
const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN)
@ -68,15 +69,11 @@ const processAlias = _.curry((groupTypes, aliasNode) => {
}
const namePartNodes = latinNameNode.DocumentedNamePart
const nameParts = _.map(getNamePart, namePartNodes)
const parts = _.map(getNamePart, namePartNodes)
const parts = _.assignAll(nameParts)
const fullNames = nameUtils.makeFullNames(parts)
const fullName = nameUtils.makeFullName(parts)
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
const phoneticFullNames = _.map(nameUtils.phonetic, fullNames)
return {parts, fullNames, phoneticParts, phoneticFullNames}
return {parts, fullName}
})
// birth date

View file

@ -18,6 +18,11 @@ function load () {
// Applies `iteratee` to every element of `list` and returns the largest result.
const mapMax = (iteratee, list) => {
  const mapped = _.map(iteratee, list)
  return _.max(mapped)
}
// Builds every [a, b] combination with `a` taken from aList and `b` from bList
// (a-major order), returned as a single flat list of two-element arrays.
const allPairs = (aList, bList) => {
  const pairsPerA = _.map(
    a => _.map(b => [a, b], bList),
    aList
  )
  return _.flatten(pairsPerA)
}
// birth date
function isDateWithinSomeDaysOfPeriod (period, date, days) {
@ -38,18 +43,16 @@ function isBornTooLongSince (individual, dateObject, days) {
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
}
// string similarity
// similarity algorithm
const stringMatch = _.curry(jaroWinkler)
const stringSimilarity = _.curry(jaroWinkler)
const bestMatchInList = _.curry((list, name) => mapMax(stringMatch(name), list))
const aliasStringMatch = _.curry((candidate, alias) => {
const matchWithCandidate = bestMatchInList(candidate.fullNames)
return mapMax(matchWithCandidate, alias.fullNames)
})
// algorithm
// Similarity of two word objects shaped like {value, phonetic}.
// Returns 1 as soon as any phonetic code of `a` equals any phonetic code of `b`;
// otherwise falls back to the string similarity of the two spellings.
const wordSimilarity = (a, b) => {
  const codePairs = allPairs(a.phonetic, b.phonetic)
  const sharesPhoneticCode = _.some(_.spread(_.isEqual), codePairs)
  return sharesPhoneticCode ? 1 : stringSimilarity(a.value, b.value)
}
const similarity = _.curry((candidate, individual) => {
// Calculate if his birth date is within two years of the given date.
@ -62,31 +65,60 @@ const similarity = _.curry((candidate, individual) => {
// Calculate the Jaro-Winkler similarity of the full name.
// If an individual has multiple aliases, use the maximum score.
const scoreAgainstCandidate = aliasStringMatch(candidate)
const stringMatchScore = mapMax(scoreAgainstCandidate, individual.aliases)
const scoreCandidateFullName = _.flow(
_.get('fullName'),
stringSimilarity(candidate.fullName)
)
const stringMatchScore = mapMax(scoreCandidateFullName, individual.aliases)
// // Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
// // This should approximate the phonetic similarity of the two names.
// // If an individual has multiple aliases, use the maximum score.
// const phoneticMatchScore = mapMax(calcPhoneticMatchScore(candidate.phoneticFullName))(individual.aliases)
//
console.log(stringMatchScore)
const candidateWords = candidate.fullNameWords
const numCandidateWords = candidateWords.length
return _.max([stringMatchScore])
// Scores one alias against the candidate word by word.
// Reads `candidateWords` and `numCandidateWords` from the enclosing scope.
// Returns the second-highest per-part score, the only score when a single
// part was scored, or undefined when no part was scored.
const scoreCandidateWords = alias => {
// Predicate over a name part: its word count is compared with the candidate's.
// NOTE(review): relies on lodash/fp `lt` keeping its argument order, i.e.
// numCandidateWords < part.words.length — confirm against the fp mapping.
const tooManyWords = _.flow(
_.get(['words', 'length']),
_.lt(numCandidateWords)
)
// Drop the parts flagged above; they cannot fit inside the candidate's words.
const parts = _.reject(tooManyWords, alias.parts)
// Score of `part` aligned at `offset`: take a window of candidate words as
// long as the part, pair the words positionally, and keep the weakest
// word-to-word similarity.
const scorePartAt = _.curry((part, offset) => {
const words = _.slice(offset, offset + part.words.length, candidateWords)
return _.min(_.map(_.spread(wordSimilarity), _.zip(words, part.words)))
})
// Best score of `part` over every offset at which the window still fits.
const scorePart = part => {
const offsets = _.range(0, (numCandidateWords - part.words.length) + 1)
return mapMax(scorePartAt(part), offsets)
}
// Part scores, highest first.
const scores = _.orderBy([], 'desc', _.map(scorePart, parts))
// Index 1 when there are at least two scores, 0 when there is one,
// -1 (so the lookup yields undefined) when there are none.
const thresholdIndex = _.min([2, scores.length]) - 1
return scores[thresholdIndex]
}
const wordMatchScore = mapMax(scoreCandidateWords, individual.aliases)
console.log(stringMatchScore, wordMatchScore)
return _.max([stringMatchScore, wordMatchScore])
})
// Converts a candidate's name parts into the list-of-parts shape used by the
// matching code.
//
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
// Returns an array like [{partName: 'firstName', value: 'john'}, ...].
//
// Values are normalized with _.lowerCase: the string comparison is
// case-sensitive and words are later split on single spaces, so the candidate
// has to be lower-cased and word-normalized the same way as the names it is
// compared against.
function makeCompatible (nameParts) {
  const partNames = _.keys(nameParts)
  const values = _.map(_.lowerCase, _.values(nameParts))
  const props = _.zipAll([partNames, values])
  return _.map(_.zipObject(['partName', 'value']), props)
}
function match (nameParts, birthDateString) {
if (!individuals) {
const message = 'The OFAC data sources have not been loaded yet.'
return Promise.reject(new Error(message))
}
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
const parts = _.mapValues(_.lowerCase, nameParts)
const fullNames = nameUtils.makeFullNames(parts)
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
const phoneticFullNames = _.map(nameUtils.phonetic, fullNames)
const parts = makeCompatible(nameParts)
const fullName = nameUtils.makeFullName(parts)
const fullNameWords = nameUtils.makeWords(fullName)
// birthDateString is in YYYYMMDD format
const year = parseInt(birthDateString.slice(0, 4))
@ -96,11 +128,12 @@ function match (nameParts, birthDateString) {
const birthDate = {year, month, day, date}
const candidate = {parts, fullNames, phoneticParts, phoneticFullNames, birthDate}
const candidate = {parts, fullName, fullNameWords, birthDate}
debug_log(candidate)
const similarToCandidate = similarity(candidate)
const result = mapMax(similarToCandidate, individuals)
debug_log(candidate)
console.log(result)
return result
}

View file

@ -1,46 +1,33 @@
const metaphone = require('talisman/phonetics/metaphone')
const doubleMetaphone = require('talisman/phonetics/double-metaphone')
const _ = require('lodash/fp')
// KOSTIS TODO: Decide on a method. Remove the others
const phoneticMethod1 = metaphone
const phoneticMethod2 = _.flow(doubleMetaphone, _.uniq)
const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2))
// Phonetic codes of a word: the double-metaphone output with duplicates removed.
const makePhonetic = word => {
  const codes = doubleMetaphone(word)
  return _.uniq(codes)
}
// Combine name-parts in a standard order.
const commonOrderings = [
['firstName', 'lastName'],
['firstName', 'middleName', 'lastName'],
['firstName', 'maidenName', 'lastName'],
['firstName', 'patronymic', 'lastName'],
['firstName', 'matronymic', 'lastName']
]
const partOrdering = ['firstName', 'middleName', 'maidenName', 'patronymic', 'matronymic', 'lastName']
// const getFrom = _.flip()
const getFrom = _.curry((obj, key) => obj[key])
const getOrderedParts = (parts, ordering) => _.map(getFrom(parts), ordering)
const combineParts = _.curryN(2, _.flow(
getOrderedParts,
_.compact,
_.join(' ')
))
const makeAllOrderings = parts => _.map(combineParts(parts), commonOrderings)
const makeFullNames = _.flow(
makeAllOrderings,
_.uniq
// Sort key for a name part: the position of its `partName` in `partOrdering`.
const usingPartOrder = part => {
  const partName = _.get('partName', part)
  return _.indexOf(partName, partOrdering)
}
// Builds a full name from a list of {partName, value} parts: the parts are
// sorted with `usingPartOrder` and their values joined with single spaces.
const makeFullName = parts => {
  const orderedParts = _.sortBy(usingPartOrder, parts)
  const orderedValues = _.map(_.get('value'), orderedParts)
  return _.join(' ', orderedValues)
}
// Splits `value` on single spaces and returns one {value, phonetic} object per
// word, where `phonetic` is the word's list of phonetic codes.
const makeWords = value => _.map(
  word => ({value: word, phonetic: makePhonetic(word)}),
  _.split(' ', value)
)
module.exports = {
makeFullNames,
phonetic: phoneticMethod3
makeFullName,
makeWords
}