Name matching logic

This commit is contained in:
Konstantin Mamalakis 2018-02-27 02:59:21 +02:00 committed by Josh Harvey
parent 910d7e200f
commit 620863d703
3 changed files with 83 additions and 66 deletions

View file

@ -50,7 +50,8 @@ const processDocumentedNamePart = _.curry((groupTypes, namePartNode) => {
const typeId = groupTypes.get(groupId) const typeId = groupTypes.get(groupId)
const partName = partNames.get(typeId) const partName = partNames.get(typeId)
const value = _.lowerCase(valueNode.$text) const value = _.lowerCase(valueNode.$text)
return {[partName]: value} const words = nameUtils.makeWords(value)
return {partName, value, words}
}) })
const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN) const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN)
@ -68,15 +69,11 @@ const processAlias = _.curry((groupTypes, aliasNode) => {
} }
const namePartNodes = latinNameNode.DocumentedNamePart const namePartNodes = latinNameNode.DocumentedNamePart
const nameParts = _.map(getNamePart, namePartNodes) const parts = _.map(getNamePart, namePartNodes)
const parts = _.assignAll(nameParts) const fullName = nameUtils.makeFullName(parts)
const fullNames = nameUtils.makeFullNames(parts)
const phoneticParts = _.mapValues(nameUtils.phonetic, parts) return {parts, fullName}
const phoneticFullNames = _.map(nameUtils.phonetic, fullNames)
return {parts, fullNames, phoneticParts, phoneticFullNames}
}) })
// birth date // birth date

View file

@ -18,6 +18,11 @@ function load () {
const mapMax = (iteratee, list) => _.max(_.map(iteratee, list)) const mapMax = (iteratee, list) => _.max(_.map(iteratee, list))
// Cartesian product of two lists: every [a, b] pair with `a` taken from
// aList and `b` taken from bList, ordered by `a` first and then by `b`.
const allPairs = (aList, bList) =>
  _.flatMap(a => _.map(b => [a, b], bList), aList)
// birth date // birth date
function isDateWithinSomeDaysOfPeriod (period, date, days) { function isDateWithinSomeDaysOfPeriod (period, date, days) {
@ -38,18 +43,16 @@ function isBornTooLongSince (individual, dateObject, days) {
return !_.some(isWithinSomeYears, individual.birthDatePeriods) return !_.some(isWithinSomeYears, individual.birthDatePeriods)
} }
// string similarity // similarity algorithm
const stringMatch = _.curry(jaroWinkler) const stringSimilarity = _.curry(jaroWinkler)
const bestMatchInList = _.curry((list, name) => mapMax(stringMatch(name), list)) const wordSimilarity = (a, b) => {
const phoneticPairs = allPairs(a.phonetic, b.phonetic)
const aliasStringMatch = _.curry((candidate, alias) => { const phoneticMatch = _.map(_.spread(_.isEqual), phoneticPairs)
const matchWithCandidate = bestMatchInList(candidate.fullNames) if (_.some(_.identity, phoneticMatch)) return 1
return mapMax(matchWithCandidate, alias.fullNames) return stringSimilarity(a.value, b.value)
}) }
// algorithm
const similarity = _.curry((candidate, individual) => { const similarity = _.curry((candidate, individual) => {
// Calculate if his birth date is within two years of the given date. // Calculate if his birth date is within two years of the given date.
@ -62,31 +65,60 @@ const similarity = _.curry((candidate, individual) => {
// Calculate the Jaro-Winkler similarity of the full name. // Calculate the Jaro-Winkler similarity of the full name.
// If an individual has multiple aliases, use the maximum score. // If an individual has multiple aliases, use the maximum score.
const scoreAgainstCandidate = aliasStringMatch(candidate) const scoreCandidateFullName = _.flow(
const stringMatchScore = mapMax(scoreAgainstCandidate, individual.aliases) _.get('fullName'),
stringSimilarity(candidate.fullName)
)
const stringMatchScore = mapMax(scoreCandidateFullName, individual.aliases)
// // Calculate the Jaro-Winkler similarity of the phonetic representation of the full name. //
// // This should approximate the phonetic similarity of the two names.
// // If an individual has multiple aliases, use the maximum score.
// const phoneticMatchScore = mapMax(calcPhoneticMatchScore(candidate.phoneticFullName))(individual.aliases)
console.log(stringMatchScore) const candidateWords = candidate.fullNameWords
const numCandidateWords = candidateWords.length
return _.max([stringMatchScore]) const scoreCandidateWords = alias => {
const tooManyWords = _.flow(
_.get(['words', 'length']),
_.lt(numCandidateWords)
)
const parts = _.reject(tooManyWords, alias.parts)
const scorePartAt = _.curry((part, offset) => {
const words = _.slice(offset, offset + part.words.length, candidateWords)
return _.min(_.map(_.spread(wordSimilarity), _.zip(words, part.words)))
})
const scorePart = part => {
const offsets = _.range(0, (numCandidateWords - part.words.length) + 1)
return mapMax(scorePartAt(part), offsets)
}
const scores = _.orderBy([], 'desc', _.map(scorePart, parts))
const thresholdIndex = _.min([2, scores.length]) - 1
return scores[thresholdIndex]
}
const wordMatchScore = mapMax(scoreCandidateWords, individual.aliases)
console.log(stringMatchScore, wordMatchScore)
return _.max([stringMatchScore, wordMatchScore])
}) })
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
// Converts it to a list of parts: [{partName: 'firstName', value: 'john'}, ...]
function makeCompatible (nameParts) {
  // Normalise each value with _.lowerCase, the same transformation applied to
  // the documented name parts when the data sources are parsed, so that both
  // sides of the comparison are normalised the same way.
  const toPart = ([partName, value]) => ({partName, value: _.lowerCase(value)})
  return _.map(toPart, _.toPairs(nameParts))
}
function match (nameParts, birthDateString) { function match (nameParts, birthDateString) {
if (!individuals) { if (!individuals) {
const message = 'The OFAC data sources have not been loaded yet.' const message = 'The OFAC data sources have not been loaded yet.'
return Promise.reject(new Error(message)) return Promise.reject(new Error(message))
} }
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...} const parts = makeCompatible(nameParts)
const parts = _.mapValues(_.lowerCase, nameParts) const fullName = nameUtils.makeFullName(parts)
const fullNames = nameUtils.makeFullNames(parts) const fullNameWords = nameUtils.makeWords(fullName)
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
const phoneticFullNames = _.map(nameUtils.phonetic, fullNames)
// birthDateString is in YYYYMMDD format // birthDateString is in YYYYMMDD format
const year = parseInt(birthDateString.slice(0, 4)) const year = parseInt(birthDateString.slice(0, 4))
@ -96,11 +128,12 @@ function match (nameParts, birthDateString) {
const birthDate = {year, month, day, date} const birthDate = {year, month, day, date}
const candidate = {parts, fullNames, phoneticParts, phoneticFullNames, birthDate} const candidate = {parts, fullName, fullNameWords, birthDate}
debug_log(candidate)
const similarToCandidate = similarity(candidate) const similarToCandidate = similarity(candidate)
const result = mapMax(similarToCandidate, individuals) const result = mapMax(similarToCandidate, individuals)
debug_log(candidate) console.log(result)
return result return result
} }

View file

@ -1,46 +1,33 @@
const metaphone = require('talisman/phonetics/metaphone')
const doubleMetaphone = require('talisman/phonetics/double-metaphone') const doubleMetaphone = require('talisman/phonetics/double-metaphone')
const _ = require('lodash/fp') const _ = require('lodash/fp')
// KOSTIS TODO: Decide on a method. Remove the others // KOSTIS TODO: Decide on a method. Remove the others
const phoneticMethod1 = metaphone const makePhonetic = _.flow(doubleMetaphone, _.uniq)
const phoneticMethod2 = _.flow(doubleMetaphone, _.uniq)
const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2))
// Combine name-parts in a standard order. // Combine name-parts in a standard order.
const commonOrderings = [ const partOrdering = ['firstName', 'middleName', 'maidenName', 'patronymic', 'matronymic', 'lastName']
['firstName', 'lastName'],
['firstName', 'middleName', 'lastName'],
['firstName', 'maidenName', 'lastName'],
['firstName', 'patronymic', 'lastName'],
['firstName', 'matronymic', 'lastName']
]
// const getFrom = _.flip() const usingPartOrder = _.flow(
_.get('partName'),
const getFrom = _.curry((obj, key) => obj[key]) _.partialRight(_.indexOf, [partOrdering])
const getOrderedParts = (parts, ordering) => _.map(getFrom(parts), ordering)
const combineParts = _.curryN(2, _.flow(
getOrderedParts,
_.compact,
_.join(' ')
))
const makeAllOrderings = parts => _.map(combineParts(parts), commonOrderings)
const makeFullNames = _.flow(
makeAllOrderings,
_.uniq
) )
// Builds one full-name string from a list of {partName, value} parts:
// the parts are sorted with usingPartOrder and their values are joined
// with single spaces.
const makeFullName = parts => {
  const orderedParts = _.sortBy(usingPartOrder, parts)
  const orderedValues = _.map(_.get('value'), orderedParts)
  return _.join(' ', orderedValues)
}
// Splits a space-separated name into word records of the form
// {value: <word>, phonetic: <result of makePhonetic for that word>}.
const makeWords = value => {
  // split() yields empty strings for an empty value and for consecutive
  // spaces; an empty word carries no name information, so drop it instead of
  // letting it take part in word matching.
  const words = _.compact(_.split(' ', value))
  return _.map(word => ({value: word, phonetic: makePhonetic(word)}), words)
}
module.exports = { module.exports = {
makeFullNames, makeFullName,
phonetic: phoneticMethod3 makeWords
} }