Name matching logic
This commit is contained in:
parent
910d7e200f
commit
620863d703
3 changed files with 83 additions and 66 deletions
|
|
@ -50,7 +50,8 @@ const processDocumentedNamePart = _.curry((groupTypes, namePartNode) => {
|
||||||
const typeId = groupTypes.get(groupId)
|
const typeId = groupTypes.get(groupId)
|
||||||
const partName = partNames.get(typeId)
|
const partName = partNames.get(typeId)
|
||||||
const value = _.lowerCase(valueNode.$text)
|
const value = _.lowerCase(valueNode.$text)
|
||||||
return {[partName]: value}
|
const words = nameUtils.makeWords(value)
|
||||||
|
return {partName, value, words}
|
||||||
})
|
})
|
||||||
|
|
||||||
const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN)
|
const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN)
|
||||||
|
|
@ -68,15 +69,11 @@ const processAlias = _.curry((groupTypes, aliasNode) => {
|
||||||
}
|
}
|
||||||
|
|
||||||
const namePartNodes = latinNameNode.DocumentedNamePart
|
const namePartNodes = latinNameNode.DocumentedNamePart
|
||||||
const nameParts = _.map(getNamePart, namePartNodes)
|
const parts = _.map(getNamePart, namePartNodes)
|
||||||
|
|
||||||
const parts = _.assignAll(nameParts)
|
const fullName = nameUtils.makeFullName(parts)
|
||||||
const fullNames = nameUtils.makeFullNames(parts)
|
|
||||||
|
|
||||||
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
|
return {parts, fullName}
|
||||||
const phoneticFullNames = _.map(nameUtils.phonetic, fullNames)
|
|
||||||
|
|
||||||
return {parts, fullNames, phoneticParts, phoneticFullNames}
|
|
||||||
})
|
})
|
||||||
|
|
||||||
// birth date
|
// birth date
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,11 @@ function load () {
|
||||||
|
|
||||||
const mapMax = (iteratee, list) => _.max(_.map(iteratee, list))
|
const mapMax = (iteratee, list) => _.max(_.map(iteratee, list))
|
||||||
|
|
||||||
|
const allPairs = _.flow(
|
||||||
|
(aList, bList) => _.map(a => _.map(b => [a, b], bList), aList),
|
||||||
|
_.flatten
|
||||||
|
)
|
||||||
|
|
||||||
// birth date
|
// birth date
|
||||||
|
|
||||||
function isDateWithinSomeDaysOfPeriod (period, date, days) {
|
function isDateWithinSomeDaysOfPeriod (period, date, days) {
|
||||||
|
|
@ -38,18 +43,16 @@ function isBornTooLongSince (individual, dateObject, days) {
|
||||||
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
|
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
|
||||||
}
|
}
|
||||||
|
|
||||||
// string similarity
|
// similarity algorithm
|
||||||
|
|
||||||
const stringMatch = _.curry(jaroWinkler)
|
const stringSimilarity = _.curry(jaroWinkler)
|
||||||
|
|
||||||
const bestMatchInList = _.curry((list, name) => mapMax(stringMatch(name), list))
|
const wordSimilarity = (a, b) => {
|
||||||
|
const phoneticPairs = allPairs(a.phonetic, b.phonetic)
|
||||||
const aliasStringMatch = _.curry((candidate, alias) => {
|
const phoneticMatch = _.map(_.spread(_.isEqual), phoneticPairs)
|
||||||
const matchWithCandidate = bestMatchInList(candidate.fullNames)
|
if (_.some(_.identity, phoneticMatch)) return 1
|
||||||
return mapMax(matchWithCandidate, alias.fullNames)
|
return stringSimilarity(a.value, b.value)
|
||||||
})
|
}
|
||||||
|
|
||||||
// algorithm
|
|
||||||
|
|
||||||
const similarity = _.curry((candidate, individual) => {
|
const similarity = _.curry((candidate, individual) => {
|
||||||
// Calculate if his birth date is within two years of the given date.
|
// Calculate if his birth date is within two years of the given date.
|
||||||
|
|
@ -62,18 +65,50 @@ const similarity = _.curry((candidate, individual) => {
|
||||||
|
|
||||||
// Calculate the Jaro-Winkler similarity of the full name.
|
// Calculate the Jaro-Winkler similarity of the full name.
|
||||||
// If an individual has multiple aliases, use the maximum score.
|
// If an individual has multiple aliases, use the maximum score.
|
||||||
const scoreAgainstCandidate = aliasStringMatch(candidate)
|
const scoreCandidateFullName = _.flow(
|
||||||
const stringMatchScore = mapMax(scoreAgainstCandidate, individual.aliases)
|
_.get('fullName'),
|
||||||
|
stringSimilarity(candidate.fullName)
|
||||||
|
)
|
||||||
|
const stringMatchScore = mapMax(scoreCandidateFullName, individual.aliases)
|
||||||
|
|
||||||
// // Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
|
//
|
||||||
// // This should approximate the phonetic similarity of the two names.
|
|
||||||
// // If an individual has multiple aliases, use the maximum score.
|
|
||||||
// const phoneticMatchScore = mapMax(calcPhoneticMatchScore(candidate.phoneticFullName))(individual.aliases)
|
|
||||||
|
|
||||||
console.log(stringMatchScore)
|
const candidateWords = candidate.fullNameWords
|
||||||
|
const numCandidateWords = candidateWords.length
|
||||||
|
|
||||||
return _.max([stringMatchScore])
|
const scoreCandidateWords = alias => {
|
||||||
|
const tooManyWords = _.flow(
|
||||||
|
_.get(['words', 'length']),
|
||||||
|
_.lt(numCandidateWords)
|
||||||
|
)
|
||||||
|
const parts = _.reject(tooManyWords, alias.parts)
|
||||||
|
|
||||||
|
const scorePartAt = _.curry((part, offset) => {
|
||||||
|
const words = _.slice(offset, offset + part.words.length, candidateWords)
|
||||||
|
return _.min(_.map(_.spread(wordSimilarity), _.zip(words, part.words)))
|
||||||
})
|
})
|
||||||
|
const scorePart = part => {
|
||||||
|
const offsets = _.range(0, (numCandidateWords - part.words.length) + 1)
|
||||||
|
return mapMax(scorePartAt(part), offsets)
|
||||||
|
}
|
||||||
|
const scores = _.orderBy([], 'desc', _.map(scorePart, parts))
|
||||||
|
const thresholdIndex = _.min([2, scores.length]) - 1
|
||||||
|
return scores[thresholdIndex]
|
||||||
|
}
|
||||||
|
const wordMatchScore = mapMax(scoreCandidateWords, individual.aliases)
|
||||||
|
|
||||||
|
console.log(stringMatchScore, wordMatchScore)
|
||||||
|
|
||||||
|
return _.max([stringMatchScore, wordMatchScore])
|
||||||
|
})
|
||||||
|
|
||||||
|
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
|
||||||
|
function makeCompatible (nameParts) {
|
||||||
|
const partNames = _.keys(nameParts)
|
||||||
|
const values = _.values(nameParts)
|
||||||
|
const props = _.zipAll([partNames, values])
|
||||||
|
return _.map(_.zipObject(['partName', 'value']), props)
|
||||||
|
}
|
||||||
|
|
||||||
function match (nameParts, birthDateString) {
|
function match (nameParts, birthDateString) {
|
||||||
if (!individuals) {
|
if (!individuals) {
|
||||||
|
|
@ -81,12 +116,9 @@ function match (nameParts, birthDateString) {
|
||||||
return Promise.reject(new Error(message))
|
return Promise.reject(new Error(message))
|
||||||
}
|
}
|
||||||
|
|
||||||
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
|
const parts = makeCompatible(nameParts)
|
||||||
const parts = _.mapValues(_.lowerCase, nameParts)
|
const fullName = nameUtils.makeFullName(parts)
|
||||||
const fullNames = nameUtils.makeFullNames(parts)
|
const fullNameWords = nameUtils.makeWords(fullName)
|
||||||
|
|
||||||
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
|
|
||||||
const phoneticFullNames = _.map(nameUtils.phonetic, fullNames)
|
|
||||||
|
|
||||||
// birthDateString is in YYYYMMDD format
|
// birthDateString is in YYYYMMDD format
|
||||||
const year = parseInt(birthDateString.slice(0, 4))
|
const year = parseInt(birthDateString.slice(0, 4))
|
||||||
|
|
@ -96,11 +128,12 @@ function match (nameParts, birthDateString) {
|
||||||
|
|
||||||
const birthDate = {year, month, day, date}
|
const birthDate = {year, month, day, date}
|
||||||
|
|
||||||
const candidate = {parts, fullNames, phoneticParts, phoneticFullNames, birthDate}
|
const candidate = {parts, fullName, fullNameWords, birthDate}
|
||||||
|
debug_log(candidate)
|
||||||
|
|
||||||
const similarToCandidate = similarity(candidate)
|
const similarToCandidate = similarity(candidate)
|
||||||
const result = mapMax(similarToCandidate, individuals)
|
const result = mapMax(similarToCandidate, individuals)
|
||||||
debug_log(candidate)
|
console.log(result)
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,46 +1,33 @@
|
||||||
const metaphone = require('talisman/phonetics/metaphone')
|
|
||||||
const doubleMetaphone = require('talisman/phonetics/double-metaphone')
|
const doubleMetaphone = require('talisman/phonetics/double-metaphone')
|
||||||
const _ = require('lodash/fp')
|
const _ = require('lodash/fp')
|
||||||
|
|
||||||
// KOSTIS TODO: Decide on a method. Remove the others
|
// KOSTIS TODO: Decide on a method. Remove the others
|
||||||
|
|
||||||
const phoneticMethod1 = metaphone
|
const makePhonetic = _.flow(doubleMetaphone, _.uniq)
|
||||||
|
|
||||||
const phoneticMethod2 = _.flow(doubleMetaphone, _.uniq)
|
|
||||||
|
|
||||||
const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2))
|
|
||||||
|
|
||||||
// Combine name-parts in a standard order.
|
// Combine name-parts in a standard order.
|
||||||
|
|
||||||
const commonOrderings = [
|
const partOrdering = ['firstName', 'middleName', 'maidenName', 'patronymic', 'matronymic', 'lastName']
|
||||||
['firstName', 'lastName'],
|
|
||||||
['firstName', 'middleName', 'lastName'],
|
|
||||||
['firstName', 'maidenName', 'lastName'],
|
|
||||||
['firstName', 'patronymic', 'lastName'],
|
|
||||||
['firstName', 'matronymic', 'lastName']
|
|
||||||
]
|
|
||||||
|
|
||||||
// const getFrom = _.flip()
|
const usingPartOrder = _.flow(
|
||||||
|
_.get('partName'),
|
||||||
const getFrom = _.curry((obj, key) => obj[key])
|
_.partialRight(_.indexOf, [partOrdering])
|
||||||
|
|
||||||
const getOrderedParts = (parts, ordering) => _.map(getFrom(parts), ordering)
|
|
||||||
|
|
||||||
const combineParts = _.curryN(2, _.flow(
|
|
||||||
getOrderedParts,
|
|
||||||
_.compact,
|
|
||||||
_.join(' ')
|
|
||||||
))
|
|
||||||
|
|
||||||
const makeAllOrderings = parts => _.map(combineParts(parts), commonOrderings)
|
|
||||||
|
|
||||||
const makeFullNames = _.flow(
|
|
||||||
makeAllOrderings,
|
|
||||||
_.uniq
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const makeFullName = _.flow(
|
||||||
|
_.sortBy(usingPartOrder),
|
||||||
|
_.map(_.get('value')),
|
||||||
|
_.join(' ')
|
||||||
|
)
|
||||||
|
|
||||||
|
const makeWords = value => {
|
||||||
|
const words = _.split(' ', value)
|
||||||
|
const phonetic = _.map(makePhonetic, words)
|
||||||
|
const props = _.zipAll([words, phonetic])
|
||||||
|
return _.map(_.zipObject(['value', 'phonetic']), props)
|
||||||
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
makeFullNames,
|
makeFullName,
|
||||||
phonetic: phoneticMethod3
|
makeWords
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue