Name-part matching now combines Jaro and Double Metaphone scores
This commit is contained in:
parent
f00516ce2e
commit
793db0f449
5 changed files with 54 additions and 78 deletions
|
|
@ -1,9 +1,9 @@
|
|||
const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
|
||||
const jaro = require('talisman/metrics/distance/jaro')
|
||||
const _ = require('lodash/fp')
|
||||
|
||||
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
|
||||
|
||||
const stringSimilarity = _.curry(jaroWinkler)
|
||||
const stringSimilarity = _.curry(jaro)
|
||||
|
||||
// birth date
|
||||
|
||||
|
|
@ -29,7 +29,7 @@ const isBornTooLongSince = _.curry((days, dateObject, individual) => {
|
|||
// algorithm
|
||||
|
||||
function match (structs, candidate, threshold) {
|
||||
const {fullName, wordPhonetics, wordValues, birthDate} = candidate
|
||||
const {fullName, words, birthDate} = candidate
|
||||
|
||||
// Accept aliases whose full name matches.
|
||||
const doesNameMatch = _.flow(
|
||||
|
|
@ -43,53 +43,42 @@ function match (structs, candidate, threshold) {
|
|||
_.map(_.get('id'))
|
||||
)(aliases)
|
||||
|
||||
// Gather aliases whose name-parts match phonetically.
|
||||
const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
|
||||
const phoneticMatches = _.flow(
|
||||
_.map(wordPhonetic => {
|
||||
const {word, phonetic} = wordPhonetic
|
||||
const matches = getPhoneticMatches(phonetic)
|
||||
return _.map(match => ({...match, word}), matches)
|
||||
}),
|
||||
_.compact,
|
||||
// _.map(_.uniqWith((a, b) => a.aliasId === b.aliasId)),
|
||||
_.flatten
|
||||
)(wordPhonetics)
|
||||
|
||||
// Gather aliases whose name-parts match alphabetically.
|
||||
const getStringMatches = value => {
|
||||
const entryMatches = entry => (stringSimilarity(value, entry.value) >= threshold)
|
||||
return _.filter(entryMatches, structs.wordList)
|
||||
}
|
||||
const getSingleEntries = wordEntry => {
|
||||
const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
|
||||
return _.map(makeEntry, wordEntry.aliasIds)
|
||||
}
|
||||
const stringMatches = _.flow(
|
||||
_.flatMap(getStringMatches),
|
||||
_.flatMap(getSingleEntries)
|
||||
)(wordValues)
|
||||
const aliasIds = []
|
||||
const phoneticWeight = 0.17
|
||||
const stringWeight = 1 - phoneticWeight
|
||||
|
||||
// At least two name-parts must match per alias
|
||||
const adequateMatch = ([aliasId, count]) => {
|
||||
const alias = structs.aliasesMap.get(aliasId)
|
||||
return count >= Math.min(2, alias.words.length)
|
||||
for (const word of words) {
|
||||
const getPhonetic = phonetic => structs.phoneticMap.get(phonetic)
|
||||
const phoneticMatches = new Set(_.flatMap(getPhonetic, word.phonetics))
|
||||
|
||||
for (const wordEntry of structs.wordList) {
|
||||
const stringScore = stringSimilarity(word.value, wordEntry.value)
|
||||
|
||||
if (stringWeight * stringScore + phoneticWeight < threshold) continue
|
||||
|
||||
for (const aliasId of wordEntry.aliasIds) {
|
||||
const phoneticScore = phoneticMatches.has(aliasId) ? 1 : 0
|
||||
const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
|
||||
|
||||
if (finalScore >= threshold) {
|
||||
aliasIds.push(aliasId)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const aliasIdsFromNamePart = _.flow(
|
||||
_.uniqWith((a, b) => (
|
||||
(a.value === b.value && a.aliasId === b.aliasId) ||
|
||||
(a.word === b.word && a.aliasId === b.aliasId)
|
||||
)),
|
||||
_.map(_.get('aliasId')),
|
||||
_.countBy(_.identity),
|
||||
_.toPairs,
|
||||
_.filter(adequateMatch),
|
||||
_.reject(_.flow(
|
||||
_.last,
|
||||
_.gt(2)
|
||||
)),
|
||||
_.map(_.first)
|
||||
)([...phoneticMatches, ...stringMatches])
|
||||
)(aliasIds)
|
||||
|
||||
// debug_log(aliasIdsFromFullName)
|
||||
// debug_log(phoneticMatches)
|
||||
// debug_log(stringMatches)
|
||||
// debug_log(aliasIdsFromNamePart)
|
||||
|
||||
// Get the full record for each matched id
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue