Name-part matching now combines Jaro and Double Metaphone scores

This commit is contained in:
Konstantin Mamalakis 2018-03-13 22:07:44 +02:00 committed by Josh Harvey
parent f00516ce2e
commit 793db0f449
5 changed files with 54 additions and 78 deletions

View file

@ -37,7 +37,7 @@ function load () {
function makeCompatible (nameParts) { function makeCompatible (nameParts) {
const partNames = _.keys(nameParts) const partNames = _.keys(nameParts)
const values = _.values(nameParts) const values = _.map(_.lowerCase, _.values(nameParts))
const props = _.zipAll([partNames, values]) const props = _.zipAll([partNames, values])
return _.map(_.zipObject(['partName', 'value']), props) return _.map(_.zipObject(['partName', 'value']), props)
} }
@ -54,12 +54,6 @@ function match (nameParts, birthDateString, threshold) {
const fullName = nameUtils.makeFullName(parts) const fullName = nameUtils.makeFullName(parts)
const words = nameUtils.makeWords(fullName) const words = nameUtils.makeWords(fullName)
const wordValues = _.map(_.get('value'), words)
const wordPhonetics = _.flatMap(word => {
const {phonetics} = word
return _.map(phonetic => ({word: word.value, phonetic}), phonetics)
}, words)
// birthDateString is in YYYYMMDD format // birthDateString is in YYYYMMDD format
const birthDate = _.cond([ const birthDate = _.cond([
[_.identity, () => { [_.identity, () => {
@ -73,7 +67,7 @@ function match (nameParts, birthDateString, threshold) {
[_.stubTrue, () => null] [_.stubTrue, () => null]
])(birthDateString) ])(birthDateString)
const candidate = {parts, fullName, wordValues, wordPhonetics, birthDate} const candidate = {parts, fullName, words, birthDate}
// debug_log(candidate) // debug_log(candidate)
const result = matcher.match(structs, candidate, threshold) const result = matcher.match(structs, candidate, threshold)

View file

@ -1,9 +1,9 @@
const jaroWinkler = require('talisman/metrics/distance/jaro-winkler') const jaro = require('talisman/metrics/distance/jaro')
const _ = require('lodash/fp') const _ = require('lodash/fp')
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
const stringSimilarity = _.curry(jaroWinkler) const stringSimilarity = _.curry(jaro)
// birth date // birth date
@ -29,7 +29,7 @@ const isBornTooLongSince = _.curry((days, dateObject, individual) => {
// algorithm // algorithm
function match (structs, candidate, threshold) { function match (structs, candidate, threshold) {
const {fullName, wordPhonetics, wordValues, birthDate} = candidate const {fullName, words, birthDate} = candidate
// Accept aliases whose full name matches. // Accept aliases whose full name matches.
const doesNameMatch = _.flow( const doesNameMatch = _.flow(
@ -43,53 +43,42 @@ function match (structs, candidate, threshold) {
_.map(_.get('id')) _.map(_.get('id'))
)(aliases) )(aliases)
// Gather aliases whose name-parts match phonetically.
const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
const phoneticMatches = _.flow(
_.map(wordPhonetic => {
const {word, phonetic} = wordPhonetic
const matches = getPhoneticMatches(phonetic)
return _.map(match => ({...match, word}), matches)
}),
_.compact,
// _.map(_.uniqWith((a, b) => a.aliasId === b.aliasId)),
_.flatten
)(wordPhonetics)
// Gather aliases whose name-parts match alphabetically. const aliasIds = []
const getStringMatches = value => { const phoneticWeight = 0.17
const entryMatches = entry => (stringSimilarity(value, entry.value) >= threshold) const stringWeight = 1 - phoneticWeight
return _.filter(entryMatches, structs.wordList)
}
const getSingleEntries = wordEntry => {
const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
return _.map(makeEntry, wordEntry.aliasIds)
}
const stringMatches = _.flow(
_.flatMap(getStringMatches),
_.flatMap(getSingleEntries)
)(wordValues)
// At least two name-parts must match per alias for (const word of words) {
const adequateMatch = ([aliasId, count]) => { const getPhonetic = phonetic => structs.phoneticMap.get(phonetic)
const alias = structs.aliasesMap.get(aliasId) const phoneticMatches = new Set(_.flatMap(getPhonetic, word.phonetics))
return count >= Math.min(2, alias.words.length)
for (const wordEntry of structs.wordList) {
const stringScore = stringSimilarity(word.value, wordEntry.value)
if (stringWeight * stringScore + phoneticWeight < threshold) continue
for (const aliasId of wordEntry.aliasIds) {
const phoneticScore = phoneticMatches.has(aliasId) ? 1 : 0
const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
if (finalScore >= threshold) {
aliasIds.push(aliasId)
}
}
}
} }
const aliasIdsFromNamePart = _.flow( const aliasIdsFromNamePart = _.flow(
_.uniqWith((a, b) => (
(a.value === b.value && a.aliasId === b.aliasId) ||
(a.word === b.word && a.aliasId === b.aliasId)
)),
_.map(_.get('aliasId')),
_.countBy(_.identity), _.countBy(_.identity),
_.toPairs, _.toPairs,
_.filter(adequateMatch), _.reject(_.flow(
_.last,
_.gt(2)
)),
_.map(_.first) _.map(_.first)
)([...phoneticMatches, ...stringMatches]) )(aliasIds)
// debug_log(aliasIdsFromFullName) // debug_log(aliasIdsFromFullName)
// debug_log(phoneticMatches)
// debug_log(stringMatches)
// debug_log(aliasIdsFromNamePart) // debug_log(aliasIdsFromNamePart)
// Get the full record for each matched id // Get the full record for each matched id

View file

@ -31,7 +31,7 @@ const partNames = new Map([
]) ])
const filteredWords = [ const filteredWords = [
'al' // 'al'
] ]
// group-id to type-id // group-id to type-id
@ -187,7 +187,10 @@ const producePhoneticMap = _.flow(
mapAliases(getPhoneticEntries), mapAliases(getPhoneticEntries),
_.flatten, _.flatten,
_.groupBy(_.get('phonetic')), _.groupBy(_.get('phonetic')),
_.mapValues(_.map(_.pick(['value', 'aliasId']))), _.mapValues(_.flow(
_.map(_.get('aliasId')),
_.uniq
)),
_.toPairs, _.toPairs,
entries => new Map(entries) entries => new Map(entries)
) )

View file

@ -152,7 +152,6 @@ describe('OFAC', function () {
it('should match despite some misspellings', function () { it('should match despite some misspellings', function () {
this.timeout(0) this.timeout(0)
this.retries(4)
for (const fullName of fullNames) { for (const fullName of fullNames) {
const lightlyMisspelled = misspell(fullName) const lightlyMisspelled = misspell(fullName)
@ -163,17 +162,18 @@ describe('OFAC', function () {
_.join(' ') _.join(' ')
)(fullName) )(fullName)
const matchesA = ofac.match({firstName: lightlyMisspelled}, null, 0.90) const matchesA = ofac.match({firstName: lightlyMisspelled}, null, 0.85)
if (matchesA.length === 0) { console.log(1, fullName, '|', lightlyMisspelled) }
assert.ok(matchesA.length > 0) assert.ok(matchesA.length > 0)
const matchesB = ofac.match({firstName: heavilyMisspelled}, null, 0.80) const matchesB = ofac.match({firstName: heavilyMisspelled}, null, 0.75)
if (matchesB.length === 0) { console.log(2, fullName, '|', heavilyMisspelled) }
assert.ok(matchesB.length > 0) assert.ok(matchesB.length > 0)
} }
}) })
it('should match phonetically similar words', function () { it('should match phonetically similar words', function () {
this.timeout(0) this.timeout(0)
this.retries(4)
for (const fullName of fullNames) { for (const fullName of fullNames) {
const transcribed = transcribe(fullName) const transcribed = transcribe(fullName)
@ -183,7 +183,8 @@ describe('OFAC', function () {
continue continue
} }
const matches = ofac.match({firstName: transcribed}, null, 1) const matches = ofac.match({firstName: transcribed}, null, 0.85)
if (matches.length === 0) { console.log(fullName, '|', transcribed) }
assert.ok(!_.isEmpty(matches)) assert.ok(!_.isEmpty(matches))
} }
}) })
@ -227,14 +228,12 @@ describe('OFAC', function () {
for (const lastName of lastNames.slice(0, 100)) { for (const lastName of lastNames.slice(0, 100)) {
for (firstName of firstNamesMale.slice(0, 100)) { for (firstName of firstNamesMale.slice(0, 100)) {
const matches = ofac.match({firstName, lastName}, null, 0.8) const matches = ofac.match({firstName, lastName}, null, 0.85)
console.log({firstName, lastName})
assert.ok(_.isEmpty(matches)) assert.ok(_.isEmpty(matches))
} }
for (firstName of firstNamesFemale.slice(0, 100)) { for (firstName of firstNamesFemale.slice(0, 100)) {
const matches = ofac.match({firstName, lastName}, null, 0.8) const matches = ofac.match({firstName, lastName}, null, 0.85)
console.log({firstName, lastName})
assert.ok(_.isEmpty(matches)) assert.ok(_.isEmpty(matches))
} }
} }

View file

@ -176,11 +176,11 @@ describe('OFAC', function () {
assert.ok(phoneticMap instanceof Map) assert.ok(phoneticMap instanceof Map)
assert.equal(phoneticMap.size, 3) assert.equal(phoneticMap.size, 3)
assert.ok(phoneticMap.has('JN')) assert.ok(phoneticMap.has('JN'))
assert.deepEqual(phoneticMap.get('JN'), [{value: 'john', aliasId: '5'}]) assert.deepEqual(phoneticMap.get('JN'), ['5'])
assert.ok(phoneticMap.has('AN')) assert.ok(phoneticMap.has('AN'))
assert.deepEqual(phoneticMap.get('AN'), [{value: 'john', aliasId: '5'}]) assert.deepEqual(phoneticMap.get('AN'), ['5'])
assert.ok(phoneticMap.has('T')) assert.ok(phoneticMap.has('T'))
assert.deepEqual(phoneticMap.get('T'), [{value: 'doe', aliasId: '5'}]) assert.deepEqual(phoneticMap.get('T'), ['5'])
const {wordList} = structs const {wordList} = structs
assert.ok(Array.isArray(wordList)) assert.ok(Array.isArray(wordList))
@ -222,20 +222,11 @@ describe('OFAC', function () {
assert.ok(phoneticMap instanceof Map) assert.ok(phoneticMap instanceof Map)
assert.equal(phoneticMap.size, 4) assert.equal(phoneticMap.size, 4)
assert.ok(phoneticMap.has('JN')) assert.ok(phoneticMap.has('JN'))
assert.deepEqual(phoneticMap.get('JN'), [ assert.deepEqual(phoneticMap.get('JN'), ['5', '15'])
{value: 'john', aliasId: '5'},
{value: 'john', aliasId: '15'}
])
assert.ok(phoneticMap.has('AN')) assert.ok(phoneticMap.has('AN'))
assert.deepEqual(phoneticMap.get('AN'), [ assert.deepEqual(phoneticMap.get('AN'), ['5', '15'])
{value: 'john', aliasId: '5'},
{value: 'john', aliasId: '15'}
])
assert.ok(phoneticMap.has('T')) assert.ok(phoneticMap.has('T'))
assert.deepEqual(phoneticMap.get('T'), [ assert.deepEqual(phoneticMap.get('T'), ['5', '15'])
{value: 'doe', aliasId: '5'},
{value: 'de', aliasId: '15'}
])
const {wordList} = structs const {wordList} = structs
assert.ok(Array.isArray(wordList)) assert.ok(Array.isArray(wordList))
@ -274,11 +265,11 @@ describe('OFAC', function () {
assert.ok(phoneticMap instanceof Map) assert.ok(phoneticMap instanceof Map)
assert.equal(phoneticMap.size, 3) assert.equal(phoneticMap.size, 3)
assert.ok(phoneticMap.has('JN')) assert.ok(phoneticMap.has('JN'))
assert.deepEqual(phoneticMap.get('JN'), [{value: 'john', aliasId: '5'}]) assert.deepEqual(phoneticMap.get('JN'), ['5'])
assert.ok(phoneticMap.has('AN')) assert.ok(phoneticMap.has('AN'))
assert.deepEqual(phoneticMap.get('AN'), [{value: 'john', aliasId: '5'}]) assert.deepEqual(phoneticMap.get('AN'), ['5'])
assert.ok(phoneticMap.has('T')) assert.ok(phoneticMap.has('T'))
assert.deepEqual(phoneticMap.get('T'), [{value: 'doe', aliasId: '5'}]) assert.deepEqual(phoneticMap.get('T'), ['5'])
const {wordList} = structs const {wordList} = structs
assert.ok(Array.isArray(wordList)) assert.ok(Array.isArray(wordList))