Name-part matching now combines Jaro and Double Metaphone scores
This commit is contained in:
parent
f00516ce2e
commit
793db0f449
5 changed files with 54 additions and 78 deletions
|
|
@ -37,7 +37,7 @@ function load () {
|
||||||
|
|
||||||
function makeCompatible (nameParts) {
|
function makeCompatible (nameParts) {
|
||||||
const partNames = _.keys(nameParts)
|
const partNames = _.keys(nameParts)
|
||||||
const values = _.values(nameParts)
|
const values = _.map(_.lowerCase, _.values(nameParts))
|
||||||
const props = _.zipAll([partNames, values])
|
const props = _.zipAll([partNames, values])
|
||||||
return _.map(_.zipObject(['partName', 'value']), props)
|
return _.map(_.zipObject(['partName', 'value']), props)
|
||||||
}
|
}
|
||||||
|
|
@ -54,12 +54,6 @@ function match (nameParts, birthDateString, threshold) {
|
||||||
const fullName = nameUtils.makeFullName(parts)
|
const fullName = nameUtils.makeFullName(parts)
|
||||||
const words = nameUtils.makeWords(fullName)
|
const words = nameUtils.makeWords(fullName)
|
||||||
|
|
||||||
const wordValues = _.map(_.get('value'), words)
|
|
||||||
const wordPhonetics = _.flatMap(word => {
|
|
||||||
const {phonetics} = word
|
|
||||||
return _.map(phonetic => ({word: word.value, phonetic}), phonetics)
|
|
||||||
}, words)
|
|
||||||
|
|
||||||
// birthDateString is in YYYYMMDD format
|
// birthDateString is in YYYYMMDD format
|
||||||
const birthDate = _.cond([
|
const birthDate = _.cond([
|
||||||
[_.identity, () => {
|
[_.identity, () => {
|
||||||
|
|
@ -73,7 +67,7 @@ function match (nameParts, birthDateString, threshold) {
|
||||||
[_.stubTrue, () => null]
|
[_.stubTrue, () => null]
|
||||||
])(birthDateString)
|
])(birthDateString)
|
||||||
|
|
||||||
const candidate = {parts, fullName, wordValues, wordPhonetics, birthDate}
|
const candidate = {parts, fullName, words, birthDate}
|
||||||
// debug_log(candidate)
|
// debug_log(candidate)
|
||||||
|
|
||||||
const result = matcher.match(structs, candidate, threshold)
|
const result = matcher.match(structs, candidate, threshold)
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,9 @@
|
||||||
const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
|
const jaro = require('talisman/metrics/distance/jaro')
|
||||||
const _ = require('lodash/fp')
|
const _ = require('lodash/fp')
|
||||||
|
|
||||||
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
|
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
|
||||||
|
|
||||||
const stringSimilarity = _.curry(jaroWinkler)
|
const stringSimilarity = _.curry(jaro)
|
||||||
|
|
||||||
// birth date
|
// birth date
|
||||||
|
|
||||||
|
|
@ -29,7 +29,7 @@ const isBornTooLongSince = _.curry((days, dateObject, individual) => {
|
||||||
// algorithm
|
// algorithm
|
||||||
|
|
||||||
function match (structs, candidate, threshold) {
|
function match (structs, candidate, threshold) {
|
||||||
const {fullName, wordPhonetics, wordValues, birthDate} = candidate
|
const {fullName, words, birthDate} = candidate
|
||||||
|
|
||||||
// Accept aliases whose full name matches.
|
// Accept aliases whose full name matches.
|
||||||
const doesNameMatch = _.flow(
|
const doesNameMatch = _.flow(
|
||||||
|
|
@ -43,53 +43,42 @@ function match (structs, candidate, threshold) {
|
||||||
_.map(_.get('id'))
|
_.map(_.get('id'))
|
||||||
)(aliases)
|
)(aliases)
|
||||||
|
|
||||||
// Gather aliases whose name-parts match phonetically.
|
|
||||||
const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
|
|
||||||
const phoneticMatches = _.flow(
|
|
||||||
_.map(wordPhonetic => {
|
|
||||||
const {word, phonetic} = wordPhonetic
|
|
||||||
const matches = getPhoneticMatches(phonetic)
|
|
||||||
return _.map(match => ({...match, word}), matches)
|
|
||||||
}),
|
|
||||||
_.compact,
|
|
||||||
// _.map(_.uniqWith((a, b) => a.aliasId === b.aliasId)),
|
|
||||||
_.flatten
|
|
||||||
)(wordPhonetics)
|
|
||||||
|
|
||||||
// Gather aliases whose name-parts match alphabetically.
|
const aliasIds = []
|
||||||
const getStringMatches = value => {
|
const phoneticWeight = 0.17
|
||||||
const entryMatches = entry => (stringSimilarity(value, entry.value) >= threshold)
|
const stringWeight = 1 - phoneticWeight
|
||||||
return _.filter(entryMatches, structs.wordList)
|
|
||||||
}
|
|
||||||
const getSingleEntries = wordEntry => {
|
|
||||||
const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
|
|
||||||
return _.map(makeEntry, wordEntry.aliasIds)
|
|
||||||
}
|
|
||||||
const stringMatches = _.flow(
|
|
||||||
_.flatMap(getStringMatches),
|
|
||||||
_.flatMap(getSingleEntries)
|
|
||||||
)(wordValues)
|
|
||||||
|
|
||||||
// At least two name-parts must match per alias
|
for (const word of words) {
|
||||||
const adequateMatch = ([aliasId, count]) => {
|
const getPhonetic = phonetic => structs.phoneticMap.get(phonetic)
|
||||||
const alias = structs.aliasesMap.get(aliasId)
|
const phoneticMatches = new Set(_.flatMap(getPhonetic, word.phonetics))
|
||||||
return count >= Math.min(2, alias.words.length)
|
|
||||||
|
for (const wordEntry of structs.wordList) {
|
||||||
|
const stringScore = stringSimilarity(word.value, wordEntry.value)
|
||||||
|
|
||||||
|
if (stringWeight * stringScore + phoneticWeight < threshold) continue
|
||||||
|
|
||||||
|
for (const aliasId of wordEntry.aliasIds) {
|
||||||
|
const phoneticScore = phoneticMatches.has(aliasId) ? 1 : 0
|
||||||
|
const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
|
||||||
|
|
||||||
|
if (finalScore >= threshold) {
|
||||||
|
aliasIds.push(aliasId)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const aliasIdsFromNamePart = _.flow(
|
const aliasIdsFromNamePart = _.flow(
|
||||||
_.uniqWith((a, b) => (
|
|
||||||
(a.value === b.value && a.aliasId === b.aliasId) ||
|
|
||||||
(a.word === b.word && a.aliasId === b.aliasId)
|
|
||||||
)),
|
|
||||||
_.map(_.get('aliasId')),
|
|
||||||
_.countBy(_.identity),
|
_.countBy(_.identity),
|
||||||
_.toPairs,
|
_.toPairs,
|
||||||
_.filter(adequateMatch),
|
_.reject(_.flow(
|
||||||
|
_.last,
|
||||||
|
_.gt(2)
|
||||||
|
)),
|
||||||
_.map(_.first)
|
_.map(_.first)
|
||||||
)([...phoneticMatches, ...stringMatches])
|
)(aliasIds)
|
||||||
|
|
||||||
// debug_log(aliasIdsFromFullName)
|
// debug_log(aliasIdsFromFullName)
|
||||||
// debug_log(phoneticMatches)
|
|
||||||
// debug_log(stringMatches)
|
|
||||||
// debug_log(aliasIdsFromNamePart)
|
// debug_log(aliasIdsFromNamePart)
|
||||||
|
|
||||||
// Get the full record for each matched id
|
// Get the full record for each matched id
|
||||||
|
|
|
||||||
|
|
@ -31,7 +31,7 @@ const partNames = new Map([
|
||||||
])
|
])
|
||||||
|
|
||||||
const filteredWords = [
|
const filteredWords = [
|
||||||
'al'
|
// 'al'
|
||||||
]
|
]
|
||||||
|
|
||||||
// group-id to type-id
|
// group-id to type-id
|
||||||
|
|
@ -187,7 +187,10 @@ const producePhoneticMap = _.flow(
|
||||||
mapAliases(getPhoneticEntries),
|
mapAliases(getPhoneticEntries),
|
||||||
_.flatten,
|
_.flatten,
|
||||||
_.groupBy(_.get('phonetic')),
|
_.groupBy(_.get('phonetic')),
|
||||||
_.mapValues(_.map(_.pick(['value', 'aliasId']))),
|
_.mapValues(_.flow(
|
||||||
|
_.map(_.get('aliasId')),
|
||||||
|
_.uniq
|
||||||
|
)),
|
||||||
_.toPairs,
|
_.toPairs,
|
||||||
entries => new Map(entries)
|
entries => new Map(entries)
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -152,7 +152,6 @@ describe('OFAC', function () {
|
||||||
|
|
||||||
it('should match despite some misspellings', function () {
|
it('should match despite some misspellings', function () {
|
||||||
this.timeout(0)
|
this.timeout(0)
|
||||||
this.retries(4)
|
|
||||||
|
|
||||||
for (const fullName of fullNames) {
|
for (const fullName of fullNames) {
|
||||||
const lightlyMisspelled = misspell(fullName)
|
const lightlyMisspelled = misspell(fullName)
|
||||||
|
|
@ -163,17 +162,18 @@ describe('OFAC', function () {
|
||||||
_.join(' ')
|
_.join(' ')
|
||||||
)(fullName)
|
)(fullName)
|
||||||
|
|
||||||
const matchesA = ofac.match({firstName: lightlyMisspelled}, null, 0.90)
|
const matchesA = ofac.match({firstName: lightlyMisspelled}, null, 0.85)
|
||||||
|
if (matchesA.length === 0) { console.log(1, fullName, '|', lightlyMisspelled) }
|
||||||
assert.ok(matchesA.length > 0)
|
assert.ok(matchesA.length > 0)
|
||||||
|
|
||||||
const matchesB = ofac.match({firstName: heavilyMisspelled}, null, 0.80)
|
const matchesB = ofac.match({firstName: heavilyMisspelled}, null, 0.75)
|
||||||
|
if (matchesB.length === 0) { console.log(2, fullName, '|', heavilyMisspelled) }
|
||||||
assert.ok(matchesB.length > 0)
|
assert.ok(matchesB.length > 0)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
it('should match phonetically similar words', function () {
|
it('should match phonetically similar words', function () {
|
||||||
this.timeout(0)
|
this.timeout(0)
|
||||||
this.retries(4)
|
|
||||||
|
|
||||||
for (const fullName of fullNames) {
|
for (const fullName of fullNames) {
|
||||||
const transcribed = transcribe(fullName)
|
const transcribed = transcribe(fullName)
|
||||||
|
|
@ -183,7 +183,8 @@ describe('OFAC', function () {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
const matches = ofac.match({firstName: transcribed}, null, 1)
|
const matches = ofac.match({firstName: transcribed}, null, 0.85)
|
||||||
|
if (matches.length === 0) { console.log(fullName, '|', transcribed) }
|
||||||
assert.ok(!_.isEmpty(matches))
|
assert.ok(!_.isEmpty(matches))
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
@ -227,14 +228,12 @@ describe('OFAC', function () {
|
||||||
|
|
||||||
for (const lastName of lastNames.slice(0, 100)) {
|
for (const lastName of lastNames.slice(0, 100)) {
|
||||||
for (firstName of firstNamesMale.slice(0, 100)) {
|
for (firstName of firstNamesMale.slice(0, 100)) {
|
||||||
const matches = ofac.match({firstName, lastName}, null, 0.8)
|
const matches = ofac.match({firstName, lastName}, null, 0.85)
|
||||||
console.log({firstName, lastName})
|
|
||||||
assert.ok(_.isEmpty(matches))
|
assert.ok(_.isEmpty(matches))
|
||||||
}
|
}
|
||||||
|
|
||||||
for (firstName of firstNamesFemale.slice(0, 100)) {
|
for (firstName of firstNamesFemale.slice(0, 100)) {
|
||||||
const matches = ofac.match({firstName, lastName}, null, 0.8)
|
const matches = ofac.match({firstName, lastName}, null, 0.85)
|
||||||
console.log({firstName, lastName})
|
|
||||||
assert.ok(_.isEmpty(matches))
|
assert.ok(_.isEmpty(matches))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -176,11 +176,11 @@ describe('OFAC', function () {
|
||||||
assert.ok(phoneticMap instanceof Map)
|
assert.ok(phoneticMap instanceof Map)
|
||||||
assert.equal(phoneticMap.size, 3)
|
assert.equal(phoneticMap.size, 3)
|
||||||
assert.ok(phoneticMap.has('JN'))
|
assert.ok(phoneticMap.has('JN'))
|
||||||
assert.deepEqual(phoneticMap.get('JN'), [{value: 'john', aliasId: '5'}])
|
assert.deepEqual(phoneticMap.get('JN'), ['5'])
|
||||||
assert.ok(phoneticMap.has('AN'))
|
assert.ok(phoneticMap.has('AN'))
|
||||||
assert.deepEqual(phoneticMap.get('AN'), [{value: 'john', aliasId: '5'}])
|
assert.deepEqual(phoneticMap.get('AN'), ['5'])
|
||||||
assert.ok(phoneticMap.has('T'))
|
assert.ok(phoneticMap.has('T'))
|
||||||
assert.deepEqual(phoneticMap.get('T'), [{value: 'doe', aliasId: '5'}])
|
assert.deepEqual(phoneticMap.get('T'), ['5'])
|
||||||
|
|
||||||
const {wordList} = structs
|
const {wordList} = structs
|
||||||
assert.ok(Array.isArray(wordList))
|
assert.ok(Array.isArray(wordList))
|
||||||
|
|
@ -222,20 +222,11 @@ describe('OFAC', function () {
|
||||||
assert.ok(phoneticMap instanceof Map)
|
assert.ok(phoneticMap instanceof Map)
|
||||||
assert.equal(phoneticMap.size, 4)
|
assert.equal(phoneticMap.size, 4)
|
||||||
assert.ok(phoneticMap.has('JN'))
|
assert.ok(phoneticMap.has('JN'))
|
||||||
assert.deepEqual(phoneticMap.get('JN'), [
|
assert.deepEqual(phoneticMap.get('JN'), ['5', '15'])
|
||||||
{value: 'john', aliasId: '5'},
|
|
||||||
{value: 'john', aliasId: '15'}
|
|
||||||
])
|
|
||||||
assert.ok(phoneticMap.has('AN'))
|
assert.ok(phoneticMap.has('AN'))
|
||||||
assert.deepEqual(phoneticMap.get('AN'), [
|
assert.deepEqual(phoneticMap.get('AN'), ['5', '15'])
|
||||||
{value: 'john', aliasId: '5'},
|
|
||||||
{value: 'john', aliasId: '15'}
|
|
||||||
])
|
|
||||||
assert.ok(phoneticMap.has('T'))
|
assert.ok(phoneticMap.has('T'))
|
||||||
assert.deepEqual(phoneticMap.get('T'), [
|
assert.deepEqual(phoneticMap.get('T'), ['5', '15'])
|
||||||
{value: 'doe', aliasId: '5'},
|
|
||||||
{value: 'de', aliasId: '15'}
|
|
||||||
])
|
|
||||||
|
|
||||||
const {wordList} = structs
|
const {wordList} = structs
|
||||||
assert.ok(Array.isArray(wordList))
|
assert.ok(Array.isArray(wordList))
|
||||||
|
|
@ -274,11 +265,11 @@ describe('OFAC', function () {
|
||||||
assert.ok(phoneticMap instanceof Map)
|
assert.ok(phoneticMap instanceof Map)
|
||||||
assert.equal(phoneticMap.size, 3)
|
assert.equal(phoneticMap.size, 3)
|
||||||
assert.ok(phoneticMap.has('JN'))
|
assert.ok(phoneticMap.has('JN'))
|
||||||
assert.deepEqual(phoneticMap.get('JN'), [{value: 'john', aliasId: '5'}])
|
assert.deepEqual(phoneticMap.get('JN'), ['5'])
|
||||||
assert.ok(phoneticMap.has('AN'))
|
assert.ok(phoneticMap.has('AN'))
|
||||||
assert.deepEqual(phoneticMap.get('AN'), [{value: 'john', aliasId: '5'}])
|
assert.deepEqual(phoneticMap.get('AN'), ['5'])
|
||||||
assert.ok(phoneticMap.has('T'))
|
assert.ok(phoneticMap.has('T'))
|
||||||
assert.deepEqual(phoneticMap.get('T'), [{value: 'doe', aliasId: '5'}])
|
assert.deepEqual(phoneticMap.get('T'), ['5'])
|
||||||
|
|
||||||
const {wordList} = structs
|
const {wordList} = structs
|
||||||
assert.ok(Array.isArray(wordList))
|
assert.ok(Array.isArray(wordList))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue