From 793db0f449a51c55b4cd261220dccdf449f2f041 Mon Sep 17 00:00:00 2001 From: Konstantin Mamalakis Date: Tue, 13 Mar 2018 22:07:44 +0200 Subject: [PATCH] Name-part matching now combines Jaro and Double Metaphone scores --- lib/ofac/index.js | 10 ++---- lib/ofac/matching.js | 71 ++++++++++++++++++------------------------ lib/ofac/parsing.js | 7 +++-- tests/ofac/matching.js | 17 +++++----- tests/ofac/parsing.js | 27 ++++++---------- 5 files changed, 54 insertions(+), 78 deletions(-) diff --git a/lib/ofac/index.js b/lib/ofac/index.js index 5392ec55..a7c23673 100644 --- a/lib/ofac/index.js +++ b/lib/ofac/index.js @@ -37,7 +37,7 @@ function load () { function makeCompatible (nameParts) { const partNames = _.keys(nameParts) - const values = _.values(nameParts) + const values = _.map(_.lowerCase, _.values(nameParts)) const props = _.zipAll([partNames, values]) return _.map(_.zipObject(['partName', 'value']), props) } @@ -54,12 +54,6 @@ function match (nameParts, birthDateString, threshold) { const fullName = nameUtils.makeFullName(parts) const words = nameUtils.makeWords(fullName) - const wordValues = _.map(_.get('value'), words) - const wordPhonetics = _.flatMap(word => { - const {phonetics} = word - return _.map(phonetic => ({word: word.value, phonetic}), phonetics) - }, words) - // birthDateString is in YYYYMMDD format const birthDate = _.cond([ [_.identity, () => { @@ -73,7 +67,7 @@ function match (nameParts, birthDateString, threshold) { [_.stubTrue, () => null] ])(birthDateString) - const candidate = {parts, fullName, wordValues, wordPhonetics, birthDate} + const candidate = {parts, fullName, words, birthDate} // debug_log(candidate) const result = matcher.match(structs, candidate, threshold) diff --git a/lib/ofac/matching.js b/lib/ofac/matching.js index aaf78acf..bcf5023e 100644 --- a/lib/ofac/matching.js +++ b/lib/ofac/matching.js @@ -1,9 +1,9 @@ -const jaroWinkler = require('talisman/metrics/distance/jaro-winkler') +const jaro = require('talisman/metrics/distance/jaro') const _ = require('lodash/fp') const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove -const stringSimilarity = _.curry(jaroWinkler) +const stringSimilarity = _.curry(jaro) // birth date @@ -29,7 +29,7 @@ const isBornTooLongSince = _.curry((days, dateObject, individual) => { // algorithm function match (structs, candidate, threshold) { - const {fullName, wordPhonetics, wordValues, birthDate} = candidate + const {fullName, words, birthDate} = candidate // Accept aliases who's full name matches. const doesNameMatch = _.flow( @@ -43,53 +43,42 @@ function match (structs, candidate, threshold) { _.map(_.get('id')) )(aliases) - // Gather aliases who's name-parts match phonetically. - const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic) - const phoneticMatches = _.flow( - _.map(wordPhonetic => { - const {word, phonetic} = wordPhonetic - const matches = getPhoneticMatches(phonetic) - return _.map(match => ({...match, word}), matches) - }), - _.compact, - // _.map(_.uniqWith((a, b) => a.aliasId === b.aliasId)), - _.flatten - )(wordPhonetics) - // Gether aliases whose name-parts match alphabetically. - const getStringMatches = value => { - const entryMatches = entry => (stringSimilarity(value, entry.value) >= threshold) - return _.filter(entryMatches, structs.wordList) - } - const getSingleEntries = wordEntry => { - const makeEntry = aliasId => ({value: wordEntry.value, aliasId}) - return _.map(makeEntry, wordEntry.aliasIds) - } - const stringMatches = _.flow( - _.flatMap(getStringMatches), - _.flatMap(getSingleEntries) - )(wordValues) + const aliasIds = [] + const phoneticWeight = 0.17 + const stringWeight = 1 - phoneticWeight - // At least two name-parts must match per alias - const adequateMatch = ([aliasId, count]) => { - const alias = structs.aliasesMap.get(aliasId) - return count >= Math.min(2, alias.words.length) + for (const word of words) { + const getPhonetic = phonetic => structs.phoneticMap.get(phonetic) + const phoneticMatches = new Set(_.flatMap(getPhonetic, word.phonetics)) + + for (const wordEntry of structs.wordList) { + const stringScore = stringSimilarity(word.value, wordEntry.value) + + if (stringWeight * stringScore + phoneticWeight < threshold) continue + + for (const aliasId of wordEntry.aliasIds) { + const phoneticScore = phoneticMatches.has(aliasId) ? 1 : 0 + const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore + + if (finalScore >= threshold) { + aliasIds.push(aliasId) + } + } + } } + const aliasIdsFromNamePart = _.flow( - _.uniqWith((a, b) => ( - (a.value === b.value && a.aliasId === b.aliasId) || - (a.word === b.word && a.aliasId === b.aliasId) - )), - _.map(_.get('aliasId')), _.countBy(_.identity), _.toPairs, - _.filter(adequateMatch), + _.reject(_.flow( + _.last, + _.gt(2) + )), _.map(_.first) - )([...phoneticMatches, ...stringMatches]) + )(aliasIds) // debug_log(aliasIdsFromFullName) - // debug_log(phoneticMatches) - // debug_log(stringMatches) // debug_log(aliasIdsFromNamePart) // Get the full record for each matched id diff --git a/lib/ofac/parsing.js b/lib/ofac/parsing.js index 7cb5b7c7..dc987738 100644 --- a/lib/ofac/parsing.js +++ b/lib/ofac/parsing.js @@ -31,7 +31,7 @@ const partNames = new Map([ ]) const filteredWords = [ - 'al' + // 'al' ] // group-id to type-id @@ -187,7 +187,10 @@ const producePhoneticMap = _.flow( mapAliases(getPhoneticEntries), _.flatten, _.groupBy(_.get('phonetic')), - _.mapValues(_.map(_.pick(['value', 'aliasId']))), + _.mapValues(_.flow( + _.map(_.get('aliasId')), + _.uniq + )), _.toPairs, entries => new Map(entries) ) diff --git a/tests/ofac/matching.js b/tests/ofac/matching.js index 3b54926d..5690c72b 100644 --- a/tests/ofac/matching.js +++ b/tests/ofac/matching.js @@ -152,7 +152,6 @@ describe('OFAC', function () { it('should match despite some misspellings', function () { this.timeout(0) - this.retries(4) for (const fullName of fullNames) { const lightlyMisspelled = misspell(fullName) @@ -163,17 +162,18 @@ describe('OFAC', function () { _.join(' ') )(fullName) - const matchesA = ofac.match({firstName: lightlyMisspelled}, null, 0.90) + const matchesA = ofac.match({firstName: lightlyMisspelled}, null, 0.85) + if (matchesA.length === 0) { console.log(1, fullName, '|', lightlyMisspelled) } assert.ok(matchesA.length > 0) - const matchesB = ofac.match({firstName: heavilyMisspelled}, null, 0.80) + const matchesB = ofac.match({firstName: heavilyMisspelled}, null, 0.75) + if (matchesB.length === 0) { console.log(2, fullName, '|', heavilyMisspelled) } assert.ok(matchesB.length > 0) } }) it('should match phonetically similar words', function () { this.timeout(0) - this.retries(4) for (const fullName of fullNames) { const transcribed = transcribe(fullName) @@ -183,7 +183,8 @@ describe('OFAC', function () { continue } - const matches = ofac.match({firstName: transcribed}, null, 1) + const matches = ofac.match({firstName: transcribed}, null, 0.85) + if (matches.length === 0) { console.log(fullName, '|', transcribed) } assert.ok(!_.isEmpty(matches)) } }) @@ -227,14 +228,12 @@ describe('OFAC', function () { for (const lastName of lastNames.slice(0, 100)) { for (firstName of firstNamesMale.slice(0, 100)) { - const matches = ofac.match({firstName, lastName}, null, 0.8) - console.log({firstName, lastName}) + const matches = ofac.match({firstName, lastName}, null, 0.85) assert.ok(_.isEmpty(matches)) } for (firstName of firstNamesFemale.slice(0, 100)) { - const matches = ofac.match({firstName, lastName}, null, 0.8) - console.log({firstName, lastName}) + const matches = ofac.match({firstName, lastName}, null, 0.85) assert.ok(_.isEmpty(matches)) } } diff --git a/tests/ofac/parsing.js b/tests/ofac/parsing.js index eda58e4d..290de12a 100644 --- a/tests/ofac/parsing.js +++ b/tests/ofac/parsing.js @@ -176,11 +176,11 @@ describe('OFAC', function () { assert.ok(phoneticMap instanceof Map) assert.equal(phoneticMap.size, 3) assert.ok(phoneticMap.has('JN')) - assert.deepEqual(phoneticMap.get('JN'), [{value: 'john', aliasId: '5'}]) + assert.deepEqual(phoneticMap.get('JN'), ['5']) assert.ok(phoneticMap.has('AN')) - assert.deepEqual(phoneticMap.get('AN'), [{value: 'john', aliasId: '5'}]) + assert.deepEqual(phoneticMap.get('AN'), ['5']) assert.ok(phoneticMap.has('T')) - assert.deepEqual(phoneticMap.get('T'), [{value: 'doe', aliasId: '5'}]) + assert.deepEqual(phoneticMap.get('T'), ['5']) const {wordList} = structs assert.ok(Array.isArray(wordList)) @@ -222,20 +222,11 @@ describe('OFAC', function () { assert.ok(phoneticMap instanceof Map) assert.equal(phoneticMap.size, 4) assert.ok(phoneticMap.has('JN')) - assert.deepEqual(phoneticMap.get('JN'), [ - {value: 'john', aliasId: '5'}, - {value: 'john', aliasId: '15'} - ]) + assert.deepEqual(phoneticMap.get('JN'), ['5', '15']) assert.ok(phoneticMap.has('AN')) - assert.deepEqual(phoneticMap.get('AN'), [ - {value: 'john', aliasId: '5'}, - {value: 'john', aliasId: '15'} - ]) + assert.deepEqual(phoneticMap.get('AN'), ['5', '15']) assert.ok(phoneticMap.has('T')) - assert.deepEqual(phoneticMap.get('T'), [ - {value: 'doe', aliasId: '5'}, - {value: 'de', aliasId: '15'} - ]) + assert.deepEqual(phoneticMap.get('T'), ['5', '15']) const {wordList} = structs assert.ok(Array.isArray(wordList)) @@ -274,11 +265,11 @@ describe('OFAC', function () { assert.ok(phoneticMap instanceof Map) assert.equal(phoneticMap.size, 3) assert.ok(phoneticMap.has('JN')) - assert.deepEqual(phoneticMap.get('JN'), [{value: 'john', aliasId: '5'}]) + assert.deepEqual(phoneticMap.get('JN'), ['5']) assert.ok(phoneticMap.has('AN')) - assert.deepEqual(phoneticMap.get('AN'), [{value: 'john', aliasId: '5'}]) + assert.deepEqual(phoneticMap.get('AN'), ['5']) assert.ok(phoneticMap.has('T')) - assert.deepEqual(phoneticMap.get('T'), [{value: 'doe', aliasId: '5'}]) + assert.deepEqual(phoneticMap.get('T'), ['5']) const {wordList} = structs assert.ok(Array.isArray(wordList))