This commit is contained in:
Konstantin Mamalakis 2018-03-19 14:32:38 +02:00 committed by Josh Harvey
parent 577a85c9b1
commit f7561acf3c
4 changed files with 120 additions and 71 deletions

View file

@ -29,14 +29,14 @@ const isBornTooLongSince = _.curry((days, dateObject, individual) => {
// algorithm // algorithm
function match (structs, candidate, options) { function match (structs, candidate, options) {
const {threshold, ratio = 0.1, debug, verboseFor} = options const {threshold, fullNameThreshold, ratio = 0.5, debug, verboseFor} = options
const {fullName, words, birthDate} = candidate const {fullName, words, birthDate} = candidate
// Accept aliases whose full name matches. // Accept aliases whose full name matches.
const doesNameMatch = _.flow( const doesNameMatch = _.flow(
_.get('fullName'), _.get('fullName'),
stringSimilarity(fullName), stringSimilarity(fullName),
_.lte(threshold) _.lte(fullNameThreshold)
) )
const aliases = _.flatMap(_.get('aliases'), structs.individuals) const aliases = _.flatMap(_.get('aliases'), structs.individuals)
const aliasIdsFromFullName = _.flow( const aliasIdsFromFullName = _.flow(
@ -45,54 +45,50 @@ function match (structs, candidate, options) {
)(aliases) )(aliases)
const aliasIdCounts = new Map()
const phoneticWeight = ratio const phoneticWeight = ratio
const stringWeight = 1 - phoneticWeight const stringWeight = 1 - phoneticWeight
const matches = []
for (const word of words) { for (const word of words) {
const getPhonetic = phonetic => structs.phoneticMap.get(phonetic) const getPhonetic = phonetic => structs.phoneticMap.get(phonetic)
const phoneticMatches = new Set(_.flatMap(getPhonetic, word.phonetics)) const phoneticMatches = new Set(_.flatMap(getPhonetic, word.phonetics))
const aliasIds = new Set()
for (const wordEntry of structs.wordList) { for (const wordEntry of structs.wordList) {
const stringScore = stringSimilarity(word.value, wordEntry.value) const stringScore = stringSimilarity(word.value, wordEntry.value)
const verbose = _.includes(wordEntry.value, verboseFor) const verbose = _.includes(wordEntry.value, verboseFor)
if (!verbose && stringWeight * stringScore + phoneticWeight < threshold) continue
for (const aliasId of wordEntry.aliasIds) { for (const aliasId of wordEntry.aliasIds) {
const phoneticScore = phoneticMatches.has(aliasId) ? 1 : -1 const phoneticScore = phoneticMatches.has(aliasId) ? 1 : -1
// const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
const finalScore = stringScore + phoneticWeight * phoneticScore
verbose && console.log(finalScore.toFixed(2), stringScore.toFixed(2), phoneticScore.toFixed(2), word.value, wordEntry.value) verbose && console.log(finalScore.toFixed(2), stringScore.toFixed(2), phoneticScore.toFixed(2), word.value, wordEntry.value)
if (finalScore >= threshold) { if (finalScore >= threshold) {
aliasIds.add(aliasId) const entry = {aliasId, score: finalScore, word: word.value, value: wordEntry.value}
const index = _.sortedIndexBy(x => -x.score, entry, matches)
matches.splice(index, 0, entry)
} }
} }
} }
verboseFor && console.log(aliasIds)
for (const aliasId of aliasIds.values()) {
const count = aliasIdCounts.get(aliasId) || 0
aliasIdCounts.set(aliasId, count + 1)
}
} }
verboseFor && console.log(aliasIdCounts) const sameWord = (a, b) => a.aliasId === b.aliasId && a.word === b.word
const sameValue = (a, b) => a.aliasId === b.aliasId && a.value === b.value
const aliasIdsFromNamePart = [] const aliasIdsFromNamePart = _.flow(
_.uniqWith(sameWord),
for (const [aliasId, count] of aliasIdCounts) { _.uniqWith(sameValue),
const {length} = structs.aliasesMap.get(aliasId).words _.map(_.get('aliasId')),
if (count >= _.min([2, words.length, length])) { _.countBy(_.identity),
aliasIdsFromNamePart.push(aliasId) _.toPairs,
} _.filter(([aliasId, count]) => {
} const {length} = structs.aliasesMap.get(aliasId).words
return (count >= _.min([2, words.length, length]))
}),
_.map(_.first)
)(matches)
debug && debug_log(aliasIdsFromFullName) debug && debug_log(aliasIdsFromFullName)
debug && debug_log(aliasIdsFromNamePart) debug && debug_log(aliasIdsFromNamePart)

View file

@ -1,8 +1,6 @@
const doubleMetaphone = require('talisman/phonetics/double-metaphone') const doubleMetaphone = require('talisman/phonetics/double-metaphone')
const _ = require('lodash/fp') const _ = require('lodash/fp')
// KOSTIS TODO: Decide on a method. Remove the others
const makePhonetic = _.flow(doubleMetaphone, _.uniq) const makePhonetic = _.flow(doubleMetaphone, _.uniq)
// Combine name-parts in a standard order. // Combine name-parts in a standard order.

View file

@ -29,6 +29,7 @@ const transpose = (word, index) => {
} }
const alter = (word, index) => { const alter = (word, index) => {
if (word[index] === ' ') return word
const o = word.charCodeAt(index) const o = word.charCodeAt(index)
const collection = _.includes(o, vowels) ? vowels : consonants const collection = _.includes(o, vowels) ? vowels : consonants
const oo = _.sample(collection) const oo = _.sample(collection)
@ -115,6 +116,7 @@ const transcribe = word => {
} }
const threshold = 0.85 const threshold = 0.85
const fullNameThreshold = 0.95
describe('OFAC', function () { describe('OFAC', function () {
describe('Matching', function () { describe('Matching', function () {
@ -132,18 +134,19 @@ describe('OFAC', function () {
}) })
}) })
it('should match the exact full names of suspects', function () { it.skip('should match the exact full names of suspects', function () {
this.timeout(0) this.timeout(0)
for (const fullName of fullNames) { for (const fullName of fullNames) {
const matches = ofac.match({firstName: fullName}, null, { const matches = ofac.match({firstName: fullName}, null, {
threshold,//: 1 threshold,
fullNameThreshold,
}) })
assert.ok(!_.isEmpty(matches)) assert.ok(!_.isEmpty(matches))
} }
}) })
it('should match the permutated full names of suspects', function () { it.skip('should match the permutated full names of suspects', function () {
this.timeout(0) this.timeout(0)
for (const fullName of fullNames) { for (const fullName of fullNames) {
@ -154,7 +157,8 @@ describe('OFAC', function () {
)(fullName) )(fullName)
const matches = ofac.match({firstName: reversed}, null, { const matches = ofac.match({firstName: reversed}, null, {
threshold,//: 1 threshold,
fullNameThreshold,
}) })
assert.ok(!_.isEmpty(matches)) assert.ok(!_.isEmpty(matches))
} }
@ -163,6 +167,9 @@ describe('OFAC', function () {
it('should match despite some misspellings', function () { it('should match despite some misspellings', function () {
this.timeout(0) this.timeout(0)
let countMatches = 0
const failures = []
for (const fullName of fullNames) { for (const fullName of fullNames) {
const lightlyMisspelled = misspell(fullName) const lightlyMisspelled = misspell(fullName)
@ -173,54 +180,80 @@ describe('OFAC', function () {
)(fullName) )(fullName)
const matchesA = ofac.match({firstName: lightlyMisspelled}, null, { const matchesA = ofac.match({firstName: lightlyMisspelled}, null, {
threshold,//: 0.875 threshold,
fullNameThreshold,
}) })
if (_.isEmpty(matchesA)) {
console.log(fullName) if (!_.isEmpty(matchesA)) {
ofac.match({firstName: lightlyMisspelled}, null, { countMatches += 1
threshold,//: 0.875, }
debug: true else {
}) failures.push({fullName, misspelled: lightlyMisspelled})
} }
assert.ok(!_.isEmpty(matchesA))
const matchesB = ofac.match({firstName: heavilyMisspelled}, null, { const matchesB = ofac.match({firstName: heavilyMisspelled}, null, {
threshold: threshold - 0.1,//: 0.75 threshold: threshold - 0.1,//: 0.75
}) })
if (_.isEmpty(matchesB)) {
console.log(fullName) if (!_.isEmpty(matchesB)) {
ofac.match({firstName: heavilyMisspelled}, null, { countMatches += 1
threshold: threshold - 0.1,//: 0.75, }
debug: true else {
}) failures.push({fullName, heavy: true, misspelled: heavilyMisspelled})
} }
assert.ok(!_.isEmpty(matchesB))
} }
for (const failure of failures) {
const {fullName, heavy, misspelled} = failure
console.log("Original:", fullName)
ofac.match({firstName: misspelled}, null, {
threshold: threshold + (heavy ? -0.1 : 0),
debug: true
})
}
assert.equal(countMatches, fullNames.length * 2)
}) })
it('should match phonetically similar words', function () { it('should match phonetically similar words', function () {
this.timeout(0) this.timeout(0)
let countMatches = 0
const failures = []
for (const fullName of fullNames) { for (const fullName of fullNames) {
const transcribed = transcribe(fullName) const transcribed = transcribe(fullName)
if (!transcribed) { if (!transcribed) {
console.warn(`Couldn't find an appropriate phonetic alteration for '${fullName}'`) console.warn(`Couldn't find an appropriate phonetic alteration for '${fullName}'`)
countMatches += 1
continue continue
} }
const matches = ofac.match({firstName: transcribed}, null, { const matches = ofac.match({firstName: transcribed}, null, {
threshold,//: 0.85 threshold,
fullNameThreshold,
}) })
if (_.isEmpty(matches)) {
console.log(fullName) if (!_.isEmpty(matches)) {
ofac.match({firstName: transcribed}, null, { countMatches += 1
threshold,//: 0.85, }
debug: true else {
}) failures.push({fullName, misspelled: transcribed})
} }
assert.ok(!_.isEmpty(matches))
} }
for (const failure of failures) {
const {fullName, misspelled} = failure
console.log("Original:", fullName)
ofac.match({firstName: misspelled}, null, {
threshold,
fullNameThreshold,
debug: true
})
}
assert.equal(countMatches, fullNames.length)
}) })
it('should discard matches with inapropriate birthdates', function () { it('should discard matches with inapropriate birthdates', function () {
@ -239,7 +272,8 @@ describe('OFAC', function () {
for (const fullName of fullNames) { for (const fullName of fullNames) {
const matches = ofac.match({firstName: fullName}, dateString, { const matches = ofac.match({firstName: fullName}, dateString, {
threshold,//: 1 threshold,
fullNameThreshold,
}) })
assert.ok(noMatchesWithBirthDates(matches)) assert.ok(noMatchesWithBirthDates(matches))
} }
@ -262,44 +296,67 @@ describe('OFAC', function () {
const firstNamesMale = getNamesFromFile('dist.male.first.txt') const firstNamesMale = getNamesFromFile('dist.male.first.txt')
const firstNamesFemale = getNamesFromFile('dist.female.first.txt') const firstNamesFemale = getNamesFromFile('dist.female.first.txt')
let countMatches = 0
const failures = []
for (const lastName of lastNames.slice(0, 100)) { for (const lastName of lastNames.slice(0, 100)) {
for (firstName of firstNamesMale.slice(0, 100)) { for (firstName of firstNamesMale.slice(0, 100)) {
const matches = ofac.match({firstName, lastName}, null, { const matches = ofac.match({firstName, lastName}, null, {
threshold,//: 0.875 threshold,
fullNameThreshold,
}) })
if (!_.isEmpty(matches)) { if (!_.isEmpty(matches)) {
ofac.match({firstName, lastName}, null, { countMatches += 1
threshold,//: 0.875, failures.push({firstName, lastName})
debug: true
})
} }
assert.ok(_.isEmpty(matches))
} }
for (firstName of firstNamesFemale.slice(0, 100)) { for (firstName of firstNamesFemale.slice(0, 100)) {
const matches = ofac.match({firstName, lastName}, null, { const matches = ofac.match({firstName, lastName}, null, {
threshold,//: 0.875 threshold,
fullNameThreshold,
}) })
if (!_.isEmpty(matches)) { if (!_.isEmpty(matches)) {
ofac.match({firstName, lastName}, null, { countMatches += 1
threshold,//: 0.875, failures.push({firstName, lastName})
debug: true
})
} }
assert.ok(_.isEmpty(matches))
} }
} }
for (const failure of failures) {
ofac.match(failure, null, {
threshold,
fullNameThreshold,
debug: true
})
}
assert.equal(countMatches, 0)
}) })
it.skip('test', function () { it.skip('test', function () {
const firstName = 'hian chariapaporn' const firstName = 'hian chariapaporn'
ofac.match({firstName}, null, { ofac.match({firstName}, null, {
threshold,//: 0.875, threshold,
fullNameThreshold,
debug: true, debug: true,
verboseFor: ['hiran', 'chariapaporn'] verboseFor: ['hiran', 'chariapaporn']
}) })
}) })
// Skipped manual debugging case: runs the matcher once for the name
// 'janice smith' with debug output enabled and verbose logging requested
// for the list words 'samih' and 'anis'. It makes no assertions.
it.skip('test', function () {
  const candidate = {firstName: 'janice smith'}
  const options = {
    threshold,
    fullNameThreshold,
    debug: true,
    verboseFor: ['samih', 'anis']
  }
  ofac.match(candidate, null, options)
})
}) })
}) })

View file

@ -180,8 +180,6 @@ describe('OFAC', function () {
.then(([individuals]) => { .then(([individuals]) => {
assert.ok(Array.isArray(individuals)) assert.ok(Array.isArray(individuals))
assert.equal(individuals.length, 2) assert.equal(individuals.length, 2)
console.log(JSON.stringify(individuals[0]))
console.log(JSON.stringify(individualA))
assert.deepEqual(individuals, [individualA, individualB]) assert.deepEqual(individuals, [individualA, individualB])
}) })
}) })