Matching algorithm
This commit is contained in:
parent
2f8c798304
commit
1d0aff07fe
2 changed files with 97 additions and 91 deletions
|
|
@ -158,36 +158,22 @@ function promiseParseDocument (source) {
|
||||||
|
|
||||||
const readdir = util.promisify(fs.readdir)
|
const readdir = util.promisify(fs.readdir)
|
||||||
|
|
||||||
// const {id, individual, words} = result
|
|
||||||
//
|
|
||||||
// const individualEntry = [id, individual]
|
|
||||||
// individuals.push(individualEntry)
|
|
||||||
//
|
|
||||||
// const phoneticWithWord = pair => {
|
|
||||||
// const [word, phonetics] = pair
|
|
||||||
// const makeEntry = phonetic => ({word, phonetic, individualId: id})
|
|
||||||
// return _.map(makeEntry, phonetics)
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// const phoneticEntries = _.flatten(_.map(phoneticWithWord, words))
|
|
||||||
// allPhonetics.push(...phoneticEntries)
|
|
||||||
|
|
||||||
const mapAliases = _.curry((iteratee, individuals) => {
|
const mapAliases = _.curry((iteratee, individuals) => {
|
||||||
const foreachIndividual = individual => {
|
const mapIndividual = individual => {
|
||||||
const {id, aliases} = individual
|
const {id, aliases} = individual
|
||||||
return _.map(alias => iteratee(id, alias), aliases)
|
return _.map(alias => iteratee(id, alias), aliases)
|
||||||
}
|
}
|
||||||
return _.flatten(_.map(foreachIndividual, individuals))
|
return _.flatMap(mapIndividual, individuals)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
const getPhoneticEntries = (individualId, alias) => {
|
const getPhoneticEntries = (individualId, alias) => {
|
||||||
const pairPhoneticsWithWords = word => {
|
const pairPhoneticsWithValues = word => {
|
||||||
const {value, phonetics} = word
|
const {value, phonetics} = word
|
||||||
const makeEntry = phonetic => ({value, phonetic, aliasId: alias.id})
|
const makeEntry = phonetic => ({value, phonetic, aliasId: alias.id})
|
||||||
return _.map(makeEntry, phonetics)
|
return _.map(makeEntry, phonetics)
|
||||||
}
|
}
|
||||||
return _.flatten(_.map(pairPhoneticsWithWords, alias.words))
|
return _.flatMap(pairPhoneticsWithValues, alias.words)
|
||||||
}
|
}
|
||||||
|
|
||||||
const producePhoneticMap = _.flow(
|
const producePhoneticMap = _.flow(
|
||||||
|
|
@ -219,6 +205,13 @@ const combineAndDedupe = _.flow(
|
||||||
_.compact,
|
_.compact,
|
||||||
_.uniqBy(_.get('id')),
|
_.uniqBy(_.get('id')),
|
||||||
individuals => {
|
individuals => {
|
||||||
|
const individualsMap = _.flow(
|
||||||
|
_.groupBy(_.get('id')),
|
||||||
|
_.mapValues(_.first),
|
||||||
|
_.toPairs,
|
||||||
|
entries => new Map(entries)
|
||||||
|
)(individuals)
|
||||||
|
|
||||||
const getIdPairs = (individualId, alias) => [alias.id, individualId]
|
const getIdPairs = (individualId, alias) => [alias.id, individualId]
|
||||||
const idPairs = mapAliases(getIdPairs, individuals)
|
const idPairs = mapAliases(getIdPairs, individuals)
|
||||||
const aliasToIndividual = new Map(idPairs)
|
const aliasToIndividual = new Map(idPairs)
|
||||||
|
|
@ -226,7 +219,13 @@ const combineAndDedupe = _.flow(
|
||||||
const phoneticMap = producePhoneticMap(individuals)
|
const phoneticMap = producePhoneticMap(individuals)
|
||||||
const wordList = produceWordList(individuals)
|
const wordList = produceWordList(individuals)
|
||||||
|
|
||||||
return {individuals, aliasToIndividual, phoneticMap, wordList}
|
return {
|
||||||
|
individuals,
|
||||||
|
individualsMap,
|
||||||
|
aliasToIndividual,
|
||||||
|
phoneticMap,
|
||||||
|
wordList
|
||||||
|
}
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -16,12 +16,9 @@ function load () {
|
||||||
|
|
||||||
// MATCHING
|
// MATCHING
|
||||||
|
|
||||||
const mapMax = (iteratee, list) => _.max(_.map(iteratee, list))
|
// similarity algorithm
|
||||||
|
|
||||||
const allPairs = _.flow(
|
const stringSimilarity = _.curry(jaroWinkler)
|
||||||
(aList, bList) => _.map(a => _.map(b => [a, b], bList), aList),
|
|
||||||
_.flatten
|
|
||||||
)
|
|
||||||
|
|
||||||
// birth date
|
// birth date
|
||||||
|
|
||||||
|
|
@ -37,72 +34,14 @@ function isDateWithinSomeDaysOfPeriod (period, date, days) {
|
||||||
return (startDate < date && date < endDate)
|
return (startDate < date && date < endDate)
|
||||||
}
|
}
|
||||||
|
|
||||||
function isBornTooLongSince (individual, dateObject, days) {
|
const isBornTooLongSince = _.curry((days, dateObject, individual) => {
|
||||||
if (_.isEmpty(individual.birthDatePeriods)) return false
|
if (_.isEmpty(individual.birthDatePeriods)) return false
|
||||||
const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
|
const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
|
||||||
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
|
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
|
||||||
}
|
|
||||||
|
|
||||||
// similarity algorithm
|
|
||||||
|
|
||||||
const stringSimilarity = _.curry(jaroWinkler)
|
|
||||||
|
|
||||||
const wordSimilarity = (a, b) => {
|
|
||||||
const phoneticPairs = allPairs(a.phonetic, b.phonetic)
|
|
||||||
const phoneticMatch = _.map(_.spread(_.isEqual), phoneticPairs)
|
|
||||||
if (_.some(_.identity, phoneticMatch)) return 1
|
|
||||||
return stringSimilarity(a.value, b.value)
|
|
||||||
}
|
|
||||||
|
|
||||||
const similarity = _.curry((candidate, individual) => {
|
|
||||||
// Calculate if his birth date is within two years of the given date.
|
|
||||||
// If an individual has multiple birth-date periods, return whether any are
|
|
||||||
// within two years. Reject individuals who don't match this criterion.
|
|
||||||
const twoYears = 365 * 2
|
|
||||||
if (isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0
|
|
||||||
|
|
||||||
debug_log(individual)
|
|
||||||
|
|
||||||
// Calculate the Jaro-Winkler similarity of the full name.
|
|
||||||
// If an individual has multiple aliases, use the maximum score.
|
|
||||||
const scoreCandidateFullName = _.flow(
|
|
||||||
_.get('fullName'),
|
|
||||||
stringSimilarity(candidate.fullName)
|
|
||||||
)
|
|
||||||
const stringMatchScore = mapMax(scoreCandidateFullName, individual.aliases)
|
|
||||||
|
|
||||||
//
|
|
||||||
|
|
||||||
const candidateWords = candidate.fullNameWords
|
|
||||||
const numCandidateWords = candidateWords.length
|
|
||||||
|
|
||||||
const scoreCandidateWords = alias => {
|
|
||||||
const tooManyWords = _.flow(
|
|
||||||
_.get(['words', 'length']),
|
|
||||||
_.lt(numCandidateWords)
|
|
||||||
)
|
|
||||||
const parts = _.reject(tooManyWords, alias.parts)
|
|
||||||
|
|
||||||
const scorePartAt = _.curry((part, offset) => {
|
|
||||||
const words = _.slice(offset, offset + part.words.length, candidateWords)
|
|
||||||
return _.min(_.map(_.spread(wordSimilarity), _.zip(words, part.words)))
|
|
||||||
})
|
|
||||||
const scorePart = part => {
|
|
||||||
const offsets = _.range(0, (numCandidateWords - part.words.length) + 1)
|
|
||||||
return mapMax(scorePartAt(part), offsets)
|
|
||||||
}
|
|
||||||
const scores = _.orderBy([], 'desc', _.map(scorePart, parts))
|
|
||||||
const thresholdIndex = _.min([2, scores.length]) - 1
|
|
||||||
return scores[thresholdIndex]
|
|
||||||
}
|
|
||||||
const wordMatchScore = mapMax(scoreCandidateWords, individual.aliases)
|
|
||||||
|
|
||||||
console.log(stringMatchScore, wordMatchScore)
|
|
||||||
|
|
||||||
return _.max([stringMatchScore, wordMatchScore])
|
|
||||||
})
|
})
|
||||||
|
|
||||||
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
|
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
|
||||||
|
|
||||||
function makeCompatible (nameParts) {
|
function makeCompatible (nameParts) {
|
||||||
const partNames = _.keys(nameParts)
|
const partNames = _.keys(nameParts)
|
||||||
const values = _.values(nameParts)
|
const values = _.values(nameParts)
|
||||||
|
|
@ -110,15 +49,22 @@ function makeCompatible (nameParts) {
|
||||||
return _.map(_.zipObject(['partName', 'value']), props)
|
return _.map(_.zipObject(['partName', 'value']), props)
|
||||||
}
|
}
|
||||||
|
|
||||||
function match (nameParts, birthDateString) {
|
// algorithm
|
||||||
|
|
||||||
|
function match (nameParts, birthDateString, threshold) {
|
||||||
if (!structs) {
|
if (!structs) {
|
||||||
const message = 'The OFAC data sources have not been loaded yet.'
|
const message = 'The OFAC data sources have not been loaded yet.'
|
||||||
return Promise.reject(new Error(message))
|
return Promise.reject(new Error(message))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prepare the input data
|
||||||
|
|
||||||
const parts = makeCompatible(nameParts)
|
const parts = makeCompatible(nameParts)
|
||||||
const fullName = nameUtils.makeFullName(parts)
|
const fullName = nameUtils.makeFullName(parts)
|
||||||
const fullNameWords = nameUtils.makeWords(fullName)
|
const words = nameUtils.makeWords(fullName)
|
||||||
|
|
||||||
|
const wordValues = _.map(_.get('value'), words)
|
||||||
|
const wordPhonetics = _.flatten(_.map(_.get('phonetics'), words))
|
||||||
|
|
||||||
// birthDateString is in YYYYMMDD format
|
// birthDateString is in YYYYMMDD format
|
||||||
const year = parseInt(birthDateString.slice(0, 4))
|
const year = parseInt(birthDateString.slice(0, 4))
|
||||||
|
|
@ -128,12 +74,73 @@ function match (nameParts, birthDateString) {
|
||||||
|
|
||||||
const birthDate = {year, month, day, date}
|
const birthDate = {year, month, day, date}
|
||||||
|
|
||||||
const candidate = {parts, fullName, fullNameWords, birthDate}
|
debug_log({parts, fullName, wordValues, wordPhonetics, birthDate})
|
||||||
debug_log(candidate)
|
|
||||||
|
|
||||||
const similarToCandidate = similarity(candidate)
|
// Start matching
|
||||||
const result = mapMax(similarToCandidate, structs.individuals)
|
|
||||||
console.log(result)
|
// Accept aliases whose full name matches.
|
||||||
|
const doesNameMatch = _.flow(
|
||||||
|
_.get('fullName'),
|
||||||
|
stringSimilarity(fullName),
|
||||||
|
_.lte(threshold)
|
||||||
|
)
|
||||||
|
const aliases = _.flatMap(_.get('aliases'), structs.individuals)
|
||||||
|
const aliasIdsFromFullName = _.flow(
|
||||||
|
_.filter(doesNameMatch),
|
||||||
|
|
||||||
|
_.map(_.get('id'))
|
||||||
|
)(aliases)
|
||||||
|
|
||||||
|
// Gather aliases whose name-parts match phonetically.
|
||||||
|
const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
|
||||||
|
const phoneticMatches = _.flow(
|
||||||
|
_.map(getPhoneticMatches),
|
||||||
|
_.compact,
|
||||||
|
_.flatten
|
||||||
|
)(wordPhonetics)
|
||||||
|
|
||||||
|
// Gather aliases whose name-parts match alphabetically.
|
||||||
|
const getStringMatches = value => {
|
||||||
|
const entryMatches = entry => (jaroWinkler(value, entry.value) >= threshold)
|
||||||
|
return _.filter(entryMatches, structs.wordList)
|
||||||
|
}
|
||||||
|
const getSingleEntries = wordEntry => {
|
||||||
|
const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
|
||||||
|
return _.map(makeEntry, wordEntry.aliasIds)
|
||||||
|
}
|
||||||
|
const stringMatches = _.flow(
|
||||||
|
_.map(getStringMatches),
|
||||||
|
_.flatten,
|
||||||
|
_.map(getSingleEntries),
|
||||||
|
_.flatten
|
||||||
|
)(wordValues)
|
||||||
|
|
||||||
|
// At least two name-parts must match per alias
|
||||||
|
const aliasIdsFromNamePart = _.flow(
|
||||||
|
_.uniqWith(_.isEqual),
|
||||||
|
_.map(_.get('aliasId')),
|
||||||
|
_.countBy(_.identity),
|
||||||
|
_.toPairs,
|
||||||
|
_.filter(_.flow(_.last, _.lte(2))),
|
||||||
|
_.map(_.first)
|
||||||
|
)([...phoneticMatches, ...stringMatches])
|
||||||
|
|
||||||
|
// Get the full record for each matched id
|
||||||
|
const getIndividual = aliasId => {
|
||||||
|
const individualId = structs.aliasToIndividual.get(aliasId)
|
||||||
|
return structs.individualsMap.get(individualId)
|
||||||
|
}
|
||||||
|
const suspects = _.uniq(_.map(getIndividual, [
|
||||||
|
...aliasIdsFromFullName,
|
||||||
|
...aliasIdsFromNamePart
|
||||||
|
]))
|
||||||
|
|
||||||
|
// Reject everyone whose birth date is more than two years away from the given birth date.
|
||||||
|
const twoYears = 365 * 2
|
||||||
|
const unqualified = isBornTooLongSince(twoYears, birthDate)
|
||||||
|
const result = _.reject(unqualified, suspects)
|
||||||
|
|
||||||
|
debug_log(result)
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue