Matching algorithm
This commit is contained in:
parent
2f8c798304
commit
1d0aff07fe
2 changed files with 97 additions and 91 deletions
|
|
@ -158,36 +158,22 @@ function promiseParseDocument (source) {
|
|||
|
||||
const readdir = util.promisify(fs.readdir)
|
||||
|
||||
// const {id, individual, words} = result
|
||||
//
|
||||
// const individualEntry = [id, individual]
|
||||
// individuals.push(individualEntry)
|
||||
//
|
||||
// const phoneticWithWord = pair => {
|
||||
// const [word, phonetics] = pair
|
||||
// const makeEntry = phonetic => ({word, phonetic, individualId: id})
|
||||
// return _.map(makeEntry, phonetics)
|
||||
// }
|
||||
//
|
||||
// const phoneticEntries = _.flatten(_.map(phoneticWithWord, words))
|
||||
// allPhonetics.push(...phoneticEntries)
|
||||
|
||||
const mapAliases = _.curry((iteratee, individuals) => {
|
||||
const foreachIndividual = individual => {
|
||||
const mapIndividual = individual => {
|
||||
const {id, aliases} = individual
|
||||
return _.map(alias => iteratee(id, alias), aliases)
|
||||
}
|
||||
return _.flatten(_.map(foreachIndividual, individuals))
|
||||
return _.flatMap(mapIndividual, individuals)
|
||||
})
|
||||
|
||||
|
||||
const getPhoneticEntries = (individualId, alias) => {
|
||||
const pairPhoneticsWithWords = word => {
|
||||
const pairPhoneticsWithValues = word => {
|
||||
const {value, phonetics} = word
|
||||
const makeEntry = phonetic => ({value, phonetic, aliasId: alias.id})
|
||||
return _.map(makeEntry, phonetics)
|
||||
}
|
||||
return _.flatten(_.map(pairPhoneticsWithWords, alias.words))
|
||||
return _.flatMap(pairPhoneticsWithValues, alias.words)
|
||||
}
|
||||
|
||||
const producePhoneticMap = _.flow(
|
||||
|
|
@ -219,6 +205,13 @@ const combineAndDedupe = _.flow(
|
|||
_.compact,
|
||||
_.uniqBy(_.get('id')),
|
||||
individuals => {
|
||||
const individualsMap = _.flow(
|
||||
_.groupBy(_.get('id')),
|
||||
_.mapValues(_.first),
|
||||
_.toPairs,
|
||||
entries => new Map(entries)
|
||||
)(individuals)
|
||||
|
||||
const getIdPairs = (individualId, alias) => [alias.id, individualId]
|
||||
const idPairs = mapAliases(getIdPairs, individuals)
|
||||
const aliasToIndividual = new Map(idPairs)
|
||||
|
|
@ -226,7 +219,13 @@ const combineAndDedupe = _.flow(
|
|||
const phoneticMap = producePhoneticMap(individuals)
|
||||
const wordList = produceWordList(individuals)
|
||||
|
||||
return {individuals, aliasToIndividual, phoneticMap, wordList}
|
||||
return {
|
||||
individuals,
|
||||
individualsMap,
|
||||
aliasToIndividual,
|
||||
phoneticMap,
|
||||
wordList
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -16,12 +16,9 @@ function load () {
|
|||
|
||||
// MATCHING
|
||||
|
||||
const mapMax = (iteratee, list) => _.max(_.map(iteratee, list))
|
||||
// similarity algorithm
|
||||
|
||||
const allPairs = _.flow(
|
||||
(aList, bList) => _.map(a => _.map(b => [a, b], bList), aList),
|
||||
_.flatten
|
||||
)
|
||||
const stringSimilarity = _.curry(jaroWinkler)
|
||||
|
||||
// birth date
|
||||
|
||||
|
|
@ -37,72 +34,14 @@ function isDateWithinSomeDaysOfPeriod (period, date, days) {
|
|||
return (startDate < date && date < endDate)
|
||||
}
|
||||
|
||||
function isBornTooLongSince (individual, dateObject, days) {
|
||||
const isBornTooLongSince = _.curry((days, dateObject, individual) => {
|
||||
if (_.isEmpty(individual.birthDatePeriods)) return false
|
||||
const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
|
||||
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
|
||||
}
|
||||
|
||||
// similarity algorithm
|
||||
|
||||
const stringSimilarity = _.curry(jaroWinkler)
|
||||
|
||||
const wordSimilarity = (a, b) => {
|
||||
const phoneticPairs = allPairs(a.phonetic, b.phonetic)
|
||||
const phoneticMatch = _.map(_.spread(_.isEqual), phoneticPairs)
|
||||
if (_.some(_.identity, phoneticMatch)) return 1
|
||||
return stringSimilarity(a.value, b.value)
|
||||
}
|
||||
|
||||
const similarity = _.curry((candidate, individual) => {
|
||||
// Calculate if his birth date is within two years of the given date.
|
||||
// If an individual has multiple birth-date periods, return whether any are
|
||||
// within two years. Reject individuals who don't match this criterion.
|
||||
const twoYears = 365 * 2
|
||||
if (isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0
|
||||
|
||||
debug_log(individual)
|
||||
|
||||
// Calculate the Jaro-Winkler similarity of the full name.
|
||||
// If an individual has multiple aliases, use the maximum score.
|
||||
const scoreCandidateFullName = _.flow(
|
||||
_.get('fullName'),
|
||||
stringSimilarity(candidate.fullName)
|
||||
)
|
||||
const stringMatchScore = mapMax(scoreCandidateFullName, individual.aliases)
|
||||
|
||||
//
|
||||
|
||||
const candidateWords = candidate.fullNameWords
|
||||
const numCandidateWords = candidateWords.length
|
||||
|
||||
const scoreCandidateWords = alias => {
|
||||
const tooManyWords = _.flow(
|
||||
_.get(['words', 'length']),
|
||||
_.lt(numCandidateWords)
|
||||
)
|
||||
const parts = _.reject(tooManyWords, alias.parts)
|
||||
|
||||
const scorePartAt = _.curry((part, offset) => {
|
||||
const words = _.slice(offset, offset + part.words.length, candidateWords)
|
||||
return _.min(_.map(_.spread(wordSimilarity), _.zip(words, part.words)))
|
||||
})
|
||||
const scorePart = part => {
|
||||
const offsets = _.range(0, (numCandidateWords - part.words.length) + 1)
|
||||
return mapMax(scorePartAt(part), offsets)
|
||||
}
|
||||
const scores = _.orderBy([], 'desc', _.map(scorePart, parts))
|
||||
const thresholdIndex = _.min([2, scores.length]) - 1
|
||||
return scores[thresholdIndex]
|
||||
}
|
||||
const wordMatchScore = mapMax(scoreCandidateWords, individual.aliases)
|
||||
|
||||
console.log(stringMatchScore, wordMatchScore)
|
||||
|
||||
return _.max([stringMatchScore, wordMatchScore])
|
||||
})
|
||||
|
||||
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
|
||||
|
||||
function makeCompatible (nameParts) {
|
||||
const partNames = _.keys(nameParts)
|
||||
const values = _.values(nameParts)
|
||||
|
|
@ -110,15 +49,22 @@ function makeCompatible (nameParts) {
|
|||
return _.map(_.zipObject(['partName', 'value']), props)
|
||||
}
|
||||
|
||||
function match (nameParts, birthDateString) {
|
||||
// algorithm
|
||||
|
||||
function match (nameParts, birthDateString, threshold) {
|
||||
if (!structs) {
|
||||
const message = 'The OFAC data sources have not been loaded yet.'
|
||||
return Promise.reject(new Error(message))
|
||||
}
|
||||
|
||||
// Prepare the input data
|
||||
|
||||
const parts = makeCompatible(nameParts)
|
||||
const fullName = nameUtils.makeFullName(parts)
|
||||
const fullNameWords = nameUtils.makeWords(fullName)
|
||||
const words = nameUtils.makeWords(fullName)
|
||||
|
||||
const wordValues = _.map(_.get('value'), words)
|
||||
const wordPhonetics = _.flatten(_.map(_.get('phonetics'), words))
|
||||
|
||||
// birthDateString is in YYYYMMDD format
|
||||
const year = parseInt(birthDateString.slice(0, 4))
|
||||
|
|
@ -128,12 +74,73 @@ function match (nameParts, birthDateString) {
|
|||
|
||||
const birthDate = {year, month, day, date}
|
||||
|
||||
const candidate = {parts, fullName, fullNameWords, birthDate}
|
||||
debug_log(candidate)
|
||||
debug_log({parts, fullName, wordValues, wordPhonetics, birthDate})
|
||||
|
||||
const similarToCandidate = similarity(candidate)
|
||||
const result = mapMax(similarToCandidate, structs.individuals)
|
||||
console.log(result)
|
||||
// Start matching
|
||||
|
||||
// Accept aliases whose full name matches.
|
||||
const doesNameMatch = _.flow(
|
||||
_.get('fullName'),
|
||||
stringSimilarity(fullName),
|
||||
_.lte(threshold)
|
||||
)
|
||||
const aliases = _.flatMap(_.get('aliases'), structs.individuals)
|
||||
const aliasIdsFromFullName = _.flow(
|
||||
_.filter(doesNameMatch),
|
||||
|
||||
_.map(_.get('id'))
|
||||
)(aliases)
|
||||
|
||||
// Gather aliases whose name-parts match phonetically.
|
||||
const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
|
||||
const phoneticMatches = _.flow(
|
||||
_.map(getPhoneticMatches),
|
||||
_.compact,
|
||||
_.flatten
|
||||
)(wordPhonetics)
|
||||
|
||||
// Gather aliases whose name-parts match alphabetically.
|
||||
const getStringMatches = value => {
|
||||
const entryMatches = entry => (jaroWinkler(value, entry.value) >= threshold)
|
||||
return _.filter(entryMatches, structs.wordList)
|
||||
}
|
||||
const getSingleEntries = wordEntry => {
|
||||
const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
|
||||
return _.map(makeEntry, wordEntry.aliasIds)
|
||||
}
|
||||
const stringMatches = _.flow(
|
||||
_.map(getStringMatches),
|
||||
_.flatten,
|
||||
_.map(getSingleEntries),
|
||||
_.flatten
|
||||
)(wordValues)
|
||||
|
||||
// At least two name-parts must match per alias
|
||||
const aliasIdsFromNamePart = _.flow(
|
||||
_.uniqWith(_.isEqual),
|
||||
_.map(_.get('aliasId')),
|
||||
_.countBy(_.identity),
|
||||
_.toPairs,
|
||||
_.filter(_.flow(_.last, _.lte(2))),
|
||||
_.map(_.first)
|
||||
)([...phoneticMatches, ...stringMatches])
|
||||
|
||||
// Get the full record for each matched id
|
||||
const getIndividual = aliasId => {
|
||||
const individualId = structs.aliasToIndividual.get(aliasId)
|
||||
return structs.individualsMap.get(individualId)
|
||||
}
|
||||
const suspects = _.uniq(_.map(getIndividual, [
|
||||
...aliasIdsFromFullName,
|
||||
...aliasIdsFromNamePart
|
||||
]))
|
||||
|
||||
// Reject everyone whose birth date is more than two years away from the given one.
|
||||
const twoYears = 365 * 2
|
||||
const unqualified = isBornTooLongSince(twoYears, birthDate)
|
||||
const result = _.reject(unqualified, suspects)
|
||||
|
||||
debug_log(result)
|
||||
return result
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue