Matching algorithm

This commit is contained in:
Konstantin Mamalakis 2018-03-01 19:19:04 +02:00 committed by Josh Harvey
parent 2f8c798304
commit 1d0aff07fe
2 changed files with 97 additions and 91 deletions

View file

@ -158,36 +158,22 @@ function promiseParseDocument (source) {
const readdir = util.promisify(fs.readdir)
// const {id, individual, words} = result
//
// const individualEntry = [id, individual]
// individuals.push(individualEntry)
//
// const phoneticWithWord = pair => {
// const [word, phonetics] = pair
// const makeEntry = phonetic => ({word, phonetic, individualId: id})
// return _.map(makeEntry, phonetics)
// }
//
// const phoneticEntries = _.flatten(_.map(phoneticWithWord, words))
// allPhonetics.push(...phoneticEntries)
const mapAliases = _.curry((iteratee, individuals) => {
const foreachIndividual = individual => {
const mapIndividual = individual => {
const {id, aliases} = individual
return _.map(alias => iteratee(id, alias), aliases)
}
return _.flatten(_.map(foreachIndividual, individuals))
return _.flatMap(mapIndividual, individuals)
})
const getPhoneticEntries = (individualId, alias) => {
const pairPhoneticsWithWords = word => {
const pairPhoneticsWithValues = word => {
const {value, phonetics} = word
const makeEntry = phonetic => ({value, phonetic, aliasId: alias.id})
return _.map(makeEntry, phonetics)
}
return _.flatten(_.map(pairPhoneticsWithWords, alias.words))
return _.flatMap(pairPhoneticsWithValues, alias.words)
}
const producePhoneticMap = _.flow(
@ -219,6 +205,13 @@ const combineAndDedupe = _.flow(
_.compact,
_.uniqBy(_.get('id')),
individuals => {
const individualsMap = _.flow(
_.groupBy(_.get('id')),
_.mapValues(_.first),
_.toPairs,
entries => new Map(entries)
)(individuals)
const getIdPairs = (individualId, alias) => [alias.id, individualId]
const idPairs = mapAliases(getIdPairs, individuals)
const aliasToIndividual = new Map(idPairs)
@ -226,7 +219,13 @@ const combineAndDedupe = _.flow(
const phoneticMap = producePhoneticMap(individuals)
const wordList = produceWordList(individuals)
return {individuals, aliasToIndividual, phoneticMap, wordList}
return {
individuals,
individualsMap,
aliasToIndividual,
phoneticMap,
wordList
}
}
)

View file

@ -16,12 +16,9 @@ function load () {
// MATCHING
const mapMax = (iteratee, list) => _.max(_.map(iteratee, list))
// similarity algorithm
const allPairs = _.flow(
(aList, bList) => _.map(a => _.map(b => [a, b], bList), aList),
_.flatten
)
const stringSimilarity = _.curry(jaroWinkler)
// birth date
@ -37,72 +34,14 @@ function isDateWithinSomeDaysOfPeriod (period, date, days) {
return (startDate < date && date < endDate)
}
function isBornTooLongSince (individual, dateObject, days) {
const isBornTooLongSince = _.curry((days, dateObject, individual) => {
if (_.isEmpty(individual.birthDatePeriods)) return false
const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
}
// similarity algorithm
const stringSimilarity = _.curry(jaroWinkler)
const wordSimilarity = (a, b) => {
const phoneticPairs = allPairs(a.phonetic, b.phonetic)
const phoneticMatch = _.map(_.spread(_.isEqual), phoneticPairs)
if (_.some(_.identity, phoneticMatch)) return 1
return stringSimilarity(a.value, b.value)
}
const similarity = _.curry((candidate, individual) => {
// Calculate if his birth date is within two years of the given date.
// If an individual has multiple birth-date periods, return whether any are
// within two years. Reject individuals who don't match this criterion.
const twoYears = 365 * 2
if (isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0
debug_log(individual)
// Calculate the Jaro-Winkler similarity of the full name.
// If an individual has multiple aliases, use the maximum score.
const scoreCandidateFullName = _.flow(
_.get('fullName'),
stringSimilarity(candidate.fullName)
)
const stringMatchScore = mapMax(scoreCandidateFullName, individual.aliases)
//
const candidateWords = candidate.fullNameWords
const numCandidateWords = candidateWords.length
const scoreCandidateWords = alias => {
const tooManyWords = _.flow(
_.get(['words', 'length']),
_.lt(numCandidateWords)
)
const parts = _.reject(tooManyWords, alias.parts)
const scorePartAt = _.curry((part, offset) => {
const words = _.slice(offset, offset + part.words.length, candidateWords)
return _.min(_.map(_.spread(wordSimilarity), _.zip(words, part.words)))
})
const scorePart = part => {
const offsets = _.range(0, (numCandidateWords - part.words.length) + 1)
return mapMax(scorePartAt(part), offsets)
}
const scores = _.orderBy([], 'desc', _.map(scorePart, parts))
const thresholdIndex = _.min([2, scores.length]) - 1
return scores[thresholdIndex]
}
const wordMatchScore = mapMax(scoreCandidateWords, individual.aliases)
console.log(stringMatchScore, wordMatchScore)
return _.max([stringMatchScore, wordMatchScore])
})
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
function makeCompatible (nameParts) {
const partNames = _.keys(nameParts)
const values = _.values(nameParts)
@ -110,15 +49,22 @@ function makeCompatible (nameParts) {
return _.map(_.zipObject(['partName', 'value']), props)
}
function match (nameParts, birthDateString) {
// algorithm
function match (nameParts, birthDateString, threshold) {
if (!structs) {
const message = 'The OFAC data sources have not been loaded yet.'
return Promise.reject(new Error(message))
}
// Prepare the input data
const parts = makeCompatible(nameParts)
const fullName = nameUtils.makeFullName(parts)
const fullNameWords = nameUtils.makeWords(fullName)
const words = nameUtils.makeWords(fullName)
const wordValues = _.map(_.get('value'), words)
const wordPhonetics = _.flatten(_.map(_.get('phonetics'), words))
// birthDateString is in YYYYMMDD format
const year = parseInt(birthDateString.slice(0, 4))
@ -128,12 +74,73 @@ function match (nameParts, birthDateString) {
const birthDate = {year, month, day, date}
const candidate = {parts, fullName, fullNameWords, birthDate}
debug_log(candidate)
debug_log({parts, fullName, wordValues, wordPhonetics, birthDate})
const similarToCandidate = similarity(candidate)
const result = mapMax(similarToCandidate, structs.individuals)
console.log(result)
// Start matching
// Accept aliases whose full name matches.
const doesNameMatch = _.flow(
_.get('fullName'),
stringSimilarity(fullName),
_.lte(threshold)
)
const aliases = _.flatMap(_.get('aliases'), structs.individuals)
const aliasIdsFromFullName = _.flow(
_.filter(doesNameMatch),
_.map(_.get('id'))
)(aliases)
// Gather aliases whose name-parts match phonetically.
const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
const phoneticMatches = _.flow(
_.map(getPhoneticMatches),
_.compact,
_.flatten
)(wordPhonetics)
// Gather aliases whose name-parts match alphabetically.
const getStringMatches = value => {
const entryMatches = entry => (jaroWinkler(value, entry.value) >= threshold)
return _.filter(entryMatches, structs.wordList)
}
const getSingleEntries = wordEntry => {
const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
return _.map(makeEntry, wordEntry.aliasIds)
}
const stringMatches = _.flow(
_.map(getStringMatches),
_.flatten,
_.map(getSingleEntries),
_.flatten
)(wordValues)
// At least two name-parts must match per alias
const aliasIdsFromNamePart = _.flow(
_.uniqWith(_.isEqual),
_.map(_.get('aliasId')),
_.countBy(_.identity),
_.toPairs,
_.filter(_.flow(_.last, _.lte(2))),
_.map(_.first)
)([...phoneticMatches, ...stringMatches])
// Get the full record for each matched id
const getIndividual = aliasId => {
const individualId = structs.aliasToIndividual.get(aliasId)
return structs.individualsMap.get(individualId)
}
const suspects = _.uniq(_.map(getIndividual, [
...aliasIdsFromFullName,
...aliasIdsFromNamePart
]))
// Reject everyone whose birth date is more than two years away from the given one.
const twoYears = 365 * 2
const unqualified = isBornTooLongSince(twoYears, birthDate)
const result = _.reject(unqualified, suspects)
debug_log(result)
return result
}