diff --git a/lib/ofac/data-parser.js b/lib/ofac/data-parser.js index 7bd74fe9..7054cb51 100644 --- a/lib/ofac/data-parser.js +++ b/lib/ofac/data-parser.js @@ -158,36 +158,22 @@ function promiseParseDocument (source) { const readdir = util.promisify(fs.readdir) -// const {id, individual, words} = result -// -// const individualEntry = [id, individual] -// individuals.push(individualEntry) -// -// const phoneticWithWord = pair => { -// const [word, phonetics] = pair -// const makeEntry = phonetic => ({word, phonetic, individualId: id}) -// return _.map(makeEntry, phonetics) -// } -// -// const phoneticEntries = _.flatten(_.map(phoneticWithWord, words)) -// allPhonetics.push(...phoneticEntries) - const mapAliases = _.curry((iteratee, individuals) => { - const foreachIndividual = individual => { + const mapIndividual = individual => { const {id, aliases} = individual return _.map(alias => iteratee(id, alias), aliases) } - return _.flatten(_.map(foreachIndividual, individuals)) + return _.flatMap(mapIndividual, individuals) }) const getPhoneticEntries = (individualId, alias) => { - const pairPhoneticsWithWords = word => { + const pairPhoneticsWithValues = word => { const {value, phonetics} = word const makeEntry = phonetic => ({value, phonetic, aliasId: alias.id}) return _.map(makeEntry, phonetics) } - return _.flatten(_.map(pairPhoneticsWithWords, alias.words)) + return _.flatMap(pairPhoneticsWithValues, alias.words) } const producePhoneticMap = _.flow( @@ -219,6 +205,13 @@ const combineAndDedupe = _.flow( _.compact, _.uniqBy(_.get('id')), individuals => { + const individualsMap = _.flow( + _.groupBy(_.get('id')), + _.mapValues(_.first), + _.toPairs, + entries => new Map(entries) + )(individuals) + const getIdPairs = (individualId, alias) => [alias.id, individualId] const idPairs = mapAliases(getIdPairs, individuals) const aliasToIndividual = new Map(idPairs) @@ -226,7 +219,13 @@ const combineAndDedupe = _.flow( const phoneticMap = producePhoneticMap(individuals) const 
wordList = produceWordList(individuals) - return {individuals, aliasToIndividual, phoneticMap, wordList} + return { + individuals, + individualsMap, + aliasToIndividual, + phoneticMap, + wordList + } } ) diff --git a/lib/ofac/index.js b/lib/ofac/index.js index d640155f..61a217c1 100644 --- a/lib/ofac/index.js +++ b/lib/ofac/index.js @@ -16,12 +16,9 @@ function load () { // MATCHING -const mapMax = (iteratee, list) => _.max(_.map(iteratee, list)) +// similarity algorithm -const allPairs = _.flow( - (aList, bList) => _.map(a => _.map(b => [a, b], bList), aList), - _.flatten -) +const stringSimilarity = _.curry(jaroWinkler) // birth date @@ -37,72 +34,14 @@ function isDateWithinSomeDaysOfPeriod (period, date, days) { return (startDate < date && date < endDate) } -function isBornTooLongSince (individual, dateObject, days) { +const isBornTooLongSince = _.curry((days, dateObject, individual) => { if (_.isEmpty(individual.birthDatePeriods)) return false const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days]) return !_.some(isWithinSomeYears, individual.birthDatePeriods) -} - -// similarity algorithm - -const stringSimilarity = _.curry(jaroWinkler) - -const wordSimilarity = (a, b) => { - const phoneticPairs = allPairs(a.phonetic, b.phonetic) - const phoneticMatch = _.map(_.spread(_.isEqual), phoneticPairs) - if (_.some(_.identity, phoneticMatch)) return 1 - return stringSimilarity(a.value, b.value) -} - -const similarity = _.curry((candidate, individual) => { - // Calculate if his birth date is within two years of the given date. - // If an individual has multiple birth-date periods, return whether any are - // within two years. Reject individuals who don't match this criterion. - const twoYears = 365 * 2 - if (isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0 - - debug_log(individual) - - // Calculate the Jaro-Winkler similarity of the full name. 
- // If an individual has multiple aliases, use the maximum score. - const scoreCandidateFullName = _.flow( - _.get('fullName'), - stringSimilarity(candidate.fullName) - ) - const stringMatchScore = mapMax(scoreCandidateFullName, individual.aliases) - - // - - const candidateWords = candidate.fullNameWords - const numCandidateWords = candidateWords.length - - const scoreCandidateWords = alias => { - const tooManyWords = _.flow( - _.get(['words', 'length']), - _.lt(numCandidateWords) - ) - const parts = _.reject(tooManyWords, alias.parts) - - const scorePartAt = _.curry((part, offset) => { - const words = _.slice(offset, offset + part.words.length, candidateWords) - return _.min(_.map(_.spread(wordSimilarity), _.zip(words, part.words))) - }) - const scorePart = part => { - const offsets = _.range(0, (numCandidateWords - part.words.length) + 1) - return mapMax(scorePartAt(part), offsets) - } - const scores = _.orderBy([], 'desc', _.map(scorePart, parts)) - const thresholdIndex = _.min([2, scores.length]) - 1 - return scores[thresholdIndex] - } - const wordMatchScore = mapMax(scoreCandidateWords, individual.aliases) - - console.log(stringMatchScore, wordMatchScore) - - return _.max([stringMatchScore, wordMatchScore]) }) // nameParts should be an object like {firstName: "John", lastName: "Doe", ...} + function makeCompatible (nameParts) { const partNames = _.keys(nameParts) const values = _.values(nameParts) @@ -110,15 +49,22 @@ function makeCompatible (nameParts) { return _.map(_.zipObject(['partName', 'value']), props) } -function match (nameParts, birthDateString) { +// algorithm + +function match (nameParts, birthDateString, threshold) { if (!structs) { const message = 'The OFAC data sources have not been loaded yet.' 
return Promise.reject(new Error(message)) } + // Prepare the input data + const parts = makeCompatible(nameParts) const fullName = nameUtils.makeFullName(parts) - const fullNameWords = nameUtils.makeWords(fullName) + const words = nameUtils.makeWords(fullName) + + const wordValues = _.map(_.get('value'), words) + const wordPhonetics = _.flatten(_.map(_.get('phonetics'), words)) // birthDateString is in YYYYMMDD format const year = parseInt(birthDateString.slice(0, 4)) @@ -128,12 +74,73 @@ function match (nameParts, birthDateString) { const birthDate = {year, month, day, date} - const candidate = {parts, fullName, fullNameWords, birthDate} - debug_log(candidate) + debug_log({parts, fullName, wordValues, wordPhonetics, birthDate}) - const similarToCandidate = similarity(candidate) - const result = mapMax(similarToCandidate, structs.individuals) - console.log(result) + // Start matching + + // Accept aliases whose full name matches. + const doesNameMatch = _.flow( + _.get('fullName'), + stringSimilarity(fullName), + _.lte(threshold) + ) + const aliases = _.flatMap(_.get('aliases'), structs.individuals) + const aliasIdsFromFullName = _.flow( + _.filter(doesNameMatch), + + _.map(_.get('id')) + )(aliases) + + // Gather aliases whose name-parts match phonetically. + const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic) + const phoneticMatches = _.flow( + _.map(getPhoneticMatches), + _.compact, + _.flatten + )(wordPhonetics) + + // Gather aliases whose name-parts match alphabetically. 
+ const getStringMatches = value => { + const entryMatches = entry => (jaroWinkler(value, entry.value) >= threshold) + return _.filter(entryMatches, structs.wordList) + } + const getSingleEntries = wordEntry => { + const makeEntry = aliasId => ({value: wordEntry.value, aliasId}) + return _.map(makeEntry, wordEntry.aliasIds) + } + const stringMatches = _.flow( + _.map(getStringMatches), + _.flatten, + _.map(getSingleEntries), + _.flatten + )(wordValues) + + // At least two name-parts must match per alias + const aliasIdsFromNamePart = _.flow( + _.uniqWith(_.isEqual), + _.map(_.get('aliasId')), + _.countBy(_.identity), + _.toPairs, + _.filter(_.flow(_.last, _.lte(2))), + _.map(_.first) + )([...phoneticMatches, ...stringMatches]) + + // Get the full record for each matched id + const getIndividual = aliasId => { + const individualId = structs.aliasToIndividual.get(aliasId) + return structs.individualsMap.get(individualId) + } + const suspects = _.uniq(_.map(getIndividual, [ + ...aliasIdsFromFullName, + ...aliasIdsFromNamePart + ])) + + // Reject everyone whose birth date is more than two years away from the given date. + const twoYears = 365 * 2 + const unqualified = isBornTooLongSince(twoYears, birthDate) + const result = _.reject(unqualified, suspects) + + debug_log(result) return result }