diff --git a/lib/ofac/data-parser.js b/lib/ofac/data-parser.js index 7a5a343c..18014222 100644 --- a/lib/ofac/data-parser.js +++ b/lib/ofac/data-parser.js @@ -50,7 +50,8 @@ const processDocumentedNamePart = _.curry((groupTypes, namePartNode) => { const typeId = groupTypes.get(groupId) const partName = partNames.get(typeId) const value = _.lowerCase(valueNode.$text) - return {[partName]: value} + const words = nameUtils.makeWords(value) + return {partName, value, words} }) const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN) @@ -68,15 +69,11 @@ const processAlias = _.curry((groupTypes, aliasNode) => { } const namePartNodes = latinNameNode.DocumentedNamePart - const nameParts = _.map(getNamePart, namePartNodes) + const parts = _.map(getNamePart, namePartNodes) - const parts = _.assignAll(nameParts) - const fullNames = nameUtils.makeFullNames(parts) + const fullName = nameUtils.makeFullName(parts) - const phoneticParts = _.mapValues(nameUtils.phonetic, parts) - const phoneticFullNames = _.map(nameUtils.phonetic, fullNames) - - return {parts, fullNames, phoneticParts, phoneticFullNames} + return {parts, fullName} }) // birth date diff --git a/lib/ofac/index.js b/lib/ofac/index.js index c7e5c171..759b4674 100644 --- a/lib/ofac/index.js +++ b/lib/ofac/index.js @@ -18,6 +18,11 @@ function load () { const mapMax = (iteratee, list) => _.max(_.map(iteratee, list)) +const allPairs = _.flow( + (aList, bList) => _.map(a => _.map(b => [a, b], bList), aList), + _.flatten +) + // birth date function isDateWithinSomeDaysOfPeriod (period, date, days) { @@ -38,18 +43,16 @@ function isBornTooLongSince (individual, dateObject, days) { return !_.some(isWithinSomeYears, individual.birthDatePeriods) } -// string similarity +// similarity algorithm -const stringMatch = _.curry(jaroWinkler) +const stringSimilarity = _.curry(jaroWinkler) -const bestMatchInList = _.curry((list, name) => mapMax(stringMatch(name), list)) - -const aliasStringMatch = _.curry((candidate, alias) => { - const matchWithCandidate = bestMatchInList(candidate.fullNames) - return mapMax(matchWithCandidate, alias.fullNames) -}) - -// algorithm +const wordSimilarity = (a, b) => { + const phoneticPairs = allPairs(a.phonetic, b.phonetic) + const phoneticMatch = _.map(_.spread(_.isEqual), phoneticPairs) + if (_.some(_.identity, phoneticMatch)) return 1 + return stringSimilarity(a.value, b.value) +} const similarity = _.curry((candidate, individual) => { // Calculate if his birth date is within two years of the given date. @@ -62,31 +65,60 @@ const similarity = _.curry((candidate, individual) => { // Calculate the Jaro-Winkler similarity of the full name. // If an individual has multiple aliases, use the maximum score. - const scoreAgainstCandidate = aliasStringMatch(candidate) - const stringMatchScore = mapMax(scoreAgainstCandidate, individual.aliases) + const scoreCandidateFullName = _.flow( + _.get('fullName'), + stringSimilarity(candidate.fullName) + ) + const stringMatchScore = mapMax(scoreCandidateFullName, individual.aliases) - // // Calculate the Jaro-Winkler similarity of the phonetic representation of the full name. - // // This should approximate the phonetic similarity of the two names. - // // If an individual has multiple aliases, use the maximum score. - // const phoneticMatchScore = mapMax(calcPhoneticMatchScore(candidate.phoneticFullName))(individual.aliases) + // - console.log(stringMatchScore) + const candidateWords = candidate.fullNameWords + const numCandidateWords = candidateWords.length - return _.max([stringMatchScore]) + const scoreCandidateWords = alias => { + const tooManyWords = _.flow( + _.get(['words', 'length']), + _.lt(numCandidateWords) + ) + const parts = _.reject(tooManyWords, alias.parts) + + const scorePartAt = _.curry((part, offset) => { + const words = _.slice(offset, offset + part.words.length, candidateWords) + return _.min(_.map(_.spread(wordSimilarity), _.zip(words, part.words))) + }) + const scorePart = part => { + const offsets = _.range(0, (numCandidateWords - part.words.length) + 1) + return mapMax(scorePartAt(part), offsets) + } + const scores = _.orderBy([], 'desc', _.map(scorePart, parts)) + const thresholdIndex = _.min([2, scores.length]) - 1 + return scores[thresholdIndex] + } + const wordMatchScore = mapMax(scoreCandidateWords, individual.aliases) + + console.log(stringMatchScore, wordMatchScore) + + return _.max([stringMatchScore, wordMatchScore]) }) +// nameParts should be an object like {firstName: "John", lastName: "Doe", ...} +function makeCompatible (nameParts) { + const partNames = _.keys(nameParts) + const values = _.values(nameParts) + const props = _.zipAll([partNames, values]) + return _.map(_.zipObject(['partName', 'value']), props) +} + function match (nameParts, birthDateString) { if (!individuals) { const message = 'The OFAC data sources have not been loaded yet.' return Promise.reject(new Error(message)) } - // nameParts should be an object like {firstName: "John", lastName: "Doe", ...} - const parts = _.mapValues(_.lowerCase, nameParts) - const fullNames = nameUtils.makeFullNames(parts) - - const phoneticParts = _.mapValues(nameUtils.phonetic, parts) - const phoneticFullNames = _.map(nameUtils.phonetic, fullNames) + const parts = makeCompatible(nameParts) + const fullName = nameUtils.makeFullName(parts) + const fullNameWords = nameUtils.makeWords(fullName) // birthDateString is in YYYYMMDD format const year = parseInt(birthDateString.slice(0, 4)) @@ -96,11 +128,12 @@ function match (nameParts, birthDateString) { const birthDate = {year, month, day, date} - const candidate = {parts, fullNames, phoneticParts, phoneticFullNames, birthDate} + const candidate = {parts, fullName, fullNameWords, birthDate} + debug_log(candidate) const similarToCandidate = similarity(candidate) const result = mapMax(similarToCandidate, individuals) - debug_log(candidate) + console.log(result) return result } diff --git a/lib/ofac/name-utils.js b/lib/ofac/name-utils.js index 4bb0df35..60e46e8d 100644 --- a/lib/ofac/name-utils.js +++ b/lib/ofac/name-utils.js @@ -1,46 +1,33 @@ -const metaphone = require('talisman/phonetics/metaphone') const doubleMetaphone = require('talisman/phonetics/double-metaphone') const _ = require('lodash/fp') // KOSTIS TODO: Decide on a method. Remove the others -const phoneticMethod1 = metaphone - -const phoneticMethod2 = _.flow(doubleMetaphone, _.uniq) - -const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2)) +const makePhonetic = _.flow(doubleMetaphone, _.uniq) // Combine name-parts in a standard order. -const commonOrderings = [ - ['firstName', 'lastName'], - ['firstName', 'middleName', 'lastName'], - ['firstName', 'maidenName', 'lastName'], - ['firstName', 'patronymic', 'lastName'], - ['firstName', 'matronymic', 'lastName'] -] +const partOrdering = ['firstName', 'middleName', 'maidenName', 'patronymic', 'matronymic', 'lastName'] -// const getFrom = _.flip() - -const getFrom = _.curry((obj, key) => obj[key]) - -const getOrderedParts = (parts, ordering) => _.map(getFrom(parts), ordering) - -const combineParts = _.curryN(2, _.flow( - getOrderedParts, - _.compact, - _.join(' ') -)) - -const makeAllOrderings = parts => _.map(combineParts(parts), commonOrderings) - -const makeFullNames = _.flow( - makeAllOrderings, - _.uniq +const usingPartOrder = _.flow( + _.get('partName'), + _.partialRight(_.indexOf, [partOrdering]) ) +const makeFullName = _.flow( + _.sortBy(usingPartOrder), + _.map(_.get('value')), + _.join(' ') +) + +const makeWords = value => { + const words = _.split(' ', value) + const phonetic = _.map(makePhonetic, words) + const props = _.zipAll([words, phonetic]) + return _.map(_.zipObject(['value', 'phonetic']), props) +} module.exports = { - makeFullNames, - phonetic: phoneticMethod3 + makeFullName, + makeWords }