diff --git a/lib/ofac/index.js b/lib/ofac/index.js index 61a217c1..9f8c4ffe 100644 --- a/lib/ofac/index.js +++ b/lib/ofac/index.js @@ -1,45 +1,38 @@ -const dataParser = require('./data-parser') +const fs = require('fs') +const path = require('path') +const util = require('util') +const parser = require('./parsing') +const matcher = require('./matching') const nameUtils = require('./name-utils') -const jaroWinkler = require('talisman/metrics/distance/jaro-winkler') +const options = require('../options') const _ = require('lodash/fp') const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove +const OFAC_DATA_DIR = options.ofacDataDir + let structs = null +const readdir = util.promisify(fs.readdir) + function load () { - return dataParser.produceStructs() + // NOTE: Not sure how you push code updates to existing clients. This problem + // might pop up if new code is pushed, without re-doing setup. + if (!OFAC_DATA_DIR) { + const message = 'The ofacDataDir option has not been set in lamassu.json' + return Promise.reject(new Error(message)) + } + + return readdir(OFAC_DATA_DIR) + .then(_.flow( + _.map(file => path.join(OFAC_DATA_DIR, file)), + parser.parse + )) .then(result => { structs = result }) } -// MATCHING - -// similarity algorithm - -const stringSimilarity = _.curry(jaroWinkler) - -// birth date - -function isDateWithinSomeDaysOfPeriod (period, date, days) { - const inMillisecs = 24 * 60 * 60 * 1000 - - const startTime = period.start.date.getTime() - days * inMillisecs - const startDate = new Date(startTime) - - const endTime = period.end.date.getTime() + days * inMillisecs - const endDate = new Date(endTime) - - return (startDate < date && date < endDate) -} - -const isBornTooLongSince = _.curry((days, dateObject, individual) => { - if (_.isEmpty(individual.birthDatePeriods)) return false - const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days]) - return !_.some(isWithinSomeYears, individual.birthDatePeriods) -}) - // nameParts should be an object like {firstName: "John", lastName: "Doe", ...} function makeCompatible (nameParts) { @@ -49,8 +42,6 @@ function makeCompatible (nameParts) { return _.map(_.zipObject(['partName', 'value']), props) } -// algorithm - function match (nameParts, birthDateString, threshold) { if (!structs) { const message = 'The OFAC data sources have not been loaded yet.' @@ -64,7 +55,7 @@ function match (nameParts, birthDateString, threshold) { const words = nameUtils.makeWords(fullName) const wordValues = _.map(_.get('value'), words) - const wordPhonetics = _.flatten(_.map(_.get('phonetics'), words)) + const wordPhonetics = _.flatMap(_.get('phonetics'), words) // birthDateString is in YYYYMMDD format const year = parseInt(birthDateString.slice(0, 4)) @@ -74,72 +65,10 @@ function match (nameParts, birthDateString, threshold) { const birthDate = {year, month, day, date} - debug_log({parts, fullName, wordValues, wordPhonetics, birthDate}) - - // Start matching - - // Accept aliases who's full name matches. - const doesNameMatch = _.flow( - _.get('fullName'), - stringSimilarity(fullName), - _.lte(threshold) - ) - const aliases = _.flatMap(_.get('aliases'), structs.individuals) - const aliasIdsFromFullName = _.flow( - _.filter(doesNameMatch), - - _.map(_.get('id')) - )(aliases) - - // Gather aliases who's name-parts match phonetically. 
-  const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
-  const phoneticMatches = _.flow(
-    _.map(getPhoneticMatches),
-    _.compact,
-    _.flatten
-  )(wordPhonetics)
-
-  // Gether aliases whose name-parts match alphabetically.
-  const getStringMatches = value => {
-    const entryMatches = entry => (jaroWinkler(value, entry.value) >= threshold)
-    return _.filter(entryMatches, structs.wordList)
-  }
-  const getSingleEntries = wordEntry => {
-    const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
-    return _.map(makeEntry, wordEntry.aliasIds)
-  }
-  const stringMatches = _.flow(
-    _.map(getStringMatches),
-    _.flatten,
-    _.map(getSingleEntries),
-    _.flatten
-  )(wordValues)
-
-  // At least two name-parts must match per alias
-  const aliasIdsFromNamePart = _.flow(
-    _.uniqWith(_.isEqual),
-    _.map(_.get('aliasId')),
-    _.countBy(_.identity),
-    _.toPairs,
-    _.filter(_.flow(_.last, _.lte(2))),
-    _.map(_.first)
-  )([...phoneticMatches, ...stringMatches])
-
-  // Get the full record for each matched id
-  const getIndividual = aliasId => {
-    const individualId = structs.aliasToIndividual.get(aliasId)
-    return structs.individualsMap.get(individualId)
-  }
-  const suspects = _.uniq(_.map(getIndividual, [
-    ...aliasIdsFromFullName,
-    ...aliasIdsFromNamePart
-  ]))
-
-  // Reject everyone who is born two years away.
-  const twoYears = 365 * 2
-  const unqualified = isBornTooLongSince(twoYears, birthDate)
-  const result = _.reject(unqualified, suspects)
+  const candidate = {parts, fullName, wordValues, wordPhonetics, birthDate}
+  debug_log(candidate)
+  const result = matcher.match(structs, candidate, threshold)
   debug_log(result)
   return result
 }
diff --git a/lib/ofac/matching.js b/lib/ofac/matching.js
new file mode 100644
index 00000000..5cf87d66
--- /dev/null
+++ b/lib/ofac/matching.js
@@ -0,0 +1,94 @@
+const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
+const _ = require('lodash/fp')
+
+const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
+
+const stringSimilarity = _.curry(jaroWinkler)
+
+// birth date
+
+function isDateWithinSomeDaysOfPeriod (period, date, days) {
+  const inMillisecs = 24 * 60 * 60 * 1000
+
+  const startTime = period.start.date.getTime() - days * inMillisecs
+  const startDate = new Date(startTime)
+
+  const endTime = period.end.date.getTime() + days * inMillisecs
+  const endDate = new Date(endTime)
+
+  return (startDate < date && date < endDate)
+}
+
+const isBornTooLongSince = _.curry((days, dateObject, individual) => {
+  if (_.isEmpty(individual.birthDatePeriods)) return false
+  const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
+  return !_.some(isWithinSomeYears, individual.birthDatePeriods)
+})
+
+// algorithm
+
+function match (structs, candidate, threshold) {
+  const {fullName, wordPhonetics, wordValues, birthDate} = candidate
+
+  // Accept aliases whose full name matches.
+  const doesNameMatch = _.flow(
+    _.get('fullName'),
+    stringSimilarity(fullName),
+    _.lte(threshold)
+  )
+  const aliases = _.flatMap(_.get('aliases'), structs.individuals)
+  const aliasIdsFromFullName = _.flow(
+    _.filter(doesNameMatch),
+
+    _.map(_.get('id'))
+  )(aliases)
+
+  // Gather aliases whose name-parts match phonetically.
+  const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
+  const phoneticMatches = _.flow(
+    _.map(getPhoneticMatches),
+    _.compact,
+    _.flatten
+  )(wordPhonetics)
+
+  // Gather aliases whose name-parts match alphabetically.
+ const getStringMatches = value => { + const entryMatches = entry => (jaroWinkler(value, entry.value) >= threshold) + return _.filter(entryMatches, structs.wordList) + } + const getSingleEntries = wordEntry => { + const makeEntry = aliasId => ({value: wordEntry.value, aliasId}) + return _.map(makeEntry, wordEntry.aliasIds) + } + const stringMatches = _.flow( + _.flatMap(getStringMatches), + _.flatMap(getSingleEntries) + )(wordValues) + + // At least two name-parts must match per alias + const aliasIdsFromNamePart = _.flow( + _.uniqWith(_.isEqual), + _.map(_.get('aliasId')), + _.countBy(_.identity), + _.toPairs, + _.filter(_.flow(_.last, _.lte(2))), + _.map(_.first) + )([...phoneticMatches, ...stringMatches]) + + // Get the full record for each matched id + const getIndividual = aliasId => { + const individualId = structs.aliasToIndividual.get(aliasId) + return structs.individualsMap.get(individualId) + } + const suspects = _.uniq(_.map(getIndividual, [ + ...aliasIdsFromFullName, + ...aliasIdsFromNamePart + ])) + + // Reject everyone who is born two years away. + const twoYears = 365 * 2 + const unqualified = isBornTooLongSince(twoYears, birthDate) + return _.reject(unqualified, suspects) +} + +module.exports = {match} diff --git a/lib/ofac/matching_tests.js b/lib/ofac/matching_tests.js new file mode 100644 index 00000000..e69de29b diff --git a/lib/ofac/data-parser.js b/lib/ofac/parsing.js similarity index 78% rename from lib/ofac/data-parser.js rename to lib/ofac/parsing.js index 7054cb51..bb7864e0 100644 --- a/lib/ofac/data-parser.js +++ b/lib/ofac/parsing.js @@ -1,16 +1,11 @@ const fs = require('fs') -const path = require('path') -const util = require('util') const XmlStream = require('xml-stream') const nameUtils = require('./name-utils') -const options = require('../options') const logger = require('../logger') const _ = require('lodash/fp') const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove -const OFAC_DATA_DIR = options.ofacDataDir - // KOSTIS TODO: get these from the document itself const INDIVIDUAL = '4' const NAME = '1403' @@ -57,6 +52,7 @@ const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN) const processAlias = _.curry((groupTypes, aliasNode) => { if (aliasNode.$.AliasTypeID !== NAME) return + if (aliasNode.$.LowQuality === 'true') return const getNamePart = processDocumentedNamePart(groupTypes) const latinNameNode = _.find(isLatin, aliasNode.DocumentedName) @@ -131,8 +127,7 @@ function processProfile (profileNode) { function promiseParseDocument (source) { return new Promise((resolve, reject) => { - const fileName = path.join(OFAC_DATA_DIR, source) - const stream = fs.createReadStream(fileName) + const stream = fs.createReadStream(source) const xml = new XmlStream(stream) xml.on('error', err => { @@ -156,8 +151,6 @@ function promiseParseDocument (source) { }) } -const readdir = util.promisify(fs.readdir) - const mapAliases = _.curry((iteratee, individuals) => { const mapIndividual = individual => { const {id, aliases} = individual @@ -200,46 +193,36 @@ const produceWordList = _.flow( _.map(_.zipObject(['value', 'aliasIds'])) ) -const combineAndDedupe = _.flow( - _.flatten, - _.compact, - _.uniqBy(_.get('id')), - individuals => { - const individualsMap = _.flow( - _.groupBy(_.get('id')), - _.mapValues(_.first), - _.toPairs, - entries => new Map(entries) - )(individuals) +function parse (sources) { + return Promise.all(_.map(promiseParseDocument, sources)) + .then(_.flow( + _.flatten, + _.compact, + _.uniqBy(_.get('id')), + individuals => 
{ + const individualsMap = _.flow( + _.groupBy(_.get('id')), + _.mapValues(_.first), + _.toPairs, + entries => new Map(entries) + )(individuals) - const getIdPairs = (individualId, alias) => [alias.id, individualId] - const idPairs = mapAliases(getIdPairs, individuals) - const aliasToIndividual = new Map(idPairs) + const getIdPairs = (individualId, alias) => [alias.id, individualId] + const idPairs = mapAliases(getIdPairs, individuals) + const aliasToIndividual = new Map(idPairs) - const phoneticMap = producePhoneticMap(individuals) - const wordList = produceWordList(individuals) + const phoneticMap = producePhoneticMap(individuals) + const wordList = produceWordList(individuals) - return { - individuals, - individualsMap, - aliasToIndividual, - phoneticMap, - wordList + return { + individuals, + individualsMap, + aliasToIndividual, + phoneticMap, + wordList + } } - } -) - -function produceStructs () { - // NOTE: Not sure how you push code updates to existing clients. This problem - // might pop up if new code is pushed, without re-doing setup. - if (!OFAC_DATA_DIR) { - const message = 'The ofacDataDir option has not been set in lamassu.json' - return Promise.reject(new Error(message)) - } - - return readdir(OFAC_DATA_DIR) - .then(sources => Promise.all(_.map(promiseParseDocument, sources))) - .then(combineAndDedupe) + )) } -module.exports = {produceStructs} +module.exports = {parse} diff --git a/lib/ofac/parsing_tests.js b/lib/ofac/parsing_tests.js new file mode 100644 index 00000000..e69de29b diff --git a/package-lock.json b/package-lock.json index 603b03ae..62d8b0df 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1364,6 +1364,12 @@ "resolved": "https://registry.npmjs.org/brorand/-/brorand-1.1.0.tgz", "integrity": "sha1-EsJe/kCkXjwyPrhnWgoM5XsiNx8=" }, + "browser-stdout": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/browser-stdout/-/browser-stdout-1.3.0.tgz", + "integrity": "sha1-81HTKWnTL6XXpVZxVCY9korjvR8=", + "dev": true + }, "browserify-aes": { "version": "1.0.6", "resolved": "https://registry.npmjs.org/browserify-aes/-/browserify-aes-1.0.6.tgz", @@ -3701,6 +3707,12 @@ "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.1.11.tgz", "integrity": "sha1-Dovf5NHduIVNZOBOp8AOKgJuVlg=" }, + "growl": { + "version": "1.10.3", + "resolved": "https://registry.npmjs.org/growl/-/growl-1.10.3.tgz", + "integrity": "sha512-hKlsbA5Vu3xsh1Cg3J7jSmX/WaW6A5oBeqzM88oNbCRQFz+zUaXm6yxS4RVytp1scBoJzSYl4YAEOQIt6O8V1Q==", + "dev": true + }, "grpc": { "version": "1.8.4", "resolved": "https://registry.npmjs.org/grpc/-/grpc-1.8.4.tgz", @@ -4536,6 +4548,12 @@ "secp256k1": "3.2.5" } }, + "he": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/he/-/he-1.1.1.tgz", + "integrity": "sha1-k0EP0hsAlzUVH4howvJx80J+I/0=", + "dev": true + }, "helmet": { "version": "3.8.1", "resolved": "https://registry.npmjs.org/helmet/-/helmet-3.8.1.tgz", @@ -5736,6 +5754,50 @@ "obliterator": "1.2.1" } }, + "mocha": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/mocha/-/mocha-5.0.1.tgz", + "integrity": "sha512-SpwyojlnE/WRBNGtvJSNfllfm5PqEDFxcWluSIgLeSBJtXG4DmoX2NNAeEA7rP5kK+79VgtVq8nG6HskaL1ykg==", + "dev": true, + "requires": { + "browser-stdout": "1.3.0", + "commander": "2.11.0", + "debug": "3.1.0", + "diff": "3.3.1", + "escape-string-regexp": "1.0.5", + "glob": "7.1.2", + "growl": "1.10.3", + "he": "1.1.1", + "mkdirp": "0.5.1", + "supports-color": "4.4.0" + }, + "dependencies": { + "debug": { + "version": "3.1.0", + "resolved": 
"https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + }, + "diff": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/diff/-/diff-3.3.1.tgz", + "integrity": "sha512-MKPHZDMB0o6yHyDryUOScqZibp914ksXwAMYMTHj6KO8UeKsRYNJD3oNCKjTqZon+V488P7N/HzXF8t7ZR95ww==", + "dev": true + }, + "supports-color": { + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-4.4.0.tgz", + "integrity": "sha512-rKC3+DyXWgK0ZLKwmRsrkyHVZAjNkfzeehuFWdGGcqGDTZFH73+RH6S/RDAAxl9GusSjZSUWYLmT9N5pzXFOXQ==", + "dev": true, + "requires": { + "has-flag": "2.0.0" + } + } + } + }, "moment": { "version": "2.18.1", "resolved": "https://registry.npmjs.org/moment/-/moment-2.18.1.tgz", diff --git a/package.json b/package.json index 654c9b8e..3bb19736 100644 --- a/package.json +++ b/package.json @@ -78,9 +78,10 @@ }, "scripts": { "start": "node bin/lamassu-server", - "test": "ava" + "test": "mocha $(find . -path ./node_modules -prune -o -name '*_tests.js')" }, "devDependencies": { - "ava": "^0.19.1" + "ava": "^0.19.1", + "mocha": "^5.0.1" } }