Refactored for testing
This commit is contained in:
parent 1d0aff07fe
commit 2232340f6f

7 changed files with 214 additions and 145 deletions
@@ -1,45 +1,38 @@
const dataParser = require('./data-parser')
const fs = require('fs')
const path = require('path')
const util = require('util')
const parser = require('./parsing')
const matcher = require('./matching')
const nameUtils = require('./name-utils')
const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
const options = require('../options')
const _ = require('lodash/fp')

const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove

const OFAC_DATA_DIR = options.ofacDataDir

let structs = null

const readdir = util.promisify(fs.readdir)

function load () {
  return dataParser.produceStructs()
  // NOTE: Not sure how you push code updates to existing clients. This problem
  // might pop up if new code is pushed, without re-doing setup.
  if (!OFAC_DATA_DIR) {
    const message = 'The ofacDataDir option has not been set in lamassu.json'
    return Promise.reject(new Error(message))
  }

  return readdir(OFAC_DATA_DIR)
    .then(_.flow(
      _.map(file => path.join(OFAC_DATA_DIR, file)),
      parser.parse
    ))
    .then(result => {
      structs = result
    })
}

// MATCHING

// similarity algorithm

const stringSimilarity = _.curry(jaroWinkler)

// birth date

function isDateWithinSomeDaysOfPeriod (period, date, days) {
  const inMillisecs = 24 * 60 * 60 * 1000

  const startTime = period.start.date.getTime() - days * inMillisecs
  const startDate = new Date(startTime)

  const endTime = period.end.date.getTime() + days * inMillisecs
  const endDate = new Date(endTime)

  return (startDate < date && date < endDate)
}

const isBornTooLongSince = _.curry((days, dateObject, individual) => {
  if (_.isEmpty(individual.birthDatePeriods)) return false
  const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
  return !_.some(isWithinSomeYears, individual.birthDatePeriods)
})

// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}

function makeCompatible (nameParts) {

@@ -49,8 +42,6 @@ function makeCompatible (nameParts) {
  return _.map(_.zipObject(['partName', 'value']), props)
}

// algorithm

function match (nameParts, birthDateString, threshold) {
  if (!structs) {
    const message = 'The OFAC data sources have not been loaded yet.'

@@ -64,7 +55,7 @@ function match (nameParts, birthDateString, threshold) {
  const words = nameUtils.makeWords(fullName)

  const wordValues = _.map(_.get('value'), words)
  const wordPhonetics = _.flatten(_.map(_.get('phonetics'), words))
  const wordPhonetics = _.flatMap(_.get('phonetics'), words)

  // birthDateString is in YYYYMMDD format
  const year = parseInt(birthDateString.slice(0, 4))

@@ -74,72 +65,10 @@ function match (nameParts, birthDateString, threshold) {

  const birthDate = {year, month, day, date}

  debug_log({parts, fullName, wordValues, wordPhonetics, birthDate})

  // Start matching

  // Accept aliases whose full name matches.
  const doesNameMatch = _.flow(
    _.get('fullName'),
    stringSimilarity(fullName),
    _.lte(threshold)
  )
  const aliases = _.flatMap(_.get('aliases'), structs.individuals)
  const aliasIdsFromFullName = _.flow(
    _.filter(doesNameMatch),

    _.map(_.get('id'))
  )(aliases)

  // Gather aliases whose name-parts match phonetically.
  const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
  const phoneticMatches = _.flow(
    _.map(getPhoneticMatches),
    _.compact,
    _.flatten
  )(wordPhonetics)

  // Gather aliases whose name-parts match alphabetically.
  const getStringMatches = value => {
    const entryMatches = entry => (jaroWinkler(value, entry.value) >= threshold)
    return _.filter(entryMatches, structs.wordList)
  }
  const getSingleEntries = wordEntry => {
    const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
    return _.map(makeEntry, wordEntry.aliasIds)
  }
  const stringMatches = _.flow(
    _.map(getStringMatches),
    _.flatten,
    _.map(getSingleEntries),
    _.flatten
  )(wordValues)

  // At least two name-parts must match per alias
  const aliasIdsFromNamePart = _.flow(
    _.uniqWith(_.isEqual),
    _.map(_.get('aliasId')),
    _.countBy(_.identity),
    _.toPairs,
    _.filter(_.flow(_.last, _.lte(2))),
    _.map(_.first)
  )([...phoneticMatches, ...stringMatches])

  // Get the full record for each matched id
  const getIndividual = aliasId => {
    const individualId = structs.aliasToIndividual.get(aliasId)
    return structs.individualsMap.get(individualId)
  }
  const suspects = _.uniq(_.map(getIndividual, [
    ...aliasIdsFromFullName,
    ...aliasIdsFromNamePart
  ]))

  // Reject everyone who is born two years away.
  const twoYears = 365 * 2
  const unqualified = isBornTooLongSince(twoYears, birthDate)
  const result = _.reject(unqualified, suspects)
  const candidate = {parts, fullName, wordValues, wordPhonetics, birthDate}
  debug_log(candidate)

  const result = matcher.match(structs, candidate, threshold)
  debug_log(result)
  return result
}
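After the refactor, match() keeps its (nameParts, birthDateString, threshold) signature but only normalizes the input into a candidate object and delegates the actual matching to lib/ofac/matching.js. A minimal illustrative sketch of that hand-off (not part of the diff; the values are invented, only the field names come from the code above):

// Illustrative candidate, shaped like the object match() builds above.
// makeCompatible() appears to turn nameParts into [{partName, value}, ...].
const candidate = {
  parts: [{partName: 'firstName', value: 'John'}, {partName: 'lastName', value: 'Doe'}],
  fullName: 'john doe',
  wordValues: ['john', 'doe'],
  wordPhonetics: ['JN', 'T'], // made-up phonetic codes
  birthDate: {year: 1980, month: 1, day: 1, date: new Date(1980, 0, 1)}
}

// matcher.match(structs, candidate, threshold) then returns the matching individuals.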
94  lib/ofac/matching.js  Normal file
@@ -0,0 +1,94 @@
const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
const _ = require('lodash/fp')

const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove

const stringSimilarity = _.curry(jaroWinkler)

// birth date

function isDateWithinSomeDaysOfPeriod (period, date, days) {
  const inMillisecs = 24 * 60 * 60 * 1000

  const startTime = period.start.date.getTime() - days * inMillisecs
  const startDate = new Date(startTime)

  const endTime = period.end.date.getTime() + days * inMillisecs
  const endDate = new Date(endTime)

  return (startDate < date && date < endDate)
}

const isBornTooLongSince = _.curry((days, dateObject, individual) => {
  if (_.isEmpty(individual.birthDatePeriods)) return false
  const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
  return !_.some(isWithinSomeYears, individual.birthDatePeriods)
})

// algorithm

function match (structs, candidate, threshold) {
  const {fullName, wordPhonetics, wordValues, birthDate} = candidate

  // Accept aliases whose full name matches.
  const doesNameMatch = _.flow(
    _.get('fullName'),
    stringSimilarity(fullName),
    _.lte(threshold)
  )
  const aliases = _.flatMap(_.get('aliases'), structs.individuals)
  const aliasIdsFromFullName = _.flow(
    _.filter(doesNameMatch),

    _.map(_.get('id'))
  )(aliases)

  // Gather aliases whose name-parts match phonetically.
  const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
  const phoneticMatches = _.flow(
    _.map(getPhoneticMatches),
    _.compact,
    _.flatten
  )(wordPhonetics)

  // Gather aliases whose name-parts match alphabetically.
  const getStringMatches = value => {
    const entryMatches = entry => (jaroWinkler(value, entry.value) >= threshold)
    return _.filter(entryMatches, structs.wordList)
  }
  const getSingleEntries = wordEntry => {
    const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
    return _.map(makeEntry, wordEntry.aliasIds)
  }
  const stringMatches = _.flow(
    _.flatMap(getStringMatches),
    _.flatMap(getSingleEntries)
  )(wordValues)

  // At least two name-parts must match per alias
  const aliasIdsFromNamePart = _.flow(
    _.uniqWith(_.isEqual),
    _.map(_.get('aliasId')),
    _.countBy(_.identity),
    _.toPairs,
    _.filter(_.flow(_.last, _.lte(2))),
    _.map(_.first)
  )([...phoneticMatches, ...stringMatches])

  // Get the full record for each matched id
  const getIndividual = aliasId => {
    const individualId = structs.aliasToIndividual.get(aliasId)
    return structs.individualsMap.get(individualId)
  }
  const suspects = _.uniq(_.map(getIndividual, [
    ...aliasIdsFromFullName,
    ...aliasIdsFromNamePart
  ]))

  // Reject everyone who is born two years away.
  const twoYears = 365 * 2
  const unqualified = isBornTooLongSince(twoYears, birthDate)
  return _.reject(unqualified, suspects)
}

module.exports = {match}
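To make the two-year rejection window concrete, a small worked example (not part of the diff; the dates and period object are invented, only the period.start.date / period.end.date shape comes from the code above):

// Hypothetical recorded birth period for a listed individual.
const period = {
  start: {date: new Date('1975-01-01')},
  end: {date: new Date('1975-12-31')}
}
const individual = {birthDatePeriods: [period]}

// With twoYears = 730 days of slack, the accepted window is roughly
// 1973-01-01 .. 1977-12-31:
//   isBornTooLongSince(730, {date: new Date('1976-06-01')}, individual) // false -> kept
//   isBornTooLongSince(730, {date: new Date('1980-06-01')}, individual) // true  -> rejected
// Individuals with no recorded birthDatePeriods are never rejected on this basis.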
0  lib/ofac/matching_tests.js  Normal file
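lib/ofac/matching_tests.js lands empty in this commit. Given the mocha wiring added in package.json below, a first test against the extracted matcher might look roughly like this; it is only a sketch, with the structs and candidate shapes inferred from matching.js and all values invented:

const assert = require('assert')
const matcher = require('./matching')

describe('ofac matching', function () {
  it('accepts an alias whose full name matches', function () {
    // Minimal structs, shaped after the fields matching.js reads.
    const individual = {
      id: 'ind-1',
      aliases: [{id: 'al-1', fullName: 'john doe'}],
      birthDatePeriods: []
    }
    const structs = {
      individuals: [individual],
      individualsMap: new Map([['ind-1', individual]]),
      aliasToIndividual: new Map([['al-1', 'ind-1']]),
      phoneticMap: new Map(),
      wordList: []
    }
    const candidate = {
      fullName: 'john doe',
      wordValues: [],
      wordPhonetics: [],
      birthDate: {date: new Date('1980-01-01')}
    }

    const suspects = matcher.match(structs, candidate, 0.85)
    assert.deepStrictEqual(suspects, [individual])
  })
})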
lib/ofac/parsing.js
@@ -1,16 +1,11 @@
const fs = require('fs')
const path = require('path')
const util = require('util')
const XmlStream = require('xml-stream')
const nameUtils = require('./name-utils')
const options = require('../options')
const logger = require('../logger')
const _ = require('lodash/fp')

const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove

const OFAC_DATA_DIR = options.ofacDataDir

// KOSTIS TODO: get these from the document itself
const INDIVIDUAL = '4'
const NAME = '1403'

@@ -57,6 +52,7 @@ const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN)

const processAlias = _.curry((groupTypes, aliasNode) => {
  if (aliasNode.$.AliasTypeID !== NAME) return
  if (aliasNode.$.LowQuality === 'true') return

  const getNamePart = processDocumentedNamePart(groupTypes)
  const latinNameNode = _.find(isLatin, aliasNode.DocumentedName)

@@ -131,8 +127,7 @@ function processProfile (profileNode) {

function promiseParseDocument (source) {
  return new Promise((resolve, reject) => {
    const fileName = path.join(OFAC_DATA_DIR, source)
    const stream = fs.createReadStream(fileName)
    const stream = fs.createReadStream(source)
    const xml = new XmlStream(stream)

    xml.on('error', err => {

@@ -156,8 +151,6 @@ function promiseParseDocument (source) {
  })
}

const readdir = util.promisify(fs.readdir)

const mapAliases = _.curry((iteratee, individuals) => {
  const mapIndividual = individual => {
    const {id, aliases} = individual

@@ -200,7 +193,9 @@ const produceWordList = _.flow(
  _.map(_.zipObject(['value', 'aliasIds']))
)

const combineAndDedupe = _.flow(
function parse (sources) {
  return Promise.all(_.map(promiseParseDocument, sources))
    .then(_.flow(
      _.flatten,
      _.compact,
      _.uniqBy(_.get('id')),

@@ -227,19 +222,7 @@ const combineAndDedupe = _.flow(
      wordList
    }
  }
)

function produceStructs () {
  // NOTE: Not sure how you push code updates to existing clients. This problem
  // might pop up if new code is pushed, without re-doing setup.
  if (!OFAC_DATA_DIR) {
    const message = 'The ofacDataDir option has not been set in lamassu.json'
    return Promise.reject(new Error(message))
    ))
  }

  return readdir(OFAC_DATA_DIR)
    .then(sources => Promise.all(_.map(promiseParseDocument, sources)))
    .then(combineAndDedupe)
}

module.exports = {produceStructs}
module.exports = {parse}
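With this change the parser no longer consults options.ofacDataDir itself; promiseParseDocument streams whatever path it is given, and callers (like load() above) join the directory onto each file name. A rough sketch of direct use, with a made-up directory and file name, assuming parse resolves to the structs object consumed by matching.js:

const path = require('path')
const parser = require('./parsing')

// Hypothetical location and file name; in the server these come from
// readdir(options.ofacDataDir) in the calling module.
const dataDir = '/var/lib/ofac'
const sources = ['sdn_advanced.xml'].map(file => path.join(dataDir, file))

parser.parse(sources)
  .then(structs => console.log(structs.individuals.length))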
0  lib/ofac/parsing_tests.js  Normal file
62  package-lock.json  generated
@@ -1364,6 +1364,12 @@
      "resolved": "https://registry.npmjs.org/brorand/-/brorand-1.1.0.tgz",
      "integrity": "sha1-EsJe/kCkXjwyPrhnWgoM5XsiNx8="
    },
    "browser-stdout": {
      "version": "1.3.0",
      "resolved": "https://registry.npmjs.org/browser-stdout/-/browser-stdout-1.3.0.tgz",
      "integrity": "sha1-81HTKWnTL6XXpVZxVCY9korjvR8=",
      "dev": true
    },
    "browserify-aes": {
      "version": "1.0.6",
      "resolved": "https://registry.npmjs.org/browserify-aes/-/browserify-aes-1.0.6.tgz",

@@ -3701,6 +3707,12 @@
      "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.1.11.tgz",
      "integrity": "sha1-Dovf5NHduIVNZOBOp8AOKgJuVlg="
    },
    "growl": {
      "version": "1.10.3",
      "resolved": "https://registry.npmjs.org/growl/-/growl-1.10.3.tgz",
      "integrity": "sha512-hKlsbA5Vu3xsh1Cg3J7jSmX/WaW6A5oBeqzM88oNbCRQFz+zUaXm6yxS4RVytp1scBoJzSYl4YAEOQIt6O8V1Q==",
      "dev": true
    },
    "grpc": {
      "version": "1.8.4",
      "resolved": "https://registry.npmjs.org/grpc/-/grpc-1.8.4.tgz",

@@ -4536,6 +4548,12 @@
        "secp256k1": "3.2.5"
      }
    },
    "he": {
      "version": "1.1.1",
      "resolved": "https://registry.npmjs.org/he/-/he-1.1.1.tgz",
      "integrity": "sha1-k0EP0hsAlzUVH4howvJx80J+I/0=",
      "dev": true
    },
    "helmet": {
      "version": "3.8.1",
      "resolved": "https://registry.npmjs.org/helmet/-/helmet-3.8.1.tgz",

@@ -5736,6 +5754,50 @@
        "obliterator": "1.2.1"
      }
    },
    "mocha": {
      "version": "5.0.1",
      "resolved": "https://registry.npmjs.org/mocha/-/mocha-5.0.1.tgz",
      "integrity": "sha512-SpwyojlnE/WRBNGtvJSNfllfm5PqEDFxcWluSIgLeSBJtXG4DmoX2NNAeEA7rP5kK+79VgtVq8nG6HskaL1ykg==",
      "dev": true,
      "requires": {
        "browser-stdout": "1.3.0",
        "commander": "2.11.0",
        "debug": "3.1.0",
        "diff": "3.3.1",
        "escape-string-regexp": "1.0.5",
        "glob": "7.1.2",
        "growl": "1.10.3",
        "he": "1.1.1",
        "mkdirp": "0.5.1",
        "supports-color": "4.4.0"
      },
      "dependencies": {
        "debug": {
          "version": "3.1.0",
          "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz",
          "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==",
          "dev": true,
          "requires": {
            "ms": "2.0.0"
          }
        },
        "diff": {
          "version": "3.3.1",
          "resolved": "https://registry.npmjs.org/diff/-/diff-3.3.1.tgz",
          "integrity": "sha512-MKPHZDMB0o6yHyDryUOScqZibp914ksXwAMYMTHj6KO8UeKsRYNJD3oNCKjTqZon+V488P7N/HzXF8t7ZR95ww==",
          "dev": true
        },
        "supports-color": {
          "version": "4.4.0",
          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-4.4.0.tgz",
          "integrity": "sha512-rKC3+DyXWgK0ZLKwmRsrkyHVZAjNkfzeehuFWdGGcqGDTZFH73+RH6S/RDAAxl9GusSjZSUWYLmT9N5pzXFOXQ==",
          "dev": true,
          "requires": {
            "has-flag": "2.0.0"
          }
        }
      }
    },
    "moment": {
      "version": "2.18.1",
      "resolved": "https://registry.npmjs.org/moment/-/moment-2.18.1.tgz",
package.json
@@ -78,9 +78,10 @@
  },
  "scripts": {
    "start": "node bin/lamassu-server",
    "test": "ava"
    "test": "mocha $(find . -path ./node_modules -prune -o -name '*_tests.js')"
  },
  "devDependencies": {
    "ava": "^0.19.1"
    "ava": "^0.19.1",
    "mocha": "^5.0.1"
  }
}
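With this script, npm test now runs mocha over every file named *_tests.js outside node_modules, which is how the new (still empty) lib/ofac/matching_tests.js and lib/ofac/parsing_tests.js get picked up; ava stays in devDependencies alongside mocha for now.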