Parsing moved to downloading. Matching is being tweaked.

Konstantin Mamalakis, 2018-03-15 20:36:34 +02:00 (committed by Josh Harvey)
parent 793db0f449
commit b72f5549a5
10 changed files with 456 additions and 276 deletions


@@ -65,7 +65,8 @@ openssl x509 \
 rm /tmp/Lamassu_OP.csr.pem
-mkdir -p $OFAC_DATA_DIR
+mkdir -p $OFAC_DATA_DIR/sources
+touch $OFAC_DATA_DIR/etags.json
 cat <<EOF > $CONFIG_DIR/lamassu.json
 {
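Taken together with lib/ofac/update.js below, this setup change provisions the new on-disk layout under $OFAC_DATA_DIR: an etags.json file recording the last-seen HTTP ETag per upstream source, and a sources/ subdirectory holding the parsed .json sources that lib/ofac/loading.js reads. update() also expects a sources.json listing the upstream files there, though its provisioning is not part of this hunk.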


@@ -1,7 +1,7 @@
 const fs = require('fs')
 const path = require('path')
 const util = require('util')
-const parser = require('./parsing')
+const loader = require('./loading')
 const matcher = require('./matching')
 const nameUtils = require('./name-utils')
 const options = require('../options')
@@ -9,7 +9,7 @@ const _ = require('lodash/fp')
 const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
-const OFAC_DATA_DIR = options.ofacDataDir
+const OFAC_SOURCES_DIR = path.join(options.ofacDataDir, 'sources')
 let structs = null
@@ -18,15 +18,15 @@ const readdir = util.promisify(fs.readdir)
 function load () {
   // NOTE: Not sure how you push code updates to existing clients. This problem
   // might pop up if new code is pushed, without re-doing setup.
-  if (!OFAC_DATA_DIR) {
+  if (!OFAC_SOURCES_DIR) {
     const message = 'The ofacDataDir option has not been set in lamassu.json'
     return Promise.reject(new Error(message))
   }
-  return readdir(OFAC_DATA_DIR)
+  return readdir(OFAC_SOURCES_DIR)
     .then(_.flow(
-      _.map(file => path.join(OFAC_DATA_DIR, file)),
-      parser.parse
+      _.map(file => path.join(OFAC_SOURCES_DIR, file)),
+      loader.load
     ))
     .then(result => {
       return (structs = result)
@@ -42,7 +42,8 @@ function makeCompatible (nameParts) {
   return _.map(_.zipObject(['partName', 'value']), props)
 }
-function match (nameParts, birthDateString, threshold) {
+function match (nameParts, birthDateString, options) {
+  const {debug} = options
   if (!structs) {
     const message = 'The OFAC data sources have not been loaded yet.'
     return Promise.reject(new Error(message))
   }
@@ -68,10 +69,10 @@ function match (nameParts, birthDateString, threshold) {
   ])(birthDateString)
   const candidate = {parts, fullName, words, birthDate}
-  // debug_log(candidate)
-  const result = matcher.match(structs, candidate, threshold)
-  // debug_log(result)
+  debug && debug_log(candidate)
+  const result = matcher.match(structs, candidate, options)
+  debug && debug_log(result)
   return result
 }
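With this change the public match() takes an options object in place of the bare threshold number, and debug logging becomes opt-in. A minimal sketch of the new call shape, mirroring the test suite below (the module path, name, date, and threshold values are illustrative, not taken from this commit):

  // after load() has resolved, match() returns the matches directly
  const ofac = require('./lib/ofac')
  ofac.load().then(() => {
    const matches = ofac.match(
      {firstName: 'john', lastName: 'doe'},  // name parts to screen
      '1985-04-12',                          // birth date string, may be null
      {threshold: 0.85, debug: false}        // was: a bare numeric threshold
    )
  })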

lib/ofac/loading.js (new file, 103 lines)

@@ -0,0 +1,103 @@
const fs = require('fs')
const ndjson = require('ndjson')
const _ = require('lodash/fp')

const mapAliases = _.curry((iteratee, individuals) => {
  const mapIndividual = individual => {
    const {id, aliases} = individual
    return _.map(alias => iteratee(id, alias), aliases)
  }
  return _.flatMap(mapIndividual, individuals)
})

const getPhoneticEntries = (individualId, alias) => {
  const pairPhoneticsWithValues = word => {
    const {value, phonetics} = word
    const makeEntry = phonetic => ({value, phonetic, aliasId: alias.id})
    return _.map(makeEntry, phonetics)
  }
  return _.flatMap(pairPhoneticsWithValues, alias.words)
}

const producePhoneticMap = _.flow(
  mapAliases(getPhoneticEntries),
  _.flatten,
  _.groupBy(_.get('phonetic')),
  _.mapValues(_.flow(
    _.map(_.get('aliasId')),
    _.uniq
  )),
  _.toPairs,
  entries => new Map(entries)
)

const getWords = (individualId, alias) => {
  const pairWordsWithIds = word => ({value: word.value, aliasId: alias.id})
  return _.map(pairWordsWithIds, alias.words)
}

const produceWordList = _.flow(
  mapAliases(getWords),
  _.flatten,
  _.groupBy(_.get('value')),
  _.mapValues(_.map(_.get('aliasId'))),
  _.toPairs,
  _.map(_.zipObject(['value', 'aliasIds']))
)

const parseSource = source => {
  const individuals = []
  const readStream = fs.createReadStream(source)
  const jsonStream = readStream.pipe(ndjson.parse())
  jsonStream.on('data', individual => {
    individuals.push(individual)
  })
  return new Promise((resolve, reject) => {
    jsonStream.on('error', reject)
    jsonStream.on('end', () => {
      resolve(individuals)
    })
  })
}

const load = sources => Promise.all(_.map(parseSource, sources))
  .then(_.flow(
    _.flatten,
    _.compact,
    _.uniqBy(_.get('id')),
    individuals => {
      const individualsMap = _.flow(
        _.groupBy(_.get('id')),
        _.mapValues(_.first),
        _.toPairs,
        entries => new Map(entries)
      )(individuals)

      const makeEntries = (individualId, alias) => [alias.id, alias]
      const aliasesMap = new Map(mapAliases(makeEntries, individuals))

      const getIdPairs = (individualId, alias) => [alias.id, individualId]
      const idPairs = mapAliases(getIdPairs, individuals)
      const aliasToIndividual = new Map(idPairs)

      const phoneticMap = producePhoneticMap(individuals)
      const wordList = produceWordList(individuals)

      return {
        individuals,
        individualsMap,
        aliasesMap,
        aliasToIndividual,
        phoneticMap,
        wordList
      }
    }
  ))

module.exports = {load}
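For reference, a sketch of the value load() resolves to, with the shapes taken directly from the construction above (the file path is illustrative):

  loader.load(['/opt/ofac/sources/sdn.json']).then(structs => {
    structs.individuals        // deduplicated array of profile records
    structs.individualsMap     // Map: individual id -> record
    structs.aliasesMap         // Map: alias id -> alias record
    structs.aliasToIndividual  // Map: alias id -> owning individual id
    structs.phoneticMap        // Map: phonetic code -> unique alias ids
    structs.wordList           // [{value, aliasIds}] scanned by matching.js
  })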


@@ -28,7 +28,8 @@ const isBornTooLongSince = _.curry((days, dateObject, individual) => {
 // algorithm
-function match (structs, candidate, threshold) {
+function match (structs, candidate, options) {
+  const {threshold, ratio = 0.1, debug, verboseFor} = options
   const {fullName, words, birthDate} = candidate
   // Accept aliases who's full name matches.
@@ -44,42 +45,57 @@ function match (structs, candidate, threshold) {
   )(aliases)
-  const aliasIds = []
-  const phoneticWeight = 0.17
+  const aliasIdCounts = new Map()
+  const phoneticWeight = ratio
   const stringWeight = 1 - phoneticWeight
   for (const word of words) {
     const getPhonetic = phonetic => structs.phoneticMap.get(phonetic)
     const phoneticMatches = new Set(_.flatMap(getPhonetic, word.phonetics))
+    const aliasIds = new Set()
     for (const wordEntry of structs.wordList) {
       const stringScore = stringSimilarity(word.value, wordEntry.value)
-      if (stringWeight * stringScore + phoneticWeight < threshold) continue
+      const verbose = _.includes(wordEntry.value, verboseFor)
+      if (!verbose && stringWeight * stringScore + phoneticWeight < threshold) continue
       for (const aliasId of wordEntry.aliasIds) {
-        const phoneticScore = phoneticMatches.has(aliasId) ? 1 : 0
-        const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
+        const phoneticScore = phoneticMatches.has(aliasId) ? 1 : -1
+        // const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
+        const finalScore = stringScore + phoneticWeight * phoneticScore
+        verbose && console.log(finalScore.toFixed(2), stringScore.toFixed(2), phoneticScore.toFixed(2), word.value, wordEntry.value)
         if (finalScore >= threshold) {
-          aliasIds.push(aliasId)
+          aliasIds.add(aliasId)
         }
       }
     }
+    verboseFor && console.log(aliasIds)
+    for (const aliasId of aliasIds.values()) {
+      const count = aliasIdCounts.get(aliasId) || 0
+      aliasIdCounts.set(aliasId, count + 1)
+    }
   }
-  const aliasIdsFromNamePart = _.flow(
-    _.countBy(_.identity),
-    _.toPairs,
-    _.reject(_.flow(
-      _.last,
-      _.gt(2)
-    )),
-    _.map(_.first)
-  )(aliasIds)
+  verboseFor && console.log(aliasIdCounts)
+  const aliasIdsFromNamePart = []
+  for (const [aliasId, count] of aliasIdCounts) {
+    const {length} = structs.aliasesMap.get(aliasId).words
+    if (count >= _.min([2, words.length, length])) {
+      aliasIdsFromNamePart.push(aliasId)
+    }
+  }
-  // debug_log(aliasIdsFromFullName)
-  // debug_log(aliasIdsFromNamePart)
+  debug && debug_log(aliasIdsFromFullName)
+  debug && debug_log(aliasIdsFromNamePart)
   // Get the full record for each matched id
   const getIndividual = aliasId => {
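Two scoring tweaks land in this hunk: a phonetic miss now scores -1 instead of 0, and the raw string score is no longer down-weighted, so the phonetic channel acts as a bonus or penalty around the string similarity. Illustrative arithmetic with the default ratio of 0.1 (the similarity value is hypothetical):

  const phoneticWeight = 0.1  // default of the new ratio option
  const stringScore = 0.9     // e.g. 'jhon' against 'john'
  // phonetic hit:  0.9 + 0.1 * (+1) = 1.00  -> passes a 0.85 threshold
  // phonetic miss: 0.9 + 0.1 * (-1) = 0.80  -> rejected at a 0.85 threshold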


@@ -139,111 +139,29 @@ function processProfile (profileNode) {
   return individual
 }
-function promiseParseDocument (source) {
-  return new Promise((resolve, reject) => {
-    const stream = fs.createReadStream(source)
-    const xml = new XmlStream(stream)
+const parse = (source, callback) => {
+  const stream = fs.createReadStream(source)
+  const xml = new XmlStream(stream)
-    xml.on('error', err => {
-      xml.pause()
-      const message = `Error while parsing OFAC data source file (${source}): ${err.message}`
-      reject(new Error(message))
-    })
+  xml.on('error', err => {
+    xml.pause()
+    const message = `Error while parsing OFAC data source file (${source}): ${err.message}`
+    callback(new Error(message))
+  })
   xml.collect('Alias')
   xml.collect('DocumentedName')
   xml.collect('DocumentedNamePart')
   xml.collect('Feature')
   xml.collect('MasterNamePartGroup')
-  const individuals = []
-  const collectResult = result => result && individuals.push(result)
-  xml.on('updateElement: Profile', _.flow(processProfile, collectResult))
+  const forwardProfile = profile => profile && callback(null, profile)
+  xml.on('updateElement: Profile', _.flow(processProfile, forwardProfile))
-  xml.on('end', _.wrap(resolve, individuals))
-  })
+  xml.on('end', () => {
+    callback(null, null)
+  })
 }
-const mapAliases = _.curry((iteratee, individuals) => {
-  const mapIndividual = individual => {
-    const {id, aliases} = individual
-    return _.map(alias => iteratee(id, alias), aliases)
-  }
-  return _.flatMap(mapIndividual, individuals)
-})
-const getPhoneticEntries = (individualId, alias) => {
-  const pairPhoneticsWithValues = word => {
-    const {value, phonetics} = word
-    const makeEntry = phonetic => ({value, phonetic, aliasId: alias.id})
-    return _.map(makeEntry, phonetics)
-  }
-  return _.flatMap(pairPhoneticsWithValues, alias.words)
-}
-const producePhoneticMap = _.flow(
-  mapAliases(getPhoneticEntries),
-  _.flatten,
-  _.groupBy(_.get('phonetic')),
-  _.mapValues(_.flow(
-    _.map(_.get('aliasId')),
-    _.uniq
-  )),
-  _.toPairs,
-  entries => new Map(entries)
-)
-const getWords = (individualId, alias) => {
-  const pairWordsWithIds = word => ({value: word.value, aliasId: alias.id})
-  return _.map(pairWordsWithIds, alias.words)
-}
-const produceWordList = _.flow(
-  mapAliases(getWords),
-  _.flatten,
-  _.groupBy(_.get('value')),
-  _.mapValues(_.map(_.get('aliasId'))),
-  _.toPairs,
-  _.map(_.zipObject(['value', 'aliasIds']))
-)
-function parse (sources) {
-  return Promise.all(_.map(promiseParseDocument, sources))
-    .then(_.flow(
-      _.flatten,
-      _.compact,
-      _.uniqBy(_.get('id')),
-      individuals => {
-        const individualsMap = _.flow(
-          _.groupBy(_.get('id')),
-          _.mapValues(_.first),
-          _.toPairs,
-          entries => new Map(entries)
-        )(individuals)
-        const makeEntries = (individualId, alias) => [alias.id, alias]
-        const aliasesMap = new Map(mapAliases(makeEntries, individuals))
-        const getIdPairs = (individualId, alias) => [alias.id, individualId]
-        const idPairs = mapAliases(getIdPairs, individuals)
-        const aliasToIndividual = new Map(idPairs)
-        const phoneticMap = producePhoneticMap(individuals)
-        const wordList = produceWordList(individuals)
-        return {
-          individuals,
-          individualsMap,
-          aliasesMap,
-          aliasToIndividual,
-          phoneticMap,
-          wordList
-        }
-      }
-    ))
-}
 module.exports = {parse}
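parse() is now streaming: the callback fires once per processed Profile and then once with (null, null) to signal the end of the document. A minimal consumer that collects all profiles into an array (the parsing tests below use this same pattern):

  const collectProfiles = source => new Promise((resolve, reject) => {
    const profiles = []
    parser.parse(source, (err, profile) => {
      if (err) return reject(err)
      if (!profile) return resolve(profiles)  // null profile marks end of stream
      profiles.push(profile)
    })
  })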

lib/ofac/update.js (new file, 155 lines)

@@ -0,0 +1,155 @@
const parser = require('./parsing')
const https = require('https')
const url = require('url')
const fs = require('fs')
const path = require('path')
const util = require('util')
const options = require('../options')
const _ = require('lodash/fp')

const OFAC_DATA_DIR = options.ofacDataDir
const OFAC_SOURCES_DIR = path.join(OFAC_DATA_DIR, 'sources')
const OFAC_SOURCES_FILE = path.join(OFAC_DATA_DIR, 'sources.json')
const OFAC_ETAGS_FILE = path.join(OFAC_DATA_DIR, 'etags.json')
const DOWNLOAD_DIR = path.resolve('/tmp')

const readFile = util.promisify(fs.readFile)
const writeFile = util.promisify(fs.writeFile)
const rename = util.promisify(fs.rename)
const unlink = util.promisify(fs.unlink)

const remove = file => {
  console.log("remove", file)
  return unlink(file)
}

const promiseGetEtag = (source) => {
  return new Promise((resolve, reject) => {
    const {url: sourceUrl} = source
    const parsed = url.parse(sourceUrl)
    const requestOptions = {
      hostname: parsed.hostname,
      path: parsed.path,
      method: 'HEAD'
    }
    const request = https.request(requestOptions, _.flow(
      _.get(['headers', 'etag']),
      resolve
    ))
    request.on('error', reject)
    request.end()
  })
}

const download = _.curry((dstDir, source) => {
  console.log("download", source)
  const {url: sourceUrl} = source
  const fileName = path.basename(sourceUrl)
  const dstFile = path.join(dstDir, fileName)
  const file = fs.createWriteStream(dstFile)
  return new Promise((resolve, reject) => {
    const request = https.get(sourceUrl, response => {
      response.pipe(file);
      file.on('finish', () => file.close(() => resolve(dstFile)))
    })
    request.on('error', reject)
  })
})

const parseToJson = srcFile => {
  console.log("parseToJson", srcFile)
  const dstFile = srcFile.replace(/\.xml$/, '.json')
  const writeStream = fs.createWriteStream(dstFile)
  return new Promise((resolve, reject) => {
    parser.parse(srcFile, (err, profile) => {
      console.log("callback", err, profile)
      if (err) {
        reject(err)
        return
      }
      if (!profile) {
        writeStream.end()
        return
      }
      const json = JSON.stringify(profile)
      writeStream.write(json + '\n', 'utf-8')
    })
    writeStream.on('error', reject)
    writeStream.on('finish', () => resolve(dstFile))
  })
}

const moveToSourcesDir = srcFile => {
  console.log("moveToSourcesDir", srcFile)
  const name = path.basename(srcFile)
  const dstFile = path.join(OFAC_SOURCES_DIR, name)
  return rename(srcFile, dstFile)
}

function update () {
  const promiseOldEtags = readFile(OFAC_ETAGS_FILE, {encoding: 'utf-8'})
    .then(json => JSON.parse(json) || {})

  const promiseNewEtags = readFile(OFAC_SOURCES_FILE, {encoding: 'utf-8'})
    .then(json => {
      const obj = JSON.parse(json)
      return obj ? obj.sources : []
    })
    .then(sources => Promise.all(_.map(promiseGetEtag, sources))
      .then(etags => _.map(
        ([source, etag]) => ({...source, etag}),
        _.zip(sources, etags)
      ))
    )

  return Promise.all([promiseOldEtags, promiseNewEtags])
    .then(([oldEtags, newEtags]) => {
      console.log("OLD", JSON.stringify(oldEtags, null, 4))
      console.log("NEW", JSON.stringify(newEtags, null, 4))
      const hasNotChanged = ({name, etag}) => oldEtags[name] === etag

      const downloads = _.flow(
        _.reject(hasNotChanged),
        _.map(file => download(DOWNLOAD_DIR, file).then(parseToJson))
      )(newEtags)

      const oldFileNames = _.keys(oldEtags)
      const newFileNames = _.map(_.get('name'), newEtags)
      const missingFileNames = _.difference(oldFileNames, newFileNames)
      const resolve = name => path.join(OFAC_SOURCES_DIR, name + '.json')
      const missing = _.map(resolve, missingFileNames)

      const etagsJson = _.flow(
        _.map(source => [source.name, source.etag]),
        _.fromPairs,
        obj => JSON.stringify(obj, null, 4)
      )(newEtags)

      return Promise.all(downloads)
        .then(parsed => {
          console.log("finished", parsed)
          const moves = _.map(moveToSourcesDir, parsed)
          const deletions = _.map(remove, missing)
          const updateEtags = writeFile(OFAC_ETAGS_FILE, etagsJson)
          return Promise.all([updateEtags, ...moves, ...deletions])
        })
    })
}

module.exports = {update}
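update() drives everything off two small JSON files in OFAC_DATA_DIR. Their shapes below are inferred from the reads and writes above; the source name, URL, and ETag value are illustrative, not taken from this commit:

  // sources.json: the upstream files to mirror
  {"sources": [{"name": "sdn_advanced",
                "url": "https://example.com/ofac/sdn_advanced.xml"}]}

  // etags.json: written back by update(); maps source name to last-seen ETag
  {"sdn_advanced": "\"16b9-5671f2d8\""}

Only sources whose ETag differs from the stored one are re-downloaded, converted to newline-delimited JSON, and moved into sources/; sources dropped from sources.json have their parsed .json files removed.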

package-lock.json (generated, 20 lines)

@@ -5855,6 +5855,17 @@
     "integrity": "sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=",
     "dev": true
   },
+  "ndjson": {
+    "version": "1.5.0",
+    "resolved": "https://registry.npmjs.org/ndjson/-/ndjson-1.5.0.tgz",
+    "integrity": "sha1-rmA7NrE0vOw0e0UkIrC/mNWDLsg=",
+    "requires": {
+      "json-stringify-safe": "5.0.1",
+      "minimist": "1.2.0",
+      "split2": "2.2.0",
+      "through2": "2.0.3"
+    }
+  },
   "negotiator": {
     "version": "0.6.1",
     "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.1.tgz",
@@ -7738,6 +7749,14 @@
       "through": "2.3.8"
     }
   },
+  "split2": {
+    "version": "2.2.0",
+    "resolved": "https://registry.npmjs.org/split2/-/split2-2.2.0.tgz",
+    "integrity": "sha512-RAb22TG39LhI31MbreBgIuKiIKhVsawfTgEGqKHTK87aG+ul/PB8Sqoi3I7kVdRWiCfrKxK3uo4/YUkpNvhPbw==",
+    "requires": {
+      "through2": "2.0.3"
+    }
+  },
   "sprintf-js": {
     "version": "1.0.3",
     "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",
@@ -8003,7 +8022,6 @@
     "version": "2.0.3",
     "resolved": "https://registry.npmjs.org/through2/-/through2-2.0.3.tgz",
     "integrity": "sha1-AARWmzfHx0ujnEPzzteNGtlBQL4=",
-    "dev": true,
     "requires": {
       "readable-stream": "2.3.3",
       "xtend": "4.0.1"


@@ -38,6 +38,7 @@
     "minimist": "^1.2.0",
     "moment": "^2.17.0",
     "morgan": "^1.8.2",
+    "ndjson": "^1.5.0",
     "node-hkdf-sync": "^1.0.0",
     "node-mailjet": "^3.2.1",
     "numeral": "^2.0.3",


@@ -9,6 +9,10 @@ let fullNames
 const rand = N => _.random(0, N - 1)
+const letters = _.range('a'.charCodeAt(0), 'z'.charCodeAt(0))
+const vowels = _.map(c => c.charCodeAt(0), ['a', 'e', 'i', 'o', 'u'])
+const consonants = _.difference(letters, vowels)
 const duplicate = (word, index) => {
   const c = word[index]
   return _.join('', [word.slice(0, index), c, c, word.slice(index + 1)])
@@ -25,10 +29,10 @@ const transpose = (word, index) => {
 }
 const alter = (word, index) => {
-  const c = word.charCodeAt(index)
-  const o = c - 'a'.charCodeAt(0)
-  const oo = (o + _.random(1, 26)) % 26
-  const cc = String.fromCharCode(oo + 'a'.charCodeAt(0))
+  const o = word.charCodeAt(index)
+  const collection = _.includes(o, vowels) ? vowels : consonants
+  const oo = _.sample(collection)
+  const cc = String.fromCharCode(oo)
   return _.join('', [word.slice(0, index), cc, word.slice(index + 1)])
 }
@@ -54,15 +58,14 @@ const misspellRandomly = word => {
 const shiftVowel = word => {
-  const vowels = 'aeiou'
   const indexedVowels = _.flow(
     _.get('length'),
     _.range(0),
     _.zip(_.split('', word)),
     _.map(_.zipObject(['letter', 'index'])),
     _.map(indexedLetter => {
-      const vowelIndex = _.indexOf(indexedLetter.letter, vowels)
+      const ord = indexedLetter.letter.charCodeAt(0)
+      const vowelIndex = _.indexOf(ord, vowels)
       return {...indexedLetter, vowelIndex}
     }),
     _.reject(_.flow(
@@ -78,7 +81,8 @@ const shiftVowel = word => {
     : indexedVowel.vowelIndex === 4 ? [ -1 ]
     : [ -1, +1 ]
   const offset = _.sample(options)
-  const replacement = vowels[indexedVowel.vowelIndex + offset]
+  const replacementOrd = vowels[indexedVowel.vowelIndex + offset]
+  const replacement = String.fromCharCode(replacementOrd)
   const index = indexedVowel.index
   return _.join('', [word.slice(0, index), replacement, word.slice(index + 1)])
@@ -110,6 +114,8 @@ const transcribe = word => {
   }
 }
+const threshold = 0.85
 describe('OFAC', function () {
   describe('Matching', function () {
@@ -130,7 +136,9 @@ describe('OFAC', function () {
       this.timeout(0)
       for (const fullName of fullNames) {
-        const matches = ofac.match({firstName: fullName}, null, 1)
+        const matches = ofac.match({firstName: fullName}, null, {
+          threshold,//: 1
+        })
         assert.ok(!_.isEmpty(matches))
       }
     })
@@ -145,7 +153,9 @@ describe('OFAC', function () {
          _.join(' ')
        )(fullName)
-       const matches = ofac.match({firstName: reversed}, null, 1)
+       const matches = ofac.match({firstName: reversed}, null, {
+         threshold,//: 1
+       })
        assert.ok(!_.isEmpty(matches))
      }
    })
@@ -162,13 +172,29 @@ describe('OFAC', function () {
          _.join(' ')
        )(fullName)
-       const matchesA = ofac.match({firstName: lightlyMisspelled}, null, 0.85)
-       if (matchesA.length === 0) { console.log(1, fullName, '|', lightlyMisspelled) }
-       assert.ok(matchesA.length > 0)
+       const matchesA = ofac.match({firstName: lightlyMisspelled}, null, {
+         threshold,//: 0.875
+       })
+       if (_.isEmpty(matchesA)) {
+         console.log(fullName)
+         ofac.match({firstName: lightlyMisspelled}, null, {
+           threshold,//: 0.875,
+           debug: true
+         })
+       }
+       assert.ok(!_.isEmpty(matchesA))
-       const matchesB = ofac.match({firstName: heavilyMisspelled}, null, 0.75)
-       if (matchesB.length === 0) { console.log(2, fullName, '|', heavilyMisspelled) }
-       assert.ok(matchesB.length > 0)
+       const matchesB = ofac.match({firstName: heavilyMisspelled}, null, {
+         threshold: threshold - 0.1,//: 0.75
+       })
+       if (_.isEmpty(matchesB)) {
+         console.log(fullName)
+         ofac.match({firstName: heavilyMisspelled}, null, {
+           threshold: threshold - 0.1,//: 0.75,
+           debug: true
+         })
+       }
+       assert.ok(!_.isEmpty(matchesB))
     }
   })
@@ -183,8 +209,16 @@ describe('OFAC', function () {
         continue
       }
-      const matches = ofac.match({firstName: transcribed}, null, 0.85)
-      if (matches.length === 0) { console.log(fullName, '|', transcribed) }
+      const matches = ofac.match({firstName: transcribed}, null, {
+        threshold,//: 0.85
+      })
+      if (_.isEmpty(matches)) {
+        console.log(fullName)
+        ofac.match({firstName: transcribed}, null, {
+          threshold,//: 0.85,
+          debug: true
+        })
+      }
       assert.ok(!_.isEmpty(matches))
     }
   })
@@ -204,7 +238,9 @@ describe('OFAC', function () {
      ))
      for (const fullName of fullNames) {
-       const matches = ofac.match({firstName: fullName}, dateString, 1)
+       const matches = ofac.match({firstName: fullName}, dateString, {
+         threshold,//: 1
+       })
        assert.ok(noMatchesWithBirthDates(matches))
      }
    })
@@ -228,16 +264,42 @@ describe('OFAC', function () {
     for (const lastName of lastNames.slice(0, 100)) {
       for (firstName of firstNamesMale.slice(0, 100)) {
-        const matches = ofac.match({firstName, lastName}, null, 0.85)
+        const matches = ofac.match({firstName, lastName}, null, {
+          threshold,//: 0.875
+        })
+        if (!_.isEmpty(matches)) {
+          ofac.match({firstName, lastName}, null, {
+            threshold,//: 0.875,
+            debug: true
+          })
+        }
         assert.ok(_.isEmpty(matches))
       }
       for (firstName of firstNamesFemale.slice(0, 100)) {
-        const matches = ofac.match({firstName, lastName}, null, 0.85)
+        const matches = ofac.match({firstName, lastName}, null, {
+          threshold,//: 0.875
+        })
+        if (!_.isEmpty(matches)) {
+          ofac.match({firstName, lastName}, null, {
+            threshold,//: 0.875,
+            debug: true
+          })
+        }
         assert.ok(_.isEmpty(matches))
       }
     }
   })
+  it.skip('test', function () {
+    const firstName = 'hian chariapaporn'
+    ofac.match({firstName}, null, {
+      threshold,//: 0.875,
+      debug: true,
+      verboseFor: ['hiran', 'chariapaporn']
+    })
+  })
 })
 })


@@ -134,148 +134,53 @@ const individualB = {id: '11', aliases: [{id: '15',
 }
+const parseIndividuals = source => {
+  const individuals = []
+  return new Promise((resolve, reject) => {
+    parser.parse(source, (err, profile) => {
+      if (err) {
+        reject(err)
+        return
+      }
+      if (!profile) {
+        resolve(individuals)
+        return
+      }
+      individuals.push(profile)
+    })
+  })
+}
 describe('OFAC', function () {
   describe('Parsing', function () {
     // To detect botched downloads
     it('should fail on malformed XML', function () {
       const xml = '<a><b></a>'
-      return makeDataFiles([xml]).then(parser.parse)
+      return makeDataFiles([xml])
+        .then(files => Promise.all(_.map(parseIndividuals, files)))
         .catch(error => {
           assert.ok(error instanceof Error)
-          return true
+          return 'failed'
         })
         .then(ret => {
-          assert.equal(ret, true)
+          assert.equal(ret, 'failed')
        })
     })
-    it('should return the expected structs', function () {
-      const xml = makeXml([individualA])
-      return makeDataFiles([xml]).then(parser.parse)
-        .then(structs => {
-          const {individuals} = structs
-          assert.ok(Array.isArray(individuals))
-          assert.equal(individuals.length, 1)
-          assert.deepEqual(individuals[0], individualA)
-          const {individualsMap} = structs
-          assert.ok(individualsMap instanceof Map)
-          assert.equal(individualsMap.size, 1)
-          assert.ok(individualsMap.has('9'))
-          assert.deepEqual(individualsMap.get('9'), individualA)
-          const {aliasToIndividual} = structs
-          assert.ok(aliasToIndividual instanceof Map)
-          assert.equal(aliasToIndividual.size, 1)
-          assert.ok(aliasToIndividual.has('5'))
-          assert.strictEqual(aliasToIndividual.get('5'), '9')
-          const {phoneticMap} = structs
-          assert.ok(phoneticMap instanceof Map)
-          assert.equal(phoneticMap.size, 3)
-          assert.ok(phoneticMap.has('JN'))
-          assert.deepEqual(phoneticMap.get('JN'), ['5'])
-          assert.ok(phoneticMap.has('AN'))
-          assert.deepEqual(phoneticMap.get('AN'), ['5'])
-          assert.ok(phoneticMap.has('T'))
-          assert.deepEqual(phoneticMap.get('T'), ['5'])
-          const {wordList} = structs
-          assert.ok(Array.isArray(wordList))
-          assert.equal(wordList.length, 2)
-          assert.deepEqual(wordList[0], {value: 'john', aliasIds: ['5']})
-          assert.deepEqual(wordList[1], {value: 'doe', aliasIds: ['5']})
-        })
-    })
-    it('should be able to combine multiple sources', function () {
-      const xmlA = makeXml([individualA])
-      const xmlB = makeXml([individualB])
-      return makeDataFiles([xmlA, xmlB]).then(parser.parse)
-        .then(structs => {
-          const {individuals} = structs
+    it('should return the expected individuals', function () {
+      const xml = makeXml([individualA, individualB])
+      return makeDataFiles([xml])
+        .then(files => Promise.all(_.map(parseIndividuals, files)))
+        .then(([individuals]) => {
           assert.ok(Array.isArray(individuals))
           assert.equal(individuals.length, 2)
-          assert.deepEqual(individuals[0], individualA)
-          assert.deepEqual(individuals[1], individualB)
-          const {individualsMap} = structs
-          assert.ok(individualsMap instanceof Map)
-          assert.equal(individualsMap.size, 2)
-          assert.ok(individualsMap.has('9'))
-          assert.deepEqual(individualsMap.get('9'), individualA)
-          assert.ok(individualsMap.has('11'))
-          assert.deepEqual(individualsMap.get('11'), individualB)
-          const {aliasToIndividual} = structs
-          assert.ok(aliasToIndividual instanceof Map)
-          assert.equal(aliasToIndividual.size, 2)
-          assert.ok(aliasToIndividual.has('5'))
-          assert.strictEqual(aliasToIndividual.get('5'), '9')
-          assert.ok(aliasToIndividual.has('15'))
-          assert.strictEqual(aliasToIndividual.get('15'), '11')
-          const {phoneticMap} = structs
-          assert.ok(phoneticMap instanceof Map)
-          assert.equal(phoneticMap.size, 4)
-          assert.ok(phoneticMap.has('JN'))
-          assert.deepEqual(phoneticMap.get('JN'), ['5', '15'])
-          assert.ok(phoneticMap.has('AN'))
-          assert.deepEqual(phoneticMap.get('AN'), ['5', '15'])
-          assert.ok(phoneticMap.has('T'))
-          assert.deepEqual(phoneticMap.get('T'), ['5', '15'])
-          const {wordList} = structs
-          assert.ok(Array.isArray(wordList))
-          assert.equal(wordList.length, 4)
-          assert.deepEqual(wordList[0], {value: 'john', aliasIds: ['5', '15']})
-          assert.deepEqual(wordList[1], {value: 'doe', aliasIds: ['5']})
-          assert.deepEqual(wordList[2], {value: 'de', aliasIds: ['15']})
-          assert.deepEqual(wordList[3], {value: 'gaul', aliasIds: ['15']})
+          assert.deepEqual(individuals, [individualA, individualB])
         })
     })
-    it('should remove duplicates from multiple sources', function () {
-      const xmlA1 = makeXml([individualA, individualA])
-      const xmlA2 = makeXml([individualA])
-      return makeDataFiles([xmlA1, xmlA2]).then(parser.parse)
-        .then(structs => {
-          const {individuals} = structs
-          assert.ok(Array.isArray(individuals))
-          assert.equal(individuals.length, 1)
-          assert.deepEqual(individuals[0], individualA)
-          const {individualsMap} = structs
-          assert.ok(individualsMap instanceof Map)
-          assert.equal(individualsMap.size, 1)
-          assert.ok(individualsMap.has('9'))
-          assert.deepEqual(individualsMap.get('9'), individualA)
-          const {aliasToIndividual} = structs
-          assert.ok(aliasToIndividual instanceof Map)
-          assert.equal(aliasToIndividual.size, 1)
-          assert.ok(aliasToIndividual.has('5'))
-          assert.strictEqual(aliasToIndividual.get('5'), '9')
-          const {phoneticMap} = structs
-          assert.ok(phoneticMap instanceof Map)
-          assert.equal(phoneticMap.size, 3)
-          assert.ok(phoneticMap.has('JN'))
-          assert.deepEqual(phoneticMap.get('JN'), ['5'])
-          assert.ok(phoneticMap.has('AN'))
-          assert.deepEqual(phoneticMap.get('AN'), ['5'])
-          assert.ok(phoneticMap.has('T'))
-          assert.deepEqual(phoneticMap.get('T'), ['5'])
-          const {wordList} = structs
-          assert.ok(Array.isArray(wordList))
-          assert.equal(wordList.length, 2)
-          assert.deepEqual(wordList[0], {value: 'john', aliasIds: ['5']})
-          assert.deepEqual(wordList[1], {value: 'doe', aliasIds: ['5']})
-        })
-    })
   })
 })