Refactored for testing

This commit is contained in:
Konstantin Mamalakis 2018-03-02 17:01:39 +02:00 committed by Josh Harvey
parent 1d0aff07fe
commit 2232340f6f
7 changed files with 214 additions and 145 deletions

View file

@ -1,45 +1,38 @@
const dataParser = require('./data-parser')
const fs = require('fs')
const path = require('path')
const util = require('util')
const parser = require('./parsing')
const matcher = require('./matching')
const nameUtils = require('./name-utils')
const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
const options = require('../options')
const _ = require('lodash/fp')
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
const OFAC_DATA_DIR = options.ofacDataDir
let structs = null
const readdir = util.promisify(fs.readdir)
/**
 * Reads the configured OFAC data directory, parses every file found in it and
 * caches the parsed result in the module-level `structs` variable.
 *
 * @returns {Promise<void>} Resolves once `structs` has been assigned. Rejects
 *   with an Error when the `ofacDataDir` option is not set, or with whatever
 *   error the directory listing or the parser rejects with.
 */
function load () {
  // NOTE: Not sure how you push code updates to existing clients. This problem
  // might pop up if new code is pushed, without re-doing setup.
  if (!OFAC_DATA_DIR) {
    const message = 'The ofacDataDir option has not been set in lamassu.json'
    return Promise.reject(new Error(message))
  }

  // readdir yields bare file names; the parser is handed full paths, so each
  // name is joined with the data directory first.
  return readdir(OFAC_DATA_DIR)
    .then(_.flow(
      _.map(file => path.join(OFAC_DATA_DIR, file)),
      parser.parse
    ))
    .then(result => {
      structs = result
    })
}
// MATCHING
// similarity algorithm
// Curried wrapper around the Jaro-Winkler function imported from talisman, so
// that one string can be fixed up front and the other supplied later (e.g. as
// a step of a `_.flow` pipeline).
const stringSimilarity = _.curry(jaroWinkler)
// birth date
/**
 * Tells whether `date` lies strictly inside `period` after the period has been
 * widened by `days` days on both ends.
 *
 * @param {{start: {date: Date}, end: {date: Date}}} period - Period to test against.
 * @param {Date} date - The date being checked.
 * @param {number} days - How many days to extend the period by on each side.
 * @returns {boolean} true when `date` is after the widened start and before
 *   the widened end (both bounds are exclusive).
 */
function isDateWithinSomeDaysOfPeriod (period, date, days) {
  const msPerDay = 24 * 60 * 60 * 1000
  const margin = days * msPerDay

  const lowerBound = new Date(period.start.date.getTime() - margin)
  const upperBound = new Date(period.end.date.getTime() + margin)

  if (!(lowerBound < date)) return false
  return date < upperBound
}
// Curried predicate: true when the individual has recorded birth-date periods
// and the given date falls outside every one of them, even after each period
// is widened by `days` days. An individual without birth-date periods is never
// reported as too far off.
const isBornTooLongSince = _.curry((days, dateObject, individual) => {
  const periods = individual.birthDatePeriods
  if (_.isEmpty(periods)) return false

  const date = dateObject.date
  const isCloseToPeriod = period => isDateWithinSomeDaysOfPeriod(period, date, days)
  return !_.some(isCloseToPeriod, periods)
})
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
function makeCompatible (nameParts) {
@ -49,8 +42,6 @@ function makeCompatible (nameParts) {
return _.map(_.zipObject(['partName', 'value']), props)
}
// algorithm
function match (nameParts, birthDateString, threshold) {
if (!structs) {
const message = 'The OFAC data sources have not been loaded yet.'
@ -64,7 +55,7 @@ function match (nameParts, birthDateString, threshold) {
const words = nameUtils.makeWords(fullName)
const wordValues = _.map(_.get('value'), words)
const wordPhonetics = _.flatten(_.map(_.get('phonetics'), words))
const wordPhonetics = _.flatMap(_.get('phonetics'), words)
// birthDateString is in YYYYMMDD format
const year = parseInt(birthDateString.slice(0, 4))
@ -74,72 +65,10 @@ function match (nameParts, birthDateString, threshold) {
const birthDate = {year, month, day, date}
debug_log({parts, fullName, wordValues, wordPhonetics, birthDate})
// Start matching
// Accept aliases whose full name matches.
const doesNameMatch = _.flow(
_.get('fullName'),
stringSimilarity(fullName),
_.lte(threshold)
)
const aliases = _.flatMap(_.get('aliases'), structs.individuals)
const aliasIdsFromFullName = _.flow(
_.filter(doesNameMatch),
_.map(_.get('id'))
)(aliases)
// Gather aliases whose name-parts match phonetically.
const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
const phoneticMatches = _.flow(
_.map(getPhoneticMatches),
_.compact,
_.flatten
)(wordPhonetics)
// Gather aliases whose name-parts match alphabetically.
const getStringMatches = value => {
const entryMatches = entry => (jaroWinkler(value, entry.value) >= threshold)
return _.filter(entryMatches, structs.wordList)
}
const getSingleEntries = wordEntry => {
const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
return _.map(makeEntry, wordEntry.aliasIds)
}
const stringMatches = _.flow(
_.map(getStringMatches),
_.flatten,
_.map(getSingleEntries),
_.flatten
)(wordValues)
// At least two name-parts must match per alias
const aliasIdsFromNamePart = _.flow(
_.uniqWith(_.isEqual),
_.map(_.get('aliasId')),
_.countBy(_.identity),
_.toPairs,
_.filter(_.flow(_.last, _.lte(2))),
_.map(_.first)
)([...phoneticMatches, ...stringMatches])
// Get the full record for each matched id
const getIndividual = aliasId => {
const individualId = structs.aliasToIndividual.get(aliasId)
return structs.individualsMap.get(individualId)
}
const suspects = _.uniq(_.map(getIndividual, [
...aliasIdsFromFullName,
...aliasIdsFromNamePart
]))
// Reject everyone who is born two years away.
const twoYears = 365 * 2
const unqualified = isBornTooLongSince(twoYears, birthDate)
const result = _.reject(unqualified, suspects)
const candidate = {parts, fullName, wordValues, wordPhonetics, birthDate}
debug_log(candidate)
const result = matcher.match(structs, candidate, threshold)
debug_log(result)
return result
}

94
lib/ofac/matching.js Normal file
View file

@ -0,0 +1,94 @@
const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
const _ = require('lodash/fp')
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
// Curried wrapper around the Jaro-Winkler function imported from talisman, so
// that one string can be fixed up front and the other supplied later (e.g. as
// a step of a `_.flow` pipeline).
const stringSimilarity = _.curry(jaroWinkler)
// birth date
/**
 * Tells whether `date` lies strictly inside `period` after the period has been
 * widened by `days` days on both ends.
 *
 * @param {{start: {date: Date}, end: {date: Date}}} period - Period to test against.
 * @param {Date} date - The date being checked.
 * @param {number} days - How many days to extend the period by on each side.
 * @returns {boolean} true when `date` is after the widened start and before
 *   the widened end (both bounds are exclusive).
 */
function isDateWithinSomeDaysOfPeriod (period, date, days) {
  const msPerDay = 24 * 60 * 60 * 1000
  const margin = days * msPerDay

  const lowerBound = new Date(period.start.date.getTime() - margin)
  const upperBound = new Date(period.end.date.getTime() + margin)

  if (!(lowerBound < date)) return false
  return date < upperBound
}
// Curried predicate: true when the individual has recorded birth-date periods
// and the given date falls outside every one of them, even after each period
// is widened by `days` days. An individual without birth-date periods is never
// reported as too far off.
const isBornTooLongSince = _.curry((days, dateObject, individual) => {
  const periods = individual.birthDatePeriods
  if (_.isEmpty(periods)) return false

  const date = dateObject.date
  const isCloseToPeriod = period => isDateWithinSomeDaysOfPeriod(period, date, days)
  return !_.some(isCloseToPeriod, periods)
})
// algorithm
/**
 * Looks up the individuals in `structs` that plausibly correspond to a
 * candidate person.
 *
 * An alias is selected either because its full name is similar enough to the
 * candidate's full name, or because at least two distinct name-part hits
 * (phonetic or by spelling) point at it. The individuals owning the selected
 * aliases are then filtered by birth date.
 *
 * @param {object} structs - Lookup structures; this function reads
 *   `individuals`, `individualsMap`, `aliasToIndividual`, `phoneticMap` and
 *   `wordList` from it.
 * @param {object} candidate - Reads `fullName`, `wordPhonetics`, `wordValues`
 *   and `birthDate` from it.
 * @param {number} threshold - Minimum similarity score for a name or a
 *   name-part to count as a hit.
 * @returns {Array} The matched individuals, without duplicates.
 */
function match (structs, candidate, threshold) {
  const {fullName, wordPhonetics, wordValues, birthDate} = candidate

  // Accept aliases whose full name matches.
  const similarityToCandidate = stringSimilarity(fullName)
  const isFullNameHit = alias =>
    _.lte(threshold, similarityToCandidate(_.get('fullName', alias)))
  const allAliases = _.flatMap(_.get('aliases'), structs.individuals)
  const fullNameAliasIds = _.map(_.get('id'), _.filter(isFullNameHit, allAliases))

  // Gather aliases whose name-parts match phonetically. Phonetic codes that
  // are absent from the map yield undefined and are dropped by `_.compact`.
  const lookupPhonetic = phonetic => structs.phoneticMap.get(phonetic)
  const phoneticHits = _.flatten(_.compact(_.map(lookupPhonetic, wordPhonetics)))

  // Gather aliases whose name-parts match alphabetically.
  const similarWordEntries = _.flatMap(
    word => _.filter(
      entry => jaroWinkler(word, entry.value) >= threshold,
      structs.wordList
    ),
    wordValues
  )
  // Expand every word entry into one {value, aliasId} record per alias id.
  const spellingHits = _.flatMap(
    ({value, aliasIds}) => _.map(aliasId => ({value, aliasId}), aliasIds),
    similarWordEntries
  )

  // At least two name-parts must match per alias. Identical hits are collapsed
  // first so the same hit found twice is only counted once.
  const distinctHits = _.uniqWith(_.isEqual, [...phoneticHits, ...spellingHits])
  const hitsPerAlias = _.countBy(_.identity, _.map(_.get('aliasId'), distinctHits))
  const namePartAliasIds = _.flow(
    _.toPairs,
    _.filter(([, count]) => count >= 2),
    _.map(_.first)
  )(hitsPerAlias)

  // Get the full record for each matched id
  const toIndividual = aliasId =>
    structs.individualsMap.get(structs.aliasToIndividual.get(aliasId))
  const suspects = _.uniq(_.map(toIndividual, [
    ...fullNameAliasIds,
    ...namePartAliasIds
  ]))

  // Reject everyone who is born two years away.
  const maxDaysApart = 365 * 2
  return _.reject(isBornTooLongSince(maxDaysApart, birthDate), suspects)
}
module.exports = {match}

View file

View file

@ -1,16 +1,11 @@
const fs = require('fs')
const path = require('path')
const util = require('util')
const XmlStream = require('xml-stream')
const nameUtils = require('./name-utils')
const options = require('../options')
const logger = require('../logger')
const _ = require('lodash/fp')
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
const OFAC_DATA_DIR = options.ofacDataDir
// KOSTIS TODO: get these from the document itself
const INDIVIDUAL = '4'
const NAME = '1403'
@ -57,6 +52,7 @@ const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN)
const processAlias = _.curry((groupTypes, aliasNode) => {
if (aliasNode.$.AliasTypeID !== NAME) return
if (aliasNode.$.LowQuality === 'true') return
const getNamePart = processDocumentedNamePart(groupTypes)
const latinNameNode = _.find(isLatin, aliasNode.DocumentedName)
@ -131,8 +127,7 @@ function processProfile (profileNode) {
function promiseParseDocument (source) {
return new Promise((resolve, reject) => {
const fileName = path.join(OFAC_DATA_DIR, source)
const stream = fs.createReadStream(fileName)
const stream = fs.createReadStream(source)
const xml = new XmlStream(stream)
xml.on('error', err => {
@ -156,8 +151,6 @@ function promiseParseDocument (source) {
})
}
const readdir = util.promisify(fs.readdir)
const mapAliases = _.curry((iteratee, individuals) => {
const mapIndividual = individual => {
const {id, aliases} = individual
@ -200,46 +193,36 @@ const produceWordList = _.flow(
_.map(_.zipObject(['value', 'aliasIds']))
)
const combineAndDedupe = _.flow(
_.flatten,
_.compact,
_.uniqBy(_.get('id')),
individuals => {
const individualsMap = _.flow(
_.groupBy(_.get('id')),
_.mapValues(_.first),
_.toPairs,
entries => new Map(entries)
)(individuals)
function parse (sources) {
return Promise.all(_.map(promiseParseDocument, sources))
.then(_.flow(
_.flatten,
_.compact,
_.uniqBy(_.get('id')),
individuals => {
const individualsMap = _.flow(
_.groupBy(_.get('id')),
_.mapValues(_.first),
_.toPairs,
entries => new Map(entries)
)(individuals)
const getIdPairs = (individualId, alias) => [alias.id, individualId]
const idPairs = mapAliases(getIdPairs, individuals)
const aliasToIndividual = new Map(idPairs)
const getIdPairs = (individualId, alias) => [alias.id, individualId]
const idPairs = mapAliases(getIdPairs, individuals)
const aliasToIndividual = new Map(idPairs)
const phoneticMap = producePhoneticMap(individuals)
const wordList = produceWordList(individuals)
const phoneticMap = producePhoneticMap(individuals)
const wordList = produceWordList(individuals)
return {
individuals,
individualsMap,
aliasToIndividual,
phoneticMap,
wordList
return {
individuals,
individualsMap,
aliasToIndividual,
phoneticMap,
wordList
}
}
}
)
function produceStructs () {
// NOTE: Not sure how you push code updates to existing clients. This problem
// might pop up if new code is pushed, without re-doing setup.
if (!OFAC_DATA_DIR) {
const message = 'The ofacDataDir option has not been set in lamassu.json'
return Promise.reject(new Error(message))
}
return readdir(OFAC_DATA_DIR)
.then(sources => Promise.all(_.map(promiseParseDocument, sources)))
.then(combineAndDedupe)
))
}
module.exports = {produceStructs}
module.exports = {parse}

View file