Split into files. Worked on style.

This commit is contained in:
Konstantin Mamalakis 2018-02-22 01:17:22 +02:00 committed by Josh Harvey
parent c4307cb749
commit a3f8db79b3
4 changed files with 238 additions and 208 deletions

171
lib/ofac/data-parser.js Normal file
View file

@ -0,0 +1,171 @@
const fs = require('fs')
const path = require('path')
const util = require('util')
const XmlStream = require('xml-stream')
const nameUtils = require('./name-utils')
const options = require('../options')
const _ = require('lodash/fp')
const debug_log = require('./debug') // KOSTIS TODO: remove
const OFAC_DATA_DIR = options.ofacDataDir
// KOSTIS TODO: get these from the document itself
const INDIVIDUAL = '4'
const NAME = '1403'
const BIRTH_DATE = '8'
const PRIMARY_LATIN = '1'
const LAST_NAME = '1520'
const FIRST_NAME = '1521'
const MIDDLE_NAME = '1522'
const MAIDEN_NAME = '1523'
const PATRONYMIC = '91708'
const MATRONYMIC = '91709'
const NICKNAME = '1528'
const partNames = new Map([
[LAST_NAME, 'lastName'],
[FIRST_NAME, 'firstName'],
[MIDDLE_NAME, 'middleName'],
[MAIDEN_NAME, 'maidenName'],
[PATRONYMIC, 'patronymic'],
[MATRONYMIC, 'matronymic'],
[NICKNAME, 'nickname']
])
// group-id to type-id
function processMasterNamePartGroup (groupNode) {
const namePartGroupNode = groupNode.NamePartGroup
const groupId = namePartGroupNode.$.ID
const typeId = namePartGroupNode.$.NamePartTypeID
return [groupId, typeId]
}
const processDocumentedNamePart = _.curry((groupTypes, namePartNode) => {
const valueNode = namePartNode.NamePartValue
const groupId = valueNode.$.NamePartGroupID
const typeId = groupTypes.get(groupId)
const partName = partNames.get(typeId)
const value = _.lowerCase(valueNode.$text)
return {[partName]: value}
})
const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN)
const processAlias = _.curry((groupTypes, aliasNode) => {
if (aliasNode.$.AliasTypeID !== NAME) return
const getNamePart = processDocumentedNamePart(groupTypes)
const latinNameNode = _.find(isLatin, aliasNode.DocumentedName)
const namePartNodes = latinNameNode.DocumentedNamePart
const nameParts = _.map(getNamePart, namePartNodes)
const parts = _.assignAll(nameParts)
const fullName = nameUtils.fullNameFromParts(parts)
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
const phoneticFullName = nameUtils.phonetic(fullName)
return {parts, fullName, phoneticParts, phoneticFullName}
})
// birth date
function processDate (dateNode) {
const year = parseInt(dateNode.Year)
const month = parseInt(dateNode.Month)
const day = parseInt(dateNode.Day)
const date = new Date(year, month - 1, day)
return {year, month, day, date}
}
function processFeature (featureNode) {
if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return
const datePeriodNode = featureNode.FeatureVersion.DatePeriod
// Ignore the fact that both Start and end can be a range.
// By using Start.From and End.To we use the extremes of the date-period.
const period = {
start: datePeriodNode.Start.From,
end: datePeriodNode.End.To
}
return _.mapValues(processDate, period)
}
// profile
function processProfile (profileNode) {
if (profileNode.$.PartySubTypeID !== INDIVIDUAL) return
const id = profileNode.$.ID
const identityNode = profileNode.Identity
const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
const groupTypes = new Map(groupTypesEntries)
const getNameParts = processAlias(groupTypes)
const aliases = _.compact(_.map(getNameParts, identityNode.Alias))
const birthDatePeriods = _.compact(_.map(processFeature, profileNode.Feature))
const individual = {aliases, birthDatePeriods}
debug_log(individual)
return [id, individual]
}
function promiseParseDocument (source) {
return new Promise((resolve, reject) => {
const fileName = path.join(OFAC_DATA_DIR, source)
const stream = fs.createReadStream(fileName)
const xml = new XmlStream(stream)
xml.on('error', err => {
xml.pause()
const message = `Error while parsing OFAC data source file (${source}): ${err.message()}`
reject(new Error(message))
})
xml.collect('Alias')
xml.collect('DocumentedName')
xml.collect('DocumentedNamePart')
xml.collect('Feature')
xml.collect('MasterNamePartGroup')
const individuals = []
const collectResult = result => individuals.push(result)
xml.on('updateElement: Profile', _.flow(processProfile, collectResult))
xml.on('end', _.wrap(resolve, individuals))
})
}
const readdir = util.promisify(fs.readdir)
const combineAndDedupe = _.flow(
_.flatten,
_.compact,
_.uniqBy(_.nth(0)),
_.map(_.nth(1))
)
function parseList () {
// NOTE: Not sure how you push code updates to existing clients. This problem
// might pop up if new code is pushed, without re-doing setup.
if (!OFAC_DATA_DIR) {
const message = 'The ofacDataDir option has not been set in lamassu.json'
return Promise.reject(new Error(message))
}
return readdir(OFAC_DATA_DIR)
.then(sources => {
const promises = _.map(promiseParseDocument, sources)
return Promise.all(promises)
})
.then(combineAndDedupe)
}
module.exports = {parseList}

1
lib/ofac/debug.js Normal file
View file

@ -0,0 +1 @@
module.exports = (...args) => console.log(require('util').inspect(args, {depth: null, colors: true}))

View file

@ -1,203 +1,39 @@
const fs = require('fs')
const path = require('path')
const XmlStream = require('xml-stream')
const dataParser = require('./data-paraser')
const nameUtils = require('./name-utils')
const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
const metaphone = require('talisman/phonetics/metaphone')
const options = require('../options')
const logger = require('../logger')
const _ = require('lodash/fp')
// PARSING
const debug_log = require('./debug') // KOSTIS TODO: remove
const OFAC_DATA_DIR = options.ofacDataDir
// TODO: get these from the document itself
const INDIVIDUAL = '4'
const NAME = '1403'
const BIRTH_DATE = '8'
const LAST_NAME = '1520'
const FIRST_NAME = '1521'
const MIDDLE_NAME = '1522'
const MAIDEN_NAME = '1523'
const PATRONYMIC = '91708'
const MATRONYMIC = '91709'
const NICKNAME = '1528'
const partNames = new Map([
[LAST_NAME, 'lastName'],
[FIRST_NAME, 'firstName'],
[MIDDLE_NAME, 'middleName'],
[MAIDEN_NAME, 'maidenName'],
[PATRONYMIC, 'patronymic'],
[MATRONYMIC, 'matronymic'],
[NICKNAME, 'nickname']
])
// TODO: get this from admin configuration
const SIMILARITY_THRESHOLD = 0.5
// TODO: remove
const debug_log = (...args) => console.log(require('util').inspect(args, {depth: null, colors: true}))
let individuals = []
const individualsById = new Map()
// group-id to type-id
function processMasterNamePartGroup (groupNode) {
const namePartGroupNode = groupNode.NamePartGroup
const groupId = namePartGroupNode.$.ID
const typeId = namePartGroupNode.$.NamePartTypeID
return [groupId, typeId]
}
// name parts
function makeFullNameFromParts (nameParts) {
// Combine name-parts in a standared order.
const namePartPairs = _.toPairs(nameParts)
const sortedPairs = _.sortBy(_.nth(0), namePartPairs)
return _.map(_.nth(1), sortedPairs).join(' ')
}
function makePhonetic (name) {
return metaphone(name)
}
function processDocumentedNamePart (groupTypes) {
return function (namePartNode) {
const valueNode = namePartNode.NamePartValue
const groupId = valueNode.$.NamePartGroupID
const typeId = groupTypes.get(groupId)
const partName = partNames.get(typeId)
const value = valueNode.$text
return {[partName]: value}
}
}
function processAlias (groupTypes) {
return function (aliasNode) {
if (aliasNode.$.AliasTypeID !== NAME) return
const nameParts = _.map(processDocumentedNamePart(groupTypes), aliasNode.DocumentedName.DocumentedNamePart)
const parts = _.assignAll(nameParts)
const fullName = makeFullNameFromParts(parts)
const phoneticParts = _.mapValues(makePhonetic, parts)
const phoneticFullName = makePhonetic(fullName)
return {parts, fullName, phoneticParts, phoneticFullName}
}
}
// birth date
function processDate (dateNode) {
const year = parseInt(dateNode.Year)
const month = parseInt(dateNode.Month)
const day = parseInt(dateNode.Day)
const date = new Date(year, month - 1, day)
return {year, month, day, date}
}
function processFeature (featureNode) {
if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return
const datePeriodNode = featureNode.FeatureVersion.DatePeriod
// Ignore the fact that both Start and end can be a range.
// By using Start.From and End.To we use the extremes of the date-period.
const period = {
start: datePeriodNode.Start.From,
end: datePeriodNode.End.To
}
return _.mapValues(processDate, period)
}
// profile
function processProfile (profileNode) {
if (profileNode.$.PartySubTypeID !== INDIVIDUAL) return
const id = profileNode.$.ID
const identityNode = profileNode.Identity
const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
const groupTypes = new Map(groupTypesEntries)
const aliases = _.compact(_.map(processAlias(groupTypes), identityNode.Alias))
const birthDatePeriods = _.compact(_.map(processFeature, profileNode.Feature))
const individual = {aliases, birthDatePeriods}
individualsById.set(id, individual)
debug_log(individual)
}
function promiseParseDocument (source) {
return new Promise(resolve => {
const fileName = path.join(OFAC_DATA_DIR, source)
const stream = fs.createReadStream(fileName)
const xml = new XmlStream(stream)
xml.on('error', error => {
logger.error('Error while parsing the OFAC data sources.')
logger.error(error)
xml.pause()
resolve()
})
xml.collect('Alias')
xml.collect('DocumentedNamePart')
xml.collect('Feature')
xml.collect('MasterNamePartGroup')
xml.on('updateElement: Profile', processProfile)
xml.on('end', resolve)
})
}
const individuals = []
function load () {
// NOTE: Not sure how you push code updates to existing clients. This problem
// might pop up if new code is pushed, without re-doing setup.
if (!OFAC_DATA_DIR) {
logger.error('The ofacDataDir option has not been set in lamassu.json')
return
}
individualsById.clear()
const sources = fs.readdirSync(OFAC_DATA_DIR)
const promises = _.map(promiseParseDocument, sources)
return Promise.all(promises)
.then(() => {
individuals = Array.from(individualsById.values())
})
const newList = Array.from(dataParser.parseList())
const oldLength = individuals.length
individuals.splice(0, oldLength, newList)
}
load()
// MATCHING
// birth date
function isDateWithinTwoYearsOfPeriod (targetDate) {
return function (period) {
function isDateWithinSomeYearsOfPeriod (period, date, years) {
const startDate = new Date(period.from.date)
const startYear = startDate.getFullYear()
startDate.setFullYear(startYear - 2)
startDate.setFullYear(startYear - years)
const endDate = new Date(period.to.date)
const endYear = endDate.getFullYear()
endDate.setFullYear(endYear + 2)
endDate.setFullYear(endYear + years)
return (startDate < targetDate && targetDate < endDate)
}
return (startDate < date && date < endDate)
}
function isBornWithinTwoYears (individual, dateObject) {
return _.some(isDateWithinTwoYearsOfPeriod(dateObject.date), individual.birthDatePeriods)
const isWithinTwoYears = _.partialRight(isDateWithinSomeYearsOfPeriod, [dateObject.date, 2])
return _.some(isWithinTwoYears, individual.birthDatePeriods)
}
// exact match
@ -221,36 +57,30 @@ function calcPhoneticMatchScore (candidatePhoneticFullName) {
// NOTE: I'm still not 100% on what matching algorithm is the best choice.
// I just experiment with a few metrics for now.
function doesMatch (nameParts, fullName, phoneticParts, phoneticFullName, birthDate) {
return function (individual) {
const similarity = _.curry((candidate, individual) => {
// Calculate if his birth date is within two years of the given date.
// If an individual has multiple birth-date periods, return wether any are
// within two years. Reject individuals who don't match this criterion.
if (individual.birthDatePeriods.length && !isBornWithinTwoYears(individual, birthDate)) return false
if (individual.birthDatePeriods.length && !isBornWithinTwoYears(individual, candidate.birthDate)) return 0
// Calculate the Jaro-Winkler similarity of the full name.
// If an individual has multiple aliases, use the maximum score.
const exactMatchScore = _.max(_.map(calcExactMatchScore(fullName), individual.aliases))
if (exactMatchScore > SIMILARITY_THRESHOLD) return true
const exactMatchScore = _.max(_.map(calcExactMatchScore(candidate.fullName), individual.aliases))
// Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
// This should approximate the phonetic similarity of the two names.
// If an individual has multiple aliases, use the maximum score.
const phoneticMatchScore = _.max(_.map(calcPhoneticMatchScore(phoneticFullName), individual.aliases))
const phoneticMatchScore = _.max(_.map(calcPhoneticMatchScore(candidate.phoneticFullName), individual.aliases))
if (phoneticMatchScore > SIMILARITY_THRESHOLD) return true
return _.max([exactMatchScore, phoneticMatchScore])
})
return false
}
}
function match (nameParts, birthDateString) {
function match (parts, birthDateString) {
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
const fullName = makeFullNameFromParts(nameParts)
const fullName = nameUtils.fullNameFromParts(parts)
const phoneticParts = _.mapValues(makePhonetic, nameParts)
const phoneticFullName = makePhonetic(fullName)
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
const phoneticFullName = nameUtils.phonetic(fullName)
// birthDateString is in YYYYMMDD format
const year = parseInt(birthDateString.slice(0, 4))
@ -260,7 +90,10 @@ function match (nameParts, birthDateString) {
const birthDate = {year, month, day, date}
return _.some(doesMatch(nameParts, fullName, phoneticParts, phoneticFullName, birthDate), individuals)
const candidate = {parts, fullName, phoneticParts, phoneticFullName, birthDate}
const similarToCandidate = similarity(candidate)
return _.max(similarToCandidate, individuals)
}
module.exports = {load, match}

25
lib/ofac/name-utils.js Normal file
View file

@ -0,0 +1,25 @@
const metaphone = require('talisman/phonetics/metaphone')
const doubleMetaphone = require('talisman/phonetics/double-metaphone')
const _ = require('lodash/fp')
// KOSTIS TODO: Decide on a method. Remove the others
const phoneticMethod1 = metaphone
const phoneticMethod2 = _.flow(doubleMetaphone, _.uniq)
const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2))
// Combine name-parts in a standared order.
const fullNameFromParts = _.flow(
_.toPairs,
_.sortBy(_.nth(0)), // sort by part name,
_.map(_.nth(1)), // get part value
_.join(' ')
)
module.exports = {
fullNameFromParts,
phonetic: phoneticMethod3
}