diff --git a/lib/ofac/data-parser.js b/lib/ofac/data-parser.js
new file mode 100644
index 00000000..2363e95f
--- /dev/null
+++ b/lib/ofac/data-parser.js
@@ -0,0 +1,194 @@
+const fs = require('fs')
+const path = require('path')
+const util = require('util')
+const XmlStream = require('xml-stream')
+const nameUtils = require('./name-utils')
+const options = require('../options')
+const _ = require('lodash/fp')
+
+const debug_log = require('./debug') // KOSTIS TODO: remove
+
+const OFAC_DATA_DIR = options.ofacDataDir
+
+// KOSTIS TODO: get these from the document itself
+const INDIVIDUAL = '4'
+const NAME = '1403'
+const BIRTH_DATE = '8'
+const PRIMARY_LATIN = '1'
+
+const LAST_NAME = '1520'
+const FIRST_NAME = '1521'
+const MIDDLE_NAME = '1522'
+const MAIDEN_NAME = '1523'
+const PATRONYMIC = '91708'
+const MATRONYMIC = '91709'
+const NICKNAME = '1528'
+
+const partNames = new Map([
+  [LAST_NAME, 'lastName'],
+  [FIRST_NAME, 'firstName'],
+  [MIDDLE_NAME, 'middleName'],
+  [MAIDEN_NAME, 'maidenName'],
+  [PATRONYMIC, 'patronymic'],
+  [MATRONYMIC, 'matronymic'],
+  [NICKNAME, 'nickname']
+])
+
+// group-id to type-id
+
+// Maps a MasterNamePartGroup node to a [groupId, typeId] entry.
+function processMasterNamePartGroup (groupNode) {
+  const namePartGroupNode = groupNode.NamePartGroup
+  const groupId = namePartGroupNode.$.ID
+  const typeId = namePartGroupNode.$.NamePartTypeID
+  return [groupId, typeId]
+}
+
+// Turns a DocumentedNamePart node into a single-key object, e.g.
+// {lastName: 'doe'}. groupTypes is the group-id to type-id Map of the profile.
+const processDocumentedNamePart = _.curry((groupTypes, namePartNode) => {
+  const valueNode = namePartNode.NamePartValue
+  const groupId = valueNode.$.NamePartGroupID
+  const typeId = groupTypes.get(groupId)
+  const partName = partNames.get(typeId)
+  const value = _.lowerCase(valueNode.$text)
+  return {[partName]: value}
+})
+
+// Matches DocumentedName nodes whose DocNameStatusID is PRIMARY_LATIN.
+const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN)
+
+// Builds {parts, fullName, phoneticParts, phoneticFullName} for an Alias node.
+// Returns undefined (dropped later by _.compact) for aliases that are not of
+// type NAME or that carry no primary latin name.
+const processAlias = _.curry((groupTypes, aliasNode) => {
+  if (aliasNode.$.AliasTypeID !== NAME) return
+
+  const getNamePart = processDocumentedNamePart(groupTypes)
+  const latinNameNode = _.find(isLatin, aliasNode.DocumentedName)
+  // Without this guard a single alias lacking a latin name throws a TypeError
+  // from inside the XML event handler.
+  if (!latinNameNode) return
+
+  const namePartNodes = latinNameNode.DocumentedNamePart
+  const nameParts = _.map(getNamePart, namePartNodes)
+
+  const parts = _.assignAll(nameParts)
+  const fullName = nameUtils.fullNameFromParts(parts)
+
+  const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
+  const phoneticFullName = nameUtils.phonetic(fullName)
+
+  return {parts, fullName, phoneticParts, phoneticFullName}
+})
+
+// birth date
+
+// Converts a date node with Year, Month and Day into {year, month, day, date}.
+function processDate (dateNode) {
+  const year = parseInt(dateNode.Year, 10)
+  const month = parseInt(dateNode.Month, 10)
+  const day = parseInt(dateNode.Day, 10)
+  const date = new Date(year, month - 1, day) // Date months are zero-based
+
+  return {year, month, day, date}
+}
+
+// Returns the {start, end} birth-date period of a Feature node, or undefined
+// when the feature is not a birth date.
+function processFeature (featureNode) {
+  if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return
+
+  const datePeriodNode = featureNode.FeatureVersion.DatePeriod
+  // Ignore the fact that both Start and end can be a range.
+  // By using Start.From and End.To we use the extremes of the date-period.
+  const period = {
+    start: datePeriodNode.Start.From,
+    end: datePeriodNode.End.To
+  }
+
+  return _.mapValues(processDate, period)
+}
+
+// profile
+
+// Returns [id, {aliases, birthDatePeriods}] for a Profile node, or undefined
+// when the profile is not an individual.
+function processProfile (profileNode) {
+  if (profileNode.$.PartySubTypeID !== INDIVIDUAL) return
+
+  const id = profileNode.$.ID
+
+  const identityNode = profileNode.Identity
+  const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
+  const groupTypes = new Map(groupTypesEntries)
+
+  const getNameParts = processAlias(groupTypes)
+  const aliases = _.compact(_.map(getNameParts, identityNode.Alias))
+  const birthDatePeriods = _.compact(_.map(processFeature, profileNode.Feature))
+  const individual = {aliases, birthDatePeriods}
+
+  debug_log(individual)
+  return [id, individual]
+}
+
+// Streams one XML source file from OFAC_DATA_DIR and resolves with the list of
+// processProfile results (undefined entries included). Rejects on parse errors.
+function promiseParseDocument (source) {
+  return new Promise((resolve, reject) => {
+    const fileName = path.join(OFAC_DATA_DIR, source)
+    const stream = fs.createReadStream(fileName)
+    const xml = new XmlStream(stream)
+
+    xml.on('error', err => {
+      xml.pause()
+      // err.message is a string property, not a method.
+      const message = `Error while parsing OFAC data source file (${source}): ${err.message}`
+      reject(new Error(message))
+    })
+
+    xml.collect('Alias')
+    xml.collect('DocumentedName')
+    xml.collect('DocumentedNamePart')
+    xml.collect('Feature')
+    xml.collect('MasterNamePartGroup')
+
+    const individuals = []
+    const collectResult = result => individuals.push(result)
+    xml.on('updateElement: Profile', _.flow(processProfile, collectResult))
+
+    xml.on('end', _.wrap(resolve, individuals))
+  })
+}
+
+const readdir = util.promisify(fs.readdir)
+
+// Merges the per-file [id, individual] lists, drops empty entries, keeps the
+// first entry per id and returns the individuals only.
+const combineAndDedupe = _.flow(
+  _.flatten,
+  _.compact,
+  _.uniqBy(_.nth(0)),
+  _.map(_.nth(1))
+)
+
+// Parses every file in OFAC_DATA_DIR. Returns a Promise that resolves with the
+// deduplicated list of individuals and rejects when the directory is not set.
+function parseList () {
+  // NOTE: Not sure how you push code updates to existing clients. This problem
+  // might pop up if new code is pushed, without re-doing setup.
+  if (!OFAC_DATA_DIR) {
+    const message = 'The ofacDataDir option has not been set in lamassu.json'
+    return Promise.reject(new Error(message))
+  }
+
+  return readdir(OFAC_DATA_DIR)
+    .then(sources => {
+      const promises = _.map(promiseParseDocument, sources)
+
+      return Promise.all(promises)
+    })
+    .then(combineAndDedupe)
+}
+
+module.exports = {parseList}
diff --git a/lib/ofac/debug.js b/lib/ofac/debug.js
new file mode 100644
index 00000000..93247731
--- /dev/null
+++ b/lib/ofac/debug.js
@@ -0,0 +1 @@
+module.exports = (...args) => console.log(require('util').inspect(args, {depth: null, colors: true}))
diff --git a/lib/ofac/index.js b/lib/ofac/index.js
index ed6c9f9b..325de342 100644
--- a/lib/ofac/index.js
+++ b/lib/ofac/index.js
@@ -1,203 +1,50 @@
-const fs = require('fs')
-const path = require('path')
-const XmlStream = require('xml-stream')
+const dataParser = require('./data-parser')
+const nameUtils = require('./name-utils')
 const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
-const metaphone = require('talisman/phonetics/metaphone')
-const options = require('../options')
-const logger = require('../logger')
 const _ = require('lodash/fp')
 
-// PARSING
+const debug_log = require('./debug') // KOSTIS TODO: remove
 
-const OFAC_DATA_DIR = options.ofacDataDir
-
-// TODO: get these from the document itself
-const INDIVIDUAL = '4'
-const NAME = '1403'
-const BIRTH_DATE = '8'
-
-const LAST_NAME = '1520'
-const FIRST_NAME = '1521'
-const MIDDLE_NAME = '1522'
-const MAIDEN_NAME = '1523'
-const PATRONYMIC = '91708'
-const MATRONYMIC = '91709'
-const NICKNAME = '1528'
-
-const partNames = new Map([
-  [LAST_NAME, 'lastName'],
-  [FIRST_NAME, 'firstName'],
-  [MIDDLE_NAME, 'middleName'],
-  [MAIDEN_NAME, 'maidenName'],
-  [PATRONYMIC, 'patronymic'],
-  [MATRONYMIC, 'matronymic'],
-  [NICKNAME, 'nickname']
-])
-
-// TODO: get this from admin configuration
-const SIMILARITY_THRESHOLD = 0.5
-
-// TODO: remove
-const debug_log = (...args) => console.log(require('util').inspect(args, {depth: null, colors: true}))
-
-let individuals = []
-const individualsById = new Map()
-
-// group-id to type-id
-
-function processMasterNamePartGroup (groupNode) {
-  const namePartGroupNode = groupNode.NamePartGroup
-  const groupId = namePartGroupNode.$.ID
-  const typeId = namePartGroupNode.$.NamePartTypeID
-  return [groupId, typeId]
-}
-
-// name parts
-
-function makeFullNameFromParts (nameParts) {
-  // Combine name-parts in a standared order.
-  const namePartPairs = _.toPairs(nameParts)
-  const sortedPairs = _.sortBy(_.nth(0), namePartPairs)
-  return _.map(_.nth(1), sortedPairs).join(' ')
-}
-
-function makePhonetic (name) {
-  return metaphone(name)
-}
-
-function processDocumentedNamePart (groupTypes) {
-  return function (namePartNode) {
-    const valueNode = namePartNode.NamePartValue
-    const groupId = valueNode.$.NamePartGroupID
-    const typeId = groupTypes.get(groupId)
-    const partName = partNames.get(typeId)
-    const value = valueNode.$text
-    return {[partName]: value}
-  }
-}
-
-function processAlias (groupTypes) {
-  return function (aliasNode) {
-    if (aliasNode.$.AliasTypeID !== NAME) return
-
-    const nameParts = _.map(processDocumentedNamePart(groupTypes), aliasNode.DocumentedName.DocumentedNamePart)
-    const parts = _.assignAll(nameParts)
-    const fullName = makeFullNameFromParts(parts)
-
-    const phoneticParts = _.mapValues(makePhonetic, parts)
-    const phoneticFullName = makePhonetic(fullName)
-
-    return {parts, fullName, phoneticParts, phoneticFullName}
-  }
-}
-
-// birth date
-
-function processDate (dateNode) {
-  const year = parseInt(dateNode.Year)
-  const month = parseInt(dateNode.Month)
-  const day = parseInt(dateNode.Day)
-  const date = new Date(year, month - 1, day)
-
-  return {year, month, day, date}
-}
-
-function processFeature (featureNode) {
-  if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return
-
-  const datePeriodNode = featureNode.FeatureVersion.DatePeriod
-  // Ignore the fact that both Start and end can be a range.
-  // By using Start.From and End.To we use the extremes of the date-period.
-  const period = {
-    start: datePeriodNode.Start.From,
-    end: datePeriodNode.End.To
-  }
-
-  return _.mapValues(processDate, period)
-}
-
-// profile
-
-function processProfile (profileNode) {
-  if (profileNode.$.PartySubTypeID !== INDIVIDUAL) return
-
-  const id = profileNode.$.ID
-
-  const identityNode = profileNode.Identity
-  const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
-  const groupTypes = new Map(groupTypesEntries)
-
-  const aliases = _.compact(_.map(processAlias(groupTypes), identityNode.Alias))
-  const birthDatePeriods = _.compact(_.map(processFeature, profileNode.Feature))
-  const individual = {aliases, birthDatePeriods}
-
-  individualsById.set(id, individual)
-  debug_log(individual)
-}
-
-function promiseParseDocument (source) {
-  return new Promise(resolve => {
-    const fileName = path.join(OFAC_DATA_DIR, source)
-    const stream = fs.createReadStream(fileName)
-    const xml = new XmlStream(stream)
-
-    xml.on('error', error => {
-      logger.error('Error while parsing the OFAC data sources.')
-      logger.error(error)
-      xml.pause()
-      resolve()
-    })
-
-    xml.collect('Alias')
-    xml.collect('DocumentedNamePart')
-    xml.collect('Feature')
-    xml.collect('MasterNamePartGroup')
-
-    xml.on('updateElement: Profile', processProfile)
-
-    xml.on('end', resolve)
-  })
-}
+// Sanctioned individuals currently loaded; load() refills this array in place.
+const individuals = []
 
+// Reloads the list from the OFAC data files. Returns a Promise that resolves
+// once individuals has been refilled and rejects when parsing fails.
 function load () {
-  // NOTE: Not sure how you push code updates to existing clients. This problem
-  // might pop up if new code is pushed, without re-doing setup.
-  if (!OFAC_DATA_DIR) {
-    logger.error('The ofacDataDir option has not been set in lamassu.json')
-    return
-  }
-
-  individualsById.clear()
-
-  const sources = fs.readdirSync(OFAC_DATA_DIR)
-  const promises = _.map(promiseParseDocument, sources)
-
-  return Promise.all(promises)
-    .then(() => {
-      individuals = Array.from(individualsById.values())
-    })
+  // parseList returns a Promise, so wait for it before touching the list.
+  return dataParser.parseList()
+    .then(newList => {
+      // Mutate in place: individuals is a const binding shared with match().
+      individuals.length = 0
+      newList.forEach(individual => individuals.push(individual))
+    })
 }
 
+// Initial load; log failures instead of leaving the rejection unhandled.
+load().catch(err => console.error(err))
+
 // MATCHING
 
 // birth date
 
-function isDateWithinTwoYearsOfPeriod (targetDate) {
-  return function (period) {
-    const startDate = new Date(period.from.date)
-    const startYear = startDate.getFullYear()
-    startDate.setFullYear(startYear - 2)
+// Tells whether date lies strictly between the start of period moved back by
+// years and the end of period moved forward by years.
+function isDateWithinSomeYearsOfPeriod (period, date, years) {
+  // Periods come from the data parser as {start, end}.
+  const startDate = new Date(period.start.date)
+  const startYear = startDate.getFullYear()
+  startDate.setFullYear(startYear - years)
 
-    const endDate = new Date(period.to.date)
-    const endYear = endDate.getFullYear()
-    endDate.setFullYear(endYear + 2)
+  const endDate = new Date(period.end.date)
+  const endYear = endDate.getFullYear()
+  endDate.setFullYear(endYear + years)
 
-    return (startDate < targetDate && targetDate < endDate)
-  }
+  return (startDate < date && date < endDate)
 }
 
 function isBornWithinTwoYears (individual, dateObject) {
-  return _.some(isDateWithinTwoYearsOfPeriod(dateObject.date), individual.birthDatePeriods)
+  const isWithinTwoYears = _.partialRight(isDateWithinSomeYearsOfPeriod, [dateObject.date, 2])
+  return _.some(isWithinTwoYears, individual.birthDatePeriods)
 }
 
 // exact match
@@ -221,36 +68,34 @@ function calcPhoneticMatchScore (candidatePhoneticFullName) {
 // NOTE: I'm still not 100% on what matching algorithm is the best choice.
 // I just experiment with a few metrics for now.
 
-function doesMatch (nameParts, fullName, phoneticParts, phoneticFullName, birthDate) {
-  return function (individual) {
-    // Calculate if his birth date is within two years of the given date.
-    // If an individual has multiple birth-date periods, return wether any are
-    // within two years. Reject individuals who don't match this criterion.
-    if (individual.birthDatePeriods.length && !isBornWithinTwoYears(individual, birthDate)) return false
+// Scores how closely an individual matches the candidate: 0 when the birth
+// dates rule the individual out, otherwise the best of the two name scores.
+const similarity = _.curry((candidate, individual) => {
+  // Calculate if his birth date is within two years of the given date.
+  // If an individual has multiple birth-date periods, return whether any are
+  // within two years. Reject individuals who don't match this criterion.
+  if (individual.birthDatePeriods.length && !isBornWithinTwoYears(individual, candidate.birthDate)) return 0
 
-    // Calculate the Jaro-Winkler similarity of the full name.
-    // If an individual has multiple aliases, use the maximum score.
-    const exactMatchScore = _.max(_.map(calcExactMatchScore(fullName), individual.aliases))
+  // Calculate the Jaro-Winkler similarity of the full name.
+  // If an individual has multiple aliases, use the maximum score.
+  const exactMatchScore = _.max(_.map(calcExactMatchScore(candidate.fullName), individual.aliases))
 
-    if (exactMatchScore > SIMILARITY_THRESHOLD) return true
+  // Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
+  // This should approximate the phonetic similarity of the two names.
+  // If an individual has multiple aliases, use the maximum score.
+  const phoneticMatchScore = _.max(_.map(calcPhoneticMatchScore(candidate.phoneticFullName), individual.aliases))
 
-    // Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
-    // This should approximate the phonetic similarity of the two names.
-    // If an individual has multiple aliases, use the maximum score.
-    const phoneticMatchScore = _.max(_.map(calcPhoneticMatchScore(phoneticFullName), individual.aliases))
+  return _.max([exactMatchScore, phoneticMatchScore])
+})
 
-    if (phoneticMatchScore > SIMILARITY_THRESHOLD) return true
-
-    return false
-  }
-}
-
-function match (nameParts, birthDateString) {
+// Returns the highest similarity score between the described person and the
+// loaded individuals, or 0 when no individuals are loaded.
+function match (parts, birthDateString) {
   // nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
-  const fullName = makeFullNameFromParts(nameParts)
+  const fullName = nameUtils.fullNameFromParts(parts)
 
-  const phoneticParts = _.mapValues(makePhonetic, nameParts)
-  const phoneticFullName = makePhonetic(fullName)
+  const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
+  const phoneticFullName = nameUtils.phonetic(fullName)
 
   // birthDateString is in YYYYMMDD format
   const year = parseInt(birthDateString.slice(0, 4))
@@ -260,7 +105,12 @@ function match (nameParts, birthDateString) {
 
   const birthDate = {year, month, day, date}
 
-  return _.some(doesMatch(nameParts, fullName, phoneticParts, phoneticFullName, birthDate), individuals)
+  const candidate = {parts, fullName, phoneticParts, phoneticFullName, birthDate}
+
+  // _.max only accepts a collection, so map the scores first; an empty list
+  // yields undefined, which is reported as 0 (no similarity).
+  const similarToCandidate = similarity(candidate)
+  return _.max(_.map(similarToCandidate, individuals)) || 0
 }
 
 module.exports = {load, match}
diff --git a/lib/ofac/name-utils.js b/lib/ofac/name-utils.js
new file mode 100644
index 00000000..dfd86b5f
--- /dev/null
+++ b/lib/ofac/name-utils.js
@@ -0,0 +1,25 @@
+const metaphone = require('talisman/phonetics/metaphone')
+const doubleMetaphone = require('talisman/phonetics/double-metaphone')
+const _ = require('lodash/fp')
+
+// KOSTIS TODO: Decide on a method. Remove the others
+
+const phoneticMethod1 = metaphone
+
+const phoneticMethod2 = _.flow(doubleMetaphone, _.uniq)
+
+const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2))
+
+// Combine name-parts in a standard order (sorted by part name).
+
+const fullNameFromParts = _.flow(
+  _.toPairs,
+  _.sortBy(_.nth(0)), // sort by part name,
+  _.map(_.nth(1)), // get part value
+  _.join(' ')
+)
+
+module.exports = {
+  fullNameFromParts,
+  phonetic: phoneticMethod3
+}