From c4307cb749e580096883f6e53e18df47b988bfe0 Mon Sep 17 00:00:00 2001 From: Konstantin Mamalakis Date: Wed, 21 Feb 2018 06:06:08 +0200 Subject: [PATCH] Corrections and initial algorithm --- bin/cert-gen.sh | 6 +- install | 6 +- lib/ofac/index.js | 266 ++++++++++++++++++++++++++++++++++++++++++++++ package-lock.json | 86 +++++++++++++++ package.json | 4 +- 5 files changed, 365 insertions(+), 3 deletions(-) create mode 100644 lib/ofac/index.js diff --git a/bin/cert-gen.sh b/bin/cert-gen.sh index 4daf40c2..17235119 100755 --- a/bin/cert-gen.sh +++ b/bin/cert-gen.sh @@ -11,6 +11,7 @@ KEY_DIR=$PWD/certs LAMASSU_CA_PATH=$PWD/Lamassu_CA.pem MIGRATE_STATE_PATH=$CONFIG_DIR/.migrate POSTGRES_PASS=postgres123 +OFAC_DATA_DIR=$CONFIG_DIR/ofac mkdir -p $CERT_DIR mkdir -p $CONFIG_DIR >> $LOG_FILE 2>&1 @@ -64,6 +65,8 @@ openssl x509 \ rm /tmp/Lamassu_OP.csr.pem +mkdir -p $OFAC_DATA_DIR + cat < $CONFIG_DIR/lamassu.json { "postgresql": "psql://postgres:$POSTGRES_PASS@localhost/lamassu", @@ -75,7 +78,8 @@ cat < $CONFIG_DIR/lamassu.json "logLevel": "debug", "lamassuCaPath": "$LAMASSU_CA_PATH", "lamassuServerPath": "$PWD", - "migrateStatePath": "$MIGRATE_STATE_PATH" + "migrateStatePath": "$MIGRATE_STATE_PATH", + "ofacDataDir": "$OFAC_DATA_DIR" } EOF diff --git a/install b/install index 740377d4..933b2b2d 100644 --- a/install +++ b/install @@ -16,6 +16,7 @@ SEEDS_DIR=$HOME/seeds SEED_FILE=$SEEDS_DIR/seed.txt BACKUP_DIR=/var/backups/postgresql BLOCKCHAIN_DIR=/mnt/blockchains +OFAC_DATA_DIR=/var/lamassu/ofac # Look into http://unix.stackexchange.com/questions/140734/configure-localtime-dpkg-reconfigure-tzdata @@ -145,6 +146,8 @@ openssl x509 \ rm /tmp/Lamassu_OP.csr.pem +mkdir -p $OFAC_DATA_DIR + cat < $CONFIG_DIR/lamassu.json { "postgresql": "postgres://lamassu_pg:$POSTGRES_PW@localhost/lamassu", @@ -156,7 +159,8 @@ cat < $CONFIG_DIR/lamassu.json "logLevel": "info", "lamassuServerPath": "$NODE_MODULES/lamassu-server", "migrateStatePath": "$MIGRATE_STATE_PATH", - "blockchainDir": 
"$BLOCKCHAIN_DIR" + "blockchainDir": "$BLOCKCHAIN_DIR", + "ofacDataDir": "$OFAC_DATA_DIR" } EOF
diff --git a/lib/ofac/index.js b/lib/ofac/index.js
new file mode 100644
index 00000000..ed6c9f9b
--- /dev/null
+++ b/lib/ofac/index.js
@@ -0,0 +1,313 @@
+const fs = require('fs')
+const path = require('path')
+const XmlStream = require('xml-stream')
+const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
+const metaphone = require('talisman/phonetics/metaphone')
+const options = require('../options')
+const logger = require('../logger')
+const _ = require('lodash/fp')
+
+// PARSING
+
+const OFAC_DATA_DIR = options.ofacDataDir
+
+// TODO: get these from the document itself
+const INDIVIDUAL = '4'
+const NAME = '1403'
+const BIRTH_DATE = '8'
+
+const LAST_NAME = '1520'
+const FIRST_NAME = '1521'
+const MIDDLE_NAME = '1522'
+const MAIDEN_NAME = '1523'
+const PATRONYMIC = '91708'
+const MATRONYMIC = '91709'
+const NICKNAME = '1528'
+
+// Name-part type ID -> property name used in a parsed `parts` object.
+const partNames = new Map([
+  [LAST_NAME, 'lastName'],
+  [FIRST_NAME, 'firstName'],
+  [MIDDLE_NAME, 'middleName'],
+  [MAIDEN_NAME, 'maidenName'],
+  [PATRONYMIC, 'patronymic'],
+  [MATRONYMIC, 'matronymic'],
+  [NICKNAME, 'nickname']
+])
+
+// TODO: get this from admin configuration
+const SIMILARITY_THRESHOLD = 0.5
+
+// Individuals that match() searches; rebuilt from individualsById by load().
+let individuals = []
+// Parsed individuals keyed by profile ID; filled while the sources stream in.
+const individualsById = new Map()
+
+// group-id to type-id
+
+// Turns a MasterNamePartGroup node into a [groupId, typeId] entry, ready to be
+// passed to the Map constructor.
+function processMasterNamePartGroup (groupNode) {
+  const namePartGroupNode = groupNode.NamePartGroup
+  const groupId = namePartGroupNode.$.ID
+  const typeId = namePartGroupNode.$.NamePartTypeID
+  return [groupId, typeId]
+}
+
+// name parts
+
+// Joins the values of a name-parts object with spaces, ordered by part name,
+// so parsed aliases and candidate names are assembled the same way.
+function makeFullNameFromParts (nameParts) {
+  // Combine name-parts in a standard order.
+  const namePartPairs = _.toPairs(nameParts)
+  const sortedPairs = _.sortBy(_.nth(0), namePartPairs)
+  return _.map(_.nth(1), sortedPairs).join(' ')
+}
+
+// Returns the metaphone encoding of a name.
+function makePhonetic (name) {
+  return metaphone(name)
+}
+
+// Returns a parser for DocumentedNamePart nodes. `groupTypes` maps a name-part
+// group ID to its type ID; the parser yields {[partName]: value}.
+// NOTE: a type ID missing from partNames yields the key 'undefined'.
+function processDocumentedNamePart (groupTypes) {
+  return function (namePartNode) {
+    const valueNode = namePartNode.NamePartValue
+    const groupId = valueNode.$.NamePartGroupID
+    const typeId = groupTypes.get(groupId)
+    const partName = partNames.get(typeId)
+    const value = valueNode.$text
+    return {[partName]: value}
+  }
+}
+
+// Returns a parser for Alias nodes. Aliases whose type is not NAME are
+// skipped (undefined); otherwise the alias' name parts, full name and their
+// phonetic encodings are returned.
+function processAlias (groupTypes) {
+  return function (aliasNode) {
+    if (aliasNode.$.AliasTypeID !== NAME) return
+
+    const nameParts = _.map(processDocumentedNamePart(groupTypes), aliasNode.DocumentedName.DocumentedNamePart)
+    const parts = _.assignAll(nameParts)
+    const fullName = makeFullNameFromParts(parts)
+
+    const phoneticParts = _.mapValues(makePhonetic, parts)
+    const phoneticFullName = makePhonetic(fullName)
+
+    return {parts, fullName, phoneticParts, phoneticFullName}
+  }
+}
+
+// birth date
+
+// Converts a date node with Year, Month and Day children into
+// {year, month, day, date}, where `date` is a local-time Date.
+function processDate (dateNode) {
+  // Always parse as decimal; the Date constructor expects a zero-based month.
+  const year = parseInt(dateNode.Year, 10)
+  const month = parseInt(dateNode.Month, 10)
+  const day = parseInt(dateNode.Day, 10)
+  const date = new Date(year, month - 1, day)
+
+  return {year, month, day, date}
+}
+
+// Parses a Feature node. Features other than birth dates are skipped
+// (undefined); a birth date becomes {start, end}, each shaped by processDate.
+function processFeature (featureNode) {
+  if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return
+
+  const datePeriodNode = featureNode.FeatureVersion.DatePeriod
+  // Ignore the fact that both Start and End can be a range.
+  // By using Start.From and End.To we use the extremes of the date-period.
+  const period = {
+    start: datePeriodNode.Start.From,
+    end: datePeriodNode.End.To
+  }
+
+  return _.mapValues(processDate, period)
+}
+
+// profile
+
+// Handles one Profile element: profiles that are not individuals are ignored,
+// the rest are stored in individualsById as {aliases, birthDatePeriods}.
+function processProfile (profileNode) {
+  if (profileNode.$.PartySubTypeID !== INDIVIDUAL) return
+
+  const id = profileNode.$.ID
+
+  const identityNode = profileNode.Identity
+  const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
+  const groupTypes = new Map(groupTypesEntries)
+
+  const aliases = _.compact(_.map(processAlias(groupTypes), identityNode.Alias))
+  const birthDatePeriods = _.compact(_.map(processFeature, profileNode.Feature))
+  const individual = {aliases, birthDatePeriods}
+
+  individualsById.set(id, individual)
+}
+
+// Streams one source file from OFAC_DATA_DIR through the XML parser, handing
+// every Profile element to processProfile. The returned promise never
+// rejects: errors are logged and resolve it early.
+function promiseParseDocument (source) {
+  return new Promise(resolve => {
+    const fileName = path.join(OFAC_DATA_DIR, source)
+    const stream = fs.createReadStream(fileName)
+    const xml = new XmlStream(stream)
+
+    const logError = error => {
+      logger.error('Error while parsing the OFAC data sources.')
+      logger.error(error)
+    }
+
+    xml.on('error', error => {
+      logError(error)
+      xml.pause()
+      resolve()
+    })
+
+    // Without a listener, an 'error' event on the file stream (e.g. the entry
+    // is a directory or cannot be read) would be thrown and crash the process.
+    stream.on('error', error => {
+      logError(error)
+      resolve()
+    })
+
+    // Collect repeated child elements into arrays.
+    xml.collect('Alias')
+    xml.collect('DocumentedNamePart')
+    xml.collect('Feature')
+    xml.collect('MasterNamePartGroup')
+
+    xml.on('updateElement: Profile', processProfile)
+
+    xml.on('end', resolve)
+  })
+}
+
+// (Re)loads every source found in OFAC_DATA_DIR and rebuilds `individuals`.
+// Returns a promise, also when the data directory is not configured.
+function load () {
+  // NOTE: Not sure how you push code updates to existing clients. This problem
+  // might pop up if new code is pushed, without re-doing setup.
+  if (!OFAC_DATA_DIR) {
+    logger.error('The ofacDataDir option has not been set in lamassu.json')
+    return Promise.resolve()
+  }
+
+  individualsById.clear()
+
+  const sources = fs.readdirSync(OFAC_DATA_DIR)
+  const promises = _.map(promiseParseDocument, sources)
+
+  return Promise.all(promises)
+    .then(() => {
+      individuals = Array.from(individualsById.values())
+    })
+}
+
+// MATCHING
+
+// birth date
+
+// Returns a predicate telling whether targetDate lies strictly between two
+// years before the period's start and two years after its end.
+function isDateWithinTwoYearsOfPeriod (targetDate) {
+  return function (period) {
+    // Periods are built by processFeature as {start, end}. Copy the dates so
+    // the parsed data is not mutated by setFullYear.
+    const startDate = new Date(period.start.date)
+    const startYear = startDate.getFullYear()
+    startDate.setFullYear(startYear - 2)
+
+    const endDate = new Date(period.end.date)
+    const endYear = endDate.getFullYear()
+    endDate.setFullYear(endYear + 2)
+
+    return (startDate < targetDate && targetDate < endDate)
+  }
+}
+
+// True when dateObject.date is within two years of any of the individual's
+// birth-date periods.
+function isBornWithinTwoYears (individual, dateObject) {
+  return _.some(isDateWithinTwoYearsOfPeriod(dateObject.date), individual.birthDatePeriods)
+}
+
+// exact match
+
+// Returns a scorer giving the Jaro-Winkler score of an alias' full name
+// against candidateFullName.
+function calcExactMatchScore (candidateFullName) {
+  return function (alias) {
+    return jaroWinkler(alias.fullName, candidateFullName)
+  }
+}
+
+// phonetic match
+
+// Returns a scorer giving the Jaro-Winkler score of an alias' phonetic full
+// name against candidatePhoneticFullName.
+function calcPhoneticMatchScore (candidatePhoneticFullName) {
+  return function (alias) {
+    return jaroWinkler(alias.phoneticFullName, candidatePhoneticFullName)
+  }
+}
+
+// algorithm
+
+// NOTE: I'm still not 100% on what matching algorithm is the best choice.
+// I just experiment with a few metrics for now.
+
+// Returns a predicate deciding whether an individual matches the candidate.
+// nameParts and phoneticParts are accepted but not used yet.
+function doesMatch (nameParts, fullName, phoneticParts, phoneticFullName, birthDate) {
+  return function (individual) {
+    // Calculate if his birth date is within two years of the given date.
+    // If an individual has multiple birth-date periods, return whether any are
+    // within two years. Reject individuals who don't match this criterion.
+    if (individual.birthDatePeriods.length && !isBornWithinTwoYears(individual, birthDate)) return false
+
+    // Calculate the Jaro-Winkler similarity of the full name.
+    // If an individual has multiple aliases, use the maximum score.
+    const exactMatchScore = _.max(_.map(calcExactMatchScore(fullName), individual.aliases))
+
+    if (exactMatchScore > SIMILARITY_THRESHOLD) return true
+
+    // Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
+    // This should approximate the phonetic similarity of the two names.
+    // If an individual has multiple aliases, use the maximum score.
+    const phoneticMatchScore = _.max(_.map(calcPhoneticMatchScore(phoneticFullName), individual.aliases))
+
+    if (phoneticMatchScore > SIMILARITY_THRESHOLD) return true
+
+    return false
+  }
+}
+
+// Tells whether any loaded individual matches the given person.
+// nameParts: object like {firstName: "John", lastName: "Doe", ...}
+// birthDateString: birth date in YYYYMMDD format
+function match (nameParts, birthDateString) {
+  const fullName = makeFullNameFromParts(nameParts)
+
+  const phoneticParts = _.mapValues(makePhonetic, nameParts)
+  const phoneticFullName = makePhonetic(fullName)
+
+  const year = parseInt(birthDateString.slice(0, 4), 10)
+  const month = parseInt(birthDateString.slice(4, 6), 10)
+  const day = parseInt(birthDateString.slice(6, 8), 10)
+  const date = new Date(year, month - 1, day)
+
+  const birthDate = {year, month, day, date}
+
+  return _.some(doesMatch(nameParts, fullName, phoneticParts, phoneticFullName, birthDate), individuals)
+}
+
+module.exports = {load, match}
diff --git a/package-lock.json b/package-lock.json
index 6263d212..603b03ae 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -4616,6 +4616,11 @@ "core-util-is": "1.0.2" } }, + "html-entities": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-1.2.1.tgz", + "integrity": "sha1-DfKTUfByEWNRXfueVUPl9u7VFi8=" + }, "htmlparser2": { "version": "3.9.2", "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.9.2.tgz", @@ -4727,6 +4732,14 @@ } } }, + "iconv": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/iconv/-/iconv-2.3.0.tgz", + "integrity": 
"integrity": "sha512-eu9senpOZ7wzNweLX09jtrCdmEiie8Z5/iMxdIq3i7tkgg562EwKSU9yjXMz8ncaQ0B+845vbqAz+1kPFXzbtQ==", + "requires": { + "nan": "2.6.2" + } + }, "iconv-lite": { "version": "0.4.15", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.15.tgz", @@ -5715,6 +5728,14 @@ } } }, + "mnemonist": { + "version": "0.14.0", + "resolved": "https://registry.npmjs.org/mnemonist/-/mnemonist-0.14.0.tgz", + "integrity": "sha512-GosoNab9mShR9w6QJA+bFT9NC2fN+2smFuQ6dEvjt4myuMLFvy63qeFE0cJZ8DQQxmCrqKftTAA/8N686cRPiQ==", + "requires": { + "obliterator": "1.2.1" + } + }, "moment": { "version": "2.18.1", "resolved": "https://registry.npmjs.org/moment/-/moment-2.18.1.tgz", @@ -5792,6 +5813,15 @@ "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-2.0.3.tgz", "integrity": "sha1-DKZ+XmZ7jhNDVJyhcVOoFdC7/ao=" }, + "node-expat": { + "version": "2.3.16", + "resolved": "https://registry.npmjs.org/node-expat/-/node-expat-2.3.16.tgz", + "integrity": "sha512-e3HyQI0lk5CXyYQ4RsDYGiWdY5LJxNMlNCzo4/gwqY8lhYIeTf5VwGirGDa1EPrcZROmOR37wHuFVnoHmOWnOw==", + "requires": { + "bindings": "1.2.1", + "nan": "2.6.2" + } + }, "node-hkdf-sync": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/node-hkdf-sync/-/node-hkdf-sync-1.0.0.tgz", @@ -5932,6 +5962,11 @@ "is-extendable": "0.1.1" } }, + "obliterator": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/obliterator/-/obliterator-1.2.1.tgz", + "integrity": "sha512-KMA0nZW3Z0UdG9Qtt5Ti8aFg8WvWHE8dKEL2/U5/+PfqyzpVyeLVXOrwhFskyrxnYjn936JZVm76rshSOYHgxQ==" + }, "observable-to-promise": { "version": "0.5.0", "resolved": "https://registry.npmjs.org/observable-to-promise/-/observable-to-promise-0.5.0.tgz", @@ -6176,6 +6211,11 @@ "resolved": "https://registry.npmjs.org/packet-reader/-/packet-reader-0.3.1.tgz", "integrity": "sha1-zWLmCvjX/qinBexP+ZCHHEaHHyc=" }, + "pandemonium": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/pandemonium/-/pandemonium-1.4.1.tgz", + "integrity": 
"sha512-KhwY9xv8tZGQE8L7FfzaTHrLH+JnarUsDlsa8mqfisjtU3J00P362IL52Ei/EhDp025yBDzPuES/zMdWvvAR5g==" + }, "parse-glob": { "version": "3.0.4", "resolved": "https://registry.npmjs.org/parse-glob/-/parse-glob-3.0.4.tgz", @@ -7784,6 +7824,19 @@ "integrity": "sha1-lag9smGG1q9+ehjb2XYKL4bQj0A=", "dev": true }, + "talisman": { + "version": "0.20.0", + "resolved": "https://registry.npmjs.org/talisman/-/talisman-0.20.0.tgz", + "integrity": "sha512-cIO2x+MaWrY/d1YxzzTnNI3Jkgmqy0NSpiEOhJ2ExvHNySlVodcGRK7mj+xRNDIUy9qSKWq2B8XTSt6THMLLbQ==", + "requires": { + "html-entities": "1.2.1", + "lodash": "4.17.4", + "long": "3.2.0", + "mnemonist": "0.14.0", + "obliterator": "1.2.1", + "pandemonium": "1.4.1" + } + }, "tar": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/tar/-/tar-2.2.1.tgz", @@ -8417,6 +8470,39 @@ "resolved": "https://registry.npmjs.org/xhr2/-/xhr2-0.1.4.tgz", "integrity": "sha1-f4dliEdxbbUCYyOBL4GMras4el8=" }, + "xml-stream": { + "version": "0.4.5", + "resolved": "https://registry.npmjs.org/xml-stream/-/xml-stream-0.4.5.tgz", + "integrity": "sha1-dFLYWzf5uIGnDv8M90oN8CCI7es=", + "requires": { + "iconv": "2.3.0", + "node-expat": "2.3.16", + "readable-stream": "1.1.14" + }, + "dependencies": { + "isarray": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz", + "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8=" + }, + "readable-stream": { + "version": "1.1.14", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.1.14.tgz", + "integrity": "sha1-fPTFTvZI44EwhMY23SB54WbAgdk=", + "requires": { + "core-util-is": "1.0.2", + "inherits": "2.0.3", + "isarray": "0.0.1", + "string_decoder": "0.10.31" + } + }, + "string_decoder": { + "version": "0.10.31", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", + "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=" + } + } + }, "xmlbuilder": { "version": "8.2.2", "resolved": 
"https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-8.2.2.tgz", diff --git a/package.json b/package.json index cd89a5a3..654c9b8e 100644 --- a/package.json +++ b/package.json @@ -50,11 +50,13 @@ "serve-static": "^1.12.4", "socket.io": "^2.0.3", "socket.io-client": "^2.0.3", + "talisman": "^0.20.0", "twilio": "^3.6.1", "uuid": "^3.1.0", "web3": "^0.19.1", "winston": "^2.3.0", - "ws": "^3.1.0" + "ws": "^3.1.0", + "xml-stream": "^0.4.5" }, "repository": { "type": "git",