Created new data structures for the upcoming matching

This commit is contained in:
Konstantin Mamalakis 2018-02-28 19:17:45 +02:00 committed by Josh Harvey
parent 620863d703
commit 2f8c798304
3 changed files with 86 additions and 18 deletions

View file

@ -50,8 +50,7 @@ const processDocumentedNamePart = _.curry((groupTypes, namePartNode) => {
const typeId = groupTypes.get(groupId) const typeId = groupTypes.get(groupId)
const partName = partNames.get(typeId) const partName = partNames.get(typeId)
const value = _.lowerCase(valueNode.$text) const value = _.lowerCase(valueNode.$text)
const words = nameUtils.makeWords(value) return {partName, value}
return {partName, value, words}
}) })
const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN) const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN)
@ -68,12 +67,14 @@ const processAlias = _.curry((groupTypes, aliasNode) => {
return return
} }
const id = latinNameNode.$.ID
const namePartNodes = latinNameNode.DocumentedNamePart const namePartNodes = latinNameNode.DocumentedNamePart
const parts = _.map(getNamePart, namePartNodes) const parts = _.map(getNamePart, namePartNodes)
const fullName = nameUtils.makeFullName(parts) const fullName = nameUtils.makeFullName(parts)
const words = nameUtils.makeWords(fullName)
return {parts, fullName} return {id, parts, fullName, words}
}) })
// birth date // birth date
@ -121,10 +122,11 @@ function processProfile (profileNode) {
if (_.isEmpty(aliases)) return if (_.isEmpty(aliases)) return
const birthDatePeriods = mapCompact(processFeature, profileNode.Feature) const birthDatePeriods = mapCompact(processFeature, profileNode.Feature)
const individual = {aliases, birthDatePeriods} const individual = {id, aliases, birthDatePeriods}
debug_log(individual) debug_log(individual)
return [id, individual]
return individual
} }
function promiseParseDocument (source) { function promiseParseDocument (source) {
@ -146,6 +148,7 @@ function promiseParseDocument (source) {
xml.collect('MasterNamePartGroup') xml.collect('MasterNamePartGroup')
const individuals = [] const individuals = []
const collectResult = result => result && individuals.push(result) const collectResult = result => result && individuals.push(result)
xml.on('updateElement: Profile', _.flow(processProfile, collectResult)) xml.on('updateElement: Profile', _.flow(processProfile, collectResult))
@ -155,14 +158,79 @@ function promiseParseDocument (source) {
const readdir = util.promisify(fs.readdir) const readdir = util.promisify(fs.readdir)
// const {id, individual, words} = result
//
// const individualEntry = [id, individual]
// individuals.push(individualEntry)
//
// const phoneticWithWord = pair => {
// const [word, phonetics] = pair
// const makeEntry = phonetic => ({word, phonetic, individualId: id})
// return _.map(makeEntry, phonetics)
// }
//
// const phoneticEntries = _.flatten(_.map(phoneticWithWord, words))
// allPhonetics.push(...phoneticEntries)
// Curried helper: calls `iteratee(id, alias)` for every alias of every
// individual, where `id` is the `id` of the individual owning that alias.
// The per-individual result arrays are flattened one level, so the return
// value is a single array holding one iteratee result per alias.
const mapAliases = _.curry((iteratee, individuals) => {
  const resultsForIndividual = ({id, aliases}) =>
    _.map(alias => iteratee(id, alias), aliases)
  const resultsPerIndividual = _.map(resultsForIndividual, individuals)
  return _.flatten(resultsPerIndividual)
})
// Builds one `{value, phonetic, aliasId}` entry for every phonetic of every
// word in `alias.words`, returned as a single flat array.
// `individualId` is not used here; the parameter exists so the function has
// the `(id, alias)` shape that `mapAliases` passes to its iteratee.
const getPhoneticEntries = (individualId, alias) => {
  const aliasId = alias.id
  const entriesForWord = ({value, phonetics}) =>
    _.map(phonetic => ({value, phonetic, aliasId}), phonetics)
  const entriesPerWord = _.map(entriesForWord, alias.words)
  return _.flatten(entriesPerWord)
}
// Builds a Map keyed by phonetic. Each value is the list of
// `{value, aliasId}` records (taken from the entries produced by
// `getPhoneticEntries`) that share that phonetic.
const producePhoneticMap = individuals => {
  // mapAliases yields one array of entries per alias; flatten to plain entries.
  const entries = _.flatten(mapAliases(getPhoneticEntries, individuals))
  const entriesByPhonetic = _.groupBy(_.get('phonetic'), entries)
  // The phonetic is now the grouping key, so drop it from each record.
  const keepWordAndAlias = _.map(_.pick(['value', 'aliasId']))
  const recordsByPhonetic = _.mapValues(keepWordAndAlias, entriesByPhonetic)
  return new Map(_.toPairs(recordsByPhonetic))
}
// Maps every word in `alias.words` to a `{value, aliasId}` record, where
// `aliasId` is `alias.id`.
// `individualId` is not used here; the parameter exists so the function has
// the `(id, alias)` shape that `mapAliases` passes to its iteratee.
const getWords = (individualId, alias) => {
  const aliasId = alias.id
  const toRecord = ({value}) => ({value, aliasId})
  return _.map(toRecord, alias.words)
}
// Builds an array of `{value, aliasIds}` objects: one object per distinct
// word value, with `aliasIds` listing the alias id of every record (as
// produced by `getWords`) that carries that value.
const produceWordList = individuals => {
  // mapAliases yields one array of records per alias; flatten to plain records.
  const wordRecords = _.flatten(mapAliases(getWords, individuals))
  const recordsByValue = _.groupBy(_.get('value'), wordRecords)
  const aliasIdsByValue = _.mapValues(_.map(_.get('aliasId')), recordsByValue)
  // Each pair is [value, aliasIds]; zip it with the matching property names.
  const pairToObject = _.zipObject(['value', 'aliasIds'])
  return _.map(pairToObject, _.toPairs(aliasIdsByValue))
}
const combineAndDedupe = _.flow( const combineAndDedupe = _.flow(
_.flatten, _.flatten,
_.compact, _.compact,
_.uniqBy(_.first), _.uniqBy(_.get('id')),
_.map(_.last) individuals => {
const getIdPairs = (individualId, alias) => [alias.id, individualId]
const idPairs = mapAliases(getIdPairs, individuals)
const aliasToIndividual = new Map(idPairs)
const phoneticMap = producePhoneticMap(individuals)
const wordList = produceWordList(individuals)
return {individuals, aliasToIndividual, phoneticMap, wordList}
}
) )
function parseList () { function produceStructs () {
// NOTE: Not sure how you push code updates to existing clients. This problem // NOTE: Not sure how you push code updates to existing clients. This problem
// might pop up if new code is pushed, without re-doing setup. // might pop up if new code is pushed, without re-doing setup.
if (!OFAC_DATA_DIR) { if (!OFAC_DATA_DIR) {
@ -175,4 +243,4 @@ function parseList () {
.then(combineAndDedupe) .then(combineAndDedupe)
} }
module.exports = {parseList} module.exports = {produceStructs}

View file

@ -5,12 +5,12 @@ const _ = require('lodash/fp')
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
let individuals = null let structs = null
function load () { function load () {
return dataParser.parseList() return dataParser.produceStructs()
.then(list => { .then(result => {
individuals = Array.from(list) structs = result
}) })
} }
@ -111,7 +111,7 @@ function makeCompatible (nameParts) {
} }
function match (nameParts, birthDateString) { function match (nameParts, birthDateString) {
if (!individuals) { if (!structs) {
const message = 'The OFAC data sources have not been loaded yet.' const message = 'The OFAC data sources have not been loaded yet.'
return Promise.reject(new Error(message)) return Promise.reject(new Error(message))
} }
@ -132,7 +132,7 @@ function match (nameParts, birthDateString) {
debug_log(candidate) debug_log(candidate)
const similarToCandidate = similarity(candidate) const similarToCandidate = similarity(candidate)
const result = mapMax(similarToCandidate, individuals) const result = mapMax(similarToCandidate, structs.individuals)
console.log(result) console.log(result)
return result return result
} }

View file

@ -22,9 +22,9 @@ const makeFullName = _.flow(
const makeWords = value => { const makeWords = value => {
const words = _.split(' ', value) const words = _.split(' ', value)
const phonetic = _.map(makePhonetic, words) const phonetics = _.map(makePhonetic, words)
const props = _.zipAll([words, phonetic]) const pairs = _.zipAll([words, phonetics])
return _.map(_.zipObject(['value', 'phonetic']), props) return _.map(_.zipObject(['value', 'phonetics']), pairs)
} }
module.exports = { module.exports = {