Incorporated some corrections from comments.

This commit is contained in:
Konstantin Mamalakis 2018-02-22 14:01:16 +02:00 committed by Josh Harvey
parent a3f8db79b3
commit 402f75f50c
3 changed files with 39 additions and 31 deletions

View file

@ -4,6 +4,7 @@ const util = require('util')
const XmlStream = require('xml-stream') const XmlStream = require('xml-stream')
const nameUtils = require('./name-utils') const nameUtils = require('./name-utils')
const options = require('../options') const options = require('../options')
const logger = require('../logger')
const _ = require('lodash/fp') const _ = require('lodash/fp')
const debug_log = require('./debug') // KOSTIS TODO: remove const debug_log = require('./debug') // KOSTIS TODO: remove
@ -59,6 +60,13 @@ const processAlias = _.curry((groupTypes, aliasNode) => {
const getNamePart = processDocumentedNamePart(groupTypes) const getNamePart = processDocumentedNamePart(groupTypes)
const latinNameNode = _.find(isLatin, aliasNode.DocumentedName) const latinNameNode = _.find(isLatin, aliasNode.DocumentedName)
if (!latinNameNode) {
const id = aliasNode.$.FixedRef
const message = `Alias for Person with ID="${id}" has no latinized name`
logger.error(message)
return
}
const namePartNodes = latinNameNode.DocumentedNamePart const namePartNodes = latinNameNode.DocumentedNamePart
const nameParts = _.map(getNamePart, namePartNodes) const nameParts = _.map(getNamePart, namePartNodes)
@ -86,6 +94,7 @@ function processFeature (featureNode) {
if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return
const datePeriodNode = featureNode.FeatureVersion.DatePeriod const datePeriodNode = featureNode.FeatureVersion.DatePeriod
// Ignore the fact that both Start and end can be a range. // Ignore the fact that both Start and end can be a range.
// By using Start.From and End.To we use the extremes of the date-period. // By using Start.From and End.To we use the extremes of the date-period.
const period = { const period = {
@ -107,9 +116,14 @@ function processProfile (profileNode) {
const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup) const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
const groupTypes = new Map(groupTypesEntries) const groupTypes = new Map(groupTypesEntries)
const mapCompact = _.flow(_.compact, _.map)
const getNameParts = processAlias(groupTypes) const getNameParts = processAlias(groupTypes)
const aliases = _.compact(_.map(getNameParts, identityNode.Alias)) const aliases = mapCompact(getNameParts, identityNode.Alias)
const birthDatePeriods = _.compact(_.map(processFeature, profileNode.Feature))
if (_.isEmpty(aliases)) return
const birthDatePeriods = mapCompact(processFeature, profileNode.Feature)
const individual = {aliases, birthDatePeriods} const individual = {aliases, birthDatePeriods}
debug_log(individual) debug_log(individual)
@ -135,7 +149,7 @@ function promiseParseDocument (source) {
xml.collect('MasterNamePartGroup') xml.collect('MasterNamePartGroup')
const individuals = [] const individuals = []
const collectResult = result => individuals.push(result) const collectResult = result => result && individuals.push(result)
xml.on('updateElement: Profile', _.flow(processProfile, collectResult)) xml.on('updateElement: Profile', _.flow(processProfile, collectResult))
xml.on('end', _.wrap(resolve, individuals)) xml.on('end', _.wrap(resolve, individuals))
@ -147,8 +161,8 @@ const readdir = util.promisify(fs.readdir)
const combineAndDedupe = _.flow( const combineAndDedupe = _.flow(
_.flatten, _.flatten,
_.compact, _.compact,
_.uniqBy(_.nth(0)), _.uniqBy(_.first),
_.map(_.nth(1)) _.map(_.last)
) )
function parseList () { function parseList () {
@ -160,11 +174,7 @@ function parseList () {
} }
return readdir(OFAC_DATA_DIR) return readdir(OFAC_DATA_DIR)
.then(sources => { .then(sources => Promise.all(_.map(promiseParseDocument, sources)))
const promises = _.map(promiseParseDocument, sources)
return Promise.all(promises)
})
.then(combineAndDedupe) .then(combineAndDedupe)
} }

View file

@ -5,35 +5,32 @@ const _ = require('lodash/fp')
const debug_log = require('./debug') // KOSTIS TODO: remove const debug_log = require('./debug') // KOSTIS TODO: remove
const individuals = [] let individuals = []
function load () { function load () {
const newList = Array.from(dataParser.parseList()) individuals = Array.from(dataParser.parseList())
const oldLength = individuals.length
individuals.splice(0, oldLength, newList)
} }
load()
// MATCHING // MATCHING
// birth date // birth date
function isDateWithinSomeYearsOfPeriod (period, date, years) { function isDateWithinSomeDaysOfPeriod (period, date, days) {
const startDate = new Date(period.from.date) const inMillisecs = 24 * 60 * 60 * 1000
const startYear = startDate.getFullYear()
startDate.setFullYear(startYear - years)
const endDate = new Date(period.to.date) const startTime = period.from.date.getTime() - days * inMillisecs
const endYear = endDate.getFullYear() const startDate = new Date(startTime)
endDate.setFullYear(endYear + years)
const endTime = period.to.date.getTime() + days * inMillisecs
const endDate = new Date(endTime)
return (startDate < date && date < endDate) return (startDate < date && date < endDate)
} }
function isBornWithinTwoYears (individual, dateObject) { function isBornTooLongSince (individual, dateObject, days) {
const isWithinTwoYears = _.partialRight(isDateWithinSomeYearsOfPeriod, [dateObject.date, 2]) if (_.isEmpty(individual.birthDatePeriods)) return false
return _.some(isWithinTwoYears, individual.birthDatePeriods) const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
} }
// exact match // exact match
@ -59,9 +56,10 @@ function calcPhoneticMatchScore (candidatePhoneticFullName) {
const similarity = _.curry((candidate, individual) => { const similarity = _.curry((candidate, individual) => {
// Calculate if his birth date is within two years of the given date. // Calculate if his birth date is within two years of the given date.
// If an individual has multiple birth-date periods, return wether any are // If an individual has multiple birth-date periods, return whether any are
// within two years. Reject individuals who don't match this criterion. // within two years. Reject individuals who don't match this criterion.
if (individual.birthDatePeriods.length && !isBornWithinTwoYears(individual, candidate.birthDate)) return 0 const twoYears = 365 * 2
if (!isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0
// Calculate the Jaro-Winkler similarity of the full name. // Calculate the Jaro-Winkler similarity of the full name.
// If an individual has multiple aliases, use the maximum score. // If an individual has multiple aliases, use the maximum score.

View file

@ -10,12 +10,12 @@ const phoneticMethod2 = _.flow(doubleMetaphone, _.uniq)
const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2)) const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2))
// Combine name-parts in a standared order. // Combine name-parts in a standard order.
const fullNameFromParts = _.flow( const fullNameFromParts = _.flow(
_.toPairs, _.toPairs,
_.sortBy(_.nth(0)), // sort by part name, _.sortBy(_.first), // sort by part name,
_.map(_.nth(1)), // get part value _.map(_.last), // get part value
_.join(' ') _.join(' ')
) )