Incorporated some corrections from comments.

This commit is contained in:
Konstantin Mamalakis 2018-02-22 14:01:16 +02:00 committed by Josh Harvey
parent a3f8db79b3
commit 402f75f50c
3 changed files with 39 additions and 31 deletions

View file

@ -4,6 +4,7 @@ const util = require('util')
const XmlStream = require('xml-stream')
const nameUtils = require('./name-utils')
const options = require('../options')
const logger = require('../logger')
const _ = require('lodash/fp')
const debug_log = require('./debug') // KOSTIS TODO: remove
@ -59,6 +60,13 @@ const processAlias = _.curry((groupTypes, aliasNode) => {
const getNamePart = processDocumentedNamePart(groupTypes)
const latinNameNode = _.find(isLatin, aliasNode.DocumentedName)
if (!latinNameNode) {
const id = aliasNode.$.FixedRef
const message = `Alias for Person with ID="${id}" has no latinized name`
logger.error(message)
return
}
const namePartNodes = latinNameNode.DocumentedNamePart
const nameParts = _.map(getNamePart, namePartNodes)
@ -86,6 +94,7 @@ function processFeature (featureNode) {
if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return
const datePeriodNode = featureNode.FeatureVersion.DatePeriod
// Ignore the fact that both Start and End can each be a range.
// By using Start.From and End.To we use the extremes of the date-period.
const period = {
@ -107,9 +116,14 @@ function processProfile (profileNode) {
const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
const groupTypes = new Map(groupTypesEntries)
const mapCompact = _.flow(_.compact, _.map)
const getNameParts = processAlias(groupTypes)
const aliases = _.compact(_.map(getNameParts, identityNode.Alias))
const birthDatePeriods = _.compact(_.map(processFeature, profileNode.Feature))
const aliases = mapCompact(getNameParts, identityNode.Alias)
if (_.isEmpty(aliases)) return
const birthDatePeriods = mapCompact(processFeature, profileNode.Feature)
const individual = {aliases, birthDatePeriods}
debug_log(individual)
@ -135,7 +149,7 @@ function promiseParseDocument (source) {
xml.collect('MasterNamePartGroup')
const individuals = []
const collectResult = result => individuals.push(result)
const collectResult = result => result && individuals.push(result)
xml.on('updateElement: Profile', _.flow(processProfile, collectResult))
xml.on('end', _.wrap(resolve, individuals))
@ -147,8 +161,8 @@ const readdir = util.promisify(fs.readdir)
const combineAndDedupe = _.flow(
_.flatten,
_.compact,
_.uniqBy(_.nth(0)),
_.map(_.nth(1))
_.uniqBy(_.first),
_.map(_.last)
)
function parseList () {
@ -160,11 +174,7 @@ function parseList () {
}
return readdir(OFAC_DATA_DIR)
.then(sources => {
const promises = _.map(promiseParseDocument, sources)
return Promise.all(promises)
})
.then(sources => Promise.all(_.map(promiseParseDocument, sources)))
.then(combineAndDedupe)
}

View file

@ -5,35 +5,32 @@ const _ = require('lodash/fp')
const debug_log = require('./debug') // KOSTIS TODO: remove
const individuals = []
let individuals = []
function load () {
const newList = Array.from(dataParser.parseList())
const oldLength = individuals.length
individuals.splice(0, oldLength, newList)
individuals = Array.from(dataParser.parseList())
}
load()
// MATCHING
// birth date
function isDateWithinSomeYearsOfPeriod (period, date, years) {
const startDate = new Date(period.from.date)
const startYear = startDate.getFullYear()
startDate.setFullYear(startYear - years)
function isDateWithinSomeDaysOfPeriod (period, date, days) {
const inMillisecs = 24 * 60 * 60 * 1000
const endDate = new Date(period.to.date)
const endYear = endDate.getFullYear()
endDate.setFullYear(endYear + years)
const startTime = period.from.date.getTime() - days * inMillisecs
const startDate = new Date(startTime)
const endTime = period.to.date.getTime() + days * inMillisecs
const endDate = new Date(endTime)
return (startDate < date && date < endDate)
}
function isBornWithinTwoYears (individual, dateObject) {
const isWithinTwoYears = _.partialRight(isDateWithinSomeYearsOfPeriod, [dateObject.date, 2])
return _.some(isWithinTwoYears, individual.birthDatePeriods)
function isBornTooLongSince (individual, dateObject, days) {
if (_.isEmpty(individual.birthDatePeriods)) return false
const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
}
// exact match
@ -59,9 +56,10 @@ function calcPhoneticMatchScore (candidatePhoneticFullName) {
const similarity = _.curry((candidate, individual) => {
// Check whether the individual's birth date is within two years of the given date.
// If an individual has multiple birth-date periods, return wether any are
// If an individual has multiple birth-date periods, return whether any are
// within two years. Reject individuals who don't match this criterion.
if (individual.birthDatePeriods.length && !isBornWithinTwoYears(individual, candidate.birthDate)) return 0
const twoYears = 365 * 2
if (!isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0
// Calculate the Jaro-Winkler similarity of the full name.
// If an individual has multiple aliases, use the maximum score.

View file

@ -10,12 +10,12 @@ const phoneticMethod2 = _.flow(doubleMetaphone, _.uniq)
const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2))
// Combine name-parts in a standared order.
// Combine name-parts in a standard order.
const fullNameFromParts = _.flow(
_.toPairs,
_.sortBy(_.nth(0)), // sort by part name,
_.map(_.nth(1)), // get part value
_.sortBy(_.first), // sort by part name,
_.map(_.last), // get part value
_.join(' ')
)