Incorporated some corrections from comments.
This commit is contained in:
parent
a3f8db79b3
commit
402f75f50c
3 changed files with 39 additions and 31 deletions
|
|
@ -4,6 +4,7 @@ const util = require('util')
|
|||
const XmlStream = require('xml-stream')
|
||||
const nameUtils = require('./name-utils')
|
||||
const options = require('../options')
|
||||
const logger = require('../logger')
|
||||
const _ = require('lodash/fp')
|
||||
|
||||
const debug_log = require('./debug') // KOSTIS TODO: remove
|
||||
|
|
@ -59,6 +60,13 @@ const processAlias = _.curry((groupTypes, aliasNode) => {
|
|||
|
||||
const getNamePart = processDocumentedNamePart(groupTypes)
|
||||
const latinNameNode = _.find(isLatin, aliasNode.DocumentedName)
|
||||
if (!latinNameNode) {
|
||||
const id = aliasNode.$.FixedRef
|
||||
const message = `Alias for Person with ID="${id}" has no latinized name`
|
||||
logger.error(message)
|
||||
return
|
||||
}
|
||||
|
||||
const namePartNodes = latinNameNode.DocumentedNamePart
|
||||
const nameParts = _.map(getNamePart, namePartNodes)
|
||||
|
||||
|
|
@ -86,6 +94,7 @@ function processFeature (featureNode) {
|
|||
if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return
|
||||
|
||||
const datePeriodNode = featureNode.FeatureVersion.DatePeriod
|
||||
|
||||
// Ignore the fact that both Start and End can themselves be ranges.
|
||||
// By using Start.From and End.To we use the extremes of the date-period.
|
||||
const period = {
|
||||
|
|
@ -107,9 +116,14 @@ function processProfile (profileNode) {
|
|||
const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
|
||||
const groupTypes = new Map(groupTypesEntries)
|
||||
|
||||
// mapCompact(iteratee, collection): map `iteratee` over `collection`, then
// drop the falsy results.
// `_.flow` passes every call argument to its first function, so `_.map` must
// come first. With `_.compact` first, the iteratee itself was compacted and a
// partially applied `_.map` came back instead of an array.
const mapCompact = _.flow(_.map, _.compact)
|
||||
|
||||
const getNameParts = processAlias(groupTypes)
|
||||
const aliases = _.compact(_.map(getNameParts, identityNode.Alias))
|
||||
const birthDatePeriods = _.compact(_.map(processFeature, profileNode.Feature))
|
||||
const aliases = mapCompact(getNameParts, identityNode.Alias)
|
||||
|
||||
if (_.isEmpty(aliases)) return
|
||||
|
||||
const birthDatePeriods = mapCompact(processFeature, profileNode.Feature)
|
||||
const individual = {aliases, birthDatePeriods}
|
||||
|
||||
debug_log(individual)
|
||||
|
|
@ -135,7 +149,7 @@ function promiseParseDocument (source) {
|
|||
xml.collect('MasterNamePartGroup')
|
||||
|
||||
const individuals = []
|
||||
const collectResult = result => individuals.push(result)
|
||||
// Append a processed profile to `individuals`; a falsy result is handed back
// untouched and never stored.
const collectResult = result => {
  if (!result) return result
  return individuals.push(result)
}
|
||||
xml.on('updateElement: Profile', _.flow(processProfile, collectResult))
|
||||
|
||||
xml.on('end', _.wrap(resolve, individuals))
|
||||
|
|
@ -147,8 +161,8 @@ const readdir = util.promisify(fs.readdir)
|
|||
const combineAndDedupe = _.flow(
|
||||
_.flatten,
|
||||
_.compact,
|
||||
_.uniqBy(_.nth(0)),
|
||||
_.map(_.nth(1))
|
||||
_.uniqBy(_.first),
|
||||
_.map(_.last)
|
||||
)
|
||||
|
||||
function parseList () {
|
||||
|
|
@ -160,11 +174,7 @@ function parseList () {
|
|||
}
|
||||
|
||||
return readdir(OFAC_DATA_DIR)
|
||||
.then(sources => {
|
||||
const promises = _.map(promiseParseDocument, sources)
|
||||
|
||||
return Promise.all(promises)
|
||||
})
|
||||
.then(sources => Promise.all(_.map(promiseParseDocument, sources)))
|
||||
.then(combineAndDedupe)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,35 +5,32 @@ const _ = require('lodash/fp')
|
|||
|
||||
const debug_log = require('./debug') // KOSTIS TODO: remove
|
||||
|
||||
const individuals = []
|
||||
let individuals = []
|
||||
|
||||
function load () {
|
||||
const newList = Array.from(dataParser.parseList())
|
||||
const oldLength = individuals.length
|
||||
individuals.splice(0, oldLength, newList)
|
||||
individuals = Array.from(dataParser.parseList())
|
||||
}
|
||||
|
||||
load()
|
||||
|
||||
// MATCHING
|
||||
|
||||
// birth date
|
||||
|
||||
function isDateWithinSomeYearsOfPeriod (period, date, years) {
|
||||
const startDate = new Date(period.from.date)
|
||||
const startYear = startDate.getFullYear()
|
||||
startDate.setFullYear(startYear - years)
|
||||
function isDateWithinSomeDaysOfPeriod (period, date, days) {
|
||||
const inMillisecs = 24 * 60 * 60 * 1000
|
||||
|
||||
const endDate = new Date(period.to.date)
|
||||
const endYear = endDate.getFullYear()
|
||||
endDate.setFullYear(endYear + years)
|
||||
const startTime = period.from.date.getTime() - days * inMillisecs
|
||||
const startDate = new Date(startTime)
|
||||
|
||||
const endTime = period.to.date.getTime() + days * inMillisecs
|
||||
const endDate = new Date(endTime)
|
||||
|
||||
return (startDate < date && date < endDate)
|
||||
}
|
||||
|
||||
function isBornWithinTwoYears (individual, dateObject) {
|
||||
const isWithinTwoYears = _.partialRight(isDateWithinSomeYearsOfPeriod, [dateObject.date, 2])
|
||||
return _.some(isWithinTwoYears, individual.birthDatePeriods)
|
||||
function isBornTooLongSince (individual, dateObject, days) {
|
||||
if (_.isEmpty(individual.birthDatePeriods)) return false
|
||||
const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
|
||||
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
|
||||
}
|
||||
|
||||
// exact match
|
||||
|
|
@ -59,9 +56,10 @@ function calcPhoneticMatchScore (candidatePhoneticFullName) {
|
|||
|
||||
const similarity = _.curry((candidate, individual) => {
|
||||
// Check whether the individual's birth date is within two years of the given date.
|
||||
// If an individual has multiple birth-date periods, return wether any are
|
||||
// If an individual has multiple birth-date periods, return whether any are
|
||||
// within two years. Reject individuals who don't match this criterion.
|
||||
if (individual.birthDatePeriods.length && !isBornWithinTwoYears(individual, candidate.birthDate)) return 0
|
||||
const twoYears = 365 * 2
|
||||
if (!isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0
|
||||
|
||||
// Calculate the Jaro-Winkler similarity of the full name.
|
||||
// If an individual has multiple aliases, use the maximum score.
|
||||
|
|
|
|||
|
|
@ -10,12 +10,12 @@ const phoneticMethod2 = _.flow(doubleMetaphone, _.uniq)
|
|||
|
||||
// Split a full name on single spaces and run phoneticMethod2 on every word.
const phoneticMethod3 = fullName => _.map(phoneticMethod2, _.split(' ', fullName))
|
||||
|
||||
// Combine name-parts in a standared order.
|
||||
// Combine name-parts in a standard order.
|
||||
|
||||
const fullNameFromParts = _.flow(
|
||||
_.toPairs,
|
||||
_.sortBy(_.nth(0)), // sort by part name,
|
||||
_.map(_.nth(1)), // get part value
|
||||
_.sortBy(_.first), // sort by part name,
|
||||
_.map(_.last), // get part value
|
||||
_.join(' ')
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue