Incorporated some corrections from comments.
This commit is contained in:
parent
a3f8db79b3
commit
402f75f50c
3 changed files with 39 additions and 31 deletions
|
|
@ -4,6 +4,7 @@ const util = require('util')
|
||||||
const XmlStream = require('xml-stream')
|
const XmlStream = require('xml-stream')
|
||||||
const nameUtils = require('./name-utils')
|
const nameUtils = require('./name-utils')
|
||||||
const options = require('../options')
|
const options = require('../options')
|
||||||
|
const logger = require('../logger')
|
||||||
const _ = require('lodash/fp')
|
const _ = require('lodash/fp')
|
||||||
|
|
||||||
const debug_log = require('./debug') // KOSTIS TODO: remove
|
const debug_log = require('./debug') // KOSTIS TODO: remove
|
||||||
|
|
@ -59,6 +60,13 @@ const processAlias = _.curry((groupTypes, aliasNode) => {
|
||||||
|
|
||||||
const getNamePart = processDocumentedNamePart(groupTypes)
|
const getNamePart = processDocumentedNamePart(groupTypes)
|
||||||
const latinNameNode = _.find(isLatin, aliasNode.DocumentedName)
|
const latinNameNode = _.find(isLatin, aliasNode.DocumentedName)
|
||||||
|
if (!latinNameNode) {
|
||||||
|
const id = aliasNode.$.FixedRef
|
||||||
|
const message = `Alias for Person with ID="${id}" has no latinized name`
|
||||||
|
logger.error(message)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
const namePartNodes = latinNameNode.DocumentedNamePart
|
const namePartNodes = latinNameNode.DocumentedNamePart
|
||||||
const nameParts = _.map(getNamePart, namePartNodes)
|
const nameParts = _.map(getNamePart, namePartNodes)
|
||||||
|
|
||||||
|
|
@ -86,6 +94,7 @@ function processFeature (featureNode) {
|
||||||
if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return
|
if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return
|
||||||
|
|
||||||
const datePeriodNode = featureNode.FeatureVersion.DatePeriod
|
const datePeriodNode = featureNode.FeatureVersion.DatePeriod
|
||||||
|
|
||||||
// Ignore the fact that both Start and end can be a range.
|
// Ignore the fact that both Start and end can be a range.
|
||||||
// By using Start.From and End.To we use the extremes of the date-period.
|
// By using Start.From and End.To we use the extremes of the date-period.
|
||||||
const period = {
|
const period = {
|
||||||
|
|
@ -107,9 +116,14 @@ function processProfile (profileNode) {
|
||||||
const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
|
const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
|
||||||
const groupTypes = new Map(groupTypesEntries)
|
const groupTypes = new Map(groupTypesEntries)
|
||||||
|
|
||||||
|
const mapCompact = _.flow(_.compact, _.map)
|
||||||
|
|
||||||
const getNameParts = processAlias(groupTypes)
|
const getNameParts = processAlias(groupTypes)
|
||||||
const aliases = _.compact(_.map(getNameParts, identityNode.Alias))
|
const aliases = mapCompact(getNameParts, identityNode.Alias)
|
||||||
const birthDatePeriods = _.compact(_.map(processFeature, profileNode.Feature))
|
|
||||||
|
if (_.isEmpty(aliases)) return
|
||||||
|
|
||||||
|
const birthDatePeriods = mapCompact(processFeature, profileNode.Feature)
|
||||||
const individual = {aliases, birthDatePeriods}
|
const individual = {aliases, birthDatePeriods}
|
||||||
|
|
||||||
debug_log(individual)
|
debug_log(individual)
|
||||||
|
|
@ -135,7 +149,7 @@ function promiseParseDocument (source) {
|
||||||
xml.collect('MasterNamePartGroup')
|
xml.collect('MasterNamePartGroup')
|
||||||
|
|
||||||
const individuals = []
|
const individuals = []
|
||||||
const collectResult = result => individuals.push(result)
|
const collectResult = result => result && individuals.push(result)
|
||||||
xml.on('updateElement: Profile', _.flow(processProfile, collectResult))
|
xml.on('updateElement: Profile', _.flow(processProfile, collectResult))
|
||||||
|
|
||||||
xml.on('end', _.wrap(resolve, individuals))
|
xml.on('end', _.wrap(resolve, individuals))
|
||||||
|
|
@ -147,8 +161,8 @@ const readdir = util.promisify(fs.readdir)
|
||||||
const combineAndDedupe = _.flow(
|
const combineAndDedupe = _.flow(
|
||||||
_.flatten,
|
_.flatten,
|
||||||
_.compact,
|
_.compact,
|
||||||
_.uniqBy(_.nth(0)),
|
_.uniqBy(_.first),
|
||||||
_.map(_.nth(1))
|
_.map(_.last)
|
||||||
)
|
)
|
||||||
|
|
||||||
function parseList () {
|
function parseList () {
|
||||||
|
|
@ -160,11 +174,7 @@ function parseList () {
|
||||||
}
|
}
|
||||||
|
|
||||||
return readdir(OFAC_DATA_DIR)
|
return readdir(OFAC_DATA_DIR)
|
||||||
.then(sources => {
|
.then(sources => Promise.all(_.map(promiseParseDocument, sources)))
|
||||||
const promises = _.map(promiseParseDocument, sources)
|
|
||||||
|
|
||||||
return Promise.all(promises)
|
|
||||||
})
|
|
||||||
.then(combineAndDedupe)
|
.then(combineAndDedupe)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,35 +5,32 @@ const _ = require('lodash/fp')
|
||||||
|
|
||||||
const debug_log = require('./debug') // KOSTIS TODO: remove
|
const debug_log = require('./debug') // KOSTIS TODO: remove
|
||||||
|
|
||||||
const individuals = []
|
let individuals = []
|
||||||
|
|
||||||
function load () {
|
function load () {
|
||||||
const newList = Array.from(dataParser.parseList())
|
individuals = Array.from(dataParser.parseList())
|
||||||
const oldLength = individuals.length
|
|
||||||
individuals.splice(0, oldLength, newList)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
load()
|
|
||||||
|
|
||||||
// MATCHING
|
// MATCHING
|
||||||
|
|
||||||
// birth date
|
// birth date
|
||||||
|
|
||||||
function isDateWithinSomeYearsOfPeriod (period, date, years) {
|
function isDateWithinSomeDaysOfPeriod (period, date, days) {
|
||||||
const startDate = new Date(period.from.date)
|
const inMillisecs = 24 * 60 * 60 * 1000
|
||||||
const startYear = startDate.getFullYear()
|
|
||||||
startDate.setFullYear(startYear - years)
|
|
||||||
|
|
||||||
const endDate = new Date(period.to.date)
|
const startTime = period.from.date.getTime() - days * inMillisecs
|
||||||
const endYear = endDate.getFullYear()
|
const startDate = new Date(startTime)
|
||||||
endDate.setFullYear(endYear + years)
|
|
||||||
|
const endTime = period.to.date.getTime() + days * inMillisecs
|
||||||
|
const endDate = new Date(endTime)
|
||||||
|
|
||||||
return (startDate < date && date < endDate)
|
return (startDate < date && date < endDate)
|
||||||
}
|
}
|
||||||
|
|
||||||
function isBornWithinTwoYears (individual, dateObject) {
|
function isBornTooLongSince (individual, dateObject, days) {
|
||||||
const isWithinTwoYears = _.partialRight(isDateWithinSomeYearsOfPeriod, [dateObject.date, 2])
|
if (_.isEmpty(individual.birthDatePeriods)) return false
|
||||||
return _.some(isWithinTwoYears, individual.birthDatePeriods)
|
const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
|
||||||
|
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
|
||||||
}
|
}
|
||||||
|
|
||||||
// exact match
|
// exact match
|
||||||
|
|
@ -59,9 +56,10 @@ function calcPhoneticMatchScore (candidatePhoneticFullName) {
|
||||||
|
|
||||||
const similarity = _.curry((candidate, individual) => {
|
const similarity = _.curry((candidate, individual) => {
|
||||||
// Calculate if his birth date is within two years of the given date.
|
// Calculate if his birth date is within two years of the given date.
|
||||||
// If an individual has multiple birth-date periods, return wether any are
|
// If an individual has multiple birth-date periods, return whether any are
|
||||||
// within two years. Reject individuals who don't match this criterion.
|
// within two years. Reject individuals who don't match this criterion.
|
||||||
if (individual.birthDatePeriods.length && !isBornWithinTwoYears(individual, candidate.birthDate)) return 0
|
const twoYears = 365 * 2
|
||||||
|
if (!isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0
|
||||||
|
|
||||||
// Calculate the Jaro-Winkler similarity of the full name.
|
// Calculate the Jaro-Winkler similarity of the full name.
|
||||||
// If an individual has multiple aliases, use the maximum score.
|
// If an individual has multiple aliases, use the maximum score.
|
||||||
|
|
|
||||||
|
|
@ -10,12 +10,12 @@ const phoneticMethod2 = _.flow(doubleMetaphone, _.uniq)
|
||||||
|
|
||||||
const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2))
|
const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2))
|
||||||
|
|
||||||
// Combine name-parts in a standared order.
|
// Combine name-parts in a standard order.
|
||||||
|
|
||||||
const fullNameFromParts = _.flow(
|
const fullNameFromParts = _.flow(
|
||||||
_.toPairs,
|
_.toPairs,
|
||||||
_.sortBy(_.nth(0)), // sort by part name,
|
_.sortBy(_.first), // sort by part name,
|
||||||
_.map(_.nth(1)), // get part value
|
_.map(_.last), // get part value
|
||||||
_.join(' ')
|
_.join(' ')
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue