Set the name ordering & small fixes
This commit is contained in:
parent
402f75f50c
commit
910d7e200f
4 changed files with 74 additions and 44 deletions
|
|
@ -7,7 +7,7 @@ const options = require('../options')
|
||||||
const logger = require('../logger')
|
const logger = require('../logger')
|
||||||
const _ = require('lodash/fp')
|
const _ = require('lodash/fp')
|
||||||
|
|
||||||
const debug_log = require('./debug') // KOSTIS TODO: remove
|
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
|
||||||
|
|
||||||
const OFAC_DATA_DIR = options.ofacDataDir
|
const OFAC_DATA_DIR = options.ofacDataDir
|
||||||
|
|
||||||
|
|
@ -71,12 +71,12 @@ const processAlias = _.curry((groupTypes, aliasNode) => {
|
||||||
const nameParts = _.map(getNamePart, namePartNodes)
|
const nameParts = _.map(getNamePart, namePartNodes)
|
||||||
|
|
||||||
const parts = _.assignAll(nameParts)
|
const parts = _.assignAll(nameParts)
|
||||||
const fullName = nameUtils.fullNameFromParts(parts)
|
const fullNames = nameUtils.makeFullNames(parts)
|
||||||
|
|
||||||
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
|
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
|
||||||
const phoneticFullName = nameUtils.phonetic(fullName)
|
const phoneticFullNames = _.map(nameUtils.phonetic, fullNames)
|
||||||
|
|
||||||
return {parts, fullName, phoneticParts, phoneticFullName}
|
return {parts, fullNames, phoneticParts, phoneticFullNames}
|
||||||
})
|
})
|
||||||
|
|
||||||
// birth date
|
// birth date
|
||||||
|
|
@ -116,7 +116,7 @@ function processProfile (profileNode) {
|
||||||
const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
|
const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
|
||||||
const groupTypes = new Map(groupTypesEntries)
|
const groupTypes = new Map(groupTypesEntries)
|
||||||
|
|
||||||
const mapCompact = _.flow(_.compact, _.map)
|
const mapCompact = _.flow(_.map, _.compact)
|
||||||
|
|
||||||
const getNameParts = processAlias(groupTypes)
|
const getNameParts = processAlias(groupTypes)
|
||||||
const aliases = mapCompact(getNameParts, identityNode.Alias)
|
const aliases = mapCompact(getNameParts, identityNode.Alias)
|
||||||
|
|
|
||||||
|
|
@ -1 +0,0 @@
|
||||||
module.exports = (...args) => console.log(require('util').inspect(args, {depth: null, colors: true}))
|
|
||||||
|
|
@ -1,27 +1,32 @@
|
||||||
const dataParser = require('./data-paraser')
|
const dataParser = require('./data-parser')
|
||||||
const nameUtils = require('./name-utils')
|
const nameUtils = require('./name-utils')
|
||||||
const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
|
const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
|
||||||
const _ = require('lodash/fp')
|
const _ = require('lodash/fp')
|
||||||
|
|
||||||
const debug_log = require('./debug') // KOSTIS TODO: remove
|
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
|
||||||
|
|
||||||
let individuals = []
|
let individuals = null
|
||||||
|
|
||||||
function load () {
|
function load () {
|
||||||
individuals = Array.from(dataParser.parseList())
|
return dataParser.parseList()
|
||||||
|
.then(list => {
|
||||||
|
individuals = Array.from(list)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// MATCHING
|
// MATCHING
|
||||||
|
|
||||||
|
const mapMax = (iteratee, list) => _.max(_.map(iteratee, list))
|
||||||
|
|
||||||
// birth date
|
// birth date
|
||||||
|
|
||||||
function isDateWithinSomeDaysOfPeriod (period, date, days) {
|
function isDateWithinSomeDaysOfPeriod (period, date, days) {
|
||||||
const inMillisecs = 24 * 60 * 60 * 1000
|
const inMillisecs = 24 * 60 * 60 * 1000
|
||||||
|
|
||||||
const startTime = period.from.date.getTime() - days * inMillisecs
|
const startTime = period.start.date.getTime() - days * inMillisecs
|
||||||
const startDate = new Date(startTime)
|
const startDate = new Date(startTime)
|
||||||
|
|
||||||
const endTime = period.to.date.getTime() + days * inMillisecs
|
const endTime = period.end.date.getTime() + days * inMillisecs
|
||||||
const endDate = new Date(endTime)
|
const endDate = new Date(endTime)
|
||||||
|
|
||||||
return (startDate < date && date < endDate)
|
return (startDate < date && date < endDate)
|
||||||
|
|
@ -33,52 +38,55 @@ function isBornTooLongSince (individual, dateObject, days) {
|
||||||
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
|
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
|
||||||
}
|
}
|
||||||
|
|
||||||
// exact match
|
// string similarity
|
||||||
|
|
||||||
function calcExactMatchScore (candidateFullName) {
|
const stringMatch = _.curry(jaroWinkler)
|
||||||
return function (alias) {
|
|
||||||
return jaroWinkler(alias.fullName, candidateFullName)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// phonetic match
|
const bestMatchInList = _.curry((list, name) => mapMax(stringMatch(name), list))
|
||||||
|
|
||||||
function calcPhoneticMatchScore (candidatePhoneticFullName) {
|
const aliasStringMatch = _.curry((candidate, alias) => {
|
||||||
return function (alias) {
|
const matchWithCandidate = bestMatchInList(candidate.fullNames)
|
||||||
return jaroWinkler(alias.phoneticFullName, candidatePhoneticFullName)
|
return mapMax(matchWithCandidate, alias.fullNames)
|
||||||
}
|
})
|
||||||
}
|
|
||||||
|
|
||||||
// algorithm
|
// algorithm
|
||||||
|
|
||||||
// NOTE: I'm still not 100% on what matching algorithm is the best choice.
|
|
||||||
// I just experiment with a few metrics for now.
|
|
||||||
|
|
||||||
const similarity = _.curry((candidate, individual) => {
|
const similarity = _.curry((candidate, individual) => {
|
||||||
// Calculate if his birth date is within two years of the given date.
|
// Calculate if his birth date is within two years of the given date.
|
||||||
// If an individual has multiple birth-date periods, return whether any are
|
// If an individual has multiple birth-date periods, return whether any are
|
||||||
// within two years. Reject individuals who don't match this criterion.
|
// within two years. Reject individuals who don't match this criterion.
|
||||||
const twoYears = 365 * 2
|
const twoYears = 365 * 2
|
||||||
if (!isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0
|
if (isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0
|
||||||
|
|
||||||
|
debug_log(individual)
|
||||||
|
|
||||||
// Calculate the Jaro-Winkler similarity of the full name.
|
// Calculate the Jaro-Winkler similarity of the full name.
|
||||||
// If an individual has multiple aliases, use the maximum score.
|
// If an individual has multiple aliases, use the maximum score.
|
||||||
const exactMatchScore = _.max(_.map(calcExactMatchScore(candidate.fullName), individual.aliases))
|
const scoreAgainstCandidate = aliasStringMatch(candidate)
|
||||||
|
const stringMatchScore = mapMax(scoreAgainstCandidate, individual.aliases)
|
||||||
|
|
||||||
// Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
|
// // Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
|
||||||
// This should approximate the phonetic similarity of the two names.
|
// // This should approximate the phonetic similarity of the two names.
|
||||||
// If an individual has multiple aliases, use the maximum score.
|
// // If an individual has multiple aliases, use the maximum score.
|
||||||
const phoneticMatchScore = _.max(_.map(calcPhoneticMatchScore(candidate.phoneticFullName), individual.aliases))
|
// const phoneticMatchScore = mapMax(calcPhoneticMatchScore(candidate.phoneticFullName))(individual.aliases)
|
||||||
|
|
||||||
return _.max([exactMatchScore, phoneticMatchScore])
|
console.log(stringMatchScore)
|
||||||
|
|
||||||
|
return _.max([stringMatchScore])
|
||||||
})
|
})
|
||||||
|
|
||||||
function match (parts, birthDateString) {
|
function match (nameParts, birthDateString) {
|
||||||
|
if (!individuals) {
|
||||||
|
const message = 'The OFAC data sources have not been loaded yet.'
|
||||||
|
return Promise.reject(new Error(message))
|
||||||
|
}
|
||||||
|
|
||||||
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
|
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
|
||||||
const fullName = nameUtils.fullNameFromParts(parts)
|
const parts = _.mapValues(_.lowerCase, nameParts)
|
||||||
|
const fullNames = nameUtils.makeFullNames(parts)
|
||||||
|
|
||||||
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
|
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
|
||||||
const phoneticFullName = nameUtils.phonetic(fullName)
|
const phoneticFullNames = _.map(nameUtils.phonetic, fullNames)
|
||||||
|
|
||||||
// birthDateString is in YYYYMMDD format
|
// birthDateString is in YYYYMMDD format
|
||||||
const year = parseInt(birthDateString.slice(0, 4))
|
const year = parseInt(birthDateString.slice(0, 4))
|
||||||
|
|
@ -88,10 +96,12 @@ function match (parts, birthDateString) {
|
||||||
|
|
||||||
const birthDate = {year, month, day, date}
|
const birthDate = {year, month, day, date}
|
||||||
|
|
||||||
const candidate = {parts, fullName, phoneticParts, phoneticFullName, birthDate}
|
const candidate = {parts, fullNames, phoneticParts, phoneticFullNames, birthDate}
|
||||||
|
|
||||||
const similarToCandidate = similarity(candidate)
|
const similarToCandidate = similarity(candidate)
|
||||||
return _.max(similarToCandidate, individuals)
|
const result = mapMax(similarToCandidate, individuals)
|
||||||
|
debug_log(candidate)
|
||||||
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {load, match}
|
module.exports = {load, match}
|
||||||
|
|
|
||||||
|
|
@ -12,14 +12,35 @@ const phoneticMethod3 = _.flow(_.split(' '), _.map(phoneticMethod2))
|
||||||
|
|
||||||
// Combine name-parts in a standard order.
|
// Combine name-parts in a standard order.
|
||||||
|
|
||||||
const fullNameFromParts = _.flow(
|
const commonOrderings = [
|
||||||
_.toPairs,
|
['firstName', 'lastName'],
|
||||||
_.sortBy(_.first), // sort by part name,
|
['firstName', 'middleName', 'lastName'],
|
||||||
_.map(_.last), // get part value
|
['firstName', 'maidenName', 'lastName'],
|
||||||
|
['firstName', 'patronymic', 'lastName'],
|
||||||
|
['firstName', 'matronymic', 'lastName']
|
||||||
|
]
|
||||||
|
|
||||||
|
// const getFrom = _.flip()
|
||||||
|
|
||||||
|
const getFrom = _.curry((obj, key) => obj[key])
|
||||||
|
|
||||||
|
const getOrderedParts = (parts, ordering) => _.map(getFrom(parts), ordering)
|
||||||
|
|
||||||
|
const combineParts = _.curryN(2, _.flow(
|
||||||
|
getOrderedParts,
|
||||||
|
_.compact,
|
||||||
_.join(' ')
|
_.join(' ')
|
||||||
|
))
|
||||||
|
|
||||||
|
const makeAllOrderings = parts => _.map(combineParts(parts), commonOrderings)
|
||||||
|
|
||||||
|
const makeFullNames = _.flow(
|
||||||
|
makeAllOrderings,
|
||||||
|
_.uniq
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
fullNameFromParts,
|
makeFullNames,
|
||||||
phonetic: phoneticMethod3
|
phonetic: phoneticMethod3
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue