Set the name ordering & small fixes

This commit is contained in:
Konstantin Mamalakis 2018-02-22 20:37:33 +02:00 committed by Josh Harvey
parent 402f75f50c
commit 910d7e200f
4 changed files with 74 additions and 44 deletions

View file

@ -1,27 +1,32 @@
const dataParser = require('./data-paraser')
const dataParser = require('./data-parser')
const nameUtils = require('./name-utils')
const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
const _ = require('lodash/fp')
const debug_log = require('./debug') // KOSTIS TODO: remove
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
// Module-level list of individuals: assigned by load() and read by match().
// NOTE(review): the same binding is declared twice here (`[]` and `null`);
// this looks like the before/after pair of the change — confirm which one the
// file actually keeps. match() tests `!individuals`, which only detects the
// "not loaded yet" state with the `null` initialiser.
let individuals = []
let individuals = null
// Loads the list via dataParser.parseList() and stores it in `individuals`.
// Returns the promise chain started by parseList(); the `.then` callback is
// what stores the resolved list.
// NOTE(review): the first assignment treats parseList()'s result as an
// iterable while the returned chain treats it as a promise — these look like
// the old and new versions of the same statement shown together; confirm
// which one applies.
function load () {
individuals = Array.from(dataParser.parseList())
return dataParser.parseList()
.then(list => {
individuals = Array.from(list)
})
}
// MATCHING
// Maps `iteratee` over `list` and returns the maximum of the mapped values.
const mapMax = (iteratee, list) => {
  const mappedValues = _.map(iteratee, list)
  return _.max(mappedValues)
}
// birth date
function isDateWithinSomeDaysOfPeriod (period, date, days) {
const inMillisecs = 24 * 60 * 60 * 1000
const startTime = period.from.date.getTime() - days * inMillisecs
const startTime = period.start.date.getTime() - days * inMillisecs
const startDate = new Date(startTime)
const endTime = period.to.date.getTime() + days * inMillisecs
const endTime = period.end.date.getTime() + days * inMillisecs
const endDate = new Date(endTime)
return (startDate < date && date < endDate)
@ -33,52 +38,55 @@ function isBornTooLongSince (individual, dateObject, days) {
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
}
// exact match
// string similarity
// Builds a scorer for one candidate name: the returned function takes an
// alias and passes its `fullName` together with `candidateFullName` to
// jaroWinkler, returning that result.
function calcExactMatchScore (candidateFullName) {
  const scoreAlias = (alias) => {
    const aliasName = alias.fullName
    return jaroWinkler(aliasName, candidateFullName)
  }
  return scoreAlias
}
// Curried form of jaroWinkler, so that one name can be fixed first and the
// other supplied later (this is how bestMatchInList uses it).
const stringMatch = _.curry(jaroWinkler)
// phonetic match
// Given a list of names and a single name, returns the highest stringMatch
// score between `name` and any entry of `list`. Wrapped in _.curry, so the
// list can be supplied first and the name later.
const bestMatchInList = _.curry((list, name) => {
  const scoreAgainstName = stringMatch(name)
  return mapMax(scoreAgainstName, list)
})
// Builds a scorer for one candidate's phonetic name: the returned function
// takes an alias and passes its `phoneticFullName` together with
// `candidatePhoneticFullName` to jaroWinkler, returning that result.
function calcPhoneticMatchScore (candidatePhoneticFullName) {
  const scoreAlias = (alias) => {
    const aliasPhonetic = alias.phoneticFullName
    return jaroWinkler(aliasPhonetic, candidatePhoneticFullName)
  }
  return scoreAlias
}
// Scores an alias against a candidate: each entry of alias.fullNames is
// matched against all of candidate.fullNames, and the best score over all
// pairs is returned. Wrapped in _.curry, so the candidate can be fixed first.
const aliasStringMatch = _.curry((candidate, alias) =>
  mapMax(bestMatchInList(candidate.fullNames), alias.fullNames)
)
// algorithm
// NOTE: I'm still not 100% sure which matching algorithm is the best choice.
// I'm just experimenting with a few metrics for now.
// Scores one individual against the candidate (wrapped in _.curry, candidate
// first). Individuals rejected by the birth-date check score 0.
// NOTE(review): this body appears to show the removed and the added lines of
// the change side by side (two opposite birth-date guards, two `return`
// statements). Read literally — and assuming isBornTooLongSince gives the
// same answer on both calls — one of the two guards always returns 0, and the
// lines after the first `return _.max(...)` are unreachable. Confirm which
// lines belong to the current version before relying on this listing.
const similarity = _.curry((candidate, individual) => {
// Calculate if his birth date is within two years of the given date.
// If an individual has multiple birth-date periods, return whether any are
// within two years. Reject individuals who don't match this criterion.
const twoYears = 365 * 2
if (!isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0
if (isBornTooLongSince(individual, candidate.birthDate, twoYears)) return 0
debug_log(individual)
// Calculate the Jaro-Winkler similarity of the full name.
// If an individual has multiple aliases, use the maximum score.
const exactMatchScore = _.max(_.map(calcExactMatchScore(candidate.fullName), individual.aliases))
const scoreAgainstCandidate = aliasStringMatch(candidate)
const stringMatchScore = mapMax(scoreAgainstCandidate, individual.aliases)
// Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
// This should approximate the phonetic similarity of the two names.
// If an individual has multiple aliases, use the maximum score.
const phoneticMatchScore = _.max(_.map(calcPhoneticMatchScore(candidate.phoneticFullName), individual.aliases))
// // Calculate the Jaro-Winkler similarity of the phonetic representation of the full name.
// // This should approximate the phonetic similarity of the two names.
// // If an individual has multiple aliases, use the maximum score.
// const phoneticMatchScore = mapMax(calcPhoneticMatchScore(candidate.phoneticFullName))(individual.aliases)
return _.max([exactMatchScore, phoneticMatchScore])
console.log(stringMatchScore)
return _.max([stringMatchScore])
})
function match (parts, birthDateString) {
function match (nameParts, birthDateString) {
if (!individuals) {
const message = 'The OFAC data sources have not been loaded yet.'
return Promise.reject(new Error(message))
}
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
const fullName = nameUtils.fullNameFromParts(parts)
const parts = _.mapValues(_.lowerCase, nameParts)
const fullNames = nameUtils.makeFullNames(parts)
const phoneticParts = _.mapValues(nameUtils.phonetic, parts)
const phoneticFullName = nameUtils.phonetic(fullName)
const phoneticFullNames = _.map(nameUtils.phonetic, fullNames)
// birthDateString is in YYYYMMDD format
const year = parseInt(birthDateString.slice(0, 4))
@ -88,10 +96,12 @@ function match (parts, birthDateString) {
const birthDate = {year, month, day, date}
const candidate = {parts, fullName, phoneticParts, phoneticFullName, birthDate}
const candidate = {parts, fullNames, phoneticParts, phoneticFullNames, birthDate}
const similarToCandidate = similarity(candidate)
return _.max(similarToCandidate, individuals)
const result = mapMax(similarToCandidate, individuals)
debug_log(candidate)
return result
}
module.exports = {load, match}