112 lines
3.5 KiB
JavaScript
112 lines
3.5 KiB
JavaScript
const jaro = require('talisman/metrics/distance/jaro')
|
|
const _ = require('lodash/fp')
|
|
|
|
const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove
|
|
|
|
// Curried wrapper around talisman's `jaro` so it can be called with both
// strings at once or partially applied with the first string only.
// NOTE(review): callers in this file treat a higher result as "more similar" — confirm against talisman's docs.
const stringSimilarity = _.curry(jaro)
|
|
|
|
// birth date
|
|
|
|
/**
 * Tells whether `date` falls inside `period` once the period has been widened
 * by `days` days on each side.
 *
 * Both bounds are inclusive: a date lying exactly on the widened start or end
 * counts as being within the period. In particular, with `days = 0` a date
 * equal to the period's own start or end matches.
 *
 * @param {{start: {date: Date}, end: {date: Date}}} period - Period to test against.
 * @param {Date} date - Date being tested.
 * @param {number} days - Tolerance, in days, added before the start and after the end.
 * @returns {boolean} `true` when `date` is inside the widened period.
 */
function isDateWithinSomeDaysOfPeriod (period, date, days) {
  const millisecondsPerDay = 24 * 60 * 60 * 1000
  const margin = days * millisecondsPerDay

  const earliest = period.start.date.getTime() - margin
  const latest = period.end.date.getTime() + margin

  // Number() gives the epoch timestamp of a Date, and NaN for `undefined` or
  // an invalid Date; any comparison with NaN is false, so those never match.
  const timestamp = Number(date)

  return earliest <= timestamp && timestamp <= latest
}
|
|
|
|
/**
 * Curried predicate: is `individual` born too far away from the given date?
 *
 * Returns `false` (i.e. "cannot disqualify") when no date object is supplied
 * or when the individual has no birth-date periods on record. Otherwise it
 * returns `true` only if none of the individual's birth-date periods, widened
 * by `days` days, contains `dateObject.date`.
 *
 * @param {number} days - Tolerance in days around each birth-date period.
 * @param {{date: Date}} dateObject - Wrapper holding the date to compare with.
 * @param {{birthDatePeriods: Array}} individual - Record being checked.
 * @returns {boolean} `true` when the individual should be rejected.
 */
const isBornTooLongSince = _.curry((days, dateObject, individual) => {
  if (!dateObject || _.isEmpty(individual.birthDatePeriods)) return false

  const {date} = dateObject
  const isCloseToPeriod = period => isDateWithinSomeDaysOfPeriod(period, date, days)

  return !_.some(isCloseToPeriod, individual.birthDatePeriods)
})
|
|
|
|
// algorithm
|
|
|
|
/**
 * Finds the individuals in `structs` that plausibly correspond to `candidate`.
 *
 * Two passes collect alias ids:
 *  1. aliases whose full name scores at least `fullNameThreshold` against the
 *     candidate's full name;
 *  2. aliases that collect enough distinct word-level hits, where each hit
 *     blends string similarity with a phonetic score (+1 on a phonetic match,
 *     -1 otherwise) and must reach `threshold`.
 * The alias ids are resolved to individuals, de-duplicated, and finally
 * filtered on birth date.
 *
 * @param {Object} structs - Lookup structures; this function reads
 *   `individuals`, `wordList`, `phoneticMap`, `aliasesMap`, `aliasToIndividual`
 *   and `individualsMap`.
 * @param {Object} candidate - Reads `fullName`, `words` (each with `value` and
 *   `phonetics`) and `birthDate`.
 * @param {Object} options
 * @param {number} options.threshold - Minimum blended score for a word hit.
 * @param {number} options.fullNameThreshold - Minimum full-name similarity.
 * @param {number} [options.ratio=0.5] - Weight of the phonetic score; the
 *   string score gets `1 - ratio`.
 * @param {boolean} [options.debug] - When truthy, logs both alias id lists.
 * @param {*} [options.verboseFor] - Collection checked with `_.includes`; word
 *   entries whose value is found in it get their scores printed.
 * @returns {Array} Matching individuals that pass the birth-date filter.
 */
function match (structs, candidate, options) {
  const {threshold, fullNameThreshold, ratio = 0.5, debug, verboseFor} = options
  const {fullName, words, birthDate} = candidate

  // Pass 1: accept aliases whose full name matches.
  const isFullNameCloseEnough = alias => {
    const similarity = stringSimilarity(fullName, _.get('fullName', alias))
    return _.lte(fullNameThreshold, similarity)
  }
  const allAliases = _.flatMap(_.get('aliases'), structs.individuals)
  const fullNameHits = _.filter(isFullNameCloseEnough, allAliases)
  const aliasIdsFromFullName = _.map(_.get('id'), fullNameHits)

  // Pass 2: score every candidate word against every known word.
  const phoneticWeight = ratio
  const stringWeight = 1 - phoneticWeight
  const lookupPhonetic = phonetic => structs.phoneticMap.get(phonetic)
  const byDescendingScore = hit => -hit.score

  // Kept sorted by descending score; the de-duplication below keeps the first
  // (best) hit of each group, so the order matters.
  const matches = []

  for (const word of words) {
    const phoneticAliasIds = new Set(_.flatMap(lookupPhonetic, word.phonetics))

    for (const wordEntry of structs.wordList) {
      const stringScore = stringSimilarity(word.value, wordEntry.value)
      const verbose = _.includes(wordEntry.value, verboseFor)

      for (const aliasId of wordEntry.aliasIds) {
        const phoneticScore = phoneticAliasIds.has(aliasId) ? 1 : -1
        const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore

        if (verbose) {
          console.log(finalScore.toFixed(2), stringScore.toFixed(2), phoneticScore.toFixed(2), word.value, wordEntry.value)
        }

        // Negated `>=` on purpose: a NaN score must be skipped, not kept.
        if (!(finalScore >= threshold)) continue

        const hit = {aliasId, score: finalScore, word: word.value, value: wordEntry.value}
        const position = _.sortedIndexBy(byDescendingScore, hit, matches)
        matches.splice(position, 0, hit)
      }
    }
  }

  // For each alias, let a candidate word and a known word each count once.
  const sameWord = (a, b) => a.aliasId === b.aliasId && a.word === b.word
  const sameValue = (a, b) => a.aliasId === b.aliasId && a.value === b.value
  const distinctHits = _.uniqWith(sameValue, _.uniqWith(sameWord, matches))

  // NOTE(review): _.countBy/_.toPairs produce string keys, so the Map lookups
  // below presumably rely on alias ids being strings — confirm.
  const hitCounts = _.countBy(_.identity, _.map(_.get('aliasId'), distinctHits))

  // An alias qualifies with two distinct hits, or fewer when either the
  // candidate or the alias has fewer than two words.
  const hasEnoughHits = ([aliasId, count]) => {
    const {length} = structs.aliasesMap.get(aliasId).words
    return (count >= _.min([2, words.length, length]))
  }
  const qualifiedPairs = _.filter(hasEnoughHits, _.toPairs(hitCounts))
  const aliasIdsFromNamePart = _.map(_.first, qualifiedPairs)

  if (debug) {
    debug_log(aliasIdsFromFullName)
    debug_log(aliasIdsFromNamePart)
  }

  // Get the full record for each matched id.
  const toIndividual = aliasId => {
    const individualId = structs.aliasToIndividual.get(aliasId)
    return structs.individualsMap.get(individualId)
  }
  const matchedAliasIds = [...aliasIdsFromFullName, ...aliasIdsFromNamePart]
  const suspects = _.uniq(_.map(toIndividual, matchedAliasIds))

  // Reject everyone whose birth date is more than two years (365 * 2 days) away.
  const twoYears = 365 * 2
  return _.reject(isBornTooLongSince(twoYears, birthDate), suspects)
}
|
|
|
|
// Public API: only `match` is exported; the helpers above stay module-private.
module.exports = {match}
|