const jaro = require('talisman/metrics/distance/jaro')
const _ = require('lodash/fp')
const logger = require('../logger')

// Curried form of the imported jaro metric, so the first string can be fixed
// ahead of time when needed.
const stringSimilarity = _.curry(jaro)

// birth date

// Returns true when `date` lies strictly between the start of `period` moved
// `days` days earlier and the end of `period` moved `days` days later.
// Reads `period.start.date` and `period.end.date` through `getTime()`.
function isDateWithinSomeDaysOfPeriod(period, date, days) {
  const msPerDay = 24 * 60 * 60 * 1000
  const earliest = new Date(period.start.date.getTime() - days * msPerDay)
  const latest = new Date(period.end.date.getTime() + days * msPerDay)
  if (!(earliest < date)) return false
  return date < latest
}

// Curried (days, dateObject, individual) predicate.
// Returns false when no `dateObject` is given or when the individual has no
// birth date periods; otherwise returns true only if `dateObject.date` falls
// within `days` days of none of the individual's birth date periods.
const isBornTooLongSince = _.curry((days, dateObject, individual) => {
  if (!dateObject || _.isEmpty(individual.birthDatePeriods)) {
    return false
  }
  const { date } = dateObject
  const coversDate = period => isDateWithinSomeDaysOfPeriod(period, date, days)
  return !_.some(coversDate, individual.birthDatePeriods)
})

// algorithm

/**
 * Finds the individuals in `structs` that match `candidate`.
 *
 * An alias is retained either because its full name scores at least
 * `fullNameThreshold` against the candidate's full name, or because enough of
 * the candidate's words score at least `threshold` against the alias's words.
 * Individuals rejected by the two-year birth date check are dropped from the
 * result.
 *
 * @param {object} structs - Lookup structures. Reads `individuals`,
 *   `phoneticMap`, `wordList`, `aliasesMap`, `aliasToIndividual` and
 *   `individualsMap`.
 * @param {object} candidate - Reads `fullName`, `words` and `birthDate`.
 * @param {object} options - Reads `threshold`, `fullNameThreshold`, `ratio`
 *   (weight of the phonetic score, 0.5 when omitted) and `verboseFor` (word
 *   values whose scores are sent to `logger.debug`).
 * @returns {Array} Matching entries of `structs.individualsMap`, deduplicated.
 */
function match(structs, candidate, options) {
  const { threshold, fullNameThreshold, ratio = 0.5, verboseFor } = options
  const { fullName, words, birthDate } = candidate

  // Accept aliases whose full name matches.
  const isFullNameCloseEnough = alias => {
    const similarity = stringSimilarity(fullName, _.get('fullName', alias))
    return _.lte(fullNameThreshold, similarity)
  }
  const allAliases = _.flatMap(_.get('aliases'), structs.individuals)
  const fullNameHits = _.filter(isFullNameCloseEnough, allAliases)
  const aliasIdsFromFullName = _.map(_.get('id'), fullNameHits)

  // Score every candidate word against every known word, per alias.
  const phoneticWeight = ratio
  const stringWeight = 1 - phoneticWeight
  const lookupPhonetic = phonetic => structs.phoneticMap.get(phonetic)
  const byDescendingScore = hit => -hit.score

  // Kept ordered from highest to lowest score.
  const matches = []

  for (const word of words) {
    const phoneticMatches = new Set(_.flatMap(lookupPhonetic, word.phonetics))

    for (const wordEntry of structs.wordList) {
      const stringScore = stringSimilarity(word.value, wordEntry.value)
      const shouldLog = _.includes(wordEntry.value, verboseFor)

      for (const aliasId of wordEntry.aliasIds) {
        // A phonetic hit adds to the score, a miss subtracts from it.
        const phoneticScore = phoneticMatches.has(aliasId) ? 1 : -1
        const finalScore =
          stringWeight * stringScore + phoneticWeight * phoneticScore

        if (shouldLog) {
          logger.debug(
            finalScore.toFixed(2),
            stringScore.toFixed(2),
            phoneticScore.toFixed(2),
            word.value,
            wordEntry.value,
          )
        }

        if (finalScore >= threshold) {
          const hit = {
            aliasId,
            score: finalScore,
            word: word.value,
            value: wordEntry.value,
          }
          const position = _.sortedIndexBy(byDescendingScore, hit, matches)
          matches.splice(position, 0, hit)
        }
      }
    }
  }

  // Keep one hit per (alias, candidate word), then one per (alias, known
  // word), and count what is left for each alias.
  const sameWord = (a, b) => a.aliasId === b.aliasId && a.word === b.word
  const sameValue = (a, b) => a.aliasId === b.aliasId && a.value === b.value
  const distinctByWord = _.uniqWith(sameWord, matches)
  const distinctHits = _.uniqWith(sameValue, distinctByWord)
  const hitCountByAlias = _.countBy(_.get('aliasId'), distinctHits)

  // An alias qualifies when its count reaches the smallest of: 2, the number
  // of candidate words, and the number of words in the alias.
  const aliasIdsFromNamePart = []
  for (const [aliasId, hitCount] of _.toPairs(hitCountByAlias)) {
    const aliasWordCount = structs.aliasesMap.get(aliasId).words.length
    const requiredHits = _.min([2, words.length, aliasWordCount])
    if (hitCount >= requiredHits) {
      aliasIdsFromNamePart.push(aliasId)
    }
  }

  // Get the full record for each matched id
  const toIndividual = aliasId =>
    structs.individualsMap.get(structs.aliasToIndividual.get(aliasId))
  const matchedAliasIds = [...aliasIdsFromFullName, ...aliasIdsFromNamePart]
  const suspects = _.uniq(_.map(toIndividual, matchedAliasIds))

  // Reject everyone who is born two years away.
  const twoYearsInDays = 365 * 2
  const isBornOutsideWindow = isBornTooLongSince(twoYearsInDays, birthDate)
  return _.reject(isBornOutsideWindow, suspects)
}

module.exports = { match }