This commit is contained in:
Konstantin Mamalakis 2018-03-19 14:32:38 +02:00 committed by Josh Harvey
parent 577a85c9b1
commit f7561acf3c
4 changed files with 120 additions and 71 deletions

View file

@ -29,14 +29,14 @@ const isBornTooLongSince = _.curry((days, dateObject, individual) => {
// algorithm
function match (structs, candidate, options) {
const {threshold, ratio = 0.1, debug, verboseFor} = options
const {threshold, fullNameThreshold, ratio = 0.5, debug, verboseFor} = options
const {fullName, words, birthDate} = candidate
// Accept aliases whose full name matches.
const doesNameMatch = _.flow(
_.get('fullName'),
stringSimilarity(fullName),
_.lte(threshold)
_.lte(fullNameThreshold)
)
const aliases = _.flatMap(_.get('aliases'), structs.individuals)
const aliasIdsFromFullName = _.flow(
@ -45,54 +45,50 @@ function match (structs, candidate, options) {
)(aliases)
const aliasIdCounts = new Map()
const phoneticWeight = ratio
const stringWeight = 1 - phoneticWeight
const matches = []
for (const word of words) {
const getPhonetic = phonetic => structs.phoneticMap.get(phonetic)
const phoneticMatches = new Set(_.flatMap(getPhonetic, word.phonetics))
const aliasIds = new Set()
for (const wordEntry of structs.wordList) {
const stringScore = stringSimilarity(word.value, wordEntry.value)
const verbose = _.includes(wordEntry.value, verboseFor)
if (!verbose && stringWeight * stringScore + phoneticWeight < threshold) continue
for (const aliasId of wordEntry.aliasIds) {
const phoneticScore = phoneticMatches.has(aliasId) ? 1 : -1
// const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
const finalScore = stringScore + phoneticWeight * phoneticScore
const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
verbose && console.log(finalScore.toFixed(2), stringScore.toFixed(2), phoneticScore.toFixed(2), word.value, wordEntry.value)
if (finalScore >= threshold) {
aliasIds.add(aliasId)
const entry = {aliasId, score: finalScore, word: word.value, value: wordEntry.value}
const index = _.sortedIndexBy(x => -x.score, entry, matches)
matches.splice(index, 0, entry)
}
}
}
verboseFor && console.log(aliasIds)
for (const aliasId of aliasIds.values()) {
const count = aliasIdCounts.get(aliasId) || 0
aliasIdCounts.set(aliasId, count + 1)
}
}
verboseFor && console.log(aliasIdCounts)
const sameWord = (a, b) => a.aliasId === b.aliasId && a.word === b.word
const sameValue = (a, b) => a.aliasId === b.aliasId && a.value === b.value
const aliasIdsFromNamePart = []
for (const [aliasId, count] of aliasIdCounts) {
const {length} = structs.aliasesMap.get(aliasId).words
if (count >= _.min([2, words.length, length])) {
aliasIdsFromNamePart.push(aliasId)
}
}
const aliasIdsFromNamePart = _.flow(
_.uniqWith(sameWord),
_.uniqWith(sameValue),
_.map(_.get('aliasId')),
_.countBy(_.identity),
_.toPairs,
_.filter(([aliasId, count]) => {
const {length} = structs.aliasesMap.get(aliasId).words
return (count >= _.min([2, words.length, length]))
}),
_.map(_.first)
)(matches)
debug && debug_log(aliasIdsFromFullName)
debug && debug_log(aliasIdsFromNamePart)