Parsing moved to downloading. Matching is being tweaked.
This commit is contained in:
parent
793db0f449
commit
b72f5549a5
10 changed files with 456 additions and 276 deletions
|
|
@ -28,7 +28,8 @@ const isBornTooLongSince = _.curry((days, dateObject, individual) => {
|
|||
|
||||
// algorithm
|
||||
|
||||
function match (structs, candidate, threshold) {
|
||||
function match (structs, candidate, options) {
|
||||
const {threshold, ratio = 0.1, debug, verboseFor} = options
|
||||
const {fullName, words, birthDate} = candidate
|
||||
|
||||
// Accept aliases whose full name matches.
|
||||
|
|
@ -44,42 +45,57 @@ function match (structs, candidate, threshold) {
|
|||
)(aliases)
|
||||
|
||||
|
||||
const aliasIds = []
|
||||
const phoneticWeight = 0.17
|
||||
const aliasIdCounts = new Map()
|
||||
const phoneticWeight = ratio
|
||||
const stringWeight = 1 - phoneticWeight
|
||||
|
||||
for (const word of words) {
|
||||
const getPhonetic = phonetic => structs.phoneticMap.get(phonetic)
|
||||
const phoneticMatches = new Set(_.flatMap(getPhonetic, word.phonetics))
|
||||
|
||||
const aliasIds = new Set()
|
||||
|
||||
for (const wordEntry of structs.wordList) {
|
||||
const stringScore = stringSimilarity(word.value, wordEntry.value)
|
||||
|
||||
if (stringWeight * stringScore + phoneticWeight < threshold) continue
|
||||
const verbose = _.includes(wordEntry.value, verboseFor)
|
||||
|
||||
if (!verbose && stringWeight * stringScore + phoneticWeight < threshold) continue
|
||||
|
||||
for (const aliasId of wordEntry.aliasIds) {
|
||||
const phoneticScore = phoneticMatches.has(aliasId) ? 1 : 0
|
||||
const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
|
||||
const phoneticScore = phoneticMatches.has(aliasId) ? 1 : -1
|
||||
// const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
|
||||
const finalScore = stringScore + phoneticWeight * phoneticScore
|
||||
|
||||
verbose && console.log(finalScore.toFixed(2), stringScore.toFixed(2), phoneticScore.toFixed(2), word.value, wordEntry.value)
|
||||
|
||||
if (finalScore >= threshold) {
|
||||
aliasIds.push(aliasId)
|
||||
aliasIds.add(aliasId)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
verboseFor && console.log(aliasIds)
|
||||
|
||||
for (const aliasId of aliasIds.values()) {
|
||||
const count = aliasIdCounts.get(aliasId) || 0
|
||||
aliasIdCounts.set(aliasId, count + 1)
|
||||
}
|
||||
}
|
||||
|
||||
const aliasIdsFromNamePart = _.flow(
|
||||
_.countBy(_.identity),
|
||||
_.toPairs,
|
||||
_.reject(_.flow(
|
||||
_.last,
|
||||
_.gt(2)
|
||||
)),
|
||||
_.map(_.first)
|
||||
)(aliasIds)
|
||||
verboseFor && console.log(aliasIdCounts)
|
||||
|
||||
// debug_log(aliasIdsFromFullName)
|
||||
// debug_log(aliasIdsFromNamePart)
|
||||
const aliasIdsFromNamePart = []
|
||||
|
||||
for (const [aliasId, count] of aliasIdCounts) {
|
||||
const {length} = structs.aliasesMap.get(aliasId).words
|
||||
if (count >= _.min([2, words.length, length])) {
|
||||
aliasIdsFromNamePart.push(aliasId)
|
||||
}
|
||||
}
|
||||
|
||||
debug && debug_log(aliasIdsFromFullName)
|
||||
debug && debug_log(aliasIdsFromNamePart)
|
||||
|
||||
// Get the full record for each matched id
|
||||
const getIndividual = aliasId => {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue