Name-part matching now combines Jaro and Double Metaphone scores

2018-03-13 22:07:44 +02:00 · 2018-03-13 22:07:44 +02:00 · 793db0f449
commit 793db0f449
parent f00516ce2e
5 changed files with 54 additions and 78 deletions
--- a/lib/ofac/matching.js
+++ b/lib/ofac/matching.js
@@ -1,9 +1,9 @@
-const jaroWinkler = require('talisman/metrics/distance/jaro-winkler')
+const jaro = require('talisman/metrics/distance/jaro')
 const _ = require('lodash/fp')

 const debug_log = require('../pp')(__filename) // KOSTIS TODO: remove

-const stringSimilarity = _.curry(jaroWinkler)
+const stringSimilarity = _.curry(jaro)

 // birth date

@@ -29,7 +29,7 @@ const isBornTooLongSince = _.curry((days, dateObject, individual) => {
 // algorithm

 function match (structs, candidate, threshold) {
-  const {fullName, wordPhonetics, wordValues, birthDate} = candidate
+  const {fullName, words, birthDate} = candidate

  // Accept aliases who's full name matches.
  const doesNameMatch = _.flow(
@@ -43,53 +43,42 @@ function match (structs, candidate, threshold) {
    _.map(_.get('id'))
  )(aliases)

-  // Gather aliases who's name-parts match phonetically.
-  const getPhoneticMatches = phonetic => structs.phoneticMap.get(phonetic)
-  const phoneticMatches = _.flow(
-    _.map(wordPhonetic => {
-      const {word, phonetic} = wordPhonetic
-      const matches = getPhoneticMatches(phonetic)
-      return _.map(match => ({...match, word}), matches)
-    }),
-    _.compact,
-    // _.map(_.uniqWith((a, b) => a.aliasId === b.aliasId)),
-    _.flatten
-  )(wordPhonetics)

-  // Gether aliases whose name-parts match alphabetically.
-  const getStringMatches = value => {
-    const entryMatches = entry => (stringSimilarity(value, entry.value) >= threshold)
-    return _.filter(entryMatches, structs.wordList)
-  }
-  const getSingleEntries = wordEntry => {
-    const makeEntry = aliasId => ({value: wordEntry.value, aliasId})
-    return _.map(makeEntry, wordEntry.aliasIds)
-  }
-  const stringMatches = _.flow(
-    _.flatMap(getStringMatches),
-    _.flatMap(getSingleEntries)
-  )(wordValues)
+  const aliasIds = []
+  const phoneticWeight = 0.17
+  const stringWeight = 1 - phoneticWeight

-  // At least two name-parts must match per alias
-  const adequateMatch = ([aliasId, count]) => {
-    const alias = structs.aliasesMap.get(aliasId)
-    return count >= Math.min(2, alias.words.length)
+  for (const word of words) {
+    const getPhonetic = phonetic => structs.phoneticMap.get(phonetic)
+    const phoneticMatches = new Set(_.flatMap(getPhonetic, word.phonetics))
+
+    for (const wordEntry of structs.wordList) {
+      const stringScore = stringSimilarity(word.value, wordEntry.value)
+
+      if (stringWeight * stringScore + phoneticWeight < threshold) continue
+
+      for (const aliasId of wordEntry.aliasIds) {
+        const phoneticScore = phoneticMatches.has(aliasId) ? 1 : 0
+        const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
+
+        if (finalScore >= threshold) {
+          aliasIds.push(aliasId)
+        }
+      }
+    }
  }
+
  const aliasIdsFromNamePart = _.flow(
-    _.uniqWith((a, b) => (
-      (a.value === b.value && a.aliasId === b.aliasId) ||
-      (a.word === b.word && a.aliasId === b.aliasId)
-    )),
-    _.map(_.get('aliasId')),
    _.countBy(_.identity),
    _.toPairs,
-    _.filter(adequateMatch),
+    _.reject(_.flow(
+      _.last,
+      _.gt(2)
+    )),
    _.map(_.first)
-  )([...phoneticMatches, ...stringMatches])
+  )(aliasIds)

  // debug_log(aliasIdsFromFullName)
-  // debug_log(phoneticMatches)
-  // debug_log(stringMatches)
  // debug_log(aliasIdsFromNamePart)

  // Get the full record for each matched id