chore: use monorepo organization

Rafael Taranto 2025-05-12 10:52:54 +01:00
parent deaf7d6ecc
commit a687827f7e
1099 changed files with 8184 additions and 11535 deletions

@@ -0,0 +1,73 @@
const { readdir } = require('fs/promises')
const path = require('path')
const loader = require('./loading')
const matcher = require('./matching')
const nameUtils = require('./name-utils')
const _ = require('lodash/fp')
const logger = require('../logger')
const OFAC_DATA_DIR = process.env.OFAC_DATA_DIR
let structs = null
function load () {
if (!OFAC_DATA_DIR) {
const message = 'The ofacDataDir option has not been set in the environment'
return Promise.reject(new Error(message))
}
const ofacSourcesDir = path.join(OFAC_DATA_DIR, 'sources')
return readdir(ofacSourcesDir)
.then(_.flow(
_.map(file => path.join(ofacSourcesDir, file)),
loader.load
))
.then(result => {
return (structs = result)
})
}
// nameParts should be an object like {firstName: "John", lastName: "Doe", ...}
function makeCompatible (nameParts) {
const partNames = _.keys(nameParts)
const values = _.map(_.lowerCase, _.values(nameParts))
const props = _.zipAll([partNames, values])
return _.map(_.zipObject(['partName', 'value']), props)
}
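// For illustration:
//   makeCompatible({firstName: 'John', lastName: 'Doe'})
//   // => [{partName: 'firstName', value: 'john'},
//   //     {partName: 'lastName', value: 'doe'}]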
function match (nameParts, birthDateString, options) {
if (!structs) {
logger.error(new Error('The OFAC data sources have not been loaded yet.'))
return false
}
// Prepare the input data
const parts = makeCompatible(nameParts)
const fullName = nameUtils.makeFullName(parts)
const words = nameUtils.makeWords(fullName)
// birthDateString is in YYYYMMDD format; parse it when present, otherwise leave birthDate null
const birthDate = _.cond([
[_.identity, () => {
const year = parseInt(birthDateString.slice(0, 4))
const month = parseInt(birthDateString.slice(4, 6))
const day = parseInt(birthDateString.slice(6, 8))
const date = new Date(year, month - 1, day)
return {year, month, day, date}
}],
[_.stubTrue, () => null]
])(birthDateString)
const candidate = {parts, fullName, words, birthDate}
const result = matcher.match(structs, candidate, options)
return result
}
function getStructs () {
return structs
}
module.exports = {load, match, getStructs}
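// Hypothetical usage sketch (the threshold values are illustrative only):
//   load()
//     .then(() => {
//       const suspects = match(
//         {firstName: 'John', lastName: 'Doe'},
//         '19800101',
//         {threshold: 0.85, fullNameThreshold: 0.95, ratio: 0.5}
//       )
//       console.log(suspects) // matched OFAC individuals; [] when clean
//     })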

@@ -0,0 +1,109 @@
const fs = require('fs')
const ndjson = require('ndjson')
const _ = require('lodash/fp')
const mapAliases = _.curry((iteratee, individuals) => {
const mapIndividual = individual => {
const {id, aliases} = individual
return _.map(alias => iteratee(id, alias), aliases)
}
return _.flatMap(mapIndividual, individuals)
})
const getPhoneticEntries = (individualId, alias) => {
const pairPhoneticsWithValues = word => {
const {value, phonetics} = word
const makeEntry = phonetic => ({value, phonetic, aliasId: alias.id})
return _.map(makeEntry, phonetics)
}
return _.flatMap(pairPhoneticsWithValues, alias.words)
}
const producePhoneticMap = _.flow(
mapAliases(getPhoneticEntries),
_.flatten,
_.groupBy(_.get('phonetic')),
_.mapValues(_.flow(
_.map(_.get('aliasId')),
_.uniq
)),
_.toPairs,
entries => new Map(entries)
)
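// The resulting Map is keyed by phonetic code and holds unique alias ids,
// e.g. (illustrative): 'JN' => ['5931', '8452']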
const getWords = (individualId, alias) => {
const pairWordsWithIds = word => ({value: word.value, aliasId: alias.id})
return _.map(pairWordsWithIds, alias.words)
}
const produceWordList = _.flow(
mapAliases(getWords),
_.flatten,
_.groupBy(_.get('value')),
_.mapValues(_.map(_.get('aliasId'))),
_.toPairs,
_.map(_.zipObject(['value', 'aliasIds']))
)
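// Yields one entry per distinct word across all aliases, e.g.
// (illustrative): {value: 'john', aliasIds: ['5931', '8452']}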
const parseSource = source => {
const individuals = []
const readStream = fs.createReadStream(source)
const jsonStream = readStream.pipe(ndjson.parse())
jsonStream.on('data', individual => {
_.forEach(period => {
_.forEach(date => {
const {year, month, day} = date
date.date = new Date(year, month - 1, day)
}, [period.start, period.end])
}, individual.birthDatePeriods)
individuals.push(individual)
})
return new Promise((resolve, reject) => {
jsonStream.on('error', reject)
jsonStream.on('end', () => {
resolve(individuals)
})
})
}
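// Each ndjson line is one JSON-encoded individual; an abbreviated,
// illustrative record:
//   {"id":"9346","aliases":[{"id":"5931","fullName":"john doe", ...}],
//    "birthDatePeriods":[{"start":{"year":1980,"month":1,"day":1}, ...}]}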
const load = sources => Promise.all(_.map(parseSource, sources))
.then(_.flow(
_.flatten,
_.compact,
_.uniqBy(_.get('id')),
individuals => {
const individualsMap = _.flow(
_.groupBy(_.get('id')),
_.mapValues(_.first),
_.toPairs,
entries => new Map(entries)
)(individuals)
const makeEntries = (individualId, alias) => [alias.id, alias]
const aliasesMap = new Map(mapAliases(makeEntries, individuals))
const getIdPairs = (individualId, alias) => [alias.id, individualId]
const idPairs = mapAliases(getIdPairs, individuals)
const aliasToIndividual = new Map(idPairs)
const phoneticMap = producePhoneticMap(individuals)
const wordList = produceWordList(individuals)
return {
individuals,
individualsMap,
aliasesMap,
aliasToIndividual,
phoneticMap,
wordList
}
}
))
module.exports = {load}

@@ -0,0 +1,108 @@
const jaro = require('talisman/metrics/distance/jaro')
const _ = require('lodash/fp')
const logger = require('../logger')
const stringSimilarity = _.curry(jaro)
// birth date
function isDateWithinSomeDaysOfPeriod (period, date, days) {
const inMillisecs = 24 * 60 * 60 * 1000
const startTime = period.start.date.getTime() - days * inMillisecs
const startDate = new Date(startTime)
const endTime = period.end.date.getTime() + days * inMillisecs
const endDate = new Date(endTime)
return (startDate < date && date < endDate)
}
const isBornTooLongSince = _.curry((days, dateObject, individual) => {
if (!dateObject) return false
if (_.isEmpty(individual.birthDatePeriods)) return false
const isWithinSomeYears = _.partialRight(isDateWithinSomeDaysOfPeriod, [dateObject.date, days])
return !_.some(isWithinSomeYears, individual.birthDatePeriods)
})
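// e.g. isBornTooLongSince(730, {date: new Date(1980, 0, 1)}, individual)
// is true only when 1980-01-01 falls more than 730 days outside every
// one of the individual's birth-date periods.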
// algorithm
function match (structs, candidate, options) {
const {threshold, fullNameThreshold, ratio = 0.5, verboseFor} = options
const {fullName, words, birthDate} = candidate
// Accept aliases whose full name matches closely enough.
const doesNameMatch = _.flow(
_.get('fullName'),
stringSimilarity(fullName),
_.lte(fullNameThreshold)
)
const aliases = _.flatMap(_.get('aliases'), structs.individuals)
const aliasIdsFromFullName = _.flow(
_.filter(doesNameMatch),
_.map(_.get('id'))
)(aliases)
const phoneticWeight = ratio
const stringWeight = 1 - phoneticWeight
const matches = []
for (const word of words) {
const getPhonetic = phonetic => structs.phoneticMap.get(phonetic)
const phoneticMatches = new Set(_.flatMap(getPhonetic, word.phonetics))
for (const wordEntry of structs.wordList) {
const stringScore = stringSimilarity(word.value, wordEntry.value)
const verbose = _.includes(wordEntry.value, verboseFor)
for (const aliasId of wordEntry.aliasIds) {
const phoneticScore = phoneticMatches.has(aliasId) ? 1 : -1
const finalScore = stringWeight * stringScore + phoneticWeight * phoneticScore
verbose && logger.debug(finalScore.toFixed(2), stringScore.toFixed(2), phoneticScore.toFixed(2), word.value, wordEntry.value)
if (finalScore >= threshold) {
const entry = {aliasId, score: finalScore, word: word.value, value: wordEntry.value}
const index = _.sortedIndexBy(x => -x.score, entry, matches)
matches.splice(index, 0, entry)
}
}
}
}
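// Worked example with ratio = 0.5 (equal string/phonetic weighting):
// a word pair with Jaro similarity 0.90 that also shares a phonetic code
// scores 0.5 * 0.90 + 0.5 * 1 = 0.95; without the phonetic match it
// scores 0.5 * 0.90 + 0.5 * (-1) = -0.05, below any useful threshold.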
const sameWord = (a, b) => a.aliasId === b.aliasId && a.word === b.word
const sameValue = (a, b) => a.aliasId === b.aliasId && a.value === b.value
const aliasIdsFromNamePart = _.flow(
_.uniqWith(sameWord),
_.uniqWith(sameValue),
_.map(_.get('aliasId')),
_.countBy(_.identity),
_.toPairs,
_.filter(([aliasId, count]) => {
const {length} = structs.aliasesMap.get(aliasId).words
return (count >= _.min([2, words.length, length]))
}),
_.map(_.first)
)(matches)
// Get the full record for each matched id
const getIndividual = aliasId => {
const individualId = structs.aliasToIndividual.get(aliasId)
return structs.individualsMap.get(individualId)
}
const suspects = _.uniq(_.map(getIndividual, [
...aliasIdsFromFullName,
...aliasIdsFromNamePart
]))
// Reject suspects born more than two years outside every recorded birth-date period.
const twoYears = 365 * 2
const unqualified = isBornTooLongSince(twoYears, birthDate)
return _.reject(unqualified, suspects)
}
module.exports = {match}

@@ -0,0 +1,31 @@
const doubleMetaphone = require('talisman/phonetics/double-metaphone')
const _ = require('lodash/fp')
const makePhonetic = _.flow(doubleMetaphone, _.uniq)
// Combine name-parts in a standard order.
const partOrdering = ['firstName', 'middleName', 'maidenName', 'patronymic', 'matronymic', 'lastName']
const usingPartOrder = _.flow(
_.get('partName'),
_.partialRight(_.indexOf, [partOrdering])
)
const makeFullName = _.flow(
_.sortBy(usingPartOrder),
_.map(_.get('value')),
_.join(' ')
)
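// e.g. makeFullName([
//   {partName: 'lastName', value: 'doe'},
//   {partName: 'firstName', value: 'john'}
// ]) // => 'john doe'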
const makeWords = value => {
const words = _.split(' ', value)
const phonetics = _.map(makePhonetic, words)
const pairs = _.zipAll([words, phonetics])
return _.map(_.zipObject(['value', 'phonetics']), pairs)
}
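// Illustrative output (the double-metaphone codes are indicative only):
//   makeWords('john doe')
//   // => [{value: 'john', phonetics: ['JN', 'AN']},
//   //     {value: 'doe', phonetics: ['T']}]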
module.exports = {
makeFullName,
makeWords
}

@@ -0,0 +1,161 @@
const fs = require('fs')
const XmlStream = require('xml-stream')
const nameUtils = require('./name-utils')
const logger = require('../logger')
const _ = require('lodash/fp')
// KOSTIS TODO: get these from the document itself
const INDIVIDUAL = '4'
const NAME = '1403'
const BIRTH_DATE = '8'
const PRIMARY_LATIN = '1'
const LAST_NAME = '1520'
const FIRST_NAME = '1521'
const MIDDLE_NAME = '1522'
const MAIDEN_NAME = '1523'
const PATRONYMIC = '91708'
const MATRONYMIC = '91709'
const NICKNAME = '1528'
const partNames = new Map([
[LAST_NAME, 'lastName'],
[FIRST_NAME, 'firstName'],
[MIDDLE_NAME, 'middleName'],
[MAIDEN_NAME, 'maidenName'],
[PATRONYMIC, 'patronymic'],
[MATRONYMIC, 'matronymic'],
[NICKNAME, 'nickname']
])
const filteredWords = [
// 'al'
]
// group-id to type-id
function processMasterNamePartGroup (groupNode) {
const namePartGroupNode = groupNode.NamePartGroup
const groupId = namePartGroupNode.$.ID
const typeId = namePartGroupNode.$.NamePartTypeID
return [groupId, typeId]
}
const processDocumentedNamePart = _.curry((groupTypes, namePartNode) => {
const valueNode = namePartNode.NamePartValue
const groupId = valueNode.$.NamePartGroupID
const typeId = groupTypes.get(groupId)
const partName = partNames.get(typeId)
const value = _.lowerCase(valueNode.$text)
return {partName, value}
})
const isLatin = _.matchesProperty(['$', 'DocNameStatusID'], PRIMARY_LATIN)
const processAlias = _.curry((groupTypes, aliasNode) => {
if (aliasNode.$.AliasTypeID !== NAME) return
if (aliasNode.$.LowQuality === 'true') return
const getNamePart = processDocumentedNamePart(groupTypes)
const latinNameNode = _.find(isLatin, aliasNode.DocumentedName)
if (!latinNameNode) {
const id = aliasNode.$.FixedRef
const message = `Alias for Person with ID="${id}" has no latinized name`
logger.error(message)
return
}
const id = latinNameNode.$.ID
const namePartNodes = latinNameNode.DocumentedNamePart
const parts = _.map(getNamePart, namePartNodes)
const fullName = nameUtils.makeFullName(parts)
const words = _.flow(
nameUtils.makeWords,
_.reject(_.flow(
_.get('value'),
word => filteredWords.includes(word)
))
)(fullName)
// if (words.length < 2) {
// console.log(JSON.stringify(words))
// }
return {id, parts, fullName, words}
})
// birth date
function processDate (dateNode) {
const year = parseInt(dateNode.Year)
const month = parseInt(dateNode.Month)
const day = parseInt(dateNode.Day)
return {year, month, day}
}
function processFeature (featureNode) {
if (featureNode.$.FeatureTypeID !== BIRTH_DATE) return
const datePeriodNode = featureNode.FeatureVersion.DatePeriod
// Both Start and End can themselves be ranges; using Start.From and
// End.To takes the extremes of the whole date period.
const period = {
start: datePeriodNode.Start.From,
end: datePeriodNode.End.To
}
return _.mapValues(processDate, period)
}
// profile
function processProfile (profileNode) {
if (profileNode.$.PartySubTypeID !== INDIVIDUAL) return
const id = profileNode.$.ID
const identityNode = profileNode.Identity
const groupTypesEntries = _.map(processMasterNamePartGroup, identityNode.NamePartGroups.MasterNamePartGroup)
const groupTypes = new Map(groupTypesEntries)
const mapCompact = _.flow(_.map, _.compact)
const getNameParts = processAlias(groupTypes)
const aliases = mapCompact(getNameParts, identityNode.Alias)
if (_.isEmpty(aliases)) return
const birthDatePeriods = mapCompact(processFeature, profileNode.Feature)
const individual = {id, aliases, birthDatePeriods}
return individual
}
const parse = (source, callback) => {
const stream = fs.createReadStream(source)
const xml = new XmlStream(stream)
xml.on('error', err => {
xml.pause()
const message = `Error while parsing OFAC data source file (${source}): ${err.message}`
callback(new Error(message))
})
xml.collect('Alias')
xml.collect('DocumentedName')
xml.collect('DocumentedNamePart')
xml.collect('Feature')
xml.collect('MasterNamePartGroup')
const forwardProfile = profile => profile && callback(null, profile)
xml.on('updateElement: Profile', _.flow(processProfile, forwardProfile))
xml.on('end', () => {
callback(null, null)
})
}
module.exports = {parse}
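// Hypothetical usage sketch:
//   parse('/tmp/sdn_advanced.xml', (err, individual) => {
//     if (err) throw err
//     if (individual === null) return // a null profile signals end-of-document
//     console.log(individual.id, individual.aliases.length)
//   })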

@@ -0,0 +1,135 @@
const parser = require('./parsing')
const axios = require('axios')
const { createWriteStream } = require('fs')
const { rename, writeFile, readFile, mkdir, copyFile, unlink } = require('fs/promises')
const path = require('path')
const _ = require('lodash/fp')
const DOWNLOAD_DIR = path.resolve('/tmp')
const OFAC_DATA_DIR = process.env.OFAC_DATA_DIR
// Fall back to '' so the module can be required without OFAC_DATA_DIR set;
// update() still refuses to run until it is defined.
const OFAC_SOURCES_DIR = path.join(OFAC_DATA_DIR || '', 'sources')
const LAST_UPDATED_FILE = path.resolve(OFAC_DATA_DIR || '', 'last_updated.dat')
const OFAC_SOURCES = [{
name: 'sdn_advanced',
url: 'https://sanctionslistservice.ofac.treas.gov/api/download/sdn_advanced.xml'
}, {
name: 'cons_advanced',
url: 'https://sanctionslistservice.ofac.treas.gov/api/download/cons_advanced.xml'
}]
const _mkdir = path =>
mkdir(path)
.catch(err => err.code === 'EEXIST' ? Promise.resolve() : Promise.reject(err))
const download = (dstDir, { name, url }) => {
const dstFile = path.join(dstDir, name + '.xml')
const writer = createWriteStream(dstFile)
return axios({
method: 'get',
url: url,
responseType: 'stream',
}).then(response => {
return new Promise((resolve, reject) => {
response.data.pipe(writer)
let error = null
writer.on('error', err => {
error = err
writer.close()
reject(err)
})
writer.on('close', () => {
if (!error) {
resolve(dstFile)
}
})
})
})
}
const parseToJson = srcFile => {
const dstFile = srcFile.replace(/\.xml$/, '.json')
const writeStream = createWriteStream(dstFile)
return new Promise((resolve, reject) => {
parser.parse(srcFile, (err, profile) => {
if (err) {
reject(err)
return
}
if (!profile) {
writeStream.end()
return
}
const json = JSON.stringify(profile)
writeStream.write(json + '\n', 'utf-8')
})
writeStream.on('error', reject)
writeStream.on('finish', () => resolve(dstFile))
})
}
const moveToSourcesDir = async (srcFile, ofacSourcesDir) => {
const name = path.basename(srcFile)
const dstFile = path.join(ofacSourcesDir, name)
try {
await rename(srcFile, dstFile)
} catch (err) {
if (err.code === 'EXDEV') {
// If rename fails due to a cross-device link, fall back to copy + delete
await copyFile(srcFile, dstFile)
await unlink(srcFile)
} else {
throw err
}
}
return dstFile
}
function update () {
if (!OFAC_DATA_DIR) {
throw new Error('ofacDataDir must be defined in the environment')
}
return _mkdir(OFAC_DATA_DIR)
.then(() => _mkdir(OFAC_SOURCES_DIR))
.catch(err => {
if (err.code === 'EEXIST') return
throw err
})
.then(() => readFile(LAST_UPDATED_FILE))
.then(data => {
const lastUpdate = new Date(data.toString())
const now = new Date()
const hoursSinceUpdate = (now - lastUpdate) / (1000 * 60 * 60)
return hoursSinceUpdate < 24
})
.catch(err => {
// If the timestamp file doesn't exist, continue with the update
if (err.code === 'ENOENT') return false
throw err
})
.then(skipUpdate => {
if (skipUpdate) return Promise.resolve()
const downloads = _.flow(
_.map(file => download(DOWNLOAD_DIR, file).then(parseToJson))
)(OFAC_SOURCES)
return Promise.all(downloads)
.then(parsed => {
const moves = _.map(src => moveToSourcesDir(src, OFAC_SOURCES_DIR), parsed)
const timestamp = new Date().toISOString()
return Promise.all([...moves])
.then(() => writeFile(LAST_UPDATED_FILE, timestamp))
})
})
}
module.exports = { update }
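// Hypothetical usage sketch (assumes OFAC_DATA_DIR is set):
//   require('./update').update()
//     .then(() => console.log('OFAC source files are up to date'))
//     .catch(err => console.error('OFAC update failed:', err))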