Merge pull request #3 from hardcoretech/feat/GFG-10793
Feat/gfg 10793
danielhardcore authored Jul 5, 2021
2 parents 4333cf5 + c80c370 commit 22a4609
Showing 13 changed files with 477 additions and 5 deletions.
3 changes: 2 additions & 1 deletion app/models/screening_list/consolidated.rb
@@ -13,7 +13,8 @@ class Consolidated
     ScreeningList::Plc,
     ScreeningList::Sdn,
     ScreeningList::Ssi,
-    ScreeningList::Uvl,]
+    ScreeningList::Uvl,
+    ScreeningList::Mbs,]
     self.fetch_all_sort_by = "name.keyword"

     include SeparatedValuesable
12 changes: 12 additions & 0 deletions app/models/screening_list/mbs.rb
@@ -0,0 +1,12 @@
# frozen_string_literal: true

module ScreeningList
  class Mbs
    include Indexable
    include ScreeningList::Mappable
    self.source = {
      full_name: "Non-SDN Menu-Based Sanctions List (NS-MBS List) - Treasury Department",
      code: "MBS",
    }
  end
end
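
Aside: the full_name registered here is the same raw source string that the new Python importer (added below in docker/csl-python/import_source.py) matches on when it pulls MBS entries out of the consolidated feed. A minimal sketch of that correspondence, with the source string copied from the model above:

    # Sketch only: NSMBSImporter.filter_source (defined in import_source.py
    # below) keeps an entry when its raw source string contains '(NS-MBS List)'.
    source = 'Non-SDN Menu-Based Sanctions List (NS-MBS List) - Treasury Department'
    assert '(NS-MBS List)' in source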
6 changes: 6 additions & 0 deletions app/queries/screening_list/mbs_query.rb
@@ -0,0 +1,6 @@
# frozen_string_literal: true

module ScreeningList
  class MbsQuery < ScreeningList::Query
  end
end
23 changes: 23 additions & 0 deletions app/views/consolidated/mbs/_entry.json.jbuilder
@@ -0,0 +1,23 @@
# frozen_string_literal: true

json.partial! "consolidated/addresses",
              addresses: entry[:_source][:addresses]
json.call(entry[:_source],
          :alt_names,
          :citizenships,
          :dates_of_birth,
          :entity_number,
          :ids,
          :name,
          :nationalities,
          :places_of_birth,
          :programs,
          :remarks,
         )
json.source entry[:_source][:source][:full_name]
json.call(entry[:_source],
          :source_information_url,
          :source_list_url,
          :title,
          :type,
         )
9 changes: 9 additions & 0 deletions app/views/consolidated/mbs/search.json.jbuilder
@@ -0,0 +1,9 @@
# frozen_string_literal: true

json.partial! "shared/queryinfo"
json.results do
  json.array! @search[:hits] do |hit|
    entry = hit.deep_symbolize_keys
    json.partial! "entry", entry: entry
  end
end
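
Taken together with _entry.json.jbuilder above, a successful search response carries the query info plus a results array whose entries expose the whitelisted fields. A hedged sketch of exercising the new endpoint from Python, assuming the MBS route mirrors the other screening lists and the app listens locally on port 3000 (the path and port are assumptions, not part of this diff):

    import requests

    # Hypothetical URL: adjust host, port, and path to the app's actual routes.
    resp = requests.get('http://localhost:3000/mbs/search.json',
                        params={'name': 'example'})
    for result in resp.json()['results']:
        # Fields emitted per entry by _entry.json.jbuilder above.
        print(result['name'], result['programs'], result['source'])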
16 changes: 16 additions & 0 deletions docker/csl-python/Dockerfile
@@ -0,0 +1,16 @@
FROM python:3.6-slim-buster

WORKDIR /csl-python
COPY . /csl-python/

RUN pip install --no-cache-dir -r requirements.txt

ENV ELASTICSEARCH_HOST="elastic"
ENV ELASTICSEARCH_PORT=9200

COPY ./entrypoint.sh /usr/bin/
RUN chmod +x /usr/bin/entrypoint.sh
COPY ./wait-for-it.sh /usr/bin/
RUN chmod +x /usr/bin/wait-for-it.sh

CMD ["/usr/bin/entrypoint.sh"]
6 changes: 6 additions & 0 deletions docker/csl-python/entrypoint.sh
@@ -0,0 +1,6 @@
#!/bin/sh

/usr/bin/wait-for-it.sh elastic:9200 -t 30 -- echo "Elasticsearch server is ready"

echo "********** Start CSL-python import script **********"
python import_source.py
201 changes: 201 additions & 0 deletions docker/csl-python/import_source.py
@@ -0,0 +1,201 @@
import logging
import os
import re
import time
import traceback
from logging.handlers import WatchedFileHandler

import requests
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import ConnectionError


STOPWORDS = {'and', 'the', 'los'}
COMMON_WORDS = {
    'co', 'company', 'corp', 'corporation', 'inc', 'incorporated', 'limited', 'ltd', 'mr', 'mrs', 'ms',
    'organization', 'sa', 'sas', 'llc', 'university', 'univ'
}


def make_names(doc):
    doc['name_idx'] = filter_alnum_and_space(doc['name'])
    doc['name_idx'] = remove_words(doc['name_idx'], STOPWORDS)

    if has_any_common_words(doc['name_idx']):
        make_names_with_common(doc, 'name')

    doc['name_rev'] = name_rev(doc['name'])
    doc['name_no_ws'] = doc['name_idx'].replace(' ', '')
    doc['name_no_ws_rev'] = doc['name_rev'].replace(' ', '')

    if doc['alt_names']:
        make_alt_names(doc)

    return doc


def make_alt_names(doc):
    doc['alt_idx'] = [filter_alnum_and_space(n) for n in doc['alt_names']]
    doc['alt_idx'] = [remove_words(n, STOPWORDS) for n in doc['alt_idx']]

    if has_any_common_words(' '.join(doc['alt_idx'])):
        make_alt_names_with_common(doc)

    doc['alt_rev'] = [name_rev(n) for n in doc['alt_idx']]
    doc['alt_no_ws'] = [n.replace(' ', '') for n in doc['alt_idx']]
    doc['alt_no_ws_rev'] = [n.replace(' ', '') for n in doc['alt_rev']]


def filter_alnum_and_space(name):
    return re.sub(r'[^a-zA-Z0-9 ]', '', name)


def remove_words(name, words):
    return ' '.join([n for n in name.split() if n.lower() not in words])


def has_any_common_words(name):
    names = set(name.lower().split(' '))
    return len(names.intersection(COMMON_WORDS)) > 0


def make_names_with_common(doc, prefix):
    doc[f'{prefix}_no_ws_with_common'] = doc[f'{prefix}_idx'].replace(' ', '')
    doc[f'{prefix}_no_ws_rev_with_common'] = name_rev(doc[f'{prefix}_idx']).replace(' ', '')
    doc[f'{prefix}_idx'] = remove_words(doc[f'{prefix}_idx'], COMMON_WORDS)


def make_alt_names_with_common(doc):
    doc['alt_no_ws_with_common'] = [n.replace(' ', '') for n in doc['alt_idx']]
    doc['alt_no_ws_rev_with_common'] = [name_rev(n).replace(' ', '') for n in doc['alt_idx']]  # reverse first, then strip spaces (matches make_names_with_common)
    doc['alt_idx'] = [remove_words(n, COMMON_WORDS) for n in doc['alt_idx']]


def name_rev(name):
    names = name.split(' ')
    names.reverse()
    return ' '.join(names)


def make_full_addresses(doc):
    if doc.get('addresses'):
        for addr in doc.get('addresses'):
            addr_info = [addr[k] for k in ['address', 'city', 'country', 'postal_code', 'state'] if addr[k]]
            addr['full_address'] = ', '.join(addr_info) if addr_info else None
    return doc


def make_source_object(name, doc):
    """
    "source": {
        "code": "ISN",
        "full_name": "Nonproliferation Sanctions (ISN) - State Department",
    }
    """
    doc['source'] = {
        'code': name,
        'full_name': doc['source'],
    }

    return doc


def get_json_data(url):
    r = requests.get(url)
    return r.json()


logger = logging.getLogger('csl')
logger.setLevel(logging.INFO)

stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
logger.addHandler(stream_handler)

file_handler = WatchedFileHandler('/var/log/csl.log')
file_handler.setLevel(logging.INFO)
logger.addHandler(file_handler)


class BaseImporter:
    ES_INDEX_NAME = None
    SOURCE_NAME = None

    def __init__(self, es) -> None:
        self._es = es

        if not self.SOURCE_NAME:
            raise NotImplementedError('no SOURCE_NAME')

    def do_import(self, data):
        _count = 0
        docs = self.get_docs(data)

        for doc in docs:
            doc_id = doc.pop('id')
            doc = make_names(doc)
            doc = make_full_addresses(doc)
            doc = make_source_object(self.SOURCE_NAME, doc)

            self._es.index(index=self.ES_INDEX_NAME, body=doc, id=doc_id)
            _count += 1

        self._es.indices.refresh(index=self.ES_INDEX_NAME)
        logger.info(f'Finish import {self.SOURCE_NAME}: {_count}')

    def get_docs(self, data):
        docs = filter(self.filter_source, data.get('results', []))
        return docs

    def filter_source(self, doc) -> bool:
        raise NotImplementedError


class ISNImporter(BaseImporter):
    ES_INDEX_NAME = 'isn'
    SOURCE_NAME = 'ISN'

    def filter_source(self, doc) -> bool:
        return f'({self.SOURCE_NAME})' in doc['source']


class NSMBSImporter(BaseImporter):
    ES_INDEX_NAME = 'mbs'
    SOURCE_NAME = 'MBS'

    def filter_source(self, doc) -> bool:
        return f'(NS-{self.SOURCE_NAME} List)' in doc['source']


SOURCE_IMPORTER_CLASSES = [ISNImporter, NSMBSImporter]


def is_isn_source(doc) -> bool:
    return '(ISN)' in doc['source']


if __name__ == '__main__':
    hosts = os.getenv('ELASTICSEARCH_HOST', 'localhost')
    port = os.environ.get('ELASTICSEARCH_PORT', 9200)
    es = Elasticsearch(hosts=hosts, port=port)

    source_importers = [importer_cls(es) for importer_cls in SOURCE_IMPORTER_CLASSES]

    while True:
        logger.info('Start import CSL source')
        try:
            if not es.ping():
                raise ConnectionError

            json_data = get_json_data('https://api.trade.gov/static/consolidated_screening_list/consolidated.json')

            for importer in source_importers:
                importer.do_import(json_data)

            logger.info('Finish import CSL source')
        except ConnectionError:
            logger.error('Connect ES server failed')
        except Exception as e:
            logger.error(f'Import CSL source failed: {e}, {traceback.format_exc()}')

        time.sleep(1800)
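
To make the normalization pipeline above concrete, here is a small usage sketch that feeds a made-up CSL entry through make_names, make_full_addresses, and make_source_object (the entry values are invented for illustration; the functions and expected outputs follow the definitions above):

    # Example entry shaped like one item of the feed's 'results' (values invented).
    doc = {
        'id': 'example-1',
        'name': 'The Example Trading Company',
        'alt_names': [],
        'source': 'Non-SDN Menu-Based Sanctions List (NS-MBS List) - Treasury Department',
        'addresses': [{'address': '1 Main St', 'city': 'Springfield', 'country': 'US',
                       'postal_code': None, 'state': None}],
    }

    doc = make_names(doc)            # 'The' is a stopword; 'Company' is a common word
    doc = make_full_addresses(doc)
    doc = make_source_object('MBS', doc)

    print(doc['name_idx'])                      # Example Trading
    print(doc['name_no_ws'])                    # ExampleTrading
    print(doc['name_rev'])                      # Company Trading Example The
    print(doc['name_no_ws_with_common'])        # ExampleTradingCompany
    print(doc['addresses'][0]['full_address'])  # 1 Main St, Springfield, US
    print(doc['source'])                        # {'code': 'MBS', 'full_name': 'Non-SDN ...'}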
6 changes: 6 additions & 0 deletions docker/csl-python/requirements.txt
@@ -0,0 +1,6 @@
certifi==2021.5.30
chardet==4.0.0
elasticsearch==7.13.1
idna==2.10
requests==2.25.1
urllib3==1.26.5