From deff8251c46e6bbf58311fab12b58f37ef9a956a Mon Sep 17 00:00:00 2001 From: daniel Date: Thu, 1 Jul 2021 17:07:54 +0800 Subject: [PATCH 1/4] feat(csl): add mbs source GFG-10793 --- app/models/screening_list/consolidated.rb | 3 ++- app/models/screening_list/mbs.rb | 12 ++++++++++ app/queries/screening_list/mbs_query.rb | 6 +++++ .../consolidated/mbs/_entry.json.jbuilder | 23 +++++++++++++++++++ .../consolidated/mbs/search.json.jbuilder | 9 ++++++++ 5 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 app/models/screening_list/mbs.rb create mode 100644 app/queries/screening_list/mbs_query.rb create mode 100644 app/views/consolidated/mbs/_entry.json.jbuilder create mode 100644 app/views/consolidated/mbs/search.json.jbuilder diff --git a/app/models/screening_list/consolidated.rb b/app/models/screening_list/consolidated.rb index c30daf7..c5995ff 100644 --- a/app/models/screening_list/consolidated.rb +++ b/app/models/screening_list/consolidated.rb @@ -13,7 +13,8 @@ class Consolidated ScreeningList::Plc, ScreeningList::Sdn, ScreeningList::Ssi, - ScreeningList::Uvl,] + ScreeningList::Uvl, + ScreeningList::Mbs,] self.fetch_all_sort_by = "name.keyword" include SeparatedValuesable diff --git a/app/models/screening_list/mbs.rb b/app/models/screening_list/mbs.rb new file mode 100644 index 0000000..e6b3cc2 --- /dev/null +++ b/app/models/screening_list/mbs.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module ScreeningList + class Mbs + include Indexable + include ScreeningList::Mappable + self.source = { + full_name: "Non-SDN Menu-Based Sanctions List (NS-MBS List) - Treasury Department", + code: "MBS", + } + end +end diff --git a/app/queries/screening_list/mbs_query.rb b/app/queries/screening_list/mbs_query.rb new file mode 100644 index 0000000..457a73b --- /dev/null +++ b/app/queries/screening_list/mbs_query.rb @@ -0,0 +1,6 @@ +# frozen_string_literal: true + +module ScreeningList + class MbsQuery < ScreeningList::Query + end +end diff --git 
a/app/views/consolidated/mbs/_entry.json.jbuilder b/app/views/consolidated/mbs/_entry.json.jbuilder new file mode 100644 index 0000000..70582f1 --- /dev/null +++ b/app/views/consolidated/mbs/_entry.json.jbuilder @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +json.partial! "consolidated/addresses", + addresses: entry[:_source][:addresses] +json.call(entry[:_source], + :alt_names, + :citizenships, + :dates_of_birth, + :entity_number, + :ids, + :name, + :nationalities, + :places_of_birth, + :programs, + :remarks, + ) +json.source entry[:_source][:source][:full_name] +json.call(entry[:_source], + :source_information_url, + :source_list_url, + :title, + :type, + ) diff --git a/app/views/consolidated/mbs/search.json.jbuilder b/app/views/consolidated/mbs/search.json.jbuilder new file mode 100644 index 0000000..ddbd895 --- /dev/null +++ b/app/views/consolidated/mbs/search.json.jbuilder @@ -0,0 +1,9 @@ +# frozen_string_literal: true + +json.partial! "shared/queryinfo" +json.results do + json.array! @search[:hits] do |hit| + entry = hit.deep_symbolize_keys + json.partial! 
"entry", entry: entry
+  end
+end

From 91d7f96034f8990ae86f6fe2294cf00b543e23da Mon Sep 17 00:00:00 2001
From: daniel
Date: Thu, 1 Jul 2021 17:09:12 +0800
Subject: [PATCH 2/4] feat(csl): add python script for import isn and mbs source GFG-10793 GFG-10722

---
 docker/csl-python/import_source.py | 249 +++++++++++++++++++++++++++++
 1 file changed, 249 insertions(+)
 create mode 100644 docker/csl-python/import_source.py

diff --git a/docker/csl-python/import_source.py b/docker/csl-python/import_source.py
new file mode 100644
index 0000000..c2220a8
--- /dev/null
+++ b/docker/csl-python/import_source.py
@@ -0,0 +1,249 @@
+import logging
+import os
+import re
+import time
+import traceback
+from logging.handlers import WatchedFileHandler
+
+import requests
+from elasticsearch import Elasticsearch
+from elasticsearch.exceptions import ConnectionError
+
+
+# Words removed from ``name_idx`` / ``alt_idx`` before anything else is derived.
+STOPWORDS = {'and', 'the', 'los'}
+# Tokens checked by has_any_common_words() and stripped by the *_with_common helpers.
+COMMON_WORDS = {
+    'co', 'company', 'corp', 'corporation', 'inc', 'incorporated', 'limited', 'ltd', 'mr', 'mrs', 'ms',
+    'organization', 'sa', 'sas', 'llc', 'university', 'univ'
+}
+
+
+def make_names(doc):
+    """Add the derived ``name_*`` search fields to ``doc`` and return it.
+
+    ``doc['name']`` is required. ``doc['alt_names']`` is optional; when it is
+    present and non-empty the ``alt_*`` fields are derived as well.
+    """
+    doc['name_idx'] = filter_alnum_and_space(doc['name'])
+    doc['name_idx'] = remove_words(doc['name_idx'], STOPWORDS)
+
+    if not has_any_common_words(doc['name_idx']):
+        make_names_with_common(doc, 'name')
+
+    doc['name_rev'] = name_rev(doc['name'])
+    doc['name_no_ws'] = doc['name_idx'].replace(' ', '')
+    doc['name_no_ws_rev'] = doc['name_rev'].replace(' ', '')
+
+    # .get(): an entry without alternate names must not abort the import.
+    if doc.get('alt_names'):
+        make_alt_names(doc)
+
+    return doc
+
+
+def make_alt_names(doc):
+    """Add the derived ``alt_*`` search fields for ``doc['alt_names']`` in place."""
+    doc['alt_idx'] = [filter_alnum_and_space(n) for n in doc['alt_names']]
+    doc['alt_idx'] = [remove_words(n, STOPWORDS) for n in doc['alt_idx']]
+
+    if not has_any_common_words(' '.join(doc['alt_idx'])):
+        make_alt_names_with_common(doc)
+
+    doc['alt_rev'] = [name_rev(n) for n in doc['alt_idx']]
+    doc['alt_no_ws'] = [n.replace(' ', '') for n in doc['alt_idx']]
+    doc['alt_no_ws_rev'] = [n.replace(' ', '') for n in doc['alt_rev']]
+
+
+def filter_alnum_and_space(name):
+    """Return ``name`` with everything except ASCII letters, digits and spaces removed."""
+    return re.sub(r'[^a-zA-Z0-9 ]', '', name)
+
+
+def remove_words(name, words):
+    """Return ``name`` without the tokens whose lowercase form is in ``words``."""
+    return ' '.join(n for n in name.split() if n.lower() not in words)
+
+
+def has_any_common_words(name):
+    """Return True when any space-separated token of ``name`` is in COMMON_WORDS."""
+    return not COMMON_WORDS.isdisjoint(name.lower().split(' '))
+
+
+def make_names_with_common(doc, prefix):
+    """Store the no-whitespace forms of ``<prefix>_idx``, then strip COMMON_WORDS from it."""
+    doc[f'{prefix}_no_ws_with_common'] = doc[f'{prefix}_idx'].replace(' ', '')
+    doc[f'{prefix}_no_ws_rev_with_common'] = name_rev(doc[f'{prefix}_idx']).replace(' ', '')
+    doc[f'{prefix}_idx'] = remove_words(doc[f'{prefix}_idx'], COMMON_WORDS)
+
+
+def make_alt_names_with_common(doc):
+    """List counterpart of make_names_with_common() operating on ``doc['alt_idx']``."""
+    doc['alt_no_ws_with_common'] = [n.replace(' ', '') for n in doc['alt_idx']]
+    # Reverse the word order *before* dropping the spaces, as
+    # make_names_with_common() does; reversing a space-free string is a no-op.
+    doc['alt_no_ws_rev_with_common'] = [name_rev(n).replace(' ', '') for n in doc['alt_idx']]
+    doc['alt_idx'] = [remove_words(n, COMMON_WORDS) for n in doc['alt_idx']]
+
+
+def name_rev(name):
+    """Return ``name`` with its space-separated words in reverse order."""
+    return ' '.join(reversed(name.split(' ')))
+
+
+def make_full_addresses(doc):
+    """Add a comma-joined ``full_address`` to every address of ``doc`` and return ``doc``.
+
+    Missing or empty address parts are skipped; ``full_address`` is None when
+    no part is set.
+    """
+    parts = ('address', 'city', 'country', 'postal_code', 'state')
+    for addr in doc.get('addresses') or []:
+        # .get(): tolerate address records that omit some of the keys.
+        addr_info = [addr[k] for k in parts if addr.get(k)]
+        addr['full_address'] = ', '.join(addr_info) if addr_info else None
+    return doc
+
+
+def make_source_object(name, doc):
+    """Replace the ``source`` string of ``doc`` with an object and return ``doc``.
+
+    Resulting shape, e.g. for ISN:
+
+        "source": {
+            "code": "ISN",
+            "full_name": "Nonproliferation Sanctions (ISN) - State Department",
+        }
+    """
+    doc['source'] = {
+        'code': name,
+        'full_name': doc['source'],
+    }
+
+    return doc
+
+
+def get_json_data(url, timeout=60):
+    """Fetch ``url`` and return its decoded JSON body.
+
+    ``timeout`` (seconds) keeps a stalled server from hanging the import loop;
+    HTTP error statuses raise ``requests.HTTPError`` instead of being parsed.
+    """
+    r = requests.get(url, timeout=timeout)
+    r.raise_for_status()
+    return r.json()
+
+
+logger = logging.getLogger('csl')
+logger.setLevel(logging.INFO)
+
+steam_handler = logging.StreamHandler()
+steam_handler.setLevel(logging.INFO)
+logger.addHandler(steam_handler)
+
+file_handler = WatchedFileHandler('/var/log/csl.log')
+file_handler.setLevel(logging.INFO)
+logger.addHandler(file_handler)
+
+
+class BaseImporter:
+    """Import the entries of one CSL source into their own Elasticsearch index.
+
+    Subclasses set ``ES_INDEX_NAME`` and ``SOURCE_NAME`` and implement
+    ``filter_source``.
+    """
+
+    ES_INDEX_NAME = None
+    SOURCE_NAME = None
+
+    def __init__(self, es) -> None:
+        self._es = es
+
+        if not self.SOURCE_NAME:
+            raise NotImplementedError('no SOURCE_NAME')
+        if not self.ES_INDEX_NAME:
+            raise NotImplementedError('no ES_INDEX_NAME')
+
+    def do_import(self, data):
+        """Index every entry of ``data`` accepted by ``filter_source``, then refresh the index."""
+        _count = 0
+
+        for doc in self.get_docs(data):
+            # Shallow copy: the payload is shared by all importers, so its
+            # entries must keep their ``id`` and their string ``source``.
+            doc = dict(doc)
+            doc_id = doc.pop('id')
+            doc = make_names(doc)
+            doc = make_full_addresses(doc)
+            doc = make_source_object(self.SOURCE_NAME, doc)
+
+            self._es.index(index=self.ES_INDEX_NAME, body=doc, id=doc_id)
+            _count += 1
+
+        # Use the injected client; the module-level ``es`` only exists when
+        # this file runs as a script.
+        self._es.indices.refresh(index=self.ES_INDEX_NAME)
+        logger.info(f'Finish import {self.SOURCE_NAME}: {_count}')
+
+    def get_docs(self, data):
+        """Return the entries of ``data['results']`` that belong to this source."""
+        return filter(self.filter_source, data.get('results', []))
+
+    def filter_source(self, doc) -> bool:
+        """Return True when ``doc`` belongs to this source; subclasses must override."""
+        raise NotImplementedError
+
+
+class ISNImporter(BaseImporter):
+    """Importer for entries whose source string contains ``(ISN)``."""
+
+    ES_INDEX_NAME = 'isn'
+    SOURCE_NAME = 'ISN'
+
+    def filter_source(self, doc) -> bool:
+        return f'({self.SOURCE_NAME})' in doc['source']
+
+
+class NSMBSImporter(BaseImporter):
+    """Importer for entries whose source string contains ``(NS-MBS List)``."""
+
+    ES_INDEX_NAME = 'mbs'
+    SOURCE_NAME = 'MBS'
+
+    def filter_source(self, doc) -> bool:
+        return f'(NS-{self.SOURCE_NAME} List)' in doc['source']
+
+
+SOURCE_IMPORTER_CLASSES = [ISNImporter, NSMBSImporter]
+
+
+def is_isn_source(doc) -> bool:
+    """Return True when the source string of ``doc`` contains ``(ISN)``."""
+    return '(ISN)' in doc['source']
+
+
+if __name__ == '__main__':
+    hosts = os.getenv('ELASTICSEARCH_HOST', 'localhost')
+    port = os.environ.get('ELASTICSEARCH_PORT', 9200)
+    es = Elasticsearch(hosts=hosts, port=port)
+
+    source_importers = [importer_cls(es) for importer_cls in SOURCE_IMPORTER_CLASSES]
+
+    while True:
+        logger.info('Start import CSL source')
+        try:
+            if not es.ping():
+                raise ConnectionError
+
+            data = get_json_data('https://api.trade.gov/static/consolidated_screening_list/consolidated.json')
+
+            for importer in source_importers:
+                importer.do_import(data)
+
+            logger.info('Finish import CSL source')
+        except ConnectionError:
+            logger.error('Connect ES server failed')
+        except Exception as e:
+            logger.error(f'Import CSL source failed: {e}, {traceback.format_exc()}')
+
+        time.sleep(1800)
From 1a335ebb8e649f6cf5e3d6916ca9f5d9557a41e9 Mon Sep 17
00:00:00 2001
From: daniel
Date: Thu, 1 Jul 2021 17:11:10 +0800
Subject: [PATCH 3/4] feat(docker): add docker file for python script

---
 docker/csl-python/Dockerfile                  |  16 ++
 docker/csl-python/entrypoint.sh               |   9 +
 docker/csl-python/requirements.txt            |   6 +
 docker/csl-python/wait-for-it.sh              | 182 ++++++++++++++++++
 .../docker-compose.yml}                       |   9 +-
 docker/{run-csl-dev.sh => dev/run-csl.sh}     |   4 +-
 docker/docker-compose.yml                     |   5 +
 7 files changed, 227 insertions(+), 4 deletions(-)
 create mode 100644 docker/csl-python/Dockerfile
 create mode 100755 docker/csl-python/entrypoint.sh
 create mode 100644 docker/csl-python/requirements.txt
 create mode 100755 docker/csl-python/wait-for-it.sh
 rename docker/{docker-compose-dev.yml => dev/docker-compose.yml} (66%)
 rename docker/{run-csl-dev.sh => dev/run-csl.sh} (77%)

diff --git a/docker/csl-python/Dockerfile b/docker/csl-python/Dockerfile
new file mode 100644
index 0000000..1e61c93
--- /dev/null
+++ b/docker/csl-python/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.6-slim-buster
+
+WORKDIR /csl-python
+COPY . /csl-python/
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+ENV ELASTICSEARCH_HOST="elastic"
+ENV ELASTICSEARCH_PORT=9200
+
+COPY ./entrypoint.sh /usr/bin/
+RUN chmod +x /usr/bin/entrypoint.sh
+COPY ./wait-for-it.sh /usr/bin/
+RUN chmod +x /usr/bin/wait-for-it.sh
+
+CMD ["/usr/bin/entrypoint.sh"]
diff --git a/docker/csl-python/entrypoint.sh b/docker/csl-python/entrypoint.sh
new file mode 100755
index 0000000..0dd96c7
--- /dev/null
+++ b/docker/csl-python/entrypoint.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+# Wait for the same Elasticsearch endpoint import_source.py connects to
+# (ELASTICSEARCH_HOST/ELASTICSEARCH_PORT, set in the Dockerfile). -s keeps the
+# "ready" message from being printed when the 30 second wait times out.
+/usr/bin/wait-for-it.sh "${ELASTICSEARCH_HOST:-elastic}:${ELASTICSEARCH_PORT:-9200}" -s -t 30 -- echo "Elasticsearch server is ready"
+
+echo "********** Start CSL-python import script **********"
+python import_source.py
diff --git a/docker/csl-python/requirements.txt b/docker/csl-python/requirements.txt
new file mode 100644
index 0000000..39c9871
--- /dev/null
+++ b/docker/csl-python/requirements.txt
@@ -0,0 +1,6 @@
+certifi==2021.5.30
+chardet==4.0.0
+elasticsearch==7.13.1
+idna==2.10
+requests==2.25.1
+urllib3==1.26.5
diff --git a/docker/csl-python/wait-for-it.sh b/docker/csl-python/wait-for-it.sh
new file mode 100755
index 0000000..d990e0d
--- /dev/null
+++ b/docker/csl-python/wait-for-it.sh
@@ -0,0 +1,182 @@
+#!/usr/bin/env bash
+# Use this script to test if a given TCP host/port are available
+
+WAITFORIT_cmdname=${0##*/}
+
+echoerr() { if [[ $WAITFORIT_QUIET -ne 1 ]]; then echo "$@" 1>&2; fi }
+
+usage()
+{
+    cat << USAGE >&2
+Usage:
+    $WAITFORIT_cmdname host:port [-s] [-t timeout] [-- command args]
+    -h HOST | --host=HOST       Host or IP under test
+    -p PORT | --port=PORT       TCP port under test
+                                Alternatively, you specify the host and port as host:port
+    -s | --strict               Only execute subcommand if the test succeeds
+    -q | --quiet                Don't output any status messages
+    -t TIMEOUT | --timeout=TIMEOUT
+                                Timeout in seconds, zero for no timeout
+    -- COMMAND ARGS             Execute command with args after the test finishes
+USAGE
+    exit 1
+}
+
+wait_for()
+{
+    if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then
+        echoerr "$WAITFORIT_cmdname:
waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + else + echoerr "$WAITFORIT_cmdname: waiting for $WAITFORIT_HOST:$WAITFORIT_PORT without a timeout" + fi + WAITFORIT_start_ts=$(date +%s) + while : + do + if [[ $WAITFORIT_ISBUSY -eq 1 ]]; then + nc -z $WAITFORIT_HOST $WAITFORIT_PORT + WAITFORIT_result=$? + else + (echo -n > /dev/tcp/$WAITFORIT_HOST/$WAITFORIT_PORT) >/dev/null 2>&1 + WAITFORIT_result=$? + fi + if [[ $WAITFORIT_result -eq 0 ]]; then + WAITFORIT_end_ts=$(date +%s) + echoerr "$WAITFORIT_cmdname: $WAITFORIT_HOST:$WAITFORIT_PORT is available after $((WAITFORIT_end_ts - WAITFORIT_start_ts)) seconds" + break + fi + sleep 1 + done + return $WAITFORIT_result +} + +wait_for_wrapper() +{ + # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692 + if [[ $WAITFORIT_QUIET -eq 1 ]]; then + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --quiet --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + else + timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & + fi + WAITFORIT_PID=$! + trap "kill -INT -$WAITFORIT_PID" INT + wait $WAITFORIT_PID + WAITFORIT_RESULT=$? 
+ if [[ $WAITFORIT_RESULT -ne 0 ]]; then + echoerr "$WAITFORIT_cmdname: timeout occurred after waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" + fi + return $WAITFORIT_RESULT +} + +# process arguments +while [[ $# -gt 0 ]] +do + case "$1" in + *:* ) + WAITFORIT_hostport=(${1//:/ }) + WAITFORIT_HOST=${WAITFORIT_hostport[0]} + WAITFORIT_PORT=${WAITFORIT_hostport[1]} + shift 1 + ;; + --child) + WAITFORIT_CHILD=1 + shift 1 + ;; + -q | --quiet) + WAITFORIT_QUIET=1 + shift 1 + ;; + -s | --strict) + WAITFORIT_STRICT=1 + shift 1 + ;; + -h) + WAITFORIT_HOST="$2" + if [[ $WAITFORIT_HOST == "" ]]; then break; fi + shift 2 + ;; + --host=*) + WAITFORIT_HOST="${1#*=}" + shift 1 + ;; + -p) + WAITFORIT_PORT="$2" + if [[ $WAITFORIT_PORT == "" ]]; then break; fi + shift 2 + ;; + --port=*) + WAITFORIT_PORT="${1#*=}" + shift 1 + ;; + -t) + WAITFORIT_TIMEOUT="$2" + if [[ $WAITFORIT_TIMEOUT == "" ]]; then break; fi + shift 2 + ;; + --timeout=*) + WAITFORIT_TIMEOUT="${1#*=}" + shift 1 + ;; + --) + shift + WAITFORIT_CLI=("$@") + break + ;; + --help) + usage + ;; + *) + echoerr "Unknown argument: $1" + usage + ;; + esac +done + +if [[ "$WAITFORIT_HOST" == "" || "$WAITFORIT_PORT" == "" ]]; then + echoerr "Error: you need to provide a host and port to test." + usage +fi + +WAITFORIT_TIMEOUT=${WAITFORIT_TIMEOUT:-15} +WAITFORIT_STRICT=${WAITFORIT_STRICT:-0} +WAITFORIT_CHILD=${WAITFORIT_CHILD:-0} +WAITFORIT_QUIET=${WAITFORIT_QUIET:-0} + +# Check to see if timeout is from busybox? 
+WAITFORIT_TIMEOUT_PATH=$(type -p timeout) +WAITFORIT_TIMEOUT_PATH=$(realpath $WAITFORIT_TIMEOUT_PATH 2>/dev/null || readlink -f $WAITFORIT_TIMEOUT_PATH) + +WAITFORIT_BUSYTIMEFLAG="" +if [[ $WAITFORIT_TIMEOUT_PATH =~ "busybox" ]]; then + WAITFORIT_ISBUSY=1 + # Check if busybox timeout uses -t flag + # (recent Alpine versions don't support -t anymore) + if timeout &>/dev/stdout | grep -q -e '-t '; then + WAITFORIT_BUSYTIMEFLAG="-t" + fi +else + WAITFORIT_ISBUSY=0 +fi + +if [[ $WAITFORIT_CHILD -gt 0 ]]; then + wait_for + WAITFORIT_RESULT=$? + exit $WAITFORIT_RESULT +else + if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then + wait_for_wrapper + WAITFORIT_RESULT=$? + else + wait_for + WAITFORIT_RESULT=$? + fi +fi + +if [[ $WAITFORIT_CLI != "" ]]; then + if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then + echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess" + exit $WAITFORIT_RESULT + fi + exec "${WAITFORIT_CLI[@]}" +else + exit $WAITFORIT_RESULT +fi diff --git a/docker/docker-compose-dev.yml b/docker/dev/docker-compose.yml similarity index 66% rename from docker/docker-compose-dev.yml rename to docker/dev/docker-compose.yml index bf9432f..3c90f1c 100644 --- a/docker/docker-compose-dev.yml +++ b/docker/dev/docker-compose.yml @@ -7,8 +7,8 @@ services: - discovery.type=single-node rails: build: - context: ./../ - dockerfile: $PWD/csl/Dockerfile + context: ./../../ + dockerfile: $PWD/../csl/Dockerfile container_name: csl environment: - RECREATE_DB=true @@ -16,3 +16,8 @@ services: - "3000:3000" depends_on: - elastic + python: + build: ./../csl-python/ + container_name: csl-python + depends_on: + - elastic diff --git a/docker/run-csl-dev.sh b/docker/dev/run-csl.sh similarity index 77% rename from docker/run-csl-dev.sh rename to docker/dev/run-csl.sh index 69a0ca0..5ce54b8 100755 --- a/docker/run-csl-dev.sh +++ b/docker/dev/run-csl.sh @@ -1,9 +1,9 @@ # ======================================================================== # Build docker image # 
======================================================================== -docker-compose -f docker-compose-dev.yml build +docker-compose -f docker-compose.yml build # ======================================================================== # Run CSL # ======================================================================== -docker-compose -f docker-compose-dev.yml up -d +docker-compose -f docker-compose.yml up -d diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 5fa7dec..7461a2c 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -17,6 +17,11 @@ services: - RECREATE_DB=true depends_on: - elastic + python: + build: ./csl-python/ + container_name: csl-python + depends_on: + - elastic csl-proxy: build: ./nginx/ container_name: csl-proxy From c80c37020de6337b3946bad712bb67115f8c0186 Mon Sep 17 00:00:00 2001 From: daniel Date: Mon, 5 Jul 2021 10:56:14 +0800 Subject: [PATCH 4/4] fix: fix review's issue --- docker/csl-python/import_source.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/csl-python/import_source.py b/docker/csl-python/import_source.py index c2220a8..12424f7 100644 --- a/docker/csl-python/import_source.py +++ b/docker/csl-python/import_source.py @@ -21,7 +21,7 @@ def make_names(doc): doc['name_idx'] = filter_alnum_and_space(doc['name']) doc['name_idx'] = remove_words(doc['name_idx'], STOPWORDS) - if not has_any_common_words(doc['name_idx']): + if has_any_common_words(doc['name_idx']): make_names_with_common(doc, 'name') doc['name_rev'] = name_rev(doc['name']) @@ -38,7 +38,7 @@ def make_alt_names(doc): doc['alt_idx'] = [filter_alnum_and_space(n) for n in doc['alt_names']] doc['alt_idx'] = [remove_words(n, STOPWORDS) for n in doc['alt_idx']] - if not has_any_common_words(' '.join(doc['alt_idx'])): + if has_any_common_words(' '.join(doc['alt_idx'])): make_alt_names_with_common(doc) doc['alt_rev'] = [name_rev(n) for n in doc['alt_idx']] @@ -187,10 +187,10 @@ def is_isn_source(doc) 
-> bool: if not es.ping(): raise ConnectionError - data = get_json_data('https://api.trade.gov/static/consolidated_screening_list/consolidated.json') + json_data = get_json_data('https://api.trade.gov/static/consolidated_screening_list/consolidated.json') for importer in source_importers: - importer.do_import(data) + importer.do_import(json_data) logger.info('Finish import CSL source') except ConnectionError: