diff --git a/Dockerfile b/Dockerfile index f2cf21e..a9bb490 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,13 @@ # The ibm_db driver currently only supports x86_64 architecture, so we'll # support that as the only option for now. -FROM --platform=linux/amd64 senzing/senzingapi-runtime:${SENZING_VERSION:-3.10.3} AS configs +FROM --platform=linux/amd64 senzing/senzingapi-runtime:${SENZING_VERSION:-3.12.0} AS configs FROM --platform=linux/amd64 ruby:${RUBY_VERSION:-3.3} # Required in order to bypass the license prompt. ENV SENZING_ACCEPT_EULA="I_ACCEPT_THE_SENZING_EULA" ENV TERM=xterm -ENV SENZING_VERSION=${SENZING_VERSION:-3.10.3} +ENV SENZING_VERSION=${SENZING_VERSION:-3.12.0} # Update packages and install additional dependencies. RUN apt-get update && \ diff --git a/docker-compose.yml b/docker-compose.yml index 8013355..81f8b5a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,3 +1,31 @@ +x-senzing-options: &senzing-options + SENZING_API_SERVER_ALLOWED_ORIGINS: '*' + SENZING_API_SERVER_ENABLE_ADMIN: 'true' + SENZING_ENGINE_CONFIGURATION_JSON: >- + { + "PIPELINE": { + "CONFIGPATH": "/etc/opt/senzing", + "LICENSESTRINGBASE64": "${SENZING_LICENSE_BASE64_ENCODED}", + "RESOURCEPATH": "/opt/senzing/g2/resources", + "SUPPORTPATH": "/opt/senzing/data/current" + }, + "SQL": { + "BACKEND": "SQL", + "CONNECTION": "postgresql://${POSTGRES_USERNAME:-postgres}:${POSTGRES_PASSWORD:-postgres}@postgres:5432:G2/" + } + } + +x-service-defaults: &service-defaults + build: . 
+ depends_on: + - api + environment: + <<: *senzing-options + networks: + - senzing + volumes: + - .:/opt/cmr + services: rabbitmq: profiles: @@ -89,8 +117,7 @@ services: depends_on: - postgres environment: - SENZING_API_SERVER_ALLOWED_ORIGINS: '*' - SENZING_API_SERVER_ENABLE_ADMIN: 'true' + <<: *senzing-options SENZING_ENGINE_CONFIGURATION_JSON: >- { "PIPELINE": { @@ -104,7 +131,7 @@ services: "CONNECTION": "postgresql://${POSTGRES_USERNAME:-postgres}:${POSTGRES_PASSWORD:-postgres}@postgres:5432:G2/" } } - image: senzing/senzing-api-server:${SENZING_DOCKER_IMAGE_VERSION_SENZING_API_SERVER:-latest} + image: senzing/senzing-api-server:${SENZING_DOCKER_IMAGE_VERSION_SENZING_API_SERVER:-3.5.15} networks: - senzing ports: @@ -115,79 +142,29 @@ services: - '/tmp' tools: - build: . - environment: - SENZING_ENGINE_CONFIGURATION_JSON: >- - { - "PIPELINE": { - "CONFIGPATH": "/etc/opt/senzing", - "LICENSESTRINGBASE64": "${SENZING_LICENSE_BASE64_ENCODED}", - "RESOURCEPATH": "/opt/senzing/g2/resources", - "SUPPORTPATH": "/opt/senzing/data/current" - }, - "SQL": { - "BACKEND": "SQL", - "CONNECTION": "postgresql://${POSTGRES_USERNAME:-senzing}:${POSTGRES_PASSWORD:-senzing}@postgres:5432:G2" - } - } - networks: - - senzing + <<: *service-defaults command: run volumes: + - .:/opt/cmr - ./config:/etc/cmr/config - ./data/import:/etc/cmr/import - ./data/export:/etc/cmr/export - - ./lib:/opt/cmr/lib importer: + <<: *service-defaults profiles: - load - depends_on: - - api - build: . 
- environment: - SENZING_ENGINE_CONFIGURATION_JSON: >- - { - "PIPELINE": { - "CONFIGPATH": "/etc/opt/senzing", - "LICENSESTRINGBASE64": "${SENZING_LICENSE_BASE64_ENCODED}", - "RESOURCEPATH": "/opt/senzing/g2/resources", - "SUPPORTPATH": "/opt/senzing/data/current" - }, - "SQL": { - "BACKEND": "SQL", - "CONNECTION": "postgresql://${POSTGRES_USERNAME:-senzing}:${POSTGRES_PASSWORD:-senzing}@postgres:5432:G2" - } - } - networks: - - senzing command: load volumes: + - .:/opt/cmr - ${IMPORTER_CONFIG_FILE:-./config/config.yml}:/etc/cmr/config.yml - ./data/import:/etc/cmr/import exporter: + <<: *service-defaults profiles: - export - depends_on: - - api - build: . - environment: - SENZING_ENGINE_CONFIGURATION_JSON: >- - { - "PIPELINE": { - "CONFIGPATH": "/etc/opt/senzing", - "LICENSESTRINGBASE64": "${SENZING_LICENSE_BASE64_ENCODED}", - "RESOURCEPATH": "/opt/senzing/g2/resources", - "SUPPORTPATH": "/opt/senzing/data/current" - }, - "SQL": { - "BACKEND": "SQL", - "CONNECTION": "postgresql://${POSTGRES_USERNAME:-senzing}:${POSTGRES_PASSWORD:-senzing}@postgres:5432:G2" - } - } - networks: - - senzing command: export volumes: + - .:/opt/cmr - ${EXPORTER_CONFIG_FILE:-./config/config.yml}:/etc/cmr/config.yml diff --git a/docs/destinations.md b/docs/destinations.md index 98a9b24..fa2e13b 100644 --- a/docs/destinations.md +++ b/docs/destinations.md @@ -9,6 +9,7 @@ before it is sent to the destination. | Option | Default | Required | Description | |-----------------|---------|----------|--------------------------------------------------------------------------------------| | export_file[^1] | | YES | Path to the JSON export from Senzing. | +| field_map | | YES | A mapping of fields from Senzing to their counterparts in the destination. | | type | | YES | The type of destination to use. Should be the name of one of the destinations below. | ## CSV @@ -49,6 +50,37 @@ destination: export_file: /home/senzing/export.json ``` +## JSONL + +Write records to a [JSON Lines][jsonl] formatted file. 
Each record will be +written as a single JSON object on its own line. + +### Configuration + +The following options are available for this destination. + +| Option | Default | Required | Description | +|-----------|---------|----------|---------------------------------------------------------| +| overwrite | false | NO | Overwrite the existing file instead of appending to it. | +| path | | YES | The path to write the JSONL file. | + +### Example + +```yaml +destination: + type: JSONL + path: /home/senzing/export.jsonl + overwrite: false + field_map: + ENTITY_ID: person_id + DATABASE: database + PARTY_ID: party_id + MATCH_SCORE: match_score + RELATED_RECORD_ID: potential_person_id + RELATED_MATCH_SCORE: potential_match_score + export_file: /home/senzing/export.json +``` + ## Mongo Write records to a [MongoDB][mongo] collection as individual JSON documents. @@ -88,40 +120,53 @@ destination: Check out the [Export to Mongo][mongo-example] to see this in action. -## JSONL +## MySQL -Write records to a [JSON Lines][jsonl] formatted file. Each record will be -written as a single JSON object each on their own line. +Insert entities into a [MySQL] or compatible (such as [MariaDB]) database. ### Configuration The following options are available for this destination. -| Option | Default | Required | Description | -|-----------|---------|----------|---------------------------------------------------------| -| overwrite | false | NO | Overwrite the existing file instead of appending to it. | -| path | | YES | The path to write the JSONL file. | +| Option | Default | Required | Description | +|----------|-----------|----------|--------------------------------------------| +| database | | YES | Database to write to. | +| host | | YES | Database host to connect to. | +| password | | YES | Password for the database user. | +| port | 3306 | NO | Port to connect to on the database server. 
| +| security | nil | NO | Set to "SSL" in order to utilize TLS[^2]. | +| table | | YES | Table to write entities to. | +| username | | YES | User with access to the database. | ### Example ```yaml destination: - type: JSONL - path: /home/senzing/export.csv - overwrite: false - field_map: - ENTITY_ID: person_id - DATABASE: database - PARTY_ID: party_id - MATCH_SCORE: match_score - RELATED_RECORD_ID: potential_person_id - RELATED_MATCH_SCORE: potential_match_score - export_file: /home/senzing/export.json + type: MySQL + host: localhost + database: people + table: entity_resolution + username: mysql + password: password + field_map: + ENTITY_ID: person_id + DATABASE: database + PARTY_ID: party_id + MATCH_SCORE: match_score + RELATED_RECORD_ID: potential_person_id + RELATED_MATCH_SCORE: potential_match_score + export_file: /etc/cmr/export/export.json ``` +Check out the [Export to MySQL][mysql-example] to see this in action. + [jsonl]: https://jsonlines.org/ +[mariadb]: https://mariadb.org/ [mongo]: https://www.mongodb.com/ [mongo-example]: examples/export-to-mongo.md +[mysql]: https://www.mysql.com/ +[mysql-example]: examples/mysql.md [transformations]: transformations.md [^1]: Use of an export file is temporary until records can be exported directly using the API. +[^2]: Transport Layer Security diff --git a/docs/examples.md b/docs/examples.md index 68bcb2f..54cb7d7 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -6,12 +6,13 @@ functionality of this entity resolution system. 
## Destinations * [Export to Mongo][export-to-mongo] +* [Export to MySQL][mysql] ## Sources * [Import from Informix][import-from-informix] -* [Import from MySQL][import-from-mysql] +* [Import from MySQL][mysql] [export-to-mongo]: examples/export-to-mongo.md [import-from-informix]: examples/import-from-informix.md -[import-from-mysql]: examples/import-from-mysql.md +[mysql]: examples/mysql.md diff --git a/docs/examples/assets/config.mysql.yml b/docs/examples/assets/config.mysql.yml index d34e51d..3c6b4d5 100644 --- a/docs/examples/assets/config.mysql.yml +++ b/docs/examples/assets/config.mysql.yml @@ -23,7 +23,7 @@ sources: username: root password: password field_map: - party_id: OTHER_ID_PARTY + party_id: RECORD_ID last_name: PRIMARY_NAME_LAST first_name: PRIMARY_NAME_FIRST gender: GENDER @@ -43,16 +43,12 @@ sources: party_code: TYPE destination: - type: CSV - path: /etc/cmr/export/export.csv - overwrite: true - headers: - - person_id - - database - - party_id - - match_score - - potential_person_id - - potential_match_score + type: MySQL + host: mariadb + database: people + table: entity_resolution + username: root + password: password field_map: ENTITY_ID: person_id DATABASE: database diff --git a/docs/examples/assets/mysql-schema.sql b/docs/examples/assets/mysql-schema.sql index bd95a71..52b1d05 100644 --- a/docs/examples/assets/mysql-schema.sql +++ b/docs/examples/assets/mysql-schema.sql @@ -24,4 +24,14 @@ LOAD DATA LOCAL INFILE "/docker-entrypoint-initdb.d/import.csv" INTO TABLE people FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' - IGNORE 1 ROWS; + IGNORE 1 ROWS; + +CREATE TABLE entity_resolution( + person_id VARCHAR(255) NOT NULL, + `database` VARCHAR(255) NOT NULL, + party_id VARCHAR(255) NOT NULL, + match_score INTEGER NULL, + potential_person_id VARCHAR(255) NULL, + potential_match_score INTEGER NULL, + PRIMARY KEY (person_id, party_id, `database`) +); diff --git a/docs/examples/import-from-mysql.md b/docs/examples/mysql.md similarity index 69% rename 
from docs/examples/import-from-mysql.md rename to docs/examples/mysql.md index a083f82..fd6ad47 100644 --- a/docs/examples/import-from-mysql.md +++ b/docs/examples/mysql.md @@ -4,11 +4,12 @@ > The MySQL source type can be used for any MySQL compatible database, as shown > in this example using MariaDB. -This example demonstrates importing data from a [MySQL][mysql] database. -Following the steps below will launch a [MariaDB] container locally, and load a -sample dataset to be imported into Senzing. You can also use your own -MySQL compatible database, but you must also provide an appropriate -configuration file rather than the one specified here. +This example demonstrates importing data from a [MySQL][mysql] database, and +exporting the results to another table in that same database. Following the +steps below will launch a [MariaDB] container locally, and load a sample dataset +to be imported into Senzing. You can also use your own MySQL compatible +database, but you must also provide an appropriate configuration file rather +than the one specified here. > [!NOTE] > All commands listed in this document are run from the root directory of this @@ -62,8 +63,23 @@ Once the importer container exits, your data is now in Senzing! ## Exporting -To verify that the import succeeded, we can export the results from Senzing -to a CSV file. Our config file already has this setup. +With our records imported into Senzing, we can export the resulting entities. +For this example, we'll export the entities to a new table in the same database. + +> [!NOTE] +> The export process assumes that the table already exists. 
For this example, +> we've used the following to create the table: +> ```sql +> CREATE TABLE entity_resolution( +> person_id VARCHAR(255) NOT NULL, +> `database` VARCHAR(255) NOT NULL, +> party_id VARCHAR(255) NOT NULL, +> match_score INTEGER NULL, +> potential_person_id VARCHAR(255) NULL, +> potential_match_score INTEGER NULL, +> PRIMARY KEY (person_id, party_id, `database`) +> ); +> ``` ```bash export EXPORTER_CONFIG_FILE="$(pwd)/docs/examples/assets/config.mysql.yml" diff --git a/docs/sources.md b/docs/sources.md index ac7b97e..5ce80ef 100644 --- a/docs/sources.md +++ b/docs/sources.md @@ -65,8 +65,8 @@ The following options are available for this source. | port | 9089 | NO | Port to connect to on the database server. | | security | nil | NO | Set to "SSL" in order to utilize TLS[^1]. | | schema | $username | NO | Schema that the database is attached to. Defaults to the value of `username`. | -| table | | YES | Table that contains the records to be imported | -| username | | YES | User with access to the database | +| table | | YES | Table that contains the records to be imported. | +| username | | YES | User with access to the database. | ### Example @@ -94,22 +94,22 @@ Check out the [Import from Informix][informix-example] to see this in action. ## MySQL -Query an [MySQL] or compatible (such as [MariaDB]) database for records to +Query a [MySQL] or compatible (such as [MariaDB]) database for records to import. ### Configuration The following options are available for this source. -| Option | Default | Required | Description | -|----------|-----------|----------|-------------------------------------------------------------------------------| -| database | | YES | Database to read from. | -| host | | YES | Informix host to connect to. | -| password | | YES | Password for the database user. | -| port | 3306 | NO | Port to connect to on the database server. | -| security | nil | NO | Set to "SSL" in order to utilize TLS[^1]. 
| -| table | | YES | Table that contains the records to be imported | -| username | | YES | User with access to the database | +| Option | Default | Required | Description | +|----------|---------|----------|-------------------------------------------------| +| database | | YES | Database to read from. | +| host | | YES | Database host to connect to. | +| password | | YES | Password for the database user. | +| port | 3306 | NO | Port to connect to on the database server. | +| security | nil | NO | Set to "SSL" in order to utilize TLS[^1]. | +| table | | YES | Table that contains the records to be imported. | +| username | | YES | User with access to the database. | ### Example diff --git a/lib/destination.rb b/lib/destination.rb index 855bb92..44aa5a8 100644 --- a/lib/destination.rb +++ b/lib/destination.rb @@ -2,6 +2,7 @@ require_relative 'destination/csv' require_relative 'destination/mongo' +require_relative 'destination/mysql' require_relative 'destination/jsonl' # Helper methods for loading destinations. diff --git a/lib/destination/database.rb b/lib/destination/database.rb new file mode 100644 index 0000000..cc5e630 --- /dev/null +++ b/lib/destination/database.rb @@ -0,0 +1,38 @@ +# frozen_string_literal: true + +require 'sequel' +require_relative 'base' + +module Destination + # Basic database destination for data exports. Should be extended by specific + # database implementations. + class Database < Base + # TODO: Add support for databases that don't support REPLACE. + def add_record(record) + table = db[@destination_config[:table].to_sym] + table.replace(record) + end + + private + + # Establishes a database connection and proxies calls to the database. 
+ # + # @return [Sequel::Database] + def db + @db ||= Sequel.connect( + adapter: @destination_config[:adapter], + host: @destination_config[:host], + database: @destination_config[:database], + port: @destination_config[:port], + user: @destination_config[:username], + password: @destination_config[:password], + schema: @destination_config[:schema] || @destination_config[:username], + security: @destination_config[:security] + ) + end + + def upsert(_record) + raise NotImplementedError, "No upsert query defined for #{@destination_config[:type]}" + end + end +end diff --git a/lib/destination/informix.rb b/lib/destination/informix.rb new file mode 100644 index 0000000..969b77a --- /dev/null +++ b/lib/destination/informix.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +require_relative 'database' + +module Destination + # Informix destination for data exports. + class Informix < Database + private + + def defaults + super.merge({ adapter: :ibmdb, port: 9089, security: nil }) + end + end +end diff --git a/lib/destination/mysql.rb b/lib/destination/mysql.rb new file mode 100644 index 0000000..4e40bf1 --- /dev/null +++ b/lib/destination/mysql.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +require_relative 'database' + +module Destination + # MySQL destination for data exports. 
+ class MySQL < Database + private + + def defaults + super.merge({ adapter: :mysql2, port: 3306, security: nil }) + end + end +end diff --git a/lib/export.rb b/lib/export.rb index f2f2496..734114f 100644 --- a/lib/export.rb +++ b/lib/export.rb @@ -19,6 +19,7 @@ def from_file entity = JSON.parse(line, symbolize_names: true) entity[:RESOLVED_ENTITY][:RECORDS].each do |record| record[:ENTITY_ID] = entity[:RESOLVED_ENTITY][:ENTITY_ID] + @config.logger.debug("Exporting record: #{record[:ENTITY_ID]}") destination.add_record(process_record(record)) end end diff --git a/lib/source/database.rb b/lib/source/database.rb index b512906..535689c 100644 --- a/lib/source/database.rb +++ b/lib/source/database.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true require 'sequel' -require_relative 'file' +require_relative 'base' module Source # Basic database source for data imports. Should be extended by specific diff --git a/mkdocs.yaml b/mkdocs.yaml index 93cc00a..d407cd5 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -73,7 +73,7 @@ nav: - Examples: - Export to MongoDB: examples/export-to-mongo.md - Import from Informix: examples/import-from-informix.md - - Import from MySQL: examples/import-from-mysql.md + - Import & export with MySQL: examples/mysql.md - About: - License: license.md