Skip to content

Commit

Permalink
ability to remove duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
micronix committed Jun 3, 2024
1 parent 3c10cc3 commit 4e4189f
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 6 deletions.
2 changes: 2 additions & 0 deletions app/models/country.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ class Country < StudyRelationship
{
table: :countries,
root: [:protocolSection, :contactsLocationsModule, :locations],
unique: true,
columns: [
{ name: :name, value: :country },
{ name: :removed, value: false }
Expand All @@ -13,6 +14,7 @@ class Country < StudyRelationship
{
table: :countries,
root: [:derivedSection, :miscInfoModule, :removedCountries],
unique: true,
columns: [
{ name: :name, value: nil },
{ name: :removed, value: true }
Expand Down
4 changes: 4 additions & 0 deletions app/models/result_group.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class ResultGroup < StudyRelationship
table: :result_groups,
root: [:resultsSection, :baselineCharacteristicsModule, :groups],
index: [:ctgov_group_code, :result_type],
unique: true,
columns: [
{ name: :ctgov_group_code, value: :id },
{ name: :result_type, value: 'Baseline' },
Expand All @@ -28,6 +29,7 @@ class ResultGroup < StudyRelationship
root: [:resultsSection, :outcomeMeasuresModule, :outcomeMeasures],
flatten: [:groups],
index: [:ctgov_group_code, :result_type],
unique: true,
columns: [
{ name: :ctgov_group_code, value: :id },
{ name: :result_type, value: 'Outcome' },
Expand All @@ -39,6 +41,7 @@ class ResultGroup < StudyRelationship
table: :result_groups,
root: [:resultsSection, :participantFlowModule, :groups],
index: [:ctgov_group_code, :result_type],
unique: true,
columns: [
{ name: :ctgov_group_code, value: :id },
{ name: :result_type, value: 'Participant Flow' },
Expand All @@ -50,6 +53,7 @@ class ResultGroup < StudyRelationship
table: :result_groups,
root: [:resultsSection, :adverseEventsModule, :eventGroups],
index: [:ctgov_group_code, :result_type],
unique: true,
columns: [
{ name: :ctgov_group_code, value: :id },
{ name: :result_type, value: 'Reported Event' },
Expand Down
5 changes: 5 additions & 0 deletions app/models/study_json_record/worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,11 @@ def process_mapping(mapping, records)
end
end

# remove duplicates
if mapping[:unique]
collection = collection.uniq{|k| k.attributes }
end

# import models
print "\r #{mapping[:table]} - #{collection.count}"
model.import(collection)
Expand Down
72 changes: 72 additions & 0 deletions lib/tasks/load.rake
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,44 @@ namespace :db do
end
end

desc 'import both, imports studies using both api versions'
# Downloads one study with both API versions, imports each version, then
# prints every table whose row count for that study differs between the
# `ctgov` and `ctgov_v2` schemas.
#
# args[:nct_id] - identifier of the study to download, import and compare.
task :import_both, [:nct_id] => :environment do |_t, args|
  nct_id = args[:nct_id]
  # without an id every step below would run against nil
  abort 'usage: rake db:import_both[NCT_ID]' if nct_id.to_s.strip.empty?

  # download both
  StudyDownloader.download([nct_id], '1')
  StudyDownloader.download([nct_id], '2')

  # import v1
  record = StudyJsonRecord.find_by(nct_id: nct_id, version: '1')
  # find_by returns nil when no row matches; stop with a clear message
  # instead of a NoMethodError on nil
  abort "no version 1 StudyJsonRecord found for #{nct_id}" unless record
  record.create_or_update_study

  # import v2
  worker = StudyJsonRecord::Worker.new
  records = StudyJsonRecord.where(nct_id: nct_id, version: '2')
  worker.process(1, records)

  # compare the two
  connection = ActiveRecord::Base.connection
  quoted_id = connection.quote(nct_id)

  # Counts the rows belonging to the study in schema.table. The identifier
  # and the value are quoted by the adapter rather than interpolated raw.
  count_rows = lambda do |schema, table|
    qualified = connection.quote_table_name("#{schema}.#{table}")
    connection.select_value(<<~SQL)
      SELECT COUNT(*)
      FROM #{qualified}
      WHERE nct_id = #{quoted_id}
    SQL
  end

  StudyRelationship.study_models.each do |model|
    table = model.table_name
    original = count_rows.call('ctgov', table)
    future = count_rows.call('ctgov_v2', table)
    # only mismatching tables are reported
    puts "#{table}: v1: #{original} v2: #{future}" if original != future
  end
end

desc "Load the AACT database from ClinicalTrials.gov"
task :run, [:schema] => :environment do |t, args|
if args[:schema] == 'ctgov_v2'
Expand Down Expand Up @@ -63,4 +101,38 @@ namespace :db do
studies = File.read('sample-studies')
Util::Updater.new.load_multiple_studies(studies)
end

desc 'drop all entries for a model and reimport it'
# Deletes every row of one model and rebuilds it from the version '2'
# StudyJsonRecord rows, using the mappings whose :table matches the argument.
#
# args[:model] - the table name (e.g. result_groups); it is used both as the
#                mapping's :table symbol and, via classify, as the model class.
task :import_model, [:model] => :environment do |_t, args|
  table = args[:model].to_s
  abort 'usage: rake db:import_model[table_name]' if table.strip.empty?

  with_search_path('ctgov_v2, support, public') do
    # Resolve the mappings BEFORE deleting anything: if the argument matches
    # no mapping, deleting first would empty the table and reimport nothing.
    mappings = StudyRelationship.sorted_mapping.select { |mapping| mapping[:table] == table.to_sym }
    abort "no mappings found for table '#{table}'; nothing was deleted" if mappings.empty?
    puts mappings.inspect

    model = table.classify.constantize
    model.delete_all

    worker = StudyJsonRecord::Worker.new
    # process the records in batches of 5000, printing each batch size
    StudyJsonRecord.where(version: '2').find_in_batches(batch_size: 5000) do |records|
      puts records.length
      mappings.each { |mapping| worker.process_mapping(mapping, records) }
    end
  end
end

desc 'add indexes to the database'
# Calls Util::DbManager#add_indexes inside with_search_path for the given
# schema argument.
# NOTE(review): the manager is built without `schema:` here, while the
# remove_constraints task passes `schema: args[:schema]` - confirm that the
# search path alone is enough for add_indexes.
task :add_indexes, [:schema] => :environment do |_task, args|
  with_search_path(args[:schema]) { Util::DbManager.new.add_indexes }
end

desc 'remove constraints from the database'
# Calls Util::DbManager#remove_constraints for the given schema; the schema is
# passed both to with_search_path and to the manager's constructor.
task :remove_constraints, [:schema] => :environment do |_task, args|
  schema = args[:schema]
  with_search_path(schema) do
    Util::DbManager.new(schema: schema).remove_constraints
  end
end
end
27 changes: 21 additions & 6 deletions lib/tasks/stats.rake
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,33 @@ namespace :stats do
SELECT
COUNT(*)
FROM ctgov.#{model.table_name}
WHERE nct_id = #{args[:nct_id]}
WHERE nct_id = '#{args[:nct_id]}'
SQL
original = ActiveRecord::Base.connection.execute(sql).to_a[0][0]
original = ActiveRecord::Base.connection.execute(sql).to_a.first.dig('count')

sql = <<-SQL
SELECT
COUNT(*)
FROM ctgov.#{model.table_name}
WHERE nct_id = #{args[:nct_id]}
FROM ctgov_v2.#{model.table_name}
WHERE nct_id = '#{args[:nct_id]}'
SQL
future = ActiveRecord::Base.connection.execute(sql).to_a[0][0]
puts "#{model.table_name}: #{original} vs #{future}"
future = ActiveRecord::Base.connection.execute(sql).to_a.first.dig('count')
if original != future
puts "#{model.table_name}: #{original} vs #{future}"
end
end
end

desc 'show indexes and foreign keys'
# Prints each study model's table name followed by one line per foreign key
# (column -> referenced table.primary key), inside with_search_path for the
# given schema argument.
task :indexes, [:schema] => :environment do |_task, args|
  with_search_path(args[:schema]) do
    StudyRelationship.study_models.each do |model|
      table = model.table_name
      puts table.blue
      # index listing is currently disabled:
      # model.connection.indexes(table)
      model.connection.foreign_keys(table).each do |foreign_key|
        puts " #{foreign_key.column} -> #{foreign_key.to_table}.#{foreign_key.primary_key}"
      end
    end
  end
end
end

0 comments on commit 4e4189f

Please sign in to comment.