diff --git a/app/domain/etl/ga/internal_search_service.rb b/app/domain/etl/ga/internal_search_service.rb
index 58ce322d2..e56fbe489 100644
--- a/app/domain/etl/ga/internal_search_service.rb
+++ b/app/domain/etl/ga/internal_search_service.rb
@@ -6,75 +6,40 @@ def self.find_in_batches(*args, **kwargs, &block)
   def find_in_batches(date:, batch_size: 10_000, &block)
     fetch_data(date:)
       .lazy
-      .map(&:to_h)
-      .flat_map(&method(:extract_rows))
-      .map(&method(:extract_dimensions_and_metrics))
+      .map(&:stringify_keys)
       .map(&method(:append_labels))
-      .map { |hash| set_date(hash, date) }
       .each_slice(batch_size, &block)
   end
 
   def client
-    @client ||= Etl::GA::Client.build
+    @client ||= Etl::GA::Bigquery.build
   end
 
 private
 
-  def set_date(hash, date)
-    hash["date"] = date.strftime("%F")
-    hash
-  end
-
-  def append_labels(values)
-    page_path, searches = *values
-    {
-      "page_path" => page_path,
-      "searches" => searches,
-      "process_name" => "searches",
-    }
-  end
-
-  def extract_dimensions_and_metrics(row)
-    dimensions = row.fetch(:dimensions)
-    metrics = row.fetch(:metrics).flat_map do |metric|
-      metric.fetch(:values).map(&:to_i)
-    end
-
-    dimensions + metrics
-  end
-
-  def extract_rows(report)
-    report.fetch(:rows)
+  def append_labels(hash)
+    hash.merge("process_name" => "searches")
   end
 
   def fetch_data(date:)
-    @fetch_data ||= client.fetch_all(items: :data) do |page_token, service|
-      service
-        .batch_get_reports(
-          Google::Apis::AnalyticsreportingV4::GetReportsRequest.new(
-            report_requests: [build_request(date:).merge(page_token:)],
-          ),
-        )
-        .reports
-        .first
-    end
-  end
-
-  def build_request(date:)
-    {
-      date_ranges: [
-        { start_date: date.to_fs("%Y-%m-%d"), end_date: date.to_fs("%Y-%m-%d") },
-      ],
-      dimensions: [
-        { name: "ga:searchStartPage" },
-      ],
-      hide_totals: true,
-      hide_value_ranges: true,
-      metrics: [
-        { expression: "ga:searchUniques" },
-      ],
-      page_size: 10_000,
-      view_id: ENV["GOOGLE_ANALYTICS_GOVUK_VIEW_ID"],
-    }
+    # Use locals (not instance variables) so no per-call state leaks between
+    # invocations; `client` already memoises the BigQuery project handle.
+    bigquery = client
+    formatted_date = date.strftime("%Y-%m-%d")
+
+    query = <<~SQL
+      WITH CTE1 AS (
+        SELECT *
+        FROM `govuk-content-data.ga4.GA4 dataform`
+        WHERE the_date = @date
+      )
+      SELECT
+        cleaned_page_location AS page_path,
+        total_searches AS searches,
+        the_date AS date,
+      FROM CTE1
+    SQL
+
+    bigquery.query(query, params: { date: formatted_date }).all
   end
 end
diff --git a/spec/domain/etl/ga/internal_search_processor_spec.rb b/spec/domain/etl/ga/internal_search_processor_spec.rb
index a8279fa49..215484f87 100644
--- a/spec/domain/etl/ga/internal_search_processor_spec.rb
+++ b/spec/domain/etl/ga/internal_search_processor_spec.rb
@@ -75,6 +75,61 @@
     end
   end
 
+  context "When the GA path contains the gov.uk prefix" do
+    before { allow(Etl::GA::InternalSearchService).to receive(:find_in_batches).and_yield(ga_response_with_govuk_prefix) }
+
+    context "when an event does not already exist with the same page_path" do
+      it "updates the fact with the GA metrics" do
+        edition1 = create :edition, date: "2018-02-20", base_path: "/Path1"
+        fact1 = create :metric, edition: edition1, date: "2018-02-20"
+
+        described_class.process(date:)
+
+        expect(fact1.reload).to have_attributes(searches: 1)
+      end
+    end
+
+    context "when an event already exists with the same page_path" do
+      before do
+        create(:ga_event, :with_views, date: "2018-02-20", page_path: "/path1")
+      end
+
+      it "updates the fact with the aggregated GA metrics" do
+        edition1 = create :edition, date: "2018-02-20", base_path: "/Path1"
+        fact1 = create :metric, edition: edition1, date: "2018-02-20"
+
+        described_class.process(date:)
+
+        expect(fact1.reload).to have_attributes(searches: 1)
+      end
+
+      it "does not update metrics for other days" do
+        edition1 = create :edition, date: "2018-02-20", base_path: "/Path1"
+        fact1 = create :metric, edition: edition1, date: "2018-02-20", searches: 1
+
+        day_before = date - 1
+        described_class.process(date: day_before)
+
+        expect(fact1.reload).to have_attributes(searches: 1)
+      end
+
+      it "does not update metrics for other items" do
+        edition = create :edition, base_path: "/non-matching-path", date: "2018-02-20"
+        fact = create :metric, edition:, date: "2018-02-20", searches: 1
+
+        described_class.process(date:)
+
+        expect(fact.reload).to have_attributes(searches: 1)
+      end
+
+      it "deletes events after updating facts metrics" do
+        described_class.process(date:)
+
+        expect(Events::GA.count).to eq(0)
+      end
+    end
+  end
+
   it_behaves_like "traps and logs errors in process", Etl::GA::InternalSearchService, :find_in_batches
 
 private
diff --git a/spec/domain/etl/ga/internal_search_service_spec.rb b/spec/domain/etl/ga/internal_search_service_spec.rb
index c18058108..43e9d65fd 100644
--- a/spec/domain/etl/ga/internal_search_service_spec.rb
+++ b/spec/domain/etl/ga/internal_search_service_spec.rb
@@ -1,32 +1,23 @@
-require "google/apis/analyticsreporting_v4"
-
 RSpec.describe Etl::GA::InternalSearchService do
-  include GoogleAnalyticsRequests
-
   subject { Etl::GA::InternalSearchService }
 
-  let(:google_client) { instance_double(Google::Apis::AnalyticsreportingV4::AnalyticsReportingService) }
-  before { expect(Etl::GA::Client).to receive(:build).and_return(google_client) }
+  let(:google_client) { instance_double(Google::Cloud::Bigquery::Project) }
+  let(:results) { instance_double(Google::Cloud::Bigquery::Data) }
+
+  before do
+    allow(Etl::GA::Bigquery).to receive(:build).and_return(google_client)
+    allow(google_client).to receive(:query).and_return(results)
+
+    allow(results).to receive(:all).and_return([
+      { "page_path" => "/foo", "searches" => 1, "date" => "2018-02-20" },
+      { "page_path" => "/bar", "searches" => 2, "upviews" => 2, "date" => "2018-02-20" },
+      { "page_path" => "/cool", "searches" => 3, "upviews" => 3, "date" => "2018-02-20" },
+    ])
+  end
 
   describe "#find_in_batches" do
     let(:date) { Date.new(2018, 2, 20) }
 
-    before do
-      allow(google_client).to receive(:fetch_all) do
-        [
-          build_report_data(
-            build_report_row(dimensions: %w[/foo], metrics: %w[1]),
-          ),
-          build_report_data(
-            build_report_row(dimensions: %w[/bar], metrics: %w[2]),
-          ),
-          build_report_data(
-            build_report_row(dimensions: %w[/cool], metrics: %w[3]),
-          ),
-        ]
-      end
-    end
-
     context "when #find_in_batches is called with a block" do
       it "yields successive report data" do
         arg1 = [
@@ -53,6 +44,18 @@
         expect { |probe| subject.find_in_batches(date:, batch_size: 2, &probe) }
           .to yield_successive_args(arg1, arg2)
       end
+
+      it "includes the process name" do
+        arg1 = [
+          a_hash_including("process_name" => "searches"),
+          a_hash_including("process_name" => "searches"),
+        ]
+        arg2 = [
+          a_hash_including("process_name" => "searches"),
+        ]
+        expect { |probe| subject.find_in_batches(date:, batch_size: 2, &probe) }
+          .to yield_successive_args(arg1, arg2)
+      end
     end
   end
 end