Skip to content

Commit

Permalink
Update Internal Search service to pull metrics from GA4
Browse files Browse the repository at this point in the history
Instead of fetching data from UA we now use the BigQuery API which
allows us to query the table for the specific metrics we need from
GA4. It already returns the information we need in the correct format
and reduces the amount of work required post query.

`ga_response_with_govuk_prefix` has not been used in any of the tests and
looks like it was missed previously. We should expect paths that
contain the gov.uk prefix to be transformed into their base paths. If an
event already exists with the same path in the Events::GA table, it
should update the entry with the aggregate total for the metrics, which is
then reflected in the Facts::Metrics table.
  • Loading branch information
deborahchua committed Mar 28, 2024
1 parent 011f737 commit 3b29dd7
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 80 deletions.
79 changes: 21 additions & 58 deletions app/domain/etl/ga/internal_search_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,75 +6,38 @@ def self.find_in_batches(*args, **kwargs, &block)
def find_in_batches(date:, batch_size: 10_000, &block)
fetch_data(date:)
.lazy
.map(&:to_h)
.flat_map(&method(:extract_rows))
.map(&method(:extract_dimensions_and_metrics))
.map(&:stringify_keys)
.map(&method(:append_labels))
.map { |hash| set_date(hash, date) }
.each_slice(batch_size, &block)
end

def client
@client ||= Etl::GA::Client.build
@client ||= Etl::GA::Bigquery.build
end

private

def set_date(hash, date)
hash["date"] = date.strftime("%F")
hash
end

def append_labels(values)
page_path, searches = *values
{
"page_path" => page_path,
"searches" => searches,
"process_name" => "searches",
}
end

def extract_dimensions_and_metrics(row)
dimensions = row.fetch(:dimensions)
metrics = row.fetch(:metrics).flat_map do |metric|
metric.fetch(:values).map(&:to_i)
end

dimensions + metrics
end

def extract_rows(report)
report.fetch(:rows)
def append_labels(hash)
hash.merge("process_name" => "searches")
end

def fetch_data(date:)
@fetch_data ||= client.fetch_all(items: :data) do |page_token, service|
service
.batch_get_reports(
Google::Apis::AnalyticsreportingV4::GetReportsRequest.new(
report_requests: [build_request(date:).merge(page_token:)],
),
)
.reports
.first
end
end

def build_request(date:)
{
date_ranges: [
{ start_date: date.to_fs("%Y-%m-%d"), end_date: date.to_fs("%Y-%m-%d") },
],
dimensions: [
{ name: "ga:searchStartPage" },
],
hide_totals: true,
hide_value_ranges: true,
metrics: [
{ expression: "ga:searchUniques" },
],
page_size: 10_000,
view_id: ENV["GOOGLE_ANALYTICS_GOVUK_VIEW_ID"],
}
@fetch_data ||= client
@date = date.strftime("%Y-%m-%d")

query = <<~SQL
WITH CTE1 AS (
SELECT *
FROM `govuk-content-data.ga4.GA4 dataform`
WHERE the_date = @date
)
SELECT
cleaned_page_location AS page_path,
total_searches AS searches,
the_date AS date,
FROM CTE1
SQL

@fetch_data.query(query, params: { date: @date }).all
end
end
55 changes: 55 additions & 0 deletions spec/domain/etl/ga/internal_search_processor_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,61 @@
end
end

# Exercises path normalisation: GA4 rows whose page paths carry the gov.uk
# prefix should be matched against the corresponding edition base_path, and
# aggregated with any pre-existing event for the same path and day.
context "When the GA path contains the gov.uk prefix" do
  before do
    allow(Etl::GA::InternalSearchService)
      .to receive(:find_in_batches)
      .and_yield(ga_response_with_govuk_prefix)
  end

  context "when an event does not already exist with the same page_path" do
    it "updates the fact with the GA metrics" do
      edition = create(:edition, date: "2018-02-20", base_path: "/Path1")
      fact = create(:metric, edition:, date: "2018-02-20")

      described_class.process(date:)

      expect(fact.reload).to have_attributes(searches: 1)
    end
  end

  context "when an event already exists with the same page_path" do
    # Seed a prior GA event for the same day and page so the service must
    # aggregate into the existing row rather than insert a new one.
    before { create(:ga_event, :with_views, date: "2018-02-20", page_path: "/path1") }

    it "updates the fact with the aggregated GA metrics" do
      edition = create(:edition, date: "2018-02-20", base_path: "/Path1")
      fact = create(:metric, edition:, date: "2018-02-20")

      described_class.process(date:)

      expect(fact.reload).to have_attributes(searches: 1)
    end

    it "does not update metrics for other days" do
      edition = create(:edition, date: "2018-02-20", base_path: "/Path1")
      fact = create(:metric, edition:, date: "2018-02-20", searches: 1)

      # Processing the previous day must leave this day's fact untouched.
      described_class.process(date: date - 1)

      expect(fact.reload).to have_attributes(searches: 1)
    end

    it "does not update metrics for other items" do
      other_edition = create(:edition, base_path: "/non-matching-path", date: "2018-02-20")
      other_fact = create(:metric, edition: other_edition, date: "2018-02-20", searches: 1)

      described_class.process(date:)

      expect(other_fact.reload).to have_attributes(searches: 1)
    end

    it "deletes events after updating facts metrics" do
      described_class.process(date:)

      expect(Events::GA.count).to eq(0)
    end
  end
end

it_behaves_like "traps and logs errors in process", Etl::GA::InternalSearchService, :find_in_batches

private
Expand Down
47 changes: 25 additions & 22 deletions spec/domain/etl/ga/internal_search_service_spec.rb
Original file line number Diff line number Diff line change
@@ -1,32 +1,23 @@
require "google/apis/analyticsreporting_v4"

RSpec.describe Etl::GA::InternalSearchService do
include GoogleAnalyticsRequests

subject { Etl::GA::InternalSearchService }

let(:google_client) { instance_double(Google::Apis::AnalyticsreportingV4::AnalyticsReportingService) }
before { expect(Etl::GA::Client).to receive(:build).and_return(google_client) }
let(:google_client) { instance_double(Google::Cloud::Bigquery::Project) }
let(:results) { instance_double(Google::Cloud::Bigquery::Data) }

before do
allow(Etl::GA::Bigquery).to receive(:build).and_return(google_client)
allow(google_client).to receive(:query).and_return(results)

allow(results).to receive(:all).and_return([
{ "page_path" => "/foo", "searches" => 1, "date" => "2018-02-20" },
{ "page_path" => "/bar", "searches" => 2, "upviews" => 2, "date" => "2018-02-20" },
{ "page_path" => "/cool", "searches" => 3, "upviews" => 3, "date" => "2018-02-20" },
])
end

describe "#find_in_batches" do
let(:date) { Date.new(2018, 2, 20) }

before do
allow(google_client).to receive(:fetch_all) do
[
build_report_data(
build_report_row(dimensions: %w[/foo], metrics: %w[1]),
),
build_report_data(
build_report_row(dimensions: %w[/bar], metrics: %w[2]),
),
build_report_data(
build_report_row(dimensions: %w[/cool], metrics: %w[3]),
),
]
end
end

context "when #find_in_batches is called with a block" do
it "yields successive report data" do
arg1 = [
Expand All @@ -53,6 +44,18 @@
expect { |probe| subject.find_in_batches(date:, batch_size: 2, &probe) }
.to yield_successive_args(arg1, arg2)
end

it "includes the process name" do
arg1 = [
a_hash_including("process_name" => "searches"),
a_hash_including("process_name" => "searches"),
]
arg2 = [
a_hash_including("process_name" => "searches"),
]
expect { |probe| subject.find_in_batches(date:, batch_size: 2, &probe) }
.to yield_successive_args(arg1, arg2)
end
end
end
end

0 comments on commit 3b29dd7

Please sign in to comment.