diff --git a/Gemfile b/Gemfile index dbd7859a9a..ac767a5319 100644 --- a/Gemfile +++ b/Gemfile @@ -56,7 +56,9 @@ gem "lograge" # For distributed tracing and telemetry gem "opentelemetry-exporter-otlp", "~> 0.34.0" +gem "opentelemetry-exporter-otlp-metrics", "~> 0.10.0" gem "opentelemetry-instrumentation-all", "~> 0.94.0" +gem "opentelemetry-metrics-sdk", "~> 0.15.0" gem "opentelemetry-propagator-xray", "~> 0.27.0" gem "opentelemetry-sdk", "~> 1.12" diff --git a/Gemfile.lock b/Gemfile.lock index fbb322f484..97d9774026 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -383,6 +383,15 @@ GEM opentelemetry-common (~> 0.20) opentelemetry-sdk (~> 1.10) opentelemetry-semantic_conventions + opentelemetry-exporter-otlp-metrics (0.10.0) + google-protobuf (>= 3.18, < 5.0) + googleapis-common-protos-types (~> 1.3) + opentelemetry-api (~> 1.1) + opentelemetry-common (~> 0.20) + opentelemetry-metrics-api (~> 0.2) + opentelemetry-metrics-sdk (~> 0.5) + opentelemetry-sdk (~> 1.2) + opentelemetry-semantic_conventions opentelemetry-helpers-mysql (0.6.0) opentelemetry-api (~> 1.7) opentelemetry-common (~> 0.21) @@ -537,6 +546,12 @@ GEM opentelemetry-helpers-sql-processor opentelemetry-instrumentation-base (~> 0.25) opentelemetry-semantic_conventions (>= 1.8.0) + opentelemetry-metrics-api (0.6.0) + opentelemetry-api (~> 1.0) + opentelemetry-metrics-sdk (0.15.0) + opentelemetry-api (~> 1.1) + opentelemetry-metrics-api (~> 0.2) + opentelemetry-sdk (~> 1.2) opentelemetry-propagator-xray (0.27.0) opentelemetry-api (~> 1.7) opentelemetry-registry (0.6.0) @@ -838,7 +853,9 @@ DEPENDENCIES omniauth-auth0 omniauth-rails_csrf_protection opentelemetry-exporter-otlp (~> 0.34.0) + opentelemetry-exporter-otlp-metrics (~> 0.10.0) opentelemetry-instrumentation-all (~> 0.94.0) + opentelemetry-metrics-sdk (~> 0.15.0) opentelemetry-propagator-xray (~> 0.27.0) opentelemetry-sdk (~> 1.12) pagy @@ -994,6 +1011,7 @@ CHECKSUMS opentelemetry-api (1.10.0) 
# app/services/metrics/form_count_service.rb
module Metrics
  # Records the current number of forms per organisation and reported state as
  # an OpenTelemetry gauge ("FormCount") and flushes it through the globally
  # configured meter provider.
  #
  # Forms in the "deleted" state and forms owned by internal organisations are
  # left out. Forms without an organisation are reported under UNKNOWN_ORG.
  class FormCountService
    # Raised when the meter provider's flush returns anything other than SUCCESS.
    class ExportError < StandardError; end

    METRIC_NAME = "FormCount".freeze
    METER_NAME = "forms-admin".freeze
    METER_VERSION = "1.0".freeze
    UNKNOWN_ORG = "Unknown".freeze
    METRIC_STATES = %w[draft live archived].freeze

    # Collapses the persisted form states onto the states in METRIC_STATES.
    FORM_STATE_TO_METRIC_STATE = {
      "draft" => "draft",
      "live" => "live",
      "live_with_draft" => "live",
      "archived" => "archived",
      "archived_with_draft" => "archived",
    }.freeze

    # Records one gauge data point per (organisation, state) pair and exports them.
    #
    # @raise [ExportError] when the flush does not report success
    # @raise [StandardError] any other failure; every error is sent to Sentry
    #   before being re-raised
    def publish_form_counts
      counts = form_counts_by_org_and_state

      counts.each do |(org, state), count|
        form_count_gauge.record(count, attributes: metric_attributes(org:, state:))
      end

      if export_metrics!
        Rails.logger.info "Published #{counts.size} form count metrics via OpenTelemetry"
      else
        # Do not claim a publish happened when there was nothing to export through.
        Rails.logger.warn "Skipped export of #{counts.size} form count metrics: no OpenTelemetry metric reader is configured"
      end
    rescue StandardError => e
      Sentry.capture_exception(e)
      raise
    end

    private

    # Returns a Hash of [org_name, metric_state] => count, padded with zero
    # entries so every non-internal organisation has a value for each state.
    def form_counts_by_org_and_state
      totals = counted_form_totals
      organisation_names.each { |org_name| ensure_all_metric_states(totals, org_name) }
      # Only pad the "Unknown" bucket when at least one organisation-less form exists.
      ensure_all_metric_states(totals, UNKNOWN_ORG) if totals.each_key.any? { |(org, _state)| org == UNKNOWN_ORG }
      totals
    end

    # Counts non-deleted forms grouped by organisation name, form state and the
    # organisation's internal flag, then folds the rows into metric states.
    def counted_form_totals
      grouped_counts = Form
        .where.not(state: :deleted)
        .left_joins(group_form: { group: :organisation })
        .group(Organisation.arel_table[:name], Form.arel_table[:state], Organisation.arel_table[:internal])
        .count

      grouped_counts.each_with_object(Hash.new(0)) do |((org_name, state, internal), count), totals|
        # `internal` is nil for forms with no organisation (left join), so compare explicitly.
        next if internal == true # Skip internal organisations for metrics

        reported_state = metric_state(state)
        if reported_state.nil?
          # A nil "State" attribute is not a valid metric attribute value; skip rather than emit it.
          Rails.logger.warn "Skipping #{count} forms with unrecognised state #{state.inspect} in form count metrics"
          next
        end

        totals[[org_name || UNKNOWN_ORG, reported_state]] += count
      end
    end

    # Names of every organisation that is not flagged as internal.
    def organisation_names
      Organisation.where(internal: false).pluck(:name)
    end

    # Makes sure `totals` has an entry for each metric state of `org_name`.
    # Relies on `totals` defaulting missing keys to 0.
    def ensure_all_metric_states(totals, org_name)
      METRIC_STATES.each { |state| totals[[org_name, state]] += 0 }
    end

    # Maps a persisted form state to its metric state, or nil when unmapped.
    def metric_state(state)
      FORM_STATE_TO_METRIC_STATE[state.to_s]
    end

    # Attribute set attached to each recorded data point.
    def metric_attributes(org:, state:)
      {
        "Environment" => Settings.forms_env.downcase,
        "Org" => org,
        "State" => state,
      }
    end

    def form_count_gauge
      @form_count_gauge ||= meter.create_gauge(
        METRIC_NAME,
        unit: "1",
        description: "Count of forms grouped by organisation and state",
      )
    end

    def meter
      OpenTelemetry.meter_provider.meter(METER_NAME, version: METER_VERSION)
    end

    # Flushes recorded metrics through the global meter provider.
    #
    # Returns true when the flush succeeded and false when there was nothing to
    # flush through: either the provider has no metric readers, or it does not
    # expose them at all (presumably the case for the API placeholder provider
    # when the SDK was never configured - confirm against the gem).
    #
    # @raise [ExportError] when the flush result is not SUCCESS
    def export_metrics!
      provider = OpenTelemetry.meter_provider
      return false unless provider.respond_to?(:metric_readers) && provider.metric_readers.any?

      result = provider.force_flush
      return true if result == OpenTelemetry::SDK::Metrics::Export::SUCCESS

      raise ExportError, "OpenTelemetry metrics export failed with result code #{result}"
    end
  end
end
# --- lib/tasks/metrics.rake ---
namespace :metrics do
  desc "Export form counts as OpenTelemetry metrics grouped by organisation and state"
  task export_form_counts: :environment do
    form_count_service = Metrics::FormCountService.new
    form_count_service.publish_form_counts
  end
end

# --- spec/lib/tasks/metrics.rake_spec.rb ---
require "rails_helper"

RSpec.describe "metrics.rake", type: :task do
  describe "metrics:export_form_counts" do
    subject(:task) { Rake::Task["metrics:export_form_counts"] }

    let(:form_count_service) { instance_double(Metrics::FormCountService) }

    before do
      allow(Metrics::FormCountService).to receive(:new).and_return(form_count_service)
    end

    it "publishes form counts via Metrics::FormCountService" do
      expect(form_count_service).to receive(:publish_form_counts)

      task.invoke
    end
  end
end

# --- spec/services/metrics/form_count_service_spec.rb ---
require "rails_helper"
require "opentelemetry-metrics-sdk"

describe Metrics::FormCountService do
  subject(:service) { described_class.new }

  let(:forms_env) { "test" }
  let(:metric_exporter) { OpenTelemetry::SDK::Metrics::Export::InMemoryMetricPullExporter.new }
  let(:organisation) { create(:organisation, name: "Department for Testing") }
  let(:group) { create(:group, organisation:) }
  let!(:original_meter_provider) { OpenTelemetry.meter_provider }

  before do
    allow(Settings).to receive(:forms_env).and_return(forms_env)

    # Swap in a provider whose only reader feeds the in-memory exporter, so the
    # examples can inspect what the service flushed.
    OpenTelemetry.meter_provider = OpenTelemetry::SDK::Metrics::MeterProvider.new.tap do |provider|
      provider.add_metric_reader(
        OpenTelemetry::SDK::Metrics::Export::PeriodicMetricReader.new(
          export_interval_millis: 60_000,
          exporter: metric_exporter,
        ),
      )
    end
  end

  after do
    OpenTelemetry.meter_provider.shutdown
    OpenTelemetry.meter_provider = original_meter_provider
  end

  around do |example|
    travel_to(Time.zone.local(2026, 6, 3, 12, 0, 0)) { example.run }
  end

  describe "#publish_form_counts" do
    before do
      Form.destroy_all

      # Use explicit states instead of :live/:archived traits — those pull in :with_pages,
      # and each page factory creates its own :form, inflating counts.
      create(:form, :with_group, group:, state: :draft)
      %i[live live_with_draft archived archived_with_draft].each do |state|
        create(:form, :with_group, group:, state:, pages: [])
      end
      create(:form, state: :draft)
    end

    it "publishes grouped form counts via OpenTelemetry" do
      service.publish_form_counts

      expect(exported_data_points).to contain_exactly(*baseline_data_points)
    end

    context "when an organisation has no forms" do
      let(:empty_organisation) { create(:organisation, name: "Empty Org", slug: "empty-org") }

      before { empty_organisation }

      it "publishes zero counts for each state" do
        service.publish_form_counts

        expect(exported_data_points).to contain_exactly(
          *baseline_data_points,
          metric_data_point(org: empty_organisation.name, state: "draft", count: 0),
          metric_data_point(org: empty_organisation.name, state: "live", count: 0),
          metric_data_point(org: empty_organisation.name, state: "archived", count: 0),
        )
      end
    end

    context "when an organisation is internal" do
      let(:internal_organisation) { create(:organisation, name: "Internal Org", slug: "internal-org", internal: true) }
      let(:internal_group) { create(:group, organisation: internal_organisation) }

      before do
        create(:form, :with_group, group: internal_group, state: :draft)
        create(:form, :with_group, group: internal_group, state: :live, pages: [])
      end

      it "excludes forms belonging to internal organisations" do
        service.publish_form_counts

        expect(exported_data_points).to contain_exactly(*baseline_data_points)
      end
    end

    context "when OpenTelemetry export fails" do
      before do
        allow(OpenTelemetry.meter_provider).to receive(:force_flush)
          .and_return(OpenTelemetry::SDK::Metrics::Export::FAILURE)
      end

      it "captures the exception and re-raises" do
        expect(Sentry).to receive(:capture_exception).with(instance_of(Metrics::FormCountService::ExportError))

        expect { service.publish_form_counts }.to raise_error(Metrics::FormCountService::ExportError)
      end
    end
  end

  # Data points expected from the forms created in the "#publish_form_counts" setup:
  # five forms in `organisation` plus one draft form without an organisation.
  def baseline_data_points
    [
      metric_data_point(org: organisation.name, state: "draft", count: 1),
      metric_data_point(org: organisation.name, state: "live", count: 2),
      metric_data_point(org: organisation.name, state: "archived", count: 2),
      metric_data_point(org: "Unknown", state: "draft", count: 1),
      metric_data_point(org: "Unknown", state: "live", count: 0),
      metric_data_point(org: "Unknown", state: "archived", count: 0),
    ]
  end

  # Every data point held by the in-memory exporter, across all snapshots.
  def exported_data_points
    snapshots = metric_exporter.metric_snapshots
    snapshots.flat_map(&:data_points)
  end

  # Matcher for a single data point with the given attributes and value.
  def metric_data_point(org:, state:, count:)
    expected_attributes = {
      "Environment" => forms_env,
      "Org" => org,
      "State" => state,
    }

    have_attributes(attributes: expected_attributes, value: count)
  end
end