Skip to content

Commit 9983338

Browse files
Merge pull request #5430 from nhsuk/prometheus_exports_to_cloudwatch_metrics
MAV-2918: Prometheus exporter setup with cloudwatch
2 parents aa45c29 + bba670e commit 9983338

16 files changed

Lines changed: 232 additions & 14 deletions

File tree

Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ gem "omniauth_openid_connect"
5252
gem "omniauth-rails_csrf_protection"
5353
gem "pagy"
5454
gem "phonelib"
55+
gem "prometheus_exporter"
5556
gem "pstore"
5657
gem "pundit"
5758
gem "rails_semantic_logger"

Gemfile.lock

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,8 @@ GEM
463463
prettier_print (1.2.1)
464464
prettyprint (0.2.0)
465465
prism (1.8.0)
466+
prometheus_exporter (2.3.1)
467+
webrick
466468
propshaft (1.3.1)
467469
actionpack (>= 7.0.0)
468470
activesupport (>= 7.0.0)
@@ -852,6 +854,7 @@ DEPENDENCIES
852854
pg
853855
phonelib
854856
prettier_print
857+
prometheus_exporter
855858
propshaft
856859
pry-rails
857860
pstore

bin/docker-start

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,13 @@ BIN_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
44

55
if [ "$SERVER_TYPE" == "web" ]; then
66
echo "Starting web server..."
7+
"$BIN_DIR"/prometheus_exporter &
8+
sleep 5
79
exec "$BIN_DIR"/thrust "$BIN_DIR"/rails server
810
elif [ "$SERVER_TYPE" == "sidekiq" ]; then
911
echo "Starting sidekiq server..."
12+
"$BIN_DIR"/prometheus_exporter &
13+
sleep 5
1014
exec "$BIN_DIR"/sidekiq
1115
elif [ "$SERVER_TYPE" == "none" ]; then
1216
echo "No server started"

bin/prometheus_exporter

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env bash
2+
3+
TASK_DETAILS=$(curl ${ECS_CONTAINER_METADATA_URI_V4})
4+
TASK_ID=$(echo "$TASK_DETAILS" | jq -r '.Labels["com.amazonaws.ecs.task-arn"] | split("/") | last')
5+
6+
task_details() {
7+
curl -sSf ${ECS_CONTAINER_METADATA_URI_V4}
8+
}
9+
10+
MAX_RETRIES=12
11+
RETRY_DELAY=10
12+
TASK_DETAILS=""
13+
14+
for ((i=1; i<=MAX_RETRIES; i++)); do
15+
if TASK_DETAILS=$(task_details); then
16+
break
17+
fi
18+
if [[ $i -eq $MAX_RETRIES ]]; then
19+
echo "Failed to fetch task details after $MAX_RETRIES attempts" >&2
20+
exit 1
21+
fi
22+
sleep $RETRY_DELAY
23+
done
24+
25+
TASK_ID=$(echo "$TASK_DETAILS" | jq '.Labels["com.amazonaws.ecs.task-arn"] | split("/") | last')
26+
27+
bundle exec prometheus_exporter \
28+
--port 9394 \
29+
--bind 0.0.0.0 \
30+
--label "{\"TaskId\": ${TASK_ID}, \"ServiceName\": \"${SERVICE_NAME}\"}"

config/initializers/sidekiq.rb

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,25 @@
99
redis_config[:timeout] = 10
1010
end
1111

12-
Sidekiq.configure_server do |config|
  config.redis = redis_config

  # Opt-in Prometheus instrumentation for the Sidekiq server process.
  if ENV["EXPORT_SIDEKIQ_METRICS"] == "true"
    require "prometheus_exporter/instrumentation"

    # Per-job metrics (duration, success/failure) via server middleware.
    config.server_middleware { |chain| chain.add PrometheusExporter::Instrumentation::Sidekiq }

    # Count jobs that exhaust their retries and die.
    config.death_handlers << PrometheusExporter::Instrumentation::Sidekiq.death_handler

    # Process/queue/stats collectors start once the server has booted.
    config.on :startup do
      PrometheusExporter::Instrumentation::Process.start type: "sidekiq"
      PrometheusExporter::Instrumentation::SidekiqProcess.start
      PrometheusExporter::Instrumentation::SidekiqQueue.start
      PrometheusExporter::Instrumentation::SidekiqStats.start
    end

    # Flush any buffered metrics to the exporter before the process exits.
    at_exit { PrometheusExporter::Client.default.stop(wait_timeout_seconds: 10) }
  end
end
1331

1432
Sidekiq.configure_client { |config| config.redis = redis_config }
1533

config/puma.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,14 @@
6060

6161
# Re-open appenders after forking the process; needed for Semantic Logger
6262
before_worker_boot { SemanticLogger.reopen }
63+
64+
# Export puma metrics
65+
if ENV["EXPORT_PUMA_METRICS"] == "true"
66+
after_worker_boot do
67+
require "prometheus_exporter/instrumentation"
68+
# optional check, avoids spinning up and down threads per worker
69+
unless PrometheusExporter::Instrumentation::Puma.started?
70+
PrometheusExporter::Instrumentation::Puma.start
71+
end
72+
end
73+
end

script/shell.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ describe_tasks() {
2424
aws ecs describe-tasks --region "$region" --cluster "$cluster_name" --tasks $task_arns
2525
}
2626

27-
select_running_container() {
27+
select_running_container() {
2828
local task_data="$1"
29-
echo "$task_data" | jq -r '.containers | map(select(.lastStatus == "RUNNING" and .runtimeId != null))[0].name'
29+
echo "$task_data" | jq -r '.containers | map(select(.lastStatus == "RUNNING" and .name == "application"))[0].name'
3030
}
3131

3232
if [ "$1" = "--help" ]; then

terraform/account/iam.tf

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@ resource "aws_iam_role_policy_attachment" "ecs_task_fargate" {
88
policy_arn = aws_iam_policy.session_manager_access.arn
99
}
1010

11+
resource "aws_iam_role_policy_attachment" "ecs_task_cloudwatch_agent" {
12+
role = aws_iam_role.ecs_task_role.name
13+
policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
14+
}
15+
16+
1117
resource "aws_iam_role_policy_attachment" "get_s3_object" {
1218
role = aws_iam_role.ecs_task_role.name
1319
policy_arn = aws_iam_policy.get_s3_object.arn
@@ -113,4 +119,4 @@ resource "aws_iam_policy" "get_s3_object" {
113119
lifecycle {
114120
ignore_changes = [description]
115121
}
116-
}
122+
}

terraform/app/ecs.tf

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ data "aws_iam_role" "ecs_task_role" {
6464
module "web_service" {
6565
source = "./modules/ecs_service"
6666
task_config = {
67-
environment = local.task_envs["CORE"]
67+
environment = local.web_envs
6868
secrets = local.task_secrets["CORE"]
6969
cpu = 2048
7070
memory = 4096
@@ -74,6 +74,17 @@ module "web_service" {
7474
region = var.region
7575
health_check_command = ["CMD-SHELL", "./bin/internal_healthcheck http://localhost:${local.container_ports.web}/health/database"]
7676
}
77+
export_prometheus_metrics = local.export_prometheus_metrics
78+
cloudwatch_agent_secrets = [
79+
{
80+
"name" : "PROMETHEUS_CONFIG_CONTENT",
81+
"valueFrom" : aws_ssm_parameter.prometheus_config.arn
82+
},
83+
{
84+
"name" : "CW_CONFIG_CONTENT",
85+
"valueFrom" : aws_ssm_parameter.cloudwatch_agent_config.arn
86+
}
87+
]
7788
network_params = {
7889
subnets = [aws_subnet.private_subnet_a.id, aws_subnet.private_subnet_b.id]
7990
vpc_id = aws_vpc.application_vpc.id
@@ -122,7 +133,7 @@ module "web_service" {
122133
module "sidekiq_service" {
123134
source = "./modules/ecs_service"
124135
task_config = {
125-
environment = local.task_envs["CORE"]
136+
environment = local.sidekiq_envs
126137
secrets = local.task_secrets["CORE"]
127138
cpu = 1024
128139
memory = 6144
@@ -132,6 +143,17 @@ module "sidekiq_service" {
132143
region = var.region
133144
health_check_command = ["CMD-SHELL", "./bin/internal_healthcheck && grep -q '[s]idekiq' /proc/*/cmdline 2>/dev/null || exit 1"]
134145
}
146+
export_prometheus_metrics = local.export_prometheus_metrics
147+
cloudwatch_agent_secrets = [
148+
{
149+
"name" : "PROMETHEUS_CONFIG_CONTENT",
150+
"valueFrom" : aws_ssm_parameter.prometheus_config.arn
151+
},
152+
{
153+
"name" : "CW_CONFIG_CONTENT",
154+
"valueFrom" : aws_ssm_parameter.cloudwatch_agent_config.arn
155+
}
156+
]
135157
network_params = {
136158
subnets = [aws_subnet.private_subnet_a.id, aws_subnet.private_subnet_b.id]
137159
vpc_id = aws_vpc.application_vpc.id

terraform/app/iam_policy_documents.tf

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,13 @@ data "aws_iam_policy_document" "ecs_secrets_access" {
1616
dynamic "statement" {
1717
for_each = length(local.parameter_values[each.key]) == 0 ? [] : [1]
1818
content {
19-
sid = "ssmParameterStoreAccessSid"
20-
actions = ["ssm:GetParameters"]
21-
resources = [for kv_pair in local.parameter_values[each.key] : kv_pair["valueFrom"]]
22-
effect = "Allow"
19+
sid = "ssmParameterStoreAccessSid"
20+
actions = ["ssm:GetParameters"]
21+
resources = concat(
22+
[for kv_pair in local.parameter_values[each.key] : kv_pair["valueFrom"]],
23+
[aws_ssm_parameter.prometheus_config.arn, aws_ssm_parameter.cloudwatch_agent_config.arn]
24+
)
25+
effect = "Allow"
2326
}
2427
}
2528
dynamic "statement" {

0 commit comments

Comments (0)