Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions .env.gcp.template
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,17 @@ TERRAFORM_ENVIRONMENT=dev
# "disk_type": "local-ssd", // Cache disk type (local-ssd or pd-ssd)
# "size_gb": 375, // Cache disk size in GB
# "count": 1 // Number of cache disks
# }
# },
# "subnetwork_name": "", // (optional) Subnetwork override for this cluster entry
# "network_tag": "" // (optional) Additional network tag for this cluster entry
# }
# Build cluster configuration
# Example:
BUILD_CLUSTERS_CONFIG='{"default": {"cluster_size": 1, "machine":{"type":"n1-standard-8","min_cpu_platform":"Intel Skylake"}, "boot_disk":{"disk_type":"pd-ssd","size_gb":200}, "cache_disks":{"disk_type":"local-ssd","size_gb":375,"count":1}}}'
BUILD_CLUSTERS_CONFIG='{"default": {"cluster_size": 1, "machine":{"type":"n1-standard-8","min_cpu_platform":"Intel Skylake"}, "boot_disk":{"disk_type":"pd-ssd","size_gb":200}, "cache_disks":{"disk_type":"local-ssd","size_gb":375,"count":1}, "subnetwork_name":"", "network_tag":""}}'

# Client cluster configuration
# Example:
CLIENT_CLUSTERS_CONFIG='{"default": {"cluster_size": 1, "hugepages_percentage": 80, "machine":{"type":"n1-standard-8","min_cpu_platform":"Intel Skylake"}, "autoscaler": {"size_max": 2, "memory_target": 100, "cpu_target": 0.7}, "boot_disk":{"disk_type":"pd-ssd","size_gb":200}, "cache_disks":{"disk_type":"local-ssd","size_gb":375,"count":1}}}'
CLIENT_CLUSTERS_CONFIG='{"default": {"cluster_size": 1, "hugepages_percentage": 80, "machine":{"type":"n1-standard-8","min_cpu_platform":"Intel Skylake"}, "autoscaler": {"size_max": 2, "memory_target": 100, "cpu_target": 0.7}, "boot_disk":{"disk_type":"pd-ssd","size_gb":200}, "cache_disks":{"disk_type":"local-ssd","size_gb":375,"count":1}, "subnetwork_name":"", "network_tag":""}}'

# This is the Nomad and Consul server (only for scheduling and service discovery)
# e.g. e2-standard-2
Expand Down Expand Up @@ -82,6 +84,18 @@ ENABLE_AUTH_USER_SYNC_BACKGROUND_WORKER=
# Enable dashboard-api billing/team provisioning sink (default: false)
ENABLE_BILLING_HTTP_TEAM_PROVISION_SINK=

# Per-pool subnetwork overrides (default: network's auto-subnet)
SERVER_SUBNETWORK_NAME=
API_SUBNETWORK_NAME=
CLICKHOUSE_SUBNETWORK_NAME=
LOKI_SUBNETWORK_NAME=

# Per-pool network tag overrides (added alongside default "orch" tag)
SERVER_NETWORK_TAG=
API_NETWORK_TAG=
CLICKHOUSE_NETWORK_TAG=
LOKI_NETWORK_TAG=

# Filestore cache for builds shared across cluster (default: false)
FILESTORE_CACHE_ENABLED=
# BASIC_HDD for staging+dev, ZONAL for production
Expand Down
10 changes: 9 additions & 1 deletion iac/provider-gcp/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,15 @@ tf_vars := \
$(call tfvar, AUTH_DB_MIN_IDLE_CONNECTIONS) \
$(call tfvar, GCS_GRPC_CONNECTION_POOL_SIZE) \
$(call tfvar, ADDITIONAL_API_PATHS_HANDLED_BY_INGRESS) \
$(call tfvar, ANYWHERE_CACHE_ENABLED)
$(call tfvar, ANYWHERE_CACHE_ENABLED) \
$(call tfvar, SERVER_SUBNETWORK_NAME) \
$(call tfvar, API_SUBNETWORK_NAME) \
$(call tfvar, CLICKHOUSE_SUBNETWORK_NAME) \
$(call tfvar, LOKI_SUBNETWORK_NAME) \
$(call tfvar, SERVER_NETWORK_TAG) \
$(call tfvar, API_NETWORK_TAG) \
$(call tfvar, CLICKHOUSE_NETWORK_TAG) \
$(call tfvar, LOKI_NETWORK_TAG)

.PHONY: init
init:
Expand Down
12 changes: 12 additions & 0 deletions iac/provider-gcp/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,18 @@ module "cluster" {
clickhouse_boot_disk_type = var.clickhouse_boot_disk_type
loki_boot_disk_type = var.loki_boot_disk_type

# Per-pool subnetwork overrides
server_subnetwork_name = var.server_subnetwork_name
api_subnetwork_name = var.api_subnetwork_name
clickhouse_subnetwork_name = var.clickhouse_subnetwork_name
loki_subnetwork_name = var.loki_subnetwork_name

# Per-pool network tag overrides
server_network_tag = var.server_network_tag
api_network_tag = var.api_network_tag
clickhouse_network_tag = var.clickhouse_network_tag
loki_network_tag = var.loki_network_tag

# ClickHouse stateful data disk
clickhouse_stateful_disk_type = var.clickhouse_stateful_disk_type
clickhouse_stateful_disk_size_gb = var.clickhouse_stateful_disk_size_gb
Expand Down
16 changes: 16 additions & 0 deletions iac/provider-gcp/nomad-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ locals {
build_base_hugepages_percentage = 60
client_base_hugepages_percentage = 80

# Per-pool effective subnetwork names (null when not overridden).
# An empty-string override is mapped to null so that the `subnetwork` argument
# consuming these locals is left unset instead of being set to "".
server_subnetwork_name = var.server_subnetwork_name != "" ? var.server_subnetwork_name : null
api_subnetwork_name = var.api_subnetwork_name != "" ? var.api_subnetwork_name : null
clickhouse_subnetwork_name = var.clickhouse_subnetwork_name != "" ? var.clickhouse_subnetwork_name : null
loki_subnetwork_name = var.loki_subnetwork_name != "" ? var.loki_subnetwork_name : null

# Per-pool effective network tags: the cluster-wide tag plus the optional
# per-pool tag. compact() drops an empty per-pool tag, and distinct() avoids a
# duplicate entry when the per-pool tag equals the cluster tag.
server_network_tags = distinct(compact([var.cluster_tag_name, var.server_network_tag]))
api_network_tags = distinct(compact([var.cluster_tag_name, var.api_network_tag]))
clickhouse_network_tags = distinct(compact([var.cluster_tag_name, var.clickhouse_network_tag]))
loki_network_tags = distinct(compact([var.cluster_tag_name, var.loki_network_tag]))

nfs_mount_path = "/orchestrator/shared-store"
nfs_mount_subdir = "chunks-cache"
nfs_mount_opts = join(",", [ // for more docs, see https://linux.die.net/man/5/nfs
Expand Down Expand Up @@ -166,11 +178,13 @@ module "build_cluster" {
cluster_name = "${var.prefix}${var.build_cluster_name}-${each.key}"
image_family = var.build_image_family
network_name = var.network_name
subnetwork_name = each.value.subnetwork_name
base_hugepages_percentage = coalesce((each.value.hugepages_percentage), local.build_base_hugepages_percentage)
network_interface_type = each.value.network_interface_type
node_labels = each.value.node_labels

cluster_tag_name = var.cluster_tag_name
network_tag = each.value.network_tag
node_pool = var.build_node_pool
nomad_port = var.nomad_port
consul_acl_token_secret = var.consul_acl_token_secret
Expand Down Expand Up @@ -225,11 +239,13 @@ module "client_cluster" {
cluster_name = each.key == "default" ? "${var.prefix}${var.client_cluster_name}" : "${var.prefix}${var.client_cluster_name}-${each.key}"
image_family = var.client_image_family
network_name = var.network_name
subnetwork_name = each.value.subnetwork_name
base_hugepages_percentage = coalesce((each.value.hugepages_percentage), local.client_base_hugepages_percentage)
network_interface_type = each.value.network_interface_type
node_labels = each.value.node_labels

cluster_tag_name = var.cluster_tag_name
network_tag = each.value.network_tag
node_pool = var.orchestrator_node_pool
nomad_port = var.nomad_port
consul_acl_token_secret = var.consul_acl_token_secret
Expand Down
5 changes: 3 additions & 2 deletions iac/provider-gcp/nomad-cluster/nodepool-api.tf
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ resource "google_compute_instance_template" "api" {
goog-ops-agent-policy = "v2-x86-template-1-2-0-${var.gcp_zone}"
} : {})
)
tags = [var.cluster_tag_name]
tags = local.api_network_tags
metadata_startup_script = local.api_startup_script
metadata = merge(
{ api_cluster = "TRUE" },
Expand All @@ -134,7 +134,8 @@ resource "google_compute_instance_template" "api" {
}

network_interface {
network = var.network_name
network = var.network_name
subnetwork = local.api_subnetwork_name

dynamic "access_config" {
for_each = var.api_use_nat ? [] : ["public_ip"]
Expand Down
5 changes: 3 additions & 2 deletions iac/provider-gcp/nomad-cluster/nodepool-clickhouse.tf
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ resource "google_compute_instance_template" "clickhouse" {
labels = merge(
var.labels,
)
tags = [var.cluster_tag_name]
tags = local.clickhouse_network_tags
metadata_startup_script = local.clickhouse_start_script
metadata = {
enable-osconfig = "TRUE",
Expand All @@ -139,7 +139,8 @@ resource "google_compute_instance_template" "clickhouse" {
}

network_interface {
network = var.network_name
network = var.network_name
subnetwork = local.clickhouse_subnetwork_name

access_config {}
}
Expand Down
5 changes: 3 additions & 2 deletions iac/provider-gcp/nomad-cluster/nodepool-control-server.tf
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ resource "google_compute_instance_template" "server" {
instance_description = null
machine_type = var.server_machine_type

tags = [var.cluster_tag_name]
tags = local.server_network_tags
metadata_startup_script = local.server_startup_script
metadata = {
enable-osconfig = "TRUE",
Expand All @@ -121,7 +121,8 @@ resource "google_compute_instance_template" "server" {
}

network_interface {
network = var.network_name
network = var.network_name
subnetwork = local.server_subnetwork_name

# Create access config dynamically. If a public ip is requested, we just need the empty `access_config` block
# to automatically assign an external IP address.
Expand Down
5 changes: 3 additions & 2 deletions iac/provider-gcp/nomad-cluster/nodepool-loki.tf
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ resource "google_compute_instance_template" "loki" {
goog-ops-agent-policy = "v2-x86-template-1-2-0-${var.gcp_zone}"
} : {})
)
tags = [var.cluster_tag_name]
tags = local.loki_network_tags
metadata_startup_script = local.loki_startup_script
metadata = merge(
{ loki_cluster = "TRUE" },
Expand All @@ -110,7 +110,8 @@ resource "google_compute_instance_template" "loki" {
}

network_interface {
network = var.network_name
network = var.network_name
subnetwork = local.loki_subnetwork_name

dynamic "access_config" {
for_each = ["public_ip"]
Expand Down
54 changes: 54 additions & 0 deletions iac/provider-gcp/nomad-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ variable "client_clusters_config" {
hugepages_percentage = optional(number)
network_interface_type = optional(string)
node_labels = optional(list(string), [])
subnetwork_name = optional(string)
network_tag = optional(string)
}))
}

Expand Down Expand Up @@ -164,6 +166,8 @@ variable "build_clusters_config" {
hugepages_percentage = optional(number)
network_interface_type = optional(string)
node_labels = optional(list(string), [])
subnetwork_name = optional(string)
network_tag = optional(string)
}))
}

Expand Down Expand Up @@ -392,3 +396,53 @@ variable "additional_api_paths_handled_by_ingress" {
variable "ingress_timeout_seconds" {
type = number
}

# Per-pool subnetwork overrides
# Optional subnetwork for the server MIG; the empty default means "no override".
variable "server_subnetwork_name" {
  type        = string
  default     = ""
  description = "Subnetwork override for server MIG. Leave empty to use network default."
}

# Optional subnetwork for the API MIG; the empty default means "no override".
variable "api_subnetwork_name" {
  type        = string
  default     = ""
  description = "Subnetwork override for API MIG. Leave empty to use network default."
}

# Optional subnetwork for the ClickHouse MIG; the empty default means "no override".
variable "clickhouse_subnetwork_name" {
  type        = string
  default     = ""
  description = "Subnetwork override for ClickHouse MIG. Leave empty to use network default."
}

# Optional subnetwork for the Loki MIG; the empty default means "no override".
variable "loki_subnetwork_name" {
  type        = string
  default     = ""
  description = "Subnetwork override for Loki MIG. Leave empty to use network default."
}

# Per-pool network tag overrides
# Optional extra network tag for the server MIG; empty by default (no extra tag).
variable "server_network_tag" {
  type        = string
  default     = ""
  description = "Additional network tag for server MIG."
}

# Optional extra network tag for the API MIG; empty by default (no extra tag).
variable "api_network_tag" {
  type        = string
  default     = ""
  description = "Additional network tag for API MIG."
}

# Optional extra network tag for the ClickHouse MIG; empty by default (no extra tag).
variable "clickhouse_network_tag" {
  type        = string
  default     = ""
  description = "Additional network tag for ClickHouse MIG."
}

# Optional extra network tag for the Loki MIG; empty by default (no extra tag).
variable "loki_network_tag" {
  type        = string
  default     = ""
  description = "Additional network tag for Loki MIG."
}
7 changes: 4 additions & 3 deletions iac/provider-gcp/nomad-cluster/worker-cluster/nodepool.tf
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ resource "google_compute_instance_template" "template" {
goog-ops-agent-policy = "v2-x86-template-1-2-0-${var.gcp_zone}"
} : {})
)
tags = [var.cluster_tag_name]
tags = distinct(compact([var.cluster_tag_name, var.network_tag]))
metadata_startup_script = local.startup_script
metadata = {
enable-osconfig = "TRUE",
Expand Down Expand Up @@ -208,8 +208,9 @@ resource "google_compute_instance_template" "template" {
}

network_interface {
network = var.network_name
nic_type = var.network_interface_type
network = var.network_name
# Treat an empty-string override (e.g. "subnetwork_name": "" in the cluster JSON) like null,
# so the argument is omitted and the network default applies instead of passing "".
subnetwork = var.subnetwork_name != "" ? var.subnetwork_name : null
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Normalize empty subnetwork override before template apply

subnetwork is now wired directly from var.subnetwork_name, but the new .env.gcp.template examples set subnetwork_name to ""; that propagates an explicit empty string into network_interface.subnetwork instead of omitting the field. In this configuration path (build/client cluster JSON overrides), Terraform will attempt to apply an empty subnetwork value rather than the previous default-network behavior, which can break applies for users who copy the template defaults. Convert empty strings to null before assigning subnetwork (as is already done for server/api/clickhouse/loki locals).

Useful? React with 👍 / 👎.

nic_type = var.network_interface_type

dynamic "access_config" {
for_each = ["public_ip"]
Expand Down
12 changes: 12 additions & 0 deletions iac/provider-gcp/nomad-cluster/worker-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,18 @@ variable "cluster_tag_name" {
type = string
}

# Optional extra network tag for this worker cluster; null by default (no extra tag).
variable "network_tag" {
  type        = string
  default     = null
  description = "Additional network tag for this worker cluster. Added alongside cluster_tag_name."
}

# Optional subnetwork for this worker cluster; null by default (no override).
variable "subnetwork_name" {
  type        = string
  default     = null
  description = "Subnetwork override for this worker cluster. Leave null to use network default."
}

# SERVICE ACCOUNT & AUTHENTICATION

variable "google_service_account_email" {
Expand Down
54 changes: 54 additions & 0 deletions iac/provider-gcp/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,8 @@ variable "client_clusters_config" {
hugepages_percentage = optional(number)
network_interface_type = optional(string)
node_labels = optional(list(string), [])
subnetwork_name = optional(string)
network_tag = optional(string)
}))

description = <<EOT
Expand Down Expand Up @@ -565,6 +567,8 @@ variable "build_clusters_config" {
hugepages_percentage = optional(number)
network_interface_type = optional(string)
node_labels = optional(list(string), [])
subnetwork_name = optional(string)
network_tag = optional(string)
}))
description = <<EOT
Configuration for the build clusters.
Expand Down Expand Up @@ -705,6 +709,56 @@ variable "network_name" {
default = "default"
}

# Per-pool subnetwork overrides
# Optional subnetwork for the server (Nomad/Consul) MIG; the empty default means "no override".
variable "server_subnetwork_name" {
  type        = string
  default     = ""
  description = "Subnetwork override for server (Nomad/Consul) MIG. Leave empty to use network default."
}

# Optional subnetwork for the API MIG; the empty default means "no override".
variable "api_subnetwork_name" {
  type        = string
  default     = ""
  description = "Subnetwork override for API MIG. Leave empty to use network default."
}

# Optional subnetwork for the ClickHouse MIG; the empty default means "no override".
variable "clickhouse_subnetwork_name" {
  type        = string
  default     = ""
  description = "Subnetwork override for ClickHouse MIG. Leave empty to use network default."
}

# Optional subnetwork for the Loki MIG; the empty default means "no override".
variable "loki_subnetwork_name" {
  type        = string
  default     = ""
  description = "Subnetwork override for Loki MIG. Leave empty to use network default."
}

# Per-pool network tag overrides (added alongside default cluster tag)
# Optional extra network tag for the server MIG; empty by default (no extra tag).
variable "server_network_tag" {
  type        = string
  default     = ""
  description = "Additional network tag for server MIG. Added alongside the cluster-wide tag."
}

# Optional extra network tag for the API MIG; empty by default (no extra tag).
variable "api_network_tag" {
  type        = string
  default     = ""
  description = "Additional network tag for API MIG. Added alongside the cluster-wide tag."
}

# Optional extra network tag for the ClickHouse MIG; empty by default (no extra tag).
variable "clickhouse_network_tag" {
  type        = string
  default     = ""
  description = "Additional network tag for ClickHouse MIG. Added alongside the cluster-wide tag."
}

# Optional extra network tag for the Loki MIG; empty by default (no extra tag).
variable "loki_network_tag" {
  type        = string
  default     = ""
  description = "Additional network tag for Loki MIG. Added alongside the cluster-wide tag."
}

variable "volume_token_issuer" {
type = string
default = ""
Expand Down
Loading