diff --git a/.env.gcp.template b/.env.gcp.template index a539103bfd..32b7e3c1e9 100644 --- a/.env.gcp.template +++ b/.env.gcp.template @@ -46,15 +46,17 @@ TERRAFORM_ENVIRONMENT=dev # "disk_type": "local-ssd", // Cache disk type (local-ssd or pd-ssd) # "size_gb": 375, // Cache disk size in GB # "count": 1 // Number of cache disks -# } +# }, +# "subnetwork_name": "", // (optional) Subnetwork override for this cluster entry +# "network_tag": "" // (optional) Additional network tag for this cluster entry # } # Build cluster configuration # Example: -BUILD_CLUSTERS_CONFIG='{"default": {"cluster_size": 1, "machine":{"type":"n1-standard-8","min_cpu_platform":"Intel Skylake"}, "boot_disk":{"disk_type":"pd-ssd","size_gb":200}, "cache_disks":{"disk_type":"local-ssd","size_gb":375,"count":1}}}' +BUILD_CLUSTERS_CONFIG='{"default": {"cluster_size": 1, "machine":{"type":"n1-standard-8","min_cpu_platform":"Intel Skylake"}, "boot_disk":{"disk_type":"pd-ssd","size_gb":200}, "cache_disks":{"disk_type":"local-ssd","size_gb":375,"count":1}, "subnetwork_name":"", "network_tag":""}}' # Client cluster configuration # Example: -CLIENT_CLUSTERS_CONFIG='{"default": {"cluster_size": 1, "hugepages_percentage": 80, "machine":{"type":"n1-standard-8","min_cpu_platform":"Intel Skylake"}, "autoscaler": {"size_max": 2, "memory_target": 100, "cpu_target": 0.7}, "boot_disk":{"disk_type":"pd-ssd","size_gb":200}, "cache_disks":{"disk_type":"local-ssd","size_gb":375,"count":1}}}' +CLIENT_CLUSTERS_CONFIG='{"default": {"cluster_size": 1, "hugepages_percentage": 80, "machine":{"type":"n1-standard-8","min_cpu_platform":"Intel Skylake"}, "autoscaler": {"size_max": 2, "memory_target": 100, "cpu_target": 0.7}, "boot_disk":{"disk_type":"pd-ssd","size_gb":200}, "cache_disks":{"disk_type":"local-ssd","size_gb":375,"count":1}, "subnetwork_name":"", "network_tag":""}}' # This is the nomad and consul server (only for scheduling and service discovery) # eg e2-standard-2 @@ -82,6 +84,18 @@ 
ENABLE_AUTH_USER_SYNC_BACKGROUND_WORKER= # Enable dashboard-api billing/team provisioning sink (default: false) ENABLE_BILLING_HTTP_TEAM_PROVISION_SINK= +# Per-pool subnetwork overrides (default: network's auto-subnet) +SERVER_SUBNETWORK_NAME= +API_SUBNETWORK_NAME= +CLICKHOUSE_SUBNETWORK_NAME= +LOKI_SUBNETWORK_NAME= + +# Per-pool additional network tags (appended to the default "orch" tag, not replacing it) +SERVER_NETWORK_TAG= +API_NETWORK_TAG= +CLICKHOUSE_NETWORK_TAG= +LOKI_NETWORK_TAG= + # Filestore cache for builds shared across cluster (default:false) FILESTORE_CACHE_ENABLED= # BASIC_HDD for staging+dev, ZONAL for production diff --git a/iac/provider-gcp/Makefile b/iac/provider-gcp/Makefile index 5fae13a6f2..90b851cb54 100644 --- a/iac/provider-gcp/Makefile +++ b/iac/provider-gcp/Makefile @@ -90,7 +90,15 @@ tf_vars := \ $(call tfvar, AUTH_DB_MIN_IDLE_CONNECTIONS) \ $(call tfvar, GCS_GRPC_CONNECTION_POOL_SIZE) \ $(call tfvar, ADDITIONAL_API_PATHS_HANDLED_BY_INGRESS) \ - $(call tfvar, ANYWHERE_CACHE_ENABLED) + $(call tfvar, ANYWHERE_CACHE_ENABLED) \ + $(call tfvar, SERVER_SUBNETWORK_NAME) \ + $(call tfvar, API_SUBNETWORK_NAME) \ + $(call tfvar, CLICKHOUSE_SUBNETWORK_NAME) \ + $(call tfvar, LOKI_SUBNETWORK_NAME) \ + $(call tfvar, SERVER_NETWORK_TAG) \ + $(call tfvar, API_NETWORK_TAG) \ + $(call tfvar, CLICKHOUSE_NETWORK_TAG) \ + $(call tfvar, LOKI_NETWORK_TAG) .PHONY: init init: diff --git a/iac/provider-gcp/main.tf b/iac/provider-gcp/main.tf index e63e367f6a..01e9460bce 100644 --- a/iac/provider-gcp/main.tf +++ b/iac/provider-gcp/main.tf @@ -200,6 +200,18 @@ module "cluster" { clickhouse_boot_disk_type = var.clickhouse_boot_disk_type loki_boot_disk_type = var.loki_boot_disk_type + # Per-pool subnetwork overrides + server_subnetwork_name = var.server_subnetwork_name + api_subnetwork_name = var.api_subnetwork_name + clickhouse_subnetwork_name = var.clickhouse_subnetwork_name + loki_subnetwork_name = var.loki_subnetwork_name + + # Per-pool additional network tags (appended to the cluster tag) + server_network_tag = 
var.server_network_tag + api_network_tag = var.api_network_tag + clickhouse_network_tag = var.clickhouse_network_tag + loki_network_tag = var.loki_network_tag + # ClickHouse stateful data disk clickhouse_stateful_disk_type = var.clickhouse_stateful_disk_type clickhouse_stateful_disk_size_gb = var.clickhouse_stateful_disk_size_gb diff --git a/iac/provider-gcp/nomad-cluster/main.tf b/iac/provider-gcp/nomad-cluster/main.tf index 33c04042b5..99c57b8f70 100644 --- a/iac/provider-gcp/nomad-cluster/main.tf +++ b/iac/provider-gcp/nomad-cluster/main.tf @@ -4,6 +4,18 @@ locals { build_base_hugepages_percentage = 60 client_base_hugepages_percentage = 80 + # Per-pool effective subnetwork names (null when not overridden) + server_subnetwork_name = var.server_subnetwork_name != "" ? var.server_subnetwork_name : null + api_subnetwork_name = var.api_subnetwork_name != "" ? var.api_subnetwork_name : null + clickhouse_subnetwork_name = var.clickhouse_subnetwork_name != "" ? var.clickhouse_subnetwork_name : null + loki_subnetwork_name = var.loki_subnetwork_name != "" ? 
var.loki_subnetwork_name : null + + # Per-pool effective network tags + server_network_tags = distinct(compact([var.cluster_tag_name, var.server_network_tag])) + api_network_tags = distinct(compact([var.cluster_tag_name, var.api_network_tag])) + clickhouse_network_tags = distinct(compact([var.cluster_tag_name, var.clickhouse_network_tag])) + loki_network_tags = distinct(compact([var.cluster_tag_name, var.loki_network_tag])) + nfs_mount_path = "/orchestrator/shared-store" nfs_mount_subdir = "chunks-cache" nfs_mount_opts = join(",", [ // for more docs, see https://linux.die.net/man/5/nfs @@ -166,11 +178,13 @@ module "build_cluster" { cluster_name = "${var.prefix}${var.build_cluster_name}-${each.key}" image_family = var.build_image_family network_name = var.network_name + subnetwork_name = each.value.subnetwork_name base_hugepages_percentage = coalesce((each.value.hugepages_percentage), local.build_base_hugepages_percentage) network_interface_type = each.value.network_interface_type node_labels = each.value.node_labels cluster_tag_name = var.cluster_tag_name + network_tag = each.value.network_tag node_pool = var.build_node_pool nomad_port = var.nomad_port consul_acl_token_secret = var.consul_acl_token_secret @@ -225,11 +239,13 @@ module "client_cluster" { cluster_name = each.key == "default" ? 
"${var.prefix}${var.client_cluster_name}" : "${var.prefix}${var.client_cluster_name}-${each.key}" image_family = var.client_image_family network_name = var.network_name + subnetwork_name = each.value.subnetwork_name base_hugepages_percentage = coalesce((each.value.hugepages_percentage), local.client_base_hugepages_percentage) network_interface_type = each.value.network_interface_type node_labels = each.value.node_labels cluster_tag_name = var.cluster_tag_name + network_tag = each.value.network_tag node_pool = var.orchestrator_node_pool nomad_port = var.nomad_port consul_acl_token_secret = var.consul_acl_token_secret diff --git a/iac/provider-gcp/nomad-cluster/nodepool-api.tf b/iac/provider-gcp/nomad-cluster/nodepool-api.tf index d0bc81328e..b43c9333e4 100644 --- a/iac/provider-gcp/nomad-cluster/nodepool-api.tf +++ b/iac/provider-gcp/nomad-cluster/nodepool-api.tf @@ -112,7 +112,7 @@ resource "google_compute_instance_template" "api" { goog-ops-agent-policy = "v2-x86-template-1-2-0-${var.gcp_zone}" } : {}) ) - tags = [var.cluster_tag_name] + tags = local.api_network_tags metadata_startup_script = local.api_startup_script metadata = merge( { api_cluster = "TRUE" }, @@ -134,7 +134,8 @@ resource "google_compute_instance_template" "api" { } network_interface { - network = var.network_name + network = var.network_name + subnetwork = local.api_subnetwork_name dynamic "access_config" { for_each = var.api_use_nat ? 
[] : ["public_ip"] diff --git a/iac/provider-gcp/nomad-cluster/nodepool-clickhouse.tf b/iac/provider-gcp/nomad-cluster/nodepool-clickhouse.tf index 5d8219ed20..16245d9118 100644 --- a/iac/provider-gcp/nomad-cluster/nodepool-clickhouse.tf +++ b/iac/provider-gcp/nomad-cluster/nodepool-clickhouse.tf @@ -119,7 +119,7 @@ resource "google_compute_instance_template" "clickhouse" { labels = merge( var.labels, ) - tags = [var.cluster_tag_name] + tags = local.clickhouse_network_tags metadata_startup_script = local.clickhouse_start_script metadata = { enable-osconfig = "TRUE", @@ -139,7 +139,8 @@ resource "google_compute_instance_template" "clickhouse" { } network_interface { - network = var.network_name + network = var.network_name + subnetwork = local.clickhouse_subnetwork_name access_config {} } diff --git a/iac/provider-gcp/nomad-cluster/nodepool-control-server.tf b/iac/provider-gcp/nomad-cluster/nodepool-control-server.tf index 0658675757..89d616585b 100644 --- a/iac/provider-gcp/nomad-cluster/nodepool-control-server.tf +++ b/iac/provider-gcp/nomad-cluster/nodepool-control-server.tf @@ -95,7 +95,7 @@ resource "google_compute_instance_template" "server" { instance_description = null machine_type = var.server_machine_type - tags = [var.cluster_tag_name] + tags = local.server_network_tags metadata_startup_script = local.server_startup_script metadata = { enable-osconfig = "TRUE", @@ -121,7 +121,8 @@ resource "google_compute_instance_template" "server" { } network_interface { - network = var.network_name + network = var.network_name + subnetwork = local.server_subnetwork_name # Create access config dynamically. If a public ip is requested, we just need the empty `access_config` block # to automatically assign an external IP address. 
diff --git a/iac/provider-gcp/nomad-cluster/nodepool-loki.tf b/iac/provider-gcp/nomad-cluster/nodepool-loki.tf index 8922c26829..88369f8f88 100644 --- a/iac/provider-gcp/nomad-cluster/nodepool-loki.tf +++ b/iac/provider-gcp/nomad-cluster/nodepool-loki.tf @@ -88,7 +88,7 @@ resource "google_compute_instance_template" "loki" { goog-ops-agent-policy = "v2-x86-template-1-2-0-${var.gcp_zone}" } : {}) ) - tags = [var.cluster_tag_name] + tags = local.loki_network_tags metadata_startup_script = local.loki_startup_script metadata = merge( { loki_cluster = "TRUE" }, @@ -110,7 +110,8 @@ resource "google_compute_instance_template" "loki" { } network_interface { - network = var.network_name + network = var.network_name + subnetwork = local.loki_subnetwork_name dynamic "access_config" { for_each = ["public_ip"] diff --git a/iac/provider-gcp/nomad-cluster/variables.tf b/iac/provider-gcp/nomad-cluster/variables.tf index 6fccce0406..725a4fa4e4 100644 --- a/iac/provider-gcp/nomad-cluster/variables.tf +++ b/iac/provider-gcp/nomad-cluster/variables.tf @@ -131,6 +131,8 @@ variable "client_clusters_config" { hugepages_percentage = optional(number) network_interface_type = optional(string) node_labels = optional(list(string), []) + subnetwork_name = optional(string) + network_tag = optional(string) })) } @@ -164,6 +166,8 @@ variable "build_clusters_config" { hugepages_percentage = optional(number) network_interface_type = optional(string) node_labels = optional(list(string), []) + subnetwork_name = optional(string) + network_tag = optional(string) })) } @@ -392,3 +396,53 @@ variable "additional_api_paths_handled_by_ingress" { variable "ingress_timeout_seconds" { type = number } + +# Per-pool subnetwork overrides +variable "server_subnetwork_name" { + description = "Subnetwork override for server MIG. Leave empty to use network default." + type = string + default = "" +} + +variable "api_subnetwork_name" { + description = "Subnetwork override for API MIG. 
Leave empty to use network default." + type = string + default = "" +} + +variable "clickhouse_subnetwork_name" { + description = "Subnetwork override for ClickHouse MIG. Leave empty to use network default." + type = string + default = "" +} + +variable "loki_subnetwork_name" { + description = "Subnetwork override for Loki MIG. Leave empty to use network default." + type = string + default = "" +} + +# Per-pool additional network tags (appended to cluster_tag_name) +variable "server_network_tag" { + description = "Additional network tag for server MIG." + type = string + default = "" +} + +variable "api_network_tag" { + description = "Additional network tag for API MIG." + type = string + default = "" +} + +variable "clickhouse_network_tag" { + description = "Additional network tag for ClickHouse MIG." + type = string + default = "" +} + +variable "loki_network_tag" { + description = "Additional network tag for Loki MIG." + type = string + default = "" +} diff --git a/iac/provider-gcp/nomad-cluster/worker-cluster/nodepool.tf b/iac/provider-gcp/nomad-cluster/worker-cluster/nodepool.tf index b5479f20c1..75609059ec 100644 --- a/iac/provider-gcp/nomad-cluster/worker-cluster/nodepool.tf +++ b/iac/provider-gcp/nomad-cluster/worker-cluster/nodepool.tf @@ -160,7 +160,7 @@ resource "google_compute_instance_template" "template" { goog-ops-agent-policy = "v2-x86-template-1-2-0-${var.gcp_zone}" } : {}) ) - tags = [var.cluster_tag_name] + tags = distinct(compact([var.cluster_tag_name, var.network_tag])) metadata_startup_script = local.startup_script metadata = { enable-osconfig = "TRUE", @@ -208,8 +208,9 @@ resource "google_compute_instance_template" "template" { } network_interface { - network = var.network_name - nic_type = var.network_interface_type + network = var.network_name + subnetwork = var.subnetwork_name != "" ? var.subnetwork_name : null # treat "" like null (no override), as the server/api/clickhouse/loki pools do + nic_type = var.network_interface_type dynamic "access_config" { for_each = ["public_ip"] diff --git a/iac/provider-gcp/nomad-cluster/worker-cluster/variables.tf 
b/iac/provider-gcp/nomad-cluster/worker-cluster/variables.tf index 89ecb879d9..63de340672 100644 --- a/iac/provider-gcp/nomad-cluster/worker-cluster/variables.tf +++ b/iac/provider-gcp/nomad-cluster/worker-cluster/variables.tf @@ -91,6 +91,18 @@ variable "cluster_tag_name" { type = string } +variable "network_tag" { + description = "Additional network tag for this worker cluster. Added alongside cluster_tag_name." + type = string + default = null +} + +variable "subnetwork_name" { + description = "Subnetwork override for this worker cluster. Leave null to use network default." + type = string + default = null +} + # SERVICE ACCOUNT & AUTHENTICATION variable "google_service_account_email" { diff --git a/iac/provider-gcp/variables.tf b/iac/provider-gcp/variables.tf index 1ce86faeec..0c81be296e 100644 --- a/iac/provider-gcp/variables.tf +++ b/iac/provider-gcp/variables.tf @@ -505,6 +505,8 @@ variable "client_clusters_config" { hugepages_percentage = optional(number) network_interface_type = optional(string) node_labels = optional(list(string), []) + subnetwork_name = optional(string) + network_tag = optional(string) })) description = <