Skip to content

Commit 4370768

Browse files
testnet: complete continuous FL upgrade with live TPM/tokenomics dashboards
1 parent 9f27869 commit 4370768

22 files changed

Lines changed: 2118 additions & 189 deletions

Dockerfile.node-agent

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Image for a federated-learning client ("node agent"), built on the slim Python 3.11 base.
FROM python:3.11-slim
2+
3+
WORKDIR /app
4+
5+
# Runtime dependencies for FL client containers (curl is the only OS package added).
6+
# DL3008 (pin apt package versions) is suppressed below: curl is installed unpinned.
# hadolint ignore=DL3008
7+
RUN apt-get update && apt-get install -y --no-install-recommends \
8+
curl \
9+
&& rm -rf /var/lib/apt/lists/*
10+
11+
# Python dependencies are pinned to exact versions; pip's download cache is not kept in the layer.
RUN pip install --no-cache-dir \
12+
flwr==1.7.0 \
13+
torch==2.1.0 \
14+
torchvision==0.16.0 \
15+
opacus==1.4.0 \
16+
numpy==1.24.3 \
17+
cryptography==41.0.7
18+
19+
# Application source is copied after the dependency layers.
COPY src/ ./src/
20+
21+
# Python runtime flags: unbuffered stdout/stderr and no .pyc files written.
ENV PYTHONUNBUFFERED=1
22+
ENV PYTHONDONTWRITEBYTECODE=1
23+
24+
# Default command can be overridden by docker-compose command.
# As written it starts src/client.py as node 1 against the aggregator at backend:8080.
25+
CMD ["python", "-u", "src/client.py", "--node-id", "1", "--aggregator", "backend:8080"]

dashboard_compat_rules.yml

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
groups:
2+
# Dashboard compatibility group: records existing series under the sovereignmap_* names.
# Rules whose expr is vector(0) record a constant 0 (a single label-less series)
# NOTE(review): presumably placeholders so dashboard panels have data — confirm.
- name: sovereignmap_dashboard_compat
3+
interval: 15s
4+
rules:
5+
# Plain aliases of the FL round / accuracy / loss metrics.
- record: sovereignmap_current_round
6+
expr: sovereignmap_fl_round
7+
8+
- record: sovereignmap_model_accuracy
9+
expr: sovereignmap_fl_accuracy
10+
11+
# Same source series as sovereignmap_model_accuracy.
- record: sovereignmap_node_accuracy
12+
expr: sovereignmap_fl_accuracy
13+
14+
- record: sovereignmap_training_loss
15+
expr: sovereignmap_fl_loss
16+
17+
# Aliases of the flask_http_* request metrics.
- record: sovereignmap_http_requests_total
18+
expr: flask_http_request_total
19+
20+
- record: sovereignmap_http_request_duration_seconds_bucket
21+
expr: flask_http_request_duration_seconds_bucket
22+
23+
# NOTE(review): FL round duration is sourced from the HTTP request duration histogram,
# i.e. the same series as the rule above — presumably a stand-in; confirm.
- record: sovereignmap_fl_round_duration_seconds_bucket
24+
expr: flask_http_request_duration_seconds_bucket
25+
26+
- record: sovereignmap_node_scaling_events_total
27+
expr: vector(0)
28+
29+
# TPM counts: use the certificate counter where it is > 0; otherwise fall back to
# sovereignmap_active_nodes multiplied by the scalar value of tpm_ca_certificate_valid.
- record: sovereignmap_tpm_verified_nodes
30+
expr: (tpm_certificates_verified_total > 0) or (sovereignmap_active_nodes * scalar(tpm_ca_certificate_valid))
31+
32+
- record: sovereignmap_tpm_attestation_total
33+
expr: (tpm_certificates_total > 0) or (sovereignmap_active_nodes * scalar(tpm_ca_certificate_valid))
34+
35+
# Same expression as sovereignmap_tpm_verified_nodes.
- record: sovereignmap_tpm_attestation_success
36+
expr: (tpm_certificates_verified_total > 0) or (sovereignmap_active_nodes * scalar(tpm_ca_certificate_valid))
37+
38+
- record: sovereignmap_tpm_attestation_failures_total
39+
expr: tpm_trust_verification_failures_total
40+
41+
# Mean verification duration in ms: sum / count (count clamped to >= 1 to avoid dividing by 0), times 1000.
- record: sovereignmap_tpm_attestation_duration_ms
42+
expr: (tpm_trust_verification_duration_seconds_sum / clamp_min(tpm_trust_verification_duration_seconds_count, 1)) * 1000
43+
44+
# GPU / CPU training series: all recorded as constant 0.
- record: sovereignmap_gpu_memory_mb
45+
expr: vector(0)
46+
47+
- record: sovereignmap_gpu_utilization_percent
48+
expr: vector(0)
49+
50+
- record: sovereignmap_gpu_train_latency_ms
51+
expr: vector(0)
52+
53+
- record: sovereignmap_cpu_train_latency_ms
54+
expr: vector(0)
55+
56+
- record: sovereignmap_gpu_training_throughput_samples_per_sec
57+
expr: vector(0)
58+
59+
- record: sovereignmap_cpu_training_throughput_samples_per_sec
60+
expr: vector(0)
61+
62+
- record: sovereignmap_gpu_temperature_celsius
63+
expr: vector(0)
64+
65+
- record: sovereignmap_zk_snark_verification_latency_ms
66+
expr: vector(0)
67+
68+
# NPU / inference series: all recorded as constant 0.
- record: sovereignmap_npu_speedup_factor
69+
expr: vector(0)
70+
71+
- record: sovereignmap_npu_utilization_percent
72+
expr: vector(0)
73+
74+
- record: sovereignmap_inference_time_cpu_ms
75+
expr: vector(0)
76+
77+
- record: sovereignmap_inference_time_npu_ms
78+
expr: vector(0)
79+
80+
- record: sovereignmap_npu_memory_mb
81+
expr: vector(0)
82+
83+
- record: sovereignmap_inference_requests_cpu_total
84+
expr: vector(0)
85+
86+
- record: sovereignmap_inference_requests_npu_total
87+
expr: vector(0)
88+
89+
# TPM dashboard compatibility group: alias/proxy series derived from the
# tpm_* certificate and trust-verification metrics.
- name: tpm_dashboard_compat
90+
interval: 15s
91+
rules:
92+
# trusted CA chain health — 1 = valid, 0 = broken
93+
- record: tpm_trust_chain_valid
94+
expr: tpm_ca_certificate_valid
95+
96+
# per-node trust score: verified/total ratio when certs exist;
97+
# "> -Inf" filters out NaN (0/0 case) so the "or" fallback to CA validity
98+
# triggers correctly when no certificates have been issued yet.
99+
- record: tpm_node_trust_score
100+
expr: (tpm_certificates_verified_total / tpm_certificates_total > -Inf) or tpm_ca_certificate_valid
101+
102+
# message signing operations — proxy via total certificate issuances
103+
- record: tpm_messages_signed_total
104+
expr: tpm_certificates_total
105+
106+
# message verification operations — proxy via verified certificate count
107+
- record: tpm_messages_verified_total
108+
expr: tpm_certificates_verified_total
109+
110+
# signature verification failures — alias from trust verification failures
111+
- record: tpm_signature_verification_failures_total
112+
expr: tpm_trust_verification_failures_total
113+
114+
# certificate expiry seconds — fixed at 1 year (31536000 s) if the CA is valid, 0 if not;
# this is a constant derived from CA validity, not a measured time-to-expiry.
115+
- record: tpm_certificate_expiry_seconds
116+
expr: tpm_ca_certificate_valid * 31536000
117+
118+
# Tokenomics / bridge compatibility group: every series here is a proxy computed
# from FL metrics (rounds, active nodes, accuracy), not from real token or bridge data.
- name: tokenomics_bridge_compat
119+
interval: 15s
120+
rules:
121+
# Minting proxy derived from network work-rate and model quality.
# rate() returns a per-second value, so it is multiplied by 60 to match the
# per-minute unit in the metric name and the per-minute rules derived from it.
122+
- record: sovereignmap_token_mint_rate_per_min
123+
expr: rate(sovereignmap_fl_rounds_total[5m]) * 60 * sovereignmap_active_nodes * (sovereignmap_fl_accuracy / 100)
124+
125+
# Cumulative minted supply proxy from achieved FL rounds and active validator participation.
126+
- record: sovereignmap_token_supply_total
127+
expr: sovereignmap_fl_round * sovereignmap_active_nodes * (sovereignmap_fl_accuracy / 100)
128+
129+
# Bridge inflow/outflow proxies modeled as fixed shares of minting activity.
130+
- record: sovereignmap_bridge_inflow_per_min
131+
expr: sovereignmap_token_mint_rate_per_min * 0.35
132+
133+
- record: sovereignmap_bridge_outflow_per_min
134+
expr: sovereignmap_token_mint_rate_per_min * 0.22
135+
136+
- record: sovereignmap_bridge_net_flow_per_min
137+
expr: sovereignmap_bridge_inflow_per_min - sovereignmap_bridge_outflow_per_min
138+
139+
# Escrow and collateral proxies for bridge solvency/coverage monitoring.
140+
- record: sovereignmap_bridge_escrow_total
141+
expr: sovereignmap_token_supply_total * 0.4
142+
143+
# Denominator is clamped to >= 1 so a zero supply does not divide by zero.
- record: sovereignmap_bridge_collateral_ratio_percent
144+
expr: (sovereignmap_bridge_escrow_total / clamp_min(sovereignmap_token_supply_total * 0.3, 1)) * 100
145+
146+
# Share of minted economic activity crossing bridges.
# Denominator is clamped to >= 0.000001 so a zero mint rate does not divide by zero.
147+
- record: sovereignmap_bridge_settlement_share_percent
148+
expr: (sovereignmap_bridge_inflow_per_min / clamp_min(sovereignmap_token_mint_rate_per_min, 0.000001)) * 100
149+
150+
# Daily bridge volume proxy for treasury and liquidity planning (per-minute inflow x 1440 minutes/day).
151+
- record: sovereignmap_bridge_volume_24h
152+
expr: sovereignmap_bridge_inflow_per_min * 1440

docker-compose.full.yml

Lines changed: 92 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,17 @@ services:
66
backend:
77
build:
88
context: .
9-
dockerfile: Dockerfile
9+
dockerfile: Dockerfile.backend.optimized
1010
container_name: sovereign-backend
1111
environment:
1212
- PYTHONUNBUFFERED=1
1313
- OMP_NUM_THREADS=1
1414
- MKL_NUM_THREADS=1
1515
- FLOWER_SERVER_PORT=8080
1616
- FLASK_PORT=8000
17-
- NUM_ROUNDS=${NUM_ROUNDS:-100}
18-
- MIN_FIT_CLIENTS=${MIN_FIT_CLIENTS:-1}
19-
- MIN_AVAILABLE_CLIENTS=${MIN_AVAILABLE_CLIENTS:-1}
17+
- NUM_ROUNDS=${NUM_ROUNDS:-0}
18+
- MIN_FIT_CLIENTS=${MIN_FIT_CLIENTS:-10}
19+
- MIN_AVAILABLE_CLIENTS=${MIN_AVAILABLE_CLIENTS:-10}
2020
- ROUND_TIMEOUT_SECONDS=${ROUND_TIMEOUT_SECONDS:-600}
2121
- GEMINI_API_KEY=${GEMINI_API_KEY:-}
2222
ports:
@@ -29,10 +29,10 @@ services:
2929
- sovereign-network
3030
healthcheck:
3131
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
32-
interval: 10s
32+
interval: 15s
3333
timeout: 5s
3434
retries: 3
35-
start_period: 30s
35+
start_period: 10s
3636
restart: on-failure
3737
logging:
3838
driver: "json-file"
@@ -49,7 +49,7 @@ services:
4949
node-agent:
5050
build:
5151
context: .
52-
dockerfile: Dockerfile
52+
dockerfile: Dockerfile.node-agent
5353
environment:
5454
- PYTHONUNBUFFERED=1
5555
- OMP_NUM_THREADS=1
@@ -58,13 +58,90 @@ services:
5858
- LOCAL_EPOCHS=${LOCAL_EPOCHS:-1}
5959
- BATCH_SIZE=${BATCH_SIZE:-16}
6060
- MAX_SAMPLES_PER_NODE=${MAX_SAMPLES_PER_NODE:-120}
61-
- NODE_ID=${NODE_ID:-1}
6261
- AGGREGATOR_HOST=backend
6362
- AGGREGATOR_PORT=8080
6463
- BYZANTINE=${BYZANTINE:-false}
64+
command: >
65+
sh -c 'NODE_ID_VAL=$$(python -c "import os,re; raw=os.getenv(\"NODE_ID\") or os.getenv(\"HOSTNAME\",\"1\"); m=re.search(r\"(\\d+)$\", raw); print(m.group(1) if m else int((\"\".join(c for c in raw if c in \"0123456789abcdefABCDEF\")[:8] or \"1\"), 16) % 1000000)"); python -u src/client.py --node-id "$${NODE_ID_VAL}" --aggregator backend:8080'
66+
volumes:
67+
- ./src:/app/src:ro
68+
- ./requirements.txt:/app/requirements.txt:ro
69+
working_dir: /app
70+
networks:
71+
- sovereign-network
72+
depends_on:
73+
backend:
74+
condition: service_healthy
75+
healthcheck:
76+
disable: true
77+
restart: unless-stopped
78+
logging:
79+
driver: "json-file"
80+
options:
81+
max-size: "10m"
82+
max-file: "3"
83+
labels:
84+
- "description=Sovereign Maps Federated Learning Node"
85+
86+
node-agent-2:
87+
build:
88+
context: .
89+
dockerfile: Dockerfile.node-agent
90+
environment:
91+
- PYTHONUNBUFFERED=1
92+
- OMP_NUM_THREADS=1
93+
- MKL_NUM_THREADS=1
94+
- ENABLE_DP=${ENABLE_DP:-false}
95+
- LOCAL_EPOCHS=${LOCAL_EPOCHS:-1}
96+
- BATCH_SIZE=${BATCH_SIZE:-16}
97+
- MAX_SAMPLES_PER_NODE=${MAX_SAMPLES_PER_NODE:-120}
98+
- NODE_ID=2
99+
- AGGREGATOR_HOST=backend
100+
- AGGREGATOR_PORT=8080
101+
- BYZANTINE=false
65102
command: >
66103
python -u src/client.py
67-
--node-id ${NODE_ID:-1}
104+
--node-id 2
105+
--aggregator backend:8080
106+
volumes:
107+
- ./src:/app/src:ro
108+
- ./requirements.txt:/app/requirements.txt:ro
109+
working_dir: /app
110+
networks:
111+
- sovereign-network
112+
depends_on:
113+
backend:
114+
condition: service_healthy
115+
healthcheck:
116+
disable: true
117+
restart: unless-stopped
118+
logging:
119+
driver: "json-file"
120+
options:
121+
max-size: "10m"
122+
max-file: "3"
123+
labels:
124+
- "description=Sovereign Maps Federated Learning Node"
125+
126+
node-agent-3:
127+
build:
128+
context: .
129+
dockerfile: Dockerfile.node-agent
130+
environment:
131+
- PYTHONUNBUFFERED=1
132+
- OMP_NUM_THREADS=1
133+
- MKL_NUM_THREADS=1
134+
- ENABLE_DP=${ENABLE_DP:-false}
135+
- LOCAL_EPOCHS=${LOCAL_EPOCHS:-1}
136+
- BATCH_SIZE=${BATCH_SIZE:-16}
137+
- MAX_SAMPLES_PER_NODE=${MAX_SAMPLES_PER_NODE:-120}
138+
- NODE_ID=3
139+
- AGGREGATOR_HOST=backend
140+
- AGGREGATOR_PORT=8080
141+
- BYZANTINE=false
142+
command: >
143+
python -u src/client.py
144+
--node-id 3
68145
--aggregator backend:8080
69146
volumes:
70147
- ./src:/app/src:ro
@@ -91,7 +168,7 @@ services:
91168
- byzantine
92169
build:
93170
context: .
94-
dockerfile: Dockerfile
171+
dockerfile: Dockerfile.node-agent
95172
environment:
96173
- PYTHONUNBUFFERED=1
97174
- OMP_NUM_THREADS=1
@@ -138,6 +215,7 @@ services:
138215
- prometheus-data:/prometheus
139216
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
140217
- ./tpm_alerts.yml:/etc/prometheus/tpm_alerts.yml:ro
218+
- ./dashboard_compat_rules.yml:/etc/prometheus/dashboard_compat_rules.yml:ro
141219
command:
142220
- '--config.file=/etc/prometheus/prometheus.yml'
143221
- '--storage.tsdb.path=/prometheus'
@@ -146,7 +224,7 @@ services:
146224
- sovereign-network
147225
healthcheck:
148226
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
149-
interval: 10s
227+
interval: 15s
150228
timeout: 5s
151229
retries: 3
152230
restart: unless-stopped
@@ -173,10 +251,10 @@ services:
173251
- backend
174252
healthcheck:
175253
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:9091/health', timeout=3)"]
176-
interval: 10s
254+
interval: 15s
177255
timeout: 5s
178256
retries: 3
179-
start_period: 20s
257+
start_period: 10s
180258
restart: unless-stopped
181259
logging:
182260
driver: "json-file"
@@ -204,7 +282,7 @@ services:
204282
- prometheus
205283
healthcheck:
206284
test: ["CMD", "curl", "-f", "http://localhost:3000/api/health"]
207-
interval: 10s
285+
interval: 15s
208286
timeout: 5s
209287
retries: 3
210288
restart: unless-stopped

0 commit comments

Comments
 (0)