Skip to content

Commit a6fca10

Browse files
feat: implement full platform upgrade batch (telemetry, reliability, security, ci)
1 parent 17ba0c6 commit a6fca10

23 files changed

Lines changed: 1051 additions & 26 deletions
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
# Runs the dashboard query check script on pushes and pull requests
# that target the main branch.
name: Observability CI

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# The workflow token only gets read access to repository contents.
permissions:
  contents: read

jobs:
  dashboard-query-validation:
    name: Validate Dashboard Queries
    runs-on: ubuntu-latest
    steps:
      # Actions are pinned to full commit SHAs.
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd

      - name: Set up Python
        uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d
        with:
          # Quoted so the version stays a string.
          python-version: "3.11"

      - name: Validate Grafana queries
        run: python scripts/check_dashboard_queries.py
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
# Builds the backend image, generates a CycloneDX SBOM for it, scans it
# with Trivy, and publishes both reports. Runs on pushes and pull
# requests that target the main branch.
name: Security Supply Chain

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

permissions:
  contents: read
  # Required by the SARIF upload step below.
  security-events: write

jobs:
  sbom-and-image-scan:
    name: SBOM and Trivy Scan
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd

      - name: Build backend image
        run: docker build -f Dockerfile.backend.optimized -t sovereign-backend:ci .

      - name: Generate SBOM (CycloneDX)
        uses: anchore/sbom-action@85f897d6f3eeb9d30980dc9f2138eb33cd6b2e8b
        with:
          image: sovereign-backend:ci
          format: cyclonedx-json
          output-file: sbom-backend.cdx.json

      # exit-code "1" makes this step (and the job) fail when CRITICAL or
      # HIGH findings are reported.
      - name: Scan image with Trivy
        uses: aquasecurity/trivy-action@ea7f5b0ea4e9037fe8f8f6a9db8f2de41f8bb9f0
        with:
          image-ref: sovereign-backend:ci
          format: sarif
          output: trivy-results.sarif
          severity: CRITICAL,HIGH
          exit-code: "1"

      # Steps are skipped by default once an earlier step has failed, so
      # without always() the report would be dropped exactly when the scan
      # found something. The hashFiles guard avoids a second, misleading
      # failure when the report was never written (e.g. the build failed).
      - name: Upload Trivy SARIF
        if: ${{ always() && hashFiles('trivy-results.sarif') != '' }}
        uses: github/codeql-action/upload-sarif@7fd9dc0f3f4f8f6e8bbbf38c3444b3248ce8b248
        with:
          sarif_file: trivy-results.sarif

      # Same reasoning as above: keep the SBOM even when the scan fails.
      - name: Upload SBOM artifact
        if: ${{ always() && hashFiles('sbom-backend.cdx.json') != '' }}
        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: sbom-backend-cdx
          path: sbom-backend.cdx.json

Dockerfile.backend.optimized

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
3333
ca-certificates \
3434
&& rm -rf /var/lib/apt/lists/*
3535

36-
# Copy Python dependencies from builder (significantly smaller image)
37-
COPY --from=builder /root/.local /root/.local
36+
# Copy Python dependencies from builder into non-root accessible location.
37+
COPY --from=builder /root/.local /opt/python
3838

3939
# Set environment variables for Python optimization
40-
ENV PATH=/root/.local/bin:$PATH \
40+
ENV PATH=/opt/python/bin:$PATH \
41+
PYTHONPATH=/opt/python/lib/python3.11/site-packages \
4142
PYTHONUNBUFFERED=1 \
4243
PYTHONDONTWRITEBYTECODE=1 \
4344
PYTHONOPTIMIZE=2
@@ -52,9 +53,13 @@ COPY src/ ./src/
5253
COPY config/ ./config/
5354

5455
# Create data directory
55-
RUN mkdir -p /app/data && chmod 755 /app/data
56+
RUN mkdir -p /app/data \
57+
&& chmod 755 /app/data \
58+
&& useradd --system --create-home --uid 10001 appuser \
59+
&& chown -R appuser:appuser /app
5660

57-
# Keep root runtime so Python can resolve packages installed under /root/.local.
61+
# Run as non-root for runtime hardening.
62+
USER appuser
5863

5964
# Health check with proper error handling
6065
HEALTHCHECK --interval=30s --timeout=10s --start-period=20s --retries=3 \

Dockerfile.node-agent

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,12 @@ RUN pip install --no-cache-dir \
1818

1919
COPY src/ ./src/
2020

21+
RUN useradd --system --create-home --uid 10001 appuser && chown -R appuser:appuser /app
22+
2123
ENV PYTHONUNBUFFERED=1
2224
ENV PYTHONDONTWRITEBYTECODE=1
2325

26+
USER appuser
27+
2428
# Default command can be overridden by docker-compose command.
2529
CMD ["python", "-u", "src/client.py", "--node-id", "1", "--aggregator", "backend:8080"]

config/channels/dev.env

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
# Development channel

# Round settings.
# NOTE(review): NUM_ROUNDS=0 presumably means "no fixed round limit" —
# confirm against the code that reads it.
NUM_ROUNDS=0
ROUND_TIMEOUT_SECONDS=300
TARGET_ROUNDS_PER_MIN=0.6

# Client thresholds.
MIN_FIT_CLIENTS=3
MIN_AVAILABLE_CLIENTS=3

# Replica bounds.
MIN_REPLICAS=3
MAX_REPLICAS=12

config/channels/prod.env

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
# Production channel

# Round settings.
# NOTE(review): NUM_ROUNDS=0 presumably means "no fixed round limit" —
# confirm against the code that reads it.
NUM_ROUNDS=0
ROUND_TIMEOUT_SECONDS=900
TARGET_ROUNDS_PER_MIN=1.5

# Client thresholds.
MIN_FIT_CLIENTS=20
MIN_AVAILABLE_CLIENTS=20

# Replica bounds.
MIN_REPLICAS=15
MAX_REPLICAS=60

config/channels/stage.env

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
# Staging channel

# Round settings.
# NOTE(review): NUM_ROUNDS=0 presumably means "no fixed round limit" —
# confirm against the code that reads it.
NUM_ROUNDS=0
ROUND_TIMEOUT_SECONDS=600
TARGET_ROUNDS_PER_MIN=1.0

# Client thresholds.
MIN_FIT_CLIENTS=10
MIN_AVAILABLE_CLIENTS=10

# Replica bounds.
MIN_REPLICAS=8
MAX_REPLICAS=30

dashboard_compat_rules.yml

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,16 @@ groups:
3030
expr: (tpm_certificates_verified_total > 0) or (sovereignmap_active_nodes * scalar(tpm_ca_certificate_valid))
3131

3232
- record: sovereignmap_tpm_attestation_total
33-
expr: (tpm_certificates_total > 0) or (sovereignmap_active_nodes * scalar(tpm_ca_certificate_valid))
33+
expr: sum(tpm_node_attestation_events_total) or (tpm_certificates_total > 0) or (sovereignmap_active_nodes * scalar(tpm_ca_certificate_valid))
3434

3535
- record: sovereignmap_tpm_attestation_success
36-
expr: (tpm_certificates_verified_total > 0) or (sovereignmap_active_nodes * scalar(tpm_ca_certificate_valid))
36+
expr: sum(tpm_node_attestation_events_total{result="success"}) or (tpm_certificates_verified_total > 0) or (sovereignmap_active_nodes * scalar(tpm_ca_certificate_valid))
3737

3838
- record: sovereignmap_tpm_attestation_failures_total
39-
expr: tpm_trust_verification_failures_total
39+
expr: sum(tpm_node_attestation_events_total{result="failure"}) or tpm_trust_verification_failures_total
4040

4141
- record: sovereignmap_tpm_attestation_duration_ms
42-
expr: (tpm_trust_verification_duration_seconds_sum / clamp_min(tpm_trust_verification_duration_seconds_count, 1)) * 1000
42+
expr: avg(tpm_node_attestation_latency_ms) or ((tpm_trust_verification_duration_seconds_sum / clamp_min(tpm_trust_verification_duration_seconds_count, 1)) * 1000)
4343

4444
- record: sovereignmap_gpu_memory_mb
4545
expr: vector(0)
@@ -120,33 +120,33 @@ groups:
120120
rules:
121121
# Minting proxy derived from network work-rate and model quality.
122122
- record: sovereignmap_token_mint_rate_per_min
123-
expr: rate(sovereignmap_fl_rounds_total[5m]) * sovereignmap_active_nodes * (sovereignmap_fl_accuracy / 100)
123+
expr: tokenomics_mint_rate_per_min or ((rate(sovereignmap_fl_rounds_total[5m]) * sovereignmap_active_nodes * (sovereignmap_fl_accuracy / 100)) unless on() tokenomics_mint_rate_per_min)
124124

125125
# Cumulative minted supply proxy from achieved FL rounds and active validator participation.
126126
- record: sovereignmap_token_supply_total
127-
expr: sovereignmap_fl_round * sovereignmap_active_nodes * (sovereignmap_fl_accuracy / 100)
127+
expr: tokenomics_token_supply_total or ((sovereignmap_fl_round * sovereignmap_active_nodes * (sovereignmap_fl_accuracy / 100)) unless on() tokenomics_token_supply_total)
128128

129129
# Bridge inflow/outflow proxies modeled as fixed shares of minting activity.
130130
- record: sovereignmap_bridge_inflow_per_min
131-
expr: sovereignmap_token_mint_rate_per_min * 0.35
131+
expr: tokenomics_bridge_inflow_per_min or ((sovereignmap_token_mint_rate_per_min * 0.35) unless on() tokenomics_bridge_inflow_per_min)
132132

133133
- record: sovereignmap_bridge_outflow_per_min
134-
expr: sovereignmap_token_mint_rate_per_min * 0.22
134+
expr: tokenomics_bridge_outflow_per_min or ((sovereignmap_token_mint_rate_per_min * 0.22) unless on() tokenomics_bridge_outflow_per_min)
135135

136136
- record: sovereignmap_bridge_net_flow_per_min
137-
expr: sovereignmap_bridge_inflow_per_min - sovereignmap_bridge_outflow_per_min
137+
expr: tokenomics_bridge_net_flow_per_min or ((sovereignmap_bridge_inflow_per_min - sovereignmap_bridge_outflow_per_min) unless on() tokenomics_bridge_net_flow_per_min)
138138

139139
# Escrow and collateral proxies for bridge solvency/coverage monitoring.
140140
- record: sovereignmap_bridge_escrow_total
141-
expr: sovereignmap_token_supply_total * 0.4
141+
expr: tokenomics_bridge_escrow_total or ((sovereignmap_token_supply_total * 0.4) unless on() tokenomics_bridge_escrow_total)
142142

143143
- record: sovereignmap_bridge_collateral_ratio_percent
144-
expr: (sovereignmap_bridge_escrow_total / clamp_min(sovereignmap_token_supply_total * 0.3, 1)) * 100
144+
expr: tokenomics_bridge_collateral_ratio_percent or (((sovereignmap_bridge_escrow_total / clamp_min(sovereignmap_token_supply_total * 0.3, 1)) * 100) unless on() tokenomics_bridge_collateral_ratio_percent)
145145

146146
# Share of minted economic activity crossing bridges.
147147
- record: sovereignmap_bridge_settlement_share_percent
148-
expr: (sovereignmap_bridge_inflow_per_min / clamp_min(sovereignmap_token_mint_rate_per_min, 0.000001)) * 100
148+
expr: tokenomics_bridge_settlement_share_percent or (((sovereignmap_bridge_inflow_per_min / clamp_min(sovereignmap_token_mint_rate_per_min, 0.000001)) * 100) unless on() tokenomics_bridge_settlement_share_percent)
149149

150150
# Daily bridge volume proxy for treasury and liquidity planning.
151151
- record: sovereignmap_bridge_volume_24h
152-
expr: sovereignmap_bridge_inflow_per_min * 1440
152+
expr: tokenomics_bridge_volume_24h or ((sovereignmap_bridge_inflow_per_min * 1440) unless on() tokenomics_bridge_volume_24h)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
---
# Autoscaler for the sovereign-node-agent Deployment: between 10 and 100
# replicas, targeting 65% average CPU utilization.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  namespace: sovereign-map
  name: sovereign-node-agent-hpa
spec:
  minReplicas: 10
  maxReplicas: 100
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: sovereign-node-agent
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 65
---
# Autoscaler for the sovereign-backend Deployment: between 1 and 5
# replicas, targeting 70% average CPU utilization.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  namespace: sovereign-map
  name: sovereign-backend-hpa
spec:
  minReplicas: 1
  maxReplicas: 5
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: sovereign-backend
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70

docker-compose.full.yml

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ services:
1818
- MIN_FIT_CLIENTS=${MIN_FIT_CLIENTS:-10}
1919
- MIN_AVAILABLE_CLIENTS=${MIN_AVAILABLE_CLIENTS:-10}
2020
- ROUND_TIMEOUT_SECONDS=${ROUND_TIMEOUT_SECONDS:-600}
21+
- TPM_METRICS_ENDPOINT=${TPM_METRICS_ENDPOINT:-http://tpm-metrics:9091/event/attestation}
2122
- GEMINI_API_KEY=${GEMINI_API_KEY:-}
2223
ports:
2324
- "8000:8000" # Flask metrics API
@@ -34,6 +35,12 @@ services:
3435
retries: 3
3536
start_period: 10s
3637
restart: on-failure
38+
security_opt:
39+
- no-new-privileges:true
40+
cap_drop:
41+
- ALL
42+
tmpfs:
43+
- /tmp
3744
logging:
3845
driver: "json-file"
3946
options:
@@ -75,6 +82,12 @@ services:
7582
healthcheck:
7683
disable: true
7784
restart: unless-stopped
85+
security_opt:
86+
- no-new-privileges:true
87+
cap_drop:
88+
- ALL
89+
tmpfs:
90+
- /tmp
7891
logging:
7992
driver: "json-file"
8093
options:
@@ -216,6 +229,7 @@ services:
216229
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
217230
- ./tpm_alerts.yml:/etc/prometheus/tpm_alerts.yml:ro
218231
- ./dashboard_compat_rules.yml:/etc/prometheus/dashboard_compat_rules.yml:ro
232+
- ./fl_slo_alerts.yml:/etc/prometheus/fl_slo_alerts.yml:ro
219233
command:
220234
- '--config.file=/etc/prometheus/prometheus.yml'
221235
- '--storage.tsdb.path=/prometheus'
@@ -256,6 +270,48 @@ services:
256270
retries: 3
257271
start_period: 10s
258272
restart: unless-stopped
273+
security_opt:
274+
- no-new-privileges:true
275+
cap_drop:
276+
- ALL
277+
tmpfs:
278+
- /tmp
279+
logging:
280+
driver: "json-file"
281+
options:
282+
max-size: "10m"
283+
max-file: "3"
284+
285+
tokenomics-metrics:
286+
image: python:3.11-slim
287+
container_name: sovereign-tokenomics-metrics
288+
volumes:
289+
- ./tokenomics_metrics_exporter.py:/app/tokenomics_metrics_exporter.py:ro
290+
- ./test-data/tokenomics-telemetry.json:/app/data/tokenomics-telemetry.json:ro
291+
working_dir: /app
292+
command: ["sh", "-c", "python -m pip install --no-cache-dir prometheus-client flask && python tokenomics_metrics_exporter.py"]
293+
environment:
294+
- TOKENOMICS_SOURCE_FILE=/app/data/tokenomics-telemetry.json
295+
- TOKENOMICS_METRICS_PORT=9105
296+
ports:
297+
- "9105:9105"
298+
networks:
299+
- sovereign-network
300+
depends_on:
301+
- backend
302+
healthcheck:
303+
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:9105/health', timeout=3)"]
304+
interval: 15s
305+
timeout: 5s
306+
retries: 3
307+
start_period: 10s
308+
restart: unless-stopped
309+
security_opt:
310+
- no-new-privileges:true
311+
cap_drop:
312+
- ALL
313+
tmpfs:
314+
- /tmp
259315
logging:
260316
driver: "json-file"
261317
options:

0 commit comments

Comments
 (0)