Skip to content

Commit e7ff913

Browse files
committed
fix: address remaining code review issues in metrics
- Fix defer in loop: close response body immediately after decode
- Fix memory leak: cleanup stale CPU stats for deleted clones
- Fix CPU rollover: detect counter reset when container restarts
- Remove redundant prevCPUStatsMu mutex (already protected by c.mu)
- Remove unused DatasetInfo metric
1 parent a51c6fe commit e7ff913

3 files changed

Lines changed: 33 additions & 31 deletions

File tree

PROMETHEUS.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ The endpoint is publicly accessible (no authentication required) and returns met
7575
|-------------|------|--------|-------------|
7676
| `dblab_datasets_total` | Gauge | `pool` | Total number of datasets (slots) in the pool |
7777
| `dblab_datasets_available` | Gauge | `pool` | Number of available (non-busy) dataset slots for reuse |
78-
| `dblab_dataset_info` | Gauge | `pool`, `dataset_name` | Information about a dataset (1=busy, 0=available) |
7978

8079
## Prometheus Configuration
8180

engine/internal/srv/metrics/collector.go

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,15 @@ type containerCPUState struct {
3232

3333
// Collector collects metrics from DBLab components.
3434
type Collector struct {
35-
mu sync.Mutex
36-
metrics *Metrics
37-
cloning *cloning.Base
38-
retrieval *retrieval.Retrieval
39-
pm *pool.Manager
40-
engProps *global.EngineProps
41-
dockerClient *client.Client
42-
startedAt time.Time
43-
prevCPUStats map[string]containerCPUState
44-
prevCPUStatsMu sync.RWMutex
35+
mu sync.Mutex
36+
metrics *Metrics
37+
cloning *cloning.Base
38+
retrieval *retrieval.Retrieval
39+
pm *pool.Manager
40+
engProps *global.EngineProps
41+
dockerClient *client.Client
42+
startedAt time.Time
43+
prevCPUStats map[string]containerCPUState
4544
}
4645

4746
// NewCollector creates a new metrics collector.
@@ -221,22 +220,27 @@ func (c *Collector) getContainerStats(ctx context.Context, clones []*models.Clon
221220
return result
222221
}
223222

223+
activeCloneIDs := make(map[string]struct{})
224+
224225
for _, clone := range clones {
225226
if clone == nil || clone.Status.Code != models.StatusOK {
226227
continue
227228
}
228229

230+
activeCloneIDs[clone.ID] = struct{}{}
231+
229232
stats, err := c.dockerClient.ContainerStatsOneShot(ctx, clone.ID)
230233
if err != nil {
231234
log.Dbg(fmt.Sprintf("failed to get container stats for clone %s: %v", clone.ID, err))
232235
continue
233236
}
234237

235-
defer stats.Body.Close()
236-
237238
var statsJSON container.StatsResponse
238-
if err := json.NewDecoder(stats.Body).Decode(&statsJSON); err != nil {
239-
log.Dbg(fmt.Sprintf("failed to decode container stats for clone %s: %v", clone.ID, err))
239+
decodeErr := json.NewDecoder(stats.Body).Decode(&statsJSON)
240+
stats.Body.Close()
241+
242+
if decodeErr != nil {
243+
log.Dbg(fmt.Sprintf("failed to decode container stats for clone %s: %v", clone.ID, decodeErr))
240244
continue
241245
}
242246

@@ -251,6 +255,8 @@ func (c *Collector) getContainerStats(ctx context.Context, clones []*models.Clon
251255
}
252256
}
253257

258+
c.cleanupStaleCPUStats(activeCloneIDs)
259+
254260
return result
255261
}
256262

@@ -261,17 +267,13 @@ func (c *Collector) calculateCPUPercent(cloneID string, stats *container.StatsRe
261267
currentSystemUsage := stats.CPUStats.SystemUsage
262268
now := time.Now()
263269

264-
c.prevCPUStatsMu.RLock()
265270
prevStats, hasPrev := c.prevCPUStats[cloneID]
266-
c.prevCPUStatsMu.RUnlock()
267271

268-
c.prevCPUStatsMu.Lock()
269272
c.prevCPUStats[cloneID] = containerCPUState{
270273
totalUsage: currentTotalUsage,
271274
systemUsage: currentSystemUsage,
272275
timestamp: now,
273276
}
274-
c.prevCPUStatsMu.Unlock()
275277

276278
if !hasPrev {
277279
return 0
@@ -282,10 +284,14 @@ func (c *Collector) calculateCPUPercent(cloneID string, stats *container.StatsRe
282284
return 0
283285
}
284286

287+
if currentTotalUsage < prevStats.totalUsage || currentSystemUsage < prevStats.systemUsage {
288+
return 0
289+
}
290+
285291
cpuDelta := float64(currentTotalUsage - prevStats.totalUsage)
286292
systemDelta := float64(currentSystemUsage - prevStats.systemUsage)
287293

288-
if systemDelta <= 0 || cpuDelta < 0 {
294+
if systemDelta <= 0 {
289295
return 0
290296
}
291297

@@ -301,6 +307,14 @@ func (c *Collector) calculateCPUPercent(cloneID string, stats *container.StatsRe
301307
return (cpuDelta / systemDelta) * cpuCount * 100.0
302308
}
303309

310+
func (c *Collector) cleanupStaleCPUStats(activeCloneIDs map[string]struct{}) {
311+
for cloneID := range c.prevCPUStats {
312+
if _, ok := activeCloneIDs[cloneID]; !ok {
313+
delete(c.prevCPUStats, cloneID)
314+
}
315+
}
316+
}
317+
304318
func (c *Collector) collectSnapshotMetrics() {
305319
snapshots, err := c.cloning.GetSnapshots()
306320
if err != nil {

engine/internal/srv/metrics/metrics.go

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ type Metrics struct {
6060
// dataset metrics (for logical mode - non-busy slots)
6161
DatasetsTotal *prometheus.GaugeVec
6262
DatasetsAvailable *prometheus.GaugeVec
63-
DatasetInfo *prometheus.GaugeVec
6463
}
6564

6665
// NewMetrics creates a new Metrics instance with all Prometheus metrics.
@@ -342,14 +341,6 @@ func NewMetrics() *Metrics {
342341
},
343342
[]string{"pool"},
344343
),
345-
DatasetInfo: prometheus.NewGaugeVec(
346-
prometheus.GaugeOpts{
347-
Namespace: namespace,
348-
Name: "dataset_info",
349-
Help: "Information about a dataset (1=busy, 0=available)",
350-
},
351-
[]string{"pool", "dataset_name"},
352-
),
353344
}
354345
}
355346

@@ -390,7 +381,6 @@ func (m *Metrics) Register(reg prometheus.Registerer) error {
390381
m.BranchInfo,
391382
m.DatasetsTotal,
392383
m.DatasetsAvailable,
393-
m.DatasetInfo,
394384
}
395385

396386
for _, c := range collectors {
@@ -431,5 +421,4 @@ func (m *Metrics) Reset() {
431421
m.BranchInfo.Reset()
432422
m.DatasetsTotal.Reset()
433423
m.DatasetsAvailable.Reset()
434-
m.DatasetInfo.Reset()
435424
}

0 commit comments

Comments (0)