Skip to content

Commit 658a0ca

Browse files
Merge pull request #920 from erikdarlingdata/feature/917-lite-finops-memory-recommendation
Fix #917 (Lite) — base FinOps memory recommendation on 7-day P95
2 parents 4e58e8a + 6ce9e37 commit 658a0ca

1 file changed

Lines changed: 82 additions & 22 deletions

File tree

Lite/Services/LocalDataService.FinOps.Recommendations.cs

Lines changed: 82 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -193,23 +193,55 @@ IF @sql <> N''
193193
// 3. Memory right-sizing score (from DuckDB)
194194
try
195195
{
196+
/* Use P95 of total_server_memory_mb (perfmon "Total Server Memory" — the
197+
full set of memory SQL has committed) over 7 days, not a single-sample
198+
snapshot of buffer_pool_mb (data cache only). The earlier version
199+
could fire right after a service restart or on servers where plan
200+
cache / workspace memory dominates, falsely showing "buffer pool 0%". */
196201
var util = await GetUtilizationEfficiencyAsync(serverId);
197202
if (util != null && util.PhysicalMemoryMb > 8192)
198203
{
199-
var bpRatio = util.PhysicalMemoryMb > 0 ? (decimal)util.BufferPoolMb / util.PhysicalMemoryMb : 0m;
200-
if (bpRatio < 0.50m)
204+
int p95Mb = 0;
205+
long sampleCount = 0;
206+
using (var conn = await OpenConnectionAsync())
207+
using (var cmd = conn.CreateCommand())
201208
{
202-
var targetMb = Math.Max(8192, util.BufferPoolMb * 2);
203-
recommendations.Add(new RecommendationRow
209+
cmd.CommandText = @"
210+
SELECT
211+
PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_server_memory_mb) AS p95_mb,
212+
COUNT(*) AS sample_count
213+
FROM v_memory_stats
214+
WHERE server_id = $1
215+
AND collection_time >= $2";
216+
cmd.Parameters.Add(new DuckDBParameter { Value = serverId });
217+
cmd.Parameters.Add(new DuckDBParameter { Value = DateTime.UtcNow.AddDays(-7) });
218+
219+
using var reader = await cmd.ExecuteReaderAsync();
220+
if (await reader.ReadAsync())
204221
{
205-
Category = "Memory",
206-
Severity = bpRatio < 0.30m ? "High" : "Medium",
207-
Confidence = "Medium",
208-
Finding = $"Memory over-provisioned (buffer pool uses {bpRatio:P0} of {util.PhysicalMemoryMb / 1024}GB RAM)",
209-
Detail = $"Buffer pool is {util.BufferPoolMb:N0} MB out of {util.PhysicalMemoryMb:N0} MB physical RAM ({bpRatio:P0} utilization). " +
210-
$"Consider reducing to ~{targetMb / 1024}GB.",
211-
EstMonthlySavings = monthlyCost > 0 ? monthlyCost * (1m - (decimal)targetMb / util.PhysicalMemoryMb) * 0.30m : null
212-
});
222+
p95Mb = reader.IsDBNull(0) ? 0 : Convert.ToInt32(reader.GetValue(0));
223+
sampleCount = reader.IsDBNull(1) ? 0L : ToInt64(reader.GetValue(1));
224+
}
225+
}
226+
227+
// Need at least ~1 day of samples (one per minute baseline) to trust the P95
228+
if (sampleCount >= 500)
229+
{
230+
var memRatio = (decimal)p95Mb / util.PhysicalMemoryMb;
231+
if (memRatio < 0.50m)
232+
{
233+
var targetMb = Math.Max(8192, p95Mb * 2);
234+
recommendations.Add(new RecommendationRow
235+
{
236+
Category = "Memory",
237+
Severity = memRatio < 0.30m ? "High" : "Medium",
238+
Confidence = "Medium",
239+
Finding = $"Memory over-provisioned (P95 SQL memory uses {memRatio:P0} of {util.PhysicalMemoryMb / 1024}GB RAM)",
240+
Detail = $"P95 SQL Server memory over 7 days is {p95Mb:N0} MB out of {util.PhysicalMemoryMb:N0} MB physical RAM ({memRatio:P0} utilization). " +
241+
$"Consider reducing to ~{targetMb / 1024}GB.",
242+
EstMonthlySavings = monthlyCost > 0 ? monthlyCost * (1m - (decimal)targetMb / util.PhysicalMemoryMb) * 0.30m : null
243+
});
244+
}
213245
}
214246
}
215247
}
@@ -439,11 +471,16 @@ ORDER BY times_ran_long DESC
439471
var vmUtil = await GetUtilizationEfficiencyAsync(serverId);
440472
if (vmUtil != null)
441473
{
442-
// CPU data comes from 24-hour window in GetUtilizationEfficiencyAsync.
443-
// For a 7-day P95 we query DuckDB directly.
474+
/* Memory side previously read util.BufferPoolMb (a single snapshot
475+
of perfmon "Database Cache Memory") — only the data-cache slice
476+
of the buffer pool, ignoring plan cache / workspace / locks /
477+
CLR — and could trigger right after a service restart. Now use
478+
7-day P95 of total_server_memory_mb from v_memory_stats, the
479+
same signal the Utilization tab shows. */
444480
decimal p95Cpu7d = vmUtil.P95CpuPct;
445481
int cpuCount = vmUtil.CpuCount;
446-
int bpMb = vmUtil.BufferPoolMb;
482+
int p95MemMb = 0;
483+
long memSampleCount = 0;
447484
int physMb = vmUtil.PhysicalMemoryMb;
448485

449486
// Try 7-day P95 from DuckDB for better accuracy
@@ -467,6 +504,29 @@ FROM v_cpu_utilization_stats
467504
}
468505
catch { /* fall back to 24-hour P95 */ }
469506

507+
try
508+
{
509+
using var memConn = await OpenConnectionAsync();
510+
using var memCmd = memConn.CreateCommand();
511+
memCmd.CommandText = @"
512+
SELECT
513+
PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_server_memory_mb) AS p95_mb,
514+
COUNT(*) AS sample_count
515+
FROM v_memory_stats
516+
WHERE server_id = $1
517+
AND collection_time >= $2";
518+
memCmd.Parameters.Add(new DuckDBParameter { Value = serverId });
519+
memCmd.Parameters.Add(new DuckDBParameter { Value = DateTime.UtcNow.AddDays(-7) });
520+
521+
using var memReader = await memCmd.ExecuteReaderAsync();
522+
if (await memReader.ReadAsync())
523+
{
524+
p95MemMb = memReader.IsDBNull(0) ? 0 : Convert.ToInt32(memReader.GetValue(0));
525+
memSampleCount = memReader.IsDBNull(1) ? 0L : ToInt64(memReader.GetValue(1));
526+
}
527+
}
528+
catch { /* if we cannot get 7-day P95 memory, skip the memory prescription */ }
529+
470530
// CPU prescription: only if >= 4 cores
471531
if (cpuCount >= 4)
472532
{
@@ -493,14 +553,14 @@ FROM v_cpu_utilization_stats
493553
}
494554
}
495555

496-
// Memory prescription: only if >= 4096 MB
497-
if (physMb >= 4096 && physMb > 0)
556+
// Memory prescription: needs >= 4 GB physical and at least ~1 day of samples
557+
if (physMb >= 4096 && physMb > 0 && memSampleCount >= 500)
498558
{
499-
var bpRatio = (decimal)bpMb / physMb;
559+
var memRatio = (decimal)p95MemMb / physMb;
500560
int targetMb = 0;
501-
if (bpRatio < 0.25m)
561+
if (memRatio < 0.25m)
502562
targetMb = Math.Max(4096, physMb / 4);
503-
else if (bpRatio < 0.40m)
563+
else if (memRatio < 0.40m)
504564
targetMb = Math.Max(4096, physMb / 2);
505565

506566
if (targetMb > 0 && targetMb < physMb)
@@ -510,8 +570,8 @@ FROM v_cpu_utilization_stats
510570
Category = "Hardware",
511571
Severity = "Medium",
512572
Confidence = "Medium",
513-
Finding = $"Memory: reduce from {physMb / 1024}GB to {targetMb / 1024}GB (buffer pool uses {bpRatio:P0})",
514-
Detail = $"Buffer pool is using {bpMb:N0} MB of {physMb:N0} MB physical RAM ({bpRatio:P0}). " +
573+
Finding = $"Memory: reduce from {physMb / 1024}GB to {targetMb / 1024}GB (P95 SQL memory uses {memRatio:P0})",
574+
Detail = $"P95 SQL Server memory over 7 days is {p95MemMb:N0} MB of {physMb:N0} MB physical RAM ({memRatio:P0}). " +
515575
$"Reducing to {targetMb / 1024}GB would still leave headroom.",
516576
EstMonthlySavings = monthlyCost > 0
517577
? monthlyCost * (1m - (decimal)targetMb / physMb) * 0.30m

0 commit comments

Comments
 (0)