Skip to content

Commit abb40ef

Browse files
authored
fix: reduce CUDA benchmark noise on codspeed (#7749)
- Remove noisy/synthetic benchmarks from codspeed CI:
  - throughput_cuda (pure memory bandwidth, not Vortex logic)
  - for_cuda (u8/u16 undersaturate the GPU)
  - filter_cuda + zstd_cuda (entire NVIDIA kernels shard, 10-25% cross-run swing)
- Only build benchmarks each shard needs
- Run only 100M-element input sizes on codspeed
- Increase warm_up_time from 1 ns to 500 ms
- Consistent benchmark naming: cuda/{encoding}/{params}/{size}, with size always the last path segment

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent c4feed7 commit abb40ef

12 files changed

Lines changed: 288 additions & 313 deletions

File tree

.github/workflows/codspeed.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,7 @@ jobs:
7373
include:
7474
- { shard: 1, name: "Bitpacked", benches: "bitpacked_cuda" }
7575
- { shard: 2, name: "Dynamic dispatch", benches: "dynamic_dispatch_cuda" }
76-
- { shard: 3, name: "Standalone kernels", benches: "alp_cuda date_time_parts_cuda dict_cuda for_cuda runend_cuda throughput_cuda" }
77-
- { shard: 4, name: "NVIDIA kernels", benches: "filter_cuda zstd_cuda" }
76+
- { shard: 3, name: "Standalone kernels", benches: "alp_cuda date_time_parts_cuda dict_cuda runend_cuda" }
7877
name: "Benchmark with Codspeed (CUDA Shard #${{ matrix.shard }} - ${{ matrix.name }})"
7978
timeout-minutes: 30
8079
runs-on: runs-on=${{ github.run_id }}/family=g5/image=ubuntu24-gpu-x64/tag=bench-codspeed-cuda-${{ matrix.shard }}
@@ -96,7 +95,7 @@ jobs:
9695
with:
9796
tool: cargo-codspeed
9897
- name: Build benchmarks
99-
run: cargo codspeed build -m walltime -p vortex-cuda --profile bench
98+
run: cargo codspeed build -m walltime $(printf -- '--bench %s ' ${{ matrix.benches }}) --profile bench
10099
- name: Run benchmarks
101100
uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2
102101
env:

vortex-cuda/benches/alp_cuda.rs

Lines changed: 33 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,9 @@ use vortex_cuda::executor::CudaArrayExt;
3838
use vortex_cuda_macros::cuda_available;
3939
use vortex_cuda_macros::cuda_not_available;
4040

41+
use crate::bench_config::BENCH_SIZES;
4142
use crate::timed_launch_strategy::TimedLaunchStrategy;
4243

43-
const N_ROWS: usize = 100_000_000;
44-
4544
/// Patch frequencies to benchmark (as fractions).
4645
const PATCH_FREQUENCIES: &[(f64, &str)] = &[(0.0, "0%"), (0.01, "1%"), (0.10, "10%")];
4746

@@ -93,35 +92,38 @@ fn benchmark_alp_decode_typed<T>(c: &mut Criterion, type_name: &str)
9392
where
9493
T: ALPFloat + NativePType + DeviceRepr,
9594
{
96-
let mut group = c.benchmark_group(format!("alp_cuda_{}", type_name));
97-
98-
let nbytes = N_ROWS * size_of::<T>();
99-
group.throughput(Throughput::Bytes(nbytes as u64));
100-
101-
for &(patch_freq, patch_label) in PATCH_FREQUENCIES {
102-
let array = make_alp_array::<T>(N_ROWS, patch_freq);
103-
104-
group.bench_with_input(
105-
BenchmarkId::new("alp_decode", patch_label),
106-
&array,
107-
|b, array| {
108-
b.iter_custom(|iters| {
109-
let timed = TimedLaunchStrategy::default();
110-
let timer = timed.timer();
111-
112-
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
113-
.vortex_expect("failed to create execution context")
114-
.with_dispatch_mode(CudaDispatchMode::StandaloneOnly)
115-
.with_launch_strategy(Arc::new(timed));
116-
117-
for _ in 0..iters {
118-
block_on(array.clone().into_array().execute_cuda(&mut cuda_ctx)).unwrap();
119-
}
120-
121-
Duration::from_nanos(timer.load(Ordering::Relaxed))
122-
});
123-
},
124-
);
95+
let mut group = c.benchmark_group(format!("cuda/alp_{}", type_name));
96+
97+
for &(len, len_str) in BENCH_SIZES {
98+
group.throughput(Throughput::Bytes((len * size_of::<T>()) as u64));
99+
100+
for &(patch_freq, patch_label) in PATCH_FREQUENCIES {
101+
let array = make_alp_array::<T>(len, patch_freq);
102+
103+
group.bench_with_input(
104+
BenchmarkId::new(patch_label, len_str),
105+
&array,
106+
|b, array| {
107+
b.iter_custom(|iters| {
108+
let timed = TimedLaunchStrategy::default();
109+
let timer = timed.timer();
110+
111+
let mut cuda_ctx =
112+
CudaSession::create_execution_ctx(&VortexSession::empty())
113+
.vortex_expect("failed to create execution context")
114+
.with_dispatch_mode(CudaDispatchMode::StandaloneOnly)
115+
.with_launch_strategy(Arc::new(timed));
116+
117+
for _ in 0..iters {
118+
block_on(array.clone().into_array().execute_cuda(&mut cuda_ctx))
119+
.unwrap();
120+
}
121+
122+
Duration::from_nanos(timer.load(Ordering::Relaxed))
123+
});
124+
},
125+
);
126+
}
125127
}
126128

127129
group.finish();

vortex-cuda/benches/bench_config/mod.rs

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,16 @@ use std::time::Duration;
55

66
use criterion::Criterion;
77

8+
/// Benchmark input sizes.
9+
///
10+
/// On codspeed, only the 100M variant runs — kernels under ~200 µs
11+
/// (i.e. the 10M cases) swing 15-45% across ephemeral GPU instances,
12+
/// drowning real regressions in noise. Locally both sizes run.
13+
#[cfg(not(codspeed))]
14+
pub const BENCH_SIZES: &[(usize, &str)] = &[(10_000_000, "10M"), (100_000_000, "100M")];
15+
#[cfg(codspeed)]
16+
pub const BENCH_SIZES: &[(usize, &str)] = &[(100_000_000, "100M")];
17+
818
/// Returns a [`Criterion`] configuration tuned for CUDA benchmarks.
919
///
1020
/// All benchmarks use `iter_custom` with precise CUDA event timing.
@@ -15,21 +25,21 @@ use criterion::Criterion;
1525
/// Stability comes from a high `sample_size` (many independent launches)
1626
/// rather than many iterations per sample.
1727
///
18-
/// `warm_up_time` runs at least one full iteration before sampling, giving
19-
/// the GPU a chance to reach steady state (clock boost, cache warming).
20-
/// If a single launch exceeds the warm-up budget, criterion still completes
21-
/// it before moving on.
28+
/// `warm_up_time` is set to 500 ms — long enough to JIT-compile PTX,
29+
/// boost GPU clocks, and warm caches, while keeping total runtime under
30+
/// 2 minutes even for the largest benchmark binary (~18 benchmarks).
31+
///
32+
/// `sample_size` is 10: with 100M inputs the kernels are long enough
33+
/// (>500 µs) that within-run variance is low. Cross-run stability
34+
/// comes from the large input size, not from averaging many samples.
2235
pub(super) fn cuda_bench_config() -> Criterion {
23-
// Number of independent kernel launches.
2436
let sample_size = 10;
2537

2638
Criterion::default()
2739
.without_plots()
2840
.sample_size(sample_size)
29-
// One ns is enough to JIT-compile kernels and warm GPU caches.
30-
// Criterion always finishes the in-flight iteration even if this
31-
// budget is exceeded.
32-
.warm_up_time(Duration::from_nanos(1))
41+
// Enough for PTX JIT, GPU clock boost, and cache warming.
42+
.warm_up_time(Duration::from_millis(500))
3343
// Forces `iters = 1`: criterion's planner estimates iteration cost
3444
// from wall time (which includes GPU context setup), not the
3545
// GPU-timed duration returned by `iter_custom`. A real

vortex-cuda/benches/bitpacked_cuda.rs

Lines changed: 57 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@ use vortex_cuda_macros::cuda_not_available;
4040

4141
use crate::timed_launch_strategy::TimedLaunchStrategy;
4242

43-
const N_ROWS: usize = 100_000_000;
44-
4543
/// Patch frequencies to benchmark (as fractions)
4644
const PATCH_FREQUENCIES: &[(f64, &str)] = &[(0.01, "1%"), (0.10, "10%")];
4745

@@ -112,61 +110,16 @@ where
112110
T: BitPacked + NativePType + DeviceRepr + Add<Output = T> + From<u8>,
113111
T::Physical: DeviceRepr,
114112
{
115-
let mut group = c.benchmark_group(format!("bitunpack_cuda_{}", type_name));
116-
117-
let array = make_bitpacked_array::<T>(bit_width, N_ROWS);
118-
let nbytes = N_ROWS * size_of::<T>();
119-
120-
group.throughput(Throughput::Bytes(nbytes as u64));
121-
122-
group.bench_with_input(
123-
BenchmarkId::new("bitunpack", format!("{}bw", bit_width)),
124-
&array,
125-
|b, array| {
126-
b.iter_custom(|iters| {
127-
let timed = TimedLaunchStrategy::default();
128-
let timer = timed.timer();
129-
130-
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
131-
.vortex_expect("failed to create execution context")
132-
.with_dispatch_mode(CudaDispatchMode::StandaloneOnly)
133-
.with_launch_strategy(Arc::new(timed));
134-
135-
for _ in 0..iters {
136-
block_on(array.clone().into_array().execute_cuda(&mut cuda_ctx)).unwrap();
137-
}
138-
139-
Duration::from_nanos(timer.load(Ordering::Relaxed))
140-
});
141-
},
142-
);
143-
144-
group.finish();
145-
}
146-
147-
fn benchmark_bitunpack(c: &mut Criterion) {
148-
benchmark_bitunpack_typed::<u8>(c, 3, "u8");
149-
benchmark_bitunpack_typed::<u16>(c, 5, "u16");
150-
benchmark_bitunpack_typed::<u32>(c, 6, "u32");
151-
benchmark_bitunpack_typed::<u64>(c, 8, "u64");
152-
}
153-
154-
/// Benchmark function for unpacking with patches at various frequencies
155-
fn benchmark_bitunpack_with_patches_typed<T>(c: &mut Criterion, type_name: &str)
156-
where
157-
T: BitPacked + NativePType + DeviceRepr + Add<Output = T> + From<u8>,
158-
T::Physical: DeviceRepr,
159-
{
160-
let mut group = c.benchmark_group(format!("bitunpack_cuda_patched_{}", type_name));
113+
let mut group = c.benchmark_group(format!("cuda/bitpacked_{}", type_name));
161114

162-
let nbytes = N_ROWS * size_of::<T>();
163-
group.throughput(Throughput::Bytes(nbytes as u64));
115+
for &(n_rows, size_str) in bench_config::BENCH_SIZES {
116+
let array = make_bitpacked_array::<T>(bit_width, n_rows);
117+
let nbytes = n_rows * size_of::<T>();
164118

165-
for &(patch_freq, patch_label) in PATCH_FREQUENCIES {
166-
let array = make_bitpacked_array_with_patches::<T>(N_ROWS, patch_freq);
119+
group.throughput(Throughput::Bytes(nbytes as u64));
167120

168121
group.bench_with_input(
169-
BenchmarkId::new("bitunpack_patched", patch_label),
122+
BenchmarkId::new(format!("unpack/{}bw", bit_width), size_str),
170123
&array,
171124
|b, array| {
172125
b.iter_custom(|iters| {
@@ -191,6 +144,57 @@ where
191144
group.finish();
192145
}
193146

147+
fn benchmark_bitunpack(c: &mut Criterion) {
148+
benchmark_bitunpack_typed::<u8>(c, 3, "u8");
149+
benchmark_bitunpack_typed::<u16>(c, 5, "u16");
150+
benchmark_bitunpack_typed::<u32>(c, 6, "u32");
151+
benchmark_bitunpack_typed::<u64>(c, 8, "u64");
152+
}
153+
154+
/// Benchmark function for unpacking with patches at various frequencies
155+
fn benchmark_bitunpack_with_patches_typed<T>(c: &mut Criterion, type_name: &str)
156+
where
157+
T: BitPacked + NativePType + DeviceRepr + Add<Output = T> + From<u8>,
158+
T::Physical: DeviceRepr,
159+
{
160+
let mut group = c.benchmark_group(format!("cuda/bitpacked_patched_{}", type_name));
161+
162+
for &(n_rows, size_str) in bench_config::BENCH_SIZES {
163+
let nbytes = n_rows * size_of::<T>();
164+
group.throughput(Throughput::Bytes(nbytes as u64));
165+
166+
for &(patch_freq, patch_label) in PATCH_FREQUENCIES {
167+
let array = make_bitpacked_array_with_patches::<T>(n_rows, patch_freq);
168+
169+
group.bench_with_input(
170+
BenchmarkId::new(format!("unpack/{}", patch_label), size_str),
171+
&array,
172+
|b, array| {
173+
b.iter_custom(|iters| {
174+
let timed = TimedLaunchStrategy::default();
175+
let timer = timed.timer();
176+
177+
let mut cuda_ctx =
178+
CudaSession::create_execution_ctx(&VortexSession::empty())
179+
.vortex_expect("failed to create execution context")
180+
.with_dispatch_mode(CudaDispatchMode::StandaloneOnly)
181+
.with_launch_strategy(Arc::new(timed));
182+
183+
for _ in 0..iters {
184+
block_on(array.clone().into_array().execute_cuda(&mut cuda_ctx))
185+
.unwrap();
186+
}
187+
188+
Duration::from_nanos(timer.load(Ordering::Relaxed))
189+
});
190+
},
191+
);
192+
}
193+
}
194+
195+
group.finish();
196+
}
197+
194198
fn benchmark_bitunpack_with_patches(c: &mut Criterion) {
195199
benchmark_bitunpack_with_patches_typed::<u8>(c, "u8");
196200
benchmark_bitunpack_with_patches_typed::<u16>(c, "u16");

vortex-cuda/benches/date_time_parts_cuda.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,16 @@ fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArr
5252
}
5353

5454
fn benchmark_datetimeparts(c: &mut Criterion) {
55-
let mut group = c.benchmark_group("datetimeparts_cuda");
55+
let mut group = c.benchmark_group("cuda/datetimeparts");
5656

57-
for (len, len_str) in [(10_000_000usize, "10M"), (100_000_000usize, "100M")] {
57+
for &(len, len_str) in bench_config::BENCH_SIZES {
5858
group.throughput(Throughput::Bytes((len * size_of::<i64>()) as u64));
5959

6060
let (time_unit, unit_str) = (TimeUnit::Milliseconds, "ms");
6161
let dtp_array = make_datetimeparts_array(len, time_unit);
6262

6363
group.bench_with_input(
64-
BenchmarkId::new("datetimeparts", format!("{len_str}_{unit_str}")),
64+
BenchmarkId::new(unit_str, len_str),
6565
&dtp_array,
6666
|b, dtp_array| {
6767
b.iter_custom(|iters| {

vortex-cuda/benches/dict_cuda.rs

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,9 @@ use vortex_cuda::executor::CudaArrayExt;
3434
use vortex_cuda_macros::cuda_available;
3535
use vortex_cuda_macros::cuda_not_available;
3636

37+
use crate::bench_config::BENCH_SIZES;
3738
use crate::timed_launch_strategy::TimedLaunchStrategy;
3839

39-
const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")];
40-
4140
/// Configuration for a dictionary benchmark specifying value and code types along with dictionary size.
4241
struct DictBenchConfig {
4342
dict_size: usize,
@@ -75,21 +74,21 @@ where
7574
C: NativePType + DeviceRepr + TryFrom<usize>,
7675
<C as TryFrom<usize>>::Error: Debug,
7776
{
78-
let mut group = c.benchmark_group("dict_cuda");
77+
let mut group = c.benchmark_group("cuda/dict");
7978

80-
for (len, len_str) in BENCH_ARGS {
79+
for (len, len_str) in BENCH_SIZES {
8180
// Throughput is based on output size (values read from dictionary)
8281
group.throughput(Throughput::Bytes((len * size_of::<V>()) as u64));
8382

8483
let dict_array = make_dict_array_typed::<V, C>(*len, config.dict_size);
8584

8685
group.bench_with_input(
8786
BenchmarkId::new(
88-
"dict",
8987
format!(
90-
"{len_str}_{}_values_{}_codes",
88+
"{}_values_{}_codes",
9189
config.value_type_name, config.code_type_name
9290
),
91+
len_str,
9392
),
9493
&dict_array,
9594
|b, dict_array| {

0 commit comments

Comments (0)