vortex-data
diff --git a/vortex-cuda/benches/alp_cuda.rs b/vortex-cuda/benches/alp_cuda.rs
Lines changed: 2 additions & 2 deletions
diff --git a/vortex-cuda/benches/bench_config/mod.rs b/vortex-cuda/benches/bench_config/mod.rs
Lines changed: 2 additions & 6 deletions
diff --git a/vortex-cuda/benches/bitpacked_cuda.rs b/vortex-cuda/benches/bitpacked_cuda.rs
Lines changed: 13 additions & 4 deletions
diff --git a/vortex-cuda/benches/date_time_parts_cuda.rs b/vortex-cuda/benches/date_time_parts_cuda.rs
Lines changed: 2 additions & 2 deletions
diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs
Lines changed: 2 additions & 2 deletions
@@ -92,7 +92,7 @@ fn benchmark_alp_decode_typed<T>(c: &mut Criterion, type_name: &str)
 where
     T: ALPFloat + NativePType + DeviceRepr,
 {
-    let mut group = c.benchmark_group(format!("cuda/alp_{}", type_name));
+    let mut group = c.benchmark_group("cuda");
 
     for &(len, len_str) in BENCH_SIZES {
         group.throughput(Throughput::Bytes((len * size_of::<T>()) as u64));
@@ -101,7 +101,7 @@ where
             let array = make_alp_array::<T>(len, patch_freq);
 
             group.bench_with_input(
-                BenchmarkId::new(patch_label, len_str),
+                BenchmarkId::new(format!("cuda/alp_{}/{}", type_name, patch_label), len_str),
                 &array,
                 |b, array| {
                     b.iter_custom(|iters| {
 
@@ -7,12 +7,8 @@ use criterion::Criterion;
 
 /// Benchmark input sizes.
 ///
-/// On codspeed, only the 100M variant runs — kernels under ~200 µs
-/// (i.e. the 10M cases) swing 15-45% across ephemeral GPU instances,
-/// drowning real regressions in noise. Locally both sizes run.
-#[cfg(not(codspeed))]
-pub const BENCH_SIZES: &[(usize, &str)] = &[(10_000_000, "10M"), (100_000_000, "100M")];
-#[cfg(codspeed)]
+/// Only the 100M-element size is run: it keeps every kernel above ~500 µs, well
+/// above the ~15 µs CUDA driver noise floor that caused 15-45% swings at 10M.
 pub const BENCH_SIZES: &[(usize, &str)] = &[(100_000_000, "100M")];
 
 /// Returns a [`Criterion`] configuration tuned for CUDA benchmarks.
 
@@ -110,7 +110,7 @@ where
     T: BitPacked + NativePType + DeviceRepr + Add<Output = T> + From<u8>,
     T::Physical: DeviceRepr,
 {
-    let mut group = c.benchmark_group(format!("cuda/bitpacked_{}", type_name));
+    let mut group = c.benchmark_group("cuda");
 
     for &(n_rows, size_str) in bench_config::BENCH_SIZES {
         let array = make_bitpacked_array::<T>(bit_width, n_rows);
@@ -119,7 +119,10 @@ where
         group.throughput(Throughput::Bytes(nbytes as u64));
 
         group.bench_with_input(
-            BenchmarkId::new(format!("unpack/{}bw", bit_width), size_str),
+            BenchmarkId::new(
+                format!("cuda/bitpacked_{}/unpack/{}bw", type_name, bit_width),
+                size_str,
+            ),
             &array,
             |b, array| {
                 b.iter_custom(|iters| {
@@ -157,7 +160,7 @@ where
     T: BitPacked + NativePType + DeviceRepr + Add<Output = T> + From<u8>,
     T::Physical: DeviceRepr,
 {
-    let mut group = c.benchmark_group(format!("cuda/bitpacked_patched_{}", type_name));
+    let mut group = c.benchmark_group("cuda");
 
     for &(n_rows, size_str) in bench_config::BENCH_SIZES {
         let nbytes = n_rows * size_of::<T>();
@@ -167,7 +170,13 @@ where
             let array = make_bitpacked_array_with_patches::<T>(n_rows, patch_freq);
 
             group.bench_with_input(
-                BenchmarkId::new(format!("unpack/{}", patch_label), size_str),
+                BenchmarkId::new(
+                    format!(
+                        "cuda/bitpacked_patched_{}/unpack/{}",
+                        type_name, patch_label
+                    ),
+                    size_str,
+                ),
                 &array,
                 |b, array| {
                     b.iter_custom(|iters| {
 
@@ -52,7 +52,7 @@ fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArr
 }
 
 fn benchmark_datetimeparts(c: &mut Criterion) {
-    let mut group = c.benchmark_group("cuda/datetimeparts");
+    let mut group = c.benchmark_group("cuda");
 
     for &(len, len_str) in bench_config::BENCH_SIZES {
         group.throughput(Throughput::Bytes((len * size_of::<i64>()) as u64));
@@ -61,7 +61,7 @@ fn benchmark_datetimeparts(c: &mut Criterion) {
         let dtp_array = make_datetimeparts_array(len, time_unit);
 
         group.bench_with_input(
-            BenchmarkId::new(unit_str, len_str),
+            BenchmarkId::new(format!("cuda/datetimeparts/{unit_str}"), len_str),
             &dtp_array,
             |b, dtp_array| {
                 b.iter_custom(|iters| {
 
@@ -74,7 +74,7 @@ where
     C: NativePType + DeviceRepr + TryFrom<usize>,
     <C as TryFrom<usize>>::Error: Debug,
 {
-    let mut group = c.benchmark_group("cuda/dict");
+    let mut group = c.benchmark_group("cuda");
 
     for (len, len_str) in BENCH_SIZES {
         // Throughput is based on output size (values read from dictionary)
@@ -85,7 +85,7 @@ where
         group.bench_with_input(
             BenchmarkId::new(
                 format!(
-                    "{}_values_{}_codes",
+                    "cuda/dict/{}_values_{}_codes",
                     config.value_type_name, config.code_type_name
                 ),
                 len_str,