Skip to content

Commit abb40ef

Browse files
authored
fix: reduce CUDA benchmark noise on codspeed (#7749)
- Remove noisy/synthetic benchmarks from codspeed CI:
  - throughput_cuda (pure memory bandwidth, not Vortex logic)
  - for_cuda (u8/u16 undersaturate the GPU)
  - filter_cuda + zstd_cuda (entire NVIDIA kernels shard, 10-25% cross-run swing)
- Only build benchmarks each shard needs
- Run only 100M-element input sizes on codspeed
- Increase warm_up_time from 1 ns to 500 ms
- Consistent benchmark naming: cuda/{encoding}/{params}/{size}, with size always the last path segment

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent c4feed7 commit abb40ef

12 files changed

Lines changed: 288 additions & 313 deletions

File tree

.github/workflows/codspeed.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,7 @@ jobs:
7373
include:
7474
- { shard: 1, name: "Bitpacked", benches: "bitpacked_cuda" }
7575
- { shard: 2, name: "Dynamic dispatch", benches: "dynamic_dispatch_cuda" }
76-
- { shard: 3, name: "Standalone kernels", benches: "alp_cuda date_time_parts_cuda dict_cuda for_cuda runend_cuda throughput_cuda" }
77-
- { shard: 4, name: "NVIDIA kernels", benches: "filter_cuda zstd_cuda" }
76+
- { shard: 3, name: "Standalone kernels", benches: "alp_cuda date_time_parts_cuda dict_cuda runend_cuda" }
7877
name: "Benchmark with Codspeed (CUDA Shard #${{ matrix.shard }} - ${{ matrix.name }})"
7978
timeout-minutes: 30
8079
runs-on: runs-on=${{ github.run_id }}/family=g5/image=ubuntu24-gpu-x64/tag=bench-codspeed-cuda-${{ matrix.shard }}
@@ -96,7 +95,7 @@ jobs:
9695
with:
9796
tool: cargo-codspeed
9897
- name: Build benchmarks
99-
run: cargo codspeed build -m walltime -p vortex-cuda --profile bench
98+
run: cargo codspeed build -m walltime $(printf -- '--bench %s ' ${{ matrix.benches }}) --profile bench
10099
- name: Run benchmarks
101100
uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2
102101
env:

vortex-cuda/benches/alp_cuda.rs

Lines changed: 33 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,9 @@ use vortex_cuda::executor::CudaArrayExt;
3838
use vortex_cuda_macros::cuda_available;
3939
use vortex_cuda_macros::cuda_not_available;
4040

41+
use crate::bench_config::BENCH_SIZES;
4142
use crate::timed_launch_strategy::TimedLaunchStrategy;
4243

43-
const N_ROWS: usize = 100_000_000;
44-
4544
/// Patch frequencies to benchmark (as fractions).
4645
const PATCH_FREQUENCIES: &[(f64, &str)] = &[(0.0, "0%"), (0.01, "1%"), (0.10, "10%")];
4746

@@ -93,35 +92,38 @@ fn benchmark_alp_decode_typed<T>(c: &mut Criterion, type_name: &str)
9392
where
9493
T: ALPFloat + NativePType + DeviceRepr,
9594
{
96-
let mut group = c.benchmark_group(format!("alp_cuda_{}", type_name));
97-
98-
let nbytes = N_ROWS * size_of::<T>();
99-
group.throughput(Throughput::Bytes(nbytes as u64));
100-
101-
for &(patch_freq, patch_label) in PATCH_FREQUENCIES {
102-
let array = make_alp_array::<T>(N_ROWS, patch_freq);
103-
104-
group.bench_with_input(
105-
BenchmarkId::new("alp_decode", patch_label),
106-
&array,
107-
|b, array| {
108-
b.iter_custom(|iters| {
109-
let timed = TimedLaunchStrategy::default();
110-
let timer = timed.timer();
111-
112-
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
113-
.vortex_expect("failed to create execution context")
114-
.with_dispatch_mode(CudaDispatchMode::StandaloneOnly)
115-
.with_launch_strategy(Arc::new(timed));
116-
117-
for _ in 0..iters {
118-
block_on(array.clone().into_array().execute_cuda(&mut cuda_ctx)).unwrap();
119-
}
120-
121-
Duration::from_nanos(timer.load(Ordering::Relaxed))
122-
});
123-
},
124-
);
95+
let mut group = c.benchmark_group(format!("cuda/alp_{}", type_name));
96+
97+
for &(len, len_str) in BENCH_SIZES {
98+
group.throughput(Throughput::Bytes((len * size_of::<T>()) as u64));
99+
100+
for &(patch_freq, patch_label) in PATCH_FREQUENCIES {
101+
let array = make_alp_array::<T>(len, patch_freq);
102+
103+
group.bench_with_input(
104+
BenchmarkId::new(patch_label, len_str),
105+
&array,
106+
|b, array| {
107+
b.iter_custom(|iters| {
108+
let timed = TimedLaunchStrategy::default();
109+
let timer = timed.timer();
110+
111+
let mut cuda_ctx =
112+
CudaSession::create_execution_ctx(&VortexSession::empty())
113+
.vortex_expect("failed to create execution context")
114+
.with_dispatch_mode(CudaDispatchMode::StandaloneOnly)
115+
.with_launch_strategy(Arc::new(timed));
116+
117+
for _ in 0..iters {
118+
block_on(array.clone().into_array().execute_cuda(&mut cuda_ctx))
119+
.unwrap();
120+
}
121+
122+
Duration::from_nanos(timer.load(Ordering::Relaxed))
123+
});
124+
},
125+
);
126+
}
125127
}
126128

127129
group.finish();

vortex-cuda/benches/bench_config/mod.rs

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,16 @@ use std::time::Duration;
55

66
use criterion::Criterion;
77

8+
/// Benchmark input sizes.
9+
///
10+
/// On codspeed, only the 100M variant runs — kernels under ~200 µs
11+
/// (i.e. the 10M cases) swing 15-45% across ephemeral GPU instances,
12+
/// drowning real regressions in noise. Locally both sizes run.
13+
#[cfg(not(codspeed))]
14+
pub const BENCH_SIZES: &[(usize, &str)] = &[(10_000_000, "10M"), (100_000_000, "100M")];
15+
#[cfg(codspeed)]
16+
pub const BENCH_SIZES: &[(usize, &str)] = &[(100_000_000, "100M")];
17+
818
/// Returns a [`Criterion`] configuration tuned for CUDA benchmarks.
919
///
1020
/// All benchmarks use `iter_custom` with precise CUDA event timing.
@@ -15,21 +25,21 @@ use criterion::Criterion;
1525
/// Stability comes from a high `sample_size` (many independent launches)
1626
/// rather than many iterations per sample.
1727
///
18-
/// `warm_up_time` runs at least one full iteration before sampling, giving
19-
/// the GPU a chance to reach steady state (clock boost, cache warming).
20-
/// If a single launch exceeds the warm-up budget, criterion still completes
21-
/// it before moving on.
28+
/// `warm_up_time` is set to 500 ms — long enough to JIT-compile PTX,
29+
/// boost GPU clocks, and warm caches, while keeping total runtime under
30+
/// 2 minutes even for the largest benchmark binary (~18 benchmarks).
31+
///
32+
/// `sample_size` is 10: with 100M inputs the kernels are long enough
33+
/// (>500 µs) that within-run variance is low. Cross-run stability
34+
/// comes from the large input size, not from averaging many samples.
2235
pub(super) fn cuda_bench_config() -> Criterion {
23-
// Number of independent kernel launches.
2436
let sample_size = 10;
2537

2638
Criterion::default()
2739
.without_plots()
2840
.sample_size(sample_size)
29-
// One ns is enough to JIT-compile kernels and warm GPU caches.
30-
// Criterion always finishes the in-flight iteration even if this
31-
// budget is exceeded.
32-
.warm_up_time(Duration::from_nanos(1))
41+
// Enough for PTX JIT, GPU clock boost, and cache warming.
42+
.warm_up_time(Duration::from_millis(500))
3343
// Forces `iters = 1`: criterion's planner estimates iteration cost
3444
// from wall time (which includes GPU context setup), not the
3545
// GPU-timed duration returned by `iter_custom`. A real

vortex-cuda/benches/bitpacked_cuda.rs

Lines changed: 57 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@ use vortex_cuda_macros::cuda_not_available;
4040

4141
use crate::timed_launch_strategy::TimedLaunchStrategy;
4242

43-
const N_ROWS: usize = 100_000_000;
44-
4543
/// Patch frequencies to benchmark (as fractions)
4644
const PATCH_FREQUENCIES: &[(f64, &str)] = &[(0.01, "1%"), (0.10, "10%")];
4745

@@ -112,61 +110,16 @@ where
112110
T: BitPacked + NativePType + DeviceRepr + Add<Output = T> + From<u8>,
113111
T::Physical: DeviceRepr,
114112
{
115-
let mut group = c.benchmark_group(format!("bitunpack_cuda_{}", type_name));
116-
117-
let array = make_bitpacked_array::<T>(bit_width, N_ROWS);
118-
let nbytes = N_ROWS * size_of::<T>();
119-
120-
group.throughput(Throughput::Bytes(nbytes as u64));
121-
122-
group.bench_with_input(
123-
BenchmarkId::new("bitunpack", format!("{}bw", bit_width)),
124-
&array,
125-
|b, array| {
126-
b.iter_custom(|iters| {
127-
let timed = TimedLaunchStrategy::default();
128-
let timer = timed.timer();
129-
130-
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
131-
.vortex_expect("failed to create execution context")
132-
.with_dispatch_mode(CudaDispatchMode::StandaloneOnly)
133-
.with_launch_strategy(Arc::new(timed));
134-
135-
for _ in 0..iters {
136-
block_on(array.clone().into_array().execute_cuda(&mut cuda_ctx)).unwrap();
137-
}
138-
139-
Duration::from_nanos(timer.load(Ordering::Relaxed))
140-
});
141-
},
142-
);
143-
144-
group.finish();
145-
}
146-
147-
fn benchmark_bitunpack(c: &mut Criterion) {
148-
benchmark_bitunpack_typed::<u8>(c, 3, "u8");
149-
benchmark_bitunpack_typed::<u16>(c, 5, "u16");
150-
benchmark_bitunpack_typed::<u32>(c, 6, "u32");
151-
benchmark_bitunpack_typed::<u64>(c, 8, "u64");
152-
}
153-
154-
/// Benchmark function for unpacking with patches at various frequencies
155-
fn benchmark_bitunpack_with_patches_typed<T>(c: &mut Criterion, type_name: &str)
156-
where
157-
T: BitPacked + NativePType + DeviceRepr + Add<Output = T> + From<u8>,
158-
T::Physical: DeviceRepr,
159-
{
160-
let mut group = c.benchmark_group(format!("bitunpack_cuda_patched_{}", type_name));
113+
let mut group = c.benchmark_group(format!("cuda/bitpacked_{}", type_name));
161114

162-
let nbytes = N_ROWS * size_of::<T>();
163-
group.throughput(Throughput::Bytes(nbytes as u64));
115+
for &(n_rows, size_str) in bench_config::BENCH_SIZES {
116+
let array = make_bitpacked_array::<T>(bit_width, n_rows);
117+
let nbytes = n_rows * size_of::<T>();
164118

165-
for &(patch_freq, patch_label) in PATCH_FREQUENCIES {
166-
let array = make_bitpacked_array_with_patches::<T>(N_ROWS, patch_freq);
119+
group.throughput(Throughput::Bytes(nbytes as u64));
167120

168121
group.bench_with_input(
169-
BenchmarkId::new("bitunpack_patched", patch_label),
122+
BenchmarkId::new(format!("unpack/{}bw", bit_width), size_str),
170123
&array,
171124
|b, array| {
172125
b.iter_custom(|iters| {
@@ -191,6 +144,57 @@ where
191144
group.finish();
192145
}
193146

147+
fn benchmark_bitunpack(c: &mut Criterion) {
148+
benchmark_bitunpack_typed::<u8>(c, 3, "u8");
149+
benchmark_bitunpack_typed::<u16>(c, 5, "u16");
150+
benchmark_bitunpack_typed::<u32>(c, 6, "u32");
151+
benchmark_bitunpack_typed::<u64>(c, 8, "u64");
152+
}
153+
154+
/// Benchmark function for unpacking with patches at various frequencies
155+
fn benchmark_bitunpack_with_patches_typed<T>(c: &mut Criterion, type_name: &str)
156+
where
157+
T: BitPacked + NativePType + DeviceRepr + Add<Output = T> + From<u8>,
158+
T::Physical: DeviceRepr,
159+
{
160+
let mut group = c.benchmark_group(format!("cuda/bitpacked_patched_{}", type_name));
161+
162+
for &(n_rows, size_str) in bench_config::BENCH_SIZES {
163+
let nbytes = n_rows * size_of::<T>();
164+
group.throughput(Throughput::Bytes(nbytes as u64));
165+
166+
for &(patch_freq, patch_label) in PATCH_FREQUENCIES {
167+
let array = make_bitpacked_array_with_patches::<T>(n_rows, patch_freq);
168+
169+
group.bench_with_input(
170+
BenchmarkId::new(format!("unpack/{}", patch_label), size_str),
171+
&array,
172+
|b, array| {
173+
b.iter_custom(|iters| {
174+
let timed = TimedLaunchStrategy::default();
175+
let timer = timed.timer();
176+
177+
let mut cuda_ctx =
178+
CudaSession::create_execution_ctx(&VortexSession::empty())
179+
.vortex_expect("failed to create execution context")
180+
.with_dispatch_mode(CudaDispatchMode::StandaloneOnly)
181+
.with_launch_strategy(Arc::new(timed));
182+
183+
for _ in 0..iters {
184+
block_on(array.clone().into_array().execute_cuda(&mut cuda_ctx))
185+
.unwrap();
186+
}
187+
188+
Duration::from_nanos(timer.load(Ordering::Relaxed))
189+
});
190+
},
191+
);
192+
}
193+
}
194+
195+
group.finish();
196+
}
197+
194198
fn benchmark_bitunpack_with_patches(c: &mut Criterion) {
195199
benchmark_bitunpack_with_patches_typed::<u8>(c, "u8");
196200
benchmark_bitunpack_with_patches_typed::<u16>(c, "u16");

vortex-cuda/benches/date_time_parts_cuda.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,16 @@ fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArr
5252
}
5353

5454
fn benchmark_datetimeparts(c: &mut Criterion) {
55-
let mut group = c.benchmark_group("datetimeparts_cuda");
55+
let mut group = c.benchmark_group("cuda/datetimeparts");
5656

57-
for (len, len_str) in [(10_000_000usize, "10M"), (100_000_000usize, "100M")] {
57+
for &(len, len_str) in bench_config::BENCH_SIZES {
5858
group.throughput(Throughput::Bytes((len * size_of::<i64>()) as u64));
5959

6060
let (time_unit, unit_str) = (TimeUnit::Milliseconds, "ms");
6161
let dtp_array = make_datetimeparts_array(len, time_unit);
6262

6363
group.bench_with_input(
64-
BenchmarkId::new("datetimeparts", format!("{len_str}_{unit_str}")),
64+
BenchmarkId::new(unit_str, len_str),
6565
&dtp_array,
6666
|b, dtp_array| {
6767
b.iter_custom(|iters| {

vortex-cuda/benches/dict_cuda.rs

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,9 @@ use vortex_cuda::executor::CudaArrayExt;
3434
use vortex_cuda_macros::cuda_available;
3535
use vortex_cuda_macros::cuda_not_available;
3636

37+
use crate::bench_config::BENCH_SIZES;
3738
use crate::timed_launch_strategy::TimedLaunchStrategy;
3839

39-
const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")];
40-
4140
/// Configuration for a dictionary benchmark specifying value and code types along with dictionary size.
4241
struct DictBenchConfig {
4342
dict_size: usize,
@@ -75,21 +74,21 @@ where
7574
C: NativePType + DeviceRepr + TryFrom<usize>,
7675
<C as TryFrom<usize>>::Error: Debug,
7776
{
78-
let mut group = c.benchmark_group("dict_cuda");
77+
let mut group = c.benchmark_group("cuda/dict");
7978

80-
for (len, len_str) in BENCH_ARGS {
79+
for (len, len_str) in BENCH_SIZES {
8180
// Throughput is based on output size (values read from dictionary)
8281
group.throughput(Throughput::Bytes((len * size_of::<V>()) as u64));
8382

8483
let dict_array = make_dict_array_typed::<V, C>(*len, config.dict_size);
8584

8685
group.bench_with_input(
8786
BenchmarkId::new(
88-
"dict",
8987
format!(
90-
"{len_str}_{}_values_{}_codes",
88+
"{}_values_{}_codes",
9189
config.value_type_name, config.code_type_name
9290
),
91+
len_str,
9392
),
9493
&dict_array,
9594
|b, dict_array| {

0 commit comments

Comments (0)