Skip to content

Commit a001ba8

Browse files
brainhartBrian Hart
andauthored
Fix dtype mismatch in FileStatsLayoutReader for stat scalars (#7639)
## Summary Use the stat's own dtype (e.g. u64 for NullCount) rather than the field dtype when constructing stat scalars in stats_ref. This fixes IS NULL pruning on nullable timestamp columns which previously failed with a dtype mismatch. ## Testing Added regression test Signed-off-by: Brian Hart <brian@brainhart.dev> Co-authored-by: Brian Hart <brian@brainhart.dev>
1 parent a1d9493 commit a001ba8

1 file changed

Lines changed: 64 additions & 1 deletion

File tree

vortex-file/src/v2/file_stats_reader.rs

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,10 @@ impl StatsCatalog for FileStatsLayoutReader {
126126

127127
let stat_value = field_stats.get(stat)?.as_exact()?;
128128
let field_dtype = self.struct_fields.field_by_index(field_idx)?;
129-
let stat_scalar = Scalar::try_new(field_dtype, Some(stat_value)).ok()?;
129+
// Use the stat's own dtype rather than the field dtype. For example,
130+
// NullCount is always u64 regardless of the column type.
131+
let stat_dtype = stat.dtype(&field_dtype)?;
132+
let stat_scalar = Scalar::try_new(stat_dtype, Some(stat_value)).ok()?;
130133

131134
Some(lit(stat_scalar))
132135
}
@@ -209,16 +212,20 @@ mod tests {
209212

210213
use vortex_array::ArrayContext;
211214
use vortex_array::IntoArray as _;
215+
use vortex_array::arrays::PrimitiveArray;
212216
use vortex_array::arrays::StructArray;
217+
use vortex_array::arrays::datetime::TemporalData;
213218
use vortex_array::dtype::DType;
214219
use vortex_array::dtype::Nullability;
215220
use vortex_array::dtype::PType;
216221
use vortex_array::expr::get_item;
217222
use vortex_array::expr::gt;
223+
use vortex_array::expr::is_null;
218224
use vortex_array::expr::lit;
219225
use vortex_array::expr::root;
220226
use vortex_array::expr::stats::Precision;
221227
use vortex_array::expr::stats::Stat;
228+
use vortex_array::extension::datetime::TimeUnit;
222229
use vortex_array::scalar::ScalarValue;
223230
use vortex_array::scalar_fn::session::ScalarFnSession;
224231
use vortex_array::session::ArraySession;
@@ -232,6 +239,7 @@ mod tests {
232239
use vortex_layout::LayoutStrategy;
233240
use vortex_layout::layouts::flat::writer::FlatLayoutStrategy;
234241
use vortex_layout::layouts::table::TableStrategy;
242+
use vortex_layout::segments::SegmentSink;
235243
use vortex_layout::segments::TestSegments;
236244
use vortex_layout::sequence::SequenceId;
237245
use vortex_layout::sequence::SequentialArrayStreamExt;
@@ -337,4 +345,59 @@ mod tests {
337345
Ok(())
338346
})
339347
}
348+
349+
/// Regression test: `IS NULL` on a nullable timestamp column must not fail with a
350+
/// dtype mismatch. The bug was that `stats_ref` used the *field* dtype (timestamp)
351+
/// for the `NullCount` stat scalar instead of the stat's own dtype (u64).
352+
#[test]
353+
fn is_null_pruning_on_nullable_timestamp_column() -> VortexResult<()> {
354+
block_on(|handle| async {
355+
let session = SESSION.clone().with_handle(handle);
356+
let ctx = ArrayContext::empty();
357+
let segments = Arc::new(TestSegments::default());
358+
let (ptr, eof) = SequenceId::root().split();
359+
360+
// Build a struct with a nullable timestamp column containing some nulls.
361+
let prim_array =
362+
PrimitiveArray::from_option_iter([Some(1_000_000i64), None, Some(3_000_000)])
363+
.into_array();
364+
let ts_data = TemporalData::new_timestamp(prim_array, TimeUnit::Microseconds, None);
365+
let ts_dtype = ts_data.dtype().clone();
366+
let ts_array = ts_data.into_array();
367+
368+
let struct_array = StructArray::from_fields([("deleted_at", ts_array)].as_slice())?;
369+
370+
let strategy = TableStrategy::new(
371+
Arc::new(FlatLayoutStrategy::default()),
372+
Arc::new(FlatLayoutStrategy::default()),
373+
);
374+
let layout = strategy
375+
.write_stream(
376+
ctx,
377+
Arc::clone(&segments) as Arc<dyn SegmentSink>,
378+
struct_array.into_array().to_array_stream().sequenced(ptr),
379+
eof,
380+
&session,
381+
)
382+
.await?;
383+
384+
let child = layout.new_reader("".into(), segments, &SESSION)?;
385+
386+
// File-level stats: 1 null in deleted_at.
387+
let mut stats = StatsSet::default();
388+
stats.set(Stat::NullCount, Precision::exact(ScalarValue::from(1u64)));
389+
let file_stats = FileStatistics::new(Arc::from([stats]), Arc::from([ts_dtype]));
390+
391+
let reader = FileStatsLayoutReader::new(child, file_stats, SESSION.clone());
392+
393+
// `is_null(deleted_at)` — should NOT panic or error due to dtype mismatch.
394+
let expr = is_null(get_item("deleted_at", root()));
395+
let mask = Mask::new_true(3);
396+
let result = reader.pruning_evaluation(&(0..3), &expr, mask)?.await?;
397+
// null_count is 1 (non-zero), so is_null is not falsified => not pruned.
398+
assert_eq!(result, Mask::new_true(3));
399+
400+
Ok(())
401+
})
402+
}
340403
}

0 commit comments

Comments
 (0)