@@ -67,25 +67,21 @@ use crate::duckdb::DataChunkRef;
6767use crate :: duckdb:: DuckdbStringMapRef ;
6868use crate :: duckdb:: ExpressionRef ;
6969use crate :: duckdb:: LogicalType ;
70+ use crate :: duckdb:: PartitionData ;
7071use crate :: duckdb:: TableFilterSetRef ;
7172use crate :: duckdb:: TableFunction ;
7273use crate :: duckdb:: TableInitInput ;
74+ use crate :: duckdb:: Value ;
7375use crate :: exporter:: ArrayExporter ;
7476use crate :: exporter:: ConversionCache ;
7577
76- /// Taken from
77- /// https://github.com/duckdb/duckdb/blob/dc11eadd8f0a7c600f0034810706605ebe10d5b9/src/include/duckdb/common/constants.hpp#L44
78- ///
79- /// If DuckDB requests a zero-column projection from read_vortex like count(*),
80- /// its planner tries to get any column:
81- /// https://github.com/duckdb/duckdb/blob/dc11eadd8f0a7c600f0034810706605ebe10d5b9/src/planner/operator/logical_get.cpp#L149
82- ///
83- /// If you define COLUMN_IDENTIFIER_EMPTY, planner takes it, otherwise the
84- /// first column. As we don't want to fill the output chunk and we can leave
85- /// it uninitialized in this case, we define COLUMN_IDENTIFIER_EMPTY as a
86- /// virtual column.
87- /// See virtual_columns in vortex-duckdb/cpp/table_function.cpp
88- static EMPTY_COLUMN_IDX : u64 = 18446744073709551614 ;
78+ /// File index virtual column, may be requested either by user or optimizer.
79+ static FILE_INDEX_COLUMN_IDX : u64 = 9223372036854775810 ;
80+
81+ /// See duckdb/src/common/constants.cpp
82+ fn is_virtual_column ( id : u64 ) -> bool {
83+ id >= 9223372036854775808u64
84+ }
8985
9086/// A trait for table functions that resolve to a [`DataSourceRef`].
9187///
@@ -149,14 +145,15 @@ pub struct DataSourceGlobal {
149145 batch_id : AtomicU64 ,
150146 bytes_total : Arc < AtomicU64 > ,
151147 bytes_read : AtomicU64 ,
148+ file_index_column_pos : Option < usize > ,
152149}
153150
154151/// Per-thread local scan state.
155152pub struct DataSourceLocal {
156153 iterator : DataSourceIterator ,
157154 exporter : Option < ArrayExporter > ,
158- /// The unique batch id of the last chunk exported via scan().
159- batch_id : Option < u64 > ,
155+ batch_id : u64 ,
156+ file_idx : usize ,
160157}
161158
162159/// Returns scan progress as a percentage (0.0–100.0).
@@ -281,7 +278,7 @@ impl<T: DataSourceTableFunction> TableFunction for T {
281278 let column_ids = init_input. column_ids ( ) ;
282279 let projection_ids = init_input. projection_ids ( ) ;
283280
284- let projection_expr =
281+ let ( projection_expr, file_idx_pos ) =
285282 extract_projection_expr ( projection_ids, column_ids, & bind_data. column_fields ) ;
286283 let filter_expr = extract_table_filter_expr (
287284 init_input. table_filter_set ( ) ,
@@ -317,10 +314,14 @@ impl<T: DataSourceTableFunction> TableFunction for T {
317314 // first available array chunk.
318315 let stream = scan
319316 . partitions ( )
320- . map ( move |partition| {
317+ . enumerate ( )
318+ . map ( move |( file_idx, partition) | {
321319 // We create a new conversion cache scoped to the partition, since there's no point
322320 // caching anything across partitions.
323- let cache = Arc :: new ( ConversionCache :: default ( ) ) ;
321+ let cache = Arc :: new ( ConversionCache {
322+ file_idx,
323+ ..Default :: default ( )
324+ } ) ;
324325 let tx = tx. clone ( ) ;
325326
326327 RUNTIME . handle ( ) . spawn ( async move {
@@ -356,6 +357,7 @@ impl<T: DataSourceTableFunction> TableFunction for T {
356357 batch_id : AtomicU64 :: new ( 0 ) ,
357358 bytes_total : Arc :: new ( AtomicU64 :: new ( 0 ) ) ,
358359 bytes_read : AtomicU64 :: new ( 0 ) ,
360+ file_index_column_pos : file_idx_pos,
359361 } )
360362 }
361363
@@ -381,7 +383,8 @@ impl<T: DataSourceTableFunction> TableFunction for T {
381383 Ok ( DataSourceLocal {
382384 iterator : global. iterator . clone ( ) ,
383385 exporter : None ,
384- batch_id : None ,
386+ batch_id : 0 ,
387+ file_idx : 0 ,
385388 } )
386389 }
387390
@@ -399,6 +402,7 @@ impl<T: DataSourceTableFunction> TableFunction for T {
399402 return Ok ( ( ) ) ;
400403 } ;
401404 let ( array_result, conversion_cache) = result?;
405+ local_state. file_idx = conversion_cache. file_idx ;
402406 let array_result = array_result. optimize_recursive ( ctx. session ( ) ) ?;
403407
404408 let array_result: StructArray = if let Some ( array) = array_result. as_opt :: < Struct > ( )
@@ -423,30 +427,36 @@ impl<T: DataSourceTableFunction> TableFunction for T {
423427 ctx,
424428 ) ?) ;
425429 // Relaxed since there is no intra-instruction ordering required.
426- local_state. batch_id = Some ( global_state. batch_id . fetch_add ( 1 , Ordering :: Relaxed ) ) ;
430+ local_state. batch_id = global_state. batch_id . fetch_add ( 1 , Ordering :: Relaxed ) ;
427431 }
428432
429433 let exporter = local_state
430434 . exporter
431435 . as_mut ( )
432436 . vortex_expect ( "error: exporter missing" ) ;
437+ let has_more_data = exporter. export ( chunk, global_state. file_index_column_pos ) ?;
433438
434- let has_more_data = exporter. export ( chunk) ?;
435439 global_state
436440 . bytes_read
437441 . fetch_add ( chunk. len ( ) , Ordering :: Relaxed ) ;
438442
439443 if !has_more_data {
440444 // This exporter is fully consumed.
441445 local_state. exporter = None ;
442- local_state. batch_id = None ;
446+ local_state. batch_id = 0 ;
443447 } else {
444448 break ;
445449 }
446450 }
447451
448452 assert ! ( !chunk. is_empty( ) ) ;
449453
454+ if let Some ( pos) = global_state. file_index_column_pos {
455+ chunk
456+ . get_vector_mut ( pos)
457+ . reference_value ( & Value :: from ( local_state. file_idx as u64 ) ) ;
458+ }
459+
450460 Ok ( ( ) )
451461 }
452462
@@ -518,10 +528,11 @@ impl<T: DataSourceTableFunction> TableFunction for T {
518528 _bind_data : & Self :: BindData ,
519529 _global_init_data : & Self :: GlobalState ,
520530 local_init_data : & mut Self :: LocalState ,
521- ) -> VortexResult < u64 > {
522- local_init_data
523- . batch_id
524- . ok_or_else ( || vortex_err ! ( "batch id missing, no batches exported" ) )
531+ ) -> PartitionData {
532+ PartitionData {
533+ batch_index : local_init_data. batch_id ,
534+ file_index : local_init_data. file_idx ,
535+ }
525536 }
526537
527538 fn to_string ( bind_data : & Self :: BindData , map : & mut DuckdbStringMapRef ) {
@@ -557,39 +568,46 @@ fn extract_schema_from_dtype(dtype: &DType) -> VortexResult<Vec<DuckdbField>> {
557568 Ok ( fields)
558569}
559570
560- /// Creates a projection expression from raw projection/column ID slices and column names.
571+ /// Creates a projection expression from raw projection/column ID slices and
572+ /// column names.
573+ /// If FILE_INDEX_COLUMN_IDX is present, returns its position as second
574+ /// parameter
561575fn extract_projection_expr (
562576 projection_ids : Option < & [ u64 ] > ,
563577 column_ids : & [ u64 ] ,
564578 column_fields : & [ DuckdbField ] ,
565- ) -> Expression {
566- // Projection ids may be empty, in which case you need to use projection_ids
579+ ) -> ( Expression , Option < usize > ) {
580+ // If projection ids are empty, use column_ids.
567581 // See duckdb/src/planner/operator/logical_get.cpp#L168
568- let ( projection_ids , has_projection_ids) = match projection_ids {
582+ let ( ids , has_projection_ids) = match projection_ids {
569583 Some ( ids) => ( ids, true ) ,
570584 None => ( column_ids, false ) ,
571585 } ;
572586
573- // duckdb index is u64 (size_t) but in Rust u64 and usize are different things.
587+ let mut file_idx_pos = None ;
588+
574589 #[ expect( clippy:: cast_possible_truncation) ]
575- let names = projection_ids
590+ let names = ids
576591 . iter ( )
577- . filter ( |p| * * p != EMPTY_COLUMN_IDX )
578- . map ( |mut idx| {
579- if has_projection_ids {
580- idx = & column_ids[ * idx as usize ] ;
592+ . enumerate ( )
593+ . map ( |( column_pos, & column_id) | {
594+ let column_id = if has_projection_ids {
595+ column_ids[ column_id as usize ]
596+ } else {
597+ column_id
598+ } ;
599+
600+ if column_id == FILE_INDEX_COLUMN_IDX {
601+ file_idx_pos = Some ( column_pos) ;
581602 }
582603
583- #[ expect( clippy:: cast_possible_truncation) ]
584- & column_fields
585- . get ( * idx as usize )
586- . vortex_expect ( "prune idx in column names" )
587- . name
604+ column_id
588605 } )
589- . map ( |s| Arc :: from ( s. as_str ( ) ) )
606+ . filter ( |& col_id| !is_virtual_column ( col_id) )
607+ . map ( |col_id| Arc :: from ( column_fields[ col_id as usize ] . name . as_str ( ) ) )
590608 . collect :: < FieldNames > ( ) ;
591609
592- select ( names, root ( ) )
610+ ( select ( names, root ( ) ) , file_idx_pos )
593611}
594612
595613/// Creates a table filter expression from the table filter set, column metadata, additional
@@ -604,6 +622,10 @@ fn extract_table_filter_expr(
604622 let mut table_filter_exprs: HashSet < Expression > = if let Some ( filter) = table_filter_set {
605623 filter
606624 . into_iter ( )
625+ . filter ( |( idx, _) | {
626+ let idx_u: usize = idx. as_ ( ) ;
627+ !is_virtual_column ( column_ids[ idx_u] . as_ ( ) )
628+ } )
607629 . map ( |( idx, ex) | {
608630 let idx_u: usize = idx. as_ ( ) ;
609631 let col_idx: usize = column_ids[ idx_u] . as_ ( ) ;
0 commit comments