Skip to content

Commit 90f46e6

Browse files
committed
initial
Signed-off-by: Mikhail Kot <to@myrrc.dev>
1 parent c73dbb2 commit 90f46e6

4 files changed

Lines changed: 162 additions & 67 deletions

File tree

vortex-duckdb/cpp/table_function.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,19 @@ void c_function(ClientContext &context, TableFunctionInput &input, DataChunk &ou
285285
}
286286
}
287287

288+
/*
289+
* Table filter pushdown is used twice in duckdb:
290+
*
291+
* 1. Planning time: duckdb uses file metadata (filename, hive_partitioning
292+
* options in MultiFileReader) to prune files based on filename or hive
293+
* partition data i.e. month, year, etc. This happens before any file IO.
294+
* We don't use this because we have our own file-level pruning in
295+
* FileStatsLayoutReader.
296+
*
297+
* 2. Scan time: as we have filter_pushdown = true, filter expressions are
298+
* converted to TableFilterSet and pushed down to Vortex. We convert them to
299+
* vortex expressions and use them as filter options while initializing the scan.
300+
*/
288301
void c_pushdown_complex_filter(ClientContext &,
289302
LogicalGet &,
290303
FunctionData *bind_data,
@@ -300,8 +313,6 @@ void c_pushdown_complex_filter(ClientContext &,
300313
if (error_out) {
301314
throw BinderException(IntoErrString(error_out));
302315
}
303-
304-
// If the pushdown complex filter returns true, we can remove the filter from the list.
305316
iter = pushed ? filters.erase(iter) : std::next(iter);
306317
}
307318
}
@@ -440,6 +451,10 @@ extern "C" duckdb_state duckdb_vx_tfunc_register(duckdb_database ffi_db, const d
440451
};
441452
};
442453

454+
tf.pushdown_expression = [](auto &, auto &, auto &) {
455+
return true;
456+
};
457+
443458
tf.arguments.resize(vtab->parameter_count);
444459
for (size_t i = 0; i < vtab->parameter_count; i++) {
445460
tf.arguments[i] = *reinterpret_cast<LogicalType *>(vtab->parameters[i]);

vortex-duckdb/src/convert/expr.rs

Lines changed: 136 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,12 @@ use vortex::error::VortexError;
99
use vortex::error::VortexExpect;
1010
use vortex::error::VortexResult;
1111
use vortex::error::vortex_bail;
12+
use vortex::error::vortex_ensure;
1213
use vortex::error::vortex_err;
1314
use vortex::expr::Expression;
1415
use vortex::expr::and_collect;
1516
use vortex::expr::col;
17+
use vortex::expr::get_item;
1618
use vortex::expr::is_not_null;
1719
use vortex::expr::is_null;
1820
use vortex::expr::list_contains;
@@ -32,48 +34,120 @@ use vortex::scalar_fn::fns::operators::Operator;
3234

3335
use crate::cpp::DUCKDB_VX_EXPR_TYPE;
3436
use crate::duckdb;
37+
use crate::duckdb::BoundFunction;
38+
use crate::duckdb::BoundOperator;
3539

36-
const DUCKDB_FUNCTION_NAME_CONTAINS: &str = "contains";
37-
38-
fn like_pattern_str(value: &duckdb::ExpressionRef) -> VortexResult<Option<String>> {
40+
fn from_bound_str(value: &duckdb::ExpressionRef) -> VortexResult<String> {
3941
match value.as_class().vortex_expect("unknown class") {
4042
duckdb::ExpressionClass::BoundConstant(constant) => {
41-
Ok(Some(format!("%{}%", constant.value.as_string().as_str())))
43+
Ok(constant.value.as_string().as_str().to_owned())
4244
}
43-
_ => Ok(None),
45+
_ => vortex_bail!("Expected string expression, got {:?}", value.as_class_id()),
4446
}
4547
}
4648

49+
fn try_from_bound_function(
50+
func: &BoundFunction,
51+
col_sub: Option<&Expression>,
52+
) -> VortexResult<Option<Expression>> {
53+
let expr = match func.scalar_function.name() {
54+
"struct_extract" => {
55+
let children: Vec<_> = func.children().collect();
56+
vortex_ensure!(children.len() == 2);
57+
let Some(child) = try_from_expression_inner(children[0], col_sub)? else {
58+
return Ok(None);
59+
};
60+
let field = from_bound_str(children[1])?;
61+
get_item(field, child)
62+
}
63+
"contains" => {
64+
let children: Vec<_> = func.children().collect();
65+
vortex_ensure!(children.len() == 2);
66+
let Some(value) = try_from_expression_inner(children[0], col_sub)? else {
67+
return Ok(None);
68+
};
69+
let pattern = from_bound_str(children[1])?;
70+
let pattern = lit(format!("%{pattern}%"));
71+
Like.new_expr(LikeOptions::default(), [value, pattern])
72+
}
73+
like @ ("~~" | "!~~") => {
74+
let children: Vec<_> = func.children().collect();
75+
vortex_ensure!(children.len() == 2);
76+
let Some(string) = try_from_expression_inner(children[0], col_sub)? else {
77+
return Ok(None);
78+
};
79+
let Some(target) = try_from_expression_inner(children[1], col_sub)? else {
80+
return Ok(None);
81+
};
82+
let opts = LikeOptions {
83+
negated: like == "!~~",
84+
case_insensitive: false,
85+
};
86+
Like.new_expr(opts, [string, target])
87+
}
88+
_ => {
89+
debug!("bound function {}", func.scalar_function.name());
90+
return Ok(None);
91+
}
92+
};
93+
94+
Ok(Some(expr))
95+
}
96+
4797
pub fn try_from_bound_expression(
4898
value: &duckdb::ExpressionRef,
4999
) -> VortexResult<Option<Expression>> {
100+
try_from_expression_inner(value, None)
101+
}
102+
103+
pub(super) fn try_from_bound_expression_with_col_sub(
104+
value: &duckdb::ExpressionRef,
105+
col_sub: &Expression,
106+
) -> VortexResult<Option<Expression>> {
107+
try_from_expression_inner(value, Some(col_sub))
108+
}
109+
110+
fn try_from_expression_inner(
111+
value: &duckdb::ExpressionRef,
112+
col_sub: Option<&Expression>,
113+
) -> VortexResult<Option<Expression>> {
114+
//println!(
115+
// "from\n\texpression={value}\n\tcol={col_sub:?}\n\tid={:?}",
116+
// value.as_class_id()
117+
//);
50118
let Some(value) = value.as_class() else {
51119
debug!("no expression class id {:?}", value.as_class_id());
52120
return Ok(None);
53121
};
54122
Ok(Some(match value {
123+
duckdb::ExpressionClass::BoundRef => {
124+
let Some(col) = col_sub else {
125+
vortex_bail!("BoundRef requested but no column supplied");
126+
};
127+
col.clone()
128+
}
55129
duckdb::ExpressionClass::BoundColumnRef(col_ref) => col(col_ref.name.as_ref()),
56130
duckdb::ExpressionClass::BoundConstant(const_) => lit(Scalar::try_from(const_.value)?),
57131
duckdb::ExpressionClass::BoundComparison(compare) => {
58132
let operator: Operator = compare.op.try_into()?;
59133

60-
let Some(left) = try_from_bound_expression(compare.left)? else {
134+
let Some(left) = try_from_expression_inner(compare.left, col_sub)? else {
61135
return Ok(None);
62136
};
63-
let Some(right) = try_from_bound_expression(compare.right)? else {
137+
let Some(right) = try_from_expression_inner(compare.right, col_sub)? else {
64138
return Ok(None);
65139
};
66140

67141
Binary.new_expr(operator, [left, right])
68142
}
69143
duckdb::ExpressionClass::BoundBetween(between) => {
70-
let Some(array) = try_from_bound_expression(between.input)? else {
144+
let Some(array) = try_from_expression_inner(between.input, col_sub)? else {
71145
return Ok(None);
72146
};
73-
let Some(lower) = try_from_bound_expression(between.lower)? else {
147+
let Some(lower) = try_from_expression_inner(between.lower, col_sub)? else {
74148
return Ok(None);
75149
};
76-
let Some(upper) = try_from_bound_expression(between.upper)? else {
150+
let Some(upper) = try_from_expression_inner(between.upper, col_sub)? else {
77151
return Ok(None);
78152
};
79153
Between.new_expr(
@@ -98,7 +172,7 @@ pub fn try_from_bound_expression(
98172
| DUCKDB_VX_EXPR_TYPE::DUCKDB_VX_EXPR_TYPE_OPERATOR_IS_NOT_NULL => {
99173
let children: Vec<_> = operator.children().collect();
100174
assert_eq!(children.len(), 1);
101-
let Some(child) = try_from_bound_expression(children[0])? else {
175+
let Some(child) = try_from_expression_inner(children[0], col_sub)? else {
102176
return Ok(None);
103177
};
104178
match operator.op {
@@ -111,67 +185,23 @@ pub fn try_from_bound_expression(
111185
}
112186
}
113187
DUCKDB_VX_EXPR_TYPE::DUCKDB_VX_EXPR_TYPE_COMPARE_IN => {
114-
// First child is element, rest form the list.
115-
let children: Vec<_> = operator.children().collect();
116-
assert!(children.len() >= 2);
117-
let Some(element) = try_from_bound_expression(children[0])? else {
118-
return Ok(None);
119-
};
120-
121-
let Some(list_elements) = children
122-
.iter()
123-
.skip(1)
124-
.map(|c| {
125-
let Some(value) = try_from_bound_expression(c)? else {
126-
return Ok(None);
127-
};
128-
Ok(Some(
129-
value
130-
.as_opt::<Literal>()
131-
.ok_or_else(|| {
132-
vortex_err!("cannot have a non literal in a in_list")
133-
})?
134-
.clone(),
135-
))
136-
})
137-
.collect::<VortexResult<Option<Vec<_>>>>()?
138-
else {
139-
return Ok(None);
140-
};
141-
let list = Scalar::list(
142-
Arc::new(list_elements[0].dtype().clone()),
143-
list_elements,
144-
Nullability::Nullable,
145-
);
146-
list_contains(lit(list), element)
147-
}
148-
_ => {
149-
debug!(op=?operator.op, "cannot be pushed down");
150-
return Ok(None);
188+
return try_from_compare_in(operator, col_sub, false);
151189
}
152-
},
153-
duckdb::ExpressionClass::BoundFunction(func) => match func.scalar_function.name() {
154-
DUCKDB_FUNCTION_NAME_CONTAINS => {
155-
let children: Vec<_> = func.children().collect();
156-
assert_eq!(children.len(), 2);
157-
let Some(value) = try_from_bound_expression(children[0])? else {
158-
return Ok(None);
159-
};
160-
let Some(pattern_lit) = like_pattern_str(children[1])? else {
161-
vortex_bail!("expected pattern to be bound string")
162-
};
163-
let pattern = lit(pattern_lit);
164-
Like.new_expr(LikeOptions::default(), [value, pattern])
190+
DUCKDB_VX_EXPR_TYPE::DUCKDB_VX_EXPR_TYPE_COMPARE_NOT_IN => {
191+
return try_from_compare_in(operator, col_sub, true);
165192
}
166193
_ => {
167-
debug!("bound function {}", func.scalar_function.name());
194+
debug!(op=?operator.op, "cannot be pushed down");
168195
return Ok(None);
169196
}
170197
},
198+
duckdb::ExpressionClass::BoundFunction(func) => {
199+
return try_from_bound_function(&func, col_sub);
200+
}
171201
duckdb::ExpressionClass::BoundConjunction(conj) => {
172202
let Some(children) = conj
173203
.children()
174-
.map(try_from_bound_expression)
204+
.map(|c| try_from_expression_inner(c, col_sub))
175205
.collect::<VortexResult<Option<Vec<_>>>>()?
176206
else {
177207
return Ok(None);
@@ -189,6 +219,49 @@ pub fn try_from_bound_expression(
189219
}))
190220
}
191221

222+
fn try_from_compare_in(
223+
operator: BoundOperator,
224+
col_sub: Option<&Expression>,
225+
not_in: bool,
226+
) -> VortexResult<Option<Expression>> {
227+
// First child is element, rest form the list.
228+
let children: Vec<_> = operator.children().collect();
229+
assert!(children.len() >= 2);
230+
let Some(element) = try_from_expression_inner(children[0], col_sub)? else {
231+
//println!("no expression for element={}", children[0]);
232+
return Ok(None);
233+
};
234+
235+
let Some(list_elements) = children
236+
.iter()
237+
.skip(1)
238+
.map(|c| {
239+
let Some(value) = try_from_expression_inner(c, col_sub)? else {
240+
//println!("no expression for child={c}");
241+
return Ok(None);
242+
};
243+
Ok(Some(
244+
value
245+
.as_opt::<Literal>()
246+
.ok_or_else(|| vortex_err!("cannot have a non literal in a in_list"))?
247+
.clone(),
248+
))
249+
})
250+
.collect::<VortexResult<Option<Vec<_>>>>()?
251+
else {
252+
//println!("no list children");
253+
return Ok(None);
254+
};
255+
let list = Scalar::list(
256+
Arc::new(list_elements[0].dtype().clone()),
257+
list_elements,
258+
Nullability::Nullable,
259+
);
260+
261+
let expr = list_contains(lit(list), element);
262+
Ok(Some(if not_in { not(expr) } else { expr }))
263+
}
264+
192265
impl TryFrom<DUCKDB_VX_EXPR_TYPE> for Operator {
193266
type Error = VortexError;
194267

vortex-duckdb/src/convert/table_filter.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,10 @@ pub fn try_from_table_filter(
123123
)
124124
}
125125
TableFilterClass::ExpressionRef(expr) => {
126-
// TODO(ngates): figure out which column ID DuckDB is using for the expression.
127-
vortex_bail!("expression table filter is not supported: {}", expr);
126+
match super::expr::try_from_bound_expression_with_col_sub(expr, col)? {
127+
Some(expression) => expression,
128+
None => return Ok(None),
129+
}
128130
}
129131
TableFilterClass::Bloom => {
130132
vortex_bail!("bloom filter table filter is not supported")

vortex-duckdb/src/duckdb/expr.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,9 @@ impl ExpressionRef {
139139
bind_info: out.bind_info,
140140
})
141141
}
142+
cpp::DUCKDB_VX_EXPR_CLASS::DUCKDB_VX_EXPR_CLASS_BOUND_REF => {
143+
ExpressionClass::BoundRef
144+
}
142145
_ => {
143146
return None;
144147
}
@@ -155,6 +158,8 @@ pub enum ExpressionClass<'a> {
155158
BoundBetween(BoundBetween<'a>),
156159
BoundOperator(BoundOperator<'a>),
157160
BoundFunction(BoundFunction<'a>),
161+
/// Column inside ExpressionFilter for expression pushed down to Vortex.
162+
BoundRef,
158163
}
159164

160165
pub struct BoundColumnRef {

0 commit comments

Comments
 (0)