Skip to content

Commit 46e024c

Browse files
committed
Fix: push window dim filters into model subqueries, preserve preagg grain
Remove window_dim_extra_dims logic that added extra dimensions to preagg subqueries, changing the CTE's aggregation grain. Instead, push window dim filters into each owning model's model_filters dict. The recursive generate() call handles window dim filter placement correctly in its own outer WHERE, preserving the requested dimension grain. Also fix merge conflict markers in test_advanced.py from prior stash.
1 parent 6503eae commit 46e024c

4 files changed

Lines changed: 124 additions & 76 deletions

File tree

sidemantic/sql/generator.py

Lines changed: 9 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1468,28 +1468,6 @@ def _generate_with_preaggregation(
14681468
all_filters, all_model_names
14691469
)
14701470

1471-
# Collect window dimension columns that need to be projected through
1472-
# preagg CTEs so the outer WHERE can reference them.
1473-
window_dim_extra_dims: dict[str, list[str]] = {}
1474-
for model_name, wf_list in window_dim_filters.items():
1475-
for f_expr in wf_list:
1476-
try:
1477-
parsed_f = sqlglot.parse_one(f_expr, dialect=self.dialect)
1478-
for col in parsed_f.find_all(exp.Column):
1479-
clean = (col.table or "").replace("_cte", "")
1480-
if clean in all_model_names:
1481-
model_obj = self.graph.get_model(clean)
1482-
if model_obj:
1483-
dim_obj = model_obj.get_dimension(col.name)
1484-
if dim_obj and dim_obj.window is not None:
1485-
dim_ref = f"{clean}.{col.name}"
1486-
if clean not in window_dim_extra_dims:
1487-
window_dim_extra_dims[clean] = []
1488-
if dim_ref not in window_dim_extra_dims[clean]:
1489-
window_dim_extra_dims[clean].append(dim_ref)
1490-
except Exception:
1491-
pass
1492-
14931471
# Generate a pre-aggregated CTE for each metric model
14941472
preagg_ctes = []
14951473
cte_names = []
@@ -1498,25 +1476,18 @@ def _generate_with_preaggregation(
14981476
cte_name = f"{model_name}_preagg"
14991477
cte_names.append(cte_name)
15001478

1501-
# Only pass filters relevant to this model's sub-query.
1502-
# Window-dim filters are NOT pushed here: they are applied as outer
1503-
# WHERE on the final preagg join so that ALL models' metrics are
1504-
# constrained, not just the model that owns the window dimension.
1505-
model_filters = pushdown_by_model.get(model_name, [])
1506-
1507-
# Add extra window dim columns to the sub-query dimensions so they
1508-
# get projected through the preagg CTE for outer WHERE access.
1509-
sub_dimensions = list(dimensions)
1510-
for extra_dim in window_dim_extra_dims.get(model_name, []):
1511-
if extra_dim not in sub_dimensions:
1512-
sub_dimensions.append(extra_dim)
1479+
# Pass pushdown filters plus any window-dim filters for this model.
1480+
# Window-dim filters are pushed into the model's sub-query (not the
1481+
# outer preagg join) so the recursive generate() handles them in its
1482+
# own outer WHERE, preserving the requested dimension grain.
1483+
model_filters = pushdown_by_model.get(model_name, []) + window_dim_filters.get(model_name, [])
15131484

15141485
# Generate sub-query for this model's metrics at the dimension grain
15151486
# We call generate() recursively but it won't trigger pre-aggregation
15161487
# again because each sub-query has metrics from only one model
15171488
sub_query = self.generate(
15181489
metrics=model_metrics,
1519-
dimensions=sub_dimensions,
1490+
dimensions=dimensions,
15201491
filters=model_filters,
15211492
segments=None, # Already resolved
15221493
order_by=None,
@@ -1597,25 +1568,18 @@ def _generate_with_preaggregation(
15971568

15981569
final_query = f"SELECT\n {select_str}\nFROM {from_str}"
15991570

1600-
# Apply shared filters and window-dim filters on the outer query.
1601-
# Window-dim filters are applied here (not inside model sub-queries) so
1602-
# that ALL models' metrics are constrained by the filter, not just the
1603-
# model that owns the window dimension.
1571+
# Apply shared filters (cross-model or metric-level) on the outer query.
16041572
# Rewrite table references from model/model_cte to model_preagg using
16051573
# sqlglot AST rewriting so we don't accidentally mangle column names
16061574
# that happen to contain a model name as a substring.
1607-
all_outer_filters = list(shared_filters)
1608-
for wf_list in window_dim_filters.values():
1609-
all_outer_filters.extend(wf_list)
1610-
1611-
if all_outer_filters:
1575+
if shared_filters:
16121576
preagg_table_map = {}
16131577
for model_name in metrics_by_model:
16141578
preagg_table_map[model_name] = f"{model_name}_preagg"
16151579
preagg_table_map[f"{model_name}_cte"] = f"{model_name}_preagg"
16161580

16171581
rewritten = []
1618-
for f in all_outer_filters:
1582+
for f in shared_filters:
16191583
try:
16201584
parsed = sqlglot.parse_one(f, dialect=self.dialect)
16211585
for col in parsed.find_all(exp.Column):

tests/metrics/test_advanced.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -567,8 +567,6 @@ def test_wow_percent_change_partitions_by_dimension():
567567
assert rows[2][3] is None
568568
assert rows[3][0] == "B"
569569
assert abs(rows[3][3] - (-50.0)) < 0.1
570-
<<<<<<< Updated upstream
571-
=======
572570

573571

574572
def test_multistep_conversion_funnel():
@@ -1562,4 +1560,3 @@ def test_multistep_funnel_order_by_metric_name():
15621560
# Both execute without error, confirming the metric-named column exists
15631561
for row in rows:
15641562
assert row[0] in ("US", "EU") # region
1565-
>>>>>>> Stashed changes

tests/metrics/test_symmetric_aggs.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,3 +324,92 @@ def test_symmetric_aggregates_with_data():
324324
assert revenues == [100, 200] # Correct totals, not inflated
325325

326326
conn.close()
327+
328+
329+
def test_preagg_grain_preserved_with_filters():
330+
"""Test that preagg subqueries preserve the requested dimension grain when filters are applied.
331+
332+
Filters should not introduce extra dimensions into preagg subqueries. Each
333+
preagg CTE must produce exactly one row per dimension key so the FULL OUTER
334+
JOIN does not inflate results.
335+
"""
336+
conn = duckdb.connect(":memory:")
337+
338+
conn.execute("""
339+
CREATE TABLE raw_orders AS
340+
SELECT * FROM (VALUES
341+
(1, '2024-01-01'::DATE, 100, 'shipped'),
342+
(2, '2024-01-01'::DATE, 200, 'pending'),
343+
(3, '2024-01-02'::DATE, 150, 'shipped')
344+
) AS t(id, order_date, amount, status)
345+
""")
346+
347+
conn.execute("""
348+
CREATE TABLE raw_items AS
349+
SELECT * FROM (VALUES
350+
(1, 1, 5),
351+
(2, 1, 3),
352+
(3, 2, 10),
353+
(4, 3, 7)
354+
) AS t(id, order_id, quantity)
355+
""")
356+
357+
graph = SemanticGraph()
358+
359+
orders = Model(
360+
name="orders",
361+
table="raw_orders",
362+
primary_key="id",
363+
dimensions=[
364+
Dimension(name="order_date", type="time", sql="order_date"),
365+
Dimension(name="status", type="categorical", sql="status"),
366+
],
367+
metrics=[Metric(name="revenue", agg="sum", sql="amount")],
368+
relationships=[
369+
Relationship(name="items", type="one_to_many", sql="id", foreign_key="order_id"),
370+
],
371+
)
372+
373+
items = Model(
374+
name="items",
375+
table="raw_items",
376+
primary_key="id",
377+
dimensions=[],
378+
metrics=[Metric(name="total_qty", agg="sum", sql="quantity")],
379+
relationships=[
380+
Relationship(name="orders", type="many_to_one", foreign_key="order_id", primary_key="id"),
381+
],
382+
)
383+
384+
graph.add_model(orders)
385+
graph.add_model(items)
386+
387+
generator = SQLGenerator(graph)
388+
389+
# Query with a filter on orders.status: preagg should still produce one
390+
# row per order_date, not one row per (order_date, status).
391+
sql = generator.generate(
392+
metrics=["orders.revenue", "items.total_qty"],
393+
dimensions=["orders.order_date"],
394+
filters=["orders.status = 'shipped'"],
395+
)
396+
397+
result = conn.execute(sql)
398+
rows = fetch_rows(result)
399+
400+
# Verify no duplicate dimension keys
401+
dates = [row[0] for row in rows]
402+
assert len(dates) == len(set(dates)), f"Duplicate dimension keys in preagg result: {dates}"
403+
404+
# Verify correct values:
405+
# The orders filter (status='shipped') only constrains orders_preagg.
406+
# items_preagg aggregates ALL items by order_date (joined via orders).
407+
# 2024-01-01: revenue=100 (shipped), total_qty=18 (items for orders 1+2)
408+
# 2024-01-02: revenue=150 (shipped), total_qty=7 (items for order 3)
409+
rows_sorted = sorted(rows, key=lambda r: r[0])
410+
assert rows_sorted[0][1] == 100 # 2024-01-01 revenue (shipped only)
411+
assert rows_sorted[0][2] == 18 # 2024-01-01 total_qty (all items for date)
412+
assert rows_sorted[1][1] == 150 # 2024-01-02 revenue
413+
assert rows_sorted[1][2] == 7 # 2024-01-02 total_qty
414+
415+
conn.close()

tests/optimizations/test_predicate_pushdown.py

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -519,13 +519,13 @@ def test_window_dimension_filter_with_regular_filter(layer):
519519
pytest.main([__file__, "-v"])
520520

521521

522-
def test_window_dim_filter_applied_as_outer_where_in_preagg(layer):
523-
"""Test that window-dim filters are applied as outer WHERE in the preagg path.
522+
def test_window_dim_filter_pushed_into_model_subquery_in_preagg(layer):
523+
"""Test that window-dim filters are pushed into model subqueries in the preagg path.
524524
525525
When metrics come from multiple models (triggering pre-aggregation), a filter
526-
on a window dimension must be applied as an outer WHERE on the final preagg
527-
join so that ALL models' metrics are constrained. The window dim column is
528-
projected through the owning model's preagg CTE to make it available.
526+
on a window dimension is pushed into the owning model's recursive generate()
527+
call. This preserves the preagg grain (no extra dimensions added) while the
528+
recursive generate() handles the window dim filter in its own outer WHERE.
529529
"""
530530
from sidemantic.core.model import Relationship
531531

@@ -588,26 +588,24 @@ def test_window_dim_filter_applied_as_outer_where_in_preagg(layer):
588588
assert "orders_preagg" in sql, "Should use pre-aggregation path"
589589
assert "order_items_preagg" in sql, "Should use pre-aggregation path"
590590

591-
# The window dim column should be projected in the orders preagg CTE
591+
# The window dim filter should be inside the orders preagg CTE (pushed into
592+
# the model subquery), not in the outer WHERE
592593
preagg_start = sql.index("orders_preagg AS (")
593594
preagg_end = sql.index("order_items_preagg AS (")
594595
orders_subquery = sql[preagg_start:preagg_end]
595-
assert "next_status" in orders_subquery, "Window dim column should be projected in orders preagg CTE"
596+
assert "next_status" in orders_subquery, "Window dim filter should be in orders preagg subquery"
596597

597-
# The filter should appear in the outer WHERE referencing the preagg CTE,
598-
# so that BOTH models' metrics are constrained by the filter
598+
# The outer query should NOT have the window dim filter (it's handled inside subquery)
599599
outer_query = sql[sql.rindex("SELECT") :]
600-
assert "orders_preagg.next_status" in outer_query, (
601-
"Window-dim filter should be in outer WHERE referencing preagg CTE"
602-
)
600+
assert "next_status" not in outer_query, "Window dim filter should NOT be in outer WHERE"
603601

604602

605-
def test_multi_model_window_dim_filter_applied_as_outer_where_in_preagg(layer):
606-
"""Test that multi-model window-dim filters are applied as outer WHERE in preagg.
603+
def test_multi_model_window_dim_filter_pushed_into_model_subquery_in_preagg(layer):
604+
"""Test that multi-model window-dim filters are pushed into model subqueries in preagg.
607605
608606
When a filter references columns from multiple models and at least one is a
609-
window dimension, the window dim column is projected through the preagg CTE
610-
and the filter is applied on the outer WHERE so both models are constrained.
607+
window dimension, the filter is pushed into the owning model's subquery so
608+
the recursive generate() handles it correctly without changing the preagg grain.
611609
"""
612610
from sidemantic.core.model import Relationship
613611

@@ -672,24 +670,24 @@ def test_multi_model_window_dim_filter_applied_as_outer_where_in_preagg(layer):
672670
assert "orders_preagg" in sql, "Should use pre-aggregation path"
673671
assert "order_items_preagg" in sql, "Should use pre-aggregation path"
674672

675-
# The window dim column should be projected in the orders preagg CTE
673+
# The window dim filter should be inside the orders preagg CTE
676674
preagg_start = sql.index("orders_preagg AS (")
677675
preagg_end = sql.index("order_items_preagg AS (")
678676
orders_subquery = sql[preagg_start:preagg_end]
679-
assert "next_status" in orders_subquery, "Window dim column should be projected in orders preagg CTE"
677+
assert "next_status" in orders_subquery, "Window dim filter should be in orders preagg subquery"
680678

681-
# The filter should appear in the outer WHERE
679+
# The outer query should NOT have the window dim filter
682680
outer_query = sql[sql.rindex("SELECT") :]
683-
assert "orders_preagg.next_status" in outer_query, "Multi-model window-dim filter should be in outer WHERE"
681+
assert "next_status" not in outer_query, "Window dim filter should NOT be in outer WHERE"
684682

685683

686-
def test_mixed_metric_and_window_dim_filter_applied_as_outer_where_in_preagg(layer):
687-
"""Test that a filter referencing both a metric and a window dim is applied as outer WHERE.
684+
def test_mixed_metric_and_window_dim_filter_pushed_into_model_subquery_in_preagg(layer):
685+
"""Test that a filter referencing both a metric and a window dim is pushed into model subquery.
688686
689687
When a filter like "orders.next_status = 'complete' OR orders.revenue > 100"
690688
references both a window dimension (next_status) and a metric (revenue), the
691-
window dim check takes priority and the filter is applied on the outer preagg
692-
WHERE. The window dim column is projected through the preagg CTE.
689+
window dim check takes priority and the filter is pushed into the owning model's
690+
subquery to preserve the preagg grain.
693691
"""
694692
from sidemantic.core.model import Relationship
695693

@@ -752,12 +750,12 @@ def test_mixed_metric_and_window_dim_filter_applied_as_outer_where_in_preagg(lay
752750
assert "orders_preagg" in sql, "Should use pre-aggregation path"
753751
assert "order_items_preagg" in sql, "Should use pre-aggregation path"
754752

755-
# The window dim column should be projected in the orders preagg CTE
753+
# The window dim filter should be inside the orders preagg CTE
756754
preagg_start = sql.index("orders_preagg AS (")
757755
preagg_end = sql.index("order_items_preagg AS (")
758756
orders_subquery = sql[preagg_start:preagg_end]
759-
assert "next_status" in orders_subquery, "Window dim column should be projected in orders preagg CTE"
757+
assert "next_status" in orders_subquery, "Window dim filter should be in orders preagg subquery"
760758

761-
# The filter should be in the outer WHERE, constraining both models
759+
# The outer query should NOT have the window dim filter
762760
outer_query = sql[sql.rindex("SELECT") :]
763-
assert "orders_preagg.next_status" in outer_query, "Mixed metric/window-dim filter should be in outer WHERE"
761+
assert "next_status" not in outer_query, "Window dim filter should NOT be in outer WHERE"

0 commit comments

Comments
 (0)