Skip to content

Commit 3ea641f

Browse files
Add window function support to dimensions (#118)
* Add window function support to dimensions
* Auto-update JSON schema
* Fix: route window dimension filters to outer query
  Window function dimensions (LEAD, LAG, etc.) are computed in the CTE SELECT, so they have not yet been evaluated at CTE WHERE time. The filter pushdown logic now detects dimensions with window != None and keeps their filters in the outer query. Also ensures columns referenced by outer-query filters are included in CTE SELECT lists.
* Fix: exclude window-dim filters from preagg shared filters
* Fix: exclude window dimensions from pre-aggregation materialization
  Window dimensions (LEAD, LAG, etc.) are incompatible with GROUP BY in rollup materialization SQL. generate_materialization_sql now raises a clear ValueError when a pre-aggregation references a dimension that has a window expression set, for both regular dimensions and time dimensions.
* Fix: route multi-model window filters away from preagg shared WHERE
* Fix: window dim check takes priority over metric check in filter classification
* Fix: apply window dim filters as outer WHERE in preagg path
  Window dimension filters were only pushed into the model sub-query that owned the window dimension, leaving other models' metrics unfiltered. Now the window dim column is projected through the owning model's preagg CTE and the filter is applied on the outer preagg WHERE, constraining all models' results.
* Fix: resolve {model} in filters, add metric-named output column
  _strip_model_prefixes now replaces {model} with the actual model name before sqlglot parsing, so placeholder-based filters no longer leak literal {model} tokens into WHERE clauses. Multistep funnel output now includes the last-step count aliased to the metric name, so ORDER BY metric_name works without runtime errors.
* Fix: push window dim filters into model subqueries, preserve preagg grain
  Remove window_dim_extra_dims logic that added extra dimensions to preagg subqueries, changing the CTE's aggregation grain. Instead, push window dim filters into each owning model's model_filters dict. The recursive generate() call handles window dim filter placement correctly in its own outer WHERE, preserving the requested dimension grain. Also fix merge conflict markers in test_advanced.py from prior stash.

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 7cc14b7 commit 3ea641f

9 files changed

Lines changed: 788 additions & 9 deletions

File tree

sidemantic-schema.json

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,19 @@
169169
"default": null,
170170
"description": "Named format (e.g., 'usd', 'percent', 'decimal_2')",
171171
"title": "Value Format Name"
172+
},
173+
"window": {
174+
"anyOf": [
175+
{
176+
"type": "string"
177+
},
178+
{
179+
"type": "null"
180+
}
181+
],
182+
"default": null,
183+
"description": "Window function expression (e.g., 'LEAD(event) OVER (PARTITION BY person_id ORDER BY timestamp)')",
184+
"title": "Window"
172185
}
173186
},
174187
"required": [
@@ -1933,6 +1946,19 @@
19331946
"default": null,
19341947
"description": "Named format (e.g., 'usd', 'percent', 'decimal_2')",
19351948
"title": "Value Format Name"
1949+
},
1950+
"window": {
1951+
"anyOf": [
1952+
{
1953+
"type": "string"
1954+
},
1955+
{
1956+
"type": "null"
1957+
}
1958+
],
1959+
"default": null,
1960+
"description": "Window function expression (e.g., 'LEAD(event) OVER (PARTITION BY person_id ORDER BY timestamp)')",
1961+
"title": "Window"
19361962
}
19371963
},
19381964
"required": [

sidemantic/adapters/sidemantic.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ def _parse_model(self, model_def: dict) -> Model | None:
265265
value_format_name=dim_def.get("value_format_name"),
266266
parent=dim_def.get("parent"),
267267
metadata=dim_def.get("metadata"),
268+
window=dim_def.get("window"),
268269
)
269270
dimensions.append(dimension)
270271

@@ -541,6 +542,8 @@ def _export_model(self, model: Model) -> dict:
541542
dim_def["value_format_name"] = dim.value_format_name
542543
if dim.parent:
543544
dim_def["parent"] = dim.parent
545+
if dim.window:
546+
dim_def["window"] = dim.window
544547
result["dimensions"].append(dim_def)
545548

546549
# Export metrics (model-level aggregations)

sidemantic/core/dimension.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ class Dimension(BaseModel):
3232
# Arbitrary metadata (ai_context, custom_extensions, etc.)
3333
meta: dict[str, Any] | None = Field(None, description="Arbitrary metadata for extensions")
3434

35+
# Window function expression
36+
window: str | None = Field(
37+
None,
38+
description="Window function expression (e.g., 'LEAD(event) OVER (PARTITION BY person_id ORDER BY timestamp)')",
39+
)
40+
3541
# Visibility
3642
public: bool = Field(True, description="Whether dimension is visible in API/UI")
3743

@@ -68,7 +74,22 @@ def __hash__(self) -> int:
6874

6975
@property
7076
def sql_expr(self) -> str:
71-
"""Get SQL expression, defaulting to name if not specified."""
77+
"""Get the base SQL expression, defaulting to name if not specified.
78+
79+
Always returns the row-level expression (``sql`` or ``name``), never the
80+
window function. Use ``window_sql_expr`` when you need the window
81+
expression for CTE projection.
82+
"""
83+
return self.sql or self.name
84+
85+
@property
86+
def window_sql_expr(self) -> str:
87+
"""Get the window SQL expression if set, otherwise fall back to sql_expr.
88+
89+
Use this in CTE SELECT lists where window functions should be projected.
90+
"""
91+
if self.window:
92+
return self.window
7293
return self.sql or self.name
7394

7495
def with_granularity(self, granularity: str) -> str:

sidemantic/core/pre_aggregation.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,16 +143,28 @@ def generate_materialization_sql(self, model: Any) -> str:
143143
if self.time_dimension and self.granularity:
144144
time_dim = model.get_dimension(self.time_dimension)
145145
if time_dim:
146+
if time_dim.window:
147+
raise ValueError(
148+
f"Cannot use window dimension '{self.time_dimension}' as time_dimension "
149+
f"in pre-aggregation '{self.name}': window functions are incompatible "
150+
f"with GROUP BY in rollup materialization"
151+
)
146152
col_name = f"{self.time_dimension}_{self.granularity}"
147153
select_exprs.append(f"DATE_TRUNC('{self.granularity}', {time_dim.sql_expr}) as {col_name}")
148154
group_by_positions.append(str(pos))
149155
pos += 1
150156

151-
# Add dimensions
157+
# Add dimensions (reject window dimensions - incompatible with GROUP BY)
152158
if self.dimensions:
153159
for dim_name in self.dimensions:
154160
dim = model.get_dimension(dim_name)
155161
if dim:
162+
if dim.window:
163+
raise ValueError(
164+
f"Cannot use window dimension '{dim_name}' in pre-aggregation "
165+
f"'{self.name}': window functions are incompatible with "
166+
f"GROUP BY in rollup materialization"
167+
)
156168
select_exprs.append(f"{dim.sql_expr} as {dim_name}")
157169
group_by_positions.append(str(pos))
158170
pos += 1

sidemantic/sql/generator.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1114,10 +1114,13 @@ def replace_model_placeholder(sql_expr: str) -> str:
11141114
for dimension in model.dimensions:
11151115
if dimension.name in needed_dimensions and dimension.name not in columns_added:
11161116
# For time dimensions with granularity, apply DATE_TRUNC
1117+
# Use window_sql_expr for CTE projection so window functions
1118+
# (LEAD, LAG, etc.) are evaluated here.
1119+
base_expr = dimension.window_sql_expr
11171120
if dimension.type == "time" and dimension.granularity:
1118-
dim_sql = self._date_trunc(dimension.granularity, dimension.sql_expr)
1121+
dim_sql = self._date_trunc(dimension.granularity, base_expr)
11191122
else:
1120-
dim_sql = dimension.sql_expr
1123+
dim_sql = base_expr
11211124
# Replace {model} placeholder with actual table reference
11221125
dim_sql = replace_model_placeholder(dim_sql)
11231126
select_cols.append(f"{dim_sql} AS {self._quote_alias(dimension.name)}")
@@ -1242,7 +1245,7 @@ def collect_measures_from_metric(metric_ref: str, visited: set[str] | None = Non
12421245
continue
12431246
dim = model.get_dimension(col_name)
12441247
if dim:
1245-
dim_sql = replace_model_placeholder(dim.sql_expr)
1248+
dim_sql = replace_model_placeholder(dim.window_sql_expr)
12461249
select_cols.append(f"{dim_sql} AS {self._quote_alias(col_name)}")
12471250
columns_added.add(col_name)
12481251
continue
@@ -1525,10 +1528,10 @@ def _generate_with_preaggregation(
15251528
cte_name = f"{model_name}_preagg"
15261529
cte_names.append(cte_name)
15271530

1528-
# Only pass filters relevant to this model's sub-query.
1529-
# Window-dim filters are included here (not in shared_filters) because
1530-
# preagg CTEs only project query dimensions/metrics, not window dims.
1531-
# The recursive generate() call handles them correctly in its outer WHERE.
1531+
# Pass pushdown filters plus any window-dim filters for this model.
1532+
# Window-dim filters are pushed into the model's sub-query (not the
1533+
# outer preagg join) so the recursive generate() handles them in its
1534+
# own outer WHERE, preserving the requested dimension grain.
15321535
model_filters = pushdown_by_model.get(model_name, []) + window_dim_filters.get(model_name, [])
15331536

15341537
# Generate sub-query for this model's metrics at the dimension grain
@@ -1897,7 +1900,24 @@ def _build_main_select(
18971900
if references_metric:
18981901
break
18991902

1903+
# Check if filter also references a window dimension.
1904+
# Window dims are projected as regular columns in the CTE, so
1905+
# they belong in WHERE, not HAVING (they aren't aggregated).
1906+
references_window_dim = False
19001907
if references_metric:
1908+
for model_name in [base_model_name] + other_models:
1909+
model_obj = self.graph.get_model(model_name)
1910+
if not model_obj:
1911+
continue
1912+
for field_name in re.findall(f"{model_name}\\.([a-zA-Z_][a-zA-Z0-9_]*)", filter_expr):
1913+
dim = model_obj.get_dimension(field_name)
1914+
if dim and getattr(dim, "window", None) is not None:
1915+
references_window_dim = True
1916+
break
1917+
if references_window_dim:
1918+
break
1919+
1920+
if references_metric and not references_window_dim:
19011921
having_filters.append(filter_expr)
19021922
else:
19031923
where_filters.append(filter_expr)

tests/adapters/sidemantic_adapter/test_parsing.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,5 +145,147 @@ def test_adapter_validation():
145145
assert len(errors) == 0
146146

147147

148+
def test_dimension_window_field(tmp_path):
149+
"""Test that window dimensions are parsed: sql_expr stays the base expression, window_sql_expr returns the window function."""
150+
adapter = SidemanticAdapter()
151+
yaml_path = tmp_path / "events.yml"
152+
yaml_path.write_text(
153+
"""
154+
models:
155+
- name: events
156+
table: public.events
157+
primary_key: event_id
158+
dimensions:
159+
- name: event
160+
type: categorical
161+
162+
- name: next_event
163+
type: categorical
164+
sql: event
165+
window: "LEAD(event) OVER (PARTITION BY person_id ORDER BY timestamp)"
166+
description: The next event after this one for the same person
167+
168+
- name: next_timestamp
169+
type: time
170+
sql: timestamp
171+
window: "LEAD(timestamp) OVER (PARTITION BY person_id ORDER BY timestamp)"
172+
description: Timestamp of the next event
173+
174+
- name: plain_dim
175+
type: categorical
176+
sql: status
177+
178+
metrics:
179+
- name: event_count
180+
agg: count
181+
"""
182+
)
183+
184+
graph = adapter.parse(yaml_path)
185+
model = graph.models["events"]
186+
187+
# Dimension without window: sql_expr returns sql or name
188+
event_dim = model.get_dimension("event")
189+
assert event_dim.window is None
190+
assert event_dim.sql_expr == "event"
191+
192+
plain_dim = model.get_dimension("plain_dim")
193+
assert plain_dim.window is None
194+
assert plain_dim.sql_expr == "status"
195+
196+
# Dimension with window: sql_expr returns the base expression,
197+
# window_sql_expr returns the window function
198+
next_event = model.get_dimension("next_event")
199+
assert next_event.window == "LEAD(event) OVER (PARTITION BY person_id ORDER BY timestamp)"
200+
assert next_event.sql == "event"
201+
assert next_event.sql_expr == "event"
202+
assert next_event.window_sql_expr == "LEAD(event) OVER (PARTITION BY person_id ORDER BY timestamp)"
203+
204+
next_ts = model.get_dimension("next_timestamp")
205+
assert next_ts.window == "LEAD(timestamp) OVER (PARTITION BY person_id ORDER BY timestamp)"
206+
assert next_ts.sql == "timestamp"
207+
assert next_ts.sql_expr == "timestamp"
208+
assert next_ts.window_sql_expr == next_ts.window
209+
210+
211+
def test_dimension_window_roundtrip(tmp_path):
212+
"""Test that window dimensions survive YAML export/import roundtrip."""
213+
adapter = SidemanticAdapter()
214+
yaml_path = tmp_path / "events.yml"
215+
yaml_path.write_text(
216+
"""
217+
models:
218+
- name: events
219+
table: public.events
220+
primary_key: event_id
221+
dimensions:
222+
- name: next_event
223+
type: categorical
224+
sql: event
225+
window: "LEAD(event) OVER (PARTITION BY person_id ORDER BY timestamp)"
226+
227+
metrics:
228+
- name: event_count
229+
agg: count
230+
"""
231+
)
232+
233+
graph = adapter.parse(yaml_path)
234+
235+
# Export
236+
export_path = tmp_path / "exported.yml"
237+
adapter.export(graph, export_path)
238+
239+
# Re-import
240+
graph2 = adapter.parse(export_path)
241+
model2 = graph2.models["events"]
242+
dim = model2.get_dimension("next_event")
243+
assert dim.window == "LEAD(event) OVER (PARTITION BY person_id ORDER BY timestamp)"
244+
assert dim.sql == "event"
245+
assert dim.window_sql_expr == dim.window
246+
assert dim.sql_expr == "event"
247+
248+
249+
def test_dimension_window_in_sql_generation():
250+
"""Test that window dimensions produce correct SQL in generated queries."""
251+
from sidemantic.core.dimension import Dimension
252+
from sidemantic.core.metric import Metric
253+
from sidemantic.core.model import Model
254+
from sidemantic.core.semantic_graph import SemanticGraph
255+
from sidemantic.sql.generator import SQLGenerator
256+
257+
graph = SemanticGraph()
258+
model = Model(
259+
name="events",
260+
table="public.events",
261+
primary_key="event_id",
262+
dimensions=[
263+
Dimension(
264+
name="event",
265+
type="categorical",
266+
),
267+
Dimension(
268+
name="next_event",
269+
type="categorical",
270+
sql="event",
271+
window="LEAD(event) OVER (PARTITION BY person_id ORDER BY timestamp)",
272+
),
273+
],
274+
metrics=[
275+
Metric(name="event_count", agg="count"),
276+
],
277+
)
278+
graph.add_model(model)
279+
280+
gen = SQLGenerator(graph, dialect="duckdb")
281+
sql = gen.generate(
282+
metrics=["events.event_count"],
283+
dimensions=["events.next_event"],
284+
)
285+
286+
# The window expression should appear in the generated SQL
287+
assert "LEAD(event) OVER (PARTITION BY person_id ORDER BY timestamp)" in sql
288+
289+
148290
if __name__ == "__main__":
149291
pytest.main([__file__, "-v"])

0 commit comments

Comments
 (0)