Skip to content

Commit 67fa80b

Browse files
committed
Fix: alias timestamp expressions in step CTEs, parenthesize step predicates
1 parent 7f56b48 commit 67fa80b

2 files changed

Lines changed: 139 additions & 7 deletions

File tree

sidemantic/sql/generator.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2537,11 +2537,14 @@ def _generate_multistep_conversion_query(
25372537

25382538
dim_aliases = [alias for alias, _ in dim_entries]
25392539

2540-
# Build from_clause variant for non-first steps (aliased as 's' for joining)
2540+
# Build from_clause variant for non-first steps (aliased as 's' for joining).
2541+
# Project the timestamp expression as __ts so later CTEs can reference
2542+
# s.__ts regardless of whether ts_sql is a simple column or an expression
2543+
# like CAST(created_at AS DATE).
25412544
if model.sql:
2542-
from_clause_s = f"({model.sql}) AS s"
2545+
from_clause_s = f"(SELECT *, {ts_sql} AS __ts FROM ({model.sql}) AS _src) AS s"
25432546
else:
2544-
from_clause_s = f"{model.table} s"
2547+
from_clause_s = f"(SELECT *, {ts_sql} AS __ts FROM {model.table}) AS s"
25452548

25462549
# Build sequential CTE chain: each step derives its timestamp from the
25472550
# prior step's completion, ensuring correct chronological ordering.
@@ -2564,15 +2567,15 @@ def _generate_multistep_conversion_query(
25642567
f" SELECT\n"
25652568
f" {select_str}\n"
25662569
f" FROM {from_clause}\n"
2567-
f" WHERE {step_expr}{filter_clause}\n"
2570+
f" WHERE ({step_expr}){filter_clause}\n"
25682571
f" GROUP BY\n"
25692572
f" {group_str}\n"
25702573
f")"
25712574
)
25722575
else:
25732576
# Step N: join source to step N-1, only consider events at or after prior step
25742577
prev = f"step_{i - 1}"
2575-
select_parts = [f"s.{metric.entity} AS entity", f"MIN(s.{ts_sql}) AS step_{i}_ts"]
2578+
select_parts = [f"s.{metric.entity} AS entity", f"MIN(s.__ts) AS step_{i}_ts"]
25762579
for alias, _sql_col in dim_entries:
25772580
select_parts.append(f"{prev}.{alias}")
25782581
select_str = ",\n ".join(select_parts)
@@ -2586,8 +2589,8 @@ def _generate_multistep_conversion_query(
25862589
f" {select_str}\n"
25872590
f" FROM {from_clause_s}\n"
25882591
f" JOIN {prev} ON s.{metric.entity} = {prev}.entity\n"
2589-
f" AND s.{ts_sql} >= {prev}.step_{i - 1}_ts\n"
2590-
f" WHERE {step_expr}{filter_clause}\n"
2592+
f" AND s.__ts >= {prev}.step_{i - 1}_ts\n"
2593+
f" WHERE ({step_expr}){filter_clause}\n"
25912594
f" GROUP BY\n"
25922595
f" {group_str}\n"
25932596
f")"

tests/metrics/test_advanced.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,3 +1134,132 @@ def test_multistep_funnel_only_prior_step2_not_counted():
11341134
assert rows[0][0] == 2 # total_entities
11351135
assert rows[0][1] == 2 # step_1_count (both signed up)
11361136
assert rows[0][2] == 1 # step_2_count (only user 2)
1137+
1138+
1139+
def test_multistep_funnel_expression_timestamp():
1140+
"""Test that a time dimension using an expression works in step CTEs.
1141+
1142+
When the time dimension's sql_expr is something like CAST(ts AS DATE)
1143+
rather than a bare column name, the generated SQL must not produce
1144+
invalid fragments like s.CAST(ts AS DATE). The generator should alias
1145+
the expression and reference the alias in later steps.
1146+
"""
1147+
events = Model(
1148+
name="events",
1149+
sql="""
1150+
SELECT 1 AS user_id, 'signup' AS event, '2024-01-01 10:00:00'::TIMESTAMP AS created_at
1151+
UNION ALL SELECT 1, 'purchase', '2024-01-02 14:00:00'::TIMESTAMP
1152+
UNION ALL SELECT 2, 'signup', '2024-01-01 08:00:00'::TIMESTAMP
1153+
UNION ALL SELECT 3, 'signup', '2024-01-01 09:00:00'::TIMESTAMP
1154+
""",
1155+
primary_key="user_id",
1156+
dimensions=[
1157+
Dimension(name="user_id", sql="user_id", type="categorical"),
1158+
Dimension(name="event", sql="event", type="categorical"),
1159+
# Expression-based time dimension (not a bare column)
1160+
Dimension(name="event_date", sql="CAST(created_at AS DATE)", type="time"),
1161+
],
1162+
)
1163+
1164+
funnel = Metric(
1165+
name="expr_ts_funnel",
1166+
type="conversion",
1167+
entity="user_id",
1168+
steps=[
1169+
"event = 'signup'",
1170+
"event = 'purchase'",
1171+
],
1172+
)
1173+
1174+
graph = SemanticGraph()
1175+
graph.add_model(events)
1176+
graph.add_metric(funnel)
1177+
1178+
generator = SQLGenerator(graph)
1179+
sql = generator.generate(metrics=["expr_ts_funnel"], dimensions=[])
1180+
1181+
print("\nExpression timestamp funnel SQL:")
1182+
print(sql)
1183+
1184+
conn = duckdb.connect(":memory:")
1185+
result = conn.execute(sql)
1186+
rows = df_rows(result)
1187+
1188+
# User 1: signup -> purchase (valid path)
1189+
# User 2: signup only
1190+
# User 3: signup only
1191+
assert rows[0][0] == 3 # total_entities
1192+
assert rows[0][1] == 3 # step_1_count
1193+
assert rows[0][2] == 1 # step_2_count (only user 1)
1194+
1195+
1196+
def test_multistep_funnel_or_step_with_filter():
1197+
"""Test that step predicates containing OR are correctly parenthesized.
1198+
1199+
Without parenthesization, WHERE event = 'a' OR event = 'b' AND region = 'US'
1200+
would be parsed as event = 'a' OR (event = 'b' AND region = 'US'), which is
1201+
incorrect. The step predicate should be wrapped: WHERE (event = 'a' OR event = 'b')
1202+
AND region = 'US'.
1203+
"""
1204+
events = Model(
1205+
name="events",
1206+
sql="""
1207+
SELECT 1 AS user_id, 'email_signup' AS event, 'US' AS region, '2024-01-01'::TIMESTAMP AS ts
1208+
UNION ALL SELECT 1, 'purchase', 'US', '2024-01-02'::TIMESTAMP
1209+
UNION ALL SELECT 2, 'social_signup', 'US', '2024-01-01'::TIMESTAMP
1210+
UNION ALL SELECT 2, 'purchase', 'US', '2024-01-02'::TIMESTAMP
1211+
UNION ALL SELECT 3, 'email_signup', 'EU', '2024-01-01'::TIMESTAMP
1212+
UNION ALL SELECT 3, 'purchase', 'EU', '2024-01-02'::TIMESTAMP
1213+
UNION ALL SELECT 4, 'social_signup', 'EU', '2024-01-01'::TIMESTAMP
1214+
""",
1215+
primary_key="user_id",
1216+
dimensions=[
1217+
Dimension(name="user_id", sql="user_id", type="categorical"),
1218+
Dimension(name="event", sql="event", type="categorical"),
1219+
Dimension(name="region", sql="region", type="categorical"),
1220+
Dimension(name="ts", sql="ts", type="time"),
1221+
],
1222+
)
1223+
1224+
funnel = Metric(
1225+
name="or_step_funnel",
1226+
type="conversion",
1227+
entity="user_id",
1228+
steps=[
1229+
"event = 'email_signup' OR event = 'social_signup'",
1230+
"event = 'purchase'",
1231+
],
1232+
)
1233+
1234+
graph = SemanticGraph()
1235+
graph.add_model(events)
1236+
graph.add_metric(funnel)
1237+
1238+
generator = SQLGenerator(graph)
1239+
# Filter to US only
1240+
sql = generator.generate(
1241+
metrics=["or_step_funnel"],
1242+
dimensions=[],
1243+
filters=["region = 'US'"],
1244+
)
1245+
1246+
print("\nOR step with filter funnel SQL:")
1247+
print(sql)
1248+
1249+
conn = duckdb.connect(":memory:")
1250+
result = conn.execute(sql)
1251+
rows = df_rows(result)
1252+
1253+
# With correct parenthesization:
1254+
# Step 1: (event = 'email_signup' OR event = 'social_signup') AND region = 'US'
1255+
# -> users 1 (email_signup, US), 2 (social_signup, US) = 2
1256+
# User 3 is EU, user 4 is EU -> filtered out
1257+
# Step 2: (event = 'purchase') AND region = 'US'
1258+
# -> users 1, 2 (both purchased in US) = 2
1259+
#
1260+
# Without parenthesization (the bug):
1261+
# Step 1: event = 'email_signup' OR (event = 'social_signup' AND region = 'US')
1262+
# -> users 1 (email), 2 (social+US), 3 (email) = 3 (WRONG: user 3 EU leaks in)
1263+
assert rows[0][0] == 2 # total_entities (US signups only)
1264+
assert rows[0][1] == 2 # step_1_count
1265+
assert rows[0][2] == 2 # step_2_count (both US users purchased)

0 commit comments

Comments
 (0)