Skip to content

Commit 1fcbc53

Browse files
committed
Fix: enforce chronological funnel order, normalize qualified filters
1 parent 132443c commit 1fcbc53

2 files changed

Lines changed: 213 additions & 41 deletions

File tree

sidemantic/sql/generator.py

Lines changed: 65 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,32 @@ def _build_interval(self, num: str, unit: str) -> str:
7575
# Standard SQL: INTERVAL '7 days'
7676
return f"INTERVAL '{num} {unit}'"
7777

78+
def _strip_model_prefixes(self, filters: list[str], model_name: str) -> list[str]:
79+
"""Strip model name prefixes from filter column references.
80+
81+
Conversion and retention CTEs query the raw table directly, so qualified
82+
references like ``events.region`` need to become just ``region``.
83+
84+
Args:
85+
filters: List of filter SQL expressions
86+
model_name: Model name whose prefix should be stripped
87+
88+
Returns:
89+
Filters with model prefixes removed
90+
"""
91+
result = []
92+
for f in filters:
93+
try:
94+
parsed = sqlglot.parse_one(f, dialect=self.dialect)
95+
for column in parsed.find_all(exp.Column):
96+
tbl = column.table
97+
if tbl and tbl.replace("_cte", "") == model_name:
98+
column.set("table", None)
99+
result.append(parsed.sql(dialect=self.dialect))
100+
except Exception:
101+
result.append(f)
102+
return result
103+
78104
def _quote_alias(self, name: str) -> str:
79105
"""Quote an identifier for use as a SQL alias.
80106
@@ -2413,7 +2439,9 @@ def _generate_multistep_conversion_query(
24132439
) -> str:
24142440
"""Generate SQL for N-step conversion funnel metrics.
24152441
2416-
Uses per-entity BOOL_OR aggregation for each step, then sums in the outer query.
2442+
Uses timestamp-aware chronological ordering: for each entity, tracks
2443+
the earliest timestamp at which each step was satisfied (via MIN/CASE),
2444+
then gates each step on occurring after all prior steps.
24172445
24182446
Args:
24192447
metric: The Metric object with steps defined
@@ -2454,16 +2482,37 @@ def _generate_multistep_conversion_query(
24542482
if not model:
24552483
raise ValueError(f"No model found for conversion metric {metric_name}")
24562484

2485+
# Find timestamp dimension: prefer model.default_time_dimension, fall back to first time dim
2486+
timestamp_dim = None
2487+
if model.default_time_dimension:
2488+
timestamp_dim = model.get_dimension(model.default_time_dimension)
2489+
if not timestamp_dim:
2490+
for dim in model.dimensions:
2491+
if dim.type == "time":
2492+
timestamp_dim = dim
2493+
break
2494+
2495+
if not timestamp_dim:
2496+
raise ValueError(
2497+
f"Multi-step conversion funnel requires a time dimension on model '{model.name}' "
2498+
"to enforce chronological step ordering"
2499+
)
2500+
2501+
ts_sql = timestamp_dim.sql_expr
2502+
24572503
# Build FROM clause
24582504
if model.sql:
24592505
from_clause = f"({model.sql}) AS t"
24602506
else:
24612507
from_clause = model.table
24622508

2509+
# Normalize filters: strip model name prefixes so they work inside CTEs
2510+
normalized_filters = self._strip_model_prefixes(filters or [], model.name)
2511+
24632512
# Build WHERE filter clause
24642513
filter_clause = ""
2465-
if filters:
2466-
filter_clause = "\n WHERE " + " AND ".join(filters)
2514+
if normalized_filters:
2515+
filter_clause = "\n WHERE " + " AND ".join(normalized_filters)
24672516

24682517
# Resolve dimension columns for GROUP BY support
24692518
dim_entries: list[tuple[str, str]] = []
@@ -2486,10 +2535,12 @@ def _generate_multistep_conversion_query(
24862535

24872536
dim_aliases = [alias for alias, _ in dim_entries]
24882537

2489-
# Build inner CTE: per-entity BOOL_OR for each step
2538+
# Build inner CTE: per-entity MIN timestamp for each step.
2539+
# MIN(CASE WHEN step_expr THEN timestamp END) captures the earliest time
2540+
# the step was satisfied, enabling chronological ordering in the outer query.
24902541
step_cols = []
24912542
for i, step_expr in enumerate(metric.steps, 1):
2492-
step_cols.append(f"BOOL_OR({step_expr}) AS step_{i}")
2543+
step_cols.append(f"MIN(CASE WHEN {step_expr} THEN {ts_sql} END) AS step_{i}_ts")
24932544

24942545
inner_select_parts = [f"{metric.entity} AS entity"]
24952546
if dim_entries:
@@ -2505,16 +2556,20 @@ def _generate_multistep_conversion_query(
25052556
group_by_inner = ",\n ".join(group_by_inner_parts)
25062557

25072558
# Build outer SELECT: total entities (only step 1 entrants) + sum of each step.
2508-
# Gate each step on completion of all prior steps for monotonic funnels.
2559+
# Gate each step on chronological order: step_N_ts must be >= step_(N-1)_ts.
25092560
outer_select_parts = []
25102561
if dim_aliases:
25112562
for alias in dim_aliases:
25122563
outer_select_parts.append(f"{alias}")
2513-
outer_select_parts.append("SUM(step_1::int) AS total_entities")
2564+
outer_select_parts.append("SUM(CASE WHEN step_1_ts IS NOT NULL THEN 1 ELSE 0 END) AS total_entities")
25142565
for i in range(1, len(metric.steps) + 1):
2515-
# Each step requires all prior steps to be true
2516-
gate_expr = " AND ".join(f"step_{j}" for j in range(1, i + 1))
2517-
outer_select_parts.append(f"SUM(({gate_expr})::int) AS step_{i}_count")
2566+
# Each step requires all prior steps to have been completed in chronological order
2567+
conditions = ["step_1_ts IS NOT NULL"]
2568+
for j in range(2, i + 1):
2569+
conditions.append(f"step_{j}_ts IS NOT NULL")
2570+
conditions.append(f"step_{j}_ts >= step_{j - 1}_ts")
2571+
gate_expr = " AND ".join(conditions)
2572+
outer_select_parts.append(f"SUM(CASE WHEN {gate_expr} THEN 1 ELSE 0 END) AS step_{i}_count")
25182573
outer_select = ",\n ".join(outer_select_parts)
25192574

25202575
# Build GROUP BY, ORDER BY, LIMIT for outer query

tests/metrics/test_advanced.py

Lines changed: 148 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -570,25 +570,26 @@ def test_wow_percent_change_partitions_by_dimension():
570570

571571

572572
def test_multistep_conversion_funnel():
573-
"""Test N-step conversion funnel metric with BOOL_OR aggregation pattern."""
573+
"""Test N-step conversion funnel metric with timestamp-aware chronological ordering."""
574574
events = Model(
575575
name="events",
576576
sql="""
577-
SELECT 1 AS person_id, 'Application Installed' AS event
578-
UNION ALL SELECT 1, 'note.opened'
579-
UNION ALL SELECT 1, 'note.created'
580-
UNION ALL SELECT 1, 'git.sync'
581-
UNION ALL SELECT 2, 'Application Installed'
582-
UNION ALL SELECT 2, 'note.opened'
583-
UNION ALL SELECT 2, 'note.created'
584-
UNION ALL SELECT 3, 'Application Installed'
585-
UNION ALL SELECT 3, 'note.opened'
586-
UNION ALL SELECT 4, 'Application Installed'
577+
SELECT 1 AS person_id, 'Application Installed' AS event, '2024-01-01'::TIMESTAMP AS ts
578+
UNION ALL SELECT 1, 'note.opened', '2024-01-02'::TIMESTAMP
579+
UNION ALL SELECT 1, 'note.created', '2024-01-03'::TIMESTAMP
580+
UNION ALL SELECT 1, 'git.sync', '2024-01-04'::TIMESTAMP
581+
UNION ALL SELECT 2, 'Application Installed', '2024-01-01'::TIMESTAMP
582+
UNION ALL SELECT 2, 'note.opened', '2024-01-02'::TIMESTAMP
583+
UNION ALL SELECT 2, 'note.created', '2024-01-03'::TIMESTAMP
584+
UNION ALL SELECT 3, 'Application Installed', '2024-01-01'::TIMESTAMP
585+
UNION ALL SELECT 3, 'note.opened', '2024-01-02'::TIMESTAMP
586+
UNION ALL SELECT 4, 'Application Installed', '2024-01-01'::TIMESTAMP
587587
""",
588588
primary_key="person_id",
589589
dimensions=[
590590
Dimension(name="person_id", sql="person_id", type="categorical"),
591591
Dimension(name="event", sql="event", type="categorical"),
592+
Dimension(name="ts", sql="ts", type="time"),
592593
],
593594
)
594595

@@ -638,15 +639,16 @@ def test_multistep_conversion_2_steps_via_steps_list():
638639
events = Model(
639640
name="events",
640641
sql="""
641-
SELECT 1 AS user_id, 'signup' AS event
642-
UNION ALL SELECT 1, 'purchase'
643-
UNION ALL SELECT 2, 'signup'
644-
UNION ALL SELECT 3, 'signup'
642+
SELECT 1 AS user_id, 'signup' AS event, '2024-01-01'::TIMESTAMP AS ts
643+
UNION ALL SELECT 1, 'purchase', '2024-01-02'::TIMESTAMP
644+
UNION ALL SELECT 2, 'signup', '2024-01-01'::TIMESTAMP
645+
UNION ALL SELECT 3, 'signup', '2024-01-01'::TIMESTAMP
645646
""",
646647
primary_key="user_id",
647648
dimensions=[
648649
Dimension(name="user_id", sql="user_id", type="categorical"),
649650
Dimension(name="event", sql="event", type="categorical"),
651+
Dimension(name="ts", sql="ts", type="time"),
650652
],
651653
)
652654

@@ -742,14 +744,15 @@ def test_conversion_metric_steps_and_base_event():
742744
events = Model(
743745
name="events",
744746
sql="""
745-
SELECT 1 AS person_id, 'install' AS event
746-
UNION ALL SELECT 1, 'activate'
747-
UNION ALL SELECT 2, 'install'
747+
SELECT 1 AS person_id, 'install' AS event, '2024-01-01'::TIMESTAMP AS ts
748+
UNION ALL SELECT 1, 'activate', '2024-01-02'::TIMESTAMP
749+
UNION ALL SELECT 2, 'install', '2024-01-01'::TIMESTAMP
748750
""",
749751
primary_key="person_id",
750752
dimensions=[
751753
Dimension(name="person_id", sql="person_id", type="categorical"),
752754
Dimension(name="event", sql="event", type="categorical"),
755+
Dimension(name="ts", sql="ts", type="time"),
753756
],
754757
)
755758

@@ -777,7 +780,7 @@ def test_conversion_metric_steps_and_base_event():
777780
result = conn.execute(sql)
778781
rows = df_rows(result)
779782

780-
# steps takes precedence: uses BOOL_OR pattern
783+
# steps takes precedence: uses timestamp-aware pattern
781784
assert rows[0][0] == 2 # total_entities
782785
assert rows[0][1] == 2 # step_1_count (install)
783786
assert rows[0][2] == 1 # step_2_count (activate)
@@ -819,18 +822,19 @@ def test_multistep_funnel_monotonic_counts():
819822
events = Model(
820823
name="events",
821824
sql="""
822-
SELECT 1 AS person_id, 'install' AS event
823-
UNION ALL SELECT 1, 'activate'
824-
UNION ALL SELECT 1, 'purchase'
825-
UNION ALL SELECT 2, 'install'
826-
UNION ALL SELECT 2, 'activate'
827-
UNION ALL SELECT 3, 'install'
828-
UNION ALL SELECT 5, 'purchase'
825+
SELECT 1 AS person_id, 'install' AS event, '2024-01-01'::TIMESTAMP AS ts
826+
UNION ALL SELECT 1, 'activate', '2024-01-02'::TIMESTAMP
827+
UNION ALL SELECT 1, 'purchase', '2024-01-03'::TIMESTAMP
828+
UNION ALL SELECT 2, 'install', '2024-01-01'::TIMESTAMP
829+
UNION ALL SELECT 2, 'activate', '2024-01-02'::TIMESTAMP
830+
UNION ALL SELECT 3, 'install', '2024-01-01'::TIMESTAMP
831+
UNION ALL SELECT 5, 'purchase', '2024-01-01'::TIMESTAMP
829832
""",
830833
primary_key="person_id",
831834
dimensions=[
832835
Dimension(name="person_id", sql="person_id", type="categorical"),
833836
Dimension(name="event", sql="event", type="categorical"),
837+
Dimension(name="ts", sql="ts", type="time"),
834838
],
835839
)
836840

@@ -875,16 +879,17 @@ def test_multistep_funnel_total_excludes_non_entrants():
875879
events = Model(
876880
name="events",
877881
sql="""
878-
SELECT 1 AS uid, 'signup' AS event
879-
UNION ALL SELECT 1, 'purchase'
880-
UNION ALL SELECT 2, 'signup'
881-
UNION ALL SELECT 3, 'browse'
882-
UNION ALL SELECT 4, 'browse'
882+
SELECT 1 AS uid, 'signup' AS event, '2024-01-01'::TIMESTAMP AS ts
883+
UNION ALL SELECT 1, 'purchase', '2024-01-02'::TIMESTAMP
884+
UNION ALL SELECT 2, 'signup', '2024-01-01'::TIMESTAMP
885+
UNION ALL SELECT 3, 'browse', '2024-01-01'::TIMESTAMP
886+
UNION ALL SELECT 4, 'browse', '2024-01-01'::TIMESTAMP
883887
""",
884888
primary_key="uid",
885889
dimensions=[
886890
Dimension(name="uid", sql="uid", type="categorical"),
887891
Dimension(name="event", sql="event", type="categorical"),
892+
Dimension(name="ts", sql="ts", type="time"),
888893
],
889894
)
890895

@@ -913,3 +918,115 @@ def test_multistep_funnel_total_excludes_non_entrants():
913918
assert rows[0][0] == 2 # total_entities = step 1 entrants
914919
assert rows[0][1] == 2 # step_1_count
915920
assert rows[0][2] == 1 # step_2_count (only user 1 purchased)
921+
922+
923+
def test_multistep_funnel_chronological_order_enforced():
924+
"""Test that funnel steps must occur in chronological order.
925+
926+
An entity that does step 2 before step 1 should NOT count as having
927+
completed step 2 in the funnel, even if both events exist.
928+
"""
929+
events = Model(
930+
name="events",
931+
sql="""
932+
SELECT 1 AS user_id, 'signup' AS event, '2024-01-01'::TIMESTAMP AS ts
933+
UNION ALL SELECT 1, 'purchase', '2024-01-05'::TIMESTAMP
934+
UNION ALL SELECT 2, 'purchase', '2024-01-01'::TIMESTAMP
935+
UNION ALL SELECT 2, 'signup', '2024-01-10'::TIMESTAMP
936+
UNION ALL SELECT 3, 'signup', '2024-01-01'::TIMESTAMP
937+
""",
938+
primary_key="user_id",
939+
dimensions=[
940+
Dimension(name="user_id", sql="user_id", type="categorical"),
941+
Dimension(name="event", sql="event", type="categorical"),
942+
Dimension(name="ts", sql="ts", type="time"),
943+
],
944+
)
945+
946+
funnel = Metric(
947+
name="ordered_funnel",
948+
type="conversion",
949+
entity="user_id",
950+
steps=[
951+
"event = 'signup'",
952+
"event = 'purchase'",
953+
],
954+
)
955+
956+
graph = SemanticGraph()
957+
graph.add_model(events)
958+
graph.add_metric(funnel)
959+
960+
generator = SQLGenerator(graph)
961+
sql = generator.generate(metrics=["ordered_funnel"], dimensions=[])
962+
963+
conn = duckdb.connect(":memory:")
964+
result = conn.execute(sql)
965+
rows = df_rows(result)
966+
967+
# User 1: signup(Jan 1) -> purchase(Jan 5) = chronological, counts
968+
# User 2: purchase(Jan 1) -> signup(Jan 10) = REVERSED, does NOT count for step 2
969+
# User 3: signup(Jan 1) only = step 1 only
970+
# total_entities = 3 (all have signup)
971+
# step_1_count = 3
972+
# step_2_count = 1 (only user 1 has purchase AFTER signup)
973+
assert rows[0][0] == 3 # total_entities
974+
assert rows[0][1] == 3 # step_1_count
975+
assert rows[0][2] == 1 # step_2_count (only user 1, NOT user 2)
976+
977+
978+
def test_multistep_funnel_qualified_filters():
979+
"""Test that model-qualified filters work with multistep funnels.
980+
981+
Filters like 'events.region = ...' should be normalized to 'region = ...'
982+
for SQL-backed models using FROM (<sql>) AS t.
983+
"""
984+
events = Model(
985+
name="events",
986+
sql="""
987+
SELECT 1 AS user_id, 'signup' AS event, 'US' AS region, '2024-01-01'::TIMESTAMP AS ts
988+
UNION ALL SELECT 1, 'purchase', 'US', '2024-01-02'::TIMESTAMP
989+
UNION ALL SELECT 2, 'signup', 'US', '2024-01-01'::TIMESTAMP
990+
UNION ALL SELECT 3, 'signup', 'EU', '2024-01-01'::TIMESTAMP
991+
UNION ALL SELECT 3, 'purchase', 'EU', '2024-01-02'::TIMESTAMP
992+
""",
993+
primary_key="user_id",
994+
dimensions=[
995+
Dimension(name="user_id", sql="user_id", type="categorical"),
996+
Dimension(name="event", sql="event", type="categorical"),
997+
Dimension(name="region", sql="region", type="categorical"),
998+
Dimension(name="ts", sql="ts", type="time"),
999+
],
1000+
)
1001+
1002+
funnel = Metric(
1003+
name="regional_funnel",
1004+
type="conversion",
1005+
entity="user_id",
1006+
steps=[
1007+
"event = 'signup'",
1008+
"event = 'purchase'",
1009+
],
1010+
)
1011+
1012+
graph = SemanticGraph()
1013+
graph.add_model(events)
1014+
graph.add_metric(funnel)
1015+
1016+
generator = SQLGenerator(graph)
1017+
# Use model-qualified filter: events.region = 'US'
1018+
sql = generator.generate(
1019+
metrics=["regional_funnel"],
1020+
dimensions=[],
1021+
filters=["events.region = 'US'"],
1022+
)
1023+
1024+
conn = duckdb.connect(":memory:")
1025+
result = conn.execute(sql)
1026+
rows = df_rows(result)
1027+
1028+
# Only US users: user 1 (signup + purchase), user 2 (signup only)
1029+
# User 3 is EU, filtered out
1030+
assert rows[0][0] == 2 # total_entities (US users with signup)
1031+
assert rows[0][1] == 2 # step_1_count
1032+
assert rows[0][2] == 1 # step_2_count (only user 1 purchased)

0 commit comments

Comments
 (0)