Fix colNDVBySeg attnum index mismatch in column-specific ANALYZE (#1680)

yjhjstz · web-flow · commit fac66115396c · 2026-04-22T20:38:59.000-07:00
* Fix colNDVBySeg index mismatch in do_analyze_rel

When ANALYZE is run on specific columns (e.g., ANALYZE t (col)) or when
a table has dropped columns, the vacattrstats loop index `i` diverges
from the attribute's actual attnum-1 index used by colNDVBySeg.

Two fixes:
1. QD side (line 887): read colNDVBySeg[attnum-1] instead of
   colNDVBySeg[i] when storing stadistinctbyseg.
2. Segment side (line 1011): write ctx->stadistincts[attnum-1] instead
   of ctx->stadistincts[i] when collecting per-segment NDV.

* Add regression test for colNDVBySeg index mismatch in do_analyze_rel

ANALYZE t(b) puts column b at loop index i=0 on the QD, but b has
attnum=2, so attnum-1=1 != i=0. The fix in do_analyze_rel (using
attnum-1 instead of i to index colNDVBySeg) ensures stadistinctbyseg
is read from the correct per-segment NDV slot.

Test verifies stadistinctbyseg for column b equals 100 (all distinct)
rather than ~5 (NDV of column a at index 0).
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
@@ -884,7 +884,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 
 			if (Gp_role == GP_ROLE_DISPATCH && GpPolicyIsPartitioned(onerel->rd_cdbpolicy))
 			{
-				stats->stadistinctbyseg = colNDVBySeg[i];
+				stats->stadistinctbyseg = colNDVBySeg[stats->attr->attnum - 1];
 			}
 
 			stats->tupDesc = onerel->rd_att;
@@ -1008,7 +1008,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 
 			if (Gp_role == GP_ROLE_EXECUTE) {
 				Assert(ctx->stadistincts);
-				ctx->stadistincts[i] = Float8GetDatum(stats->stadistinct);
+				ctx->stadistincts[stats->attr->attnum - 1] = Float8GetDatum(stats->stadistinct);
 			}
 
 			MemoryContextResetAndDeleteChildren(col_context);
diff --git a/src/test/regress/expected/analyze.out b/src/test/regress/expected/analyze.out
@@ -1314,3 +1314,30 @@ select * from pg_stats where tablename like 'part2';
 (1 row)
 
 drop table multipart cascade;
+--
+-- Test column-specific ANALYZE correctly uses attnum-based NDV index (not loop index).
+-- When ANALYZE t(b) is run, the QD loop has i=0 for column b (attnum=2),
+-- so attnum-1=1 != i=0. Without the fix, colNDVBySeg[i=0] reads column a's NDV
+-- instead of column b's NDV.
+--
+CREATE TABLE analyze_col_ndv_drop (a int, b int, c int) DISTRIBUTED BY (a);
+INSERT INTO analyze_col_ndv_drop SELECT i%5, i, i%50 FROM generate_series(1, 100) i;
+-- ANALYZE specific column b: QD loop has i=0, b.attnum=2, so attnum-1=1 != i=0
+ANALYZE analyze_col_ndv_drop (b);
+-- stadistinctbyseg for b should be 100 (all distinct), not ~5 (NDV of column a at index 0)
+SELECT a.attname,
+       CASE WHEN s.stakind1 = 8 THEN array_to_string(s.stavalues1, ',')
+            WHEN s.stakind2 = 8 THEN array_to_string(s.stavalues2, ',')
+            WHEN s.stakind3 = 8 THEN array_to_string(s.stavalues3, ',')
+            WHEN s.stakind4 = 8 THEN array_to_string(s.stavalues4, ',')
+            WHEN s.stakind5 = 8 THEN array_to_string(s.stavalues5, ',')
+       END AS stadistinctbyseg
+FROM pg_statistic s
+JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum
+WHERE s.starelid = 'analyze_col_ndv_drop'::regclass AND a.attname = 'b';
+ attname | stadistinctbyseg 
+---------+------------------
+ b       | 100
+(1 row)
+
+DROP TABLE analyze_col_ndv_drop;
diff --git a/src/test/regress/sql/analyze.sql b/src/test/regress/sql/analyze.sql
@@ -677,3 +677,26 @@ analyze verbose p2;
 select * from pg_stats where tablename like 'part2';
 
 drop table multipart cascade;
+
+--
+-- Test column-specific ANALYZE correctly uses attnum-based NDV index (not loop index).
+-- When ANALYZE t(b) is run, the QD loop has i=0 for column b (attnum=2),
+-- so attnum-1=1 != i=0. Without the fix, colNDVBySeg[i=0] reads column a's NDV
+-- instead of column b's NDV.
+--
+CREATE TABLE analyze_col_ndv_drop (a int, b int, c int) DISTRIBUTED BY (a);
+INSERT INTO analyze_col_ndv_drop SELECT i%5, i, i%50 FROM generate_series(1, 100) i;
+-- ANALYZE specific column b: QD loop has i=0, b.attnum=2, so attnum-1=1 != i=0
+ANALYZE analyze_col_ndv_drop (b);
+-- stadistinctbyseg for b should be 100 (all distinct), not ~5 (NDV of column a at index 0)
+SELECT a.attname,
+       CASE WHEN s.stakind1 = 8 THEN array_to_string(s.stavalues1, ',')
+            WHEN s.stakind2 = 8 THEN array_to_string(s.stavalues2, ',')
+            WHEN s.stakind3 = 8 THEN array_to_string(s.stavalues3, ',')
+            WHEN s.stakind4 = 8 THEN array_to_string(s.stavalues4, ',')
+            WHEN s.stakind5 = 8 THEN array_to_string(s.stavalues5, ',')
+       END AS stadistinctbyseg
+FROM pg_statistic s
+JOIN pg_attribute a ON a.attrelid = s.starelid AND a.attnum = s.staattnum
+WHERE s.starelid = 'analyze_col_ndv_drop'::regclass AND a.attname = 'b';
+DROP TABLE analyze_col_ndv_drop;

Original file line number	Diff line number	Diff line change
`@@ -884,7 +884,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params,`
`884`	`884`
`885`	`885`	`if (Gp_role == GP_ROLE_DISPATCH && GpPolicyIsPartitioned(onerel->rd_cdbpolicy))`
`886`	`886`	`{`
`887`		`- stats->stadistinctbyseg = colNDVBySeg[i];`
	`887`	`+ stats->stadistinctbyseg = colNDVBySeg[stats->attr->attnum - 1];`
`888`	`888`	`}`
`889`	`889`
`890`	`890`	`stats->tupDesc = onerel->rd_att;`
`@@ -1008,7 +1008,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params,`
`1008`	`1008`
`1009`	`1009`	`if (Gp_role == GP_ROLE_EXECUTE) {`
`1010`	`1010`	`Assert(ctx->stadistincts);`
`1011`		`- ctx->stadistincts[i] = Float8GetDatum(stats->stadistinct);`
	`1011`	`+ ctx->stadistincts[stats->attr->attnum - 1] = Float8GetDatum(stats->stadistinct);`
`1012`	`1012`	`}`
`1013`	`1013`
`1014`	`1014`	`MemoryContextResetAndDeleteChildren(col_context);`