diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000..66eb2d7 --- /dev/null +++ b/clippy.toml @@ -0,0 +1,5 @@ +allow-indexing-slicing-in-tests = true +allow-panic-in-tests = true +allow-unwrap-in-tests = true +allow-expect-in-tests = true +allow-dbg-in-tests = true diff --git a/crates/dry_run_cli/src/main.rs b/crates/dry_run_cli/src/main.rs index 8020d12..338766e 100644 --- a/crates/dry_run_cli/src/main.rs +++ b/crates/dry_run_cli/src/main.rs @@ -7,7 +7,6 @@ use clap::{Parser, Subcommand}; use dry_run_core::history::{ DatabaseId, PutOutcome, SnapshotKey, SnapshotRef, SnapshotStore, TimeRange, }; -use dry_run_core::schema::{NodeColumnStats, NodeIndexStats, NodeStats, NodeTableStats}; use dry_run_core::{DryRun, HistoryStore, ProjectConfig}; use rmcp::ServiceExt; @@ -36,8 +35,6 @@ enum Command { }, Import { file: PathBuf, - #[arg(long, num_args = 1..)] - stats: Vec, }, Probe { #[arg(long, env = "DATABASE_URL")] @@ -59,8 +56,6 @@ enum Command { #[arg(short, long)] output: Option, #[arg(long)] - stats_only: bool, - #[arg(long)] name: Option, }, Snapshot { @@ -71,10 +66,6 @@ enum Command { #[command(subcommand)] action: ProfileAction, }, - Stats { - #[command(subcommand)] - action: StatsAction, - }, Drift { #[arg(long, env = "DATABASE_URL")] db: Option, @@ -97,18 +88,6 @@ enum Command { }, } -#[derive(Subcommand)] -enum StatsAction { - Apply { - #[arg(long, env = "DATABASE_URL")] - db: Option, - #[arg(long, short)] - schema_file: Option, - #[arg(long, short)] - node: Option, - }, -} - #[derive(Subcommand)] enum SnapshotAction { Take { @@ -117,6 +96,24 @@ enum SnapshotAction { #[arg(long)] history_db: Option, }, + /// Capture activity counters from a replica. + /// + /// Connects to `--from ` (a replica) and writes a single + /// activity_stats row tagged with `--label`. Use `dryrun snapshot take` + /// against the primary instead to capture schema and planner stats. 
+ Activity { + /// Replica connection URL (must report pg_is_in_recovery() = true) + #[arg(long)] + from: String, + /// Label identifying this node in the history db (e.g. `replica1`) + #[arg(long)] + label: String, + /// Allow capture even if no schema snapshot exists for the project yet. + #[arg(long)] + allow_orphan: bool, + #[arg(long)] + history_db: Option, + }, List { #[arg(long, env = "DATABASE_URL")] db: Option, @@ -176,7 +173,6 @@ async fn run(cli: Cli) -> anyhow::Result<()> { ref source, pretty, ref output, - stats_only, ref name, } => { cmd_dump_schema( @@ -184,16 +180,12 @@ async fn run(cli: Cli) -> anyhow::Result<()> { source.as_deref(), pretty, output.clone(), - stats_only, name.clone(), ) .await } Command::Init { ref db } => cmd_init(db.as_deref()).await, - Command::Import { - ref file, - ref stats, - } => cmd_import(&cli, file, stats).await, + Command::Import { ref file } => cmd_import(&cli, file).await, Command::Lint { ref schema_name, pretty, @@ -201,7 +193,6 @@ async fn run(cli: Cli) -> anyhow::Result<()> { } => cmd_lint(&cli, schema_name.as_deref(), pretty, json).await, Command::Snapshot { ref action } => cmd_snapshot(&cli, action).await, Command::Profile { ref action } => cmd_profile(&cli, action), - Command::Stats { ref action } => cmd_stats(&cli, action).await, Command::Drift { ref db, ref against, @@ -259,7 +250,6 @@ async fn cmd_dump_schema( source: Option<&str>, pretty: bool, output: Option, - stats_only: bool, name: Option, ) -> anyhow::Result<()> { let resolved = active_resolved_profile(cli, source, None)?; @@ -270,81 +260,9 @@ async fn cmd_dump_schema( let name = name.or_else(|| resolved.database_id.as_ref().map(|d| d.0.clone())); let ctx = DryRun::connect(db_url).await?; - if stats_only { - let source = - name.ok_or_else(|| anyhow::anyhow!("--name is required when using --stats-only"))?; - let node_stats = ctx.introspect_stats_only(&source).await?; - - let json = if pretty { - serde_json::to_string_pretty(&node_stats)? 
- } else { - serde_json::to_string(&node_stats)? - }; - - if let Some(path) = &output { - std::fs::write(path, &json)?; - eprintln!( - "Stats written to {} ({} tables, {} indexes)", - path.display(), - node_stats.table_stats.len(), - node_stats.index_stats.len() - ); - } else { - println!("{json}"); - } - return Ok(()); - } - let mut snapshot = ctx.introspect_schema().await?; snapshot.source = name; - if let Some(ref source) = snapshot.source { - let mut table_stats = Vec::new(); - let mut index_stats = Vec::new(); - let mut column_stats = Vec::new(); - - for table in &snapshot.tables { - if let Some(ref ts) = table.stats { - table_stats.push(NodeTableStats { - schema: table.schema.clone(), - table: table.name.clone(), - stats: ts.clone(), - }); - } - for idx in &table.indexes { - if let Some(ref is) = idx.stats { - index_stats.push(NodeIndexStats { - schema: table.schema.clone(), - table: table.name.clone(), - index_name: idx.name.clone(), - stats: is.clone(), - }); - } - } - for col in &table.columns { - if let Some(ref cs) = col.stats { - column_stats.push(NodeColumnStats { - schema: table.schema.clone(), - table: table.name.clone(), - column: col.name.clone(), - stats: cs.clone(), - }); - } - } - } - - let is_standby = ctx.is_standby().await?; - - snapshot.node_stats = vec![NodeStats { - source: source.clone(), - timestamp: snapshot.timestamp, - is_standby, - table_stats, - index_stats, - column_stats, - }]; - } - let json = if pretty { serde_json::to_string_pretty(&snapshot)? } else { @@ -516,6 +434,15 @@ async fn cmd_snapshot(cli: &Cli, action: &SnapshotAction) -> anyhow::Result<()> SnapshotAction::Take { db, history_db } => { let db_url = require_db_url(db.as_deref())?; let ctx = DryRun::connect(db_url).await?; + + if ctx.is_standby().await? 
{ + anyhow::bail!( + "`dryrun snapshot take` must run against the primary; \ + use `dryrun snapshot activity --from --label ` \ + to capture activity from a replica" + ); + } + let store = open_history_store(history_db.as_deref())?; let snapshot = ctx.introspect_schema().await?; @@ -526,7 +453,8 @@ async fn cmd_snapshot(cli: &Cli, action: &SnapshotAction) -> anyhow::Result<()> let resolved = config.resolve_profile(Some(db_url), None, profile, &cwd)?; let key = complete_key(&resolved, &snapshot.database); - match store.put(&key, &snapshot).await? { + let schema_outcome = store.put(&key, &snapshot).await?; + match schema_outcome { PutOutcome::Inserted => { println!("Snapshot saved: {}", snapshot.content_hash); println!( @@ -535,19 +463,116 @@ async fn cmd_snapshot(cli: &Cli, action: &SnapshotAction) -> anyhow::Result<()> snapshot.views.len(), snapshot.functions.len() ); + } + PutOutcome::Deduped => { + println!("Schema unchanged (hash: {})", snapshot.content_hash); + } + } + + let planner = ctx.introspect_planner_stats(&snapshot.content_hash).await?; + let planner_outcome = store.put_planner_stats(&key, &planner).await?; + match planner_outcome { + PutOutcome::Inserted => { println!( - " project={} database={}", - key.project_id.0, key.database_id.0 + "Planner stats saved: {} ({} tables, {} columns, {} indexes)", + planner.content_hash, + planner.tables.len(), + planner.columns.len(), + planner.indexes.len(), + ); + } + PutOutcome::Deduped => { + println!("Planner stats unchanged (hash: {})", planner.content_hash); + } + } + + let activity = ctx + .introspect_activity_stats(&snapshot.content_hash, "primary") + .await?; + let activity_outcome = store.put_activity_stats(&key, &activity).await?; + match activity_outcome { + PutOutcome::Inserted => { + println!( + "Activity stats saved: {} (label=primary, {} tables, {} indexes)", + activity.content_hash, + activity.tables.len(), + activity.indexes.len(), + ); + } + PutOutcome::Deduped => { + println!("Activity stats 
unchanged (hash: {})", activity.content_hash); + } + } + + println!( + " project={} database={}", + key.project_id.0, key.database_id.0 + ); + Ok(()) + } + SnapshotAction::Activity { + from, + label, + allow_orphan, + history_db, + } => { + let ctx = DryRun::connect(from).await?; + if !ctx.is_standby().await? { + anyhow::bail!( + "`dryrun snapshot activity` must run against a standby \ + (--from must report pg_is_in_recovery() = true); \ + use `dryrun snapshot take` against the primary instead" + ); + } + + let store = open_history_store(history_db.as_deref())?; + let cwd = std::env::current_dir().unwrap_or_default(); + let config = ProjectConfig::discover(&cwd) + .map(|(_, c)| Ok(c)) + .unwrap_or_else(|| ProjectConfig::parse(""))?; + let resolved = config.resolve_profile(Some(from), None, profile, &cwd)?; + let database = ctx.current_database().await?; + let key = complete_key(&resolved, &database); + + let schema_ref = match store.latest_schema_hash(&key).await? { + Some(h) => h, + None if *allow_orphan => String::new(), + None => anyhow::bail!( + "no schema snapshot found for project={} database={}; \ + run `dryrun snapshot take` against the primary first, \ + or pass --allow-orphan to capture activity anyway", + key.project_id.0, + key.database_id.0, + ), + }; + + let activity = ctx.introspect_activity_stats(&schema_ref, label).await?; + match store.put_activity_stats(&key, &activity).await? 
{ + PutOutcome::Inserted => { + println!( + "Activity stats saved: {} (label={}, {} tables, {} indexes)", + activity.content_hash, + label, + activity.tables.len(), + activity.indexes.len(), ); } PutOutcome::Deduped => { - println!("Schema unchanged (hash: {})", snapshot.content_hash); println!( - " project={} database={}", - key.project_id.0, key.database_id.0 + "Activity stats unchanged (hash: {}, label={})", + activity.content_hash, label ); } } + if schema_ref.is_empty() { + println!(" (orphan capture: no matching schema snapshot)"); + } else { + println!(" schema_ref={schema_ref}"); + } + println!( + " project={} database={}", + key.project_id.0, key.database_id.0 + ); Ok(()) } SnapshotAction::List { db, history_db } => { @@ -695,37 +720,14 @@ fn cmd_profile(cli: &Cli, action: &ProfileAction) -> anyhow::Result<()> { Ok(()) } -async fn cmd_import( - cli: &Cli, - file: &std::path::Path, - stats_files: &[PathBuf], -) -> anyhow::Result<()> { +async fn cmd_import(cli: &Cli, file: &std::path::Path) -> anyhow::Result<()> { let json = std::fs::read_to_string(file)?; - let mut snapshot: dry_run_core::SchemaSnapshot = serde_json::from_str(&json) + let snapshot: dry_run_core::SchemaSnapshot = serde_json::from_str(&json) .map_err(|e| anyhow::anyhow!("invalid schema JSON in '{}': {e}", file.display()))?; - if !stats_files.is_empty() { - for stats_path in stats_files { - let stats_json = std::fs::read_to_string(stats_path)?; - let node_stats: dry_run_core::NodeStats = - serde_json::from_str(&stats_json).map_err(|e| { - anyhow::anyhow!("invalid stats JSON in '{}': {e}", stats_path.display()) - })?; - eprintln!( - " merging stats from '{}' ({} tables, {} indexes)", - node_stats.source, - node_stats.table_stats.len(), - node_stats.index_stats.len() - ); - snapshot.node_stats.push(node_stats); - } - } - let data_dir = dry_run_core::history::default_data_dir()?; std::fs::create_dir_all(&data_dir)?; - // route to the resolved profile's schema_file when one is configured; - // 
fall back to .dryrun/schema.json let out_path = active_resolved_profile(cli, None, None) .ok() .and_then(|r| r.schema_file) @@ -737,72 +739,13 @@ async fn cmd_import( std::fs::write(&out_path, &out_json)?; eprintln!( - "Imported {} tables to {}{}", + "Imported {} tables to {}", snapshot.tables.len(), out_path.display(), - if snapshot.node_stats.is_empty() { - String::new() - } else { - format!(" (with {} node stats)", snapshot.node_stats.len()) - } ); Ok(()) } -async fn cmd_stats(cli: &Cli, action: &StatsAction) -> anyhow::Result<()> { - match action { - StatsAction::Apply { - db, - schema_file, - node, - } => { - let resolved = active_resolved_profile(cli, db.as_deref(), schema_file.as_deref())?; - let db_url = resolved - .db_url - .as_deref() - .ok_or_else(|| anyhow::anyhow!("--db or a profile with db_url is required"))?; - - let snapshot = match resolved.schema_file.as_deref() { - Some(path) => load_schema_file(path)?, - None => resolve_schema(schema_file.as_deref(), None, None)?, - }; - - let ctx = DryRun::connect(db_url).await?; - - let result = - dry_run_core::schema::apply_stats(ctx.pool(), &snapshot, node.as_deref()).await?; - - // pg_regresql warning - if !result.regresql_loaded { - eprintln!(); - eprintln!(" pg_regresql extension is not loaded."); - eprintln!(" Without it, PostgreSQL ignores pg_class.reltuples/relpages and uses"); - eprintln!(" physical file sizes instead. 
Your injected row counts will have no"); - eprintln!(" effect on EXPLAIN cost estimates."); - eprintln!(); - eprintln!(" Install: sudo pgxn install pg_regresql"); - eprintln!(" Then: CREATE EXTENSION pg_regresql;"); - eprintln!(" See: https://github.com/boringSQL/regresql"); - eprintln!(); - } - - eprintln!( - "Applied: {} tables, {} indexes, {} columns", - result.tables_updated, result.indexes_updated, result.columns_injected - ); - - if !result.skipped.is_empty() { - eprintln!("Skipped ({}):", result.skipped.len()); - for s in &result.skipped { - eprintln!(" {s}"); - } - } - - Ok(()) - } - } -} - async fn cmd_drift( cli: &Cli, db: Option<&str>, @@ -1041,54 +984,86 @@ async fn cmd_mcp_serve( let candidates = schema_candidate_paths(schema_path, project_config.as_ref(), cli.profile.as_deref()); - // try to load schema — if missing, start in uninitialized mode; - // if file exists but is broken, propagate the error - let schema_path_result = - resolve_schema_path(schema_path, project_config.as_ref(), cli.profile.as_deref()); + let resolved_profile = project_config.as_ref().and_then(|c| { + c.resolve_profile(None, schema_path, cli.profile.as_deref(), &cwd) + .ok() + }); + + let json_snapshot: Option = + resolve_schema_path(schema_path, project_config.as_ref(), cli.profile.as_deref()) + .ok() + .and_then(|p| load_schema_file(&p).ok()); - let server = match schema_path_result { - Ok(schema_file) => { - let json = std::fs::read_to_string(&schema_file)?; - let snapshot: dry_run_core::SchemaSnapshot = serde_json::from_str(&json)?; + // SnapshotKey for profile/database (if present) + let snapshot_key = resolved_profile.as_ref().and_then(|r| { + let db_name = r + .database_id + .as_ref() + .map(|d| d.0.clone()) + .or_else(|| json_snapshot.as_ref().map(|s| s.database.clone()))?; + Some(complete_key(r, &db_name)) + }); + + // try history.db file, if not found fall back to JSON file + let history_store = HistoryStore::open_default().ok(); + + let annotated_from_history = match 
(history_store.as_ref(), snapshot_key.as_ref()) { + (Some(store), Some(key)) => store.get_annotated(key, SnapshotRef::Latest).await.ok(), + _ => None, + }; + + let annotated = match annotated_from_history { + Some(a) => { eprintln!( - "dryrun: loaded schema from {} ({} tables)", - schema_file.display(), - snapshot.tables.len() + "dryrun: loaded annotated snapshot from history.db ({} tables, planner: {}, activity nodes: {})", + a.schema.tables.len(), + if a.planner.is_some() { "yes" } else { "no" }, + a.activity_by_node.len(), ); + Some(a) + } + None => json_snapshot.map(|s| { + eprintln!( + "dryrun: loaded {} tables from schema.json (planner/activity unavailable; run `dryrun snapshot take` to capture stats)", + s.tables.len(), + ); + mcp::wrap_schema_only(s) + }), + }; - // optional --db enables live tools (explain_query, refresh_schema) - let effective_db = db.map(|s| s.to_string()).or_else(|| { - if let Some(ref config) = project_config - && let Ok(resolved) = - config.resolve_profile(None, None, cli.profile.as_deref(), &cwd) - { - return resolved.db_url; - } - None - }); + // --db enables live tools (explain_query, refresh_schema) + let effective_db = db + .map(|s| s.to_string()) + .or_else(|| resolved_profile.as_ref().and_then(|r| r.db_url.clone())); - let db_connection = if let Some(ref db_url) = effective_db { - let ctx = DryRun::connect(db_url).await?; - eprintln!("dryrun: connected to local db (live tools enabled)"); - Some((db_url.as_str(), ctx)) - } else { - eprintln!("dryrun: offline mode (explain_query, refresh_schema disabled)"); - None - }; + let db_connection = if let Some(ref db_url) = effective_db { + let ctx = DryRun::connect(db_url).await?; + eprintln!("dryrun: connected to live db (live tools enabled)"); + Some((db_url.as_str(), ctx)) + } else { + eprintln!("dryrun: offline mode (explain_query, refresh_schema disabled)"); + None + }; - mcp::DryRunServer::from_snapshot_with_db( - snapshot, + let server = match annotated { + Some(a) => { + let mut 
s = mcp::DryRunServer::from_annotated_with_db( + a, db_connection, lint_config, pgmustard_api_key, get_version(), candidates, - ) + ); + if let Some(store) = history_store { + s = s.with_history(store, snapshot_key); + } + s } - Err(_) => { + None => { eprintln!( "dryrun: no schema found — starting in uninitialized mode\n\ - dryrun: use the reload_schema tool after running dump-schema" + dryrun: use the reload_schema tool after running dump-schema or snapshot take" ); mcp::DryRunServer::uninitialized(lint_config, get_version(), candidates) } @@ -1139,7 +1114,6 @@ mod tests { functions: vec![], extensions: vec![], gucs: vec![], - node_stats: vec![], } } diff --git a/crates/dry_run_cli/src/mcp/helpers.rs b/crates/dry_run_cli/src/mcp/helpers.rs index 707227a..4a3aa78 100644 --- a/crates/dry_run_cli/src/mcp/helpers.rs +++ b/crates/dry_run_cli/src/mcp/helpers.rs @@ -1,4 +1,4 @@ -use dry_run_core::schema::NodeStats; +use dry_run_core::schema::{AnnotatedSnapshot, QualifiedName}; use rmcp::ErrorData as McpError; pub fn to_mcp_err(e: dry_run_core::Error) -> McpError { @@ -23,51 +23,78 @@ pub fn format_number(n: i64) -> String { result.chars().rev().collect() } +// Render a per-node activity table for one (schema, table) pair, attached +// as a trailer to MCP tool output. +// +// Sizing columns (`reltuples`, `relpages`, `table_size`) come from the +// planner snapshot — those are byte-identical across replicas (they're +// replicated via WAL), so it would be misleading to render one column per +// node. Counter columns (`seq_scan`, `idx_scan`) come from each node's +// activity row and naturally vary node-to-node. +// +// Returns None when there's no activity at all (single-node, no captures +// yet); the caller skips the section in that case. 
pub fn format_node_table_breakdown( - node_stats: &[NodeStats], + annotated: &AnnotatedSnapshot, schema: &str, table: &str, ) -> Option { - if node_stats.is_empty() { + if annotated.activity_by_node.is_empty() { return None; } - let newest = node_stats.iter().map(|ns| ns.timestamp).max(); + let qn = QualifiedName::new(schema, table); + let view = annotated.view(); + + // Pull sizing once — it's the same regardless of which node we're + // displaying. `unwrap_or` zeros so the table still renders cleanly + // when the planner snapshot is missing. + let reltuples = view.reltuples(&qn).unwrap_or(0.0); + let relpages = view.relpages(&qn).unwrap_or(0); + let table_size = view.table_size(&qn).unwrap_or(0); + + // Stale = "this node's activity capture is more than 7 days older + // than the freshest one in the bundle." Surfaces forgotten replicas. + let newest = annotated + .activity_by_node + .values() + .map(|a| a.timestamp) + .max(); let stale_threshold = newest.map(|t| t - chrono::TimeDelta::days(7)); let mut lines: Vec = Vec::new(); lines.push(format!( "\nPer-node breakdown ({} node(s)):\n", - node_stats.len() + annotated.activity_by_node.len() )); lines.push(format!( "{:<16} {:>12} {:>10} {:>10} {:>10} {:>12} {}", "", "reltuples", "relpages", "seq_scan", "idx_scan", "table_size", "collected" )); - for ns in node_stats { - let ts = ns - .table_stats - .iter() - .find(|t| t.table == table && t.schema == schema); - - if let Some(ts) = ts { - let size_mb = ts.stats.table_size / (1024 * 1024); - let collected = ns.timestamp.format("%Y-%m-%d %H:%M"); - let stale = stale_threshold.is_some_and(|threshold| ns.timestamp < threshold); + for (label, activity) in &annotated.activity_by_node { + let ta = activity.tables.iter().find(|e| e.table == qn); + if let Some(ta) = ta { + let size_mb = table_size / (1024 * 1024); + let collected = activity.timestamp.format("%Y-%m-%d %H:%M"); + let stale = stale_threshold.is_some_and(|threshold| activity.timestamp < threshold); + // 
idx_scan_sum on a single index would be ambiguous here — + // the table-level row aggregates across all indexes already + // (TableActivity.idx_scan), so we read it directly off the + // entry. lines.push(format!( "{:<16} {:>12} {:>10} {:>10} {:>10} {:>9} MB {}{}", - ns.source, - format_number(ts.stats.reltuples as i64), - format_number(ts.stats.relpages), - format_number(ts.stats.seq_scan), - format_number(ts.stats.idx_scan), + label, + format_number(reltuples as i64), + format_number(relpages), + format_number(ta.activity.seq_scan), + format_number(ta.activity.idx_scan), format_number(size_mb), collected, if stale { " (stale)" } else { "" }, )); } else { - lines.push(format!("{:<16} (no data for this table)", ns.source)); + lines.push(format!("{:<16} (no data for this table)", label)); } } diff --git a/crates/dry_run_cli/src/mcp/mod.rs b/crates/dry_run_cli/src/mcp/mod.rs index a3e4814..c57650a 100644 --- a/crates/dry_run_cli/src/mcp/mod.rs +++ b/crates/dry_run_cli/src/mcp/mod.rs @@ -2,4 +2,4 @@ mod helpers; mod params; mod server; -pub use server::DryRunServer; +pub use server::{DryRunServer, wrap_schema_only}; diff --git a/crates/dry_run_cli/src/mcp/server.rs b/crates/dry_run_cli/src/mcp/server.rs index bcd6221..dc16aae 100644 --- a/crates/dry_run_cli/src/mcp/server.rs +++ b/crates/dry_run_cli/src/mcp/server.rs @@ -13,24 +13,91 @@ use tracing::info; use dry_run_core::audit::AuditConfig; use dry_run_core::history::{SnapshotKey, SnapshotRef, SnapshotStore}; use dry_run_core::lint::LintConfig; -use dry_run_core::schema::{ - ConstraintKind, detect_seq_scan_imbalance, detect_stale_stats, detect_unused_indexes, - effective_table_stats, -}; -use dry_run_core::{DryRun, HistoryStore, SchemaSnapshot}; +use dry_run_core::schema::{ConstraintKind, NodeSelector, QualifiedName}; +use dry_run_core::{AnnotatedSnapshot, DryRun, HistoryStore, SchemaSnapshot}; use crate::pgmustard::PgMustardClient; use super::helpers::{format_node_table_breakdown, format_number, to_mcp_err}; use 
super::params::*; +async fn persist_refresh( + store: &HistoryStore, + key: &SnapshotKey, + schema: &SchemaSnapshot, + planner: Option<&dry_run_core::PlannerStatsSnapshot>, + activity_by_node: &std::collections::BTreeMap, +) { + if let Err(e) = store.put(key, schema).await { + tracing::warn!(error = %e, "failed to persist schema"); + } + if let Some(p) = planner + && let Err(e) = store.put_planner_stats(key, p).await + { + tracing::warn!(error = %e, "failed to persist planner stats"); + } + if let Some(a) = activity_by_node.get("primary") + && let Err(e) = store.put_activity_stats(key, a).await + { + tracing::warn!(error = %e, "failed to persist activity stats"); + } +} + +pub fn wrap_schema_only(schema: SchemaSnapshot) -> AnnotatedSnapshot { + AnnotatedSnapshot { + schema, + planner: None, + activity_by_node: std::collections::BTreeMap::new(), + } +} + +fn build_inline( + schema: SchemaSnapshot, + planner: Option, + primary_activity: Option, +) -> AnnotatedSnapshot { + let mut activity_by_node = std::collections::BTreeMap::new(); + if let Some(a) = primary_activity { + activity_by_node.insert("primary".to_string(), a); + } + AnnotatedSnapshot { + schema, + planner, + activity_by_node, + } +} + +async fn rebuild_after_refresh( + schema: SchemaSnapshot, + planner: Option, + primary_activity: Option, + history: Option<(&HistoryStore, &SnapshotKey)>, +) -> AnnotatedSnapshot { + let mut annotated = build_inline(schema, planner, primary_activity); + if let Some((store, key)) = history { + persist_refresh( + store, + key, + &annotated.schema, + annotated.planner.as_ref(), + &annotated.activity_by_node, + ) + .await; + match store.get_annotated(key, SnapshotRef::Latest).await { + Ok(a) => annotated = a, + Err(e) => tracing::warn!(error = %e, "history reload after refresh failed"), + } + } + annotated +} + #[derive(Clone)] pub struct DryRunServer { ctx: Option>, app_version: String, pg_version_display: String, database_name: String, - schema: Arc>>, + schema: Arc>>, 
history: Option>, snapshot_key: Option, lint_config: LintConfig, @@ -41,8 +108,8 @@ pub struct DryRunServer { } impl DryRunServer { - pub fn from_snapshot_with_db( - snapshot: SchemaSnapshot, + pub fn from_annotated_with_db( + annotated: AnnotatedSnapshot, db: Option<(&str, DryRun)>, lint_config: LintConfig, pgmustard_api_key: Option, @@ -52,16 +119,18 @@ impl DryRunServer { let ctx = db.map(|(_url, ctx)| Arc::new(ctx)); let pg_version_display = - dry_run_core::PgVersion::parse_from_version_string(&snapshot.pg_version) + dry_run_core::PgVersion::parse_from_version_string(&annotated.schema.pg_version) .map(|v| format!("{}.{}.{}", v.major, v.minor, v.patch)) .unwrap_or_default(); - let database_name = snapshot.database.clone(); + let database_name = annotated.schema.database.clone(); info!( - tables = snapshot.tables.len(), - database = %snapshot.database, + tables = annotated.schema.tables.len(), + database = %annotated.schema.database, + planner = annotated.planner.is_some(), + activity_nodes = annotated.activity_by_node.len(), live_db = ctx.is_some(), - "loaded schema from file" + "loaded annotated snapshot" ); Self { @@ -69,7 +138,7 @@ impl DryRunServer { app_version: app_version.to_string(), pg_version_display, database_name, - schema: Arc::new(RwLock::new(Some(snapshot))), + schema: Arc::new(RwLock::new(Some(annotated))), history: None, snapshot_key: None, lint_config, @@ -111,7 +180,6 @@ impl DryRunServer { } } - #[allow(dead_code)] pub fn with_history(mut self, store: HistoryStore, key: Option) -> Self { self.history = Some(Arc::new(store)); self.snapshot_key = key; @@ -119,6 +187,10 @@ impl DryRunServer { } async fn get_schema(&self) -> Result { + Ok(self.get_annotated().await?.schema) + } + + async fn get_annotated(&self) -> Result { let guard = self.schema.read().await; guard.clone().ok_or_else(|| { McpError::internal_error( @@ -190,7 +262,7 @@ impl DryRunServer { &self, Parameters(params): Parameters, ) -> Result { - let snapshot = 
self.get_schema().await?; + let annotated = self.get_annotated().await?; let limit = params.limit.unwrap_or(50); let offset = params.offset.unwrap_or(0); let sort_by = params.sort.as_deref().unwrap_or("name"); @@ -202,19 +274,23 @@ impl DryRunServer { size: i64, } - let mut entries: Vec = snapshot + // Default node selector: "primary" — single-node planner data + // is the right fit for a row-count summary. The "N nodes" suffix + // counts how many distinct activity captures we have, which + // signals "we have multi-node data for this cluster" but doesn't + // change the headline number. + let view = annotated.view(); + let node_count = annotated.activity_by_node.len(); + + let mut entries: Vec = annotated + .schema .tables .iter() .filter(|t| params.schema.as_ref().is_none_or(|s| &t.schema == s)) .map(|t| { - let node_count = if snapshot.node_stats.is_empty() { - 0 - } else { - snapshot.node_stats.len() - }; - let stats = effective_table_stats(t, &snapshot); - let rows = stats.as_ref().map(|s| s.reltuples).unwrap_or(0.0); - let size = stats.as_ref().map(|s| s.table_size).unwrap_or(0); + let qn = QualifiedName::new(&t.schema, &t.name); + let rows = view.reltuples(&qn).unwrap_or(0.0); + let size = view.table_size(&qn).unwrap_or(0); let row_est = if rows > 0.0 { if node_count > 0 { format!(" (~{} rows, {} nodes)", rows as i64, node_count) @@ -295,10 +371,15 @@ impl DryRunServer { &self, Parameters(params): Parameters, ) -> Result { - let snapshot = self.get_schema().await?; + // Pull the annotated bundle — every stats field this tool surfaces + // (reltuples, dead tuples, last vacuum, per-node breakdown, column + // profiles) reads from planner / activity, not from the legacy + // embedded fields. 
+ let annotated = self.get_annotated().await?; let schema_name = params.schema.as_deref().unwrap_or("public"); - let table = snapshot + let table = annotated + .schema .tables .iter() .find(|t| t.name == params.table && t.schema == schema_name) @@ -310,33 +391,113 @@ impl DryRunServer { })?; let detail = params.detail.as_deref().unwrap_or("summary"); - let table_rows = effective_table_stats(table, &snapshot) - .map(|s| s.reltuples) - .unwrap_or(0.0); - - // build column profiles + let qn = QualifiedName::new(schema_name, ¶ms.table); + let view = annotated.view(); + let table_rows = view.reltuples(&qn).unwrap_or(0.0); + + // Build column profiles — pull each column's stats out of the + // planner snapshot via `column_stats(qn, name)`. Profile is None + // when no stats are present, in which case the column is omitted + // from the profiles array (matches legacy behavior). let profiles: Vec = table .columns .iter() .filter_map(|col| { - dry_run_core::schema::profile_column(col, table_rows).map(|p| { + let stats = view.column_stats(&qn, &col.name); + dry_run_core::schema::profile_column(&col.name, &col.type_name, stats, table_rows) + .map(|p| { + serde_json::json!({ + "column": col.name, + "profile": p, + }) + }) + }) + .collect(); + + // Synthesize a "stats" JSON object that mirrors the legacy + // TableStats shape, but built from planner sizing + (merged-or-single) + // activity. Returns an empty object when no stats are captured — + // intentionally distinct from `null` so consumers can tell the + // difference between "no snapshot yet" (object missing) vs. + // "snapshot exists, no rows for this table" (object empty). 
+ let synth_stats = serde_json::json!({ + "reltuples": view.reltuples(&qn), + "relpages": view.relpages(&qn), + "table_size": view.table_size(&qn), + "dead_tuples": view.n_dead_tup_sum(&qn), + "seq_scan": view.seq_scan_sum(&qn), + "last_vacuum": view.last_vacuum_max(&qn), + "last_analyze": view.last_analyze_max(&qn), + "vacuum_count": view.vacuum_count_sum(&qn), + }); + + // Enrich partition_info with per-child sizing and nactivity. + let synth_partition_info = table.partition_info.as_ref().map(|pi| { + let children: Vec = pi + .children + .iter() + .map(|c| { + let cqn = QualifiedName::new(&c.schema, &c.name); serde_json::json!({ - "column": col.name, - "profile": p, + "schema": c.schema, + "name": c.name, + "bound": c.bound, + "reltuples": view.reltuples(&cqn), + "table_size": view.table_size(&cqn), + "dead_tuples": view.n_dead_tup_sum(&cqn), + "seq_scan": view.seq_scan_sum(&cqn), + "last_vacuum": view.last_vacuum_max(&cqn), }) }) + .collect(); + serde_json::json!({ + "strategy": pi.strategy, + "key": pi.key, + "children": children, }) - .collect(); + }); let mut json_val = match detail { "full" => { let mut v = serde_json::to_value(table).map_err(|e| { McpError::internal_error(format!("serialization error: {e}"), None) })?; - if let Some(obj) = v.as_object_mut() - && !profiles.is_empty() - { - obj.insert("column_profiles".into(), serde_json::Value::Array(profiles)); + if let Some(obj) = v.as_object_mut() { + obj.insert("stats".into(), synth_stats.clone()); + if let Some(pi) = synth_partition_info.clone() { + obj.insert("partition_info".into(), pi); + } + + // inject snapshot-derived stats + let idx_full: Vec = table + .indexes + .iter() + .map(|i| { + let idx_qn = QualifiedName::new(&table.schema, &i.name); + let sizing = view.index_sizing(&idx_qn); + serde_json::json!({ + "name": i.name, + "columns": i.columns, + "include_columns": i.include_columns, + "index_type": i.index_type, + "is_unique": i.is_unique, + "is_primary": i.is_primary, + "predicate": 
i.predicate, + "definition": i.definition, + "is_valid": i.is_valid, + "backs_constraint": i.backs_constraint, + "idx_scan": view.idx_scan_sum(&idx_qn), + "idx_scan_per_node": view.idx_scan_per_node(&idx_qn), + "size_bytes": sizing.map(|s| s.size), + "relpages": sizing.map(|s| s.relpages), + "reltuples": sizing.map(|s| s.reltuples), + }) + }) + .collect(); + obj.insert("indexes".into(), serde_json::Value::Array(idx_full)); + if !profiles.is_empty() { + obj.insert("column_profiles".into(), serde_json::Value::Array(profiles)); + } } v } @@ -344,12 +505,15 @@ impl DryRunServer { let mut result = serde_json::json!({ "schema": table.schema, "name": table.name, - "stats": table.stats, + "stats": synth_stats, }); - if let Some(obj) = result.as_object_mut() - && !profiles.is_empty() - { - obj.insert("column_profiles".into(), serde_json::Value::Array(profiles)); + if let Some(obj) = result.as_object_mut() { + if let Some(pi) = synth_partition_info.clone() { + obj.insert("partition_info".into(), pi); + } + if !profiles.is_empty() { + obj.insert("column_profiles".into(), serde_json::Value::Array(profiles)); + } } result } @@ -379,6 +543,8 @@ impl DryRunServer { .indexes .iter() .map(|i| { + let idx_qn = QualifiedName::new(&table.schema, &i.name); + let sizing = view.index_sizing(&idx_qn); serde_json::json!({ "name": i.name, "columns": i.columns, @@ -388,6 +554,10 @@ impl DryRunServer { "predicate": i.predicate, "definition": i.definition, "is_valid": i.is_valid, + "idx_scan": view.idx_scan_sum(&idx_qn), + "size_bytes": sizing.map(|s| s.size), + "relpages": sizing.map(|s| s.relpages), + "reltuples": sizing.map(|s| s.reltuples), }) }) .collect(); @@ -398,8 +568,8 @@ impl DryRunServer { "constraints": table.constraints, "indexes": compact_idxs, "comment": table.comment, - "stats": table.stats, - "partition_info": table.partition_info, + "stats": synth_stats, + "partition_info": synth_partition_info.clone(), }); if let Some(obj) = result.as_object_mut() && !profiles.is_empty() @@ 
-426,8 +596,9 @@ impl DryRunServer { let mut text = serde_json::to_string_pretty(&json_val) .map_err(|e| McpError::internal_error(format!("serialization error: {e}"), None))?; - if let Some(breakdown) = - format_node_table_breakdown(&snapshot.node_stats, schema_name, ¶ms.table) + // Per-node breakdown trailer — only meaningful when we have ≥ 2 + // nodes' worth of activity. Single-node clusters skip the section. + if let Some(breakdown) = format_node_table_breakdown(&annotated, schema_name, ¶ms.table) { text.push_str(&breakdown); } @@ -703,8 +874,9 @@ impl DryRunServer { &self, Parameters(params): Parameters, ) -> Result { - let snapshot = self.get_schema().await?; - let result = dry_run_core::query::validate_query(¶ms.sql, &snapshot) + let annotated = self.get_annotated().await?; + let view = annotated.view(); + let result = dry_run_core::query::validate_query(¶ms.sql, &view) .map_err(|e| McpError::invalid_params(format!("SQL parse error: {e}"), None))?; let hint = if result.valid && !result.warnings.is_empty() { @@ -734,14 +906,17 @@ impl DryRunServer { &self, Parameters(params): Parameters, ) -> Result { - let schema = self.get_schema().await.ok(); + // Pull annotated so plan-warning rules have planner reltuples + // available as a fallback when the plan's own row estimate is zero. 
+ let annotated = self.get_annotated().await.ok(); + let view = annotated.as_ref().map(|a| a.view()); let ctx = self.require_live_db()?; let result = dry_run_core::query::explain_query( ctx.pool(), ¶ms.sql, params.analyze.unwrap_or(false), - schema.as_ref(), + view.as_ref(), ) .await .map_err(|e| McpError::invalid_params(format!("EXPLAIN failed: {e}"), None))?; @@ -771,17 +946,26 @@ impl DryRunServer { &self, Parameters(params): Parameters, ) -> Result { - let schema = self.get_schema().await?; + // Pull the annotated bundle — advise's stats-aware refinements + // (selectivity, partial-index suggestions, per-replica seq_scan + // breakdown) all hang off planner/activity, not the raw schema. + let annotated = self.get_annotated().await?; let pg_version = - dry_run_core::PgVersion::parse_from_version_string(&schema.pg_version).ok(); + dry_run_core::PgVersion::parse_from_version_string(&annotated.schema.pg_version).ok(); let include_idx = params.include_index_suggestions.unwrap_or(true); + // Default node selector: "primary" for a single-node view — + // advise is a planner-stats-driven tool and primary is where + // those originate. Per-node breakdowns inside advise itself + // still iterate every node via `seq_scan_per_node`. 
+ let view = annotated.view(); + let explain_result = if let Some(ctx) = &self.ctx { dry_run_core::query::explain_query( ctx.pool(), ¶ms.sql, params.analyze.unwrap_or(false), - Some(&schema), + Some(&view), ) .await .ok() @@ -792,7 +976,7 @@ impl DryRunServer { let advise_result = dry_run_core::query::advise_with_index_suggestions( ¶ms.sql, explain_result.as_ref().map(|r| &r.plan), - &schema, + &view, pg_version.as_ref(), include_idx, ) @@ -841,9 +1025,9 @@ impl DryRunServer { &self, Parameters(params): Parameters, ) -> Result { - let schema = self.get_schema().await?; + let annotated = self.get_annotated().await?; let pg_version = - dry_run_core::PgVersion::parse_from_version_string(&schema.pg_version).ok(); + dry_run_core::PgVersion::parse_from_version_string(&annotated.schema.pg_version).ok(); // Parse the plan JSON — supports both wrapped [{"Plan": ...}] and bare {"Plan": ...} let plan_value = if let Some(arr) = params.plan_json.as_array() { @@ -859,12 +1043,13 @@ impl DryRunServer { let plan = dry_run_core::query::parse_plan_json(plan_value) .map_err(|e| McpError::invalid_params(format!("failed to parse plan: {e}"), None))?; - let warnings = dry_run_core::query::detect_plan_warnings(&plan, Some(&schema)); + let view = annotated.view(); + let warnings = dry_run_core::query::detect_plan_warnings(&plan, Some(&view)); let advise_result = dry_run_core::query::advise_with_index_suggestions( ¶ms.sql, Some(&plan), - &schema, + &view, pg_version.as_ref(), params.include_index_suggestions.unwrap_or(true), ) @@ -942,13 +1127,13 @@ impl DryRunServer { &self, Parameters(params): Parameters, ) -> Result { - let schema = self.get_schema().await?; + let annotated = self.get_annotated().await?; let pg_version = - dry_run_core::PgVersion::parse_from_version_string(&schema.pg_version).ok(); + dry_run_core::PgVersion::parse_from_version_string(&annotated.schema.pg_version).ok(); + let view = annotated.view(); - let checks = - dry_run_core::query::check_migration(¶ms.ddl, &schema, 
pg_version.as_ref()) - .map_err(|e| McpError::invalid_params(format!("DDL parse error: {e}"), None))?; + let checks = dry_run_core::query::check_migration(¶ms.ddl, &view, pg_version.as_ref()) + .map_err(|e| McpError::invalid_params(format!("DDL parse error: {e}"), None))?; if checks.is_empty() { return Ok(CallToolResult::success(vec![Content::text( @@ -985,24 +1170,23 @@ impl DryRunServer { &self, Parameters(params): Parameters, ) -> Result { - let snapshot = self.get_schema().await?; - - let target = { - let mut filtered = snapshot.clone(); - if let Some(schema_filter) = ¶ms.schema { - filtered.tables.retain(|t| &t.schema == schema_filter); - } - if let Some(table_filter) = ¶ms.table { - filtered.tables.retain(|t| &t.name == table_filter); - } - filtered - }; + // Pull the full annotated bundle — we need it for the audit pass, + // which contains stats-aware rules. Lint itself is DDL-only and + // just borrows `target.schema` below. + let mut target = self.get_annotated().await?; + if let Some(schema_filter) = ¶ms.schema { + target.schema.tables.retain(|t| &t.schema == schema_filter); + } + if let Some(table_filter) = ¶ms.table { + target.schema.tables.retain(|t| &t.name == table_filter); + } let scope = params.scope.as_deref().unwrap_or("all"); let mut result = serde_json::Map::new(); if scope == "all" || scope == "conventions" { - let report = dry_run_core::lint::lint_schema(&target, &self.lint_config); + // Conventions/lint reads no stats — DDL only. + let report = dry_run_core::lint::lint_schema(&target.schema, &self.lint_config); let compact = dry_run_core::lint::compact_report(&report, 5); result.insert( "conventions".into(), @@ -1011,7 +1195,9 @@ impl DryRunServer { } let has_ddl_fixes = if scope == "all" || scope == "audit" { - let report = dry_run_core::audit::run_audit(&target, &self.audit_config); + // Audit needs planner sizing for the bloat / vacuum-defaults rules + // — pass the annotated view so those have a chance to fire. 
+ let report = dry_run_core::audit::run_audit(&target.view(), &self.audit_config); let has_fixes = report.findings.iter().any(|f| f.ddl_fix.is_some()); result.insert( "audit".into(), @@ -1046,17 +1232,17 @@ impl DryRunServer { &self, Parameters(params): Parameters, ) -> Result { - let snapshot = { - let mut filtered = self.get_schema().await?.clone(); - if let Some(schema_filter) = ¶ms.schema { - filtered.tables.retain(|t| &t.schema == schema_filter); - } - if let Some(table_filter) = ¶ms.table { - filtered.tables.retain(|t| &t.name == table_filter); - } - filtered - }; - let results = dry_run_core::schema::vacuum::analyze_vacuum_health(&snapshot); + let mut annotated = self.get_annotated().await?; + if let Some(schema_filter) = ¶ms.schema { + annotated + .schema + .tables + .retain(|t| &t.schema == schema_filter); + } + if let Some(table_filter) = ¶ms.table { + annotated.schema.tables.retain(|t| &t.name == table_filter); + } + let results = dry_run_core::schema::vacuum::analyze_vacuum_health(&annotated.view()); if results.is_empty() { let text = self.wrap_text("No tables with significant row counts found.", None); @@ -1078,24 +1264,26 @@ impl DryRunServer { &self, Parameters(params): Parameters, ) -> Result { - let snapshot = { - let mut filtered = self.get_schema().await?.clone(); - if let Some(schema_filter) = ¶ms.schema { - filtered.tables.retain(|t| &t.schema == schema_filter); - filtered.node_stats.iter_mut().for_each(|ns| { - ns.table_stats.retain(|ts| &ts.schema == schema_filter); - ns.index_stats.retain(|is| &is.schema == schema_filter); - }); - } - if let Some(table_filter) = ¶ms.table { - filtered.tables.retain(|t| &t.name == table_filter); - filtered.node_stats.iter_mut().for_each(|ns| { - ns.table_stats.retain(|ts| &ts.table == table_filter); - ns.index_stats.retain(|is| &is.table == table_filter); - }); - } - filtered - }; + // Pull the cached annotated bundle and clone it — we filter + // tables in-place to honor the schema/table query params, and we 
+ // don't want those mutations to leak back into the shared cache. + // + // Activity rows reference qualified-name keys, not table OIDs, so + // they're naturally narrowed by the lookups in + // `AnnotatedSnapshot::unused_indexes` / `seq_scan_imbalance` once + // we've thinned out `schema.tables`. No need to scrub the + // activity_by_node map by hand. + let mut annotated = self.get_annotated().await?; + if let Some(schema_filter) = ¶ms.schema { + annotated + .schema + .tables + .retain(|t| &t.schema == schema_filter); + } + if let Some(table_filter) = ¶ms.table { + annotated.schema.tables.retain(|t| &t.name == table_filter); + } + let kind = params.kind.as_deref().unwrap_or("all"); let mut result = serde_json::Map::new(); @@ -1109,7 +1297,10 @@ impl DryRunServer { let mut found_unused = false; if run_stale { - let stale = detect_stale_stats(&snapshot.node_stats, 7); + // 7-day staleness threshold — matches the legacy default. + // `stale_stats` walks every node in the selector and emits one + // entry per (node, table) that's stale or never analyzed. + let stale = annotated.stale_stats(&NodeSelector::All, 7); found_stale = !stale.is_empty(); result.insert( "stale_stats".into(), @@ -1118,7 +1309,10 @@ impl DryRunServer { } if run_unused { - let unused = detect_unused_indexes(&snapshot.node_stats, &snapshot.tables); + // Cluster-wide question — sum scans across all known nodes. + // An index that's unused on the primary may still be hot on + // a read replica, so we deliberately don't restrict to one node. 
+ let unused = annotated.unused_indexes(&NodeSelector::All); found_unused = !unused.is_empty(); result.insert( "unused_indexes".into(), @@ -1128,13 +1322,11 @@ impl DryRunServer { if run_anomalies { let mut anomalies = Vec::new(); - for table in &snapshot.tables { - let schema_name = &table.schema; - if let Some(imb) = - detect_seq_scan_imbalance(&snapshot.node_stats, schema_name, &table.name) - { + for table in &annotated.schema.tables { + let qn = dry_run_core::schema::QualifiedName::new(&table.schema, &table.name); + if let Some(imb) = annotated.seq_scan_imbalance(&qn) { anomalies.push(serde_json::json!({ - "table": format!("{}.{}", schema_name, table.name), + "table": format!("{}.{}", table.schema, table.name), "type": "seq_scan_imbalance", "hot_node": imb.hot_node, "multiplier": format!("{}x", imb.multiplier), @@ -1146,7 +1338,10 @@ impl DryRunServer { if run_bloated { let threshold = params.threshold.unwrap_or(1.5); - let bloated = dry_run_core::schema::detect_bloated_indexes(&snapshot.tables, threshold); + // Bloat needs IndexSizing from the planner snapshot — pass the + // annotated view so the rule can pull it via `index_sizing()`. + let bloated = + dry_run_core::schema::detect_bloated_indexes(&annotated.view(), threshold); result.insert( "bloated_indexes".into(), serde_json::to_value(&bloated).unwrap_or(serde_json::Value::Null), @@ -1181,14 +1376,17 @@ impl DryRunServer { &self, Parameters(params): Parameters, ) -> Result { - let snapshot = self.get_schema().await?; + let annotated = self.get_annotated().await?; let schema_name = params.schema.as_deref().unwrap_or("public"); let qualified = format!("{schema_name}.{}", params.table); + let qn = QualifiedName::new(schema_name, ¶ms.table); - if snapshot.node_stats.is_empty() { + if annotated.activity_by_node.is_empty() { + // No per-node activity captured — can't compare. Tell the user + // exactly which command will populate it. 
return Ok(CallToolResult::success(vec![Content::text( - "No per-node stats available. Import stats with:\n \ - dryrun import schema.json --stats r1.json r2.json" + "No per-node activity stats available. Capture from each replica with:\n \ + dryrun snapshot activity --from --label " .to_string(), )])); } @@ -1196,19 +1394,18 @@ impl DryRunServer { let mut lines: Vec = Vec::new(); lines.push(format!( "Stats for {qualified} across {} node(s):", - snapshot.node_stats.len() + annotated.activity_by_node.len() )); - if let Some(breakdown) = - format_node_table_breakdown(&snapshot.node_stats, schema_name, ¶ms.table) + if let Some(breakdown) = format_node_table_breakdown(&annotated, schema_name, ¶ms.table) { lines.push(breakdown); } - // anomaly detection: seq_scan imbalance - if let Some(imb) = - detect_seq_scan_imbalance(&snapshot.node_stats, schema_name, ¶ms.table) - { + // Anomaly detection — flag if one node is doing 5x+ the seq_scans + // of the quietest non-zero node. Often points at a routing + // misconfiguration or an unindexed query slipping past primary. + if let Some(imb) = annotated.seq_scan_imbalance(&qn) { lines.push(String::new()); lines.push(format!( "⚠ {} has {}x more seq_scans than the lowest node — \ @@ -1217,16 +1414,22 @@ impl DryRunServer { )); } - // per-index breakdown + // Per-index breakdown — pull each index belonging to this table + // out of the schema, then ask each node's activity what its + // idx_scan counter is for that index. 
let mut index_data: std::collections::BTreeMap> = std::collections::BTreeMap::new(); - for ns in &snapshot.node_stats { - for is in &ns.index_stats { - if is.table == params.table && is.schema == schema_name { - index_data - .entry(is.index_name.clone()) - .or_default() - .push((ns.source.clone(), is.stats.idx_scan)); + if let Some(table) = annotated + .schema + .tables + .iter() + .find(|t| t.name == params.table && t.schema == schema_name) + { + for idx in &table.indexes { + let idx_qn = QualifiedName::new(schema_name, &idx.name); + let per_node = annotated.view().idx_scan_per_node(&idx_qn); + if !per_node.is_empty() { + index_data.insert(idx.name.clone(), per_node); } } } @@ -1243,8 +1446,9 @@ impl DryRunServer { } } - // flag unused indexes for this table - let unused = detect_unused_indexes(&snapshot.node_stats, &snapshot.tables); + // Flag unused indexes for this table — `unused_indexes` already + // skips primary keys and aggregates across selected nodes. + let unused = annotated.unused_indexes(&NodeSelector::All); for entry in &unused { if entry.schema == schema_name && entry.table == params.table { let size_mb = entry.total_size_bytes / (1024 * 1024); @@ -1286,21 +1490,52 @@ impl DryRunServer { #[tool(description = "Force re-introspection of the database schema (requires live DB)")] async fn refresh_schema(&self) -> Result { let ctx = self.require_live_db()?; - let snapshot = ctx + let schema = ctx .introspect_schema() .await .map_err(|e| McpError::internal_error(format!("introspection failed: {e}"), None))?; + let hash = schema.content_hash.clone(); + let planner = ctx + .introspect_planner_stats(&hash) + .await + .inspect_err(|e| tracing::warn!(error = %e, "planner stats unavailable")) + .ok(); + let primary = ctx + .introspect_activity_stats(&hash, "primary") + .await + .inspect_err(|e| tracing::warn!(error = %e, "primary activity unavailable")) + .ok(); + + let history = self + .history + .as_ref() + .zip(self.snapshot_key.as_ref()) + .map(|(s, k)| 
(s.as_ref(), k)); + let annotated = rebuild_after_refresh(schema, planner, primary, history).await; let body = format!( - "Schema refreshed: {} tables, {} views, {} functions (hash: {})", - snapshot.tables.len(), - snapshot.views.len(), - snapshot.functions.len(), - &snapshot.content_hash[..16], + "Schema refreshed: {} tables, {} views, {} functions (hash: {})\n\ + Planner stats: {}\n\ + Activity stats: {} node(s) [{}]", + annotated.schema.tables.len(), + annotated.schema.views.len(), + annotated.schema.functions.len(), + &annotated.schema.content_hash[..16], + if annotated.planner.is_some() { + "captured" + } else { + "unavailable" + }, + annotated.activity_by_node.len(), + annotated + .activity_by_node + .keys() + .cloned() + .collect::>() + .join(", "), ); - *self.schema.write().await = Some(snapshot); - + *self.schema.write().await = Some(annotated); let text = self.wrap_text(&body, None); Ok(CallToolResult::success(vec![Content::text(text)])) } @@ -1334,7 +1569,7 @@ impl DryRunServer { snapshot.functions.len(), ); - *self.schema.write().await = Some(snapshot); + *self.schema.write().await = Some(wrap_schema_only(snapshot)); let text = self.wrap_text(&body, None); return Ok(CallToolResult::success(vec![Content::text(text)])); @@ -1357,249 +1592,8 @@ impl DryRunServer { } #[cfg(test)] -mod tests { - use super::*; - - #[test] - fn deserialize_analyze_plan_params() { - let json = serde_json::json!({ - "sql": "SELECT * FROM orders WHERE customer_id = 42", - "plan_json": [{"Plan": { - "Node Type": "Seq Scan", - "Relation Name": "orders", - "Schema": "public", - "Startup Cost": 0.0, - "Total Cost": 450.0, - "Plan Rows": 10000, - "Plan Width": 48 - }}] - }); - let params: AnalyzePlanParams = serde_json::from_value(json).unwrap(); - assert_eq!(params.sql, "SELECT * FROM orders WHERE customer_id = 42"); - assert!(params.plan_json.is_array()); - // default value - assert_eq!(params.include_index_suggestions, Some(true)); - } - - #[test] - fn 
deserialize_analyze_plan_params_with_explicit_false() { - let json = serde_json::json!({ - "sql": "SELECT 1", - "plan_json": {"Plan": {"Node Type": "Result", "Startup Cost": 0.0, "Total Cost": 0.01, "Plan Rows": 1, "Plan Width": 4}}, - "include_index_suggestions": false - }); - let params: AnalyzePlanParams = serde_json::from_value(json).unwrap(); - assert_eq!(params.include_index_suggestions, Some(false)); - assert!(params.plan_json.is_object()); - } - - #[test] - fn plan_json_extraction_wrapped_array() { - let plan_json = serde_json::json!([{ - "Plan": { - "Node Type": "Seq Scan", - "Relation Name": "users", - "Schema": "public", - "Startup Cost": 0.0, - "Total Cost": 35.5, - "Plan Rows": 2550, - "Plan Width": 64 - } - }]); - let plan_value = plan_json - .as_array() - .and_then(|arr| arr.first()) - .and_then(|obj| obj.get("Plan")) - .unwrap(); - let plan = dry_run_core::query::parse_plan_json(plan_value).unwrap(); - assert_eq!(plan.node_type, "Seq Scan"); - assert_eq!(plan.relation_name.as_deref(), Some("users")); - } - - #[test] - fn plan_json_extraction_bare_object() { - let plan_json = serde_json::json!({ - "Plan": { - "Node Type": "Index Scan", - "Relation Name": "orders", - "Schema": "public", - "Index Name": "orders_pkey", - "Startup Cost": 0.0, - "Total Cost": 8.27, - "Plan Rows": 1, - "Plan Width": 64 - } - }); - let plan_value = plan_json.get("Plan").unwrap(); - let plan = dry_run_core::query::parse_plan_json(plan_value).unwrap(); - assert_eq!(plan.node_type, "Index Scan"); - } - - #[test] - fn plan_json_missing_plan_key_array() { - let plan_json = serde_json::json!([{"Something": "else"}]); - let result = plan_json - .as_array() - .and_then(|arr| arr.first()) - .and_then(|obj| obj.get("Plan")); - assert!(result.is_none()); - } - - #[test] - fn plan_json_missing_plan_key_object() { - let plan_json = serde_json::json!({"NotPlan": {}}); - assert!(plan_json.get("Plan").is_none()); - } - - #[tokio::test] - async fn list_tables_includes_pg_version() { - let 
snapshot = test_snapshot(); - let server = DryRunServer::from_snapshot_with_db( - snapshot, - None, - LintConfig::default(), - None, - "test", - vec![], - ); - let result = server - .list_tables(Parameters(ListTablesParams { - schema: None, - sort: None, - limit: None, - offset: None, - })) - .await - .unwrap(); - let text = result.content.first().unwrap(); - let text_str = format!("{text:?}"); - assert!( - text_str.contains("PostgreSQL 18.3.0"), - "list_tables output should contain PG version" - ); - } - - #[tokio::test] - async fn describe_table_includes_pg_version() { - let snapshot = test_snapshot(); - let server = DryRunServer::from_snapshot_with_db( - snapshot, - None, - LintConfig::default(), - None, - "test", - vec![], - ); - let result = server - .describe_table(Parameters(DescribeTableParams { - table: "orders".into(), - schema: None, - detail: None, - })) - .await - .unwrap(); - let text = result.content.first().unwrap(); - let text_str = format!("{text:?}"); - assert!( - text_str.contains("pg_version"), - "describe_table output should contain pg_version field" - ); - } - - fn test_snapshot() -> dry_run_core::SchemaSnapshot { - use dry_run_core::schema::*; - SchemaSnapshot { - pg_version: "PostgreSQL 18.3.0 on x86_64-pc-linux-gnu".into(), - database: "testdb".into(), - timestamp: chrono::Utc::now(), - content_hash: "abc123".into(), - source: None, - tables: vec![Table { - oid: 1, - schema: "public".into(), - name: "orders".into(), - columns: vec![Column { - name: "id".into(), - ordinal: 1, - type_name: "bigint".into(), - nullable: false, - default: None, - identity: None, - generated: None, - comment: None, - statistics_target: None, - stats: None, - }], - constraints: vec![], - indexes: vec![], - comment: None, - stats: Some(TableStats { - reltuples: 50000.0, - relpages: 625, - dead_tuples: 0, - last_vacuum: None, - last_autovacuum: None, - last_analyze: None, - last_autoanalyze: None, - seq_scan: 0, - idx_scan: 0, - table_size: 5000000, - }), - 
partition_info: None, - policies: vec![], - triggers: vec![], - reloptions: vec![], - rls_enabled: false, - }], - enums: vec![], - domains: vec![], - composites: vec![], - views: vec![], - functions: vec![], - extensions: vec![], - gucs: vec![], - node_stats: vec![], - } - } - - #[test] - fn analyze_plan_with_analyze_buffers_data() { - // realistic EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) output - let plan_json = serde_json::json!([{ - "Plan": { - "Node Type": "Seq Scan", - "Relation Name": "orders", - "Schema": "public", - "Startup Cost": 0.0, - "Total Cost": 15234.5, - "Plan Rows": 500000, - "Plan Width": 120, - "Actual Rows": 487320, - "Actual Loops": 1, - "Actual Startup Time": 0.02, - "Actual Total Time": 320.5, - "Shared Hit Blocks": 8000, - "Shared Read Blocks": 2000, - "Filter": "(customer_id = 42)", - "Rows Removed by Filter": 487278 - }, - "Planning Time": 0.1, - "Execution Time": 320.6 - }]); - let plan_value = plan_json - .as_array() - .unwrap() - .first() - .unwrap() - .get("Plan") - .unwrap(); - let plan = dry_run_core::query::parse_plan_json(plan_value).unwrap(); - assert_eq!(plan.total_cost, 15234.5); - assert_eq!(plan.actual_rows, Some(487320.0)); - assert_eq!(plan.shared_hit_blocks, Some(8000)); - assert_eq!(plan.rows_removed_by_filter, Some(487278.0)); - } -} +#[path = "server_tests.rs"] +mod tests; #[tool_handler] impl ServerHandler for DryRunServer { diff --git a/crates/dry_run_cli/src/mcp/server_tests.rs b/crates/dry_run_cli/src/mcp/server_tests.rs new file mode 100644 index 0000000..cc2b859 --- /dev/null +++ b/crates/dry_run_cli/src/mcp/server_tests.rs @@ -0,0 +1,422 @@ +use super::*; + +#[test] +fn deserialize_analyze_plan_params() { + let json = serde_json::json!({ + "sql": "SELECT * FROM orders WHERE customer_id = 42", + "plan_json": [{"Plan": { + "Node Type": "Seq Scan", + "Relation Name": "orders", + "Schema": "public", + "Startup Cost": 0.0, + "Total Cost": 450.0, + "Plan Rows": 10000, + "Plan Width": 48 + }}] + }); + let params: 
AnalyzePlanParams = serde_json::from_value(json).unwrap(); + assert_eq!(params.sql, "SELECT * FROM orders WHERE customer_id = 42"); + assert!(params.plan_json.is_array()); + // default value + assert_eq!(params.include_index_suggestions, Some(true)); +} + +#[test] +fn deserialize_analyze_plan_params_with_explicit_false() { + let json = serde_json::json!({ + "sql": "SELECT 1", + "plan_json": {"Plan": {"Node Type": "Result", "Startup Cost": 0.0, "Total Cost": 0.01, "Plan Rows": 1, "Plan Width": 4}}, + "include_index_suggestions": false + }); + let params: AnalyzePlanParams = serde_json::from_value(json).unwrap(); + assert_eq!(params.include_index_suggestions, Some(false)); + assert!(params.plan_json.is_object()); +} + +#[test] +fn plan_json_extraction_wrapped_array() { + let plan_json = serde_json::json!([{ + "Plan": { + "Node Type": "Seq Scan", + "Relation Name": "users", + "Schema": "public", + "Startup Cost": 0.0, + "Total Cost": 35.5, + "Plan Rows": 2550, + "Plan Width": 64 + } + }]); + let plan_value = plan_json + .as_array() + .and_then(|arr| arr.first()) + .and_then(|obj| obj.get("Plan")) + .unwrap(); + let plan = dry_run_core::query::parse_plan_json(plan_value).unwrap(); + assert_eq!(plan.node_type, "Seq Scan"); + assert_eq!(plan.relation_name.as_deref(), Some("users")); +} + +#[test] +fn plan_json_extraction_bare_object() { + let plan_json = serde_json::json!({ + "Plan": { + "Node Type": "Index Scan", + "Relation Name": "orders", + "Schema": "public", + "Index Name": "orders_pkey", + "Startup Cost": 0.0, + "Total Cost": 8.27, + "Plan Rows": 1, + "Plan Width": 64 + } + }); + let plan_value = plan_json.get("Plan").unwrap(); + let plan = dry_run_core::query::parse_plan_json(plan_value).unwrap(); + assert_eq!(plan.node_type, "Index Scan"); +} + +#[test] +fn plan_json_missing_plan_key_array() { + let plan_json = serde_json::json!([{"Something": "else"}]); + let result = plan_json + .as_array() + .and_then(|arr| arr.first()) + .and_then(|obj| obj.get("Plan")); + 
assert!(result.is_none()); +} + +#[test] +fn plan_json_missing_plan_key_object() { + let plan_json = serde_json::json!({"NotPlan": {}}); + assert!(plan_json.get("Plan").is_none()); +} + +#[tokio::test] +async fn list_tables_includes_pg_version() { + let snapshot = test_snapshot(); + let server = DryRunServer::from_annotated_with_db( + crate::mcp::wrap_schema_only(snapshot), + None, + LintConfig::default(), + None, + "test", + vec![], + ); + let result = server + .list_tables(Parameters(ListTablesParams { + schema: None, + sort: None, + limit: None, + offset: None, + })) + .await + .unwrap(); + let text = result.content.first().unwrap(); + let text_str = format!("{text:?}"); + assert!( + text_str.contains("PostgreSQL 18.3.0"), + "list_tables output should contain PG version" + ); +} + +#[tokio::test] +async fn describe_table_includes_pg_version() { + let snapshot = test_snapshot(); + let server = DryRunServer::from_annotated_with_db( + crate::mcp::wrap_schema_only(snapshot), + None, + LintConfig::default(), + None, + "test", + vec![], + ); + let result = server + .describe_table(Parameters(DescribeTableParams { + table: "orders".into(), + schema: None, + detail: None, + })) + .await + .unwrap(); + let text = result.content.first().unwrap(); + let text_str = format!("{text:?}"); + assert!( + text_str.contains("pg_version"), + "describe_table output should contain pg_version field" + ); +} + +fn test_snapshot() -> dry_run_core::SchemaSnapshot { + use dry_run_core::schema::*; + SchemaSnapshot { + pg_version: "PostgreSQL 18.3.0 on x86_64-pc-linux-gnu".into(), + database: "testdb".into(), + timestamp: chrono::Utc::now(), + content_hash: "abc123".into(), + source: None, + tables: vec![Table { + oid: 1, + schema: "public".into(), + name: "orders".into(), + columns: vec![Column { + name: "id".into(), + ordinal: 1, + type_name: "bigint".into(), + nullable: false, + default: None, + identity: None, + generated: None, + comment: None, + statistics_target: None, + }], + 
constraints: vec![], + indexes: vec![], + comment: None, + partition_info: None, + policies: vec![], + triggers: vec![], + reloptions: vec![], + rls_enabled: false, + }], + enums: vec![], + domains: vec![], + composites: vec![], + views: vec![], + functions: vec![], + extensions: vec![], + gucs: vec![], + } +} + +#[test] +fn analyze_plan_with_analyze_buffers_data() { + // realistic EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) output + let plan_json = serde_json::json!([{ + "Plan": { + "Node Type": "Seq Scan", + "Relation Name": "orders", + "Schema": "public", + "Startup Cost": 0.0, + "Total Cost": 15234.5, + "Plan Rows": 500000, + "Plan Width": 120, + "Actual Rows": 487320, + "Actual Loops": 1, + "Actual Startup Time": 0.02, + "Actual Total Time": 320.5, + "Shared Hit Blocks": 8000, + "Shared Read Blocks": 2000, + "Filter": "(customer_id = 42)", + "Rows Removed by Filter": 487278 + }, + "Planning Time": 0.1, + "Execution Time": 320.6 + }]); + let plan_value = plan_json + .as_array() + .unwrap() + .first() + .unwrap() + .get("Plan") + .unwrap(); + let plan = dry_run_core::query::parse_plan_json(plan_value).unwrap(); + assert_eq!(plan.total_cost, 15234.5); + assert_eq!(plan.actual_rows, Some(487320.0)); + assert_eq!(plan.shared_hit_blocks, Some(8000)); + assert_eq!(plan.rows_removed_by_filter, Some(487278.0)); +} + +#[tokio::test] +async fn persist_refresh_writes_activity_for_primary() { + use dry_run_core::history::{DatabaseId, ProjectId}; + use dry_run_core::schema::{ + ActivityStatsSnapshot, IndexActivity, IndexActivityEntry, NodeIdentity, QualifiedName, + TableActivity, TableActivityEntry, + }; + + let dir = tempfile::TempDir::new().unwrap(); + let store = HistoryStore::open(&dir.path().join("history.db")).unwrap(); + let key = SnapshotKey { + project_id: ProjectId("test".into()), + database_id: DatabaseId("test-db".into()), + }; + + let schema = test_snapshot(); + let schema_hash = schema.content_hash.clone(); + + let activity = ActivityStatsSnapshot { + pg_version: 
schema.pg_version.clone(), + database: schema.database.clone(), + timestamp: chrono::Utc::now(), + content_hash: "act-h1".into(), + schema_ref_hash: schema_hash.clone(), + node: NodeIdentity { + label: "primary".into(), + host: "localhost".into(), + is_standby: false, + replication_lag_bytes: None, + stats_reset: None, + }, + tables: vec![TableActivityEntry { + table: QualifiedName::new("public", "orders"), + activity: TableActivity { + seq_scan: 1, + idx_scan: 0, + n_live_tup: 0, + n_dead_tup: 0, + last_vacuum: None, + last_autovacuum: None, + last_analyze: None, + last_autoanalyze: None, + vacuum_count: 0, + autovacuum_count: 0, + analyze_count: 0, + autoanalyze_count: 0, + }, + }], + indexes: vec![IndexActivityEntry { + index: QualifiedName::new("public", "orders_pkey"), + activity: IndexActivity { + idx_scan: 0, + idx_tup_read: 0, + idx_tup_fetch: 0, + }, + }], + }; + + let mut activity_by_node = std::collections::BTreeMap::new(); + activity_by_node.insert("primary".to_string(), activity); + + super::persist_refresh(&store, &key, &schema, None, &activity_by_node).await; + + let bundle = store + .get_annotated(&key, SnapshotRef::Latest) + .await + .unwrap(); + assert_eq!(bundle.schema.content_hash, schema_hash); + assert!( + bundle.activity_by_node.contains_key("primary"), + "persist_refresh should have written activity_stats for 'primary'" + ); +} + +fn make_activity_row( + schema_ref: &str, + label: &str, + hash: &str, +) -> dry_run_core::ActivityStatsSnapshot { + use dry_run_core::schema::{ + ActivityStatsSnapshot, IndexActivity, IndexActivityEntry, NodeIdentity, QualifiedName, + TableActivity, TableActivityEntry, + }; + ActivityStatsSnapshot { + pg_version: "PostgreSQL 18.3.0".into(), + database: "testdb".into(), + timestamp: chrono::Utc::now(), + content_hash: hash.into(), + schema_ref_hash: schema_ref.into(), + node: NodeIdentity { + label: label.into(), + host: format!("host-{label}"), + is_standby: label != "primary", + replication_lag_bytes: None, + 
stats_reset: None, + }, + tables: vec![TableActivityEntry { + table: QualifiedName::new("public", "orders"), + activity: TableActivity { + seq_scan: 1, + idx_scan: 0, + n_live_tup: 0, + n_dead_tup: 0, + last_vacuum: None, + last_autovacuum: None, + last_analyze: None, + last_autoanalyze: None, + vacuum_count: 0, + autovacuum_count: 0, + analyze_count: 0, + autoanalyze_count: 0, + }, + }], + indexes: vec![IndexActivityEntry { + index: QualifiedName::new("public", "orders_pkey"), + activity: IndexActivity { + idx_scan: 0, + idx_tup_read: 0, + idx_tup_fetch: 0, + }, + }], + } +} + +#[test] +fn build_inline_inserts_primary_when_present() { + let bundle = super::build_inline( + test_snapshot(), + None, + Some(make_activity_row("abc123", "primary", "act-1")), + ); + assert_eq!(bundle.activity_by_node.len(), 1); + assert!(bundle.activity_by_node.contains_key("primary")); +} + +#[test] +fn build_inline_yields_empty_map_without_activity() { + let bundle = super::build_inline(test_snapshot(), None, None); + assert!(bundle.activity_by_node.is_empty()); + assert!(bundle.planner.is_none()); +} + +// Regression for 2f85792: refresh must not drop replica activity rows +// already in history.db. Before the fix, the cache was rebuilt with +// primary-only. This exercises the cache-rebuild logic directly via +// `rebuild_after_refresh`, no live DB needed. 
+#[tokio::test] +async fn rebuild_after_refresh_preserves_replica_activity() { + let dir = tempfile::TempDir::new().unwrap(); + let store = HistoryStore::open(&dir.path().join("history.db")).unwrap(); + let key = SnapshotKey { + project_id: dry_run_core::history::ProjectId("test".into()), + database_id: dry_run_core::history::DatabaseId("test-db".into()), + }; + + let schema = test_snapshot(); + let schema_hash = schema.content_hash.clone(); + + SnapshotStore::put(&store, &key, &schema) + .await + .expect("seed schema"); + let replica = make_activity_row(&schema_hash, "replica1", "replica-h1"); + store + .put_activity_stats(&key, &replica) + .await + .expect("seed replica activity"); + + let live_primary = make_activity_row(&schema_hash, "primary", "primary-h1"); + let bundle = + super::rebuild_after_refresh(schema, None, Some(live_primary), Some((&store, &key))).await; + + assert!( + bundle.activity_by_node.contains_key("primary"), + "freshly-introspected primary activity must end up in the cache" + ); + assert!( + bundle.activity_by_node.contains_key("replica1"), + "pre-seeded replica1 activity must survive rebuild \ + (regression: rebuild used to drop everything except primary)" + ); +} + +#[tokio::test] +async fn rebuild_after_refresh_without_history_uses_inline_only() { + let bundle = super::rebuild_after_refresh( + test_snapshot(), + None, + Some(make_activity_row("abc123", "primary", "primary-h1")), + None, + ) + .await; + assert_eq!(bundle.activity_by_node.len(), 1); + assert!(bundle.activity_by_node.contains_key("primary")); +} diff --git a/crates/dry_run_core/src/audit/mod.rs b/crates/dry_run_core/src/audit/mod.rs index dadb4e5..b92d7da 100644 --- a/crates/dry_run_core/src/audit/mod.rs +++ b/crates/dry_run_core/src/audit/mod.rs @@ -3,11 +3,18 @@ pub mod types; pub use types::{AuditConfig, AuditFinding, AuditReport, AuditSummary}; -use crate::schema::SchemaSnapshot; +use crate::schema::AnnotatedSchema; +// Public audit entry point — takes the annotated 
view because two of the +// rules under the hood (`indexes/bloated`, `vacuum/large_table_defaults`) +// need planner sizing / row counts. DDL-only rules just hop through to +// `annotated.schema` internally; callers who only have a bare +// `SchemaSnapshot` can wrap it in a stats-less `AnnotatedSnapshot` to +// adapt — those rules will simply produce no findings, matching the +// pre-split behavior. #[must_use] -pub fn run_audit(schema: &SchemaSnapshot, config: &AuditConfig) -> AuditReport { - let tables_analyzed = schema.tables.len(); - let findings = rules::run_all_audit_rules(schema, config); +pub fn run_audit(annotated: &AnnotatedSchema<'_>, config: &AuditConfig) -> AuditReport { + let tables_analyzed = annotated.schema.tables.len(); + let findings = rules::run_all_audit_rules(annotated, config); AuditReport::new(findings, tables_analyzed) } diff --git a/crates/dry_run_core/src/audit/rules/fk_graph.rs b/crates/dry_run_core/src/audit/rules/fk_graph.rs index 78eb056..7e18a34 100644 --- a/crates/dry_run_core/src/audit/rules/fk_graph.rs +++ b/crates/dry_run_core/src/audit/rules/fk_graph.rs @@ -254,7 +254,6 @@ mod tests { generated: None, comment: None, statistics_target: None, - stats: None, } } @@ -293,7 +292,6 @@ mod tests { constraints, indexes: vec![], comment: None, - stats: None, partition_info: None, policies: vec![], triggers: vec![], @@ -317,7 +315,6 @@ mod tests { functions: vec![], extensions: vec![], gucs: vec![], - node_stats: vec![], } } diff --git a/crates/dry_run_core/src/audit/rules/indexes.rs b/crates/dry_run_core/src/audit/rules/indexes.rs index f111571..8bcb899 100644 --- a/crates/dry_run_core/src/audit/rules/indexes.rs +++ b/crates/dry_run_core/src/audit/rules/indexes.rs @@ -231,13 +231,15 @@ pub fn check_wide_column_indexes(schema: &SchemaSnapshot) -> Vec { const DEFAULT_BLOAT_THRESHOLD: f64 = 1.5; #[must_use] -pub fn check_bloated_indexes(schema: &SchemaSnapshot) -> Vec { +pub fn check_bloated_indexes(annotated: 
&crate::schema::AnnotatedSchema<'_>) -> Vec { let mut findings = Vec::new(); - for table in &schema.tables { + for table in &annotated.schema.tables { let qualified = format!("{}.{}", table.schema, table.name); for idx in &table.indexes { - if let Some(est) = crate::schema::bloat::estimate_index_bloat(idx, table) + let qn = crate::schema::QualifiedName::new(&table.schema, &idx.name); + let sizing = annotated.index_sizing(&qn); + if let Some(est) = crate::schema::bloat::estimate_index_bloat(idx, sizing, table) && est.bloat_ratio > DEFAULT_BLOAT_THRESHOLD { findings.push(AuditFinding { @@ -277,7 +279,6 @@ mod tests { generated: None, comment: None, statistics_target: None, - stats: None, } } @@ -293,7 +294,6 @@ mod tests { definition: format!("CREATE INDEX {name} ON ..."), is_valid: true, backs_constraint: false, - stats: None, } } @@ -306,7 +306,6 @@ mod tests { constraints: vec![], indexes, comment: None, - stats: None, partition_info: None, policies: vec![], triggers: vec![], @@ -330,7 +329,6 @@ mod tests { functions: vec![], extensions: vec![], gucs: vec![], - node_stats: vec![], } } diff --git a/crates/dry_run_core/src/audit/rules/mod.rs b/crates/dry_run_core/src/audit/rules/mod.rs index 72d6afa..913f0f3 100644 --- a/crates/dry_run_core/src/audit/rules/mod.rs +++ b/crates/dry_run_core/src/audit/rules/mod.rs @@ -3,13 +3,31 @@ mod indexes; mod schema; use super::types::{AuditConfig, AuditFinding}; -use crate::schema::SchemaSnapshot; - -// Runs all audit rules and returns findings, skipping disabled ones +use crate::schema::AnnotatedSchema; + +// Top-level audit entry point — runs every rule against the annotated +// snapshot, skipping anything the caller disabled via `config.disabled_rules`. +// +// Rules split into two groups based on what they need: +// - DDL-only rules (naming, FK shape, duplicate indexes, …) read just +// `annotated.schema`. They worked fine before the snapshot split and +// they keep working — we hand them the schema reference directly. 
+// - Stats-aware rules (`indexes/bloated`, `vacuum/large_table_defaults`) +// need planner sizing or activity counters. They take the full +// `&AnnotatedSchema` and use accessors like `index_sizing()` / +// `reltuples()` so they're robust to "no stats captured yet" — they +// simply produce no findings in that degenerate case rather than +// panicking or lying. #[must_use] -pub fn run_all_audit_rules(snapshot: &SchemaSnapshot, config: &AuditConfig) -> Vec { +pub fn run_all_audit_rules( + annotated: &AnnotatedSchema<'_>, + config: &AuditConfig, +) -> Vec { let mut findings = Vec::new(); let disabled = &config.disabled_rules; + // Most rules just want DDL — pull the schema reference out once so + // the per-rule sites stay readable. + let snapshot = annotated.schema; macro_rules! run_rule { ($id:expr, $check:expr) => { @@ -19,7 +37,7 @@ pub fn run_all_audit_rules(snapshot: &SchemaSnapshot, config: &AuditConfig) -> V }; } - // index rules + // ---- index rules ---- run_rule!( "indexes/duplicate", indexes::check_duplicate_indexes(snapshot) @@ -36,9 +54,11 @@ pub fn run_all_audit_rules(snapshot: &SchemaSnapshot, config: &AuditConfig) -> V "indexes/wide_columns", indexes::check_wide_column_indexes(snapshot) ); - run_rule!("indexes/bloated", indexes::check_bloated_indexes(snapshot)); + // bloated indexes need IndexSizing from the planner snapshot — gets + // the annotated view, not the raw schema. 
+ run_rule!("indexes/bloated", indexes::check_bloated_indexes(annotated)); - // FK rules + // ---- FK rules ---- run_rule!( "fk/type_mismatch", fk_graph::check_fk_type_mismatch(snapshot) @@ -46,27 +66,28 @@ pub fn run_all_audit_rules(snapshot: &SchemaSnapshot, config: &AuditConfig) -> V run_rule!("fk/circular", fk_graph::check_circular_fks(snapshot)); run_rule!("fk/orphan", fk_graph::check_orphan_tables(snapshot)); - // PK rules + // ---- PK rules ---- run_rule!( "pk/non_sequential", schema::check_pk_non_sequential(snapshot) ); - // naming rules + // ---- naming rules ---- run_rule!("naming/bool_prefix", schema::check_bool_prefix(snapshot)); run_rule!("naming/reserved", schema::check_reserved_words(snapshot)); run_rule!("naming/id_mismatch", schema::check_id_mismatch(snapshot)); - // documentation rules + // ---- documentation rules ---- run_rule!( "docs/no_comment", schema::check_no_comment(snapshot, config) ); - // storage rules + // ---- storage rules ---- + // vacuum check needs reltuples from the planner — passes annotated. run_rule!( "vacuum/large_table_defaults", - schema::check_vacuum_large_table_defaults(snapshot) + schema::check_vacuum_large_table_defaults(annotated) ); findings @@ -74,6 +95,8 @@ pub fn run_all_audit_rules(snapshot: &SchemaSnapshot, config: &AuditConfig) -> V #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use super::*; use crate::schema::*; @@ -94,14 +117,25 @@ mod tests { functions: vec![], extensions: vec![], gucs: vec![], - node_stats: vec![], + } + } + + // Build a stats-less annotated wrapper around a schema — mirrors + // what the audit harness sees when no planner / activity rows exist + // (e.g. fresh project, before the first `dryrun snapshot take`). 
+ fn ddl_only(schema: SchemaSnapshot) -> AnnotatedSnapshot { + AnnotatedSnapshot { + schema, + planner: None, + activity_by_node: BTreeMap::new(), } } #[test] fn empty_schema_produces_no_findings() { let config = AuditConfig::default(); - let findings = run_all_audit_rules(&empty_schema(), &config); + let snap = ddl_only(empty_schema()); + let findings = run_all_audit_rules(&snap.view(), &config); assert!(findings.is_empty()); } @@ -122,12 +156,10 @@ mod tests { generated: None, comment: None, statistics_target: None, - stats: None, }], constraints: vec![], indexes: vec![], comment: None, - stats: None, partition_info: None, policies: vec![], triggers: vec![], @@ -136,16 +168,17 @@ mod tests { }], ..empty_schema() }; + let snap = ddl_only(schema); let config = AuditConfig::default(); - let findings = run_all_audit_rules(&schema, &config); + let findings = run_all_audit_rules(&snap.view(), &config); assert!(findings.iter().any(|f| f.rule == "naming/reserved")); let config = AuditConfig { disabled_rules: vec!["naming/reserved".into()], ..AuditConfig::default() }; - let findings = run_all_audit_rules(&schema, &config); + let findings = run_all_audit_rules(&snap.view(), &config); assert!(!findings.iter().any(|f| f.rule == "naming/reserved")); } } diff --git a/crates/dry_run_core/src/audit/rules/schema.rs b/crates/dry_run_core/src/audit/rules/schema.rs index ead2f11..5b1ae9b 100644 --- a/crates/dry_run_core/src/audit/rules/schema.rs +++ b/crates/dry_run_core/src/audit/rules/schema.rs @@ -336,18 +336,32 @@ pub fn check_no_comment(schema: &SchemaSnapshot, config: &AuditConfig) -> Vec Vec { - use crate::schema::effective_table_stats; +pub fn check_vacuum_large_table_defaults( + annotated: &crate::schema::AnnotatedSchema<'_>, +) -> Vec { + use crate::schema::QualifiedName; let mut findings = Vec::new(); - for table in &schema.tables { - let stats = match effective_table_stats(table, schema) { - Some(s) if s.reltuples >= 1_000_000.0 => s, + for table in 
&annotated.schema.tables { + let qn = QualifiedName::new(&table.schema, &table.name); + // Threshold: only worth nagging once a table is genuinely large. + let reltuples = match annotated.reltuples(&qn) { + Some(r) if r >= 1_000_000.0 => r, _ => continue, }; + // If the operator already set per-table autovacuum_* reloptions + // they've thought about it — don't second-guess. let has_overrides = table .reloptions .iter() @@ -356,13 +370,18 @@ pub fn check_vacuum_large_table_defaults(schema: &SchemaSnapshot) -> Vec Vec Result { - crate::schema::fetch_stats_only(&self.pool, source).await + pub async fn introspect_planner_stats( + &self, + schema_ref_hash: &str, + ) -> Result { + crate::schema::introspect_planner_stats(&self.pool, schema_ref_hash).await + } + + pub async fn introspect_activity_stats( + &self, + schema_ref_hash: &str, + label: &str, + ) -> Result { + crate::schema::introspect_activity_stats(&self.pool, schema_ref_hash, label).await } pub async fn is_standby(&self) -> Result { diff --git a/crates/dry_run_core/src/diff/mod.rs b/crates/dry_run_core/src/diff/mod.rs index af4674e..2ba6a9d 100644 --- a/crates/dry_run_core/src/diff/mod.rs +++ b/crates/dry_run_core/src/diff/mod.rs @@ -75,7 +75,6 @@ mod tests { functions: vec![], extensions: vec![], gucs: vec![], - node_stats: vec![], } } @@ -88,7 +87,6 @@ mod tests { constraints: vec![], indexes: vec![], comment: None, - stats: None, partition_info: None, policies: vec![], triggers: vec![], @@ -153,7 +151,6 @@ mod tests { generated: None, comment: None, statistics_target: None, - stats: None, }); local.tables.push(shared); diff --git a/crates/dry_run_core/src/history/store.rs b/crates/dry_run_core/src/history/store.rs index ef90152..496b28b 100644 --- a/crates/dry_run_core/src/history/store.rs +++ b/crates/dry_run_core/src/history/store.rs @@ -1,16 +1,19 @@ +use std::collections::BTreeMap; use std::path::{Path, PathBuf}; use std::sync::{Arc, Mutex}; use async_trait::async_trait; use chrono::{DateTime, Utc}; use 
rusqlite::{Connection, params}; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; use crate::error::{Error, Result}; use crate::history::snapshot_store::{ PutOutcome, SnapshotKey, SnapshotRef, SnapshotStore, TimeRange, }; -use crate::schema::SchemaSnapshot; +use crate::schema::{ + ActivityStatsSnapshot, AnnotatedSnapshot, PlannerStatsSnapshot, SchemaSnapshot, +}; pub struct HistoryStore { conn: Arc>, @@ -27,29 +30,276 @@ pub struct SnapshotSummary { } impl HistoryStore { + const SCHEMA_VERSION: i32 = 2; + pub fn open(path: &Path) -> Result { if let Some(parent) = path.parent() { std::fs::create_dir_all(parent) .map_err(|e| Error::History(format!("cannot create directory: {e}")))?; } + let existed = path.exists(); + let conn = Connection::open(path) .map_err(|e| Error::History(format!("cannot open history db: {e}")))?; + let conn = if existed { + let version: i32 = conn + .query_row("PRAGMA user_version", [], |row| row.get(0)) + .map_err(|e| Error::History(format!("cannot read user_version: {e}")))?; + + match version.cmp(&Self::SCHEMA_VERSION) { + std::cmp::Ordering::Equal => conn, + std::cmp::Ordering::Less => { + warn!( + path = %path.display(), + from = version, + to = Self::SCHEMA_VERSION, + "history db on stale schema version; resetting", + ); + drop(conn); + std::fs::remove_file(path).map_err(|e| { + Error::History(format!("cannot remove stale history db: {e}")) + })?; + Connection::open(path) + .map_err(|e| Error::History(format!("cannot reopen history db: {e}")))? 
+ } + std::cmp::Ordering::Greater => { + return Err(Error::History( + "history db is from a newer version of dryrun".into(), + )); + } + } + } else { + conn + }; + let store = Self { conn: Arc::new(Mutex::new(conn)), }; store.migrate()?; + store.set_user_version(Self::SCHEMA_VERSION)?; debug!(path = %path.display(), "history store opened"); Ok(store) } + fn set_user_version(&self, version: i32) -> Result<()> { + let conn = lock_conn(&self.conn)?; + conn.pragma_update(None, "user_version", version) + .map_err(|e| Error::History(format!("cannot set user_version: {e}")))?; + Ok(()) + } + pub fn open_default() -> Result { let path = default_history_path()?; Self::open(&path) } + pub async fn latest_schema_hash(&self, key: &SnapshotKey) -> Result> { + let pid = key.project_id.0.clone(); + let did = key.database_id.0.clone(); + run_blocking(&self.conn, move |conn| { + let row: rusqlite::Result = conn.query_row( + "SELECT content_hash FROM snapshots + WHERE project_id = ?1 AND database_id = ?2 AND kind = 'schema' + ORDER BY timestamp DESC LIMIT 1", + params![pid, did], + |r| r.get(0), + ); + match row { + Ok(h) => Ok(Some(h)), + Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), + Err(e) => Err(e.into()), + } + }) + .await + } + + pub async fn put_planner_stats( + &self, + key: &SnapshotKey, + snap: &PlannerStatsSnapshot, + ) -> Result { + let key = key.clone(); + let snap = snap.clone(); + run_blocking(&self.conn, move |conn| { + let pid = &key.project_id.0; + let did = &key.database_id.0; + + let latest: Option = conn + .query_row( + "SELECT content_hash FROM snapshots + WHERE project_id = ?1 AND database_id = ?2 AND kind = 'planner_stats' + ORDER BY timestamp DESC LIMIT 1", + params![pid, did], + |row| row.get(0), + ) + .ok(); + + if latest.as_deref() == Some(snap.content_hash.as_str()) { + debug!(hash = %snap.content_hash, "planner stats unchanged, skipping put"); + return Ok(PutOutcome::Deduped); + } + + let json = serde_json::to_string(&snap) + .map_err(|e| 
Error::History(format!("cannot serialize planner stats: {e}")))?; + + conn.execute( + "INSERT INTO snapshots (kind, timestamp, content_hash, schema_ref_hash, + database_name, snapshot_json, project_id, database_id) + VALUES ('planner_stats', ?1, ?2, ?3, ?4, ?5, ?6, ?7)", + params![ + snap.timestamp.to_rfc3339(), + snap.content_hash, + snap.schema_ref_hash, + snap.database, + json, + pid, + did, + ], + )?; + + info!(hash = %snap.content_hash, schema_ref = %snap.schema_ref_hash, + project = %pid, database = %did, "planner stats put"); + Ok(PutOutcome::Inserted) + }) + .await + } + + pub async fn put_activity_stats( + &self, + key: &SnapshotKey, + snap: &ActivityStatsSnapshot, + ) -> Result { + let key = key.clone(); + let snap = snap.clone(); + run_blocking(&self.conn, move |conn| { + let pid = &key.project_id.0; + let did = &key.database_id.0; + let label = &snap.node.label; + + let latest: Option = conn + .query_row( + "SELECT content_hash FROM snapshots + WHERE project_id = ?1 AND database_id = ?2 + AND kind = 'activity_stats' AND node_label = ?3 + ORDER BY timestamp DESC LIMIT 1", + params![pid, did, label], + |row| row.get(0), + ) + .ok(); + + if latest.as_deref() == Some(snap.content_hash.as_str()) { + debug!(hash = %snap.content_hash, label = %label, + "activity stats unchanged, skipping put"); + return Ok(PutOutcome::Deduped); + } + + let json = serde_json::to_string(&snap) + .map_err(|e| Error::History(format!("cannot serialize activity stats: {e}")))?; + + conn.execute( + "INSERT INTO snapshots (kind, timestamp, content_hash, schema_ref_hash, + node_label, database_name, snapshot_json, + project_id, database_id) + VALUES ('activity_stats', ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)", + params![ + snap.timestamp.to_rfc3339(), + snap.content_hash, + snap.schema_ref_hash, + label, + snap.database, + json, + pid, + did, + ], + )?; + + info!(hash = %snap.content_hash, schema_ref = %snap.schema_ref_hash, + label = %label, project = %pid, database = %did, + "activity stats 
put"); + Ok(PutOutcome::Inserted) + }) + .await + } + + pub async fn get_annotated( + &self, + key: &SnapshotKey, + at: SnapshotRef, + ) -> Result { + let schema = SnapshotStore::get(self, key, at.clone()).await?; + let schema_hash = schema.content_hash.clone(); + let pid = key.project_id.0.clone(); + let did = key.database_id.0.clone(); + + let planner = { + let pid = pid.clone(); + let did = did.clone(); + let h = schema_hash.clone(); + run_blocking(&self.conn, move |conn| { + let row: rusqlite::Result = conn.query_row( + "SELECT snapshot_json FROM snapshots + WHERE project_id = ?1 AND database_id = ?2 + AND kind = 'planner_stats' AND schema_ref_hash = ?3 + ORDER BY timestamp DESC LIMIT 1", + params![pid, did, h], + |r| r.get(0), + ); + match row { + Ok(j) => Ok(Some( + serde_json::from_str::(&j).map_err(|e| { + Error::History(format!("corrupt planner stats JSON: {e}")) + })?, + )), + Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), + Err(e) => Err(e.into()), + } + }) + .await? + }; + + let activity_by_node: BTreeMap = { + let h = schema_hash.clone(); + run_blocking(&self.conn, move |conn| { + // For each node_label, pick the latest row at this schema ref. 
+ let mut stmt = conn.prepare( + "SELECT node_label, snapshot_json FROM snapshots a + WHERE project_id = ?1 AND database_id = ?2 + AND kind = 'activity_stats' AND schema_ref_hash = ?3 + AND node_label IS NOT NULL + AND timestamp = ( + SELECT MAX(b.timestamp) FROM snapshots b + WHERE b.project_id = a.project_id + AND b.database_id = a.database_id + AND b.kind = 'activity_stats' + AND b.schema_ref_hash = a.schema_ref_hash + AND b.node_label = a.node_label + )", + )?; + let rows = stmt.query_map(params![pid, did, h], |r| { + Ok((r.get::<_, String>(0)?, r.get::<_, String>(1)?)) + })?; + let mut out: BTreeMap = BTreeMap::new(); + for row in rows { + let (label, json) = row?; + let snap: ActivityStatsSnapshot = serde_json::from_str(&json) + .map_err(|e| Error::History(format!("corrupt activity stats JSON: {e}")))?; + out.insert(label, snap); + } + Ok(out) + }) + .await? + }; + + Ok(AnnotatedSnapshot { + schema, + planner, + activity_by_node, + }) + } + pub fn list_keys(&self) -> Result> { let conn = lock_conn(&self.conn)?; let mut stmt = conn.prepare( @@ -73,16 +323,24 @@ impl HistoryStore { let conn = lock_conn(&self.conn)?; conn.execute_batch( "CREATE TABLE IF NOT EXISTS snapshots ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - timestamp TEXT NOT NULL, - content_hash TEXT NOT NULL, - database_name TEXT NOT NULL, - snapshot_json TEXT NOT NULL, - project_id TEXT, - database_id TEXT + id INTEGER PRIMARY KEY AUTOINCREMENT, + kind TEXT NOT NULL DEFAULT 'schema' + CHECK (kind IN ('schema','planner_stats','activity_stats')), + timestamp TEXT NOT NULL, + content_hash TEXT NOT NULL, + schema_ref_hash TEXT, + node_label TEXT, + database_name TEXT NOT NULL, + snapshot_json TEXT NOT NULL, + project_id TEXT, + database_id TEXT ); CREATE INDEX IF NOT EXISTS idx_snapshots_content_hash - ON snapshots(content_hash);", + ON snapshots(content_hash); + CREATE INDEX IF NOT EXISTS idx_snapshots_kind_schema_ref + ON snapshots(kind, schema_ref_hash); + CREATE INDEX IF NOT EXISTS 
idx_snapshots_kind_node_ts + ON snapshots(kind, node_label, timestamp DESC);", ) .map_err(|e| Error::History(format!("migration failed: {e}")))?; Ok(()) @@ -147,7 +405,7 @@ impl SnapshotStore for HistoryStore { let latest: Option = conn .query_row( "SELECT content_hash FROM snapshots - WHERE project_id = ?1 AND database_id = ?2 + WHERE project_id = ?1 AND database_id = ?2 AND kind = 'schema' ORDER BY timestamp DESC LIMIT 1", params![pid, did], |row| row.get(0), @@ -163,9 +421,9 @@ impl SnapshotStore for HistoryStore { .map_err(|e| Error::History(format!("cannot serialize snapshot: {e}")))?; conn.execute( - "INSERT INTO snapshots (timestamp, content_hash, database_name, + "INSERT INTO snapshots (kind, timestamp, content_hash, database_name, snapshot_json, project_id, database_id) - VALUES (?1, ?2, ?3, ?4, ?5, ?6)", + VALUES ('schema', ?1, ?2, ?3, ?4, ?5, ?6)", params![ snap.timestamp.to_rfc3339(), snap.content_hash, @@ -189,21 +447,23 @@ impl SnapshotStore for HistoryStore { let row = match &at { SnapshotRef::Latest => conn.query_row( "SELECT snapshot_json FROM snapshots - WHERE project_id = ?1 AND database_id = ?2 + WHERE project_id = ?1 AND database_id = ?2 AND kind = 'schema' ORDER BY timestamp DESC LIMIT 1", params![pid, did], |r| r.get::<_, String>(0), ), SnapshotRef::At(ts) => conn.query_row( "SELECT snapshot_json FROM snapshots - WHERE project_id = ?1 AND database_id = ?2 AND timestamp <= ?3 + WHERE project_id = ?1 AND database_id = ?2 AND kind = 'schema' + AND timestamp <= ?3 ORDER BY timestamp DESC LIMIT 1", params![pid, did, ts.to_rfc3339()], |r| r.get::<_, String>(0), ), SnapshotRef::Hash(h) => conn.query_row( "SELECT snapshot_json FROM snapshots - WHERE project_id = ?1 AND database_id = ?2 AND content_hash = ?3 + WHERE project_id = ?1 AND database_id = ?2 AND kind = 'schema' + AND content_hash = ?3 LIMIT 1", params![pid, did, h], |r| r.get::<_, String>(0), @@ -237,7 +497,7 @@ impl SnapshotStore for HistoryStore { "SELECT id, timestamp, content_hash, 
database_name, project_id, database_id FROM snapshots - WHERE project_id = ?1 AND database_id = ?2", + WHERE project_id = ?1 AND database_id = ?2 AND kind = 'schema'", ); let mut bound: Vec> = vec![Box::new(pid), Box::new(did)]; if let Some(from) = range.from { @@ -273,7 +533,8 @@ impl SnapshotStore for HistoryStore { run_blocking(&self.conn, move |conn| { Ok(conn.execute( "DELETE FROM snapshots - WHERE project_id = ?1 AND database_id = ?2 AND timestamp < ?3", + WHERE project_id = ?1 AND database_id = ?2 AND kind = 'schema' + AND timestamp < ?3", params![pid, did, cutoff.to_rfc3339()], )?) }) @@ -304,7 +565,6 @@ mod trait_tests { functions: vec![], extensions: vec![], gucs: vec![], - node_stats: vec![], } } @@ -611,4 +871,177 @@ mod trait_tests { ("p", "billing") ); } + + use crate::schema::{ + ActivityStatsSnapshot, IndexActivity, IndexActivityEntry, NodeIdentity, + PlannerStatsSnapshot, QualifiedName, TableActivity, TableActivityEntry, + }; + + fn make_planner(schema_ref: &str, db: &str, hash: &str) -> PlannerStatsSnapshot { + PlannerStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: db.into(), + timestamp: Utc::now(), + content_hash: hash.into(), + schema_ref_hash: schema_ref.into(), + tables: vec![], + columns: vec![], + indexes: vec![], + } + } + + fn make_activity(schema_ref: &str, db: &str, label: &str, hash: &str) -> ActivityStatsSnapshot { + ActivityStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: db.into(), + timestamp: Utc::now(), + content_hash: hash.into(), + schema_ref_hash: schema_ref.into(), + node: NodeIdentity { + label: label.into(), + host: format!("host-{label}"), + is_standby: label != "primary", + replication_lag_bytes: None, + stats_reset: None, + }, + tables: vec![TableActivityEntry { + table: QualifiedName::new("public", "orders"), + activity: TableActivity { + seq_scan: 1, + idx_scan: 2, + n_live_tup: 0, + n_dead_tup: 0, + last_vacuum: None, + last_autovacuum: None, + last_analyze: None, + 
last_autoanalyze: None, + vacuum_count: 0, + autovacuum_count: 0, + analyze_count: 0, + autoanalyze_count: 0, + }, + }], + indexes: vec![IndexActivityEntry { + index: QualifiedName::new("public", "orders_pkey"), + activity: IndexActivity { + idx_scan: 0, + idx_tup_read: 0, + idx_tup_fetch: 0, + }, + }], + } + } + + #[tokio::test] + async fn snapshot_get_filters_to_kind_schema() { + // Regression: planner_stats rows must not bleed into SnapshotStore::get(Latest). + let (_dir, store) = temp_store(); + let k = key("p", "auth"); + + let schema = make_snap("schema-h1", "auth"); + store.put(&k, &schema).await.unwrap(); + + // Insert a newer planner_stats row referring to the schema. + let planner = make_planner("schema-h1", "auth", "planner-h1"); + store.put_planner_stats(&k, &planner).await.unwrap(); + + let got = store.get(&k, SnapshotRef::Latest).await.unwrap(); + assert_eq!(got.content_hash, "schema-h1"); + } + + #[tokio::test] + async fn get_annotated_joins_schema_planner_and_single_node_activity() { + let (_dir, store) = temp_store(); + let k = key("p", "auth"); + + let schema = make_snap("schema-h1", "auth"); + store.put(&k, &schema).await.unwrap(); + let planner = make_planner("schema-h1", "auth", "planner-h1"); + store.put_planner_stats(&k, &planner).await.unwrap(); + let primary = make_activity("schema-h1", "auth", "primary", "act-primary-1"); + store.put_activity_stats(&k, &primary).await.unwrap(); + + let bundle = store.get_annotated(&k, SnapshotRef::Latest).await.unwrap(); + assert_eq!(bundle.schema.content_hash, "schema-h1"); + assert!(bundle.planner.is_some()); + assert_eq!(bundle.activity_by_node.len(), 1); + assert!(bundle.activity_by_node.contains_key("primary")); + } + + #[tokio::test] + async fn get_annotated_returns_multiple_activity_nodes() { + let (_dir, store) = temp_store(); + let k = key("p", "auth"); + store + .put(&k, &make_snap("schema-h1", "auth")) + .await + .unwrap(); + for label in ["primary", "replica1", "replica2"] { + let a = 
make_activity("schema-h1", "auth", label, &format!("act-{label}")); + store.put_activity_stats(&k, &a).await.unwrap(); + } + + let bundle = store.get_annotated(&k, SnapshotRef::Latest).await.unwrap(); + assert_eq!(bundle.activity_by_node.len(), 3); + let labels: Vec<&str> = bundle.node_labels().collect(); + assert_eq!(labels, vec!["primary", "replica1", "replica2"]); + } + + #[tokio::test] + async fn get_annotated_excludes_planner_with_stale_schema_ref() { + // Planner attached to schema A. New schema B replaces A as latest. + // get_annotated(Latest) must return planner=None — strict-match on schema_ref_hash. + let (_dir, store) = temp_store(); + let k = key("p", "auth"); + + store.put(&k, &make_snap("schema-A", "auth")).await.unwrap(); + let planner = make_planner("schema-A", "auth", "planner-A"); + store.put_planner_stats(&k, &planner).await.unwrap(); + + // small sleep to ensure later timestamp ordering + tokio::time::sleep(std::time::Duration::from_millis(5)).await; + store.put(&k, &make_snap("schema-B", "auth")).await.unwrap(); + + let bundle = store.get_annotated(&k, SnapshotRef::Latest).await.unwrap(); + assert_eq!(bundle.schema.content_hash, "schema-B"); + assert!( + bundle.planner.is_none(), + "planner attached to old schema must not bleed onto new schema" + ); + } + + #[tokio::test] + async fn get_annotated_with_no_stats_returns_empty_bundle() { + let (_dir, store) = temp_store(); + let k = key("p", "auth"); + store + .put(&k, &make_snap("schema-h1", "auth")) + .await + .unwrap(); + + let bundle = store.get_annotated(&k, SnapshotRef::Latest).await.unwrap(); + assert!(bundle.planner.is_none()); + assert!(bundle.activity_by_node.is_empty()); + } + + #[tokio::test] + async fn get_annotated_picks_latest_per_node_label() { + let (_dir, store) = temp_store(); + let k = key("p", "auth"); + store + .put(&k, &make_snap("schema-h1", "auth")) + .await + .unwrap(); + + // Two activity rows for the same label; only the latest should appear. 
+ let first = make_activity("schema-h1", "auth", "primary", "act-1"); + store.put_activity_stats(&k, &first).await.unwrap(); + tokio::time::sleep(std::time::Duration::from_millis(5)).await; + let second = make_activity("schema-h1", "auth", "primary", "act-2"); + store.put_activity_stats(&k, &second).await.unwrap(); + + let bundle = store.get_annotated(&k, SnapshotRef::Latest).await.unwrap(); + let primary = bundle.activity_by_node.get("primary").unwrap(); + assert_eq!(primary.content_hash, "act-2"); + } } diff --git a/crates/dry_run_core/src/lib.rs b/crates/dry_run_core/src/lib.rs index 131bd0e..1867a00 100644 --- a/crates/dry_run_core/src/lib.rs +++ b/crates/dry_run_core/src/lib.rs @@ -17,5 +17,8 @@ pub use diff::SchemaChangeset; pub use error::{Error, Result}; pub use history::HistoryStore; pub use lint::LintConfig; -pub use schema::{ApplyResult, NodeStats, SchemaSnapshot}; +pub use schema::{ + ActivityStatsSnapshot, AnnotatedSchema, AnnotatedSnapshot, MergedActivity, NodeIdentity, + NodeSelector, PlannerStatsSnapshot, SchemaSnapshot, +}; pub use version::PgVersion; diff --git a/crates/dry_run_core/src/lint/rules/mod.rs b/crates/dry_run_core/src/lint/rules/mod.rs index 196738c..5e2bc4b 100644 --- a/crates/dry_run_core/src/lint/rules/mod.rs +++ b/crates/dry_run_core/src/lint/rules/mod.rs @@ -221,7 +221,6 @@ mod tests { generated: None, comment: None, statistics_target: None, - stats: None, } } @@ -250,7 +249,6 @@ mod tests { definition: format!("CREATE INDEX {name} ON ..."), is_valid: true, backs_constraint: false, - stats: None, } } @@ -268,7 +266,6 @@ mod tests { constraints, indexes, comment: None, - stats: None, partition_info: None, policies: vec![], triggers: vec![], @@ -292,7 +289,6 @@ mod tests { functions: vec![], extensions: vec![], gucs: vec![], - node_stats: vec![], } } @@ -448,7 +444,6 @@ mod tests { constraints: vec![], indexes: vec![], comment: None, - stats: None, partition_info: Some(PartitionInfo { strategy: PartitionStrategy::Range, key: 
"created_at".into(), @@ -518,7 +513,6 @@ mod tests { generated: None, comment: None, statistics_target: None, - stats: None, } } @@ -573,7 +567,6 @@ mod tests { constraints: vec![], indexes: vec![], comment: None, - stats: None, partition_info: Some(PartitionInfo { strategy: PartitionStrategy::Hash, key: "id".into(), @@ -766,7 +759,6 @@ mod tests { constraints: vec![], indexes: vec![], comment: None, - stats: None, partition_info: Some(PartitionInfo { strategy: PartitionStrategy::Range, key: "created_at".into(), diff --git a/crates/dry_run_core/src/query/advise.rs b/crates/dry_run_core/src/query/advise.rs index 186952d..4e3aa6b 100644 --- a/crates/dry_run_core/src/query/advise.rs +++ b/crates/dry_run_core/src/query/advise.rs @@ -4,7 +4,7 @@ use super::plan::PlanNode; use super::suggest::{self, IndexSuggestion}; use crate::error::Result; use crate::jit; -use crate::schema::{self, Column, SchemaSnapshot}; +use crate::schema::{self, AnnotatedSchema, ColumnStats, QualifiedName}; use crate::version::PgVersion; #[derive(Debug, Clone, Serialize, Deserialize)] @@ -24,13 +24,20 @@ pub struct AdviseResult { pub index_suggestions: Vec, } +// Top-level advise pass — walks the plan tree and emits per-node advice. +// +// Takes the annotated view rather than a raw `&SchemaSnapshot` because +// the per-node refinements (selectivity hints, partial-index suggestions, +// per-replica seq_scan breakdown) all need planner column stats and +// activity counters. Without those, advise still works — it just +// degrades to "DDL-only" recommendations. 
pub fn advise( plan: &PlanNode, - schema: &SchemaSnapshot, + annotated: &AnnotatedSchema<'_>, pg_version: Option<&PgVersion>, ) -> Vec { let mut advice = Vec::new(); - walk_for_advice(plan, schema, pg_version, &mut advice); + walk_for_advice(plan, annotated, pg_version, &mut advice); advice } @@ -39,17 +46,19 @@ pub fn advise( pub fn advise_with_index_suggestions( sql: &str, plan: Option<&PlanNode>, - schema: &SchemaSnapshot, + annotated: &AnnotatedSchema<'_>, pg_version: Option<&PgVersion>, include_index_suggestions: bool, ) -> Result { let advice = match plan { - Some(p) => advise(p, schema, pg_version), + Some(p) => advise(p, annotated, pg_version), None => Vec::new(), }; let index_suggestions = if include_index_suggestions { - suggest::suggest_index(sql, schema, plan, pg_version)? + // suggest_index reads `reltuples` for size cutoffs — pass the + // annotated view so it has access to planner sizing. + suggest::suggest_index(sql, annotated, plan, pg_version)? } else { Vec::new() }; @@ -62,23 +71,23 @@ pub fn advise_with_index_suggestions( fn walk_for_advice( node: &PlanNode, - schema: &SchemaSnapshot, + annotated: &AnnotatedSchema<'_>, pg_version: Option<&PgVersion>, advice: &mut Vec, ) { - advise_seq_scan(node, schema, pg_version, advice); + advise_seq_scan(node, annotated, pg_version, advice); advise_nested_loop_seq_scan(node, pg_version, advice); - advise_sort(node, schema, pg_version, advice); + advise_sort(node, pg_version, advice); advise_cte(node, advice); for child in &node.children { - walk_for_advice(child, schema, pg_version, advice); + walk_for_advice(child, annotated, pg_version, advice); } } fn advise_seq_scan( node: &PlanNode, - schema: &SchemaSnapshot, + annotated: &AnnotatedSchema<'_>, pg_version: Option<&PgVersion>, advice: &mut Vec, ) { @@ -95,8 +104,10 @@ fn advise_seq_scan( let schema_name = node.schema.as_deref().unwrap_or("public"); let qualified = format!("{schema_name}.{table_name}"); + let qn = QualifiedName::new(schema_name, 
table_name); - let table = schema + let table = annotated + .schema .tables .iter() .find(|t| t.name == *table_name && t.schema == schema_name); @@ -135,47 +146,45 @@ fn advise_seq_scan( let (ddl, recommendation) = if let Some(filter_col_name) = &filter_col { let col_obj = table.and_then(|t| t.columns.iter().find(|c| c.name == *filter_col_name)); let col_type = col_obj.map(|c| c.type_name.as_str()).unwrap_or("unknown"); + // Column stats live in the planner snapshot, keyed by qualified + // table name + column name. Returns None if there's no planner + // capture yet — in which case we fall back to non-stats advice. + let col_stats = annotated.column_stats(&qn, filter_col_name); let (idx_type, rec) = suggest_index_type(&qualified, col_type, filter_col_name); let mut recommendation = rec; - // stats-aware refinements - if let Some(col) = col_obj - && col.stats.is_some() - { + // Stats-aware refinements — only meaningful when we actually have + // column stats. The plan's row estimate is the floor; if planner + // sizing reports more rows than the plan rows estimate (which can + // happen on stale plan estimates), prefer the larger number. 
+ if col_stats.is_some() { let mut table_rows = node.plan_rows; - if let Some(t) = table - && let Some(s) = &t.stats - && s.reltuples > table_rows + if let Some(rt) = annotated.reltuples(&qn) + && rt > table_rows { - table_rows = s.reltuples; + table_rows = rt; } - recommendation.push_str(&stats_aware_advice(col, filter_col_name, table_rows)); + recommendation.push_str(&stats_aware_advice(col_stats, filter_col_name, table_rows)); } let idx_name = format!("idx_{table_name}_{filter_col_name}"); - // prefer partial index for high-null or skewed columns - let ddl = if let Some(col) = col_obj { - if col.stats.as_ref().and_then(|s| s.null_frac).unwrap_or(0.0) > 0.5 { - format!( - "CREATE INDEX CONCURRENTLY {idx_name} ON {schema_name}.{table_name} USING {idx_type}({filter_col_name}) WHERE {filter_col_name} IS NOT NULL;" - ) - } else if let Some(stats) = &col.stats { - if let Some((dominant, _freq)) = schema::has_skewed_distribution(stats, 0.5) { - format!( - "CREATE INDEX CONCURRENTLY {idx_name} ON {schema_name}.{table_name} USING {idx_type}({filter_col_name}) WHERE {filter_col_name} != '{dominant}';" - ) - } else { - format!( - "CREATE INDEX CONCURRENTLY {idx_name} ON {schema_name}.{table_name} USING {idx_type}({filter_col_name});" - ) - } - } else { - format!( - "CREATE INDEX CONCURRENTLY {idx_name} ON {schema_name}.{table_name} USING {idx_type}({filter_col_name});" - ) - } + // Prefer a partial index for high-null or skewed columns — a tiny + // selective index is much cheaper than a full one when most rows + // would never match the predicate. Falls through to a plain + // CREATE INDEX when stats aren't available. 
+ let null_frac = col_stats.and_then(|s| s.null_frac).unwrap_or(0.0); + let ddl = if null_frac > 0.5 { + format!( + "CREATE INDEX CONCURRENTLY {idx_name} ON {schema_name}.{table_name} USING {idx_type}({filter_col_name}) WHERE {filter_col_name} IS NOT NULL;" + ) + } else if let Some(stats) = col_stats + && let Some((dominant, _freq)) = schema::has_skewed_distribution(stats, 0.5) + { + format!( + "CREATE INDEX CONCURRENTLY {idx_name} ON {schema_name}.{table_name} USING {idx_type}({filter_col_name}) WHERE {filter_col_name} != '{dominant}';" + ) } else { format!( "CREATE INDEX CONCURRENTLY {idx_name} ON {schema_name}.{table_name} USING {idx_type}({filter_col_name});" @@ -192,28 +201,20 @@ fn advise_seq_scan( let mut full_recommendation = recommendation; - // enrich with per-node context when available - let node_seq_scans: Vec<(&str, i64)> = schema - .node_stats - .iter() - .filter_map(|ns| { - ns.table_stats - .iter() - .find(|ts| ts.table == *table_name && ts.schema == schema_name) - .map(|ts| (ns.source.as_str(), ts.stats.seq_scan)) - }) - .collect(); - - if node_seq_scans.len() >= 2 { - let total: i64 = node_seq_scans.iter().map(|(_, v)| *v).sum(); - let parts: Vec = node_seq_scans + // Per-node breakdown — surfaces "this replica is doing the unindexed + // work, the others aren't" patterns. Empty when we only have one node + // (or none); skipping the note in that case avoids noise. + let per_node = annotated.seq_scan_per_node(&qn); + if per_node.len() >= 2 { + let total: i64 = per_node.iter().map(|(_, v)| *v).sum(); + let parts: Vec = per_node .iter() .map(|(src, v)| format!("{src}: {v}")) .collect(); full_recommendation.push_str(&format!( "\n\nNote: across {} nodes, seq_scan totals {} ({}). 
\ Check if specific replicas are serving unindexed query patterns.", - node_seq_scans.len(), + per_node.len(), total, parts.join(", ") )); @@ -276,12 +277,7 @@ fn advise_nested_loop_seq_scan( }); } -fn advise_sort( - node: &PlanNode, - _schema: &SchemaSnapshot, - pg_version: Option<&PgVersion>, - advice: &mut Vec, -) { +fn advise_sort(node: &PlanNode, pg_version: Option<&PgVersion>, advice: &mut Vec) { if node.node_type != "Sort" || node.plan_rows < 10_000.0 { return; } @@ -323,15 +319,21 @@ fn advise_sort( }); } -fn stats_aware_advice(col: &Column, filter_col: &str, table_rows: f64) -> String { - let stats = match &col.stats { +// Build a recommendation suffix grounded in column stats — selectivity, +// dominant-value skew, null fraction, physical correlation. Returns an +// empty string when no stats are available, which lets the caller stitch +// it on unconditionally without a `match`. +fn stats_aware_advice(stats: Option<&ColumnStats>, filter_col: &str, table_rows: f64) -> String { + let stats = match stats { Some(s) => s, None => return String::new(), }; let mut parts = Vec::new(); - // selectivity assessment - let sel = schema::column_selectivity(col, table_rows); + // Selectivity — the fraction of rows a value-equality predicate is + // expected to match. Low cardinality (≤ 5 distinct values) → high + // selectivity → poor index usefulness; we call that out explicitly. 
+ let sel = schema::column_selectivity(Some(stats), table_rows); if let Some(nd) = stats.n_distinct { if nd > 0.0 && nd <= 5.0 { parts.push(format!( @@ -468,227 +470,5 @@ fn find_table_in_subtree(node: &PlanNode) -> Option<(String, String)> { } #[cfg(test)] -mod tests { - use chrono::Utc; - - use super::*; - use crate::schema::*; - - fn empty_schema() -> SchemaSnapshot { - SchemaSnapshot { - pg_version: "PostgreSQL 17.0".into(), - database: "test".into(), - timestamp: Utc::now(), - content_hash: "test".into(), - source: None, - tables: vec![Table { - oid: 1, - schema: "public".into(), - name: "orders".into(), - columns: vec![ - Column { - name: "id".into(), - ordinal: 1, - type_name: "bigint".into(), - nullable: false, - default: None, - identity: None, - generated: None, - comment: None, - statistics_target: None, - stats: None, - }, - Column { - name: "customer_id".into(), - ordinal: 2, - type_name: "bigint".into(), - nullable: false, - default: None, - identity: None, - generated: None, - comment: None, - statistics_target: None, - stats: None, - }, - Column { - name: "data".into(), - ordinal: 3, - type_name: "jsonb".into(), - nullable: true, - default: None, - identity: None, - generated: None, - comment: None, - statistics_target: None, - stats: None, - }, - ], - constraints: vec![], - indexes: vec![], - comment: None, - stats: None, - partition_info: None, - policies: vec![], - triggers: vec![], - reloptions: vec![], - rls_enabled: false, - }], - enums: vec![], - domains: vec![], - composites: vec![], - views: vec![], - functions: vec![], - extensions: vec![], - gucs: vec![], - node_stats: vec![], - } - } - - fn make_seq_scan(table: &str, rows: f64, filter: Option<&str>) -> PlanNode { - PlanNode { - node_type: "Seq Scan".into(), - relation_name: Some(table.into()), - schema: Some("public".into()), - alias: None, - startup_cost: 0.0, - total_cost: rows * 0.01, - plan_rows: rows, - plan_width: 64, - actual_rows: None, - actual_loops: None, - 
actual_startup_time: None, - actual_total_time: None, - shared_hit_blocks: None, - shared_read_blocks: None, - index_name: None, - index_cond: None, - filter: filter.map(String::from), - rows_removed_by_filter: None, - sort_key: None, - sort_method: None, - hash_cond: None, - join_type: None, - subplans_removed: None, - cte_name: None, - parent_relationship: None, - children: vec![], - } - } - - #[test] - fn advise_seq_scan_suggests_btree() { - let schema = empty_schema(); - let plan = make_seq_scan("orders", 100_000.0, Some("(customer_id = 42)")); - let advice = advise(&plan, &schema, None); - assert!(!advice.is_empty()); - assert!(advice[0].ddl.as_ref().unwrap().contains("btree")); - assert!(advice[0].ddl.as_ref().unwrap().contains("customer_id")); - assert!(advice[0].ddl.as_ref().unwrap().contains("CONCURRENTLY")); - } - - #[test] - fn advise_seq_scan_jsonb_suggests_gin() { - let schema = empty_schema(); - let plan = make_seq_scan("orders", 100_000.0, Some("(data @> '{}'::jsonb)")); - let advice = advise(&plan, &schema, None); - assert!(!advice.is_empty()); - assert!(advice[0].ddl.as_ref().unwrap().contains("gin")); - } - - #[test] - fn advise_small_table_no_advice() { - let schema = empty_schema(); - let plan = make_seq_scan("orders", 50.0, Some("(id = 1)")); - let advice = advise(&plan, &schema, None); - assert!(advice.is_empty()); - } - - #[test] - fn advise_includes_version_note() { - let schema = empty_schema(); - let plan = make_seq_scan("orders", 100_000.0, Some("(customer_id = 42)")); - let pg14 = PgVersion { - major: 14, - minor: 0, - patch: 0, - }; - let advice = advise(&plan, &schema, Some(&pg14)); - assert!(!advice.is_empty()); - assert!(advice[0].version_note.is_some()); - } - - #[test] - fn advise_seq_scan_includes_node_context() { - let mut schema = empty_schema(); - schema.node_stats = vec![ - NodeStats { - source: "master".into(), - timestamp: Utc::now(), - is_standby: false, - table_stats: vec![NodeTableStats { - schema: "public".into(), - 
table: "orders".into(), - stats: TableStats { - reltuples: 100_000.0, - relpages: 1250, - dead_tuples: 0, - last_vacuum: None, - last_autovacuum: None, - last_analyze: None, - last_autoanalyze: None, - seq_scan: 100, - idx_scan: 5000, - table_size: 10_000_000, - }, - }], - index_stats: vec![], - column_stats: vec![], - }, - NodeStats { - source: "replica-1".into(), - timestamp: Utc::now(), - is_standby: true, - table_stats: vec![NodeTableStats { - schema: "public".into(), - table: "orders".into(), - stats: TableStats { - reltuples: 100_000.0, - relpages: 1250, - dead_tuples: 0, - last_vacuum: None, - last_autovacuum: None, - last_analyze: None, - last_autoanalyze: None, - seq_scan: 42000, - idx_scan: 1000, - table_size: 10_000_000, - }, - }], - index_stats: vec![], - column_stats: vec![], - }, - ]; - let plan = make_seq_scan("orders", 100_000.0, Some("(customer_id = 42)")); - let advice = advise(&plan, &schema, None); - assert!(!advice.is_empty()); - assert!(advice[0].recommendation.contains("across 2 nodes")); - assert!(advice[0].recommendation.contains("master: 100")); - assert!(advice[0].recommendation.contains("replica-1: 42000")); - } - - #[test] - fn extract_column_simple() { - assert_eq!( - extract_column_from_filter("(customer_id = 42)"), - Some("customer_id".into()) - ); - assert_eq!( - extract_column_from_filter("(status IS NOT NULL)"), - Some("status".into()) - ); - assert_eq!( - extract_column_from_filter("(t.name = 'foo')"), - Some("name".into()) - ); - } -} +#[path = "advise_tests.rs"] +mod tests; diff --git a/crates/dry_run_core/src/query/advise_tests.rs b/crates/dry_run_core/src/query/advise_tests.rs new file mode 100644 index 0000000..7595a3b --- /dev/null +++ b/crates/dry_run_core/src/query/advise_tests.rs @@ -0,0 +1,251 @@ +use std::collections::BTreeMap; + +use chrono::Utc; + +use super::*; +use crate::schema::*; +use crate::schema::{ + ActivityStatsSnapshot, AnnotatedSnapshot, IndexActivityEntry, NodeIdentity, + PlannerStatsSnapshot, 
TableActivity, TableActivityEntry, TableSizing, TableSizingEntry, +}; + +fn empty_schema() -> SchemaSnapshot { + SchemaSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "test".into(), + timestamp: Utc::now(), + content_hash: "test".into(), + source: None, + tables: vec![Table { + oid: 1, + schema: "public".into(), + name: "orders".into(), + columns: vec![ + Column { + name: "id".into(), + ordinal: 1, + type_name: "bigint".into(), + nullable: false, + default: None, + identity: None, + generated: None, + comment: None, + statistics_target: None, + }, + Column { + name: "customer_id".into(), + ordinal: 2, + type_name: "bigint".into(), + nullable: false, + default: None, + identity: None, + generated: None, + comment: None, + statistics_target: None, + }, + Column { + name: "data".into(), + ordinal: 3, + type_name: "jsonb".into(), + nullable: true, + default: None, + identity: None, + generated: None, + comment: None, + statistics_target: None, + }, + ], + constraints: vec![], + indexes: vec![], + comment: None, + partition_info: None, + policies: vec![], + triggers: vec![], + reloptions: vec![], + rls_enabled: false, + }], + enums: vec![], + domains: vec![], + composites: vec![], + views: vec![], + functions: vec![], + extensions: vec![], + gucs: vec![], + } +} + +fn make_seq_scan(table: &str, rows: f64, filter: Option<&str>) -> PlanNode { + PlanNode { + node_type: "Seq Scan".into(), + relation_name: Some(table.into()), + schema: Some("public".into()), + alias: None, + startup_cost: 0.0, + total_cost: rows * 0.01, + plan_rows: rows, + plan_width: 64, + actual_rows: None, + actual_loops: None, + actual_startup_time: None, + actual_total_time: None, + shared_hit_blocks: None, + shared_read_blocks: None, + index_name: None, + index_cond: None, + filter: filter.map(String::from), + rows_removed_by_filter: None, + sort_key: None, + sort_method: None, + hash_cond: None, + join_type: None, + subplans_removed: None, + cte_name: None, + parent_relationship: None, 
+ children: vec![], + } +} + +// Wrap a bare schema in an empty annotated bundle — no planner, no +// activity. Mirrors what the MCP server hands tool bodies before +// any `dryrun snapshot take` has run. +fn ddl_only(schema: SchemaSnapshot) -> AnnotatedSnapshot { + AnnotatedSnapshot { + schema, + planner: None, + activity_by_node: BTreeMap::new(), + } +} + +#[test] +fn advise_seq_scan_suggests_btree() { + let snap = ddl_only(empty_schema()); + let plan = make_seq_scan("orders", 100_000.0, Some("(customer_id = 42)")); + let advice = advise(&plan, &snap.view(), None); + assert!(!advice.is_empty()); + assert!(advice[0].ddl.as_ref().unwrap().contains("btree")); + assert!(advice[0].ddl.as_ref().unwrap().contains("customer_id")); + assert!(advice[0].ddl.as_ref().unwrap().contains("CONCURRENTLY")); +} + +#[test] +fn advise_seq_scan_jsonb_suggests_gin() { + let snap = ddl_only(empty_schema()); + let plan = make_seq_scan("orders", 100_000.0, Some("(data @> '{}'::jsonb)")); + let advice = advise(&plan, &snap.view(), None); + assert!(!advice.is_empty()); + assert!(advice[0].ddl.as_ref().unwrap().contains("gin")); +} + +#[test] +fn advise_small_table_no_advice() { + let snap = ddl_only(empty_schema()); + let plan = make_seq_scan("orders", 50.0, Some("(id = 1)")); + let advice = advise(&plan, &snap.view(), None); + assert!(advice.is_empty()); +} + +#[test] +fn advise_includes_version_note() { + let snap = ddl_only(empty_schema()); + let plan = make_seq_scan("orders", 100_000.0, Some("(customer_id = 42)")); + let pg14 = PgVersion { + major: 14, + minor: 0, + patch: 0, + }; + let advice = advise(&plan, &snap.view(), Some(&pg14)); + assert!(!advice.is_empty()); + assert!(advice[0].version_note.is_some()); +} + +// Helper: build an ActivityStatsSnapshot for one node with a single +// table activity row carrying the supplied seq_scan counter. 
+fn activity_for(label: &str, seq_scan: i64) -> ActivityStatsSnapshot {
+    ActivityStatsSnapshot {
+        pg_version: "PostgreSQL 17.0".into(),
+        database: "test".into(),
+        timestamp: Utc::now(),
+        content_hash: format!("h-{label}"),
+        schema_ref_hash: "sh".into(),
+        node: NodeIdentity {
+            label: label.into(),
+            host: label.into(),
+            is_standby: label != "master",
+            replication_lag_bytes: None,
+            stats_reset: None,
+        },
+        tables: vec![TableActivityEntry {
+            table: QualifiedName::new("public", "orders"),
+            activity: TableActivity {
+                seq_scan,
+                idx_scan: 0,
+                n_live_tup: 0,
+                n_dead_tup: 0,
+                last_vacuum: None,
+                last_autovacuum: None,
+                last_analyze: None,
+                last_autoanalyze: None,
+                vacuum_count: 0,
+                autovacuum_count: 0,
+                analyze_count: 0,
+                autoanalyze_count: 0,
+            },
+        }],
+        indexes: Vec::<IndexActivityEntry>::new(),
+    }
+}
+
+#[test]
+fn advise_seq_scan_includes_node_context() {
+    // Two-node cluster — primary handles indexed traffic, replica
+    // is doing the seq scans. The recommendation should call that
+    // out with the per-node breakdown. 
+ let mut activity_by_node = BTreeMap::new(); + activity_by_node.insert("master".into(), activity_for("master", 100)); + activity_by_node.insert("replica-1".into(), activity_for("replica-1", 42000)); + let snap = AnnotatedSnapshot { + schema: empty_schema(), + planner: Some(PlannerStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "test".into(), + timestamp: Utc::now(), + content_hash: "ph".into(), + schema_ref_hash: "sh".into(), + tables: vec![TableSizingEntry { + table: QualifiedName::new("public", "orders"), + sizing: TableSizing { + reltuples: 100_000.0, + relpages: 1250, + table_size: 10_000_000, + total_size: None, + index_size: None, + }, + }], + columns: vec![], + indexes: vec![], + }), + activity_by_node, + }; + let plan = make_seq_scan("orders", 100_000.0, Some("(customer_id = 42)")); + let advice = advise(&plan, &snap.view(), None); + assert!(!advice.is_empty()); + assert!(advice[0].recommendation.contains("across 2 nodes")); + assert!(advice[0].recommendation.contains("master: 100")); + assert!(advice[0].recommendation.contains("replica-1: 42000")); +} + +#[test] +fn extract_column_simple() { + assert_eq!( + extract_column_from_filter("(customer_id = 42)"), + Some("customer_id".into()) + ); + assert_eq!( + extract_column_from_filter("(status IS NOT NULL)"), + Some("status".into()) + ); + assert_eq!( + extract_column_from_filter("(t.name = 'foo')"), + Some("name".into()) + ); +} diff --git a/crates/dry_run_core/src/query/antipatterns.rs b/crates/dry_run_core/src/query/antipatterns.rs index 8e1afbd..dcdde4e 100644 --- a/crates/dry_run_core/src/query/antipatterns.rs +++ b/crates/dry_run_core/src/query/antipatterns.rs @@ -1,20 +1,29 @@ use super::parse::ParsedQuery; use super::validate::{ValidationWarning, WarningSeverity}; -use crate::schema::{SchemaSnapshot, effective_table_stats}; +use crate::schema::{AnnotatedSchema, QualifiedName, SchemaSnapshot}; const LARGE_TABLE_THRESHOLD: f64 = 10_000.0; +// Detect anti-patterns in a parsed SQL 
statement. +// +// Most rules are pure DDL — they look at parsed query structure plus the +// schema to spot SELECT *, missing WHERE clauses, partition-key updates, +// etc. The one stats-aware rule (`detect_unbounded_query`) needs +// reltuples to know whether a missing WHERE on a small lookup table is +// fine vs. a missing WHERE on a 100M-row event table is a footgun. So +// the entry point takes the annotated view; sub-rules that only need +// DDL borrow `annotated.schema` internally. pub fn detect_antipatterns( parsed: &ParsedQuery, - schema: &SchemaSnapshot, + annotated: &AnnotatedSchema<'_>, warnings: &mut Vec, ) { detect_select_star(parsed, warnings); - detect_unbounded_query(parsed, schema, warnings); + detect_unbounded_query(parsed, annotated, warnings); detect_cartesian_join(parsed, warnings); detect_dml_without_where(parsed, warnings); - detect_partition_key_antipatterns(parsed, schema, warnings); - detect_partition_key_update(parsed, schema, warnings); + detect_partition_key_antipatterns(parsed, annotated.schema, warnings); + detect_partition_key_update(parsed, annotated.schema, warnings); } fn detect_select_star(parsed: &ParsedQuery, warnings: &mut Vec) { @@ -30,7 +39,7 @@ fn detect_select_star(parsed: &ParsedQuery, warnings: &mut Vec, warnings: &mut Vec, ) { if parsed.info.statement_type != "SELECT" { @@ -42,12 +51,17 @@ fn detect_unbounded_query( for table_ref in &parsed.info.tables { let schema_name = table_ref.schema.as_deref().unwrap_or("public"); - if let Some(table) = schema + // Only fire when reltuples > LARGE_TABLE_THRESHOLD. When there's + // no planner snapshot — fresh project, replica-only capture — + // we get None and silently skip, since we can't tell whether + // the table is small enough to safely scan or not. 
+ if let Some(table) = annotated + .schema .tables .iter() .find(|t| t.name == table_ref.name && t.schema == schema_name) { - let reltuples = effective_table_stats(table, schema).map(|s| s.reltuples); + let reltuples = annotated.reltuples(&QualifiedName::new(schema_name, &table.name)); if let Some(rows) = reltuples && rows > LARGE_TABLE_THRESHOLD @@ -257,7 +271,6 @@ mod tests { constraints: vec![], indexes: vec![], comment: None, - stats: None, partition_info: Some(PartitionInfo { strategy: PartitionStrategy::Range, key: "created_at".into(), @@ -286,7 +299,6 @@ mod tests { functions: vec![], extensions: vec![], gucs: vec![], - node_stats: vec![], } } diff --git a/crates/dry_run_core/src/query/explain.rs b/crates/dry_run_core/src/query/explain.rs index ef20afa..ea101fa 100644 --- a/crates/dry_run_core/src/query/explain.rs +++ b/crates/dry_run_core/src/query/explain.rs @@ -4,7 +4,7 @@ use sqlx::PgPool; use super::plan::{PlanNode, parse_plan_json}; use super::plan_warnings::detect_plan_warnings; use crate::error::{Error, Result}; -use crate::schema::SchemaSnapshot; +use crate::schema::AnnotatedSchema; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ExplainResult { @@ -37,7 +37,7 @@ pub async fn explain_query( pool: &PgPool, sql: &str, analyze: bool, - schema: Option<&SchemaSnapshot>, + annotated: Option<&AnnotatedSchema<'_>>, ) -> Result { let explain_sql = if analyze { format!("EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON) {sql}") @@ -96,7 +96,7 @@ pub async fn explain_query( None }; - let warnings = detect_plan_warnings(&plan, schema); + let warnings = detect_plan_warnings(&plan, annotated); Ok(ExplainResult { plan, diff --git a/crates/dry_run_core/src/query/migration.rs b/crates/dry_run_core/src/query/migration.rs index 62d085d..ad428d8 100644 --- a/crates/dry_run_core/src/query/migration.rs +++ b/crates/dry_run_core/src/query/migration.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use crate::error::{Error, Result}; use crate::jit; -use 
crate::schema::SchemaSnapshot; +use crate::schema::{AnnotatedSchema, QualifiedName, SchemaSnapshot}; use crate::version::PgVersion; #[derive(Debug, Clone, Serialize, Deserialize)] @@ -28,9 +28,15 @@ pub enum SafetyRating { Dangerous, } +// Inspect a DDL string and emit safety / lock-impact checks for each +// statement. Takes the annotated view because two of the inner analyses +// reach for stats: `lookup_table_stats` synthesizes the "(2 GB, ~50M +// rows)" flavor text from planner sizing, and the SET NOT NULL path +// reads column null_frac to predict whether the constraint scan will +// actually find offending rows. pub fn check_migration( ddl: &str, - schema: &SchemaSnapshot, + annotated: &AnnotatedSchema<'_>, pg_version: Option<&PgVersion>, ) -> Result> { let result = @@ -44,24 +50,24 @@ pub fn check_migration( for cmd_node in &stmt.cmds { if let Some(pg_query::protobuf::node::Node::AlterTableCmd(cmd)) = &cmd_node.node && let Some(check) = - analyze_alter_table_cmd(cmd, &result, schema, pg_version) + analyze_alter_table_cmd(cmd, &result, annotated, pg_version) { checks.push(check); } } } NodeRef::IndexStmt(idx) => { - checks.push(analyze_create_index(idx, schema, pg_version)); + checks.push(analyze_create_index(idx, annotated, pg_version)); } NodeRef::RenameStmt(ren) => { - checks.push(analyze_rename(ren, schema)); + checks.push(analyze_rename(ren, annotated.schema)); } _ => {} } } if checks.is_empty() - && let Some(check) = fallback_keyword_check(ddl, schema, pg_version) + && let Some(check) = fallback_keyword_check(ddl, annotated.schema, pg_version) { checks.push(check); } @@ -72,7 +78,7 @@ pub fn check_migration( fn analyze_alter_table_cmd( cmd: &pg_query::protobuf::AlterTableCmd, parse_result: &pg_query::ParseResult, - schema: &SchemaSnapshot, + annotated: &AnnotatedSchema<'_>, pg_version: Option<&PgVersion>, ) -> Option { let subtype = pg_query::protobuf::AlterTableType::try_from(cmd.subtype).ok()?; @@ -83,7 +89,7 @@ fn analyze_alter_table_cmd( 
.map(|(name, _)| name.clone()) .unwrap_or_default(); - let (table_size, row_estimate) = lookup_table_stats(schema, &table_name); + let (table_size, row_estimate) = lookup_table_stats(annotated, &table_name); match subtype { pg_query::protobuf::AlterTableType::AtAddColumn => { @@ -174,20 +180,33 @@ fn analyze_alter_table_cmd( let mut rec = e.to_string(); - // check column stats for null_frac context - if !cmd.name.is_empty() - && let Some(col) = find_column(schema, &table_name, &cmd.name) - && let Some(nf) = col.stats.as_ref().and_then(|s| s.null_frac) { - if nf == 0.0 { - rec.push_str("\n\nDATA CHECK: Column currently has 0% NULLs. The scan will pass, but ACCESS EXCLUSIVE lock is still held."); - } else if let Some(rows) = row_estimate { - let null_rows = (nf * rows) as i64; - rec.push_str(&format!( - "\n\nDATA CHECK: Column has ~{:.0}% NULLs (~{} rows) that must be backfilled before this constraint can be applied.", - nf * 100.0, null_rows - )); - } - } + // Check column stats for null_frac context — pulls the + // ColumnStats out of the planner snapshot so we can warn + // the user about how many rows would currently fail the new + // NOT NULL constraint. Skipped when there's no planner + // snapshot — better to omit the data check than to bluff + // a "0% NULLs" estimate we can't actually verify. + let col_stats = if !cmd.name.is_empty() { + let (schema_part, name_part) = if let Some((s, n)) = table_name.rsplit_once('.') { + (s, n) + } else { + ("public", table_name.as_str()) + }; + annotated.column_stats(&QualifiedName::new(schema_part, name_part), &cmd.name) + } else { + None + }; + if let Some(nf) = col_stats.and_then(|s| s.null_frac) { + if nf == 0.0 { + rec.push_str("\n\nDATA CHECK: Column currently has 0% NULLs. 
The scan will pass, but ACCESS EXCLUSIVE lock is still held."); + } else if let Some(rows) = row_estimate { + let null_rows = (nf * rows) as i64; + rec.push_str(&format!( + "\n\nDATA CHECK: Column has ~{:.0}% NULLs (~{} rows) that must be backfilled before this constraint can be applied.", + nf * 100.0, null_rows + )); + } + } Some(MigrationCheck { operation: "SET NOT NULL".into(), @@ -222,9 +241,14 @@ fn analyze_alter_table_cmd( }) } - pg_query::protobuf::AlterTableType::AtAddConstraint => { - analyze_add_constraint(cmd, &table_name, table_size, row_estimate, schema, pg_version) - } + pg_query::protobuf::AlterTableType::AtAddConstraint => analyze_add_constraint( + cmd, + &table_name, + table_size, + row_estimate, + annotated.schema, + pg_version, + ), pg_query::protobuf::AlterTableType::AtValidateConstraint => Some(MigrationCheck { operation: "VALIDATE CONSTRAINT".into(), @@ -316,7 +340,7 @@ fn analyze_add_constraint( fn analyze_create_index( idx: &pg_query::protobuf::IndexStmt, - schema: &SchemaSnapshot, + annotated: &AnnotatedSchema<'_>, _pg_version: Option<&PgVersion>, ) -> MigrationCheck { let table_name = idx @@ -331,7 +355,7 @@ fn analyze_create_index( }) .unwrap_or_default(); - let (table_size, row_estimate) = lookup_table_stats(schema, &table_name); + let (table_size, row_estimate) = lookup_table_stats(annotated, &table_name); let (safety, recommendation, lock_type) = if idx.concurrent { ( @@ -426,40 +450,25 @@ fn fallback_keyword_check( None } -fn find_column<'a>( - schema: &'a SchemaSnapshot, +// Pull (formatted_size, row_estimate) for a table out of the planner +// snapshot. Both fields end up in MigrationCheck so the LLM consumer can +// say things like "ALTER COLUMN TYPE on a 12 GB table will hold ACCESS +// EXCLUSIVE for ~minutes". Returns (None, None) when there's no planner +// snapshot — caller's flavor text just omits the size context in that +// case rather than guessing. 
+fn lookup_table_stats( + annotated: &AnnotatedSchema<'_>, table_name: &str, - col_name: &str, -) -> Option<&'a crate::schema::Column> { - let (schema_part, name_part) = if let Some((s, n)) = table_name.rsplit_once('.') { - (s, n) - } else { - ("public", table_name) - }; - schema - .tables - .iter() - .find(|t| t.name == name_part && t.schema == schema_part) - .and_then(|t| t.columns.iter().find(|c| c.name == col_name)) -} - -fn lookup_table_stats(schema: &SchemaSnapshot, table_name: &str) -> (Option, Option) { +) -> (Option, Option) { let (schema_part, name_part) = if let Some((s, n)) = table_name.rsplit_once('.') { (s, n) } else { ("public", table_name) }; - - schema - .tables - .iter() - .find(|t| t.name == name_part && t.schema == schema_part) - .and_then(|t| t.stats.as_ref()) - .map(|s| { - let size = format_bytes(s.table_size); - (Some(size), Some(s.reltuples)) - }) - .unwrap_or((None, None)) + let qn = QualifiedName::new(schema_part, name_part); + let size = annotated.table_size(&qn).map(format_bytes); + let rows = annotated.reltuples(&qn); + (size, rows) } fn format_bytes(bytes: i64) -> String { @@ -485,10 +494,45 @@ fn version_behavior_add_column(pg_version: Option<&PgVersion>) -> Option #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use chrono::Utc; use super::*; use crate::schema::*; + use crate::schema::{AnnotatedSnapshot, PlannerStatsSnapshot, TableSizing, TableSizingEntry}; + + // Build a stats-bearing AnnotatedSnapshot for the migration tests. + // Most check_migration outputs reference table size / row count in + // their flavor text — we hand-roll a 2 GB / 5M-row planner row so + // the tests can exercise that path without spelunking. 
+    fn empty_annotated() -> AnnotatedSnapshot {
+        let schema = empty_schema();
+        let planner = PlannerStatsSnapshot {
+            pg_version: "PostgreSQL 17.0".into(),
+            database: "test".into(),
+            timestamp: Utc::now(),
+            content_hash: "ph".into(),
+            schema_ref_hash: schema.content_hash.clone(),
+            tables: vec![TableSizingEntry {
+                table: QualifiedName::new("public", "orders"),
+                sizing: TableSizing {
+                    reltuples: 5_000_000.0,
+                    relpages: 262144,
+                    table_size: 2_147_483_648,
+                    total_size: None,
+                    index_size: None,
+                },
+            }],
+            columns: vec![],
+            indexes: vec![],
+        };
+        AnnotatedSnapshot {
+            schema,
+            planner: Some(planner),
+            activity_by_node: BTreeMap::new(),
+        }
+    }
 
     fn empty_schema() -> SchemaSnapshot {
         SchemaSnapshot {
@@ -505,18 +549,8 @@ mod tests {
                 constraints: vec![],
                 indexes: vec![],
                 comment: None,
-                stats: Some(TableStats {
-                    reltuples: 5_000_000.0,
-                    relpages: 262144,
-                    dead_tuples: 0,
-                    last_vacuum: None,
-                    last_autovacuum: None,
-                    last_analyze: None,
-                    last_autoanalyze: None,
-                    seq_scan: 0,
-                    idx_scan: 0,
-                    table_size: 2_147_483_648,
-                }),
+                // Stats now live in the PlannerStatsSnapshot built by
+                // `empty_annotated`; the embedded stats field was removed. 
partition_info: None, policies: vec![], triggers: vec![], @@ -530,7 +564,6 @@ mod tests { functions: vec![], extensions: vec![], gucs: vec![], - node_stats: vec![], } } @@ -546,7 +579,7 @@ mod tests { fn add_column_no_default_safe() { let checks = check_migration( "ALTER TABLE orders ADD COLUMN notes text", - &empty_schema(), + &empty_annotated().view(), Some(&pg17()), ) .unwrap(); @@ -559,7 +592,7 @@ mod tests { fn add_column_with_default() { let checks = check_migration( "ALTER TABLE orders ADD COLUMN status text DEFAULT 'pending'", - &empty_schema(), + &empty_annotated().view(), Some(&pg17()), ) .unwrap(); @@ -572,7 +605,7 @@ mod tests { fn create_index_without_concurrently() { let checks = check_migration( "CREATE INDEX idx_orders_status ON orders(status)", - &empty_schema(), + &empty_annotated().view(), Some(&pg17()), ) .unwrap(); @@ -585,7 +618,7 @@ mod tests { fn create_index_concurrently_safe() { let checks = check_migration( "CREATE INDEX CONCURRENTLY idx_orders_status ON orders(status)", - &empty_schema(), + &empty_annotated().view(), Some(&pg17()), ) .unwrap(); @@ -602,7 +635,7 @@ mod tests { }; let checks = check_migration( "ALTER TABLE orders ALTER COLUMN status SET NOT NULL", - &empty_schema(), + &empty_annotated().view(), Some(&pg12), ) .unwrap(); @@ -616,7 +649,7 @@ mod tests { fn alter_column_type_dangerous() { let checks = check_migration( "ALTER TABLE orders ALTER COLUMN id TYPE bigint", - &empty_schema(), + &empty_annotated().view(), Some(&pg17()), ) .unwrap(); @@ -628,7 +661,7 @@ mod tests { fn drop_column_safe() { let checks = check_migration( "ALTER TABLE orders DROP COLUMN legacy", - &empty_schema(), + &empty_annotated().view(), Some(&pg17()), ) .unwrap(); @@ -640,7 +673,7 @@ mod tests { fn includes_table_size() { let checks = check_migration( "ALTER TABLE orders ADD COLUMN x text", - &empty_schema(), + &empty_annotated().view(), Some(&pg17()), ) .unwrap(); diff --git a/crates/dry_run_core/src/query/plan_warnings.rs 
b/crates/dry_run_core/src/query/plan_warnings.rs index 9be8b5a..6e11153 100644 --- a/crates/dry_run_core/src/query/plan_warnings.rs +++ b/crates/dry_run_core/src/query/plan_warnings.rs @@ -1,32 +1,50 @@ use super::explain::PlanWarning; use super::plan::PlanNode; use crate::jit; -use crate::schema::SchemaSnapshot; +use crate::schema::{AnnotatedSchema, QualifiedName, SchemaSnapshot}; const SEQ_SCAN_ROW_THRESHOLD: f64 = 5_000.0; -pub fn detect_plan_warnings(plan: &PlanNode, schema: Option<&SchemaSnapshot>) -> Vec { +// Plan warnings — walks an EXPLAIN tree and surfaces patterns worth +// flagging (large seq scans, nested-loop antipatterns, missing partition +// pruning, materialized CTEs). +// +// Schema reference is `Option<&AnnotatedSchema>` because warnings work +// just fine without one — the plan itself usually has all the info we +// need. The schema unlocks two refinements: +// - `detect_seq_scan_large_table` falls back to planner reltuples when +// the plan's own `plan_rows` is zero (some EXPLAIN paths emit that). +// - partition / CTE warnings need the DDL to know which tables are +// partitioned. They read `annotated.schema` directly. 
+pub fn detect_plan_warnings( + plan: &PlanNode, + annotated: Option<&AnnotatedSchema<'_>>, +) -> Vec { let mut warnings = Vec::new(); - walk_plan(plan, schema, &mut warnings); + walk_plan(plan, annotated, &mut warnings); warnings } -fn walk_plan(node: &PlanNode, schema: Option<&SchemaSnapshot>, warnings: &mut Vec) { - detect_seq_scan_large_table(node, schema, warnings); +fn walk_plan( + node: &PlanNode, + annotated: Option<&AnnotatedSchema<'_>>, + warnings: &mut Vec, +) { + detect_seq_scan_large_table(node, annotated, warnings); detect_nested_loop_seq_scan(node, warnings); detect_sort_without_index(node, warnings); detect_high_rows_removed(node, warnings); - detect_partition_pruning_issues(node, schema, warnings); - detect_cte_materialized(node, schema, warnings); + detect_partition_pruning_issues(node, annotated.map(|a| a.schema), warnings); + detect_cte_materialized(node, annotated.map(|a| a.schema), warnings); for child in &node.children { - walk_plan(child, schema, warnings); + walk_plan(child, annotated, warnings); } } fn detect_seq_scan_large_table( node: &PlanNode, - schema: Option<&SchemaSnapshot>, + annotated: Option<&AnnotatedSchema<'_>>, warnings: &mut Vec, ) { if node.node_type != "Seq Scan" { @@ -38,16 +56,16 @@ fn detect_seq_scan_large_table( None => return, }; + // Prefer the plan's own row estimate; fall back to planner reltuples + // when it's zero (some EXPLAIN modes don't emit it). When neither is + // available we treat the row count as zero, which suppresses the + // warning — better silent than wrong. 
let row_count = if node.plan_rows > 0.0 { node.plan_rows - } else if let Some(schema) = schema { + } else if let Some(annotated) = annotated { let schema_name = node.schema.as_deref().unwrap_or("public"); - schema - .tables - .iter() - .find(|t| t.name == *table_name && t.schema == schema_name) - .and_then(|t| t.stats.as_ref()) - .map(|s| s.reltuples) + annotated + .reltuples(&QualifiedName::new(schema_name, table_name)) .unwrap_or(0.0) } else { 0.0 @@ -355,7 +373,6 @@ mod tests { constraints: vec![], indexes: vec![], comment: None, - stats: None, partition_info: Some(PartitionInfo { strategy: PartitionStrategy::Range, key: "created_at".into(), @@ -394,13 +411,23 @@ mod tests { functions: vec![], extensions: vec![], gucs: vec![], - node_stats: vec![], + } + } + + // Wrap a bare schema in an empty annotated bundle — partition / CTE + // tests don't need stats, just DDL. + fn ddl_view(schema: &SchemaSnapshot) -> AnnotatedSchema<'_> { + AnnotatedSchema { + schema, + planner: None, + merged: None, } } #[test] fn no_pruning_warns() { let schema = partitioned_schema(); + let view = ddl_view(&schema); // Append scanning all 4 partitions, no SubplansRemoved let plan = PlanNode { node_type: "Append".into(), @@ -412,7 +439,7 @@ mod tests { ], ..make_seq_scan("", 0.0) }; - let warnings = detect_plan_warnings(&plan, Some(&schema)); + let warnings = detect_plan_warnings(&plan, Some(&view)); assert!( warnings .iter() @@ -423,6 +450,7 @@ mod tests { #[test] fn good_pruning_no_warning() { let schema = partitioned_schema(); + let view = ddl_view(&schema); // Only 1 partition scanned, 3 pruned let plan = PlanNode { node_type: "Append".into(), @@ -430,7 +458,7 @@ mod tests { children: vec![make_seq_scan("orders_q1", 1000.0)], ..make_seq_scan("", 0.0) }; - let warnings = detect_plan_warnings(&plan, Some(&schema)); + let warnings = detect_plan_warnings(&plan, Some(&view)); assert!( !warnings .iter() @@ -441,6 +469,7 @@ mod tests { #[test] fn partial_pruning_info() { let schema = 
partitioned_schema(); + let view = ddl_view(&schema); // 3 partitions still scanned but 1 pruned — scanning > half let plan = PlanNode { node_type: "Append".into(), @@ -452,7 +481,7 @@ mod tests { ], ..make_seq_scan("", 0.0) }; - let warnings = detect_plan_warnings(&plan, Some(&schema)); + let warnings = detect_plan_warnings(&plan, Some(&view)); assert!( warnings .iter() diff --git a/crates/dry_run_core/src/query/suggest.rs b/crates/dry_run_core/src/query/suggest.rs index 7f06c38..a368e5a 100644 --- a/crates/dry_run_core/src/query/suggest.rs +++ b/crates/dry_run_core/src/query/suggest.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use super::parse::parse_sql; use super::plan::PlanNode; use crate::error::Result; -use crate::schema::{SchemaSnapshot, Table, effective_table_stats}; +use crate::schema::{AnnotatedSchema, QualifiedName, Table}; use crate::version::PgVersion; #[derive(Debug, Clone, Serialize, Deserialize)] @@ -20,7 +20,7 @@ pub struct IndexSuggestion { pub(crate) fn suggest_index( sql: &str, - schema: &SchemaSnapshot, + annotated: &AnnotatedSchema<'_>, plan: Option<&PlanNode>, _pg_version: Option<&PgVersion>, ) -> Result> { @@ -28,20 +28,21 @@ pub(crate) fn suggest_index( let mut suggestions = Vec::new(); if let Some(plan) = plan { - suggest_from_plan(plan, schema, &mut suggestions); + suggest_from_plan(plan, annotated, &mut suggestions); } - suggest_from_query_structure(&parsed, schema, &mut suggestions); + suggest_from_query_structure(&parsed, annotated, &mut suggestions); dedup_suggestions(&mut suggestions); Ok(suggestions) } -// plan-based suggestions - +// Plan-based suggestions — walks an EXPLAIN plan tree looking for +// patterns that an index could fix. Reads only DDL plus reltuples (for +// the "is this table large enough to bother" cutoff). 
fn suggest_from_plan( node: &PlanNode, - schema: &SchemaSnapshot, + annotated: &AnnotatedSchema<'_>, suggestions: &mut Vec, ) { if node.node_type == "Seq Scan" @@ -49,7 +50,8 @@ fn suggest_from_plan( && let Some(table_name) = &node.relation_name { let schema_name = node.schema.as_deref().unwrap_or("public"); - let table = schema + let table = annotated + .schema .tables .iter() .find(|t| t.name == *table_name && t.schema == schema_name); @@ -111,15 +113,19 @@ fn suggest_from_plan( } for child in &node.children { - suggest_from_plan(child, schema, suggestions); + suggest_from_plan(child, annotated, suggestions); } } -// query structure-based suggestions - +// Query-structure-based suggestions — uses the parsed SQL to spot +// WHERE-clause filter columns on large tables that lack a leading index. +// +// "Large" is gated on planner reltuples; tables under the threshold or +// without any planner snapshot at all are silently skipped — there's no +// useful suggestion to make in those cases. fn suggest_from_query_structure( parsed: &super::parse::ParsedQuery, - schema: &SchemaSnapshot, + annotated: &AnnotatedSchema<'_>, suggestions: &mut Vec, ) { for (alias, col_name) in &parsed.info.filter_columns { @@ -137,22 +143,24 @@ fn suggest_from_query_structure( if let Some(table_ref) = table_ref { let schema_name = table_ref.schema.as_deref().unwrap_or("public"); - let table = schema + let table = annotated + .schema .tables .iter() .find(|t| t.name == table_ref.name && t.schema == schema_name); if let Some(table) = table { - let effective_stats = effective_table_stats(table, schema); - let is_large = effective_stats - .as_ref() - .is_some_and(|s| s.reltuples >= 1000.0); + let qn = QualifiedName::new(&table.schema, &table.name); + // Reltuples is the only stat this rule needs — comes + // from the planner snapshot (always None on a fresh + // project, in which case we skip). 
+ let reltuples = annotated.reltuples(&qn).unwrap_or(0.0); + let is_large = reltuples >= 1000.0; if is_large && !has_leading_index(Some(table), col_name) { let idx_type = choose_index_type(Some(table), col_name); let qualified = format!("{}.{}", table.schema, table.name); let idx_name = format!("idx_{}_{col_name}", table.name); - let reltuples = effective_stats.as_ref().map(|s| s.reltuples).unwrap_or(0.0); suggestions.push(IndexSuggestion { table: qualified.clone(), @@ -245,8 +253,41 @@ fn dedup_suggestions(suggestions: &mut Vec) { mod tests { use chrono::Utc; + use std::collections::BTreeMap; + use super::*; use crate::schema::*; + use crate::schema::{AnnotatedSnapshot, PlannerStatsSnapshot, TableSizing, TableSizingEntry}; + + // Build a stats-bearing AnnotatedSnapshot — wraps the legacy + // `test_schema()` fixture and bolts on a planner snapshot with the + // reltuples each test relies on. `with_size` lets the small-table + // case override the row count without hand-rolling another schema. 
+ fn test_annotated(reltuples: f64) -> AnnotatedSnapshot { + AnnotatedSnapshot { + schema: test_schema(), + planner: Some(PlannerStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "test".into(), + timestamp: Utc::now(), + content_hash: "ph".into(), + schema_ref_hash: "sh".into(), + tables: vec![TableSizingEntry { + table: QualifiedName::new("public", "users"), + sizing: TableSizing { + reltuples, + relpages: 6250, + table_size: 50_000_000, + total_size: None, + index_size: None, + }, + }], + columns: vec![], + indexes: vec![], + }), + activity_by_node: BTreeMap::new(), + } + } fn test_schema() -> SchemaSnapshot { SchemaSnapshot { @@ -270,7 +311,6 @@ mod tests { generated: None, comment: None, statistics_target: None, - stats: None, }, Column { name: "email".into(), @@ -282,7 +322,6 @@ mod tests { generated: None, comment: None, statistics_target: None, - stats: None, }, Column { name: "data".into(), @@ -294,24 +333,12 @@ mod tests { generated: None, comment: None, statistics_target: None, - stats: None, }, ], constraints: vec![], indexes: vec![], comment: None, - stats: Some(TableStats { - reltuples: 500_000.0, - relpages: 6250, - dead_tuples: 0, - last_vacuum: None, - last_autovacuum: None, - last_analyze: None, - last_autoanalyze: None, - seq_scan: 0, - idx_scan: 0, - table_size: 50_000_000, - }), + // Stats now live in PlannerStatsSnapshot — see test_annotated. 
partition_info: None, policies: vec![], triggers: vec![], @@ -325,16 +352,15 @@ mod tests { functions: vec![], extensions: vec![], gucs: vec![], - node_stats: vec![], } } #[test] fn suggest_from_where_clause() { - let schema = test_schema(); + let snap = test_annotated(500_000.0); let suggestions = suggest_index( "SELECT * FROM users WHERE email = 'test@example.com'", - &schema, + &snap.view(), None, None, ) @@ -348,10 +374,10 @@ mod tests { #[test] fn suggest_gin_for_jsonb() { - let schema = test_schema(); + let snap = test_annotated(500_000.0); let suggestions = suggest_index( "SELECT * FROM users u WHERE u.data = '{}'", - &schema, + &snap.view(), None, None, ) @@ -365,16 +391,41 @@ mod tests { #[test] fn no_suggestion_for_small_table() { - let mut schema = test_schema(); - schema.tables[0].stats.as_mut().unwrap().reltuples = 50.0; - let suggestions = - suggest_index("SELECT * FROM users WHERE email = 'x'", &schema, None, None).unwrap(); + // Tiny reltuples (< 1000) → suggest_from_query_structure short-circuits. + let snap = test_annotated(50.0); + let suggestions = suggest_index( + "SELECT * FROM users WHERE email = 'x'", + &snap.view(), + None, + None, + ) + .unwrap(); + assert!(suggestions.is_empty()); + } + + #[test] + fn no_suggestion_when_planner_absent() { + // Degradation case: no planner → reltuples returns None → 0.0 → + // is_large is false → no suggestion. Pins the new "no data → no + // suggestions" path. 
+ let snap = AnnotatedSnapshot { + schema: test_schema(), + planner: None, + activity_by_node: BTreeMap::new(), + }; + let suggestions = suggest_index( + "SELECT * FROM users WHERE email = 'x'", + &snap.view(), + None, + None, + ) + .unwrap(); assert!(suggestions.is_empty()); } #[test] fn no_duplicate_suggestions() { - let schema = test_schema(); + let snap = test_annotated(500_000.0); let plan = PlanNode { node_type: "Seq Scan".into(), relation_name: Some("users".into()), @@ -405,7 +456,7 @@ mod tests { }; let suggestions = suggest_index( "SELECT * FROM users WHERE email = 'test@example.com'", - &schema, + &snap.view(), Some(&plan), None, ) diff --git a/crates/dry_run_core/src/query/validate.rs b/crates/dry_run_core/src/query/validate.rs index 6401e38..05639dd 100644 --- a/crates/dry_run_core/src/query/validate.rs +++ b/crates/dry_run_core/src/query/validate.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use super::antipatterns::detect_antipatterns; use super::parse::{ParsedQuery, ReferencedTable, parse_sql}; use crate::error::Result; -use crate::schema::SchemaSnapshot; +use crate::schema::{AnnotatedSchema, SchemaSnapshot}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ValidationResult { @@ -34,11 +34,17 @@ pub struct ResolvedStar { pub columns: Vec, } -pub fn validate_query(sql: &str, schema: &SchemaSnapshot) -> Result { +// Top-level validation entry point — combines existence checks (DDL only) +// with anti-pattern detection (mostly DDL, one stats-aware rule). Takes +// the annotated view so anti-pattern rules can reach planner stats; the +// existence-check sub-helpers borrow `annotated.schema` directly since +// they need nothing from planner / activity. 
+pub fn validate_query(sql: &str, annotated: &AnnotatedSchema<'_>) -> Result { let parsed = parse_sql(sql)?; let mut errors = Vec::new(); let mut warnings = Vec::new(); let mut resolved_star = Vec::new(); + let schema = annotated.schema; // check each referenced table exists for table_ref in &parsed.info.tables { @@ -83,7 +89,7 @@ pub fn validate_query(sql: &str, schema: &SchemaSnapshot) -> Result Option { - let stats = index.stats.as_ref()?; +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BloatedIndexEntry { + pub schema: String, + pub table: String, + pub index_name: String, + pub bloat_ratio: f64, + pub actual_pages: i64, + pub expected_pages: i64, + pub size_bytes: i64, + pub definition: String, +} + +pub fn detect_bloated_indexes( + annotated: &AnnotatedSchema<'_>, + threshold: f64, +) -> Vec { + let mut entries = Vec::new(); + + for table in &annotated.schema.tables { + for idx in &table.indexes { + let qn = QualifiedName::new(&table.schema, &idx.name); + let sizing = annotated.index_sizing(&qn); + if let Some(est) = estimate_index_bloat(idx, sizing, table) + && est.bloat_ratio > threshold + { + entries.push(BloatedIndexEntry { + schema: table.schema.clone(), + table: table.name.clone(), + index_name: idx.name.clone(), + bloat_ratio: est.bloat_ratio, + actual_pages: est.actual_pages, + expected_pages: est.expected_pages, + size_bytes: sizing.map(|s| s.size).unwrap_or(0), + definition: idx.definition.clone(), + }); + } + } + } + + entries.sort_by(|a, b| { + b.bloat_ratio + .partial_cmp(&a.bloat_ratio) + .unwrap_or(std::cmp::Ordering::Equal) + }); + entries +} + +pub fn estimate_index_bloat( + index: &Index, + sizing: Option<&IndexSizing>, + table: &Table, +) -> Option { + let s = sizing?; estimate_index_bloat_from_stats( - stats.reltuples, - stats.relpages, + s.reltuples, + s.relpages, &index.columns, table, &index.index_type, @@ -114,7 +166,7 @@ fn lookup_type_width(type_name: &str) -> usize { #[cfg(test)] mod tests { use super::*; - use 
crate::schema::{Column, IndexStats}; + use crate::schema::Column; fn make_table_with_cols(cols: Vec<(&str, &str)>) -> Table { Table { @@ -134,13 +186,11 @@ mod tests { generated: None, comment: None, statistics_target: None, - stats: None, }) .collect(), constraints: vec![], indexes: vec![], comment: None, - stats: None, partition_info: None, policies: vec![], triggers: vec![], @@ -175,11 +225,9 @@ mod tests { assert_eq!(lookup_type_width("unknown_type"), DEFAULT_WIDTH); } - #[test] - fn bloat_from_index() { - let table = make_table_with_cols(vec![("id", "bigint")]); - let idx = Index { - name: "test_pkey".into(), + fn bare_index(name: &str) -> Index { + Index { + name: name.into(), columns: vec!["id".into()], include_columns: vec![], index_type: "btree".into(), @@ -189,16 +237,26 @@ mod tests { backs_constraint: false, predicate: None, definition: String::new(), - stats: Some(IndexStats { - idx_scan: 1000, - idx_tup_read: 5000, - idx_tup_fetch: 4000, - size: 8192 * 500, - relpages: 500, - reltuples: 100_000.0, - }), + } + } + + #[test] + fn bloat_estimated_when_index_sizing_present() { + let table = make_table_with_cols(vec![("id", "bigint")]); + let idx = bare_index("test_pkey"); + let sizing = IndexSizing { + size: 8192 * 500, + relpages: 500, + reltuples: 100_000.0, }; - let est = estimate_index_bloat(&idx, &table); + let est = estimate_index_bloat(&idx, Some(&sizing), &table); assert!(est.is_some()); } + + #[test] + fn bloat_returns_none_without_sizing() { + let table = make_table_with_cols(vec![("id", "bigint")]); + let idx = bare_index("test_pkey"); + assert!(estimate_index_bloat(&idx, None, &table).is_none()); + } } diff --git a/crates/dry_run_core/src/schema/hash.rs b/crates/dry_run_core/src/schema/hash.rs index 820cd55..169ebfe 100644 --- a/crates/dry_run_core/src/schema/hash.rs +++ b/crates/dry_run_core/src/schema/hash.rs @@ -87,3 +87,173 @@ fn hex_encode(bytes: impl AsRef<[u8]>) -> String { s }) } + +#[cfg(test)] +mod tests { + use super::*; + + fn 
empty_table(schema: &str, name: &str) -> Table { + Table { + oid: 1, + schema: schema.into(), + name: name.into(), + columns: vec![Column { + name: "id".into(), + ordinal: 1, + type_name: "int4".into(), + nullable: false, + default: None, + identity: None, + generated: None, + comment: None, + statistics_target: None, + }], + constraints: vec![], + indexes: vec![Index { + name: format!("{name}_pkey"), + columns: vec!["id".into()], + include_columns: vec![], + index_type: "btree".into(), + is_unique: true, + is_primary: true, + predicate: None, + definition: format!("CREATE UNIQUE INDEX {name}_pkey ON {schema}.{name} (id)"), + is_valid: true, + backs_constraint: true, + }], + comment: None, + partition_info: None, + policies: vec![], + triggers: vec![], + reloptions: vec![], + rls_enabled: false, + } + } + + fn input_for<'a>(tables: &'a [Table]) -> HashInput<'a> { + HashInput { + pg_version: "PostgreSQL 17.0", + tables, + enums: &[], + domains: &[], + composites: &[], + views: &[], + functions: &[], + extensions: &[], + } + } + + #[test] + fn content_hash_changes_when_ddl_changes() { + let a = empty_table("public", "orders"); + let b = empty_table("public", "orders_v2"); + assert_ne!( + compute_content_hash(&input_for(&[a])), + compute_content_hash(&input_for(&[b])), + ); + } + + #[test] + fn content_hash_changes_when_column_added() { + let a = empty_table("public", "orders"); + let mut b = empty_table("public", "orders"); + b.columns.push(Column { + name: "total".into(), + ordinal: 2, + type_name: "numeric".into(), + nullable: true, + default: None, + identity: None, + generated: None, + comment: None, + statistics_target: None, + }); + assert_ne!( + compute_content_hash(&input_for(&[a])), + compute_content_hash(&input_for(&[b])), + ); + } + + #[test] + fn content_hash_changes_when_column_type_changes() { + let a = empty_table("public", "orders"); + let mut b = empty_table("public", "orders"); + b.columns[0].type_name = "int8".into(); + assert_ne!( + 
compute_content_hash(&input_for(&[a])), + compute_content_hash(&input_for(&[b])), + ); + } + + #[test] + fn content_hash_changes_when_column_nullability_changes() { + let a = empty_table("public", "orders"); + let mut b = empty_table("public", "orders"); + b.columns[0].nullable = !b.columns[0].nullable; + assert_ne!( + compute_content_hash(&input_for(&[a])), + compute_content_hash(&input_for(&[b])), + ); + } + + #[test] + fn content_hash_changes_when_index_added() { + let a = empty_table("public", "orders"); + let mut b = empty_table("public", "orders"); + b.indexes.push(Index { + name: "orders_id_idx".into(), + columns: vec!["id".into()], + include_columns: vec![], + index_type: "btree".into(), + is_unique: false, + is_primary: false, + predicate: None, + definition: "CREATE INDEX orders_id_idx ON public.orders (id)".into(), + is_valid: true, + backs_constraint: false, + }); + assert_ne!( + compute_content_hash(&input_for(&[a])), + compute_content_hash(&input_for(&[b])), + ); + } + + #[test] + fn content_hash_changes_when_pg_version_changes() { + let t = empty_table("public", "orders"); + let tables = vec![t]; + let mut a = input_for(&tables); + let mut b = input_for(&tables); + a.pg_version = "PostgreSQL 16.4"; + b.pg_version = "PostgreSQL 17.0"; + assert_ne!(compute_content_hash(&a), compute_content_hash(&b)); + } + + #[test] + fn content_hash_changes_when_enum_added() { + let tables: Vec = vec![]; + let no_enums = HashInput { + pg_version: "PostgreSQL 17.0", + tables: &tables, + enums: &[], + domains: &[], + composites: &[], + views: &[], + functions: &[], + extensions: &[], + }; + let with_enum_vec = vec![EnumType { + schema: "public".into(), + name: "order_status".into(), + labels: vec!["new".into(), "shipped".into()], + }]; + let with_enum = HashInput { + enums: &with_enum_vec, + ..no_enums + }; + assert_ne!( + compute_content_hash(&no_enums), + compute_content_hash(&with_enum), + ); + } +} diff --git a/crates/dry_run_core/src/schema/inject.rs 
b/crates/dry_run_core/src/schema/inject.rs deleted file mode 100644 index b12accb..0000000 --- a/crates/dry_run_core/src/schema/inject.rs +++ /dev/null @@ -1,770 +0,0 @@ -use sqlx::PgPool; -use tracing::info; - -use crate::error::{Error, Result}; -use crate::schema::types::{ColumnStats, IndexStats, NodeStats, SchemaSnapshot, TableStats}; - -#[derive(Debug)] -pub struct ApplyResult { - pub tables_updated: usize, - pub indexes_updated: usize, - pub columns_injected: usize, - pub skipped: Vec, - pub regresql_loaded: bool, -} - -/// Resolved stats ready for injection — flat lists with schema-qualified names. -#[derive(Debug)] -struct ResolvedStats { - tables: Vec<(String, String, TableStats)>, - indexes: Vec<(String, String, String, IndexStats)>, - columns: Vec<(String, String, String, String, ColumnStats)>, // schema, table, column, type_name, stats -} - -/// Column metadata from the target database. -struct ColumnMeta { - attrelid: i64, - attnum: i16, - type_name: String, - eq_opr: Option, - lt_opr: Option, -} - -/// Apply captured statistics from a SchemaSnapshot to a target PostgreSQL database. -/// -/// When `node` is Some, uses that specific node's stats from node_stats. -/// When None, uses single node_stats entry or falls back to inline stats. 
-pub async fn apply_stats( - pool: &PgPool, - snapshot: &SchemaSnapshot, - node: Option<&str>, -) -> Result { - check_inject_privileges(pool).await?; - let regresql_loaded = check_regresql(pool).await; - let resolved = resolve_stats(snapshot, node)?; - - let mut result = ApplyResult { - tables_updated: 0, - indexes_updated: 0, - columns_injected: 0, - skipped: Vec::new(), - regresql_loaded, - }; - - let mut tx = pool - .begin() - .await - .map_err(|e| Error::StatsInjection(format!("failed to begin transaction: {e}")))?; - - // phase 1: pg_class for tables - for (schema, table, stats) in &resolved.tables { - match update_pg_class(&mut tx, schema, table, "r", stats.reltuples, stats.relpages).await { - Ok(true) => result.tables_updated += 1, - Ok(false) => { - result - .skipped - .push(format!("{schema}.{table}: not found on target")); - } - Err(e) => { - result.skipped.push(format!("{schema}.{table}: {e}")); - } - } - } - - // phase 2: pg_class for indexes - for (schema, _table, index_name, stats) in &resolved.indexes { - match update_pg_class( - &mut tx, - schema, - index_name, - "i", - stats.reltuples, - stats.relpages, - ) - .await - { - Ok(true) => result.indexes_updated += 1, - Ok(false) => { - result - .skipped - .push(format!("index {schema}.{index_name}: not found on target")); - } - Err(e) => { - result - .skipped - .push(format!("index {schema}.{index_name}: {e}")); - } - } - } - - // phase 3: pg_statistic for columns - for (schema, table, column, type_name, stats) in &resolved.columns { - let meta = match lookup_column_meta(&mut tx, schema, table, column).await { - Ok(Some(m)) => m, - Ok(None) => { - result.skipped.push(format!( - "{schema}.{table}.{column}: column not found on target" - )); - continue; - } - Err(e) => { - result - .skipped - .push(format!("{schema}.{table}.{column}: {e}")); - continue; - } - }; - - // validate the type can be used for casting on target - let resolved_type = match validate_type_name(&mut tx, type_name).await { - 
Ok(Some(t)) => t, - Ok(None) => { - result.skipped.push(format!( - "{schema}.{table}.{column}: type '{type_name}' not recognized on target" - )); - continue; - } - Err(e) => { - result.skipped.push(format!( - "{schema}.{table}.{column}: type validation failed: {e}" - )); - continue; - } - }; - - let meta = ColumnMeta { - type_name: resolved_type, - ..meta - }; - - match inject_column_stats(&mut tx, &meta, stats).await { - Ok(true) => result.columns_injected += 1, - Ok(false) => { - result - .skipped - .push(format!("{schema}.{table}.{column}: no stats to inject")); - } - Err(e) => { - result - .skipped - .push(format!("{schema}.{table}.{column}: {e}")); - } - } - } - - tx.commit() - .await - .map_err(|e| Error::StatsInjection(format!("failed to commit: {e}")))?; - - info!( - tables = result.tables_updated, - indexes = result.indexes_updated, - columns = result.columns_injected, - skipped = result.skipped.len(), - "stats injection complete" - ); - - Ok(result) -} - -/// Check whether pg_regresql extension is loaded. Returns true if loaded. 
-async fn check_regresql(pool: &PgPool) -> bool { - // check if the extension is available at all - let available: bool = sqlx::query_scalar( - "SELECT EXISTS (SELECT 1 FROM pg_available_extensions WHERE name = 'pg_regresql')", - ) - .fetch_one(pool) - .await - .unwrap_or(false); - - if !available { - // maybe it's loaded directly (not installed as extension but via LOAD) - // check shared_preload_libraries - let spl: String = sqlx::query_scalar("SHOW shared_preload_libraries") - .fetch_one(pool) - .await - .unwrap_or_default(); - - if spl.contains("pg_regresql") { - return true; - } - - return false; - } - - // check if actually loaded — the extension registers a hook, which we can detect - // by checking if it appears in shared_preload_libraries or was LOADed - let spl: String = sqlx::query_scalar("SHOW shared_preload_libraries") - .fetch_one(pool) - .await - .unwrap_or_default(); - - if spl.contains("pg_regresql") { - return true; - } - - // also check if it's been CREATE EXTENSION'd (it might auto-load via session_preload_libraries) - let created: bool = sqlx::query_scalar( - "SELECT EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_regresql')", - ) - .fetch_one(pool) - .await - .unwrap_or(false); - - created -} - -fn resolve_stats(snapshot: &SchemaSnapshot, node: Option<&str>) -> Result { - if let Some(node_name) = node { - // explicit node selection - let ns = snapshot - .node_stats - .iter() - .find(|n| n.source == node_name) - .ok_or_else(|| { - let available: Vec<&str> = snapshot - .node_stats - .iter() - .map(|n| n.source.as_str()) - .collect(); - Error::StatsInjection(format!( - "node '{}' not found. 
Available: {}", - node_name, - if available.is_empty() { - "(none)".to_string() - } else { - available.join(", ") - } - )) - })?; - return Ok(resolve_from_node_stats(ns, snapshot)); - } - - if snapshot.node_stats.len() == 1 { - return Ok(resolve_from_node_stats(&snapshot.node_stats[0], snapshot)); - } - - if !snapshot.node_stats.is_empty() { - let available: Vec<&str> = snapshot - .node_stats - .iter() - .map(|n| n.source.as_str()) - .collect(); - return Err(Error::StatsInjection(format!( - "multiple node stats found ({}). Use --node to select one: {}", - snapshot.node_stats.len(), - available.join(", ") - ))); - } - - // fallback: inline stats from tables/indexes/columns - Ok(resolve_from_inline(snapshot)) -} - -fn resolve_from_node_stats(ns: &NodeStats, snapshot: &SchemaSnapshot) -> ResolvedStats { - let tables: Vec<_> = ns - .table_stats - .iter() - .map(|t| (t.schema.clone(), t.table.clone(), t.stats.clone())) - .collect(); - - let indexes: Vec<_> = ns - .index_stats - .iter() - .map(|i| { - ( - i.schema.clone(), - i.table.clone(), - i.index_name.clone(), - i.stats.clone(), - ) - }) - .collect(); - - // for column stats from node_stats, we need type_name from the snapshot - let columns: Vec<_> = ns - .column_stats - .iter() - .filter_map(|cs| { - let type_name = find_column_type(snapshot, &cs.schema, &cs.table, &cs.column)?; - Some(( - cs.schema.clone(), - cs.table.clone(), - cs.column.clone(), - type_name, - cs.stats.clone(), - )) - }) - .collect(); - - ResolvedStats { - tables, - indexes, - columns, - } -} - -fn resolve_from_inline(snapshot: &SchemaSnapshot) -> ResolvedStats { - let mut tables = Vec::new(); - let mut indexes = Vec::new(); - let mut columns = Vec::new(); - - for table in &snapshot.tables { - if let Some(ref ts) = table.stats { - tables.push((table.schema.clone(), table.name.clone(), ts.clone())); - } - for idx in &table.indexes { - if let Some(ref is) = idx.stats { - indexes.push(( - table.schema.clone(), - table.name.clone(), - 
idx.name.clone(), - is.clone(), - )); - } - } - for col in &table.columns { - if let Some(ref cs) = col.stats { - columns.push(( - table.schema.clone(), - table.name.clone(), - col.name.clone(), - col.type_name.clone(), - cs.clone(), - )); - } - } - } - - ResolvedStats { - tables, - indexes, - columns, - } -} - -fn find_column_type( - snapshot: &SchemaSnapshot, - schema: &str, - table: &str, - column: &str, -) -> Option { - snapshot - .tables - .iter() - .find(|t| t.schema == schema && t.name == table)? - .columns - .iter() - .find(|c| c.name == column) - .map(|c| c.type_name.clone()) -} - -async fn check_inject_privileges(pool: &PgPool) -> Result<()> { - let has_privs: bool = sqlx::query_scalar( - "SELECT has_table_privilege(current_user, 'pg_catalog.pg_statistic', 'INSERT') \ - AND has_table_privilege(current_user, 'pg_catalog.pg_class', 'UPDATE')", - ) - .fetch_one(pool) - .await - .map_err(|e| Error::StatsInjection(format!("privilege check failed: {e}")))?; - - if !has_privs { - return Err(Error::Privilege( - "need INSERT on pg_statistic and UPDATE on pg_class (requires superuser or table owner)" - .to_string(), - )); - } - - Ok(()) -} - -async fn update_pg_class( - tx: &mut sqlx::Transaction<'_, sqlx::Postgres>, - schema: &str, - name: &str, - relkind: &str, - reltuples: f64, - relpages: i64, -) -> Result { - let reltuples_f32 = reltuples as f32; - let relpages_i32 = relpages as i32; - - let result = sqlx::query( - "UPDATE pg_catalog.pg_class \ - SET reltuples = $1::real, relpages = $2::int \ - WHERE oid = ( \ - SELECT c.oid FROM pg_catalog.pg_class c \ - JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace \ - WHERE n.nspname = $3 AND c.relname = $4 \ - AND c.relkind = ANY(CASE WHEN $5 = 'i' THEN ARRAY['i'] ELSE ARRAY['r','p'] END) \ - )", - ) - .bind(reltuples_f32) - .bind(relpages_i32) - .bind(schema) - .bind(name) - .bind(relkind) - .execute(&mut **tx) - .await?; - - Ok(result.rows_affected() > 0) -} - -async fn lookup_column_meta( - tx: &mut 
sqlx::Transaction<'_, sqlx::Postgres>, - schema: &str, - table: &str, - column: &str, -) -> Result> { - type ColumnMetaRow = (i64, i16, i64, Option, Option); - let row: Option = sqlx::query_as( - "SELECT a.attrelid::bigint, \ - a.attnum::smallint, \ - a.atttypid::bigint, \ - (SELECT o.oid::bigint FROM pg_catalog.pg_operator o \ - WHERE o.oprname = '=' AND o.oprleft = a.atttypid AND o.oprright = a.atttypid \ - AND o.oprnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'pg_catalog') \ - LIMIT 1), \ - (SELECT o.oid::bigint FROM pg_catalog.pg_operator o \ - WHERE o.oprname = '<' AND o.oprleft = a.atttypid AND o.oprright = a.atttypid \ - AND o.oprnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'pg_catalog') \ - LIMIT 1) \ - FROM pg_catalog.pg_attribute a \ - JOIN pg_catalog.pg_class c ON c.oid = a.attrelid \ - JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace \ - WHERE n.nspname = $1 AND c.relname = $2 AND a.attname = $3 \ - AND a.attnum > 0 AND NOT a.attisdropped", - ) - .bind(schema) - .bind(table) - .bind(column) - .fetch_optional(&mut **tx) - .await?; - - Ok( - row.map(|(attrelid, attnum, _atttypid, eq_opr, lt_opr)| ColumnMeta { - attrelid, - attnum, - type_name: String::new(), // filled in by caller after validate_type_name - eq_opr, - lt_opr, - }), - ) -} - -/// Validate a type name against the target database, returning normalized form. -async fn validate_type_name( - tx: &mut sqlx::Transaction<'_, sqlx::Postgres>, - type_name: &str, -) -> Result> { - let result: Option<(String,)> = sqlx::query_as(&format!( - "SELECT '{}'::regtype::text", - type_name.replace('\'', "''") - )) - .fetch_optional(&mut **tx) - .await - .ok() - .flatten(); - - Ok(result.map(|(t,)| t)) -} - -/// Inject column statistics into pg_statistic. Returns true if a row was inserted. 
-async fn inject_column_stats( - tx: &mut sqlx::Transaction<'_, sqlx::Postgres>, - meta: &ColumnMeta, - stats: &ColumnStats, -) -> Result { - // need at least something to inject - let has_anything = stats.null_frac.is_some() - || stats.n_distinct.is_some() - || stats.most_common_vals.is_some() - || stats.histogram_bounds.is_some() - || stats.correlation.is_some(); - - if !has_anything { - return Ok(false); - } - - // delete existing row - sqlx::query( - "DELETE FROM pg_catalog.pg_statistic \ - WHERE starelid = $1::oid AND staattnum = $2::smallint AND stainherit = false", - ) - .bind(meta.attrelid as i32) - .bind(meta.attnum) - .execute(&mut **tx) - .await?; - - let null_frac = stats.null_frac.unwrap_or(0.0) as f32; - let n_distinct = stats.n_distinct.unwrap_or(0.0) as f32; - - // build slot assignments - let mut slot_kinds = [0i16; 5]; - let mut slot_ops = [0i64; 5]; - let mut slot_numbers: [Option; 5] = [None, None, None, None, None]; - let mut slot_values: [Option; 5] = [None, None, None, None, None]; - - let mut slot_idx = 0; - - // MCV slot (stakind = 1) - if let (Some(mcv_vals), Some(mcv_freqs)) = (&stats.most_common_vals, &stats.most_common_freqs) - && let Some(eq_op) = meta.eq_opr - { - slot_kinds[slot_idx] = 1; - slot_ops[slot_idx] = eq_op; - slot_numbers[slot_idx] = Some(mcv_freqs.clone()); - slot_values[slot_idx] = Some(mcv_vals.clone()); - slot_idx += 1; - } - - // Histogram slot (stakind = 2) - if let Some(ref hist) = stats.histogram_bounds - && let Some(lt_op) = meta.lt_opr - { - slot_kinds[slot_idx] = 2; - slot_ops[slot_idx] = lt_op; - slot_values[slot_idx] = Some(hist.clone()); - slot_idx += 1; - } - - // Correlation slot (stakind = 3) - if let Some(corr) = stats.correlation - && let Some(lt_op) = meta.lt_opr - { - slot_kinds[slot_idx] = 3; - slot_ops[slot_idx] = lt_op; - slot_numbers[slot_idx] = Some(format!("{{{corr}}}")); - // no stavalues for correlation - } - - // Build dynamic INSERT — we need dynamic SQL because stavalues is anyarray - // and 
we need to cast to the actual column type - let type_name = &meta.type_name; - - // construct the values expressions for each slot - let mut value_exprs = Vec::new(); - for i in 0..5 { - let numbers_expr = match &slot_numbers[i] { - Some(n) => format!("'{n}'::real[]"), - None => "NULL".to_string(), - }; - let values_expr = match &slot_values[i] { - Some(v) => { - // the values from pg_stats are already in PG array literal format - let escaped = v.replace('\'', "''"); - format!("'{escaped}'::{type_name}[]") - } - None => "NULL".to_string(), - }; - value_exprs.push((slot_kinds[i], slot_ops[i], numbers_expr, values_expr)); - } - - let sql = format!( - "INSERT INTO pg_catalog.pg_statistic ( \ - starelid, staattnum, stainherit, stanullfrac, stawidth, stadistinct, \ - stakind1, staop1, stanumbers1, stavalues1, \ - stakind2, staop2, stanumbers2, stavalues2, \ - stakind3, staop3, stanumbers3, stavalues3, \ - stakind4, staop4, stanumbers4, stavalues4, \ - stakind5, staop5, stanumbers5, stavalues5 \ - ) VALUES ( \ - {relid}::oid, {attnum}::smallint, false, {null_frac}::real, 0::int, {n_distinct}::real, \ - {k1}::smallint, {o1}::oid, {n1}, {v1}, \ - {k2}::smallint, {o2}::oid, {n2}, {v2}, \ - {k3}::smallint, {o3}::oid, {n3}, {v3}, \ - {k4}::smallint, {o4}::oid, {n4}, {v4}, \ - {k5}::smallint, {o5}::oid, {n5}, {v5} \ - )", - relid = meta.attrelid, - attnum = meta.attnum, - null_frac = null_frac, - n_distinct = n_distinct, - k1 = value_exprs[0].0, - o1 = value_exprs[0].1, - n1 = value_exprs[0].2, - v1 = value_exprs[0].3, - k2 = value_exprs[1].0, - o2 = value_exprs[1].1, - n2 = value_exprs[1].2, - v2 = value_exprs[1].3, - k3 = value_exprs[2].0, - o3 = value_exprs[2].1, - n3 = value_exprs[2].2, - v3 = value_exprs[2].3, - k4 = value_exprs[3].0, - o4 = value_exprs[3].1, - n4 = value_exprs[3].2, - v4 = value_exprs[3].3, - k5 = value_exprs[4].0, - o5 = value_exprs[4].1, - n5 = value_exprs[4].2, - v5 = value_exprs[4].3, - ); - - sqlx::query(&sql).execute(&mut **tx).await.map_err(|e| { 
- Error::StatsInjection(format!( - "pg_statistic insert for attrelid={} attnum={}: {e}", - meta.attrelid, meta.attnum - )) - })?; - - Ok(true) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_resolve_inline_stats() { - use crate::schema::types::*; - use chrono::Utc; - - let snapshot = SchemaSnapshot { - pg_version: "16.0".to_string(), - database: "test".to_string(), - timestamp: Utc::now(), - content_hash: String::new(), - source: None, - tables: vec![Table { - oid: 1, - schema: "public".to_string(), - name: "users".to_string(), - columns: vec![Column { - name: "id".to_string(), - ordinal: 1, - type_name: "integer".to_string(), - nullable: false, - default: None, - identity: None, - generated: None, - comment: None, - statistics_target: None, - stats: Some(ColumnStats { - null_frac: Some(0.0), - n_distinct: Some(-1.0), - most_common_vals: None, - most_common_freqs: None, - histogram_bounds: Some("{1,100,200}".to_string()), - correlation: Some(1.0), - }), - }], - constraints: vec![], - indexes: vec![], - comment: None, - stats: Some(TableStats { - reltuples: 1000.0, - relpages: 10, - dead_tuples: 0, - last_vacuum: None, - last_autovacuum: None, - last_analyze: None, - last_autoanalyze: None, - seq_scan: 50, - idx_scan: 100, - table_size: 81920, - }), - partition_info: None, - policies: vec![], - triggers: vec![], - reloptions: vec![], - rls_enabled: false, - }], - enums: vec![], - domains: vec![], - composites: vec![], - views: vec![], - functions: vec![], - extensions: vec![], - gucs: vec![], - node_stats: vec![], - }; - - let resolved = resolve_stats(&snapshot, None).unwrap(); - assert_eq!(resolved.tables.len(), 1); - assert_eq!(resolved.tables[0].0, "public"); - assert_eq!(resolved.tables[0].1, "users"); - assert_eq!(resolved.columns.len(), 1); - assert_eq!(resolved.columns[0].3, "integer"); // type_name - } - - #[test] - fn test_resolve_node_stats_not_found() { - use chrono::Utc; - - let snapshot = SchemaSnapshot { - pg_version: 
"16.0".to_string(), - database: "test".to_string(), - timestamp: Utc::now(), - content_hash: String::new(), - source: None, - tables: vec![], - enums: vec![], - domains: vec![], - composites: vec![], - views: vec![], - functions: vec![], - extensions: vec![], - gucs: vec![], - node_stats: vec![NodeStats { - source: "prod-1".to_string(), - timestamp: Utc::now(), - is_standby: false, - table_stats: vec![], - index_stats: vec![], - column_stats: vec![], - }], - }; - - let err = resolve_stats(&snapshot, Some("prod-2")).unwrap_err(); - let msg = err.to_string(); - assert!(msg.contains("prod-2")); - assert!(msg.contains("prod-1")); - } - - #[test] - fn test_resolve_multiple_nodes_requires_selection() { - use chrono::Utc; - - let snapshot = SchemaSnapshot { - pg_version: "16.0".to_string(), - database: "test".to_string(), - timestamp: Utc::now(), - content_hash: String::new(), - source: None, - tables: vec![], - enums: vec![], - domains: vec![], - composites: vec![], - views: vec![], - functions: vec![], - extensions: vec![], - gucs: vec![], - node_stats: vec![ - NodeStats { - source: "prod-1".to_string(), - timestamp: Utc::now(), - is_standby: false, - table_stats: vec![], - index_stats: vec![], - column_stats: vec![], - }, - NodeStats { - source: "prod-2".to_string(), - timestamp: Utc::now(), - is_standby: true, - table_stats: vec![], - index_stats: vec![], - column_stats: vec![], - }, - ], - }; - - let err = resolve_stats(&snapshot, None).unwrap_err(); - let msg = err.to_string(); - assert!(msg.contains("multiple")); - assert!(msg.contains("prod-1")); - assert!(msg.contains("prod-2")); - } -} diff --git a/crates/dry_run_core/src/schema/introspect/indexes.rs b/crates/dry_run_core/src/schema/introspect/indexes.rs index a99fe46..1188e93 100644 --- a/crates/dry_run_core/src/schema/introspect/indexes.rs +++ b/crates/dry_run_core/src/schema/introspect/indexes.rs @@ -76,41 +76,3 @@ pub(super) async fn fetch_indexes(pool: &PgPool) -> Result> { }) .collect()) } - -pub(super) 
async fn fetch_index_stats(pool: &PgPool) -> Result> { - let rows: Vec = sqlx::query( - r#" - SELECT s.relid::int4 AS table_oid, - s.indexrelname AS index_name, - COALESCE(s.idx_scan, 0)::int8 AS idx_scan, - COALESCE(s.idx_tup_read, 0)::int8 AS idx_tup_read, - COALESCE(s.idx_tup_fetch, 0)::int8 AS idx_tup_fetch, - pg_catalog.pg_relation_size(s.indexrelid)::int8 AS index_size, - ci.relpages::int8 AS index_relpages, - ci.reltuples::float8 AS index_reltuples - FROM pg_catalog.pg_stat_user_indexes s - JOIN pg_catalog.pg_class ct ON ct.oid = s.relid - JOIN pg_catalog.pg_namespace n ON n.oid = ct.relnamespace - JOIN pg_catalog.pg_class ci ON ci.oid = s.indexrelid - WHERE n.nspname NOT IN ('pg_catalog', 'information_schema', 'pg_toast') - AND n.nspname NOT LIKE 'pg_temp_%' - ORDER BY s.relid, s.indexrelname - "#, - ) - .fetch_all(pool) - .await?; - - Ok(rows - .iter() - .map(|r| RawIndexStats { - table_oid: r.get::("table_oid") as u32, - index_name: r.get("index_name"), - idx_scan: r.get("idx_scan"), - idx_tup_read: r.get("idx_tup_read"), - idx_tup_fetch: r.get("idx_tup_fetch"), - size: r.get("index_size"), - relpages: r.get("index_relpages"), - reltuples: r.get("index_reltuples"), - }) - .collect()) -} diff --git a/crates/dry_run_core/src/schema/introspect/mod.rs b/crates/dry_run_core/src/schema/introspect/mod.rs index 49716c7..599853c 100644 --- a/crates/dry_run_core/src/schema/introspect/mod.rs +++ b/crates/dry_run_core/src/schema/introspect/mod.rs @@ -15,9 +15,12 @@ use sqlx::postgres::PgRow; use sqlx::{PgPool, Row}; use tracing::info; +use sha2::{Digest, Sha256}; + use super::hash::{HashInput, compute_content_hash}; +use super::snapshot::*; use super::types::*; -use crate::error::Result; +use crate::error::{Error, Result}; pub async fn introspect_schema(pool: &PgPool) -> Result { let pg_version: String = sqlx::query_scalar("SELECT version()") @@ -28,7 +31,8 @@ pub async fn introspect_schema(pool: &PgPool) -> Result { .fetch_one(pool) .await?; - // Group 1: 
table-centric data. + // Group 1: table-centric data. Stats now live in PlannerStatsSnapshot / + // ActivityStatsSnapshot; introspect_schema is DDL-only. let ( raw_tables, raw_columns, @@ -36,13 +40,10 @@ pub async fn introspect_schema(pool: &PgPool) -> Result { table_comments, column_comments, raw_indexes, - raw_table_stats, - raw_column_stats, raw_partitions, raw_partition_children, raw_policies, raw_triggers, - raw_index_stats, ) = tokio::try_join!( tables::fetch_tables(pool), tables::fetch_columns(pool), @@ -50,17 +51,14 @@ pub async fn introspect_schema(pool: &PgPool) -> Result { comments::fetch_table_comments(pool), comments::fetch_column_comments(pool), indexes::fetch_indexes(pool), - tables::fetch_table_stats(pool), - tables::fetch_column_stats(pool), partitions::fetch_partition_info(pool), partitions::fetch_partition_children(pool), policies::fetch_policies(pool), policies::fetch_triggers(pool), - indexes::fetch_index_stats(pool), )?; // Group 2: top-level objects. - let (enums, domains, composites, views, functions, extensions, gucs, is_standby) = tokio::try_join!( + let (enums, domains, composites, views, functions, extensions, gucs, _is_standby) = tokio::try_join!( catalog::fetch_enums(pool), catalog::fetch_domains(pool), catalog::fetch_composites(pool), @@ -71,21 +69,6 @@ pub async fn introspect_schema(pool: &PgPool) -> Result { fetch_is_standby(pool), )?; - let with_vacuum = raw_table_stats - .iter() - .filter(|s| s.last_autovacuum.is_some()) - .count(); - if with_vacuum == 0 && !raw_table_stats.is_empty() { - if is_standby { - info!("all vacuum timestamps are null;expected on standby"); - } else { - tracing::warn!( - "all vacuum/analyze timestamps are null on primary! 
\ - check that the role has pg_read_all_stats privilege" - ); - } - } - let tables = assemble_tables( raw_tables, raw_columns, @@ -93,13 +76,10 @@ pub async fn introspect_schema(pool: &PgPool) -> Result { table_comments, column_comments, raw_indexes, - raw_table_stats, - raw_column_stats, raw_partitions, raw_partition_children, raw_policies, raw_triggers, - raw_index_stats, ); let content_hash = compute_content_hash(&HashInput { @@ -127,7 +107,6 @@ pub async fn introspect_schema(pool: &PgPool) -> Result { functions, extensions, gucs, - node_stats: vec![], }; info!( @@ -145,29 +124,141 @@ pub async fn introspect_schema(pool: &PgPool) -> Result { Ok(snapshot) } -pub async fn fetch_stats_only(pool: &PgPool, source: &str) -> Result { - let (raw_table_stats, raw_index_stats, raw_column_stats, is_standby) = tokio::try_join!( - stats::fetch_named_table_stats(pool), - stats::fetch_named_index_stats(pool), +pub async fn fetch_is_standby(pool: &PgPool) -> Result { + let row: PgRow = sqlx::query("SELECT pg_catalog.pg_is_in_recovery() AS is_standby") + .fetch_one(pool) + .await?; + Ok(row.get("is_standby")) +} + +// Snapshot split: planner-only and per-node activity captures + +pub async fn introspect_planner_stats( + pool: &PgPool, + schema_ref_hash: &str, +) -> Result { + if fetch_is_standby(pool).await? 
{ + return Err(Error::Introspection( + "planner stats must be captured from the primary; \ + use `dryrun snapshot activity --from ` for per-node activity" + .into(), + )); + } + + let pg_version: String = sqlx::query_scalar("SELECT version()") + .fetch_one(pool) + .await?; + let database: String = sqlx::query_scalar("SELECT current_database()") + .fetch_one(pool) + .await?; + + let (table_sizing, index_sizing, columns) = tokio::try_join!( + stats::fetch_named_table_sizing(pool), + stats::fetch_named_index_sizing(pool), stats::fetch_named_column_stats(pool), - fetch_is_standby(pool), )?; - Ok(NodeStats { - source: source.to_string(), + let mut snapshot = PlannerStatsSnapshot { + pg_version, + database, timestamp: Utc::now(), - is_standby, - table_stats: raw_table_stats, - index_stats: raw_index_stats, - column_stats: raw_column_stats, - }) + content_hash: String::new(), + schema_ref_hash: schema_ref_hash.to_string(), + tables: table_sizing, + columns, + indexes: index_sizing, + }; + snapshot.content_hash = hash_payload(&snapshot)?; + + info!( + tables = snapshot.tables.len(), + columns = snapshot.columns.len(), + indexes = snapshot.indexes.len(), + hash = %snapshot.content_hash, + schema_ref = %snapshot.schema_ref_hash, + "planner stats introspection complete" + ); + + Ok(snapshot) } -pub async fn fetch_is_standby(pool: &PgPool) -> Result { - let row: PgRow = sqlx::query("SELECT pg_catalog.pg_is_in_recovery() AS is_standby") +pub async fn introspect_activity_stats( + pool: &PgPool, + schema_ref_hash: &str, + label: &str, +) -> Result { + let pg_version: String = sqlx::query_scalar("SELECT version()") .fetch_one(pool) .await?; - Ok(row.get("is_standby")) + let database: String = sqlx::query_scalar("SELECT current_database()") + .fetch_one(pool) + .await?; + + let (node, table_activity, index_activity) = tokio::try_join!( + resolve_node_identity(pool, label), + stats::fetch_named_table_activity(pool), + stats::fetch_named_index_activity(pool), + )?; + + let mut 
snapshot = ActivityStatsSnapshot { + pg_version, + database, + timestamp: Utc::now(), + content_hash: String::new(), + schema_ref_hash: schema_ref_hash.to_string(), + node, + tables: table_activity, + indexes: index_activity, + }; + snapshot.content_hash = hash_payload(&snapshot)?; + + info!( + label = %snapshot.node.label, + is_standby = snapshot.node.is_standby, + tables = snapshot.tables.len(), + indexes = snapshot.indexes.len(), + hash = %snapshot.content_hash, + schema_ref = %snapshot.schema_ref_hash, + "activity stats introspection complete" + ); + + Ok(snapshot) +} + +async fn resolve_node_identity(pool: &PgPool, label: &str) -> Result { + let row: PgRow = sqlx::query( + r#" + SELECT pg_catalog.pg_is_in_recovery() AS is_standby, + COALESCE(host(pg_catalog.inet_server_addr())::text, '') AS host, + (SELECT stats_reset + FROM pg_catalog.pg_stat_database + WHERE datname = current_database()) AS stats_reset, + CASE + WHEN pg_catalog.pg_is_in_recovery() + THEN pg_catalog.pg_wal_lsn_diff( + pg_catalog.pg_last_wal_receive_lsn(), + pg_catalog.pg_last_wal_replay_lsn())::int8 + ELSE NULL + END AS lag_bytes + "#, + ) + .fetch_one(pool) + .await?; + + Ok(NodeIdentity { + label: label.to_string(), + host: row.get::("host"), + is_standby: row.get("is_standby"), + replication_lag_bytes: row.get::, _>("lag_bytes"), + stats_reset: row.get("stats_reset"), + }) +} + +fn hash_payload(value: &T) -> Result { + let json = serde_json::to_vec(value) + .map_err(|e| Error::Introspection(format!("cannot serialize for hashing: {e}")))?; + let digest = Sha256::digest(&json); + Ok(format!("{digest:x}")) } // --------------------------------------------------------------------------- @@ -184,13 +275,10 @@ fn assemble_tables( table_comments: Vec, column_comments: Vec, raw_indexes: Vec, - raw_table_stats: Vec, - raw_column_stats: Vec, raw_partitions: Vec, raw_partition_children: Vec, raw_policies: Vec, raw_triggers: Vec, - raw_index_stats: Vec, ) -> Vec
{ // --- Columns --- let mut columns_by_oid: HashMap> = HashMap::new(); @@ -208,7 +296,6 @@ fn assemble_tables( generated: rc.generated, comment: None, statistics_target: rc.statistics_target, - stats: None, }); } @@ -254,50 +341,9 @@ fn assemble_tables( } } - // --- Column stats --- - let mut col_stats_map: HashMap<(u32, String), ColumnStats> = HashMap::new(); - for cs in raw_column_stats { - col_stats_map.insert( - (cs.table_oid, cs.column_name), - ColumnStats { - null_frac: cs.null_frac, - n_distinct: cs.n_distinct, - most_common_vals: cs.most_common_vals, - most_common_freqs: cs.most_common_freqs, - histogram_bounds: cs.histogram_bounds, - correlation: cs.correlation, - }, - ); - } - - for (oid, cols) in &mut columns_by_oid { - for col in cols.iter_mut() { - if let Some(stats) = col_stats_map.remove(&(*oid, col.name.clone())) { - col.stats = Some(stats); - } - } - } - - // --- Index stats --- - let mut idx_stats_map: HashMap<(u32, String), IndexStats> = HashMap::new(); - for ris in raw_index_stats { - idx_stats_map.insert( - (ris.table_oid, ris.index_name), - IndexStats { - idx_scan: ris.idx_scan, - idx_tup_read: ris.idx_tup_read, - idx_tup_fetch: ris.idx_tup_fetch, - size: ris.size, - relpages: ris.relpages, - reltuples: ris.reltuples, - }, - ); - } - // --- Indexes --- let mut indexes_by_oid: HashMap> = HashMap::new(); for ri in raw_indexes { - let stats = idx_stats_map.remove(&(ri.table_oid, ri.name.clone())); indexes_by_oid.entry(ri.table_oid).or_default().push(Index { name: ri.name, columns: ri.columns, @@ -309,32 +355,9 @@ fn assemble_tables( definition: ri.definition, is_valid: ri.is_valid, backs_constraint: ri.backs_constraint, - stats, }); } - // --- Table stats --- - let stats_by_oid: HashMap = raw_table_stats - .into_iter() - .map(|s| { - ( - s.table_oid, - TableStats { - reltuples: s.reltuples, - relpages: s.relpages, - dead_tuples: s.dead_tuples, - last_vacuum: s.last_vacuum, - last_autovacuum: s.last_autovacuum, - last_analyze: s.last_analyze, - 
last_autoanalyze: s.last_autoanalyze, - seq_scan: s.seq_scan, - idx_scan: s.idx_scan, - table_size: s.table_size, - }, - ) - }) - .collect(); - // --- Partition info --- let mut children_by_parent: HashMap> = HashMap::new(); for pc in raw_partition_children { @@ -402,7 +425,6 @@ fn assemble_tables( constraints: constraints_by_oid.remove(&rt.oid).unwrap_or_default(), indexes: indexes_by_oid.remove(&rt.oid).unwrap_or_default(), comment: table_comment_map.get(&rt.oid).cloned(), - stats: stats_by_oid.get(&rt.oid).cloned(), partition_info: partition_info_by_oid.get(&rt.oid).cloned(), policies: policies_by_oid.remove(&rt.oid).unwrap_or_default(), triggers: triggers_by_oid.remove(&rt.oid).unwrap_or_default(), @@ -411,3 +433,45 @@ fn assemble_tables( }) .collect() } + +#[cfg(test)] +mod tests { + use chrono::TimeZone; + + use super::*; + + fn fixed_planner() -> PlannerStatsSnapshot { + PlannerStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "accounts".into(), + timestamp: Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(), + content_hash: String::new(), + schema_ref_hash: "schema-h1".into(), + tables: vec![], + columns: vec![], + indexes: vec![], + } + } + + #[test] + fn hash_payload_is_deterministic_for_identical_inputs() { + let a = fixed_planner(); + let b = fixed_planner(); + assert_eq!(hash_payload(&a).unwrap(), hash_payload(&b).unwrap()); + } + + #[test] + fn hash_payload_changes_when_payload_changes() { + let a = fixed_planner(); + let mut b = fixed_planner(); + b.schema_ref_hash = "schema-h2".into(); + assert_ne!(hash_payload(&a).unwrap(), hash_payload(&b).unwrap()); + } + + #[test] + fn hash_payload_emits_hex_sha256() { + let h = hash_payload(&fixed_planner()).unwrap(); + assert_eq!(h.len(), 64); + assert!(h.chars().all(|c| c.is_ascii_hexdigit())); + } +} diff --git a/crates/dry_run_core/src/schema/introspect/raw_types.rs b/crates/dry_run_core/src/schema/introspect/raw_types.rs index 277a9f9..51b4a20 100644 --- 
a/crates/dry_run_core/src/schema/introspect/raw_types.rs +++ b/crates/dry_run_core/src/schema/introspect/raw_types.rs @@ -1,5 +1,3 @@ -use chrono::{DateTime, Utc}; - pub(super) struct RawTable { pub oid: u32, pub schema: String, @@ -57,31 +55,6 @@ pub(super) struct RawIndex { pub backs_constraint: bool, } -pub(super) struct RawTableStats { - pub table_oid: u32, - pub reltuples: f64, - pub relpages: i64, - pub dead_tuples: i64, - pub last_vacuum: Option>, - pub last_autovacuum: Option>, - pub last_analyze: Option>, - pub last_autoanalyze: Option>, - pub seq_scan: i64, - pub idx_scan: i64, - pub table_size: i64, -} - -pub(super) struct RawColumnStats { - pub table_oid: u32, - pub column_name: String, - pub null_frac: Option, - pub n_distinct: Option, - pub most_common_vals: Option, - pub most_common_freqs: Option, - pub histogram_bounds: Option, - pub correlation: Option, -} - pub(super) struct RawPartitionInfo { pub table_oid: u32, pub strategy: String, @@ -110,14 +83,3 @@ pub(super) struct RawTrigger { pub name: String, pub definition: String, } - -pub(super) struct RawIndexStats { - pub table_oid: u32, - pub index_name: String, - pub idx_scan: i64, - pub idx_tup_read: i64, - pub idx_tup_fetch: i64, - pub size: i64, - pub relpages: i64, - pub reltuples: f64, -} diff --git a/crates/dry_run_core/src/schema/introspect/stats.rs b/crates/dry_run_core/src/schema/introspect/stats.rs index e920be9..3cdff7c 100644 --- a/crates/dry_run_core/src/schema/introspect/stats.rs +++ b/crates/dry_run_core/src/schema/introspect/stats.rs @@ -1,24 +1,106 @@ use sqlx::postgres::PgRow; use sqlx::{PgPool, Row}; +use super::super::snapshot::*; use super::super::types::*; use crate::error::Result; -pub(super) async fn fetch_named_table_stats(pool: &PgPool) -> Result> { +pub(super) async fn fetch_named_column_stats(pool: &PgPool) -> Result> { let rows: Vec = sqlx::query( r#" - SELECT n.nspname AS schema_name, - c.relname AS table_name, - c.reltuples::float8 AS reltuples, - c.relpages::int8 AS 
relpages, - COALESCE(s.n_dead_tup, 0)::int8 AS dead_tuples, + SELECT s.schemaname AS schema_name, + s.tablename AS table_name, + s.attname AS column_name, + s.null_frac::float8 AS null_frac, + s.n_distinct::float8 AS n_distinct, + s.most_common_vals::text AS most_common_vals, + s.most_common_freqs::text AS most_common_freqs, + s.histogram_bounds::text AS histogram_bounds, + s.correlation::float8 AS correlation + FROM pg_catalog.pg_stats s + WHERE s.schemaname NOT IN ('pg_catalog', 'information_schema', 'pg_toast') + AND s.schemaname NOT LIKE 'pg_temp_%' + ORDER BY s.schemaname, s.tablename, s.attname + "#, + ) + .fetch_all(pool) + .await?; + + Ok(rows + .iter() + .map(|r| ColumnStatsEntry { + table: QualifiedName::new( + r.get::("schema_name"), + r.get::("table_name"), + ), + column: r.get("column_name"), + stats: ColumnStats { + null_frac: r.get::, _>("null_frac"), + n_distinct: r.get::, _>("n_distinct"), + most_common_vals: r.get("most_common_vals"), + most_common_freqs: r.get("most_common_freqs"), + histogram_bounds: r.get("histogram_bounds"), + correlation: r.get::, _>("correlation"), + }, + }) + .collect()) +} + +pub(super) async fn fetch_named_table_sizing(pool: &PgPool) -> Result> { + let rows: Vec = sqlx::query( + r#" + SELECT n.nspname AS schema_name, + c.relname AS table_name, + c.reltuples::float8 AS reltuples, + c.relpages::int8 AS relpages, + pg_catalog.pg_relation_size(c.oid)::int8 AS table_size, + pg_catalog.pg_total_relation_size(c.oid)::int8 AS total_size, + pg_catalog.pg_indexes_size(c.oid)::int8 AS index_size + FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace + WHERE c.relkind IN ('r', 'p') + AND n.nspname NOT IN ('pg_catalog', 'information_schema', 'pg_toast') + AND n.nspname NOT LIKE 'pg_temp_%' + "#, + ) + .fetch_all(pool) + .await?; + + Ok(rows + .iter() + .map(|r| TableSizingEntry { + table: QualifiedName::new( + r.get::("schema_name"), + r.get::("table_name"), + ), + sizing: TableSizing { + reltuples: 
r.get("reltuples"), + relpages: r.get("relpages"), + table_size: r.get("table_size"), + total_size: Some(r.get("total_size")), + index_size: Some(r.get("index_size")), + }, + }) + .collect()) +} + +pub(super) async fn fetch_named_table_activity(pool: &PgPool) -> Result> { + let rows: Vec = sqlx::query( + r#" + SELECT n.nspname AS schema_name, + c.relname AS table_name, + COALESCE(s.seq_scan, 0)::int8 AS seq_scan, + COALESCE(s.idx_scan, 0)::int8 AS idx_scan, + COALESCE(s.n_live_tup, 0)::int8 AS n_live_tup, + COALESCE(s.n_dead_tup, 0)::int8 AS n_dead_tup, s.last_vacuum, s.last_autovacuum, s.last_analyze, s.last_autoanalyze, - COALESCE(s.seq_scan, 0)::int8 AS seq_scan, - COALESCE(s.idx_scan, 0)::int8 AS idx_scan, - pg_catalog.pg_total_relation_size(c.oid)::int8 AS table_size + COALESCE(s.vacuum_count, 0)::int8 AS vacuum_count, + COALESCE(s.autovacuum_count, 0)::int8 AS autovacuum_count, + COALESCE(s.analyze_count, 0)::int8 AS analyze_count, + COALESCE(s.autoanalyze_count, 0)::int8 AS autoanalyze_count FROM pg_catalog.pg_class c JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace LEFT JOIN pg_catalog.pg_stat_user_tables s ON s.relid = c.oid @@ -32,44 +114,43 @@ pub(super) async fn fetch_named_table_stats(pool: &PgPool) -> Result("schema_name"), + r.get::("table_name"), + ), + activity: TableActivity { + seq_scan: r.get("seq_scan"), + idx_scan: r.get("idx_scan"), + n_live_tup: r.get("n_live_tup"), + n_dead_tup: r.get("n_dead_tup"), last_vacuum: r.get("last_vacuum"), last_autovacuum: r.get("last_autovacuum"), last_analyze: r.get("last_analyze"), last_autoanalyze: r.get("last_autoanalyze"), - seq_scan: r.get("seq_scan"), - idx_scan: r.get("idx_scan"), - table_size: r.get("table_size"), + vacuum_count: r.get("vacuum_count"), + autovacuum_count: r.get("autovacuum_count"), + analyze_count: r.get("analyze_count"), + autoanalyze_count: r.get("autoanalyze_count"), }, }) .collect()) } -pub(super) async fn fetch_named_index_stats(pool: &PgPool) -> Result> { +pub(super) async 
fn fetch_named_index_sizing(pool: &PgPool) -> Result> { let rows: Vec = sqlx::query( r#" SELECT n.nspname AS schema_name, - s.relname AS table_name, - s.indexrelname AS index_name, - COALESCE(s.idx_scan, 0)::int8 AS idx_scan, - COALESCE(s.idx_tup_read, 0)::int8 AS idx_tup_read, - COALESCE(s.idx_tup_fetch, 0)::int8 AS idx_tup_fetch, - pg_catalog.pg_relation_size(s.indexrelid)::int8 AS index_size, - ci.relpages::int8 AS index_relpages, - ci.reltuples::float8 AS index_reltuples - FROM pg_catalog.pg_stat_user_indexes s - JOIN pg_catalog.pg_class ct ON ct.oid = s.relid - JOIN pg_catalog.pg_namespace n ON n.oid = ct.relnamespace - JOIN pg_catalog.pg_class ci ON ci.oid = s.indexrelid - WHERE n.nspname NOT IN ('pg_catalog', 'information_schema', 'pg_toast') + ci.relname AS index_name, + pg_catalog.pg_relation_size(ci.oid)::int8 AS index_size, + ci.relpages::int8 AS relpages, + ci.reltuples::float8 AS reltuples + FROM pg_catalog.pg_class ci + JOIN pg_catalog.pg_namespace n ON n.oid = ci.relnamespace + WHERE ci.relkind = 'i' + AND n.nspname NOT IN ('pg_catalog', 'information_schema', 'pg_toast') AND n.nspname NOT LIKE 'pg_temp_%' - ORDER BY n.nspname, s.relname, s.indexrelname + ORDER BY n.nspname, ci.relname "#, ) .fetch_all(pool) @@ -77,38 +158,34 @@ pub(super) async fn fetch_named_index_stats(pool: &PgPool) -> Result("schema_name"), + r.get::("index_name"), + ), + sizing: IndexSizing { size: r.get("index_size"), - relpages: r.get("index_relpages"), - reltuples: r.get("index_reltuples"), + relpages: r.get("relpages"), + reltuples: r.get("reltuples"), }, }) .collect()) } -pub(super) async fn fetch_named_column_stats(pool: &PgPool) -> Result> { +pub(super) async fn fetch_named_index_activity(pool: &PgPool) -> Result> { let rows: Vec = sqlx::query( r#" - SELECT s.schemaname AS schema_name, - s.tablename AS table_name, - s.attname AS column_name, - s.null_frac::float8 AS null_frac, - s.n_distinct::float8 AS n_distinct, - s.most_common_vals::text AS most_common_vals, - 
s.most_common_freqs::text AS most_common_freqs, - s.histogram_bounds::text AS histogram_bounds, - s.correlation::float8 AS correlation - FROM pg_catalog.pg_stats s - WHERE s.schemaname NOT IN ('pg_catalog', 'information_schema', 'pg_toast') - AND s.schemaname NOT LIKE 'pg_temp_%' - ORDER BY s.schemaname, s.tablename, s.attname + SELECT n.nspname AS schema_name, + s.indexrelname AS index_name, + COALESCE(s.idx_scan, 0)::int8 AS idx_scan, + COALESCE(s.idx_tup_read, 0)::int8 AS idx_tup_read, + COALESCE(s.idx_tup_fetch, 0)::int8 AS idx_tup_fetch + FROM pg_catalog.pg_stat_user_indexes s + JOIN pg_catalog.pg_class ci ON ci.oid = s.indexrelid + JOIN pg_catalog.pg_namespace n ON n.oid = ci.relnamespace + WHERE n.nspname NOT IN ('pg_catalog', 'information_schema', 'pg_toast') + AND n.nspname NOT LIKE 'pg_temp_%' + ORDER BY n.nspname, s.indexrelname "#, ) .fetch_all(pool) @@ -116,17 +193,15 @@ pub(super) async fn fetch_named_column_stats(pool: &PgPool) -> Result, _>("null_frac"), - n_distinct: r.get::, _>("n_distinct"), - most_common_vals: r.get("most_common_vals"), - most_common_freqs: r.get("most_common_freqs"), - histogram_bounds: r.get("histogram_bounds"), - correlation: r.get::, _>("correlation"), + .map(|r| IndexActivityEntry { + index: QualifiedName::new( + r.get::("schema_name"), + r.get::("index_name"), + ), + activity: IndexActivity { + idx_scan: r.get("idx_scan"), + idx_tup_read: r.get("idx_tup_read"), + idx_tup_fetch: r.get("idx_tup_fetch"), }, }) .collect()) diff --git a/crates/dry_run_core/src/schema/introspect/tables.rs b/crates/dry_run_core/src/schema/introspect/tables.rs index d3e6364..80e1f2a 100644 --- a/crates/dry_run_core/src/schema/introspect/tables.rs +++ b/crates/dry_run_core/src/schema/introspect/tables.rs @@ -148,88 +148,3 @@ pub(super) async fn fetch_constraints(pool: &PgPool) -> Result Result> { - let rows: Vec = sqlx::query( - r#" - SELECT c.oid::int4 AS table_oid, - c.reltuples::float8 AS reltuples, - c.relpages::int8 AS relpages, - 
COALESCE(s.n_dead_tup, 0)::int8 AS dead_tuples, - s.last_vacuum, - s.last_autovacuum, - s.last_analyze, - s.last_autoanalyze, - COALESCE(s.seq_scan, 0)::int8 AS seq_scan, - COALESCE(s.idx_scan, 0)::int8 AS idx_scan, - pg_catalog.pg_total_relation_size(c.oid)::int8 AS table_size - FROM pg_catalog.pg_class c - JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace - LEFT JOIN pg_catalog.pg_stat_user_tables s - ON s.relid = c.oid - WHERE c.relkind IN ('r', 'p') - AND n.nspname NOT IN ('pg_catalog', 'information_schema', 'pg_toast') - AND n.nspname NOT LIKE 'pg_temp_%' - "#, - ) - .fetch_all(pool) - .await?; - - let stats: Vec = rows - .iter() - .map(|r| RawTableStats { - table_oid: r.get::("table_oid") as u32, - reltuples: r.get("reltuples"), - relpages: r.get("relpages"), - dead_tuples: r.get("dead_tuples"), - last_vacuum: r.get("last_vacuum"), - last_autovacuum: r.get("last_autovacuum"), - last_analyze: r.get("last_analyze"), - last_autoanalyze: r.get("last_autoanalyze"), - seq_scan: r.get("seq_scan"), - idx_scan: r.get("idx_scan"), - table_size: r.get("table_size"), - }) - .collect(); - - tracing::info!(total = stats.len(), "table stats fetched"); - - Ok(stats) -} - -pub(super) async fn fetch_column_stats(pool: &PgPool) -> Result> { - let rows: Vec = sqlx::query( - r#" - SELECT c.oid::int4 AS table_oid, - s.attname AS column_name, - s.null_frac::float8 AS null_frac, - s.n_distinct::float8 AS n_distinct, - s.most_common_vals::text AS most_common_vals, - s.most_common_freqs::text AS most_common_freqs, - s.histogram_bounds::text AS histogram_bounds, - s.correlation::float8 AS correlation - FROM pg_catalog.pg_class c - JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace - JOIN pg_catalog.pg_stats s - ON s.schemaname = n.nspname AND s.tablename = c.relname - WHERE c.relkind IN ('r', 'p') - AND n.nspname NOT IN ('pg_catalog', 'information_schema', 'pg_toast') - "#, - ) - .fetch_all(pool) - .await?; - - Ok(rows - .iter() - .map(|r| RawColumnStats { - table_oid: 
r.get::("table_oid") as u32, - column_name: r.get("column_name"), - null_frac: r.get::, _>("null_frac"), - n_distinct: r.get::, _>("n_distinct"), - most_common_vals: r.get("most_common_vals"), - most_common_freqs: r.get("most_common_freqs"), - histogram_bounds: r.get("histogram_bounds"), - correlation: r.get::, _>("correlation"), - }) - .collect()) -} diff --git a/crates/dry_run_core/src/schema/mod.rs b/crates/dry_run_core/src/schema/mod.rs index fdaecf8..0d6366a 100644 --- a/crates/dry_run_core/src/schema/mod.rs +++ b/crates/dry_run_core/src/schema/mod.rs @@ -1,13 +1,16 @@ pub mod bloat; mod hash; -pub mod inject; mod introspect; pub mod profile; +mod snapshot; mod types; pub mod vacuum; +pub use bloat::*; pub use hash::{HashInput, compute_content_hash}; -pub use inject::{ApplyResult, apply_stats}; -pub use introspect::{fetch_is_standby, fetch_stats_only, introspect_schema}; +pub use introspect::{ + fetch_is_standby, introspect_activity_stats, introspect_planner_stats, introspect_schema, +}; pub use profile::*; +pub use snapshot::*; pub use types::*; diff --git a/crates/dry_run_core/src/schema/profile.rs b/crates/dry_run_core/src/schema/profile.rs index c115d62..e18408a 100644 --- a/crates/dry_run_core/src/schema/profile.rs +++ b/crates/dry_run_core/src/schema/profile.rs @@ -1,6 +1,6 @@ use serde::Serialize; -use super::types::{Column, ColumnStats}; +use super::types::ColumnStats; #[derive(Debug, Clone, Serialize)] pub struct ColumnProfile { @@ -18,9 +18,21 @@ pub struct ColumnProfile { pub note: Option, } -/// Build a human-readable profile for a single column. -pub fn profile_column(col: &Column, table_rows: f64) -> Option { - let s = col.stats.as_ref()?; +// Build a human-readable profile for a single column. +// +// Decoupled from `&Column` after the snapshot split — column stats now +// live in `PlannerStatsSnapshot.columns`, not on the DDL node — so we take +// the type name and the stats reference separately. 
Callers (typically MCP +// tool bodies) thread the stats via `annotated.column_stats(qn, col_name)`, +// which returns `None` when no planner snapshot exists yet; in that case +// the profiler simply returns `None` and the consumer skips that column. +pub fn profile_column( + col_name: &str, + type_name: &str, + stats: Option<&ColumnStats>, + table_rows: f64, +) -> Option { + let s = stats?; Some(ColumnProfile { cardinality: profile_cardinality(s, table_rows), @@ -29,13 +41,18 @@ pub fn profile_column(col: &Column, table_rows: f64) -> Option { physical_order: profile_correlation(s), value_range: profile_range(s), top_values: parse_top_values(s, 5), - note: profile_note(col, s, table_rows), + note: profile_note(col_name, type_name, s, table_rows), }) } -/// Estimated selectivity (0..1) for equality predicate on this column. -pub fn column_selectivity(col: &Column, table_rows: f64) -> f64 { - let s = match col.stats.as_ref() { +// Estimated selectivity (0..1) for an equality predicate on this column. +// +// Same shape change as `profile_column` — takes `Option<&ColumnStats>` +// directly rather than reaching into a `&Column`. Returns the neutral +// 0.5 default when stats are missing, preserving the legacy behavior: +// callers don't have to special-case the no-data path. 
+pub fn column_selectivity(stats: Option<&ColumnStats>, table_rows: f64) -> f64 { + let s = match stats { Some(s) => s, None => return 0.5, }; @@ -206,13 +223,18 @@ fn parse_top_values(s: &ColumnStats, limit: usize) -> Vec { .collect() } -fn profile_note(col: &Column, s: &ColumnStats, table_rows: f64) -> Option { +fn profile_note( + _col_name: &str, + type_name: &str, + s: &ColumnStats, + table_rows: f64, +) -> Option { // low-cardinality text column -> suggest enum if let Some(nd) = s.n_distinct && nd > 0.0 && nd <= 10.0 { - let t = col.type_name.to_lowercase(); + let t = type_name.to_lowercase(); if t.contains("text") || t.contains("varchar") || t.contains("character varying") { return Some("Consider using an enum type".to_string()); } @@ -320,20 +342,11 @@ mod tests { } } - fn make_col(type_name: &str, stats: Option) -> Column { - Column { - name: "test_col".to_string(), - ordinal: 1, - type_name: type_name.to_string(), - nullable: true, - default: None, - identity: None, - generated: None, - comment: None, - statistics_target: None, - stats, - } - } + // The legacy `make_col` helper went away with the signature change — + // `Column` import is gone too. Test inputs now build `ColumnStats` + // directly and hand them to `profile_column` / `column_selectivity`, + // which mirrors how production code threads them via + // `AnnotatedSchema::column_stats`. 
#[test] fn test_parse_pg_array_simple() { @@ -388,23 +401,25 @@ mod tests { #[test] fn test_column_selectivity_negative_distinct() { - let col = make_col("integer", Some(make_stats(Some(-0.5)))); - let sel = column_selectivity(&col, 10000.0); + let s = make_stats(Some(-0.5)); + let sel = column_selectivity(Some(&s), 10000.0); // -0.5 -> 5000 distinct -> selectivity 0.0002 assert!((sel - 0.0002).abs() < 0.0001); } #[test] fn test_column_selectivity_positive_distinct() { - let col = make_col("integer", Some(make_stats(Some(100.0)))); - let sel = column_selectivity(&col, 10000.0); + let s = make_stats(Some(100.0)); + let sel = column_selectivity(Some(&s), 10000.0); assert!((sel - 0.01).abs() < 0.0001); } #[test] fn test_column_selectivity_no_stats() { - let col = make_col("integer", None); - assert_eq!(column_selectivity(&col, 1000.0), 0.5); + // Degradation path — when the planner snapshot hasn't been + // captured yet, callers pass `None` and we fall back to the + // neutral 0.5 default rather than refusing to estimate. 
+ assert_eq!(column_selectivity(None, 1000.0), 0.5); } #[test] @@ -470,8 +485,7 @@ mod tests { fn test_profile_note_enum_suggestion() { let mut s = make_stats(Some(3.0)); s.null_frac = Some(0.0); - let col = make_col("text", Some(s)); - let note = profile_note(&col, col.stats.as_ref().unwrap(), 1000.0); + let note = profile_note("status", "text", &s, 1000.0); assert_eq!(note, Some("Consider using an enum type".to_string())); } @@ -479,19 +493,31 @@ mod tests { fn test_profile_note_high_nulls() { let mut s = make_stats(Some(100.0)); s.null_frac = Some(0.9); - let col = make_col("integer", Some(s)); - let note = profile_note(&col, col.stats.as_ref().unwrap(), 1000.0); + let note = profile_note("optional_field", "integer", &s, 1000.0); assert!(note.unwrap().contains("partial index")); } #[test] fn test_profile_column_returns_none_without_stats() { - let col = make_col("integer", None); - assert!(profile_column(&col, 1000.0).is_none()); + // No planner snapshot for this column → no profile produced. + // Mirrors the production path where MCP tools call + // `annotated.column_stats(qn, col_name)` and pass through whatever + // it returns. + assert!(profile_column("test_col", "integer", None, 1000.0).is_none()); } #[test] fn test_profile_column_returns_some_with_stats() { + let s = make_stats(Some(50.0)); + let p = profile_column("test_col", "integer", Some(&s), 1000.0) + .expect("profile should build when stats present"); + assert!(p.cardinality.contains("low")); + } + + #[test] + fn test_profile_column_full_when_rich_stats() { + // Rich-stats case — every field populated, exercises every + // sub-formatter inside `profile_column`. 
let s = ColumnStats { null_frac: Some(0.1), n_distinct: Some(-0.8), @@ -500,8 +526,7 @@ mod tests { histogram_bounds: Some("{1,100}".to_string()), correlation: Some(0.99), }; - let col = make_col("integer", Some(s)); - let p = profile_column(&col, 10000.0).unwrap(); + let p = profile_column("col", "integer", Some(&s), 10000.0).unwrap(); assert!(p.cardinality.contains("high")); assert_eq!(p.nulls, "10.0% (~1000 rows)"); assert!(p.physical_order.is_some()); diff --git a/crates/dry_run_core/src/schema/snapshot.rs b/crates/dry_run_core/src/schema/snapshot.rs new file mode 100644 index 0000000..52ae9e9 --- /dev/null +++ b/crates/dry_run_core/src/schema/snapshot.rs @@ -0,0 +1,606 @@ +use std::collections::BTreeMap; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; + +use super::types::{ColumnStats, Index, SchemaSnapshot, null_as_empty_vec}; + +#[derive(Debug, Clone)] +pub struct NodeImbalanceInfo { + pub hot_node: String, + pub multiplier: i64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StaleStatsEntry { + pub node: String, + pub schema: String, + pub table: String, + pub last_analyzed_days_ago: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct UnusedIndexEntry { + pub schema: String, + pub table: String, + pub index_name: String, + pub total_idx_scan: i64, + pub total_size_bytes: i64, + pub is_unique: bool, + pub definition: String, +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct QualifiedName { + pub schema: String, + pub name: String, +} + +impl QualifiedName { + pub fn new(schema: impl Into, name: impl Into) -> Self { + Self { + schema: schema.into(), + name: name.into(), + } + } +} + +impl std::fmt::Display for QualifiedName { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}.{}", self.schema, self.name) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TableSizing { + pub reltuples: f64, 
+ #[serde(default)] + pub relpages: i64, + pub table_size: i64, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub total_size: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub index_size: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TableActivity { + pub seq_scan: i64, + pub idx_scan: i64, + #[serde(default)] + pub n_live_tup: i64, + #[serde(default)] + pub n_dead_tup: i64, + pub last_vacuum: Option>, + pub last_autovacuum: Option>, + pub last_analyze: Option>, + pub last_autoanalyze: Option>, + #[serde(default)] + pub vacuum_count: i64, + #[serde(default)] + pub autovacuum_count: i64, + #[serde(default)] + pub analyze_count: i64, + #[serde(default)] + pub autoanalyze_count: i64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct IndexSizing { + pub size: i64, + #[serde(default)] + pub relpages: i64, + #[serde(default)] + pub reltuples: f64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct IndexActivity { + pub idx_scan: i64, + pub idx_tup_read: i64, + pub idx_tup_fetch: i64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NodeIdentity { + pub label: String, + pub host: String, + pub is_standby: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub replication_lag_bytes: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub stats_reset: Option>, +} + +// Vec<...Entry> rather than HashMap in the persisted shape: +// JSON map keys must be strings, and a tuple key (table, column) does not +// round-trip through serde_json. Readers build a HashMap on load. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TableSizingEntry { + pub table: QualifiedName, + pub sizing: TableSizing, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TableActivityEntry { + pub table: QualifiedName, + pub activity: TableActivity, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ColumnStatsEntry { + pub table: QualifiedName, + pub column: String, + pub stats: ColumnStats, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct IndexSizingEntry { + pub index: QualifiedName, + pub sizing: IndexSizing, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct IndexActivityEntry { + pub index: QualifiedName, + pub activity: IndexActivity, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PlannerStatsSnapshot { + pub pg_version: String, + pub database: String, + pub timestamp: DateTime, + pub content_hash: String, + pub schema_ref_hash: String, + #[serde(default, deserialize_with = "null_as_empty_vec")] + pub tables: Vec, + #[serde(default, deserialize_with = "null_as_empty_vec")] + pub columns: Vec, + #[serde(default, deserialize_with = "null_as_empty_vec")] + pub indexes: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ActivityStatsSnapshot { + pub pg_version: String, + pub database: String, + pub timestamp: DateTime, + pub content_hash: String, + pub schema_ref_hash: String, + pub node: NodeIdentity, + #[serde(default, deserialize_with = "null_as_empty_vec")] + pub tables: Vec, + #[serde(default, deserialize_with = "null_as_empty_vec")] + pub indexes: Vec, +} + +// In-memory views — never persisted, so no serde derive. 
+ +#[derive(Debug, Clone)] +pub enum NodeSelector { + All, + Some(Vec), +} + +#[derive(Debug)] +pub struct AnnotatedSchema<'a> { + pub schema: &'a SchemaSnapshot, + pub planner: Option<&'a PlannerStatsSnapshot>, + pub merged: Option>, +} + +#[derive(Debug)] +pub struct MergedActivity<'a> { + pub schema_ref_hash: String, + pub nodes: Vec<&'a ActivityStatsSnapshot>, + pub window_start: DateTime, + pub partial: bool, +} + +impl<'a> MergedActivity<'a> { + pub fn idx_scan_sum(&self, ix: &QualifiedName) -> i64 { + self.nodes + .iter() + .filter_map(|n| { + n.indexes + .iter() + .find(|e| &e.index == ix) + .map(|e| e.activity.idx_scan) + }) + .sum() + } + + pub fn idx_scan_per_node(&self, ix: &QualifiedName) -> Vec<(String, i64)> { + self.nodes + .iter() + .map(|n| { + let scan = n + .indexes + .iter() + .find(|e| &e.index == ix) + .map(|e| e.activity.idx_scan) + .unwrap_or(0); + (n.node.label.clone(), scan) + }) + .collect() + } + + pub fn seq_scan_sum(&self, t: &QualifiedName) -> i64 { + self.nodes + .iter() + .filter_map(|n| { + n.tables + .iter() + .find(|e| &e.table == t) + .map(|e| e.activity.seq_scan) + }) + .sum() + } + + pub fn seq_scan_per_node(&self, t: &QualifiedName) -> Vec<(String, i64)> { + self.nodes + .iter() + .map(|n| { + let scan = n + .tables + .iter() + .find(|e| &e.table == t) + .map(|e| e.activity.seq_scan) + .unwrap_or(0); + (n.node.label.clone(), scan) + }) + .collect() + } + + // max across nodes of max(last_vacuum, last_autovacuum) — "did anything vacuum" + pub fn last_vacuum_max(&self, t: &QualifiedName) -> Option> { + self.nodes + .iter() + .filter_map(|n| { + n.tables.iter().find(|e| &e.table == t).and_then(|e| { + match (e.activity.last_vacuum, e.activity.last_autovacuum) { + (Some(a), Some(b)) => Some(a.max(b)), + (Some(a), None) => Some(a), + (None, Some(b)) => Some(b), + (None, None) => None, + } + }) + }) + .max() + } + + pub fn n_dead_tup_sum(&self, t: &QualifiedName) -> i64 { + self.nodes + .iter() + .filter_map(|n| { + n.tables + 
.iter() + .find(|e| &e.table == t) + .map(|e| e.activity.n_dead_tup) + }) + .sum() + } + + pub fn last_analyze_max(&self, t: &QualifiedName) -> Option> { + self.nodes + .iter() + .filter_map(|n| { + n.tables.iter().find(|e| &e.table == t).and_then(|e| { + match (e.activity.last_analyze, e.activity.last_autoanalyze) { + (Some(a), Some(b)) => Some(a.max(b)), + (Some(a), None) => Some(a), + (None, Some(b)) => Some(b), + (None, None) => None, + } + }) + }) + .max() + } + + pub fn vacuum_count_sum(&self, t: &QualifiedName) -> i64 { + self.nodes + .iter() + .filter_map(|n| { + n.tables + .iter() + .find(|e| &e.table == t) + .map(|e| e.activity.vacuum_count + e.activity.autovacuum_count) + }) + .sum() + } +} + +// Planner reads serve sizing / column histograms; activity reads delegate +// to MergedActivity, which transparently aggregates across whatever nodes +// the snapshot has captured (one or many). When no activity is present +// the accessors return 0 / None / empty, so consumers never have to +// branch on "is there activity data". +impl<'a> AnnotatedSchema<'a> { + pub fn reltuples(&self, t: &QualifiedName) -> Option { + self.planner? + .tables + .iter() + .find(|e| &e.table == t) + .map(|e| e.sizing.reltuples) + } + + pub fn table_size(&self, t: &QualifiedName) -> Option { + self.planner? + .tables + .iter() + .find(|e| &e.table == t) + .map(|e| e.sizing.table_size) + } + + pub fn relpages(&self, t: &QualifiedName) -> Option { + self.planner? + .tables + .iter() + .find(|e| &e.table == t) + .map(|e| e.sizing.relpages) + } + + pub fn column_stats(&self, t: &QualifiedName, col: &str) -> Option<&'a ColumnStats> { + self.planner? + .columns + .iter() + .find(|e| &e.table == t && e.column == col) + .map(|e| &e.stats) + } + + pub fn index_sizing(&self, ix: &QualifiedName) -> Option<&'a IndexSizing> { + self.planner? 
+ .indexes + .iter() + .find(|e| &e.index == ix) + .map(|e| &e.sizing) + } + + pub fn idx_scan_sum(&self, ix: &QualifiedName) -> i64 { + self.merged.as_ref().map_or(0, |m| m.idx_scan_sum(ix)) + } + + pub fn idx_scan_per_node(&self, ix: &QualifiedName) -> Vec<(String, i64)> { + self.merged + .as_ref() + .map_or_else(Vec::new, |m| m.idx_scan_per_node(ix)) + } + + pub fn seq_scan_per_node(&self, t: &QualifiedName) -> Vec<(String, i64)> { + self.merged + .as_ref() + .map_or_else(Vec::new, |m| m.seq_scan_per_node(t)) + } + + pub fn seq_scan_sum(&self, t: &QualifiedName) -> i64 { + self.merged.as_ref().map_or(0, |m| m.seq_scan_sum(t)) + } + + pub fn n_dead_tup_sum(&self, t: &QualifiedName) -> i64 { + self.merged.as_ref().map_or(0, |m| m.n_dead_tup_sum(t)) + } + + pub fn last_vacuum_max(&self, t: &QualifiedName) -> Option> { + self.merged.as_ref().and_then(|m| m.last_vacuum_max(t)) + } + + pub fn last_analyze_max(&self, t: &QualifiedName) -> Option> { + self.merged.as_ref().and_then(|m| m.last_analyze_max(t)) + } + + pub fn vacuum_count_sum(&self, t: &QualifiedName) -> i64 { + self.merged.as_ref().map_or(0, |m| m.vacuum_count_sum(t)) + } +} + +#[derive(Debug, Clone)] +pub struct AnnotatedSnapshot { + pub schema: SchemaSnapshot, + pub planner: Option, + pub activity_by_node: BTreeMap, +} + +impl AnnotatedSnapshot { + pub fn view(&self) -> AnnotatedSchema<'_> { + AnnotatedSchema { + schema: &self.schema, + planner: self.planner.as_ref(), + merged: self.merged(&NodeSelector::All), + } + } + + pub fn merged(&self, selector: &NodeSelector) -> Option> { + let nodes: Vec<&ActivityStatsSnapshot> = match selector { + NodeSelector::All => self.activity_by_node.values().collect(), + NodeSelector::Some(labels) => labels + .iter() + .filter_map(|l| self.activity_by_node.get(l)) + .collect(), + }; + if nodes.is_empty() { + return None; + } + let schema_ref_hash = nodes[0].schema_ref_hash.clone(); + let partial = nodes.iter().any(|n| n.node.stats_reset.is_none()); + let window_start = 
nodes + .iter() + .map(|n| n.node.stats_reset.unwrap_or(n.timestamp)) + .min() + .unwrap_or(nodes[0].timestamp); + Some(MergedActivity { + schema_ref_hash, + nodes, + window_start, + partial, + }) + } + + pub fn node_labels(&self) -> impl Iterator { + self.activity_by_node.keys().map(|s| s.as_str()) + } + + // Indexes with zero scans across the requested nodes. Mirrors + // `detect_unused_indexes` (legacy NodeStats path), but reads from the + // activity_by_node map. Skips primary keys. + pub fn unused_indexes(&self, selector: &NodeSelector) -> Vec { + use std::collections::BTreeMap; + + let nodes: Vec<&ActivityStatsSnapshot> = match selector { + NodeSelector::All => self.activity_by_node.values().collect(), + NodeSelector::Some(labels) => labels + .iter() + .filter_map(|l| self.activity_by_node.get(l)) + .collect(), + }; + + // Build (qualified_index, sum, max_size) by walking each node's index activity, + // joined to the planner's index sizing for byte counts. + #[derive(Default)] + struct Agg { + total_idx_scan: i64, + max_size: i64, + } + let mut agg: BTreeMap = BTreeMap::new(); + for n in &nodes { + for ie in &n.indexes { + let entry = agg.entry(ie.index.clone()).or_default(); + entry.total_idx_scan += ie.activity.idx_scan; + } + } + if let Some(p) = &self.planner { + for ie in &p.indexes { + if let Some(entry) = agg.get_mut(&ie.index) + && ie.sizing.size > entry.max_size + { + entry.max_size = ie.sizing.size; + } + } + } + + let idx_lookup: BTreeMap<(&str, &str), &Index> = self + .schema + .tables + .iter() + .flat_map(|t| { + t.indexes + .iter() + .map(move |idx| (t.schema.as_str(), t.name.as_str(), idx)) + }) + .map(|(s, _t, idx)| ((s, idx.name.as_str()), idx)) + .collect(); + + let mut entries = Vec::new(); + for (qn, a) in &agg { + if a.total_idx_scan != 0 { + continue; + } + let idx_info = idx_lookup.get(&(qn.schema.as_str(), qn.name.as_str())); + if idx_info.is_some_and(|idx| idx.is_primary) { + continue; + } + + // table name comes from the schema's 
index → owning table mapping + let owning_table = self + .schema + .tables + .iter() + .find(|t| t.schema == qn.schema && t.indexes.iter().any(|idx| idx.name == qn.name)) + .map(|t| t.name.clone()) + .unwrap_or_default(); + + entries.push(UnusedIndexEntry { + schema: qn.schema.clone(), + table: owning_table, + index_name: qn.name.clone(), + total_idx_scan: 0, + total_size_bytes: a.max_size, + is_unique: idx_info.is_some_and(|idx| idx.is_unique), + definition: idx_info + .map(|idx| idx.definition.clone()) + .unwrap_or_default(), + }); + } + entries.sort_by_key(|b| std::cmp::Reverse(b.total_size_bytes)); + entries + } + + // Tables whose last_analyze (or last_autoanalyze) is older than `days`, + // or which have never been analyzed. One entry per (node, table). + pub fn stale_stats(&self, selector: &NodeSelector, days: i64) -> Vec { + let nodes: Vec<&ActivityStatsSnapshot> = match selector { + NodeSelector::All => self.activity_by_node.values().collect(), + NodeSelector::Some(labels) => labels + .iter() + .filter_map(|l| self.activity_by_node.get(l)) + .collect(), + }; + let now = chrono::Utc::now(); + let threshold = chrono::TimeDelta::days(days); + let mut entries = Vec::new(); + for n in nodes { + for te in &n.tables { + let last = te.activity.last_analyze.max(te.activity.last_autoanalyze); + match last { + Some(when) if now - when > threshold => { + entries.push(StaleStatsEntry { + node: n.node.label.clone(), + schema: te.table.schema.clone(), + table: te.table.name.clone(), + last_analyzed_days_ago: Some((now - when).num_days()), + }); + } + None => { + entries.push(StaleStatsEntry { + node: n.node.label.clone(), + schema: te.table.schema.clone(), + table: te.table.name.clone(), + last_analyzed_days_ago: None, + }); + } + _ => {} + } + } + } + entries + } + + // 5x+ seq_scan imbalance between hottest and coldest non-zero node. 
+ pub fn seq_scan_imbalance(&self, t: &QualifiedName) -> Option { + let scans: Vec<(&str, i64)> = self + .activity_by_node + .values() + .filter_map(|n| { + n.tables + .iter() + .find(|e| &e.table == t) + .map(|e| (n.node.label.as_str(), e.activity.seq_scan)) + }) + .collect(); + if scans.len() < 2 { + return None; + } + let nonzero: Vec<(&str, i64)> = scans.into_iter().filter(|(_, v)| *v > 0).collect(); + if nonzero.len() < 2 { + return None; + } + let min = nonzero.iter().map(|(_, v)| *v).min().unwrap_or(1); + let (hot_node, max) = nonzero + .iter() + .max_by_key(|(_, v)| *v) + .copied() + .unwrap_or(("", 1)); + if min > 0 && max / min >= 5 { + Some(NodeImbalanceInfo { + hot_node: hot_node.to_string(), + multiplier: max / min, + }) + } else { + None + } + } +} + +#[cfg(test)] +#[path = "snapshot_tests.rs"] +mod tests; diff --git a/crates/dry_run_core/src/schema/snapshot_tests.rs b/crates/dry_run_core/src/schema/snapshot_tests.rs new file mode 100644 index 0000000..0eb196b --- /dev/null +++ b/crates/dry_run_core/src/schema/snapshot_tests.rs @@ -0,0 +1,845 @@ +use super::super::types::*; +use super::*; + +#[test] +fn qualified_name_displays_schema_dot_name() { + let qn = QualifiedName::new("public", "orders"); + assert_eq!(qn.to_string(), "public.orders"); +} + +#[test] +fn qualified_name_round_trips_through_serde() { + let qn = QualifiedName::new("public", "orders"); + let json = serde_json::to_string(&qn).unwrap(); + let back: QualifiedName = serde_json::from_str(&json).unwrap(); + assert_eq!(back, qn); +} + +fn sample_planner_stats() -> PlannerStatsSnapshot { + PlannerStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "accounts".into(), + timestamp: Utc::now(), + content_hash: "abc123".into(), + schema_ref_hash: "def456".into(), + tables: vec![TableSizingEntry { + table: QualifiedName::new("public", "orders"), + sizing: TableSizing { + reltuples: 1234.0, + relpages: 42, + table_size: 1_000_000, + total_size: Some(2_000_000), + index_size: 
Some(1_000_000), + }, + }], + columns: vec![ColumnStatsEntry { + table: QualifiedName::new("public", "orders"), + column: "user_id".into(), + stats: ColumnStats { + null_frac: Some(0.0), + n_distinct: Some(-0.5), + most_common_vals: None, + most_common_freqs: None, + histogram_bounds: None, + correlation: Some(0.1), + }, + }], + indexes: vec![IndexSizingEntry { + index: QualifiedName::new("public", "orders_pkey"), + sizing: IndexSizing { + size: 8192, + relpages: 1, + reltuples: 1234.0, + }, + }], + } +} + +fn sample_activity_stats() -> ActivityStatsSnapshot { + ActivityStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "accounts".into(), + timestamp: Utc::now(), + content_hash: "h1".into(), + schema_ref_hash: "h2".into(), + node: NodeIdentity { + label: "primary".into(), + host: "10.0.0.1".into(), + is_standby: false, + replication_lag_bytes: None, + stats_reset: None, + }, + tables: vec![TableActivityEntry { + table: QualifiedName::new("public", "orders"), + activity: TableActivity { + seq_scan: 7, + idx_scan: 100, + n_live_tup: 1000, + n_dead_tup: 5, + last_vacuum: None, + last_autovacuum: None, + last_analyze: None, + last_autoanalyze: None, + vacuum_count: 0, + autovacuum_count: 1, + analyze_count: 0, + autoanalyze_count: 1, + }, + }], + indexes: vec![IndexActivityEntry { + index: QualifiedName::new("public", "orders_pkey"), + activity: IndexActivity { + idx_scan: 100, + idx_tup_read: 200, + idx_tup_fetch: 150, + }, + }], + } +} + +#[test] +fn planner_stats_round_trips_through_json() { + let snap = sample_planner_stats(); + let json = serde_json::to_string(&snap).unwrap(); + let back: PlannerStatsSnapshot = serde_json::from_str(&json).unwrap(); + assert_eq!(back.tables.len(), 1); + assert_eq!(back.tables[0].table, snap.tables[0].table); + assert_eq!(back.columns.len(), 1); + assert_eq!(back.columns[0].column, "user_id"); + assert_eq!(back.indexes.len(), 1); + assert_eq!(back.indexes[0].index.name, "orders_pkey"); + 
assert_eq!(back.schema_ref_hash, "def456"); +} + +#[test] +fn activity_stats_round_trips_through_json() { + let snap = sample_activity_stats(); + let json = serde_json::to_string(&snap).unwrap(); + let back: ActivityStatsSnapshot = serde_json::from_str(&json).unwrap(); + assert_eq!(back.node.label, "primary"); + assert!(!back.node.is_standby); + assert_eq!(back.tables[0].activity.seq_scan, 7); + assert_eq!(back.indexes[0].activity.idx_scan, 100); +} + +#[test] +fn activity_stats_accepts_missing_optional_fields() { + // Older payloads without the *_count fields and without lag should still load. + let json = r#"{ + "pg_version": "PostgreSQL 17.0", + "database": "accounts", + "timestamp": "2026-01-01T00:00:00Z", + "content_hash": "h1", + "schema_ref_hash": "h2", + "node": { + "label": "replica1", + "host": "10.0.0.2", + "is_standby": true + }, + "tables": [{ + "table": {"schema": "public", "name": "orders"}, + "activity": { + "seq_scan": 1, + "idx_scan": 2, + "last_vacuum": null, + "last_autovacuum": null, + "last_analyze": null, + "last_autoanalyze": null + } + }], + "indexes": [] + }"#; + let back: ActivityStatsSnapshot = serde_json::from_str(json).unwrap(); + assert!(back.node.is_standby); + assert!(back.node.replication_lag_bytes.is_none()); + assert_eq!(back.tables[0].activity.n_live_tup, 0); + assert_eq!(back.tables[0].activity.vacuum_count, 0); +} + +#[test] +fn node_selector_variants_are_constructable() { + let _ = NodeSelector::All; + match NodeSelector::Some(vec!["primary".into(), "replica1".into()]) { + NodeSelector::Some(v) => assert_eq!(v.len(), 2), + NodeSelector::All => panic!("wrong variant"), + } +} + +fn activity_for( + label: &str, + idx_scan: i64, + seq_scan: i64, + n_dead_tup: i64, + last_vacuum: Option>, + last_autovacuum: Option>, + stats_reset: Option>, +) -> ActivityStatsSnapshot { + ActivityStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "accounts".into(), + timestamp: Utc::now(), + content_hash: format!("hash-{label}"), 
+ schema_ref_hash: "schema-h".into(), + node: NodeIdentity { + label: label.into(), + host: format!("10.0.0.{label}"), + is_standby: label != "primary", + replication_lag_bytes: None, + stats_reset, + }, + tables: vec![TableActivityEntry { + table: QualifiedName::new("public", "orders"), + activity: TableActivity { + seq_scan, + idx_scan, + n_live_tup: 0, + n_dead_tup, + last_vacuum, + last_autovacuum, + last_analyze: None, + last_autoanalyze: None, + vacuum_count: 0, + autovacuum_count: 0, + analyze_count: 0, + autoanalyze_count: 0, + }, + }], + indexes: vec![IndexActivityEntry { + index: QualifiedName::new("public", "orders_pkey"), + activity: IndexActivity { + idx_scan, + idx_tup_read: 0, + idx_tup_fetch: 0, + }, + }], + } +} + +fn empty_schema_snap() -> SchemaSnapshot { + SchemaSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "accounts".into(), + timestamp: Utc::now(), + content_hash: "schema-h".into(), + source: None, + tables: vec![], + enums: vec![], + domains: vec![], + composites: vec![], + views: vec![], + functions: vec![], + extensions: vec![], + gucs: vec![], + } +} + +fn snap_with_nodes(nodes: Vec) -> AnnotatedSnapshot { + let mut activity_by_node = BTreeMap::new(); + for n in nodes { + activity_by_node.insert(n.node.label.clone(), n); + } + AnnotatedSnapshot { + schema: empty_schema_snap(), + planner: None, + activity_by_node, + } +} + +#[test] +fn merged_activity_idx_scan_sum_across_nodes() { + let snap = snap_with_nodes(vec![ + activity_for("primary", 10, 0, 0, None, None, None), + activity_for("replica1", 20, 0, 0, None, None, None), + activity_for("replica2", 5, 0, 0, None, None, None), + ]); + let merged = snap.merged(&NodeSelector::All).expect("3 nodes"); + let ix = QualifiedName::new("public", "orders_pkey"); + assert_eq!(merged.idx_scan_sum(&ix), 35); +} + +#[test] +fn merged_activity_idx_scan_per_node_returns_breakdown() { + let snap = snap_with_nodes(vec![ + activity_for("primary", 10, 0, 0, None, None, None), + 
activity_for("replica1", 20, 0, 0, None, None, None), + ]); + let merged = snap.merged(&NodeSelector::All).unwrap(); + let ix = QualifiedName::new("public", "orders_pkey"); + let per_node = merged.idx_scan_per_node(&ix); + // BTreeMap ordering: primary < replica1 + assert_eq!( + per_node, + vec![("primary".into(), 10), ("replica1".into(), 20)] + ); +} + +#[test] +fn merged_activity_seq_scan_sum_across_nodes() { + let snap = snap_with_nodes(vec![ + activity_for("primary", 0, 3, 0, None, None, None), + activity_for("replica1", 0, 7, 0, None, None, None), + ]); + let merged = snap.merged(&NodeSelector::All).unwrap(); + let t = QualifiedName::new("public", "orders"); + assert_eq!(merged.seq_scan_sum(&t), 10); +} + +#[test] +fn merged_activity_n_dead_tup_sums_across_nodes() { + let snap = snap_with_nodes(vec![ + activity_for("primary", 0, 0, 100, None, None, None), + activity_for("replica1", 0, 0, 50, None, None, None), + ]); + let merged = snap.merged(&NodeSelector::All).unwrap(); + let t = QualifiedName::new("public", "orders"); + assert_eq!(merged.n_dead_tup_sum(&t), 150); +} + +#[test] +fn merged_activity_last_vacuum_max_picks_max_across_nodes_and_kinds() { + let early = "2026-01-01T00:00:00Z".parse::>().unwrap(); + let mid = "2026-02-01T00:00:00Z".parse::>().unwrap(); + let late = "2026-03-01T00:00:00Z".parse::>().unwrap(); + let snap = snap_with_nodes(vec![ + // primary: manual at early, autovacuum at mid → node max = mid + activity_for("primary", 0, 0, 0, Some(early), Some(mid), None), + // replica1: autovacuum at late → node max = late + activity_for("replica1", 0, 0, 0, None, Some(late), None), + ]); + let merged = snap.merged(&NodeSelector::All).unwrap(); + let t = QualifiedName::new("public", "orders"); + assert_eq!(merged.last_vacuum_max(&t), Some(late)); +} + +#[test] +fn merged_activity_last_vacuum_max_returns_none_when_never_vacuumed() { + let snap = snap_with_nodes(vec![ + activity_for("primary", 0, 0, 0, None, None, None), + activity_for("replica1", 0, 
0, 0, None, None, None), + ]); + let merged = snap.merged(&NodeSelector::All).unwrap(); + let t = QualifiedName::new("public", "orders"); + assert_eq!(merged.last_vacuum_max(&t), None); +} + +#[test] +fn annotated_snapshot_view_with_no_activity_has_no_merged() { + let snap = snap_with_nodes(vec![]); + let view = snap.view(); + assert!(view.merged.is_none()); +} + +#[test] +fn annotated_snapshot_view_single_node_populates_merged() { + let snap = snap_with_nodes(vec![activity_for("primary", 1, 0, 0, None, None, None)]); + let view = snap.view(); + let merged = view + .merged + .expect("single node still produces a merged view"); + assert_eq!(merged.nodes.len(), 1); + assert_eq!(merged.nodes[0].node.label, "primary"); +} + +#[test] +fn annotated_snapshot_view_multi_node_populates_merged() { + let snap = snap_with_nodes(vec![ + activity_for("primary", 1, 0, 0, None, None, None), + activity_for("replica1", 2, 0, 0, None, None, None), + ]); + let view = snap.view(); + let merged = view.merged.expect("multi-node should produce merged view"); + assert_eq!(merged.nodes.len(), 2); +} + +#[test] +fn annotated_snapshot_merged_partial_when_any_node_lacks_reset() { + let reset = "2026-04-01T00:00:00Z".parse::>().unwrap(); + let snap = snap_with_nodes(vec![ + activity_for("primary", 0, 0, 0, None, None, Some(reset)), + activity_for("replica1", 0, 0, 0, None, None, None), + ]); + let merged = snap.merged(&NodeSelector::All).unwrap(); + assert!( + merged.partial, + "partial should be true when a node lacks stats_reset" + ); +} + +#[test] +fn annotated_snapshot_merged_window_start_is_min_reset() { + let early = "2026-01-01T00:00:00Z".parse::>().unwrap(); + let later = "2026-02-01T00:00:00Z".parse::>().unwrap(); + let snap = snap_with_nodes(vec![ + activity_for("primary", 0, 0, 0, None, None, Some(later)), + activity_for("replica1", 0, 0, 0, None, None, Some(early)), + ]); + let merged = snap.merged(&NodeSelector::All).unwrap(); + assert_eq!(merged.window_start, early); + 
assert!(!merged.partial); +} + +#[test] +fn annotated_snapshot_merged_node_selector_some_filters() { + let snap = snap_with_nodes(vec![ + activity_for("primary", 1, 0, 0, None, None, None), + activity_for("replica1", 2, 0, 0, None, None, None), + activity_for("replica2", 4, 0, 0, None, None, None), + ]); + let merged = snap + .merged(&NodeSelector::Some(vec![ + "replica1".into(), + "replica2".into(), + ])) + .unwrap(); + let ix = QualifiedName::new("public", "orders_pkey"); + assert_eq!(merged.idx_scan_sum(&ix), 6); + assert_eq!(merged.nodes.len(), 2); +} + +#[test] +fn annotated_snapshot_merged_returns_none_for_empty_selector() { + let snap = snap_with_nodes(vec![]); + assert!(snap.merged(&NodeSelector::All).is_none()); +} + +// ----------------------------------------------------------------------- +// Layer A: AnnotatedSchema accessors — planner reads + activity fall-through +// ----------------------------------------------------------------------- + +fn planner_for_orders(reltuples: f64, table_size: i64) -> PlannerStatsSnapshot { + PlannerStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "accounts".into(), + timestamp: Utc::now(), + content_hash: "ph".into(), + schema_ref_hash: "schema-h".into(), + tables: vec![TableSizingEntry { + table: QualifiedName::new("public", "orders"), + sizing: TableSizing { + reltuples, + relpages: 7, + table_size, + total_size: None, + index_size: None, + }, + }], + columns: vec![ColumnStatsEntry { + table: QualifiedName::new("public", "orders"), + column: "user_id".into(), + stats: ColumnStats { + null_frac: Some(0.1), + n_distinct: Some(-0.5), + most_common_vals: None, + most_common_freqs: None, + histogram_bounds: None, + correlation: Some(0.5), + }, + }], + indexes: vec![IndexSizingEntry { + index: QualifiedName::new("public", "orders_pkey"), + sizing: IndexSizing { + size: 16384, + relpages: 2, + reltuples, + }, + }], + } +} + +fn snap_with_planner(p: PlannerStatsSnapshot) -> AnnotatedSnapshot { + 
AnnotatedSnapshot { + schema: empty_schema_snap(), + planner: Some(p), + activity_by_node: BTreeMap::new(), + } +} + +fn snap_full( + planner: Option, + activity: Vec, +) -> AnnotatedSnapshot { + let mut activity_by_node = BTreeMap::new(); + for a in activity { + activity_by_node.insert(a.node.label.clone(), a); + } + AnnotatedSnapshot { + schema: empty_schema_snap(), + planner, + activity_by_node, + } +} + +#[test] +fn reltuples_reads_from_planner() { + let snap = snap_with_planner(planner_for_orders(1234.0, 1_000_000)); + let view = snap.view(); + assert_eq!( + view.reltuples(&QualifiedName::new("public", "orders")), + Some(1234.0) + ); +} + +#[test] +fn reltuples_returns_none_when_planner_missing() { + let snap = snap_full(None, vec![]); + let view = snap.view(); + assert!( + view.reltuples(&QualifiedName::new("public", "orders")) + .is_none() + ); +} + +#[test] +fn reltuples_returns_none_for_unknown_table() { + let snap = snap_with_planner(planner_for_orders(1234.0, 1_000_000)); + let view = snap.view(); + assert!( + view.reltuples(&QualifiedName::new("public", "ghost")) + .is_none() + ); +} + +#[test] +fn table_size_relpages_index_sizing_read_from_planner() { + let snap = snap_with_planner(planner_for_orders(50.0, 99)); + let view = snap.view(); + let t = QualifiedName::new("public", "orders"); + let ix = QualifiedName::new("public", "orders_pkey"); + assert_eq!(view.table_size(&t), Some(99)); + assert_eq!(view.relpages(&t), Some(7)); + assert_eq!(view.index_sizing(&ix).map(|s| s.size), Some(16384)); +} + +#[test] +fn column_stats_reads_from_planner() { + let snap = snap_with_planner(planner_for_orders(1.0, 1)); + let view = snap.view(); + let stats = view + .column_stats(&QualifiedName::new("public", "orders"), "user_id") + .expect("user_id stats"); + assert_eq!(stats.null_frac, Some(0.1)); + assert!( + view.column_stats(&QualifiedName::new("public", "orders"), "ghost") + .is_none() + ); +} + +#[test] +fn idx_scan_sum_falls_through_merged_to_single_to_zero() 
{ + let ix = QualifiedName::new("public", "orders_pkey"); + + // 1. multi-node activity → uses merged + let multi = snap_full( + None, + vec![ + activity_for("primary", 10, 0, 0, None, None, None), + activity_for("replica1", 5, 0, 0, None, None, None), + ], + ); + assert_eq!(multi.view().idx_scan_sum(&ix), 15); + + // 2. single-node activity, merged is None → reads single + let single = snap_full( + None, + vec![activity_for("primary", 7, 0, 0, None, None, None)], + ); + assert_eq!(single.view().idx_scan_sum(&ix), 7); + + // 3. no activity at all → 0 + let none = snap_full(None, vec![]); + assert_eq!(none.view().idx_scan_sum(&ix), 0); +} + +#[test] +fn seq_scan_sum_falls_through_merged_to_single_to_zero() { + let t = QualifiedName::new("public", "orders"); + let multi = snap_full( + None, + vec![ + activity_for("primary", 0, 3, 0, None, None, None), + activity_for("replica1", 0, 4, 0, None, None, None), + ], + ); + let single = snap_full( + None, + vec![activity_for("primary", 0, 9, 0, None, None, None)], + ); + let none = snap_full(None, vec![]); + assert_eq!(multi.view().seq_scan_sum(&t), 7); + assert_eq!(single.view().seq_scan_sum(&t), 9); + assert_eq!(none.view().seq_scan_sum(&t), 0); +} + +#[test] +fn n_dead_tup_sum_falls_through_merged_to_single_to_zero() { + let t = QualifiedName::new("public", "orders"); + let multi = snap_full( + None, + vec![ + activity_for("primary", 0, 0, 100, None, None, None), + activity_for("replica1", 0, 0, 50, None, None, None), + ], + ); + let single = snap_full( + None, + vec![activity_for("primary", 0, 0, 42, None, None, None)], + ); + let none = snap_full(None, vec![]); + assert_eq!(multi.view().n_dead_tup_sum(&t), 150); + assert_eq!(single.view().n_dead_tup_sum(&t), 42); + assert_eq!(none.view().n_dead_tup_sum(&t), 0); +} + +#[test] +fn last_vacuum_max_falls_through_merged_to_single_to_none() { + let t = QualifiedName::new("public", "orders"); + let early = "2026-01-01T00:00:00Z".parse::>().unwrap(); + let late = 
"2026-03-01T00:00:00Z".parse::>().unwrap(); + let multi = snap_full( + None, + vec![ + activity_for("primary", 0, 0, 0, Some(early), None, None), + activity_for("replica1", 0, 0, 0, None, Some(late), None), + ], + ); + let single = snap_full( + None, + vec![activity_for("primary", 0, 0, 0, Some(early), None, None)], + ); + let none = snap_full(None, vec![]); + assert_eq!(multi.view().last_vacuum_max(&t), Some(late)); + assert_eq!(single.view().last_vacuum_max(&t), Some(early)); + assert!(none.view().last_vacuum_max(&t).is_none()); +} + +#[test] +fn idx_scan_per_node_works_for_single_and_multi() { + let ix = QualifiedName::new("public", "orders_pkey"); + let single = snap_full( + None, + vec![activity_for("primary", 7, 0, 0, None, None, None)], + ); + assert_eq!( + single.view().idx_scan_per_node(&ix), + vec![("primary".into(), 7)] + ); + + let multi = snap_full( + None, + vec![ + activity_for("primary", 1, 0, 0, None, None, None), + activity_for("replica1", 2, 0, 0, None, None, None), + ], + ); + assert_eq!( + multi.view().idx_scan_per_node(&ix), + vec![("primary".into(), 1), ("replica1".into(), 2)], + ); + + let none = snap_full(None, vec![]); + assert!(none.view().idx_scan_per_node(&ix).is_empty()); +} + +#[test] +fn single_node_and_multi_node_one_node_parity_for_cluster_sums() { + // The "merged is None when only one node" trap: single-node activity vs. + // a one-entry activity_by_node map must produce the same totals. 
+ let ix = QualifiedName::new("public", "orders_pkey"); + let t = QualifiedName::new("public", "orders"); + // build via view default (single-node mode, merged = None) + let one = snap_full( + None, + vec![activity_for("primary", 11, 5, 3, None, None, None)], + ); + let view = one.view(); + assert_eq!(view.idx_scan_sum(&ix), 11); + assert_eq!(view.seq_scan_sum(&t), 5); + assert_eq!(view.n_dead_tup_sum(&t), 3); +} + +#[test] +fn no_panic_on_fully_empty_annotated() { + let snap = AnnotatedSnapshot { + schema: empty_schema_snap(), + planner: None, + activity_by_node: BTreeMap::new(), + }; + let view = snap.view(); + let t = QualifiedName::new("public", "orders"); + let ix = QualifiedName::new("public", "orders_pkey"); + assert!(view.reltuples(&t).is_none()); + assert!(view.table_size(&t).is_none()); + assert!(view.relpages(&t).is_none()); + assert!(view.column_stats(&t, "x").is_none()); + assert!(view.index_sizing(&ix).is_none()); + assert_eq!(view.seq_scan_sum(&t), 0); + assert_eq!(view.idx_scan_sum(&ix), 0); + assert!(view.idx_scan_per_node(&ix).is_empty()); + assert_eq!(view.n_dead_tup_sum(&t), 0); + assert!(view.last_vacuum_max(&t).is_none()); + assert!(view.last_analyze_max(&t).is_none()); + assert_eq!(view.vacuum_count_sum(&t), 0); +} + +// ----------------------------------------------------------------------- +// Layer A: AnnotatedSnapshot helpers — parity with legacy free functions +// ----------------------------------------------------------------------- + +fn schema_with_index_def(idx_name: &str, is_primary: bool, is_unique: bool) -> SchemaSnapshot { + SchemaSnapshot { + tables: vec![Table { + oid: 1, + schema: "public".into(), + name: "orders".into(), + columns: vec![], + constraints: vec![], + indexes: vec![Index { + name: idx_name.into(), + columns: vec!["id".into()], + include_columns: vec![], + index_type: "btree".into(), + is_unique, + is_primary, + predicate: None, + definition: format!("CREATE INDEX {idx_name} ON public.orders (id)"), + is_valid: 
true, + backs_constraint: false, + }], + comment: None, + partition_info: None, + policies: vec![], + triggers: vec![], + reloptions: vec![], + rls_enabled: false, + }], + ..empty_schema_snap() + } +} + +#[test] +fn unused_indexes_aggregates_across_nodes() { + let schema = schema_with_index_def("idx_dead", false, false); + let planner = PlannerStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "accounts".into(), + timestamp: Utc::now(), + content_hash: "ph".into(), + schema_ref_hash: "schema-h".into(), + tables: vec![], + columns: vec![], + indexes: vec![IndexSizingEntry { + index: QualifiedName::new("public", "idx_dead"), + sizing: IndexSizing { + size: 16384, + relpages: 2, + reltuples: 0.0, + }, + }], + }; + let mut activity_by_node = BTreeMap::new(); + for label in ["primary", "replica1"] { + activity_by_node.insert( + label.into(), + ActivityStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "accounts".into(), + timestamp: Utc::now(), + content_hash: format!("h-{label}"), + schema_ref_hash: "schema-h".into(), + node: NodeIdentity { + label: label.into(), + host: label.into(), + is_standby: label != "primary", + replication_lag_bytes: None, + stats_reset: None, + }, + tables: vec![], + indexes: vec![IndexActivityEntry { + index: QualifiedName::new("public", "idx_dead"), + activity: IndexActivity { + idx_scan: 0, + idx_tup_read: 0, + idx_tup_fetch: 0, + }, + }], + }, + ); + } + let snap = AnnotatedSnapshot { + schema, + planner: Some(planner), + activity_by_node, + }; + let result = snap.unused_indexes(&NodeSelector::All); + assert_eq!(result.len(), 1); + assert_eq!(result[0].index_name, "idx_dead"); + assert_eq!(result[0].total_size_bytes, 16384); + assert_eq!(result[0].total_idx_scan, 0); +} + +#[test] +fn unused_indexes_skips_primary_keys() { + let schema = schema_with_index_def("orders_pkey", true, true); + let snap = AnnotatedSnapshot { + schema, + planner: None, + activity_by_node: { + let mut m = BTreeMap::new(); + 
m.insert( + "primary".into(), + ActivityStatsSnapshot { + pg_version: "PostgreSQL 17.0".into(), + database: "accounts".into(), + timestamp: Utc::now(), + content_hash: "a".into(), + schema_ref_hash: "s".into(), + node: NodeIdentity { + label: "primary".into(), + host: "p".into(), + is_standby: false, + replication_lag_bytes: None, + stats_reset: None, + }, + tables: vec![], + indexes: vec![IndexActivityEntry { + index: QualifiedName::new("public", "orders_pkey"), + activity: IndexActivity { + idx_scan: 0, + idx_tup_read: 0, + idx_tup_fetch: 0, + }, + }], + }, + ); + m + }, + }; + assert!(snap.unused_indexes(&NodeSelector::All).is_empty()); +} + +#[test] +fn unused_indexes_empty_when_no_activity() { + let schema = schema_with_index_def("idx_dead", false, false); + let snap = AnnotatedSnapshot { + schema, + planner: None, + activity_by_node: BTreeMap::new(), + }; + assert!(snap.unused_indexes(&NodeSelector::All).is_empty()); +} + +#[test] +fn seq_scan_imbalance_flags_hot_node() { + let snap = snap_full( + None, + vec![ + activity_for("primary", 0, 1000, 0, None, None, None), + activity_for("replica1", 0, 100, 0, None, None, None), + ], + ); + let result = snap + .seq_scan_imbalance(&QualifiedName::new("public", "orders")) + .expect("10x imbalance should fire"); + assert_eq!(result.hot_node, "primary"); + assert_eq!(result.multiplier, 10); +} diff --git a/crates/dry_run_core/src/schema/types.rs b/crates/dry_run_core/src/schema/types.rs index b2a2744..3f49b24 100644 --- a/crates/dry_run_core/src/schema/types.rs +++ b/crates/dry_run_core/src/schema/types.rs @@ -1,7 +1,7 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Deserializer, Serialize}; -fn null_as_empty_vec<'de, D, T>(deserializer: D) -> Result, D::Error> +pub(super) fn null_as_empty_vec<'de, D, T>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, T: Deserialize<'de>, @@ -25,8 +25,6 @@ pub struct SchemaSnapshot { pub functions: Vec, pub extensions: Vec, pub gucs: Vec, - #[serde(default, 
skip_serializing_if = "Vec::is_empty")] - pub node_stats: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -41,7 +39,6 @@ pub struct Table { #[serde(default, deserialize_with = "null_as_empty_vec")] pub indexes: Vec, pub comment: Option, - pub stats: Option, pub partition_info: Option, #[serde(default, deserialize_with = "null_as_empty_vec")] pub policies: Vec, @@ -69,7 +66,6 @@ pub struct Column { pub comment: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub statistics_target: Option, - pub stats: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -125,41 +121,12 @@ pub struct Index { pub is_valid: bool, #[serde(default)] pub backs_constraint: bool, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub stats: Option, } fn default_true() -> bool { true } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct IndexStats { - pub idx_scan: i64, - pub idx_tup_read: i64, - pub idx_tup_fetch: i64, - pub size: i64, - #[serde(default)] - pub relpages: i64, - #[serde(default)] - pub reltuples: f64, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct TableStats { - pub reltuples: f64, - #[serde(default)] - pub relpages: i64, - pub dead_tuples: i64, - pub last_vacuum: Option>, - pub last_autovacuum: Option>, - pub last_analyze: Option>, - pub last_autoanalyze: Option>, - pub seq_scan: i64, - pub idx_scan: i64, - pub table_size: i64, -} - #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ColumnStats { pub null_frac: Option, @@ -312,709 +279,3 @@ pub struct GucSetting { pub setting: String, pub unit: Option, } - -pub fn aggregate_table_stats( - node_stats: &[NodeStats], - schema: &str, - table: &str, -) -> Option { - let matching: Vec<&TableStats> = node_stats - .iter() - .flat_map(|ns| &ns.table_stats) - .filter(|nts| nts.schema == schema && nts.table == table) - .map(|nts| &nts.stats) - .collect(); - - if matching.is_empty() { - return None; - } - - // max reltuples (all replicas should be 
close, take max for safety) - let reltuples = matching.iter().map(|s| s.reltuples).fold(0.0_f64, f64::max); - let relpages = matching.iter().map(|s| s.relpages).max().unwrap_or(0); - let dead_tuples = matching.iter().map(|s| s.dead_tuples).max().unwrap_or(0); - let seq_scan: i64 = matching.iter().map(|s| s.seq_scan).sum(); - let idx_scan: i64 = matching.iter().map(|s| s.idx_scan).sum(); - let table_size = matching.iter().map(|s| s.table_size).max().unwrap_or(0); - - // Vacuum/analyze timestamps only make sense from primary nodes - // (autovacuum doesn't run on standbys, so timestamps are always null there). - let primary_stats: Vec<&TableStats> = node_stats - .iter() - .filter(|ns| !ns.is_standby) - .flat_map(|ns| &ns.table_stats) - .filter(|nts| nts.schema == schema && nts.table == table) - .map(|nts| &nts.stats) - .collect(); - - let last_vacuum = primary_stats.iter().filter_map(|s| s.last_vacuum).max(); - let last_autovacuum = primary_stats.iter().filter_map(|s| s.last_autovacuum).max(); - let last_analyze = primary_stats.iter().filter_map(|s| s.last_analyze).max(); - let last_autoanalyze = primary_stats - .iter() - .filter_map(|s| s.last_autoanalyze) - .max(); - - Some(TableStats { - reltuples, - relpages, - dead_tuples, - last_vacuum, - last_autovacuum, - last_analyze, - last_autoanalyze, - seq_scan, - idx_scan, - table_size, - }) -} - -// Per-table summary aggregated across all nodes. -#[derive(Debug, Clone)] -pub struct TableSummary { - pub schema: String, - pub table: String, - pub total_seq_scan: i64, - pub total_idx_scan: i64, - /// (node source, seq_scan) for each node that has stats for this table. - pub per_node_seq: Vec<(String, i64)>, -} - -// Anomaly flag for a table's stats. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum TableFlag { - // seq_scan / idx_scan ratio is suspiciously high. - HighSeqIdxRatio, - // Table has seq_scans but zero idx_scans. - SeqScanOnly, - // One node handles disproportionately more seq_scans. 
- NodeImbalance, -} - -// Detected seq_scan imbalance across nodes. -#[derive(Debug, Clone)] -pub struct NodeImbalanceInfo { - pub hot_node: String, - pub multiplier: i64, -} - -// A single table with stale or missing analyze stats. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct StaleStatsEntry { - pub node: String, - pub schema: String, - pub table: String, - pub last_analyzed_days_ago: Option, -} - -// Aggregate per-table stats across all nodes, preserving per-node seq_scan breakdown. -pub fn summarize_table_stats(node_stats: &[NodeStats]) -> Vec { - use std::collections::BTreeMap; - - let mut agg: BTreeMap = BTreeMap::new(); - - for ns in node_stats { - for ts in &ns.table_stats { - let key = format!("{}.{}", ts.schema, ts.table); - let entry = agg.entry(key).or_insert_with(|| TableSummary { - schema: ts.schema.clone(), - table: ts.table.clone(), - total_seq_scan: 0, - total_idx_scan: 0, - per_node_seq: Vec::new(), - }); - entry.total_seq_scan += ts.stats.seq_scan; - entry.total_idx_scan += ts.stats.idx_scan; - entry - .per_node_seq - .push((ns.source.clone(), ts.stats.seq_scan)); - } - } - - agg.into_values().collect() -} - -// Compute anomaly flags for a single table summary. -pub fn detect_table_flags(summary: &TableSummary, node_stats: &[NodeStats]) -> Vec { - let mut flags = Vec::new(); - - if summary.total_seq_scan > 100 && summary.total_idx_scan > 0 { - let ratio = summary.total_seq_scan as f64 / summary.total_idx_scan as f64; - if ratio > 0.5 { - flags.push(TableFlag::HighSeqIdxRatio); - } - } else if summary.total_seq_scan > 100 && summary.total_idx_scan == 0 { - flags.push(TableFlag::SeqScanOnly); - } - - if detect_seq_scan_imbalance(node_stats, &summary.schema, &summary.table).is_some() { - flags.push(TableFlag::NodeImbalance); - } - - flags -} - -// Detect tables with stale or missing analyze stats across nodes. 
-pub fn detect_stale_stats(node_stats: &[NodeStats], stale_days: i64) -> Vec { - let now = chrono::Utc::now(); - let threshold = chrono::TimeDelta::days(stale_days); - let mut entries = Vec::new(); - - for ns in node_stats { - for ts in &ns.table_stats { - let last_analyzed = ts.stats.last_analyze.max(ts.stats.last_autoanalyze); - - match last_analyzed { - Some(when) if now - when > threshold => { - entries.push(StaleStatsEntry { - node: ns.source.clone(), - schema: ts.schema.clone(), - table: ts.table.clone(), - last_analyzed_days_ago: Some((now - when).num_days()), - }); - } - None => { - entries.push(StaleStatsEntry { - node: ns.source.clone(), - schema: ts.schema.clone(), - table: ts.table.clone(), - last_analyzed_days_ago: None, - }); - } - _ => {} - } - } - } - - entries -} - -/// Detect seq_scan imbalance for a single table across nodes. -/// Returns `Some` if max/min seq_scan >= 5x among nodes with nonzero scans. -pub fn detect_seq_scan_imbalance( - node_stats: &[NodeStats], - schema: &str, - table: &str, -) -> Option { - let seq_scans: Vec<(&str, i64)> = node_stats - .iter() - .filter_map(|ns| { - ns.table_stats - .iter() - .find(|t| t.table == table && t.schema == schema) - .map(|t| (ns.source.as_str(), t.stats.seq_scan)) - }) - .collect(); - - if seq_scans.len() < 2 { - return None; - } - - let nonzero: Vec<(&str, i64)> = seq_scans.into_iter().filter(|(_, v)| *v > 0).collect(); - if nonzero.len() < 2 { - return None; - } - - let min = nonzero.iter().map(|(_, v)| *v).min().unwrap_or(1); - let (hot_node, max) = nonzero - .iter() - .max_by_key(|(_, v)| *v) - .copied() - .unwrap_or(("", 1)); - - if min > 0 && max / min >= 5 { - Some(NodeImbalanceInfo { - hot_node: hot_node.to_string(), - multiplier: max / min, - }) - } else { - None - } -} - -// A single unused index (idx_scan = 0 across all nodes). 
-#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct UnusedIndexEntry { - pub schema: String, - pub table: String, - pub index_name: String, - pub total_idx_scan: i64, - pub total_size_bytes: i64, - pub is_unique: bool, - pub definition: String, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct BloatedIndexEntry { - pub schema: String, - pub table: String, - pub index_name: String, - pub bloat_ratio: f64, - pub actual_pages: i64, - pub expected_pages: i64, - pub definition: String, -} - -pub fn detect_bloated_indexes(tables: &[Table], threshold: f64) -> Vec { - let mut entries = Vec::new(); - - for table in tables { - for idx in &table.indexes { - if let Some(est) = super::bloat::estimate_index_bloat(idx, table) - && est.bloat_ratio > threshold - { - entries.push(BloatedIndexEntry { - schema: table.schema.clone(), - table: table.name.clone(), - index_name: idx.name.clone(), - bloat_ratio: est.bloat_ratio, - actual_pages: est.actual_pages, - expected_pages: est.expected_pages, - definition: idx.definition.clone(), - }); - } - } - } - - entries.sort_by(|a, b| { - b.bloat_ratio - .partial_cmp(&a.bloat_ratio) - .unwrap_or(std::cmp::Ordering::Equal) - }); - entries -} - -/// Detect indexes with zero scans across all nodes. -/// Skips primary key indexes — those are never droppable. -/// When `node_stats` is empty, falls back to `Table.indexes[].stats`. 
-pub fn detect_unused_indexes(node_stats: &[NodeStats], tables: &[Table]) -> Vec { - use std::collections::BTreeMap; - - let mut entries = Vec::new(); - - if node_stats.is_empty() { - // single-node fallback: use table-level index stats - for t in tables { - for idx in &t.indexes { - if idx.is_primary { - continue; - } - if let Some(ref stats) = idx.stats - && stats.idx_scan == 0 - { - entries.push(UnusedIndexEntry { - schema: t.schema.clone(), - table: t.name.clone(), - index_name: idx.name.clone(), - total_idx_scan: 0, - total_size_bytes: stats.size, - is_unique: idx.is_unique, - definition: idx.definition.clone(), - }); - } - } - } - } else { - // multi-node: aggregate idx_scan and size by (schema, table, index_name) - #[derive(Default)] - struct Agg { - total_idx_scan: i64, - max_size: i64, - } - - let mut agg: BTreeMap<(String, String, String), Agg> = BTreeMap::new(); - for ns in node_stats { - for is in &ns.index_stats { - let key = (is.schema.clone(), is.table.clone(), is.index_name.clone()); - let entry = agg.entry(key).or_default(); - entry.total_idx_scan += is.stats.idx_scan; - if is.stats.size > entry.max_size { - entry.max_size = is.stats.size; - } - } - } - - // build index lookup from tables - let idx_lookup: BTreeMap<(&str, &str, &str), &Index> = tables - .iter() - .flat_map(|t| { - t.indexes - .iter() - .map(move |idx| (t.schema.as_str(), t.name.as_str(), idx.name.as_str(), idx)) - }) - .map(|(s, t, n, idx)| ((s, t, n), idx)) - .collect(); - - for ((schema, table, index_name), a) in &agg { - if a.total_idx_scan != 0 { - continue; - } - - let idx_info = idx_lookup.get(&(schema.as_str(), table.as_str(), index_name.as_str())); - - // skip primary keys - if idx_info.is_some_and(|idx| idx.is_primary) { - continue; - } - - entries.push(UnusedIndexEntry { - schema: schema.clone(), - table: table.clone(), - index_name: index_name.clone(), - total_idx_scan: 0, - total_size_bytes: a.max_size, - is_unique: idx_info.is_some_and(|idx| idx.is_unique), - 
definition: idx_info
-                    .map(|idx| idx.definition.clone())
-                    .unwrap_or_default(),
-            });
-        }
-    }
-
-    // sort by size descending (biggest waste first)
-    entries.sort_by_key(|b| std::cmp::Reverse(b.total_size_bytes));
-    entries
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    fn make_index_stats(idx_scan: i64, size: i64) -> IndexStats {
-        IndexStats {
-            idx_scan,
-            idx_tup_read: 0,
-            idx_tup_fetch: 0,
-            size,
-            relpages: 0,
-            reltuples: 0.0,
-        }
-    }
-
-    fn make_index(
-        name: &str,
-        is_primary: bool,
-        is_unique: bool,
-        stats: Option<IndexStats>,
-    ) -> Index {
-        Index {
-            name: name.into(),
-            columns: vec!["col".into()],
-            include_columns: vec![],
-            index_type: "btree".into(),
-            is_unique,
-            is_primary,
-            predicate: None,
-            definition: format!("CREATE INDEX {name} ON t (col)"),
-            is_valid: true,
-            backs_constraint: false,
-            stats,
-        }
-    }
-
-    fn make_table(name: &str, indexes: Vec<Index>) -> Table {
-        Table {
-            oid: 0,
-            schema: "public".into(),
-            name: name.into(),
-            columns: vec![],
-            constraints: vec![],
-            indexes,
-            comment: None,
-            stats: None,
-            partition_info: None,
-            policies: vec![],
-            triggers: vec![],
-            reloptions: vec![],
-            rls_enabled: false,
-        }
-    }
-
-    fn make_node_stats(source: &str, index_stats: Vec<NodeIndexStats>) -> NodeStats {
-        NodeStats {
-            source: source.into(),
-            timestamp: chrono::Utc::now(),
-            is_standby: false,
-            table_stats: vec![],
-            index_stats,
-            column_stats: vec![],
-        }
-    }
-
-    // --- single-node (empty node_stats) tests ---
-
-    #[test]
-    fn test_single_node_unused_index_detected() {
-        let tables = vec![make_table(
-            "orders",
-            vec![make_index(
-                "idx_unused",
-                false,
-                false,
-                Some(make_index_stats(0, 8192)),
-            )],
-        )];
-
-        let result = detect_unused_indexes(&[], &tables);
-        assert_eq!(result.len(), 1);
-        assert_eq!(result[0].index_name, "idx_unused");
-        assert_eq!(result[0].total_size_bytes, 8192);
-    }
-
-    #[test]
-    fn test_single_node_used_index_not_reported() {
-        let tables = vec![make_table(
-            "orders",
-            vec![make_index(
-                "idx_used",
-                false,
-                false,
- Some(make_index_stats(42, 8192)), - )], - )]; - - let result = detect_unused_indexes(&[], &tables); - assert!(result.is_empty()); - } - - #[test] - fn test_single_node_primary_key_skipped() { - let tables = vec![make_table( - "orders", - vec![make_index( - "orders_pkey", - true, - true, - Some(make_index_stats(0, 8192)), - )], - )]; - - let result = detect_unused_indexes(&[], &tables); - assert!(result.is_empty()); - } - - #[test] - fn test_single_node_no_stats_skipped() { - let tables = vec![make_table( - "orders", - vec![make_index("idx_no_stats", false, false, None)], - )]; - - let result = detect_unused_indexes(&[], &tables); - assert!(result.is_empty()); - } - - #[test] - fn test_single_node_unique_flag_preserved() { - let tables = vec![make_table( - "orders", - vec![make_index( - "idx_unique_unused", - false, - true, - Some(make_index_stats(0, 4096)), - )], - )]; - - let result = detect_unused_indexes(&[], &tables); - assert_eq!(result.len(), 1); - assert!(result[0].is_unique); - } - - // --- multi-node tests --- - - #[test] - fn test_multi_node_unused_across_all_nodes() { - let tables = vec![make_table( - "orders", - vec![make_index("idx_unused", false, false, None)], - )]; - - let node_stats = vec![ - make_node_stats( - "node1", - vec![NodeIndexStats { - schema: "public".into(), - table: "orders".into(), - index_name: "idx_unused".into(), - stats: make_index_stats(0, 8192), - }], - ), - make_node_stats( - "node2", - vec![NodeIndexStats { - schema: "public".into(), - table: "orders".into(), - index_name: "idx_unused".into(), - stats: make_index_stats(0, 16384), - }], - ), - ]; - - let result = detect_unused_indexes(&node_stats, &tables); - assert_eq!(result.len(), 1); - assert_eq!(result[0].index_name, "idx_unused"); - // should use max size across nodes - assert_eq!(result[0].total_size_bytes, 16384); - } - - #[test] - fn test_multi_node_used_on_one_node_not_reported() { - let tables = vec![make_table( - "orders", - vec![make_index("idx_partial_use", 
false, false, None)],
-        )];
-
-        let node_stats = vec![
-            make_node_stats(
-                "node1",
-                vec![NodeIndexStats {
-                    schema: "public".into(),
-                    table: "orders".into(),
-                    index_name: "idx_partial_use".into(),
-                    stats: make_index_stats(0, 8192),
-                }],
-            ),
-            make_node_stats(
-                "node2",
-                vec![NodeIndexStats {
-                    schema: "public".into(),
-                    table: "orders".into(),
-                    index_name: "idx_partial_use".into(),
-                    stats: make_index_stats(5, 8192),
-                }],
-            ),
-        ];
-
-        let result = detect_unused_indexes(&node_stats, &tables);
-        assert!(result.is_empty());
-    }
-
-    #[test]
-    fn test_multi_node_primary_key_skipped() {
-        let tables = vec![make_table(
-            "orders",
-            vec![make_index("orders_pkey", true, true, None)],
-        )];
-
-        let node_stats = vec![make_node_stats(
-            "node1",
-            vec![NodeIndexStats {
-                schema: "public".into(),
-                table: "orders".into(),
-                index_name: "orders_pkey".into(),
-                stats: make_index_stats(0, 8192),
-            }],
-        )];
-
-        let result = detect_unused_indexes(&node_stats, &tables);
-        assert!(result.is_empty());
-    }
-
-    #[test]
-    fn test_multi_node_sorted_by_size_desc() {
-        let tables = vec![make_table(
-            "orders",
-            vec![
-                make_index("idx_small", false, false, None),
-                make_index("idx_big", false, false, None),
-            ],
-        )];
-
-        let node_stats = vec![make_node_stats(
-            "node1",
-            vec![
-                NodeIndexStats {
-                    schema: "public".into(),
-                    table: "orders".into(),
-                    index_name: "idx_small".into(),
-                    stats: make_index_stats(0, 1024),
-                },
-                NodeIndexStats {
-                    schema: "public".into(),
-                    table: "orders".into(),
-                    index_name: "idx_big".into(),
-                    stats: make_index_stats(0, 999_999),
-                },
-            ],
-        )];
-
-        let result = detect_unused_indexes(&node_stats, &tables);
-        assert_eq!(result.len(), 2);
-        assert_eq!(result[0].index_name, "idx_big");
-        assert_eq!(result[1].index_name, "idx_small");
-    }
-
-    #[test]
-    fn test_multi_node_unknown_index_still_reported() {
-        // index in node_stats but not in tables — should still appear with defaults
-        let tables: Vec<Table> 
= vec![];
-
-        let node_stats = vec![make_node_stats(
-            "node1",
-            vec![NodeIndexStats {
-                schema: "public".into(),
-                table: "orders".into(),
-                index_name: "idx_ghost".into(),
-                stats: make_index_stats(0, 4096),
-            }],
-        )];
-
-        let result = detect_unused_indexes(&node_stats, &tables);
-        assert_eq!(result.len(), 1);
-        assert_eq!(result[0].index_name, "idx_ghost");
-        assert!(!result[0].is_unique);
-        assert!(result[0].definition.is_empty());
-    }
-
-    #[test]
-    fn test_empty_inputs_returns_empty() {
-        let result = detect_unused_indexes(&[], &[]);
-        assert!(result.is_empty());
-    }
-}
-
-// use aggregated multi-node stats over table-level stats
-pub fn effective_table_stats(table: &Table, schema: &SchemaSnapshot) -> Option<TableStats> {
-    if !schema.node_stats.is_empty()
-        && let Some(agg) = aggregate_table_stats(&schema.node_stats, &table.schema, &table.name)
-    {
-        return Some(agg);
-    }
-    table.stats.clone()
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct NodeStats {
-    pub source: String,
-    pub timestamp: DateTime<Utc>,
-    #[serde(default)]
-    pub is_standby: bool,
-    pub table_stats: Vec<NodeTableStats>,
-    pub index_stats: Vec<NodeIndexStats>,
-    #[serde(default, skip_serializing_if = "Vec::is_empty")]
-    pub column_stats: Vec<NodeColumnStats>,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct NodeTableStats {
-    pub schema: String,
-    pub table: String,
-    pub stats: TableStats,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct NodeIndexStats {
-    pub schema: String,
-    pub table: String,
-    pub index_name: String,
-    pub stats: IndexStats,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct NodeColumnStats {
-    pub schema: String,
-    pub table: String,
-    pub column: String,
-    pub stats: ColumnStats,
-}
diff --git a/crates/dry_run_core/src/schema/vacuum.rs b/crates/dry_run_core/src/schema/vacuum.rs
index 3fa6951..7851105 100644
--- a/crates/dry_run_core/src/schema/vacuum.rs
+++ b/crates/dry_run_core/src/schema/vacuum.rs
@@ -1,6 +1,7 @@
 use serde::{Deserialize, Serialize};
 
-use 
super::types::{GucSetting, SchemaSnapshot, effective_table_stats};
+use super::snapshot::{AnnotatedSchema, QualifiedName};
+use super::types::GucSetting;
 
 #[derive(Debug, Clone)]
 pub struct AutovacuumDefaults {
@@ -78,15 +79,17 @@ fn parse_reloptions(reloptions: &[String]) -> std::collections::HashMap<String, String>
 
-pub fn analyze_vacuum_health(snap: &SchemaSnapshot) -> Vec<VacuumHealth> {
-    let defaults = parse_autovacuum_defaults(&snap.gucs);
+pub fn analyze_vacuum_health(annotated: &AnnotatedSchema<'_>) -> Vec<VacuumHealth> {
+    let defaults = parse_autovacuum_defaults(&annotated.schema.gucs);
     let mut results = Vec::new();
 
-    for table in &snap.tables {
-        let stats = match effective_table_stats(table, snap) {
-            Some(s) if s.reltuples >= 10_000.0 => s,
+    for table in &annotated.schema.tables {
+        let qn = QualifiedName::new(&table.schema, &table.name);
+        let reltuples = match annotated.reltuples(&qn) {
+            Some(r) if r >= 10_000.0 => r,
             _ => continue,
         };
+        let dead_tuples = annotated.n_dead_tup_sum(&qn);
 
         let opts = parse_reloptions(&table.reloptions);
         let has_overrides = opts.keys().any(|k| k.starts_with("autovacuum_"));
@@ -121,10 +124,10 @@ pub fn analyze_vacuum_health(snap: &SchemaSnapshot) -> Vec<VacuumHealth> {
             av_enabled = v == "on" || v == "true";
         }
 
-        let trigger_at = threshold as f64 + scale_factor * stats.reltuples;
-        let analyze_trigger = analyze_threshold as f64 + analyze_scale_factor * stats.reltuples;
+        let trigger_at = threshold as f64 + scale_factor * reltuples;
+        let analyze_trigger = analyze_threshold as f64 + analyze_scale_factor * reltuples;
         let progress = if trigger_at > 0.0 {
-            stats.dead_tuples as f64 / trigger_at
+            dead_tuples as f64 / trigger_at
         } else {
             0.0
         };
@@ -138,8 +141,8 @@ pub fn analyze_vacuum_health(snap: &SchemaSnapshot) -> Vec<VacuumHealth> {
             );
         }
 
-        if stats.reltuples >= 1_000_000.0 && !has_overrides {
-            let mut suggested_vac_sf = 100_000.0 / stats.reltuples;
+        if reltuples >= 1_000_000.0 && !has_overrides {
+            let mut suggested_vac_sf = 100_000.0 / reltuples;
             suggested_vac_sf = (suggested_vac_sf * 1000.0).round() / 1000.0;
 
             if suggested_vac_sf < 0.001 {
                 
suggested_vac_sf = 0.001; @@ -147,7 +150,7 @@ pub fn analyze_vacuum_health(snap: &SchemaSnapshot) -> Vec { let suggested_az_sf = (suggested_vac_sf / 2.0 * 1000.0).round() / 1000.0; // threshold: ~1% of rows, clamped to 500..5000 - let suggested_vac_thresh = ((stats.reltuples * 0.01) as i64).clamp(500, 5000); + let suggested_vac_thresh = ((reltuples * 0.01) as i64).clamp(500, 5000); let suggested_az_thresh = (suggested_vac_thresh / 2).max(250); recommendations.push(format!( @@ -156,16 +159,16 @@ pub fn analyze_vacuum_health(snap: &SchemaSnapshot) -> Vec { autovacuum_vacuum_threshold={suggested_vac_thresh}, \ autovacuum_analyze_scale_factor={suggested_az_sf}, \ autovacuum_analyze_threshold={suggested_az_thresh}", - stats.reltuples as i64 / 1000 + reltuples as i64 / 1000 )); } - if stats.reltuples > 0.0 && stats.dead_tuples as f64 / stats.reltuples > 0.10 { + if reltuples > 0.0 && dead_tuples as f64 / reltuples > 0.10 { recommendations.push(format!( "high dead tuple ratio: {} dead / {}k live ({:.1}%)", - stats.dead_tuples, - stats.reltuples as i64 / 1000, - stats.dead_tuples as f64 / stats.reltuples * 100.0 + dead_tuples, + reltuples as i64 / 1000, + dead_tuples as f64 / reltuples * 100.0 )); } @@ -179,8 +182,8 @@ pub fn analyze_vacuum_health(snap: &SchemaSnapshot) -> Vec { results.push(VacuumHealth { schema: table.schema.clone(), table: table.name.clone(), - reltuples: stats.reltuples, - dead_tuples: stats.dead_tuples, + reltuples, + dead_tuples, vacuum_trigger_at: trigger_at, vacuum_progress: progress, has_overrides, @@ -203,135 +206,5 @@ pub fn analyze_vacuum_health(snap: &SchemaSnapshot) -> Vec { } #[cfg(test)] -mod tests { - use super::*; - use crate::schema::*; - - fn make_table_with_stats(name: &str, reltuples: f64, dead: i64) -> Table { - Table { - oid: 0, - schema: "public".into(), - name: name.into(), - columns: vec![], - constraints: vec![], - indexes: vec![], - comment: None, - stats: Some(TableStats { - reltuples, - relpages: 1000, - dead_tuples: dead, - 
last_vacuum: None,
-                last_autovacuum: None,
-                last_analyze: None,
-                last_autoanalyze: None,
-                seq_scan: 0,
-                idx_scan: 0,
-                table_size: 0,
-            }),
-            partition_info: None,
-            policies: vec![],
-            triggers: vec![],
-            reloptions: vec![],
-            rls_enabled: false,
-        }
-    }
-
-    fn make_snap(tables: Vec<Table>
) -> SchemaSnapshot { - SchemaSnapshot { - pg_version: "16.0".into(), - database: "test".into(), - timestamp: chrono::Utc::now(), - content_hash: String::new(), - source: None, - tables, - enums: vec![], - domains: vec![], - composites: vec![], - views: vec![], - functions: vec![], - extensions: vec![], - gucs: vec![], - node_stats: vec![], - } - } - - #[test] - fn skips_small_tables() { - let snap = make_snap(vec![make_table_with_stats("tiny", 100.0, 10)]); - let results = analyze_vacuum_health(&snap); - assert!(results.is_empty()); - } - - #[test] - fn reports_large_table_with_defaults() { - let snap = make_snap(vec![make_table_with_stats("big", 5_000_000.0, 100)]); - let results = analyze_vacuum_health(&snap); - assert_eq!(results.len(), 1); - assert!( - results[0] - .recommendations - .iter() - .any(|r| r.contains("large table")) - ); - } - - #[test] - fn reports_high_dead_ratio() { - let snap = make_snap(vec![make_table_with_stats("dirty", 100_000.0, 20_000)]); - let results = analyze_vacuum_health(&snap); - assert_eq!(results.len(), 1); - assert!( - results[0] - .recommendations - .iter() - .any(|r| r.contains("high dead tuple")) - ); - } - - #[test] - fn disabled_autovacuum_warns() { - let mut table = make_table_with_stats("bad", 100_000.0, 100); - table.reloptions = vec!["autovacuum_enabled=false".into()]; - let snap = make_snap(vec![table]); - let results = analyze_vacuum_health(&snap); - assert_eq!(results.len(), 1); - assert!( - results[0] - .recommendations - .iter() - .any(|r| r.contains("disabled")) - ); - assert!(!results[0].autovacuum_enabled); - } - - #[test] - fn parses_defaults_from_gucs() { - let gucs = vec![ - GucSetting { - name: "autovacuum_vacuum_threshold".into(), - setting: "100".into(), - unit: None, - }, - GucSetting { - name: "autovacuum_vacuum_scale_factor".into(), - setting: "0.05".into(), - unit: None, - }, - GucSetting { - name: "autovacuum_analyze_threshold".into(), - setting: "200".into(), - unit: None, - }, - GucSetting { - name: 
"autovacuum_analyze_scale_factor".into(),
-                setting: "0.02".into(),
-                unit: None,
-            },
-        ];
-        let d = parse_autovacuum_defaults(&gucs);
-        assert_eq!(d.vacuum_threshold, 100);
-        assert!((d.vacuum_scale_factor - 0.05).abs() < f64::EPSILON);
-        assert_eq!(d.analyze_threshold, 200);
-        assert!((d.analyze_scale_factor - 0.02).abs() < f64::EPSILON);
-    }
-}
+#[path = "vacuum_tests.rs"]
+mod tests;
diff --git a/crates/dry_run_core/src/schema/vacuum_tests.rs b/crates/dry_run_core/src/schema/vacuum_tests.rs
new file mode 100644
index 0000000..c75f59c
--- /dev/null
+++ b/crates/dry_run_core/src/schema/vacuum_tests.rs
@@ -0,0 +1,298 @@
+use std::collections::BTreeMap;
+
+use super::*;
+use crate::schema::*;
+
+fn ddl_table(name: &str) -> Table {
+    Table {
+        oid: 0,
+        schema: "public".into(),
+        name: name.into(),
+        columns: vec![],
+        constraints: vec![],
+        indexes: vec![],
+        comment: None,
+        partition_info: None,
+        policies: vec![],
+        triggers: vec![],
+        reloptions: vec![],
+        rls_enabled: false,
+    }
+}
+
+fn make_snap(tables: Vec<Table>
) -> SchemaSnapshot {
+    SchemaSnapshot {
+        pg_version: "16.0".into(),
+        database: "test".into(),
+        timestamp: chrono::Utc::now(),
+        content_hash: String::new(),
+        source: None,
+        tables,
+        enums: vec![],
+        domains: vec![],
+        composites: vec![],
+        views: vec![],
+        functions: vec![],
+        extensions: vec![],
+        gucs: vec![],
+    }
+}
+
+fn annotated(
+    tables: Vec<Table>
,
+    sizing: Vec<(&str, f64, i64)>,
+    dead_by_table: Vec<(&str, i64)>,
+) -> AnnotatedSnapshot {
+    let schema = make_snap(tables);
+    let planner = PlannerStatsSnapshot {
+        pg_version: "16.0".into(),
+        database: "test".into(),
+        timestamp: chrono::Utc::now(),
+        content_hash: "ph".into(),
+        schema_ref_hash: "sh".into(),
+        tables: sizing
+            .into_iter()
+            .map(|(name, reltuples, table_size)| TableSizingEntry {
+                table: QualifiedName::new("public", name),
+                sizing: TableSizing {
+                    reltuples,
+                    relpages: 1000,
+                    table_size,
+                    total_size: None,
+                    index_size: None,
+                },
+            })
+            .collect(),
+        columns: vec![],
+        indexes: vec![],
+    };
+    let activity = ActivityStatsSnapshot {
+        pg_version: "16.0".into(),
+        database: "test".into(),
+        timestamp: chrono::Utc::now(),
+        content_hash: "ah".into(),
+        schema_ref_hash: "sh".into(),
+        node: NodeIdentity {
+            label: "primary".into(),
+            host: "p".into(),
+            is_standby: false,
+            replication_lag_bytes: None,
+            stats_reset: None,
+        },
+        tables: dead_by_table
+            .into_iter()
+            .map(|(name, dead)| TableActivityEntry {
+                table: QualifiedName::new("public", name),
+                activity: TableActivity {
+                    seq_scan: 0,
+                    idx_scan: 0,
+                    n_live_tup: 0,
+                    n_dead_tup: dead,
+                    last_vacuum: None,
+                    last_autovacuum: None,
+                    last_analyze: None,
+                    last_autoanalyze: None,
+                    vacuum_count: 0,
+                    autovacuum_count: 0,
+                    analyze_count: 0,
+                    autoanalyze_count: 0,
+                },
+            })
+            .collect(),
+        indexes: Vec::<IndexActivityEntry>::new(),
+    };
+    let mut activity_by_node = BTreeMap::new();
+    activity_by_node.insert("primary".into(), activity);
+    AnnotatedSnapshot {
+        schema,
+        planner: Some(planner),
+        activity_by_node,
+    }
+}
+
+#[test]
+fn skips_small_tables() {
+    let snap = annotated(
+        vec![ddl_table("tiny")],
+        vec![("tiny", 100.0, 0)],
+        vec![("tiny", 10)],
+    );
+    let results = analyze_vacuum_health(&snap.view());
+    assert!(results.is_empty());
+}
+
+#[test]
+fn reports_large_table_with_defaults() {
+    let snap = annotated(
+        vec![ddl_table("big")],
+        vec![("big", 5_000_000.0, 0)],
+        vec![("big", 100)],
+ ); + let results = analyze_vacuum_health(&snap.view()); + assert_eq!(results.len(), 1); + assert!( + results[0] + .recommendations + .iter() + .any(|r| r.contains("large table")) + ); +} + +#[test] +fn reports_high_dead_ratio() { + let snap = annotated( + vec![ddl_table("dirty")], + vec![("dirty", 100_000.0, 0)], + vec![("dirty", 20_000)], + ); + let results = analyze_vacuum_health(&snap.view()); + assert_eq!(results.len(), 1); + assert!( + results[0] + .recommendations + .iter() + .any(|r| r.contains("high dead tuple")) + ); +} + +#[test] +fn disabled_autovacuum_warns() { + let mut table = ddl_table("bad"); + table.reloptions = vec!["autovacuum_enabled=false".into()]; + let snap = annotated(vec![table], vec![("bad", 100_000.0, 0)], vec![("bad", 100)]); + let results = analyze_vacuum_health(&snap.view()); + assert_eq!(results.len(), 1); + assert!( + results[0] + .recommendations + .iter() + .any(|r| r.contains("disabled")) + ); + assert!(!results[0].autovacuum_enabled); +} + +#[test] +fn skipped_when_planner_absent() { + // Degradation case: schema has the table but planner is None → reltuples + // returns None → skipped. Pins the new "no data → no findings" path. + let snap = AnnotatedSnapshot { + schema: make_snap(vec![ddl_table("big")]), + planner: None, + activity_by_node: BTreeMap::new(), + }; + assert!(analyze_vacuum_health(&snap.view()).is_empty()); +} + +#[test] +fn dead_tuples_summed_across_replicas() { + // 3-node cluster, dead_tuples reported per node. Cluster sum drives the + // ratio check. 
+ let schema = make_snap(vec![ddl_table("hot")]); + let planner = PlannerStatsSnapshot { + pg_version: "16.0".into(), + database: "test".into(), + timestamp: chrono::Utc::now(), + content_hash: "ph".into(), + schema_ref_hash: "sh".into(), + tables: vec![TableSizingEntry { + table: QualifiedName::new("public", "hot"), + sizing: TableSizing { + reltuples: 100_000.0, + relpages: 1000, + table_size: 0, + total_size: None, + index_size: None, + }, + }], + columns: vec![], + indexes: vec![], + }; + let mut activity_by_node = BTreeMap::new(); + for (label, dead) in [ + ("primary", 8_000_i64), + ("replica1", 7_000), + ("replica2", 6_000), + ] { + activity_by_node.insert( + label.into(), + ActivityStatsSnapshot { + pg_version: "16.0".into(), + database: "test".into(), + timestamp: chrono::Utc::now(), + content_hash: format!("h-{label}"), + schema_ref_hash: "sh".into(), + node: NodeIdentity { + label: label.into(), + host: label.into(), + is_standby: label != "primary", + replication_lag_bytes: None, + stats_reset: None, + }, + tables: vec![TableActivityEntry { + table: QualifiedName::new("public", "hot"), + activity: TableActivity { + seq_scan: 0, + idx_scan: 0, + n_live_tup: 0, + n_dead_tup: dead, + last_vacuum: None, + last_autovacuum: None, + last_analyze: None, + last_autoanalyze: None, + vacuum_count: 0, + autovacuum_count: 0, + analyze_count: 0, + autoanalyze_count: 0, + }, + }], + indexes: vec![], + }, + ); + } + let snap = AnnotatedSnapshot { + schema, + planner: Some(planner), + activity_by_node, + }; + let results = analyze_vacuum_health(&snap.view()); + assert_eq!(results.len(), 1); + // 8k+7k+6k = 21k dead vs 100k live → 21% > 10% threshold + assert_eq!(results[0].dead_tuples, 21_000); + assert!( + results[0] + .recommendations + .iter() + .any(|r| r.contains("high dead tuple")) + ); +} + +#[test] +fn parses_defaults_from_gucs() { + let gucs = vec![ + GucSetting { + name: "autovacuum_vacuum_threshold".into(), + setting: "100".into(), + unit: None, + }, + 
GucSetting { + name: "autovacuum_vacuum_scale_factor".into(), + setting: "0.05".into(), + unit: None, + }, + GucSetting { + name: "autovacuum_analyze_threshold".into(), + setting: "200".into(), + unit: None, + }, + GucSetting { + name: "autovacuum_analyze_scale_factor".into(), + setting: "0.02".into(), + unit: None, + }, + ]; + let d = parse_autovacuum_defaults(&gucs); + assert_eq!(d.vacuum_threshold, 100); + assert!((d.vacuum_scale_factor - 0.05).abs() < f64::EPSILON); + assert_eq!(d.analyze_threshold, 200); + assert!((d.analyze_scale_factor - 0.02).abs() < f64::EPSILON); +}