
Commit 9953be3

connortsui20 and claude committed
[claude] chore(benchmarks-migrate): post-alpha cleanup nits (#7671)

Six small fixes left over from the v3 migration alpha. All paths are relative to `benchmarks-website/migrate/` unless noted.

## Fixes

- **Scale-factor canonicalization** (`src/classifier.rs::bin_compression_size`, `src/migrate.rs::migrate_file_sizes`, helper in `src/v2.rs`): both paths now route the v2 SF string through `canonical_scale_factor`, which parses to `f64` and formats with no trailing zeros (sketched below). Without this, `"1"` vs `"1.0"` and `"10"` vs `"10.0"` would produce different `dataset_variant` strings and prevent the data.json.gz and file-sizes-*.json.gz rows from sharing a `measurement_id`.
- **Summary counter timing** (`src/migrate.rs::run`): per-fact counters used to be set from accumulator length *before* the flush, so a flush failure would print a summary that lied. Refactored into a `flush_all` helper that bumps `summary.<fact>_inserted` from the flushed `RecordBatch::num_rows()` only after each `Appender::append_record_batch` succeeds.
- **Empty-string normalization in commits** (`src/commits.rs`, `benchmarks-website/server/src/schema.rs`, `benchmarks-website/server/src/api.rs`): `message`, `author_name`/`email`, and `committer_name`/`email` now bind as `Option<String>` and store SQL `NULL` when v2 supplied an empty or whitespace-only string. Schema columns were made nullable; server reads use `COALESCE(c.message, '')` so the existing `String` decoder still works.
- **Orphan WAL cleanup** (`src/migrate.rs::open_target_db`): the existing code already attempts `remove_if_exists` on the `.wal` regardless of whether the main file was present; pinned the behavior with a regression test that stages an orphan `.wal` (no main file) and asserts the orphan bytes don't survive `open_target_db`.
- **Random-access dataset extraction** (`src/classifier.rs::bin_random_access`): 4-part records `random-access/<dataset>/<pattern>/<format>-tokio-local-disk` continue to extract `dataset/pattern` from the raw name. 2-part legacy records carry no dataset and used to render under the placeholder `"random access"`; they're now dropped to keep the v3 dataset column meaningful.
- **`migrate_file_sizes` dataset fallback** (`src/migrate.rs::migrate_file_sizes`): when the matrix id stripped from `file-sizes-<id>.json.gz` isn't on the `KNOWN_FILE_SIZES_SUITES` allowlist, the fallback now emits `unknown:<id>` so the UI clearly flags it instead of presenting it as a real dataset.

## Tests

Each fix has a focused regression test (`rstest` parametrization where useful):

- `tests/classifier.rs::compression_size_scale_factor_canonicalizes`, covering `"1"`, `"1.0"`, `"10"`, `"10.0"`, `"0.1"`, whitespace, and `""`.
- `tests/classifier.rs::unmapped_records_yield_none`, extended with `random_access_2_part_legacy` and `random_access_3_part`.
- `migrate::tests::flush_all_does_not_overcount_on_failure` (private unit test that drops `compression_times` to force the second flush to fail and asserts only the queries counter is set).
- `tests/end_to_end.rs::summary_counts_match_actual_rows_on_success` (sister invariant for the success path).
- `tests/end_to_end.rs::empty_author_email_stored_as_null`.
- `tests/end_to_end.rs::open_target_db_removes_orphan_wal`.
- `tests/end_to_end.rs::file_sizes_unknown_id_falls_back_to_unknown_prefix` and `file_sizes_known_id_uses_id_directly`.
- `tests/end_to_end.rs::compression_size_data_and_file_sizes_merge_with_canonical_sf` (cross-path SF canonicalization, end to end).

## Verification

- `cargo build -p vortex-bench-migrate` — clean.
- `cargo test -p vortex-bench-migrate` — 7 unit + 46 classifier + 12 end-to-end tests all pass.
- `cargo test -p vortex-bench-server` — 6 unit + 10 ingest + 6 web_ui tests pass; the schema and `COALESCE` changes are server-safe.
- `cargo clippy -p vortex-bench-migrate --all-targets` — clean.
- `cargo fmt` on changed files (nightly fmt unavailable in this sandbox; ran with stable, which is a no-op for the imports-granularity options the repo's `rustfmt.toml` gates on nightly).
- Skipped `./scripts/public-api.sh`: migrate is a leaf binary outside the public-api lockfile set, and the only newly `pub` item is the internal `canonical_scale_factor` helper.

Signed-off-by: Claude <noreply@anthropic.com>

---

_Generated by [Claude Code](https://claude.ai/code/session_012XyYJRpcGFxmJXdTJuW8Ff)_

---

Signed-off-by: Claude <noreply@anthropic.com>
Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
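The `canonical_scale_factor` body lives in `src/v2.rs` and is not shown in the hunks below, so here is a minimal sketch consistent with the description and the test inputs above. It assumes SF 1 (the implicit default) maps to `None`, as the pre-fix `!= "1.0"` filter suggests; the real helper may differ in details.

```rust
/// Sketch of `canonical_scale_factor` (assumed, not the repo's exact
/// body): parse the v2 scale-factor string as `f64` and re-format it,
/// so `"1"`/`"1.0"` and `"10"`/`"10.0"` collapse to one form.
pub fn canonical_scale_factor(raw: Option<&str>) -> Option<String> {
    let s = raw?.trim();
    if s.is_empty() {
        return None; // "" and whitespace-only carry no variant
    }
    let sf: f64 = s.parse().ok()?;
    if sf == 1.0 {
        return None; // assumption: SF 1 is the default, stored as no variant
    }
    // `Display` for f64 never prints trailing zeros: 10.0 -> "10", 0.1 -> "0.1".
    Some(format!("{sf}"))
}
```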
1 parent e3f1247 commit 9953be3

9 files changed

Lines changed: 463 additions & 107 deletions


benchmarks-website/migrate/src/classifier.rs

Lines changed: 47 additions & 33 deletions
@@ -398,7 +398,7 @@ pub enum V3Bin {
 pub fn classify(record: &V2Record) -> Option<V3Bin> {
     let cls = classify_v2(record)?;
     match &cls.group {
-        V2Group::RandomAccess => bin_random_access(&cls, record),
+        V2Group::RandomAccess => bin_random_access(record),
         V2Group::Compression => bin_compression_time(&cls, record),
         V2Group::CompressionSize => bin_compression_size(&cls, record),
         V2Group::Query { .. } => bin_query(&cls, record),
@@ -537,7 +537,16 @@ pub fn classify_outcome(record: &V2Record) -> Outcome {
         return Outcome::Skip(Skip::DerivedRatio);
     }
     let bin = match &cls.group {
-        V2Group::RandomAccess => bin_random_access(&cls, record),
+        V2Group::RandomAccess => match bin_random_access(record) {
+            Some(b) => Some(b),
+            // Legacy 2-part `random-access/<format>-…` records carry
+            // no dataset and are intentionally dropped by
+            // `bin_random_access`. Route them to Skip so the
+            // `Outcome::Unknown` arm below — and the 5%
+            // uncategorized gate in `migrate::run` — don't trip on
+            // them.
+            None => return Outcome::Skip(Skip::UnsupportedShape),
+        },
         V2Group::Compression => bin_compression_time(&cls, record),
         V2Group::CompressionSize => bin_compression_size(&cls, record),
         V2Group::Query { .. } => bin_query(&cls, record),
@@ -556,34 +565,34 @@ pub fn classify_outcome(record: &V2Record) -> Outcome {
     Outcome::Bin(bin)
 }
 
-fn bin_random_access(cls: &V2Classification, record: &V2Record) -> Option<V3Bin> {
-    // v2 chart name shape: "RANDOM ACCESS" or "DATASET/PATTERN" (uppercase).
-    // We store it as the v3 dataset value verbatim, lowercased so
-    // `/api/groups` returns canonical lowercase names.
-    let dataset = cls.chart.to_lowercase();
-    if dataset.is_empty() {
-        return None;
-    }
-    // Pull format from the raw, pre-rename v2 name so v3 stores the
-    // canonical `Format::name()` string (matching what the v3 live
-    // emitter writes). Raw shape is
+fn bin_random_access(record: &V2Record) -> Option<V3Bin> {
+    // Pull dataset and format from the raw, pre-rename v2 name so v3
+    // stores meaningful values. Raw shape is
     // `random-access/<dataset>/<pattern>/<format>-tokio-local-disk`
-    // (4-part) or `random-access/<format>-tokio-local-disk` (2-part
-    // legacy). After stripping the `-tokio-local-disk` suffix, map the
-    // v2 random-access ext label (`vortex`, from `Format::ext()`) to
-    // the canonical name (`vortex-file-compressed`, from
-    // `Format::name()`). `parquet` and `lance` match between ext and
-    // name. The `vortex` ext is shared by both `OnDiskVortex` (name
+    // (4-part). 2-part legacy records (`random-access/<format>-…`)
+    // carry no dataset and historically rendered as the placeholder
+    // string "RANDOM ACCESS"; drop them rather than emit a fake
+    // dataset. Deriving from the raw name (rather than `cls.chart`)
+    // also keeps this independent of v2's `normalizeChartName`.
+    //
+    // After stripping the `-tokio-local-disk` suffix, map the v2
+    // random-access ext label (`vortex`, from `Format::ext()`) to the
+    // canonical name (`vortex-file-compressed`, from `Format::name()`).
+    // `parquet` and `lance` match between ext and name. The `vortex`
+    // ext is shared by both `OnDiskVortex` (name
     // `vortex-file-compressed`) and `VortexCompact` (name
     // `vortex-compact`), but v2's random-access bench only emitted
     // `OnDiskVortex`, so mapping to `vortex-file-compressed` is
     // correct for all historical data.
     let parts: Vec<&str> = record.name.split('/').collect();
-    let raw = match parts.len() {
-        4 => parts[3],
-        2 => parts[1],
-        _ => return None,
-    };
+    if parts.len() != 4 {
+        return None;
+    }
+    if parts[1].is_empty() || parts[2].is_empty() {
+        return None;
+    }
+    let dataset = format!("{}/{}", parts[1], parts[2]).to_lowercase();
+    let raw = parts[3];
     if raw.is_empty() || raw == "default" {
         return None;
     }
@@ -668,15 +677,20 @@ fn bin_compression_size(cls: &V2Classification, record: &V2Record) -> Option<V3Bin> {
     }
     // Mirror the file-sizes ingest path's dataset_variant derivation
     // (see `migrate::migrate_file_sizes`): pull the SF out of the v2
-    // record's `dataset` object when present, drop empty / "1.0".
-    // Without this both code paths produce the same `mid` only by
-    // accident, so SF=10 file-sizes rows wouldn't merge with the
-    // matching data.json.gz "vortex size/tpch" rows.
-    let dataset_variant = record
-        .dataset
-        .as_ref()
-        .and_then(|d| crate::v2::dataset_scale_factor(d, dataset.as_str()))
-        .filter(|s| !s.is_empty() && s.as_str() != "1.0");
+    // record's `dataset` object when present and run it through
+    // `canonical_scale_factor` so `"1"`, `"1.0"`, `"10"` and `"10.0"`
+    // collapse to one canonical form. Without this both code paths
+    // produce the same `mid` only by accident, so SF=10 file-sizes
+    // rows wouldn't merge with the matching data.json.gz
+    // "vortex size/tpch" rows when one side wrote `"10"` and the
+    // other wrote `"10.0"`.
+    let dataset_variant = crate::v2::canonical_scale_factor(
+        record
+            .dataset
+            .as_ref()
+            .and_then(|d| crate::v2::dataset_scale_factor(d, dataset.as_str()))
+            .as_deref(),
+    );
     Some(V3Bin::CompressionSize {
         dataset,
         dataset_variant,
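For concreteness, the new shape handling implies expectations along these lines. This is a test-style sketch only: `record_named` is a hypothetical helper that builds a minimal `V2Record` (one that `classify_v2` routes to the random-access group) from a raw v2 name; it is not a function in the repo.

```rust
#[test]
fn random_access_shape_expectations() {
    // 4-part raw name: `dataset/pattern` is extracted and lowercased,
    // and the `vortex` ext maps to `vortex-file-compressed`.
    let four = record_named("random-access/TAXI/random/vortex-tokio-local-disk");
    assert!(classify(&four).is_some());

    // 2-part legacy name: no dataset to extract, so the record is
    // dropped instead of rendering the old "random access" placeholder.
    let two = record_named("random-access/vortex-tokio-local-disk");
    assert!(classify(&two).is_none());
}
```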

benchmarks-website/migrate/src/commits.rs

Lines changed: 24 additions & 29 deletions
@@ -11,38 +11,20 @@ use duckdb::params;
 
 use crate::v2::V2Commit;
 
-/// Insert a v3 `commits` row for one v2 commit. Missing fields are
-/// filled with the empty string, matching the v3 schema's `NOT NULL`
-/// constraints; the call site logs a warning for each fallback so
-/// the operator can spot bad inputs.
+/// Insert a v3 `commits` row for one v2 commit. `tree_sha` and `url`
+/// remain required and use a warning-bearing empty-string fallback;
+/// the human-input fields (message, author/committer name and email)
+/// are nullable in the v3 schema, so empty / missing values map to
+/// SQL `NULL` instead of an empty string the UI would render as a
+/// blank cell.
 pub fn upsert_commit(tx: &Transaction<'_>, commit: &V2Commit) -> Result<UpsertOutcome> {
     let mut warnings = Vec::new();
     let timestamp = require_field(&commit.timestamp, "timestamp", &commit.id, &mut warnings);
-    let message = require_field(&commit.message, "message", &commit.id, &mut warnings);
-    let author_name = require_field(
-        &commit.author.as_ref().and_then(|p| p.name.clone()),
-        "author.name",
-        &commit.id,
-        &mut warnings,
-    );
-    let author_email = require_field(
-        &commit.author.as_ref().and_then(|p| p.email.clone()),
-        "author.email",
-        &commit.id,
-        &mut warnings,
-    );
-    let committer_name = require_field(
-        &commit.committer.as_ref().and_then(|p| p.name.clone()),
-        "committer.name",
-        &commit.id,
-        &mut warnings,
-    );
-    let committer_email = require_field(
-        &commit.committer.as_ref().and_then(|p| p.email.clone()),
-        "committer.email",
-        &commit.id,
-        &mut warnings,
-    );
+    let message = optional_field(&commit.message);
+    let author_name = optional_field(&commit.author.as_ref().and_then(|p| p.name.clone()));
+    let author_email = optional_field(&commit.author.as_ref().and_then(|p| p.email.clone()));
+    let committer_name = optional_field(&commit.committer.as_ref().and_then(|p| p.name.clone()));
+    let committer_email = optional_field(&commit.committer.as_ref().and_then(|p| p.email.clone()));
     let tree_sha = require_field(&commit.tree_id, "tree_id", &commit.id, &mut warnings);
     let url = require_field(&commit.url, "url", &commit.id, &mut warnings);
 
@@ -93,6 +75,19 @@ fn require_field(
     }
 }
 
+/// Coerce a v2-supplied `Option<String>` into a SQL-bindable
+/// `Option<String>`, treating an empty / whitespace-only value as
+/// missing. v2 sometimes wrote `""` for blank author / committer /
+/// message fields; storing those as actual `NULL` lets the UI
+/// distinguish "missing metadata" from "deliberately blank".
+fn optional_field(field: &Option<String>) -> Option<String> {
+    field
+        .as_deref()
+        .map(str::trim)
+        .filter(|s| !s.is_empty())
+        .map(str::to_string)
+}
+
 /// Per-call warning bag returned to the caller for logging.
 #[derive(Debug, Default)]
 pub struct UpsertOutcome {
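The server-side half of this fix is not in the hunk above. The sketch below shows why nullable columns stay compatible with the existing `String` decoder: `Option<String>` binds as SQL `NULL`, and `COALESCE` folds it back to `''` on read. Table and column names here are illustrative only, not the repo's schema.

```rust
use duckdb::{Connection, Result, params};

fn main() -> Result<()> {
    let conn = Connection::open_in_memory()?;
    conn.execute_batch("CREATE TABLE commits (sha TEXT, message TEXT)")?;

    // `Option<String>` binds as SQL NULL when None: a blank v2 commit
    // message is stored as NULL, not as ''.
    let message: Option<String> = None;
    conn.execute(
        "INSERT INTO commits VALUES (?, ?)",
        params!["deadbeef", message],
    )?;

    // The server keeps decoding into `String` by coalescing NULL to '',
    // so existing readers need no Option plumbing.
    let msg: String = conn.query_row(
        "SELECT COALESCE(message, '') FROM commits WHERE sha = ?",
        params!["deadbeef"],
        |row| row.get(0),
    )?;
    assert_eq!(msg, "");
    Ok(())
}
```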

benchmarks-website/migrate/src/migrate.rs

Lines changed: 126 additions & 27 deletions
@@ -49,10 +49,12 @@ use vortex_utils::aliases::hash_map::HashMap;
 use crate::classifier;
 use crate::classifier::V3Bin;
 use crate::commits::upsert_commit;
+use crate::source::KNOWN_FILE_SIZES_SUITES;
 use crate::source::Source;
 use crate::v2::V2Commit;
 use crate::v2::V2FileSize;
 use crate::v2::V2Record;
+use crate::v2::canonical_scale_factor;
 use crate::v2::index_commits;
 use crate::v2::runtime_as_i64;
 use crate::v2::value_as_f64;
@@ -171,27 +173,46 @@ pub fn run(source: &Source, target: &Path) -> Result<MigrationSummary> {
     }
 
     info!("Flushing accumulators to DuckDB");
-    summary.query_inserted = q.measurement_id.len() as u64;
-    summary.compression_time_inserted = ct.measurement_id.len() as u64;
-    summary.random_access_inserted = ra.measurement_id.len() as u64;
-    summary.compression_size_inserted = cs.rows.len() as u64;
-
-    flush(&conn, "query_measurements", build_query_batch(q)?)?;
-    flush(
-        &conn,
-        "compression_times",
-        build_compression_time_batch(ct)?,
-    )?;
-    flush(&conn, "random_access_times", build_random_access_batch(ra)?)?;
-    flush(
-        &conn,
-        "compression_sizes",
-        build_compression_size_batch(cs)?,
-    )?;
+    flush_all(&conn, q, ct, ra, cs, &mut summary)?;
 
     Ok(summary)
 }
 
+/// Flush each accumulator's batch and bump the matching per-fact
+/// summary counter only AFTER the flush succeeds. This way a flush
+/// failure leaves the counter at zero (or its previous value) rather
+/// than reporting rows that never landed in DuckDB.
+fn flush_all(
+    conn: &Connection,
+    q: QueryAccum,
+    ct: CompressionTimeAccum,
+    ra: RandomAccessAccum,
+    cs: CompressionSizeAccum,
+    summary: &mut MigrationSummary,
+) -> Result<()> {
+    let batch = build_query_batch(q)?;
+    let n = batch.num_rows() as u64;
+    flush(conn, "query_measurements", batch)?;
+    summary.query_inserted = n;
+
+    let batch = build_compression_time_batch(ct)?;
+    let n = batch.num_rows() as u64;
+    flush(conn, "compression_times", batch)?;
+    summary.compression_time_inserted = n;
+
+    let batch = build_random_access_batch(ra)?;
+    let n = batch.num_rows() as u64;
+    flush(conn, "random_access_times", batch)?;
+    summary.random_access_inserted = n;
+
+    let batch = build_compression_size_batch(cs)?;
+    let n = batch.num_rows() as u64;
+    flush(conn, "compression_sizes", batch)?;
+    summary.compression_size_inserted = n;
+
+    Ok(())
+}
+
 fn read_commits(source: &Source) -> Result<BTreeMap<String, V2Commit>> {
     let reader = source.open_commits_jsonl()?;
     let mut commits: Vec<V2Commit> = Vec::new();
@@ -409,11 +430,19 @@ fn migrate_file_sizes(
     cs: &mut CompressionSizeAccum,
 ) -> Result<()> {
     let reader = source.open_file_sizes(name)?;
-    let dataset_fallback = name
-        .strip_prefix("file-sizes-")
-        .and_then(|s| s.strip_suffix(".json.gz"))
-        .unwrap_or(name)
-        .to_string();
+    // Prefix unknown-id fallbacks with `unknown:` so they're clearly
+    // labeled in the UI rather than masquerading as a dataset name.
+    let dataset_fallback = {
+        let stripped = name
+            .strip_prefix("file-sizes-")
+            .and_then(|s| s.strip_suffix(".json.gz"))
+            .unwrap_or(name);
+        if KNOWN_FILE_SIZES_SUITES.contains(&stripped) {
            stripped.to_string()
+        } else {
+            format!("unknown:{stripped}")
+        }
+    };
     let started = Instant::now();
     let mut last_log = Instant::now();
     for line in reader.lines() {
@@ -438,11 +467,10 @@ fn migrate_file_sizes(
         } else {
             sz.benchmark.clone()
         };
-        let dataset_variant = sz
-            .scale_factor
-            .as_ref()
-            .filter(|s| !s.is_empty() && s.as_str() != "1.0")
-            .cloned();
+        // Run SF through canonical_scale_factor so `"1"`, `"1.0"`, `"10"`
+        // and `"10.0"` collapse to one form, matching what
+        // `bin_compression_size` writes for the data.json.gz path.
+        let dataset_variant = canonical_scale_factor(sz.scale_factor.as_deref());
         let csr = CompressionSize {
             commit_sha: sz.commit_id.clone(),
             dataset,
@@ -834,3 +862,74 @@ impl std::fmt::Display for MigrationSummary {
         Ok(())
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use vortex_bench_server::records::QueryMeasurement;
+
+    use super::*;
+
+    fn open_db_without(table: &str) -> (tempfile::TempDir, Connection) {
+        let dir = tempfile::TempDir::new().unwrap();
+        let path = dir.path().join("v3.duckdb");
+        let conn = open_target_db(&path).unwrap();
+        conn.execute_batch(&format!("DROP TABLE {table}")).unwrap();
+        (dir, conn)
+    }
+
+    fn one_query_row() -> QueryMeasurement {
+        QueryMeasurement {
+            commit_sha: "deadbeef".into(),
+            dataset: "clickbench".into(),
+            dataset_variant: None,
+            scale_factor: None,
+            query_idx: 7,
+            storage: "nvme".into(),
+            engine: "datafusion".into(),
+            format: "parquet".into(),
+            value_ns: 100,
+            all_runtimes_ns: vec![100],
+            peak_physical: None,
+            peak_virtual: None,
+            physical_delta: None,
+            virtual_delta: None,
+            env_triple: None,
+        }
+    }
+
+    #[test]
+    fn flush_all_does_not_overcount_on_failure() {
+        // Drop `compression_times` before flushing so the second
+        // flush in `flush_all` fails. The first (queries) succeeded,
+        // so its counter must be set; the failed table's counter and
+        // every later table's counter must stay at zero.
+        let (_dir, conn) = open_db_without("compression_times");
+
+        let mut summary = MigrationSummary::default();
+        let mut q = QueryAccum::default();
+        let qm = one_query_row();
+        let mid = vortex_bench_server::db::measurement_id_query(&qm);
+        q.push(mid, qm, &mut summary);
+
+        let ct = CompressionTimeAccum::default();
+        let ra = RandomAccessAccum::default();
+        let cs = CompressionSizeAccum::default();
+
+        let result = flush_all(&conn, q, ct, ra, cs, &mut summary);
+        assert!(result.is_err(), "expected flush to fail on missing table");

+        assert_eq!(
+            summary.query_inserted, 1,
+            "query flushed before the failure must be counted"
+        );
+        assert_eq!(
+            summary.compression_time_inserted, 0,
+            "failed flush must not bump the counter"
+        );
+        assert_eq!(summary.random_access_inserted, 0, "later flushes never ran");
+        assert_eq!(
+            summary.compression_size_inserted, 0,
+            "later flushes never ran"
+        );
+    }
+}
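The orphan-WAL regression test itself lives in `tests/end_to_end.rs` and is not shown in this commit view. Below is a plausible sketch of its shape, assuming direct access to `open_target_db` plus `std` and `tempfile` (the real test may instead drive this through the crate's public entry point):

```rust
#[test]
fn open_target_db_removes_orphan_wal() {
    let dir = tempfile::TempDir::new().unwrap();
    let db = dir.path().join("v3.duckdb");
    let wal = dir.path().join("v3.duckdb.wal");

    // Stage an orphan WAL: bytes on disk, but no main database file.
    std::fs::write(&wal, b"stale wal bytes").unwrap();
    assert!(!db.exists());

    let _conn = open_target_db(&db).unwrap();

    // The stale bytes must not survive the open; either the file is
    // gone or DuckDB recreated it fresh.
    if wal.exists() {
        assert_ne!(std::fs::read(&wal).unwrap(), b"stale wal bytes");
    }
}
```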

benchmarks-website/migrate/src/source.rs

Lines changed: 1 addition & 1 deletion
@@ -126,7 +126,7 @@ fn open_s3(name: &str) -> Result<Box<dyn Read + Send>> {
 /// The post-bench `file-sizes` step uploads `file-sizes-${{ matrix.id
 /// }}.json.gz`, so this list must match those IDs verbatim. Adding a
 /// new matrix entry to that workflow means adding the same ID here.
-const KNOWN_FILE_SIZES_SUITES: &[&str] = &[
+pub(crate) const KNOWN_FILE_SIZES_SUITES: &[&str] = &[
     "clickbench-nvme",
     "tpch-nvme",
     "tpch-s3",
