Skip to content

Commit 266f7c6

Browse files
connortsui20 and claude committed
Benchmarks v3 migration to duckdb (#7646)
This is a one-shot migration binary that takes all of the data from `data.json.gz` and loads it into a DuckDB database. It simply gathers and aggregates everything in memory, then writes the data in chunks as Arrow arrays. Inserting row by row took far too long, and the appender API in DuckDB does not support `BIGINT[]` for some reason... --------- Signed-off-by: Claude <noreply@anthropic.com> Signed-off-by: Connor Tsui <connor.tsui20@gmail.com> Co-authored-by: Claude <noreply@anthropic.com> Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent 8044a05 commit 266f7c6

13 files changed

Lines changed: 3325 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 60 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ members = [
6262
"benchmarks/vector-search-bench",
6363
# Benchmarks website v3 (alpha) - leaf binary, not part of vortex-* API
6464
"benchmarks-website/server",
65+
"benchmarks-website/migrate",
6566
]
6667
exclude = ["java/testfiles", "wasm-test"]
6768
resolver = "2"
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
[package]
5+
name = "vortex-bench-migrate"
6+
version = "0.1.0-alpha.0"
7+
edition = "2024"
8+
rust-version = "1.91.0"
9+
license = "Apache-2.0"
10+
description = "One-shot historical migrator from the v2 benchmarks S3 dataset to a v3 DuckDB file"
11+
publish = false
12+
13+
[[bin]]
14+
name = "vortex-bench-migrate"
15+
path = "src/main.rs"
16+
17+
# Throwaway binary, not part of the vortex-* public API surface.
18+
# Errors use anyhow, and the crate is intentionally outside the
19+
# workspace public-api lockfile set.
20+
21+
[dependencies]
22+
anyhow = { workspace = true }
23+
arrow-array = { workspace = true }
24+
arrow-buffer = { workspace = true }
25+
arrow-schema = { workspace = true }
26+
clap = { workspace = true, features = ["derive"] }
27+
# Keep this version in sync with the DuckDB engine version bundled by vortex-duckdb (build.rs).
28+
duckdb = { version = "1.10502", features = ["bundled", "appender-arrow"] }
29+
flate2 = "1.1"
30+
reqwest = { workspace = true, features = ["json"] }
31+
serde = { workspace = true, features = ["derive"] }
32+
serde_json = { workspace = true }
33+
tokio = { workspace = true, features = ["rt-multi-thread", "macros"] }
34+
tracing = { workspace = true, features = ["std"] }
35+
tracing-subscriber = { workspace = true, features = ["env-filter", "fmt"] }
36+
vortex-bench-server = { path = "../server" }
37+
vortex-utils = { workspace = true }
38+
39+
[dev-dependencies]
40+
rstest = { workspace = true }
41+
tempfile = { workspace = true }

0 commit comments

Comments
 (0)