Skip to content

Commit 47efbdb

Browse files
committed
use arrow canonical extension name
Signed-off-by: Baris Palaska <barispalaska@gmail.com>
1 parent 8423533 commit 47efbdb

8 files changed

Lines changed: 201 additions & 108 deletions

File tree

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-array/src/dtype/arrow.rs

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,26 @@ use crate::extension::datetime::Timestamp;
5555

5656
const ARROW_EXT_NAME_VARIANT: &str = "arrow.parquet.variant";
5757

58+
/// Pairs of `(vortex_id, arrow_canonical_name)`.
///
/// This table is the single source of truth for the bijection between Vortex-internal
/// extension ids and Arrow canonical extension names: both lookup helpers below scan it, so
/// adding a row here teaches both directions about a new alias. Each id and each name must
/// appear at most once, because the lookups return the first matching row.
///
/// Canonical extensions serialize metadata as raw UTF-8 (typically JSON) rather than
/// base64-wrapped bytes.
const CANONICAL_ALIASES: &[(&str, &str)] =
    &[("vortex.fixed_shape_tensor", "arrow.fixed_shape_tensor")];

/// Returns the Arrow canonical extension name aliased to `vortex_id`, or `None` when the id
/// has no row in [`CANONICAL_ALIASES`].
fn vortex_id_to_arrow_canonical(vortex_id: &str) -> Option<&'static str> {
    CANONICAL_ALIASES
        .iter()
        .find_map(|&(id, name)| (id == vortex_id).then_some(name))
}

/// Returns the Vortex extension id aliased to the Arrow canonical name `arrow_name`, or `None`
/// when the name has no row in [`CANONICAL_ALIASES`].
fn arrow_canonical_to_vortex_id(arrow_name: &str) -> Option<&'static str> {
    CANONICAL_ALIASES
        .iter()
        .find_map(|&(id, name)| (name == arrow_name).then_some(id))
}
77+
5878
/// Trait for converting Arrow types to Vortex types.
5979
pub trait FromArrowType<T>: Sized {
6080
/// Convert the Arrow type to a Vortex type.
@@ -277,7 +297,10 @@ fn dtype_from_field(field: &Field, dtypes: &DTypeSession) -> DType {
277297
return storage_dtype;
278298
};
279299

280-
let ext_id = ExtId::new(ext_name);
300+
let canonical_alias = arrow_canonical_to_vortex_id(ext_name);
301+
let is_canonical = canonical_alias.is_some();
302+
let ext_id = ExtId::new(canonical_alias.unwrap_or(ext_name));
303+
281304
let Some(plugin) = dtypes.registry().find(&ext_id) else {
282305
tracing::warn!(
283306
"Arrow field {:?} extension id {:?} not registered; using storage dtype",
@@ -287,7 +310,7 @@ fn dtype_from_field(field: &Field, dtypes: &DTypeSession) -> DType {
287310
return storage_dtype;
288311
};
289312

290-
let metadata_bytes = match decode_extension_metadata(field) {
313+
let metadata_bytes = match decode_extension_metadata(field, is_canonical) {
291314
Ok(bytes) => bytes,
292315
Err(e) => {
293316
tracing::warn!(
@@ -316,10 +339,15 @@ fn dtype_from_field(field: &Field, dtypes: &DTypeSession) -> DType {
316339
}
317340
}
318341

319-
/// Decodes base64-encoded extension metadata. Missing / empty values yield an empty vector.
320-
fn decode_extension_metadata(field: &Field) -> VortexResult<Vec<u8>> {
342+
/// Decode extension metadata bytes from a Field.
343+
///
344+
/// Canonical Arrow extensions store UTF-8 bytes directly (e.g. JSON). Non-canonical extensions
345+
/// store base64-encoded bytes so that arbitrary binary plugin output survives a String-typed
346+
/// metadata channel.
347+
fn decode_extension_metadata(field: &Field, is_canonical: bool) -> VortexResult<Vec<u8>> {
321348
match field.extension_type_metadata() {
322349
None | Some("") => Ok(Vec::new()),
350+
Some(s) if is_canonical => Ok(s.as_bytes().to_vec()),
323351
Some(s) => BASE64_STANDARD
324352
.decode(s)
325353
.map_err(|e| vortex_err!("failed to base64-decode {EXTENSION_TYPE_METADATA_KEY}: {e}")),
@@ -475,16 +503,25 @@ fn field_from_dtype(name: &str, dtype: &DType) -> VortexResult<Field> {
475503
}
476504

477505
let storage_arrow = ext.storage_dtype().to_arrow_dtype()?;
478-
let mut metadata = vec![(
479-
EXTENSION_TYPE_NAME_KEY.to_owned(),
480-
ext.id().as_str().to_owned(),
481-
)];
482506
let ext_meta_bytes = ext.serialize_metadata()?;
483-
if !ext_meta_bytes.is_empty() {
484-
metadata.push((
485-
EXTENSION_TYPE_METADATA_KEY.to_owned(),
507+
let (ext_name, meta_str) = match vortex_id_to_arrow_canonical(ext.id().as_str()) {
508+
Some(canonical) => {
509+
// Canonical Arrow extensions specify a UTF-8 metadata format (typically JSON),
510+
// read as-is by arrow-rs / pyarrow. The plugin owns producing those bytes.
511+
let s = String::from_utf8(ext_meta_bytes).map_err(|e| {
512+
vortex_err!("canonical extension {canonical} metadata must be valid UTF-8: {e}")
513+
})?;
514+
(canonical.to_owned(), s)
515+
}
516+
None => (
517+
ext.id().as_str().to_owned(),
486518
BASE64_STANDARD.encode(&ext_meta_bytes),
487-
));
519+
),
520+
};
521+
522+
let mut metadata = vec![(EXTENSION_TYPE_NAME_KEY.to_owned(), ext_name)];
523+
if !meta_str.is_empty() {
524+
metadata.push((EXTENSION_TYPE_METADATA_KEY.to_owned(), meta_str));
488525
}
489526
return Ok(Field::new(name, storage_arrow, dtype.is_nullable())
490527
.with_metadata(metadata.into_iter().collect()));
@@ -689,6 +726,14 @@ mod test {
689726
schema_null.to_arrow_schema().unwrap();
690727
}
691728

729+
/// Every alias row must resolve to its partner in both directions. Because the lookups return
/// the first matching row, a repeated id or name paired with a different partner makes one of
/// these assertions fail.
#[test]
fn canonical_aliases_bijection() {
    for &(vortex_id, arrow_name) in CANONICAL_ALIASES {
        assert_eq!(vortex_id_to_arrow_canonical(vortex_id), Some(arrow_name));
        assert_eq!(arrow_canonical_to_vortex_id(arrow_name), Some(vortex_id));
    }
}
736+
692737
#[test]
693738
fn test_unicode_field_names_roundtrip() {
694739
// Regression test for https://github.com/vortex-data/vortex/issues/5979.

vortex-tensor/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ half = { workspace = true }
2929
itertools = { workspace = true }
3030
num-traits = { workspace = true }
3131
prost = { workspace = true }
32+
serde = { workspace = true, features = ["derive"] }
33+
serde_json = { workspace = true }
3234

3335
[dev-dependencies]
3436
arrow-schema = { workspace = true }
@@ -37,4 +39,5 @@ mimalloc = { workspace = true }
3739
rand = { workspace = true }
3840
rand_distr = { workspace = true }
3941
rstest = { workspace = true }
42+
serde_json = { workspace = true }
4043
vortex-btrblocks = { path = "../vortex-btrblocks" }

vortex-tensor/src/tests/arrow_roundtrip.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use crate::types::fixed_shape::FixedShapeTensorMetadata;
2323
use crate::types::vector::Vector;
2424

2525
const VECTOR_EXT_NAME: &str = "vortex.tensor.vector";
26-
const FIXED_SHAPE_EXT_NAME: &str = "vortex.fixed_shape_tensor";
26+
const FIXED_SHAPE_EXT_NAME: &str = "arrow.fixed_shape_tensor";
2727

2828
fn vector_dtype(len: u32) -> DType {
2929
let storage = DType::FixedSizeList(
@@ -124,7 +124,14 @@ fn fixed_shape_tensor_metadata_roundtrip() {
124124
.map(String::as_str),
125125
Some(FIXED_SHAPE_EXT_NAME),
126126
);
127-
assert!(field.metadata().get(EXTENSION_TYPE_METADATA_KEY).is_some());
127+
128+
// Canonical extensions put raw JSON on the wire — pyarrow / arrow-rs read it directly
129+
// without base64. Parse it back to confirm the on-wire format.
130+
let meta_str = field.metadata().get(EXTENSION_TYPE_METADATA_KEY).unwrap();
131+
let parsed: serde_json::Value = serde_json::from_str(meta_str).unwrap();
132+
assert_eq!(parsed["shape"], serde_json::json!([2, 3, 4]));
133+
assert_eq!(parsed["dim_names"], serde_json::json!(["x", "y", "z"]));
134+
assert_eq!(parsed["permutation"], serde_json::json!([2, 0, 1]));
128135

129136
let recovered = DType::from_arrow_with_session(&schema, &SESSION);
130137
assert_eq!(recovered, original);
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
//! Arrow canonical [`arrow.fixed_shape_tensor`] metadata serialization.
5+
//!
6+
//! The wire format is a UTF-8 JSON object placed in `ARROW:extension:metadata`, matching the
7+
//! Arrow specification and pyarrow / arrow-rs interop expectations.
8+
//!
9+
//! We roll our own serde rather than delegating to `arrow_schema::extension::FixedShapeTensor`
10+
//! because arrow-rs 58 serializes the field as `"permutations"` (plural) while the Arrow
11+
//! specification and pyarrow use `"permutation"` (singular). pyarrow silently ignores the
12+
//! misspelled key.
13+
//!
14+
//! [`arrow.fixed_shape_tensor`]: https://arrow.apache.org/docs/format/CanonicalExtensions.html#fixed-shape-tensor
15+
16+
use serde::Deserialize;
17+
use serde::Serialize;
18+
use vortex_error::VortexResult;
19+
use vortex_error::vortex_err;
20+
21+
use crate::types::fixed_shape::FixedShapeTensorMetadata;
22+
23+
#[derive(Serialize)]
24+
struct WireRef<'a> {
25+
shape: &'a [usize],
26+
#[serde(skip_serializing_if = "Option::is_none")]
27+
dim_names: Option<&'a [String]>,
28+
#[serde(skip_serializing_if = "Option::is_none")]
29+
permutation: Option<&'a [usize]>,
30+
}
31+
32+
#[derive(Deserialize)]
33+
struct Wire {
34+
shape: Vec<usize>,
35+
#[serde(default)]
36+
dim_names: Option<Vec<String>>,
37+
#[serde(default)]
38+
permutation: Option<Vec<usize>>,
39+
}
40+
41+
/// Serialize [`FixedShapeTensorMetadata`] to the Arrow canonical JSON representation.
42+
pub(crate) fn serialize(metadata: &FixedShapeTensorMetadata) -> VortexResult<Vec<u8>> {
43+
let wire = WireRef {
44+
shape: metadata.logical_shape(),
45+
dim_names: metadata.dim_names(),
46+
permutation: metadata.permutation(),
47+
};
48+
serde_json::to_vec(&wire)
49+
.map_err(|e| vortex_err!("fixed_shape_tensor canonical serialize: {e}"))
50+
}
51+
52+
/// Deserialize [`FixedShapeTensorMetadata`] from Arrow canonical JSON bytes.
53+
pub(crate) fn deserialize(bytes: &[u8]) -> VortexResult<FixedShapeTensorMetadata> {
54+
let wire: Wire = serde_json::from_slice(bytes)
55+
.map_err(|e| vortex_err!("fixed_shape_tensor canonical deserialize: {e}"))?;
56+
57+
let mut m = FixedShapeTensorMetadata::new(wire.shape);
58+
if let Some(names) = wire.dim_names {
59+
m = m.with_dim_names(names)?;
60+
}
61+
if let Some(perm) = wire.permutation {
62+
m = m.with_permutation(perm)?;
63+
}
64+
Ok(m)
65+
}
66+
67+
#[cfg(test)]
mod tests {
    use rstest::rstest;

    use super::*;

    /// Serializing and then deserializing must give back an equal metadata value for every
    /// combination of optional fields.
    #[rstest]
    #[case::scalar_0d(FixedShapeTensorMetadata::new(vec![]))]
    #[case::vector_1d(FixedShapeTensorMetadata::new(vec![5]))]
    #[case::shape_only(FixedShapeTensorMetadata::new(vec![2, 3, 4]))]
    #[case::with_dim_names(
        FixedShapeTensorMetadata::new(vec![3, 4])
            .with_dim_names(vec!["rows".into(), "cols".into()])
            .unwrap()
    )]
    #[case::with_permutation(
        FixedShapeTensorMetadata::new(vec![2, 3, 4])
            .with_permutation(vec![2, 0, 1])
            .unwrap()
    )]
    #[case::all_fields(
        FixedShapeTensorMetadata::new(vec![2, 3, 4])
            .with_dim_names(vec!["x".into(), "y".into(), "z".into()]).unwrap()
            .with_permutation(vec![1, 2, 0]).unwrap()
    )]
    fn roundtrip(#[case] metadata: FixedShapeTensorMetadata) -> VortexResult<()> {
        let bytes = serialize(&metadata)?;
        let decoded = deserialize(&bytes)?;
        assert_eq!(decoded, metadata);
        Ok(())
    }

    /// The emitted JSON must use the keys named by the Arrow specification.
    #[test]
    fn wire_format_matches_arrow_spec() -> VortexResult<()> {
        let metadata = FixedShapeTensorMetadata::new(vec![2, 3, 4])
            .with_dim_names(vec!["x".into(), "y".into(), "z".into()])?
            .with_permutation(vec![1, 2, 0])?;

        let bytes = serialize(&metadata)?;
        let v: serde_json::Value =
            serde_json::from_slice(&bytes).map_err(|e| vortex_err!("parse wire: {e}"))?;

        assert_eq!(v["shape"], serde_json::json!([2, 3, 4]));
        assert_eq!(v["dim_names"], serde_json::json!(["x", "y", "z"]));
        // Arrow spec uses singular "permutation"; guard against regressions to arrow-rs's plural.
        assert_eq!(v["permutation"], serde_json::json!([1, 2, 0]));
        assert!(v.get("permutations").is_none());
        Ok(())
    }

    /// Optional fields that are unset must not appear as keys in the JSON object.
    #[test]
    fn omits_optional_fields_when_unset() -> VortexResult<()> {
        let bytes = serialize(&FixedShapeTensorMetadata::new(vec![5]))?;
        let v: serde_json::Value =
            serde_json::from_slice(&bytes).map_err(|e| vortex_err!("parse wire: {e}"))?;
        assert!(v.get("dim_names").is_none());
        assert!(v.get("permutation").is_none());
        Ok(())
    }
}

vortex-tensor/src/types/fixed_shape/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,5 @@ pub use matcher::FixedShapeTensorMatcherMetadata;
1414
mod metadata;
1515
pub use metadata::FixedShapeTensorMetadata;
1616

17-
mod proto;
17+
mod canonical;
1818
mod vtable;

vortex-tensor/src/types/fixed_shape/proto.rs

Lines changed: 0 additions & 90 deletions
This file was deleted.

0 commit comments

Comments
 (0)