|
| 1 | +// SPDX-License-Identifier: Apache-2.0 |
| 2 | +// SPDX-FileCopyrightText: Copyright the Vortex contributors |
| 3 | + |
| 4 | +//! Arrow canonical [`arrow.fixed_shape_tensor`] metadata serialization. |
| 5 | +//! |
| 6 | +//! The wire format is a UTF-8 JSON object placed in `ARROW:extension:metadata`, matching the |
| 7 | +//! Arrow specification and pyarrow / arrow-rs interop expectations. |
| 8 | +//! |
| 9 | +//! We roll our own serde rather than delegating to `arrow_schema::extension::FixedShapeTensor` |
| 10 | +//! because arrow-rs 58 serializes the field as `"permutations"` (plural) while the Arrow |
| 11 | +//! specification and pyarrow use `"permutation"` (singular). pyarrow silently ignores the |
| 12 | +//! misspelled key. |
| 13 | +//! |
| 14 | +//! [`arrow.fixed_shape_tensor`]: https://arrow.apache.org/docs/format/CanonicalExtensions.html#fixed-shape-tensor |
| 15 | +
|
| 16 | +use serde::Deserialize; |
| 17 | +use serde::Serialize; |
| 18 | +use vortex_error::VortexResult; |
| 19 | +use vortex_error::vortex_err; |
| 20 | + |
| 21 | +use crate::types::fixed_shape::FixedShapeTensorMetadata; |
| 22 | + |
| 23 | +#[derive(Serialize)] |
| 24 | +struct WireRef<'a> { |
| 25 | + shape: &'a [usize], |
| 26 | + #[serde(skip_serializing_if = "Option::is_none")] |
| 27 | + dim_names: Option<&'a [String]>, |
| 28 | + #[serde(skip_serializing_if = "Option::is_none")] |
| 29 | + permutation: Option<&'a [usize]>, |
| 30 | +} |
| 31 | + |
| 32 | +#[derive(Deserialize)] |
| 33 | +struct Wire { |
| 34 | + shape: Vec<usize>, |
| 35 | + #[serde(default)] |
| 36 | + dim_names: Option<Vec<String>>, |
| 37 | + #[serde(default)] |
| 38 | + permutation: Option<Vec<usize>>, |
| 39 | +} |
| 40 | + |
| 41 | +/// Serialize [`FixedShapeTensorMetadata`] to the Arrow canonical JSON representation. |
| 42 | +pub(crate) fn serialize(metadata: &FixedShapeTensorMetadata) -> VortexResult<Vec<u8>> { |
| 43 | + let wire = WireRef { |
| 44 | + shape: metadata.logical_shape(), |
| 45 | + dim_names: metadata.dim_names(), |
| 46 | + permutation: metadata.permutation(), |
| 47 | + }; |
| 48 | + serde_json::to_vec(&wire) |
| 49 | + .map_err(|e| vortex_err!("fixed_shape_tensor canonical serialize: {e}")) |
| 50 | +} |
| 51 | + |
| 52 | +/// Deserialize [`FixedShapeTensorMetadata`] from Arrow canonical JSON bytes. |
| 53 | +pub(crate) fn deserialize(bytes: &[u8]) -> VortexResult<FixedShapeTensorMetadata> { |
| 54 | + let wire: Wire = serde_json::from_slice(bytes) |
| 55 | + .map_err(|e| vortex_err!("fixed_shape_tensor canonical deserialize: {e}"))?; |
| 56 | + |
| 57 | + let mut m = FixedShapeTensorMetadata::new(wire.shape); |
| 58 | + if let Some(names) = wire.dim_names { |
| 59 | + m = m.with_dim_names(names)?; |
| 60 | + } |
| 61 | + if let Some(perm) = wire.permutation { |
| 62 | + m = m.with_permutation(perm)?; |
| 63 | + } |
| 64 | + Ok(m) |
| 65 | +} |
| 66 | + |
#[cfg(test)]
mod tests {
    use rstest::rstest;

    use super::*;

    /// Parse serialized bytes into a generic JSON value so tests can inspect individual keys.
    fn parse_wire(bytes: &[u8]) -> VortexResult<serde_json::Value> {
        serde_json::from_slice(bytes).map_err(|e| vortex_err!("parse wire: {e}"))
    }

    /// Serializing and then deserializing must yield metadata equal to the input.
    #[rstest]
    #[case::scalar_0d(FixedShapeTensorMetadata::new(vec![]))]
    #[case::vector_1d(FixedShapeTensorMetadata::new(vec![5]))]
    #[case::shape_only(FixedShapeTensorMetadata::new(vec![2, 3, 4]))]
    #[case::with_dim_names(
        FixedShapeTensorMetadata::new(vec![3, 4])
            .with_dim_names(vec!["rows".into(), "cols".into()])
            .unwrap()
    )]
    #[case::with_permutation(
        FixedShapeTensorMetadata::new(vec![2, 3, 4])
            .with_permutation(vec![2, 0, 1])
            .unwrap()
    )]
    #[case::all_fields(
        FixedShapeTensorMetadata::new(vec![2, 3, 4])
            .with_dim_names(vec!["x".into(), "y".into(), "z".into()]).unwrap()
            .with_permutation(vec![1, 2, 0]).unwrap()
    )]
    fn roundtrip(#[case] metadata: FixedShapeTensorMetadata) -> VortexResult<()> {
        let encoded = serialize(&metadata)?;
        let restored = deserialize(&encoded)?;
        assert_eq!(restored, metadata);
        Ok(())
    }

    /// The emitted JSON must use the key names from the Arrow specification.
    #[test]
    fn wire_format_matches_arrow_spec() -> VortexResult<()> {
        let metadata = FixedShapeTensorMetadata::new(vec![2, 3, 4])
            .with_dim_names(vec!["x".into(), "y".into(), "z".into()])?
            .with_permutation(vec![1, 2, 0])?;

        let encoded = serialize(&metadata)?;
        let json = parse_wire(&encoded)?;

        assert_eq!(json["shape"], serde_json::json!([2, 3, 4]));
        assert_eq!(json["dim_names"], serde_json::json!(["x", "y", "z"]));
        // Arrow spec uses singular "permutation"; guard against regressions to arrow-rs's plural.
        assert_eq!(json["permutation"], serde_json::json!([1, 2, 0]));
        assert!(json.get("permutations").is_none());
        Ok(())
    }

    /// Unset optional fields must be absent from the JSON object.
    #[test]
    fn omits_optional_fields_when_unset() -> VortexResult<()> {
        let shape_only = FixedShapeTensorMetadata::new(vec![5]);
        let encoded = serialize(&shape_only)?;
        let json = parse_wire(&encoded)?;

        for key in ["dim_names", "permutation"] {
            assert!(json.get(key).is_none());
        }
        Ok(())
    }
}
0 commit comments