Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 30 additions & 2 deletions src/iceberg/json_serde.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include <nlohmann/json.hpp>

#include "iceberg/constants.h"
#include "iceberg/expression/json_serde_internal.h"
#include "iceberg/expression/literal.h"
#include "iceberg/json_serde_internal.h"
#include "iceberg/name_mapping.h"
#include "iceberg/partition_field.h"
Expand Down Expand Up @@ -298,6 +300,15 @@ nlohmann::json ToJson(const SchemaField& field) {
if (!field.doc().empty()) {
json[kDoc] = field.doc();
}
// Defaults are validated to be primitive literals matching the field type, so
// single-value serialization cannot fail here.
if (field.initial_default().has_value()) {
ICEBERG_ASSIGN_OR_THROW(json[kInitialDefault],
ToJson(field.initial_default()->get()));
}
if (field.write_default().has_value()) {
ICEBERG_ASSIGN_OR_THROW(json[kWriteDefault], ToJson(field.write_default()->get()));
}
return json;
}

Expand All @@ -310,7 +321,6 @@ nlohmann::json ToJson(const Type& type) {
nlohmann::json fields_json = nlohmann::json::array();
for (const auto& field : struct_type.fields()) {
fields_json.push_back(ToJson(field));
// TODO(gangwu): add default values
}
json[kFields] = fields_json;
return json;
Expand Down Expand Up @@ -552,9 +562,27 @@ Result<std::unique_ptr<SchemaField>> FieldFromJson(const nlohmann::json& json) {
ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue<std::string>(json, kName));
ICEBERG_ASSIGN_OR_RAISE(auto required, GetJsonValue<bool>(json, kRequired));
ICEBERG_ASSIGN_OR_RAISE(auto doc, GetJsonValueOrDefault<std::string>(json, kDoc));
ICEBERG_ASSIGN_OR_RAISE(std::optional<nlohmann::json> initial_default_json,
GetJsonValueOptional<nlohmann::json>(json, kInitialDefault));
ICEBERG_ASSIGN_OR_RAISE(std::optional<nlohmann::json> write_default_json,
GetJsonValueOptional<nlohmann::json>(json, kWriteDefault));

std::shared_ptr<const Literal> initial_default;
if (initial_default_json.has_value()) {
ICEBERG_ASSIGN_OR_RAISE(Literal literal,
LiteralFromJson(*initial_default_json, type.get()));
initial_default = std::make_shared<const Literal>(std::move(literal));
}
std::shared_ptr<const Literal> write_default;
if (write_default_json.has_value()) {
ICEBERG_ASSIGN_OR_RAISE(Literal literal,
LiteralFromJson(*write_default_json, type.get()));
write_default = std::make_shared<const Literal>(std::move(literal));
}
Comment on lines +571 to +581

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The deserialization first constructs a bare SchemaField, then conditionally calls WithInitialDefault/WithWriteDefault, each of which copies the entire field (including the shared_ptr<Type>). This is an unnecessary intermediate copy.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed — FieldFromJson now parses the defaults first and builds the field in one construction. Intermediate copy gone.


return std::make_unique<SchemaField>(field_id, std::move(name), std::move(type),
!required, doc);
!required, doc, std::move(initial_default),
std::move(write_default));
}

Result<std::unique_ptr<Schema>> SchemaFromJson(const nlohmann::json& json) {
Expand Down
26 changes: 23 additions & 3 deletions src/iceberg/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,15 @@ std::shared_ptr<Type> ReassignTypeIds(const std::shared_ptr<Type>& type,
// Builds the replacement for `field` when schema field IDs are reassigned.
//
// The result carries `new_id`; the name, optionality and doc are taken from `field`,
// and the type is whatever ReassignTypeIds() returns for `field.type()`. `get_id`,
// `ids_to_reassigned` and `ids_to_original` are not used here directly; they are only
// forwarded to ReassignTypeIds().
SchemaField ReassignField(const SchemaField& field, int32_t new_id,
const Schema::GetId& get_id, Schema::IdMap& ids_to_reassigned,
Schema::IdMap& ids_to_original) {
return {new_id, std::string(field.name()),
// Reassigning IDs only rewrites the field ID and nested type IDs; share the field's
// (immutable) default values rather than copying them.
return {new_id,
std::string(field.name()),
ReassignTypeIds(field.type(), get_id, ids_to_reassigned, ids_to_original),
field.optional(), std::string(field.doc())};
field.optional(),
std::string(field.doc()),
field.initial_default_ptr(),
field.write_default_ptr()};
}

std::vector<SchemaField> ReassignIds(std::vector<SchemaField> fields,
Expand Down Expand Up @@ -447,7 +453,21 @@ Status Schema::Validate(int32_t format_version) const {
}
}

// TODO(GuoTao.yu): Check default values when they are supported
// Only the initial-default is gated on format version: it changes how existing
// data files are read (rows written before the column existed materialize this
// value), so it requires the v3 reader contract. A write-default only affects
// values written going forward and does not reinterpret existing data.
if (field.initial_default().has_value() &&
format_version < TableMetadata::kMinFormatVersionDefaultValues) {
return InvalidSchema(
"Invalid initial default for {}: non-null default ({}) is not supported "
"until v{}",
field.name(), field.initial_default()->get(),
TableMetadata::kMinFormatVersionDefaultValues);
}
if (field.initial_default().has_value() || field.write_default().has_value()) {
ICEBERG_RETURN_UNEXPECTED(field.Validate());
}
}

return {};
Expand Down
78 changes: 75 additions & 3 deletions src/iceberg/schema_field.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,26 @@

#include <format>
#include <string_view>
#include <utility>

#include "iceberg/expression/literal.h"
#include "iceberg/type.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
#include "iceberg/util/macros.h"

namespace iceberg {

// Initializes every member from the matching argument. `type` and the two default
// pointers are taken by value and moved into place; `name` and `doc` arrive as
// string views and are used to initialize `name_` and `doc_`.
SchemaField::SchemaField(int32_t field_id, std::string_view name,
std::shared_ptr<Type> type, bool optional, std::string_view doc)
std::shared_ptr<Type> type, bool optional, std::string_view doc,
std::shared_ptr<const Literal> initial_default,
std::shared_ptr<const Literal> write_default)
: field_id_(field_id),
name_(name),
type_(std::move(type)),
optional_(optional),
doc_(doc) {}
doc_(doc),
initial_default_(std::move(initial_default)),
write_default_(std::move(write_default)) {}

SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name,
std::shared_ptr<Type> type, std::string_view doc) {
Expand All @@ -55,13 +62,64 @@ bool SchemaField::optional() const { return optional_; }

std::string_view SchemaField::doc() const { return doc_; }

// Returns a non-owning reference to the stored `initial-default` literal, or an
// empty optional when no initial default is set on this field.
std::optional<std::reference_wrapper<const Literal>> SchemaField::initial_default()
    const {
  using MaybeLiteralRef = std::optional<std::reference_wrapper<const Literal>>;
  // The pointer is only dereferenced on the branch where it is known to be set.
  return initial_default_ != nullptr ? MaybeLiteralRef(std::cref(*initial_default_))
                                     : MaybeLiteralRef(std::nullopt);
}

// Returns a non-owning reference to the stored `write-default` literal, or an
// empty optional when no write default is set on this field.
std::optional<std::reference_wrapper<const Literal>> SchemaField::write_default() const {
  using MaybeLiteralRef = std::optional<std::reference_wrapper<const Literal>>;
  // The pointer is only dereferenced on the branch where it is known to be set.
  return write_default_ != nullptr ? MaybeLiteralRef(std::cref(*write_default_))
                                   : MaybeLiteralRef(std::nullopt);
}

// Returns the stored `initial-default` pointer by const reference (null when no
// initial default is set). No copy of the shared_ptr is made here; a caller that
// wants to share ownership copies the returned pointer itself.
const std::shared_ptr<const Literal>& SchemaField::initial_default_ptr() const {
return initial_default_;
}

// Returns the stored `write-default` pointer by const reference (null when no
// write default is set). No copy of the shared_ptr is made here; a caller that
// wants to share ownership copies the returned pointer itself.
const std::shared_ptr<const Literal>& SchemaField::write_default_ptr() const {
return write_default_;
}

namespace {

// Checks that `value` is an acceptable default for `field`.
//
// `kind` names the default being checked and is only used in error messages.
// The checks run in this order, and the first failure is returned as an
// InvalidSchema error:
//   1. the literal must not be null, above-max or below-min;
//   2. the field must have a non-null, primitive type;
//   3. the literal's type must equal the field's type.
// Returns an OK status when all three hold.
Status ValidateDefault(const SchemaField& field, const Literal& value,
                       std::string_view kind) {
  const bool is_concrete_value =
      !value.IsNull() && !value.IsAboveMax() && !value.IsBelowMin();
  if (!is_concrete_value) {
    return InvalidSchema("Invalid {} value for {}: must be a non-null value", kind,
                         field.name());
  }

  const auto& field_type = field.type();
  const bool accepts_default = field_type != nullptr && field_type->is_primitive();
  if (!accepts_default) {
    return InvalidSchema("Invalid {} value for {}: {} (must be null)", kind, field.name(),
                         value);
  }

  const auto& value_type = value.type();
  if (*value_type != *field_type) {
    return InvalidSchema("{} of field {} has type {} but expected {}", kind, field.name(),
                         *value_type, *field_type);
  }
  return {};
}

} // namespace

// Validates this field in isolation.
//
// Fails with InvalidSchema when the name is empty or the type is null. After
// that, each default that is set is checked with ValidateDefault(), the
// initial-default first and the write-default second; the first error is
// returned. Returns an OK status otherwise.
Status SchemaField::Validate() const {
  if (name_.empty()) [[unlikely]] {
    return InvalidSchema("SchemaField cannot have empty name");
  }
  if (type_ == nullptr) [[unlikely]] {
    return InvalidSchema("SchemaField cannot have null type");
  }

  // An unset default (null pointer) is trivially valid; a set one is delegated.
  const auto check_default = [this](const std::shared_ptr<const Literal>& literal,
                                    std::string_view kind) -> Status {
    if (literal == nullptr) {
      return {};
    }
    return ValidateDefault(*this, *literal, kind);
  };

  ICEBERG_RETURN_UNEXPECTED(check_default(initial_default_, "initial-default"));
  ICEBERG_RETURN_UNEXPECTED(check_default(write_default_, "write-default"));
  return {};
}

Expand All @@ -72,9 +130,23 @@ std::string SchemaField::ToString() const {
return result;
}

namespace {

// Compares two optional default values.
//
// Two unset (null) defaults are equal, a set default never equals an unset one,
// and two set defaults are compared by the literals they point to.
bool DefaultEquals(const std::shared_ptr<const Literal>& lhs,
                   const std::shared_ptr<const Literal>& rhs) {
  const bool lhs_set = lhs != nullptr;
  const bool rhs_set = rhs != nullptr;
  if (lhs_set && rhs_set) {
    return *lhs == *rhs;
  }
  // At least one side is unset: equal only when both are.
  return lhs_set == rhs_set;
}

} // namespace

// Field equality: compares the field id, the name, the pointed-to types, the
// optional flag and both default values (via DefaultEquals). The doc string is
// not part of the comparison.
// NOTE(review): `type_` is dereferenced on both sides without a null check, so
// this presumably relies on both fields having a non-null type; confirm callers.
bool SchemaField::Equals(const SchemaField& other) const {
return field_id_ == other.field_id_ && name_ == other.name_ && *type_ == *other.type_ &&
optional_ == other.optional_;
optional_ == other.optional_ &&
DefaultEquals(initial_default_, other.initial_default_) &&
DefaultEquals(write_default_, other.write_default_);
}

} // namespace iceberg
41 changes: 40 additions & 1 deletion src/iceberg/schema_field.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@
/// type (e.g. a struct).

#include <cstdint>
#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <string_view>

Expand All @@ -46,8 +48,14 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable {
/// \param[in] type The field type.
/// \param[in] optional True if the field is optional (nullable), false if it is required.
/// \param[in] doc Optional documentation string for the field.
/// \param[in] initial_default The v3 `initial-default` value, or null if absent. The
/// field shares ownership of the (immutable) value.
/// \param[in] write_default The v3 `write-default` value, or null if absent. The field
/// shares ownership of the (immutable) value.
SchemaField(int32_t field_id, std::string_view name, std::shared_ptr<Type> type,
bool optional, std::string_view doc = {});
bool optional, std::string_view doc = {},
std::shared_ptr<const Literal> initial_default = nullptr,
std::shared_ptr<const Literal> write_default = nullptr);

/// \brief Construct an optional (nullable) field.
static SchemaField MakeOptional(int32_t field_id, std::string_view name,
Expand All @@ -71,6 +79,32 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable {
/// \brief Get the field documentation.
std::string_view doc() const;

/// \brief Get the default value for this field used when reading rows written
/// before the field existed (v3 `initial-default`). Empty if absent.
///
/// The returned reference is a non-owning view into a value owned by this field;
/// it remains valid for the lifetime of this SchemaField.
[[nodiscard]] std::optional<std::reference_wrapper<const Literal>> initial_default()
const;

/// \brief Get the default value for this field used when a writer does not
/// supply a value (v3 `write-default`). Empty if absent.
///
/// The returned reference is a non-owning view into a value owned by this field;
/// it remains valid for the lifetime of this SchemaField.
[[nodiscard]] std::optional<std::reference_wrapper<const Literal>> write_default()
const;

/// \brief Get the shared owning pointer to the `initial-default` value, or null if
/// absent. Prefer initial_default() for reading; this exists so a rebuilt field can
/// share the (immutable) value rather than copy it.
[[nodiscard]] const std::shared_ptr<const Literal>& initial_default_ptr() const;

/// \brief Get the shared owning pointer to the `write-default` value, or null if
/// absent. Prefer write_default() for reading; this exists so a rebuilt field can
/// share the (immutable) value rather than copy it.
[[nodiscard]] const std::shared_ptr<const Literal>& write_default_ptr() const;

[[nodiscard]] std::string ToString() const override;

Status Validate() const;
Expand Down Expand Up @@ -100,6 +134,11 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable {
std::shared_ptr<Type> type_;
bool optional_;
std::string doc_;
// Default values are owned by this field and never mutated after being set; copies
// of the field share the same payload (reference-counted) instead of deep-copying,
// like `type_` above. Sharing is unobservable because the payload is immutable.
std::shared_ptr<const Literal> initial_default_;
std::shared_ptr<const Literal> write_default_;
Comment on lines +140 to +141

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ReassignField constructs a new SchemaField via the 5-argument constructor which initializes initial_default_ and write_default_ to nullptr. When schema IDs are reassigned (e.g., copying a schema with fresh IDs via the Schema(get_id) path), all default values on fields are silently lost. We should copy all field properties including initialDefault and writeDefault.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, confirmed. Defaults are now constructor args, and ReassignField passes the source field's initial_default_ptr()/write_default_ptr() through, so they're shared with the reassigned field, not lost. Added ReassignIdsPreservesDefaultValues.

};

} // namespace iceberg
6 changes: 5 additions & 1 deletion src/iceberg/schema_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -172,10 +172,14 @@ Result<FieldProjection> ProjectNested(const Type& expected_type, const Type& sou
iter->second.local_index, prune_source));
} else if (MetadataColumns::IsMetadataColumn(field_id)) {
child_projection.kind = FieldProjection::Kind::kMetadata;
} else if (expected_field.initial_default().has_value()) {
// Rows written before the field existed assume its `initial-default` value.
child_projection.kind = FieldProjection::Kind::kDefault;
child_projection.from = expected_field.initial_default()->get();
} else if (expected_field.optional()) {
child_projection.kind = FieldProjection::Kind::kNull;
} else {
// TODO(gangwu): support default value for v3 and constant value
// TODO(gangwu): support constant value
return InvalidSchema("Missing required field: {}", expected_field.ToString());
}
result.children.emplace_back(std::move(child_projection));
Expand Down
Loading
Loading