Skip to content

Commit eb33022

Browse files
gongxun0928 authored and jiaqizho committed
performance: change the lengths stream element type from int64 to int32
In the current implementation, the length of a single tuple field never exceeds 2GB, so each element of the lengths stream can be stored as an int32 instead of an int64. This saves 4 bytes for every variable-length field value. In a wide-table test with 500 variable-length columns and 1 million tuples, this saved 2GB of storage space (500 columns × 1,000,000 tuples × 4 bytes).
1 parent a08be4d commit eb33022

7 files changed

Lines changed: 22 additions & 19 deletions

File tree

contrib/pax_storage/src/cpp/storage/columns/pax_column.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ template class PaxCommColumn<double>;
178178

179179
PaxNonFixedColumn::PaxNonFixedColumn(uint32 capacity) : estimated_size_(0) {
180180
data_ = PAX_NEW<DataBuffer<char>>(capacity * sizeof(char));
181-
lengths_ = PAX_NEW<DataBuffer<int64>>(capacity * sizeof(char));
181+
lengths_ = PAX_NEW<DataBuffer<int32>>(capacity * sizeof(char));
182182
}
183183

184184
PaxNonFixedColumn::PaxNonFixedColumn() : PaxNonFixedColumn(DEFAULT_CAPACITY) {}
@@ -188,7 +188,7 @@ PaxNonFixedColumn::~PaxNonFixedColumn() {
188188
PAX_DELETE(lengths_);
189189
}
190190

191-
void PaxNonFixedColumn::Set(DataBuffer<char> *data, DataBuffer<int64> *lengths,
191+
void PaxNonFixedColumn::Set(DataBuffer<char> *data, DataBuffer<int32> *lengths,
192192
size_t total_size) {
193193
PAX_DELETE(data_);
194194
PAX_DELETE(lengths_);
@@ -222,15 +222,15 @@ void PaxNonFixedColumn::Append(char *buffer, size_t size) {
222222
}
223223

224224
if (lengths_->Available() == 0) {
225-
lengths_->ReSize(lengths_->Used() + sizeof(int64), 2);
225+
lengths_->ReSize(lengths_->Used() + sizeof(int32), 2);
226226
}
227227

228228
estimated_size_ += size;
229229
data_->Write(buffer, origin_size);
230230
data_->Brush(size);
231231

232-
lengths_->Write(reinterpret_cast<int64 *>(&size), sizeof(int64));
233-
lengths_->Brush(sizeof(int64));
232+
lengths_->Write(reinterpret_cast<int32 *>(&size), sizeof(int32));
233+
lengths_->Brush(sizeof(int32));
234234

235235
offsets_.emplace_back(offsets_.empty()
236236
? 0
@@ -239,7 +239,7 @@ void PaxNonFixedColumn::Append(char *buffer, size_t size) {
239239
Assert(offsets_.size() == lengths_->GetSize());
240240
}
241241

242-
DataBuffer<int64> *PaxNonFixedColumn::GetLengthBuffer() const {
242+
DataBuffer<int32> *PaxNonFixedColumn::GetLengthBuffer() const {
243243
return lengths_;
244244
}
245245

contrib/pax_storage/src/cpp/storage/columns/pax_column.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ class PaxNonFixedColumn : public PaxColumn {
286286

287287
~PaxNonFixedColumn() override;
288288

289-
virtual void Set(DataBuffer<char> *data, DataBuffer<int64> *lengths,
289+
virtual void Set(DataBuffer<char> *data, DataBuffer<int32> *lengths,
290290
size_t total_size);
291291

292292
void Append(char *buffer, size_t size) override;
@@ -310,7 +310,7 @@ class PaxNonFixedColumn : public PaxColumn {
310310

311311
size_t GetNonNullRows() const override;
312312

313-
DataBuffer<int64> *GetLengthBuffer() const;
313+
DataBuffer<int32> *GetLengthBuffer() const;
314314

315315
DataBuffer<int32> *GetOffsetBuffer(bool append_last = false);
316316

@@ -321,8 +321,11 @@ class PaxNonFixedColumn : public PaxColumn {
321321
size_t estimated_size_;
322322
DataBuffer<char> *data_;
323323

324-
// orc needs to serialize int64 array
325-
DataBuffer<int64> *lengths_;
324+
// orc needs to serialize int32 array
325+
// the length of a single tuple field will not exceed 2GB,
326+
// so a variable-length element of the lengths stream can use int32
327+
// to represent the length
328+
DataBuffer<int32> *lengths_;
326329
std::vector<uint64> offsets_;
327330
};
328331

contrib/pax_storage/src/cpp/storage/columns/pax_column_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -732,7 +732,7 @@ TEST_P(PaxNonFixedColumnCompressTest,
732732
auto data_buffer_for_read =
733733
new DataBuffer<char>(encoded_buff, encoded_len, false, false);
734734
data_buffer_for_read->Brush(encoded_len);
735-
auto length_buffer_cpy = new DataBuffer<int64>(*length_buffer);
735+
auto length_buffer_cpy = new DataBuffer<int32>(*length_buffer);
736736
non_fixed_column_for_read->Set(data_buffer_for_read, length_buffer_cpy,
737737
origin_len);
738738
ASSERT_EQ(non_fixed_column_for_read->GetCompressLevel(), 5);

contrib/pax_storage/src/cpp/storage/columns/pax_columns.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ size_t PaxColumns::MeasureOrcDataBuffer(
335335

336336
switch (column->GetPaxColumnTypeInMem()) {
337337
case kTypeNonFixed: {
338-
size_t lengths_size = column_size * sizeof(int64);
338+
size_t lengths_size = column_size * sizeof(int32);
339339

340340
if ((buffer_len + lengths_size) % column->GetAlignSize() != 0) {
341341
auto align_buffer_len =

contrib/pax_storage/src/cpp/storage/columns/pax_encoding_non_fixed_column.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ PaxNonFixedEncodingColumn::~PaxNonFixedEncodingColumn() {
4949
}
5050

5151
void PaxNonFixedEncodingColumn::Set(DataBuffer<char> *data,
52-
DataBuffer<int64> *lengths,
52+
DataBuffer<int32> *lengths,
5353
size_t total_size) {
5454
if (compressor_) {
5555
Assert(!compress_route_);

contrib/pax_storage/src/cpp/storage/columns/pax_encoding_non_fixed_column.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ class PaxNonFixedEncodingColumn final : public PaxNonFixedColumn {
1515

1616
~PaxNonFixedEncodingColumn() override;
1717

18-
void Set(DataBuffer<char> *data, DataBuffer<int64> *lengths,
18+
void Set(DataBuffer<char> *data, DataBuffer<int32> *lengths,
1919
size_t total_size) override;
2020

2121
std::pair<char *, size_t> GetBuffer() override;

contrib/pax_storage/src/cpp/storage/orc/orc_format_reader.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -425,19 +425,19 @@ static PaxColumn *BuildEncodingNonFixedColumn(
425425
uint32 column_lens_size = 0;
426426
uint64 column_lens_len = 0;
427427
uint64 column_data_len = 0;
428-
DataBuffer<int64> *column_len_buffer = nullptr;
428+
DataBuffer<int32> *column_len_buffer = nullptr;
429429
DataBuffer<char> *column_data_buffer = nullptr;
430430
PaxNonFixedColumn *pax_column = nullptr;
431431

432432
column_lens_size = static_cast<uint32>(len_stream.column());
433433
column_lens_len = static_cast<uint64>(len_stream.length());
434434

435-
column_len_buffer = PAX_NEW<DataBuffer<int64>>(
436-
reinterpret_cast<int64 *>(data_buffer->GetAvailableBuffer()),
435+
column_len_buffer = PAX_NEW<DataBuffer<int32>>(
436+
reinterpret_cast<int32 *>(data_buffer->GetAvailableBuffer()),
437437
column_lens_len, false, false);
438438

439-
Assert(column_lens_len >= column_lens_size * sizeof(int64));
440-
column_len_buffer->Brush(column_lens_size * sizeof(int64));
439+
Assert(column_lens_len >= column_lens_size * sizeof(int32));
440+
column_len_buffer->Brush(column_lens_size * sizeof(int32));
441441
data_buffer->Brush(column_lens_len);
442442

443443
column_data_len = data_stream.length();

0 commit comments

Comments
 (0)