diff --git a/.gitmodules b/.gitmodules index 038b1484190..1dc2cbf7153 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,16 @@ [submodule "gpcontrib/gpcloud/test/googletest"] path = gpcontrib/gpcloud/test/googletest url = https://github.com/google/googletest.git +[submodule "contrib/pax_storage/src/cpp/contrib/googletest"] + path = contrib/pax_storage/src/cpp/contrib/googletest + url = https://code.hashdata.xyz/cloudberry/googletest +[submodule "contrib/pax_storage/src/cpp/contrib/tabulate"] + path = contrib/pax_storage/src/cpp/contrib/tabulate + url = https://code.hashdata.xyz/cloudberry/tabulate +[submodule "contrib/pax_storage/src/cpp/contrib/googlebench"] + path = contrib/pax_storage/src/cpp/contrib/googlebench + url = https://code.hashdata.xyz/cloudberry/benchmark.git +[submodule "contrib/pax_storage/src/cpp/contrib/cpp-stub"] + path = contrib/pax_storage/src/cpp/contrib/cpp-stub + url = https://code.hashdata.xyz/cloudberry/cpp-stub.git + branch = dev diff --git a/GNUmakefile.in b/GNUmakefile.in index 77c2ab55218..e6333e39bec 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -31,6 +31,9 @@ all: $(MAKE) -C contrib/pg_buffercache all ifeq ($(with_openssl), yes) $(MAKE) -C contrib/sslinfo all +endif +ifeq ($(enable_pax), yes) + $(MAKE) -C contrib/pax_storage all endif $(MAKE) -C gpMgmt all $(MAKE) -C gpcontrib all @@ -73,6 +76,9 @@ install: $(MAKE) -C contrib/tablefunc $@ $(MAKE) -C contrib/passwordcheck $@ $(MAKE) -C contrib/pg_buffercache $@ +ifeq ($(enable_pax), yes) + $(MAKE) -C contrib/pax_storage $@ +endif ifeq ($(with_openssl), yes) $(MAKE) -C contrib/sslinfo $@ endif @@ -180,6 +186,9 @@ ICW_TARGETS += contrib/extprotocol contrib/dblink contrib/pg_trgm ICW_TARGETS += contrib/indexscan contrib/hstore contrib/pgcrypto ICW_TARGETS += contrib/tablefunc contrib/passwordcheck ICW_TARGETS += contrib/pg_buffercache +ifeq ($(enable_pax), yes) +ICW_TARGETS += contrib/pax_storage +endif # sslinfo depends on openssl ifeq ($(with_openssl), yes) ICW_TARGETS += 
contrib/sslinfo diff --git a/configure b/configure index 57fec242cce..47b466a34e4 100755 --- a/configure +++ b/configure @@ -751,6 +751,7 @@ ICU_CFLAGS with_icu enable_thread_safety INCLUDES +enable_pax enable_preload_ic_module enable_ic_proxy enable_external_fts @@ -901,6 +902,7 @@ enable_gpcloud enable_external_fts enable_ic_proxy enable_preload_ic_module +enable_pax enable_thread_safety with_icu with_tcl @@ -1616,6 +1618,7 @@ Optional Features: library) --disable-preload-ic-module disable preload interconnect module + --enable-pax enable pax support --disable-thread-safety disable thread-safety in client libraries --enable-openssl-redirect enable redirect openssl interface to internal @@ -9098,6 +9101,36 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: checking whether to build with preload ic module ... $enable_preload_ic_module" >&5 $as_echo "checking whether to build with preload ic module ... $enable_preload_ic_module" >&6; } +# +# pax support +# + + +# Check whether --enable-pax was given. +if test "${enable_pax+set}" = set; then : + enableval=$enable_pax; + case $enableval in + yes) + +$as_echo "#define USE_PAX_STORAGE 1" >>confdefs.h + + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --enable-pax option" "$LINENO" 5 + ;; + esac + +else + enable_pax=no + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: checking whether to build with pax support ... $enable_pax" >&5 +$as_echo "checking whether to build with pax support ... $enable_pax" >&6; } # # Include directories diff --git a/configure.ac b/configure.ac index d9df92f6768..1686a00416b 100644 --- a/configure.ac +++ b/configure.ac @@ -925,6 +925,16 @@ PGAC_ARG_BOOL(enable, preload-ic-module, yes, AC_MSG_RESULT([checking whether to build with preload ic module ... 
$enable_preload_ic_module]) AC_SUBST(enable_preload_ic_module) +# +# pax support +# +PGAC_ARG_BOOL(enable, pax, no, + [enable pax support], + [AC_DEFINE(USE_PAX_STORAGE, 1, + [Define to 1 to support pax])]) +AC_MSG_RESULT([checking whether to build with pax support ... $enable_pax]) +AC_SUBST(enable_pax) + # # Include directories # diff --git a/contrib/Makefile b/contrib/Makefile index fceafaffe8a..2292adb88f2 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -98,6 +98,12 @@ else ALWAYS_SUBDIRS += hstore_plpython jsonb_plpython ltree_plpython endif +ifeq ($(enable_pax),yes) +SUBDIRS += pax_storage +else +ALWAYS_SUBDIRS += pax_storage +endif + # Missing: # start-scripts \ (does not have a makefile) diff --git a/contrib/pax_storage/.ci/tf/qingcloud-provider.tf b/contrib/pax_storage/.ci/tf/qingcloud-provider.tf deleted file mode 100644 index 02941a05be8..00000000000 --- a/contrib/pax_storage/.ci/tf/qingcloud-provider.tf +++ /dev/null @@ -1,86 +0,0 @@ -variable "qingcloud_access_key" { - sensitive = true - type = string -} - -variable "qingcloud_secret_key" { - sensitive = true - type = string -} - -variable "qingcloud_zone" { - default = "pek3c" -} - -variable "instance_name" { - default = "ci" -} - -variable "instance_image" { - default = "img-qbpas5m2" -} - -variable "instance_class" { - default = 202 -} - -variable "instance_cpu" { - default = 16 -} - -variable "instance_memory" { - default = 16384 -} - -variable "instance_os_disk_size" { - default = 100 -} - -variable "instance_vxnet" { - default = "vxnet-5tjdylj" -} - -variable "instance_keypair" { - default = [ - "kp-o07unn26"] -} - -terraform { - required_providers { - qingcloud = { - source = "HashDataInc/qingcloud" - version = "1.2.7" - } - ansible = { - source = "nbering/ansible" - version = "1.0.4" - } - } -} - -provider "qingcloud" { - access_key = var.qingcloud_access_key - secret_key = var.qingcloud_secret_key - zone = var.qingcloud_zone -} - - -resource "qingcloud_instance" "ci" { - name = 
var.instance_name - image_id = var.instance_image - instance_class = var.instance_class - cpu = var.instance_cpu - memory = var.instance_memory - os_disk_size = var.instance_os_disk_size - managed_vxnet_id = var.instance_vxnet - keypair_ids = var.instance_keypair -} - -resource "ansible_host" "ci" { - inventory_hostname = qingcloud_instance.ci.private_ip - groups = [ - "runner"] - vars = { - ansible_user = "root" - } -} diff --git a/contrib/pax_storage/.clang-tidy b/contrib/pax_storage/.clang-tidy index 6b6594d4fb7..9e3bff8027b 100644 --- a/contrib/pax_storage/.clang-tidy +++ b/contrib/pax_storage/.clang-tidy @@ -12,7 +12,7 @@ Checks: '-*, modernize-avoid-bind, modernize-loop-convert, modernize-make-shared, - modernize-make-unique, + - modernize-make-unique, modernize-raw-string-literal, modernize-redundant-void-arg, modernize-replace-auto-ptr, @@ -37,7 +37,7 @@ Checks: '-*, readability-avoid-const-params-in-decls, readability-const-return-type, readability-container-size-empty, - readability-convert-member-functions-to-static, + - readability-convert-member-functions-to-static, readability-deleted-default, readability-make-member-function-const, readability-misplaced-array-index, @@ -52,7 +52,7 @@ Checks: '-*, readability-uniqueptr-delete-release, readability-redundant-member-init, readability-simplify-subscript-expr, - readability-simplify-boolean-expr, + - readability-simplify-boolean-expr, readability-inconsistent-declaration-parameter-name, readability-identifier-naming, @@ -68,7 +68,6 @@ Checks: '-*, bugprone-incorrect-roundings, bugprone-infinite-loop, bugprone-integer-division, - bugprone-macro-parentheses, bugprone-macro-repeated-side-effects, bugprone-misplaced-operator-in-strlen-in-alloc, bugprone-misplaced-pointer-artithmetic-in-alloc, @@ -225,4 +224,4 @@ CheckOptions: - key: modernize-use-transparent-functors.SafeMode value: 1 - key: modernize-use-emplace.IgnoreImplicitConstructors - value: 1 \ No newline at end of file + value: 1 diff --git 
a/contrib/pax_storage/.githooks/pre-push b/contrib/pax_storage/.githooks/pre-push deleted file mode 100755 index 22bebb8148b..00000000000 --- a/contrib/pax_storage/.githooks/pre-push +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/sh -# -# Verify what is about to be pushed. Called by "git -# push" after it has checked the remote status, but before anything has been -# pushed. If this script exits with a non-zero status nothing will be pushed. -# - -rc=0 - -if [ -x "./tools/cpplint.py" ]; then - echo "Running cpplint ..." - mkdir -p .tmp/ - ./tools/cpplint.py --counting=detailed --recursive . > .tmp/cpplint.log 2>&1 - rc=$? - if [ $rc -ne 0 ]; then - tail -n 1 .tmp/cpplint.log - echo "" - echo "ERROR cpplint returned errors!" - echo "ERROR Fix the problem and use 'git add' to update your changes." - echo "ERROR See `pwd`/.tmp/cpplint.log for more information." - echo "" - fi -fi - -exit $rc \ No newline at end of file diff --git a/contrib/pax_storage/.gitignore b/contrib/pax_storage/.gitignore index 76807d0dbc3..1fe686d3b2e 100644 --- a/contrib/pax_storage/.gitignore +++ b/contrib/pax_storage/.gitignore @@ -20,5 +20,5 @@ clang-tidy.result **/*.pb.cc # Executables -*.out +/*.out !src/data/expected/*.out diff --git a/contrib/pax_storage/.gitlab-ci.yml b/contrib/pax_storage/.gitlab-ci.yml deleted file mode 100644 index 51415fcc444..00000000000 --- a/contrib/pax_storage/.gitlab-ci.yml +++ /dev/null @@ -1,71 +0,0 @@ -stages: - - build - -.global_variables: &global_variables - # Runner instance name, passed to Terraform - TF_VAR_instance_name: "cbdb-test-pipeline-${CI_PIPELINE_ID}-job-${CI_JOB_ID}" - TF_VAR_qingcloud_access_key: "key" - TF_VAR_qingcloud_secret_key: "secret" - # Custom clone path on runner instance - GIT_SUBMODULE_STRATEGY: "normal" - GIT_DEPTH: 0 - CI_USER: root - # For internal deploy - ARTIFACTORY_USERNAME: "admin" - ARTIFACTORY_PASSWORD: "token" - AWS_ACCESS_KEY_ID: "${TF_VAR_qingcloud_access_key}" - AWS_SECRET_ACCESS_KEY: "${TF_VAR_qingcloud_secret_key}" - 
GIT_CLONE_PATH: "/code/gpdb_pax_src" - # cbdb project dir - CBDB_PROJECT_DIR: "/code/gpdb_src" - # For artifacts - BUCKET_INTERMEDIATE: "http://artifactory.hashdata.xyz/artifactory/hashdata-repository/intermediate-artifacts" - # For pax storage project - CBDB_PAX_BRANCH: $CI_COMMIT_BRANCH - -.build_script: &build_script - script: | - git clone -b feature-pax https://buildbot:Passw0rd@code.hashdata.xyz/cloudberry/cbdb.git $CBDB_PROJECT_DIR - cd /code/gpdb_src - git submodule update --init --recursive - cd /code - echo "${CI_PIPELINE_ID}" > ${CBDB_PROJECT_DIR}/BUILD_NUMBER - bash ${CBDB_PROJECT_DIR}/hd-ci/compile_cbdb.bash - bash ${GIT_CLONE_PATH}/hd-ci/compile_pax.bash - bash ${GIT_CLONE_PATH}/hd-ci/clang_tidy_pax.bash - cp ${CBDB_PROJECT_DIR}/cbdb-artifacts.txt ${CI_PROJECT_DIR}/cbdb-artifacts.txt - touch /code/CI_STATUS - -.build_artifacts: &build_artifacts - artifacts: - name: "artifacts" - when: always - paths: - - ${CI_PROJECT_DIR}/cbdb-artifacts.txt - - ${GIT_CLONE_PATH}/clang-tidy.result - reports: - dotenv: ${CI_PROJECT_DIR}/cbdb-artifacts.txt - -.cbdb_test_rules: &cbdb_test_rules - rules: - - if: $CI_COMMIT_TAG - when: never - - if: '$RUN_NIGHTLY_BUILD == "true"' - when: always - - if: '$RUN_TEST_BUILD == "true"' - when: always - - if: '$CI_PIPELINE_SOURCE == "pipeline"' - when: always - - when: always - -x86_64:build: - stage: build - variables: - <<: *global_variables - <<: *build_script - <<: *build_artifacts - <<: *cbdb_test_rules - timeout: 8 hours - retry: - max: 2 - when: always diff --git a/contrib/pax_storage/.gitmodules b/contrib/pax_storage/.gitmodules index 884a7c5972b..5c3c84dc64d 100644 --- a/contrib/pax_storage/.gitmodules +++ b/contrib/pax_storage/.gitmodules @@ -5,3 +5,6 @@ path = src/cpp/contrib/zstd url = https://code.hashdata.xyz/cloudberry/lib_zstd.git branch = v1.5.5 +[submodule "src/cpp/contrib/cpp-stub"] + path = src/cpp/contrib/cpp-stub + url = https://code.hashdata.xyz/cloudberry/cpp-stub.git diff --git 
a/contrib/pax_storage/CMakeLists.txt b/contrib/pax_storage/CMakeLists.txt index b4ab18181ea..2a6cde93a82 100644 --- a/contrib/pax_storage/CMakeLists.txt +++ b/contrib/pax_storage/CMakeLists.txt @@ -2,73 +2,74 @@ project(Pax) cmake_minimum_required (VERSION 3.11.0) set(CMAKE_CXX_STANDARD 14) -find_program( - PG_CONFIG pg_config - HINTS ${PG_PATH} - PATH_SUFFIXES bin - DOC "The path to the pg_config of the CBDB version to compile against") +set(TOP_DIR ${PROJECT_SOURCE_DIR}/../..) +set(CBDB_INCLUDE_DIR ${TOP_DIR}/src/include) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -g") -if(NOT PG_CONFIG) - message(FATAL_ERROR "Unable to find 'pg_config'") -endif() - -# Function to call pg_config and extract values. -function(GET_PG_CONFIG var) - set(_temp) - - # Only call pg_config if the variable didn't already have a value. - if(NOT ${var}) - execute_process( - COMMAND ${PG_CONFIG} ${ARGN} - OUTPUT_VARIABLE _temp - OUTPUT_STRIP_TRAILING_WHITESPACE) - endif() - - set(${var} - ${_temp} - PARENT_SCOPE) -endfunction() +# Build gtest options +option(BUILD_GTEST "Build with google test" ON) -# Get CBDB configuration from pg_config -get_pg_config(PG_INCLUDEDIR --includedir) -# TODO check exists if this is needed -set(CBDB_INCLUDE_DIR ${PG_INCLUDEDIR}/postgresql/server) +option(BUILD_GBENCH "Build with google benchmark" OFF) -# Debug options -option(ENBALE_DEBUG "Enable debug" ON) +# Build pax tools +option(BUILD_TOOLS "Build with pax tools" ON) -# Build gtest options -option(BUILD_GTEST "Build with google test" ON) - -# Build pax format lib -option(BUILD_PAX_FORMAT "Build pax format lib" OFF) +# env CBDB_BUILD_TYPE is set by Lighting pipeline +if (NOT DEFINED ENV{CBDB_BUILD_TYPE}) + set(ENV{CBDB_BUILD_TYPE} "debug") +endif() +set(CBDB_BUILD_TYPE $ENV{CBDB_BUILD_TYPE}) +message(STATUS "env CBDB_BUILD_TYPE=$ENV{CBDB_BUILD_TYPE} => ${CBDB_BUILD_TYPE}") -if (ENBALE_DEBUG) - ADD_DEFINITIONS(-DENBALE_DEBUG) - # Use to build compile_commands.json - 
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - SET(CMAKE_BUILD_TYPE "Debug") - SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g -ggdb") - SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall") -else() +if (${CBDB_BUILD_TYPE} STREQUAL "release") SET(CMAKE_BUILD_TYPE "Release") + SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3") # no need build gtest in release mode SET(BUILD_GTEST OFF) -endif(ENBALE_DEBUG) - -# Vec options -option(VEC_BUILD "Build pax vectorization version" OFF) -set(VEC_HOME "" CACHE STRING "Path to vectorization home") -if (VEC_BUILD) - -if("${VEC_HOME}" STREQUAL "") - message(FATAL_ERROR "No found vectorization home setting. Using -DVEC_HOME to spec vectorization home") +elseif(${CBDB_BUILD_TYPE} STREQUAL "debug") + ADD_DEFINITIONS(-DENABLE_DEBUG) + # Use to build compile_commands.json + SET(CMAKE_BUILD_TYPE "Debug") + SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -ggdb") +else() + message(FATAL_ERROR "unknown CBDB_BUILD_TYPE: ${CBDB_BUILD_TYPE}") endif() -set(CBDB_ROOT_INCLUDE_DIR ${PG_INCLUDEDIR}) -ADD_DEFINITIONS(-DVEC_BUILD) +if(BUILD_GBENCH) + SET(BUILD_GTEST ON) +endif(BUILD_GBENCH) + +if (BUILD_GTEST) + SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -no-pie -fno-stack-protector -Wall -Wno-unused-function -Wno-unused-variable") + SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-access-control -fno-inline -Wno-pmf-conversions -Wl,--allow-multiple-definition -no-pie -fno-stack-protector") +endif(BUILD_GTEST) + +# Plasma options +option (ENABLE_PLASMA "Enable plasma cache" OFF) +if (ENABLE_PLASMA) +ADD_DEFINITIONS(-DENABLE_PLASMA) +# plasma need use CXX_STANDARD 17 +set(CMAKE_CXX_STANDARD 17) +endif() +# Vec options +include(CheckSymbolExists) +SET(PG_CONFIG_HEADER_FILE "${CBDB_INCLUDE_DIR}/pg_config.h") +CHECK_SYMBOL_EXISTS(USE_VECTORIZATION "${PG_CONFIG_HEADER_FILE}" VEC_BUILD) +message(STATUS "pg_config.h => ${PG_CONFIG_HEADER_FILE}") +if (VEC_BUILD) + set(VEC_HOME "${PROJECT_SOURCE_DIR}/../vectorization") + 
ADD_DEFINITIONS(-DVEC_BUILD) + message(STATUS "Build pax with vectorization support, VEC_HOME=${VEC_HOME}") +else() + message(STATUS "Build pax without vectorization support") endif(VEC_BUILD) + +## find dependencies +## the dependencies checked depend on the values of the options above. +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/") +include(FindDependencies) add_subdirectory(src/cpp) diff --git a/contrib/pax_storage/FindDependencies.cmake b/contrib/pax_storage/FindDependencies.cmake new file mode 100644 index 00000000000..c98e1eb3a67 --- /dev/null +++ b/contrib/pax_storage/FindDependencies.cmake @@ -0,0 +1,37 @@ +find_package(BISON REQUIRED) + +## protobuf +include(FindProtobuf) +find_package(Protobuf 3.6.1 REQUIRED) + +# zstd +# in our image snapshot, zstd is managed using pkg-config, so the pkg-config method is tried first here +find_package(PkgConfig QUIET) +if(PKGCONFIG_FOUND) + pkg_check_modules(ZSTD libzstd) +endif() +if(NOT ZSTD_FOUND) + find_package(ZSTD QUIET) + if(NOT ZSTD_FOUND) + message(FATAL_ERROR "zstd not found") + endif() +endif() + +## for vectorization +if (VEC_BUILD) + find_package(PkgConfig REQUIRED) + pkg_check_modules(GLIB REQUIRED glib-2.0) + + # resolve the vectorization dependency, or the header files will be missing + message(STATUS "Resolve vectorization dependency ...") + execute_process( + COMMAND make download_arrow + WORKING_DIRECTORY ${VEC_HOME} + RESULT_VARIABLE CMD_RESULT + ) + if(CMD_RESULT EQUAL 0) + message(STATUS "Resolve vectorization dependency succeeded.") + else() + message(FATAL_ERROR "Resolve vectorization dependency failed with result: ${CMD_RESULT}") + endif() +endif(VEC_BUILD) \ No newline at end of file diff --git a/contrib/pax_storage/Makefile b/contrib/pax_storage/Makefile new file mode 100644 index 00000000000..8a6f143713d --- /dev/null +++ b/contrib/pax_storage/Makefile @@ -0,0 +1,59 @@ +# contrib/pax_storage/Makefile + +MODULE_big = pax +OBJS = \ + $(WIN32RES) +PG_CPPFLAGS = -I/usr/local/include +PG_CXXFLAGS 
= -std=c++14 + +PGFILEDESC = "pax - PAX table access method" +SHLIB_LINK += -luuid + +REGRESS = setup +REGRESS += detoast ddl types update +# FIXME: several plans are bad in update_gp when using orca +# REGRESS += update_gp +REGRESS += teardown + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pax_storage +top_builddir = ../../ +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +REGRESS_OPTS += --init-file=$(top_builddir)/src/test/regress/init_file + +.PHONY: all +all: build-pax + +.PHONY: install-data build-pax +build-pax: + @echo "build pax" + mkdir -p build + cd build && cmake .. -DCMAKE_INSTALL_PREFIX=$(DESTDIR)$(prefix) && make -j8 && cp src/cpp/libpax.so ../pax.so + +install-data: build-pax + $(INSTALL_DATA) pax-cdbinit--1.0.sql '$(DESTDIR)$(datadir)/cdb_init.d/pax-cdbinit--1.0.sql' + +.PHONY: install +install: install-data + @echo "install data" + make -C build install + +.PHONY: uninstall-data + +uninstall-data: + $(RM) '$(DESTDIR)$(datadir)/cdb_init.d/pax-cdbinit--1.0.sql' + +uninstall: uninstall-data + +clean-data: + $(RM) pax-cdbinit--1.0.sql + $(RM) -r build + +clean: clean-data diff --git a/contrib/pax_storage/README.md b/contrib/pax_storage/README.md index f26eeaf11be..e82817afed5 100644 --- a/contrib/pax_storage/README.md +++ b/contrib/pax_storage/README.md @@ -43,7 +43,7 @@ make -j ### Build GTEST 1. make sure already build pax with cmake option `-DBUILD_GTEST=on`, default value is on -2. better with debug cmake option `-DENBALE_DEBUG=on`, default value is on +2. better with debug cmake option `-DENABLE_DEBUG=on`, default value is on 3. 
run tests ``` diff --git a/contrib/pax_storage/expected/ddl.out b/contrib/pax_storage/expected/ddl.out new file mode 100644 index 00000000000..2917f049b80 --- /dev/null +++ b/contrib/pax_storage/expected/ddl.out @@ -0,0 +1,73 @@ +set default_table_access_method = 'pax'; +create table pax_test.t1( + id int, + name text not null, + height float not null, + decimal_col decimal(10, 2) not null, + created_at timestamp with time zone not null, + updated_at timestamp with time zone not null +) using pax distributed BY (id); +\d+ pax_test.t1 + Table "pax_test.t1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +-------------+--------------------------+-----------+----------+---------+----------+--------------+------------- + id | integer | | | | plain | | + name | text | | not null | | extended | | + height | double precision | | not null | | plain | | + decimal_col | numeric(10,2) | | not null | | main | | + created_at | timestamp with time zone | | not null | | plain | | + updated_at | timestamp with time zone | | not null | | plain | | +Distributed by: (id) + +create table pax_test.t2( + id int, + name text not null, + height float not null, + decimal_col decimal(10, 2) not null, + created_at timestamp with time zone not null, + updated_at timestamp with time zone not null +); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
+\d+ pax_test.t2 + Table "pax_test.t2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +-------------+--------------------------+-----------+----------+---------+----------+--------------+------------- + id | integer | | | | plain | | + name | text | | not null | | extended | | + height | double precision | | not null | | plain | | + decimal_col | numeric(10,2) | | not null | | main | | + created_at | timestamp with time zone | | not null | | plain | | + updated_at | timestamp with time zone | | not null | | plain | | +Distributed by: (id) + +insert into pax_test.t1 (id, name, height, decimal_col, created_at, updated_at) values + (1, 'Alice', 1.65, 1.23, '2023-05-17 17:56:49.633664+08', '2023-05-17 17:56:49.633664+08'), + (2, 'Bob', 1.75, 2.34, '2023-05-17 17:56:49.633664+08', '2023-05-17 17:56:49.633664+08'), + (3, 'Carol', 1.85, 3.45, '2023-05-17 17:56:49.633664+08', '2023-05-17 17:56:49.633664+08'); +alter table pax_test.t1 add column new_col1 int; +alter table pax_test.t1 add column new_col2 int default null; +alter table pax_test.t1 add column new_col3 int default 0; +alter table pax_test.t1 add column new_col4 int default 12; +select * from pax_test.t1; + id | name | height | decimal_col | created_at | updated_at | new_col1 | new_col2 | new_col3 | new_col4 +----+-------+--------+-------------+-------------------------------------+-------------------------------------+----------+----------+----------+---------- + 1 | Alice | 1.65 | 1.23 | Wed May 17 02:56:49.633664 2023 PDT | Wed May 17 02:56:49.633664 2023 PDT | | | 0 | 12 + 2 | Bob | 1.75 | 2.34 | Wed May 17 02:56:49.633664 2023 PDT | Wed May 17 02:56:49.633664 2023 PDT | | | 0 | 12 + 3 | Carol | 1.85 | 3.45 | Wed May 17 02:56:49.633664 2023 PDT | Wed May 17 02:56:49.633664 2023 PDT | | | 0 | 12 +(3 rows) + +alter table pax_test.t1 drop column new_col2; +alter table pax_test.t1 drop column new_col3; +vacuum pax_test.t1; +vacuum full pax_test.t1; +drop table 
pax_test.t1; +drop table pax_test.t2; +-- alter column with options +create table pax_test.t3 (v1 numeric(100,1)) with(compresstype=zstd, compresslevel=1); +alter table pax_test.t3 alter column v1 type numeric; +drop table pax_test.t3; +-- add column with options +create table pax_test.t4 (v1 text) with(compresstype=zstd, compresslevel=1); +alter table pax_test.t4 add column v2 text; +drop table pax_test.t4; diff --git a/contrib/pax_storage/expected/detoast.out b/contrib/pax_storage/expected/detoast.out new file mode 100644 index 00000000000..ded4073a380 --- /dev/null +++ b/contrib/pax_storage/expected/detoast.out @@ -0,0 +1,91 @@ +CREATE TABLE toasttest_external(f1 text); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'f1' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +-- The storage `EXTERNAL` allows out-of-line storage but not compression. +alter table toasttest_external alter column f1 set storage external; +-- These tests are sensitive to block size. In CBDB, the block +-- size is 32 kB, whereas in PostgreSQL it's 8kB. Therefore make +-- the data 4x larger here. +INSERT INTO toasttest_external values (repeat('1234567890',300*4)); +INSERT INTO toasttest_external values (repeat('1234567890',300*4)); +INSERT INTO toasttest_external values (repeat('1234567890',300*4)); +INSERT INTO toasttest_external values (repeat('1234567890',300*4)); +-- expect >0 blocks +SELECT pg_relation_size(reltoastrelid) = 0 AS is_empty + FROM pg_class where relname = 'toasttest_external'; + is_empty +---------- + f +(1 row) + +create table toasttest_external_pax(f1 text) using pax; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'f1' as the Cloudberry Database data distribution key for this table. 
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into toasttest_external_pax select * from toasttest_external; +drop table toasttest_external; +-- If pax insert toast here, Then after drop toasttest_external, toast +-- will not get the source data. +select length(f1) from toasttest_external_pax; + length +-------- + 12000 + 12000 + 12000 + 12000 +(4 rows) + +drop table toasttest_external_pax; +CREATE TABLE toasttest_compress(f1 text); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'f1' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +-- The storage `MAIN` allows compression but not out-of-line storage. +alter table toasttest_compress alter column f1 set storage main; +-- about 1M +INSERT INTO toasttest_compress values (repeat('1234567890123456',1024 * 64)); +-- should be true, becase it's not store in toast table +SELECT pg_relation_size(reltoastrelid) = 0 AS is_empty FROM pg_class where relname = 'toasttest_compress'; + is_empty +---------- + t +(1 row) + +create table toasttest_compress_pax(f1 text) using pax; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'f1' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
+insert into toasttest_compress_pax select * from toasttest_compress; +drop table toasttest_compress; +select length(f1) from toasttest_compress_pax; + length +--------- + 1048576 +(1 row) + +drop table toasttest_compress_pax; +CREATE TABLE toasttest_extended(f1 text); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'f1' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +-- The storage `EXTENDED` allows both compression and out-of-line storage. +alter table toasttest_extended alter column f1 set storage EXTENDED; +-- about 1M, will use out-of-line storage +INSERT INTO toasttest_extended values (repeat('1234567890123456',1024 * 64)); +-- about 80k , will use compression storage +INSERT INTO toasttest_extended values (repeat('1234567890123456',1024 * 5)); +SELECT pg_relation_size(reltoastrelid) = 0 AS is_empty FROM pg_class where relname = 'toasttest_extended'; + is_empty +---------- + f +(1 row) + +create table toasttest_extended_pax(f1 text) using pax; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'f1' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
+insert into toasttest_extended_pax select * from toasttest_extended; +drop table toasttest_extended; +select length(f1) from toasttest_extended_pax; + length +--------- + 1048576 + 81920 +(2 rows) + +drop table toasttest_extended_pax; diff --git a/contrib/pax_storage/src/data/sql/teardown.sql b/contrib/pax_storage/expected/setup.out similarity index 100% rename from contrib/pax_storage/src/data/sql/teardown.sql rename to contrib/pax_storage/expected/setup.out diff --git a/contrib/pax_storage/expected/teardown.out b/contrib/pax_storage/expected/teardown.out new file mode 100644 index 00000000000..e69de29bb2d diff --git a/contrib/pax_storage/src/data/expected/types.out b/contrib/pax_storage/expected/types.out similarity index 75% rename from contrib/pax_storage/src/data/expected/types.out rename to contrib/pax_storage/expected/types.out index 585ea81da8b..3f173f31c0e 100644 --- a/contrib/pax_storage/src/data/expected/types.out +++ b/contrib/pax_storage/expected/types.out @@ -1,8 +1,4 @@ --- start_ignore -create extension pax; -drop table if exists all_typbyval_pg_types; --- end_ignore -CREATE TABLE all_typbyval_pg_types ( +CREATE TABLE pax_test.all_typbyval_pg_types ( id int, bool_col bool, char_col char, @@ -24,10 +20,10 @@ CREATE TABLE all_typbyval_pg_types ( timestamptz_col timestamptz, pg_lsn_col pg_lsn ) USING pax distributed by (id); -insert into all_typbyval_pg_types values(1, true,'c',2,'cid',4.2,5,'2023-05-17 17:56:49',7,'2023-05-17 17:56:49',10,11.1111,12,'2023-05-17 17:56:49','2023-05-17 17:56:49', '16/0'), +insert into pax_test.all_typbyval_pg_types values(1, true,'c',2,'cid',4.2,5,'2023-05-17 17:56:49',7,'2023-05-17 17:56:49',10,11.1111,12,'2023-05-17 17:56:49','2023-05-17 17:56:49', '16/0'), (1, true,'c',2,'cid',4.2,5,'2023-05-17 17:56:49',7,'2023-05-17 17:56:49',10,11.1111,12,'2023-05-17 17:56:49','2023-05-17 17:56:49', '16/0'), (1, true,'c',2,'cid',4.2,5,'2023-05-17 17:56:49',7,'2023-05-17 17:56:49',10,11.1111,12,'2023-05-17 17:56:49','2023-05-17 
17:56:49', '16/0'); -select * from all_typbyval_pg_types; +select * from pax_test.all_typbyval_pg_types; id | bool_col | char_col | int2_col | cid_col | float4_col | int4_col | date_col | oid_col | time_stamp_col | int8_col | float8_col | money_col | time_col | timestamptz_col | pg_lsn_col ----+----------+----------+----------+---------+------------+----------+------------+---------+--------------------------+----------+------------+-----------+----------+------------------------------+------------ 1 | t | c | 2 | 0 | 4.2 | 5 | 05-17-2023 | 7 | Wed May 17 17:56:49 2023 | 10 | 11.1111 | $12.00 | 17:56:49 | Wed May 17 17:56:49 2023 PDT | 16/0 @@ -35,10 +31,7 @@ select * from all_typbyval_pg_types; 1 | t | c | 2 | 0 | 4.2 | 5 | 05-17-2023 | 7 | Wed May 17 17:56:49 2023 | 10 | 11.1111 | $12.00 | 17:56:49 | Wed May 17 17:56:49 2023 PDT | 16/0 (3 rows) --- start_ignore -drop table if exists all_typlen_lt_0_pg_type; --- end_ignore -create table all_typlen_lt_0_pg_type ( +create table pax_test.all_typlen_lt_0_pg_type ( id int, name_col name, numeric_col numeric, @@ -46,13 +39,12 @@ create table all_typlen_lt_0_pg_type ( varchar_col varchar(128), point_col point ) USING pax distributed by (id); -insert into all_typlen_lt_0_pg_type values(1,'hello', 1.23, 'text', 'varchar', point(1,2)); -select * from all_typlen_lt_0_pg_type; +insert into pax_test.all_typlen_lt_0_pg_type values(1,'hello', 1.23, 'text', 'varchar', point(1,2)); +select * from pax_test.all_typlen_lt_0_pg_type; id | name_col | numeric_col | text_col | varchar_col | point_col ----+----------+-------------+----------+-------------+----------- 1 | hello | 1.23 | text | varchar | (1,2) (1 row) --- start_ignore -drop table if exists all_typbyval_pg_types; --- end_ignore +drop table pax_test.all_typbyval_pg_types; +drop table pax_test.all_typlen_lt_0_pg_type; diff --git a/contrib/pax_storage/expected/update.out b/contrib/pax_storage/expected/update.out new file mode 100644 index 00000000000..2420012b121 --- /dev/null 
+++ b/contrib/pax_storage/expected/update.out @@ -0,0 +1,624 @@ +set default_table_access_method = pax; +set pax.enable_filter = off; +-- +-- UPDATE ... SET = DEFAULT; +-- +CREATE TABLE update_test ( + a INT DEFAULT 10, + b INT, + c TEXT +); +CREATE TABLE upsert_test ( + a INT PRIMARY KEY, + b TEXT +); +INSERT INTO update_test VALUES (5, 10, 'foo'); +INSERT INTO update_test(b, a) VALUES (15, 10); +SELECT a,b,c FROM update_test ORDER BY a,b,c; + a | b | c +----+----+----- + 5 | 10 | foo + 10 | 15 | +(2 rows) + +UPDATE update_test SET a = DEFAULT, b = DEFAULT; +SELECT a,b,c FROM update_test ORDER BY a,b,c; + a | b | c +----+---+----- + 10 | | foo + 10 | | +(2 rows) + +-- aliases for the UPDATE target table +UPDATE update_test AS t SET b = 10 WHERE t.a = 10; +SELECT a,b,c FROM update_test ORDER BY a,b,c; + a | b | c +----+----+----- + 10 | 10 | foo + 10 | 10 | +(2 rows) + +UPDATE update_test t SET b = t.b + 10 WHERE t.a = 10; +SELECT a,b,c FROM update_test ORDER BY a,b,c; + a | b | c +----+----+----- + 10 | 20 | foo + 10 | 20 | +(2 rows) + +-- +-- Test VALUES in FROM +-- +UPDATE update_test SET a=v.i FROM (VALUES(100, 20)) AS v(i, j) + WHERE update_test.b = v.j; +SELECT a,b,c FROM update_test ORDER BY a,b,c; + a | b | c +-----+----+----- + 100 | 20 | foo + 100 | 20 | +(2 rows) + +-- fail, wrong data type: +UPDATE update_test SET a = v.* FROM (VALUES(100, 20)) AS v(i, j) + WHERE update_test.b = v.j; +ERROR: column "a" is of type integer but expression is of type record +LINE 1: UPDATE update_test SET a = v.* FROM (VALUES(100, 20)) AS v(i... + ^ +HINT: You will need to rewrite or cast the expression. 
+-- +-- Test multiple-set-clause syntax +-- +INSERT INTO update_test SELECT a,b+1,c FROM update_test; +SELECT * FROM update_test; + a | b | c +-----+----+----- + 100 | 20 | foo + 100 | 20 | + 100 | 21 | foo + 100 | 21 | +(4 rows) + +UPDATE update_test SET (c,b,a) = ('bugle', b+11, DEFAULT) WHERE c = 'foo'; +SELECT a,b,c FROM update_test ORDER BY a,b,c; + a | b | c +-----+----+------- + 10 | 31 | bugle + 10 | 32 | bugle + 100 | 20 | + 100 | 21 | +(4 rows) + +UPDATE update_test SET (c,b) = ('car', a+b), a = a + 1 WHERE a = 10; +SELECT a,b,c FROM update_test ORDER BY a,b,c; + a | b | c +-----+----+----- + 11 | 41 | car + 11 | 42 | car + 100 | 20 | + 100 | 21 | +(4 rows) + +-- fail, multi assignment to same column: +UPDATE update_test SET (c,b) = ('car', a+b), b = a + 1 WHERE a = 10; +ERROR: multiple assignments to same column "b" +-- uncorrelated sub-select: +UPDATE update_test + SET (b,a) = (select a,b from update_test where b = 41 and c = 'car') + WHERE a = 100 AND b = 20; +SELECT * FROM update_test; + a | b | c +-----+----+----- + 100 | 21 | + 11 | 41 | car + 11 | 42 | car + 41 | 11 | +(4 rows) + +-- correlated sub-select: +UPDATE update_test o + SET (b,a) = (select a+1,b from update_test i + where i.a=o.a and i.b=o.b and i.c is not distinct from o.c); +SELECT * FROM update_test; + a | b | c +----+-----+----- + 21 | 101 | + 41 | 12 | car + 42 | 12 | car + 11 | 42 | +(4 rows) + +-- fail, multiple rows supplied: +UPDATE update_test SET (b,a) = (select a+1,b from update_test); +ERROR: more than one row returned by a subquery used as an expression +-- set to null if no rows supplied: +UPDATE update_test SET (b,a) = (select a+1,b from update_test where a = 1000) + WHERE a = 11; +SELECT * FROM update_test; + a | b | c +----+-----+----- + 21 | 101 | + 41 | 12 | car + 42 | 12 | car + | | +(4 rows) + +-- *-expansion should work in this context: +UPDATE update_test SET (a,b) = ROW(v.*) FROM (VALUES(21, 100)) AS v(i, j) + WHERE update_test.a = v.i; +-- you might expect this 
to work, but syntactically it's not a RowExpr: +UPDATE update_test SET (a,b) = (v.*) FROM (VALUES(21, 101)) AS v(i, j) + WHERE update_test.a = v.i; +ERROR: source for a multiple-column UPDATE item must be a sub-SELECT or ROW() expression +LINE 1: UPDATE update_test SET (a,b) = (v.*) FROM (VALUES(21, 101)) ... + ^ +-- if an alias for the target table is specified, don't allow references +-- to the original table name +UPDATE update_test AS t SET b = update_test.b + 10 WHERE t.a = 10; +ERROR: invalid reference to FROM-clause entry for table "update_test" +LINE 1: UPDATE update_test AS t SET b = update_test.b + 10 WHERE t.a... + ^ +HINT: Perhaps you meant to reference the table alias "t". +-- Make sure that we can update to a TOASTed value. +UPDATE update_test SET c = repeat('x', 10000) WHERE c = 'car'; +SELECT a, b, char_length(c) FROM update_test; + a | b | char_length +----+-----+------------- + | | + 21 | 100 | + 41 | 12 | 10000 + 42 | 12 | 10000 +(4 rows) + +-- Check multi-assignment with a Result node to handle a one-time filter. 
+EXPLAIN (VERBOSE, COSTS OFF) +UPDATE update_test t + SET (a, b) = (SELECT b, a FROM update_test s WHERE s.a = t.a) + WHERE CURRENT_USER = SESSION_USER; + QUERY PLAN +---------------------------------------------------------------------------------------------------------- + Update on public.update_test t + -> Explicit Redistribute Motion 3:3 (slice1; segments: 3) + Output: ($1), ($2), t.c, ((SubPlan 1 (returns $1,$2))), t.ctid, t.gp_segment_id, t.*, (DMLAction) + -> Split + Output: ($1), ($2), t.c, ((SubPlan 1 (returns $1,$2))), t.ctid, t.gp_segment_id, t.*, DMLAction + -> Seq Scan on public.update_test t + Output: $1, $2, t.c, (SubPlan 1 (returns $1,$2)), t.ctid, t.gp_segment_id, t.* + SubPlan 1 (returns $1,$2) + -> Result + Output: s.b, s.a + Filter: (s.a = t.a) + -> Materialize + Output: s.b, s.a + -> Broadcast Motion 3:3 (slice2; segments: 3) + Output: s.b, s.a + -> Seq Scan on public.update_test s + Output: s.b, s.a + Optimizer: Postgres query optimizer +(18 rows) + +UPDATE update_test t + SET (a, b) = (SELECT b, a FROM update_test s WHERE s.a = t.a) + WHERE CURRENT_USER = SESSION_USER; +SELECT a, b, char_length(c) FROM update_test; + a | b | char_length +-----+----+------------- + | | + 100 | 21 | + 12 | 41 | 10000 + 12 | 42 | 10000 +(4 rows) + +-- Test ON CONFLICT DO UPDATE +INSERT INTO upsert_test VALUES(1, 'Boo'), (3, 'Zoo'); +-- uncorrelated sub-select: +WITH aaa AS (SELECT 1 AS a, 'Foo' AS b) INSERT INTO upsert_test + VALUES (1, 'Bar') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b, a FROM aaa) RETURNING *; +ERROR: modification of distribution columns in OnConflictUpdate is not supported +-- correlated sub-select: +INSERT INTO upsert_test VALUES (1, 'Baz'), (3, 'Zaz') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b || ', Correlated', a from upsert_test i WHERE i.a = upsert_test.a) + RETURNING *; +ERROR: modification of distribution columns in OnConflictUpdate is not supported +-- correlated sub-select (EXCLUDED.* alias): +INSERT INTO upsert_test 
VALUES (1, 'Bat'), (3, 'Zot') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b || ', Excluded', a from upsert_test i WHERE i.a = excluded.a) + RETURNING *; +ERROR: modification of distribution columns in OnConflictUpdate is not supported +-- ON CONFLICT using system attributes in RETURNING, testing both the +-- inserting and updating paths. See bug report at: +-- https://www.postgresql.org/message-id/73436355-6432-49B1-92ED-1FE4F7E7E100%40finefun.com.au +INSERT INTO upsert_test VALUES (2, 'Beeble') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b || ', Excluded', a from upsert_test i WHERE i.a = excluded.a) + RETURNING tableoid::regclass, xmin = pg_current_xact_id()::xid AS xmin_correct, xmax = 0 AS xmax_correct; +ERROR: modification of distribution columns in OnConflictUpdate is not supported +-- currently xmax is set after a conflict - that's probably not good, +-- but it seems worthwhile to have to be explicit if that changes. +INSERT INTO upsert_test VALUES (2, 'Brox') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b || ', Excluded', a from upsert_test i WHERE i.a = excluded.a) + RETURNING tableoid::regclass, xmin = pg_current_xact_id()::xid AS xmin_correct, xmax = pg_current_xact_id()::xid AS xmax_correct; +ERROR: modification of distribution columns in OnConflictUpdate is not supported +DROP TABLE update_test; +DROP TABLE upsert_test; +-- Test ON CONFLICT DO UPDATE with partitioned table and non-identical children +CREATE TABLE upsert_test ( + a INT PRIMARY KEY, + b TEXT +) PARTITION BY LIST (a); +CREATE TABLE upsert_test_1 PARTITION OF upsert_test FOR VALUES IN (1); +CREATE TABLE upsert_test_2 (b TEXT, a INT PRIMARY KEY); +ALTER TABLE upsert_test ATTACH PARTITION upsert_test_2 FOR VALUES IN (2); +INSERT INTO upsert_test VALUES(1, 'Boo'), (2, 'Zoo'); +-- uncorrelated sub-select: +WITH aaa AS (SELECT 1 AS a, 'Foo' AS b) INSERT INTO upsert_test + VALUES (1, 'Bar') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b, a FROM aaa) RETURNING *; +ERROR: 
modification of distribution columns in OnConflictUpdate is not supported +-- correlated sub-select: +WITH aaa AS (SELECT 1 AS ctea, ' Foo' AS cteb) INSERT INTO upsert_test + VALUES (1, 'Bar'), (2, 'Baz') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT upsert_test.b||cteb, upsert_test.a FROM aaa) RETURNING *; +ERROR: modification of distribution columns in OnConflictUpdate is not supported +DROP TABLE upsert_test; +--------------------------- +-- UPDATE with row movement +--------------------------- +-- When a partitioned table receives an UPDATE to the partitioned key and the +-- new values no longer meet the partition's bound, the row must be moved to +-- the correct partition for the new partition key (if one exists). We must +-- also ensure that updatable views on partitioned tables properly enforce any +-- WITH CHECK OPTION that is defined. The situation with triggers in this case +-- also requires thorough testing as partition key updates causing row +-- movement convert UPDATEs into DELETE+INSERT. +CREATE TABLE range_parted ( + a text, + b bigint, + c numeric, + d int, + e varchar +) PARTITION BY RANGE (a, b); +-- Create partitions intentionally in descending bound order, so as to test +-- that update-row-movement works with the leaf partitions not in bound order. +CREATE TABLE part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +-- GPDB: distribution policy must match the parent table. 
+alter table part_b_20_b_30 set distributed by (a); +ALTER TABLE range_parted ATTACH PARTITION part_b_20_b_30 FOR VALUES FROM ('b', 20) TO ('b', 30); +CREATE TABLE part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY RANGE (c); +alter table part_b_10_b_20 set distributed by (a); +CREATE TABLE part_b_1_b_10 PARTITION OF range_parted FOR VALUES FROM ('b', 1) TO ('b', 10); +ALTER TABLE range_parted ATTACH PARTITION part_b_10_b_20 FOR VALUES FROM ('b', 10) TO ('b', 20); +CREATE TABLE part_a_10_a_20 PARTITION OF range_parted FOR VALUES FROM ('a', 10) TO ('a', 20); +CREATE TABLE part_a_1_a_10 PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('a', 10); +-- Check that partition-key UPDATE works sanely on a partitioned table that +-- does not have any child partitions. +UPDATE part_b_10_b_20 set b = b - 6; +-- Create some more partitions following the above pattern of descending bound +-- order, but let's make the situation a bit more complex by having the +-- attribute numbers of the columns vary from their parent partition. +CREATE TABLE part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY range (abs(d)); +ALTER TABLE part_c_100_200 DROP COLUMN e, DROP COLUMN c, DROP COLUMN a; +ALTER TABLE part_c_100_200 ADD COLUMN c numeric, ADD COLUMN e varchar, ADD COLUMN a text; +ALTER TABLE part_c_100_200 DROP COLUMN b; +ALTER TABLE part_c_100_200 ADD COLUMN b bigint; +CREATE TABLE part_d_1_15 PARTITION OF part_c_100_200 FOR VALUES FROM (1) TO (15); +CREATE TABLE part_d_15_20 PARTITION OF part_c_100_200 FOR VALUES FROM (15) TO (20); +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); +ERROR: distribution policy for "part_c_100_200" must be the same as that for "part_b_10_b_20" +-- GPDB: distribution policy must match the parent table, so the previous command fails. +-- Change the distribution key and try again. 
+alter table part_c_100_200 set distributed by (a); +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); +CREATE TABLE part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +alter table part_c_1_100 set distributed by (a); +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO (100); +\set init_range_parted 'truncate range_parted; insert into range_parted VALUES (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' +\set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted ORDER BY 1, 2, 3, 4, 5, 6' +:init_range_parted; +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | +(6 rows) + +-- The order of subplans should be in bound order +EXPLAIN (costs off) UPDATE range_parted set c = c - 50 WHERE c > 97; + QUERY PLAN +------------------------------------------------------- + Update on range_parted + Update on part_a_1_a_10 range_parted_1 + Update on part_a_10_a_20 range_parted_2 + Update on part_b_1_b_10 range_parted_3 + Update on part_c_1_100 range_parted_4 + Update on part_d_1_15 range_parted_5 + Update on part_d_15_20 range_parted_6 + Update on part_b_20_b_30 range_parted_7 + -> Append + -> Seq Scan on part_a_1_a_10 range_parted_1 + Filter: (c > '97'::numeric) + -> Seq Scan on part_a_10_a_20 range_parted_2 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_1_b_10 range_parted_3 + Filter: (c > '97'::numeric) + -> Seq Scan on part_c_1_100 range_parted_4 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_1_15 range_parted_5 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_15_20 range_parted_6 + Filter: (c > '97'::numeric) + -> Seq Scan on 
part_b_20_b_30 range_parted_7 + Filter: (c > '97'::numeric) +(23 rows) + +-- fail, row movement happens only within the partition subtree. +UPDATE part_c_100_200 set c = c - 20, d = c WHERE c = 105; +ERROR: new row for relation "part_c_100_200" violates partition constraint +DETAIL: Failing row contains (105, 85, null, b, 15). +-- fail, no partition key update, so no attempt to move tuple, +-- but "a = 'a'" violates partition constraint enforced by root partition) +UPDATE part_b_10_b_20 set a = 'a'; +ERROR: new row for relation "part_b_10_b_20" violates partition constraint +DETAIL: Failing row contains (null, 96, a, 12, 1). +-- ok, partition key update, no constraint violation +UPDATE range_parted set d = d - 10 WHERE d > 10; +-- ok, no partition key update, no constraint violation +UPDATE range_parted set e = d; +-- No row found +UPDATE part_c_1_100 set c = c + 20 WHERE c = 98; +-- ok, row movement +UPDATE part_b_10_b_20 set c = c + 20 returning c, b, a; + c | b | a +-----+----+--- + 116 | 12 | b + 117 | 13 | b + 125 | 15 | b + 125 | 17 | b +(4 rows) + +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+---+--- + part_a_10_a_20 | a | 10 | 200 | 1 | 1 + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_d_1_15 | b | 12 | 116 | 1 | 1 + part_d_1_15 | b | 13 | 117 | 2 | 2 + part_d_1_15 | b | 15 | 125 | 6 | 6 + part_d_1_15 | b | 17 | 125 | 9 | 9 +(6 rows) + +-- fail, row movement happens only within the partition subtree. +UPDATE part_b_10_b_20 set b = b - 6 WHERE c > 116 returning *; +ERROR: new row for relation "part_b_10_b_20" violates partition constraint +DETAIL: Failing row contains (2, 117, b, 7, 2). +-- ok, row movement, with subset of rows moved into different partition. +UPDATE range_parted set b = b - 6 WHERE c > 116 returning a, b + c; + a | ?column? 
+---+---------- + a | 204 + b | 124 + b | 134 + b | 136 +(4 rows) + +:show_data; + partname | a | b | c | d | e +---------------+---+----+-----+---+--- + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_a_1_a_10 | a | 4 | 200 | 1 | 1 + part_b_1_b_10 | b | 7 | 117 | 2 | 2 + part_b_1_b_10 | b | 9 | 125 | 6 | 6 + part_d_1_15 | b | 11 | 125 | 9 | 9 + part_d_1_15 | b | 12 | 116 | 1 | 1 +(6 rows) + +-- Common table needed for multiple test scenarios. +CREATE TABLE mintab(c1 int); +INSERT into mintab VALUES (120); +-- update partition key using updatable view. +CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 FROM mintab) WITH CHECK OPTION; +-- ok +UPDATE upview set c = 199 WHERE b = 4; +-- fail, check option violation +UPDATE upview set c = 120 WHERE b = 4; +ERROR: new row violates check option for view "upview" +DETAIL: Failing row contains (a, 4, 120, 1, 1). +-- fail, row movement with check option violation +UPDATE upview set a = 'b', b = 15, c = 120 WHERE b = 4; +ERROR: new row violates check option for view "upview" +DETAIL: Failing row contains (b, 15, 120, 1, 1). +-- ok, row movement, check option passes +UPDATE upview set a = 'b', b = 15 WHERE b = 4; +:show_data; + partname | a | b | c | d | e +---------------+---+----+-----+---+--- + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_b_1_b_10 | b | 7 | 117 | 2 | 2 + part_b_1_b_10 | b | 9 | 125 | 6 | 6 + part_d_1_15 | b | 11 | 125 | 9 | 9 + part_d_1_15 | b | 12 | 116 | 1 | 1 + part_d_1_15 | b | 15 | 199 | 1 | 1 +(6 rows) + +-- cleanup +DROP VIEW upview; +-- RETURNING having whole-row vars. 
+:init_range_parted; +UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (range_parted), *; + range_parted | a | b | c | d | e +---------------+---+----+----+----+--- + (b,15,95,16,) | b | 15 | 95 | 16 | + (b,17,95,19,) | b | 17 | 95 | 19 | +(2 rows) + +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_c_1_100 | b | 15 | 95 | 16 | + part_c_1_100 | b | 17 | 95 | 19 | +(6 rows) + +-- Creating default partition for range +:init_range_parted; +create table part_def partition of range_parted default; +\d+ part_def + Table "public.part_def" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+-------------------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | bigint | | | | plain | | + c | numeric | | | | main | | + d | integer | | | | plain | | + e | character varying | | | | extended | | +Partition of: range_parted DEFAULT +Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'a'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'b'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '20'::bigint) AND (b < '30'::bigint))))) + +insert into range_parted values ('c', 9); +-- ok +update part_def set a = 'd' where a = 'c'; +-- fail +update part_def set a = 'a' where a = 'd'; +ERROR: new row for relation "part_def" violates partition constraint +DETAIL: Failing row contains (a, 9, null, null, null). 
+:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + +-- Update row movement from non-default to default partition. +-- fail, default partition is not under part_a_10_a_20; +UPDATE part_a_10_a_20 set a = 'ad' WHERE a = 'a'; +ERROR: new row for relation "part_a_10_a_20" violates partition constraint +DETAIL: Failing row contains (ad, 10, 200, 1, null). +-- ok +UPDATE range_parted set a = 'ad' WHERE a = 'a'; +UPDATE range_parted set a = 'bd' WHERE a = 'b'; +:show_data; + partname | a | b | c | d | e +----------+----+----+-----+----+--- + part_def | ad | 1 | 1 | 1 | + part_def | ad | 10 | 200 | 1 | + part_def | bd | 12 | 96 | 1 | + part_def | bd | 13 | 97 | 2 | + part_def | bd | 15 | 105 | 16 | + part_def | bd | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + +-- Update row movement from default to non-default partitions. +-- ok +UPDATE range_parted set a = 'a' WHERE a = 'ad'; +UPDATE range_parted set a = 'b' WHERE a = 'bd'; +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + +-- Cleanup: range_parted no longer needed. 
+DROP TABLE range_parted; +CREATE TABLE list_parted ( + a text, + b int +) PARTITION BY list (a); +CREATE TABLE list_part1 PARTITION OF list_parted for VALUES in ('a', 'b'); +CREATE TABLE list_default PARTITION OF list_parted default; +INSERT into list_part1 VALUES ('a', 1); +INSERT into list_default VALUES ('d', 10); +-- fail +UPDATE list_default set a = 'a' WHERE a = 'd'; +ERROR: new row for relation "list_default" violates partition constraint +DETAIL: Failing row contains (a, 10). +-- ok +UPDATE list_default set a = 'x' WHERE a = 'd'; +DROP TABLE list_parted; +-------------- +-- Some more update-partition-key test scenarios below. This time use list +-- partitions. +-------------- +-- Setup for list partitions +CREATE TABLE list_parted (a numeric, b int, c int8) PARTITION BY list (a); +CREATE TABLE sub_parted PARTITION OF list_parted for VALUES in (1) PARTITION BY list (b); +CREATE TABLE sub_part1(b int, c int8, a numeric); +alter table sub_part1 set distributed by (a); -- GPDB: distribution policy must match the parent table. +ALTER TABLE sub_parted ATTACH PARTITION sub_part1 for VALUES in (1); +CREATE TABLE sub_part2(b int, c int8, a numeric); +alter table sub_part2 set distributed by (a); -- GPDB: distribution policy must match the parent table. +ALTER TABLE sub_parted ATTACH PARTITION sub_part2 for VALUES in (2); +CREATE TABLE list_part1(a numeric, b int, c int8); +ALTER TABLE list_parted ATTACH PARTITION list_part1 for VALUES in (2,3); +INSERT into list_parted VALUES (2,5,50); +INSERT into list_parted VALUES (3,6,60); +INSERT into sub_parted VALUES (1,1,60); +INSERT into sub_parted VALUES (1,2,10); +-- Test partition constraint violation when intermediate ancestor is used and +-- constraint is inherited from upper root. +UPDATE sub_parted set a = 2 WHERE c = 10; +ERROR: new row for relation "sub_parted" violates partition constraint +DETAIL: Failing row contains (2, 2, 10). 
+-- Test update-partition-key, where the unpruned partitions do not have their +-- partition keys updated. +SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; + tableoid | a | b | c +------------+---+---+---- + list_part1 | 2 | 5 | 50 +(1 row) + +UPDATE list_parted set b = c + a WHERE a = 2; +SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 +(1 row) + +-- Cleanup: list_parted no longer needed. +DROP TABLE list_parted; +-- create custom operator class and hash function, for the same reason +-- explained in alter_table.sql +create or replace function dummy_hashint4(a int4, seed int8) returns int8 as +$$ begin return (a + seed); end; $$ language 'plpgsql' immutable; +create operator class custom_opclass for type int4 using hash as +operator 1 = , function 2 dummy_hashint4(int4, int8); +create table hash_parted ( + a int, + b int +) partition by hash (a custom_opclass, b custom_opclass); +create table hpart1 partition of hash_parted for values with (modulus 2, remainder 1); +create table hpart2 partition of hash_parted for values with (modulus 4, remainder 2); +create table hpart3 partition of hash_parted for values with (modulus 8, remainder 0); +create table hpart4 partition of hash_parted for values with (modulus 8, remainder 4); +insert into hpart1 values (1, 1); +insert into hpart2 values (2, 5); +insert into hpart4 values (3, 4); +-- fail +update hpart1 set a = 3, b=4 where a = 1; +ERROR: new row for relation "hpart1" violates partition constraint +DETAIL: Failing row contains (3, 4). 
+-- ok, row movement +update hash_parted set b = b - 1 where b = 1; +-- ok +update hash_parted set b = b + 8 where b = 1; +-- cleanup +drop table hash_parted; +drop operator class custom_opclass using hash; +drop function dummy_hashint4(a int4, seed int8); diff --git a/contrib/pax_storage/src/data/expected/update_gp.out b/contrib/pax_storage/expected/update_gp.out similarity index 77% rename from contrib/pax_storage/src/data/expected/update_gp.out rename to contrib/pax_storage/expected/update_gp.out index ba6ca2bc930..9a06ce92d04 100644 --- a/contrib/pax_storage/src/data/expected/update_gp.out +++ b/contrib/pax_storage/expected/update_gp.out @@ -1,9 +1,9 @@ +set default_table_access_method = pax; -- Test DELETE and UPDATE on an inherited table. -- The special aspect of this table is that the inherited table has -- a different distribution key. 'p' table's distribution key matches -- that of 'r', but 'p2's doesn't. Test that the planner adds a Motion -- node correctly for p2. -set default_table_access_method = 'pax'; create table todelete (a int) distributed by (a); create table parent (a int, b int, c int) distributed by (a); create table child (a int, b int, c int) inherits (parent) distributed by (b); @@ -19,14 +19,14 @@ update parent set c=c+100 from todelete where parent.a = todelete.a; select * from parent; a | b | c ----+----+----- + 1 | 1 | 1 5 | 5 | 105 9 | 9 | 9 - 10 | 10 | 10 - 6 | 6 | 106 + 7 | 7 | 107 2 | 2 | 2 8 | 8 | 8 - 7 | 7 | 107 - 1 | 1 | 1 + 10 | 10 | 10 + 6 | 6 | 106 (8 rows) drop table todelete; @@ -49,13 +49,13 @@ update target set b=target.b+100 where c = 3 and a in (select b from todelete); select * from target; a | b | c ---+-----+--- - 5 | 0 | 1 - 5 | 100 | 3 - 1 | 0 | 1 2 | 0 | 1 - 3 | 0 | 1 4 | 0 | 1 4 | 0 | 3 + 1 | 0 | 1 + 3 | 0 | 1 + 5 | 0 | 1 + 5 | 100 | 3 (7 rows) -- Also test an update with a qual that doesn't match any partition. 
The @@ -76,12 +76,10 @@ create table child_b (a int4, b int4) inherits (base_tbl) distributed by (b); NOTICE: merging column "a" with inherited definition NOTICE: merging column "b" with inherited definition insert into base_tbl select g, g from generate_series(1, 5) g; --- start_ignore explain (costs off) update base_tbl set a=a+1; -ERROR: can't split update for inherit table: base_tbl (preptlist.c:138) --- end_ignore +ERROR: can't split update for inherit table: base_tbl update base_tbl set a = 5; -ERROR: can't split update for inherit table: base_tbl (preptlist.c:138) +ERROR: can't split update for inherit table: base_tbl -- -- Explicit Distribution motion must be added if any of the child nodes -- contains any motion excluding the motions in initplans. @@ -96,8 +94,8 @@ CREATE TABLE keo3 ( sky_per character varying(24), bky_per character varying(24) INSERT INTO keo3 VALUES ('1', '1'); CREATE TABLE keo4 ( keo_para_required_period character varying(6), keo_para_budget_date character varying(24)) DISTRIBUTED RANDOMLY; INSERT INTO keo4 VALUES ('1', '1'); +ANALYZE keo1, keo2, keo3, keo4; -- Explicit Redistribution motion should be added in case of GPDB Planner (test case not applicable for ORCA) --- start_ignore EXPLAIN (COSTS OFF) UPDATE keo1 SET user_vie_act_cntr_marg_cum = 234.682 FROM ( SELECT a.user_vie_project_code_pk FROM keo1 a INNER JOIN keo2 b ON b.projects_pk=a.user_vie_project_code_pk @@ -107,41 +105,37 @@ EXPLAIN (COSTS OFF) UPDATE keo1 SET user_vie_act_cntr_marg_cum = 234.682 FROM (SELECT min (keo4.keo_para_budget_date) FROM keo4))) ) t1 WHERE t1.user_vie_project_code_pk = keo1.user_vie_project_code_pk; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------- Update on keo1 - InitPlan 3 (returns $2) (slice4) - -> Finalize Aggregate - InitPlan 2 (returns $1) 
(slice6) - -> Gather Motion 3:1 (slice7; segments: 3) - InitPlan 1 (returns $0) (slice8) - -> Finalize Aggregate - -> Gather Motion 3:1 (slice9; segments: 3) - -> Partial Aggregate - -> Seq Scan on keo4 + InitPlan 3 (returns $2) (slice3) + -> Aggregate + InitPlan 2 (returns $1) (slice5) + -> Gather Motion 3:1 (slice6; segments: 3) + InitPlan 1 (returns $0) (slice7) + -> Aggregate + -> Gather Motion 3:1 (slice8; segments: 3) + -> Seq Scan on keo4 -> Seq Scan on keo4 keo4_1 Filter: ((keo_para_budget_date)::text = $0) - -> Gather Motion 3:1 (slice5; segments: 3) - -> Partial Aggregate - -> Seq Scan on keo3 - Filter: ((bky_per)::text = ($1)::text) - -> Explicit Redistribute Motion 3:3 (slice1; segments: 3) + -> Gather Motion 3:1 (slice4; segments: 3) + -> Seq Scan on keo3 + Filter: ((bky_per)::text = ($1)::text) + -> Hash Join + Hash Cond: ((a.user_vie_project_code_pk)::text = (b.projects_pk)::text) -> Hash Join - Hash Cond: ((b.projects_pk)::text = (a.user_vie_project_code_pk)::text) - -> Seq Scan on keo2 b + Hash Cond: ((a.user_vie_project_code_pk)::text = (keo1.user_vie_project_code_pk)::text) + -> Broadcast Motion 3:3 (slice1; segments: 3) + -> Seq Scan on keo1 a + Filter: ((user_vie_fiscal_year_period_sk)::text = $2) -> Hash - -> Broadcast Motion 3:3 (slice2; segments: 3) - -> Hash Join - Hash Cond: ((keo1.user_vie_project_code_pk)::text = (a.user_vie_project_code_pk)::text) - -> Seq Scan on keo1 - -> Hash - -> Broadcast Motion 3:3 (slice3; segments: 3) - -> Seq Scan on keo1 a - Filter: ((user_vie_fiscal_year_period_sk)::text = $2) + -> Seq Scan on keo1 + -> Hash + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Seq Scan on keo2 b Optimizer: Postgres query optimizer -(30 rows) +(27 rows) --- end_ignore UPDATE keo1 SET user_vie_act_cntr_marg_cum = 234.682 FROM ( SELECT a.user_vie_project_code_pk FROM keo1 a INNER JOIN keo2 b ON b.projects_pk=a.user_vie_project_code_pk @@ -160,7 +154,6 @@ SELECT user_vie_act_cntr_marg_cum FROM keo1; -- Explicit Redistribution 
motion should not be added in case of GPDB Planner (test case not applicable to ORCA) CREATE TABLE keo5 (x int, y int) DISTRIBUTED BY (x); INSERT INTO keo5 VALUES (1,1); --- start_ignore EXPLAIN (COSTS OFF) DELETE FROM keo5 WHERE x IN (SELECT x FROM keo5 WHERE EXISTS (SELECT x FROM keo5 WHERE x < 2)); QUERY PLAN ------------------------------------------------------- @@ -181,7 +174,6 @@ EXPLAIN (COSTS OFF) DELETE FROM keo5 WHERE x IN (SELECT x FROM keo5 WHERE EXISTS Optimizer: Postgres query optimizer (15 rows) --- end_ignore DELETE FROM keo5 WHERE x IN (SELECT x FROM keo5 WHERE EXISTS (SELECT x FROM keo5 WHERE x < 2)); SELECT x FROM keo5; x @@ -194,34 +186,48 @@ DROP TABLE keo2; DROP TABLE keo3; DROP TABLE keo4; DROP TABLE keo5; --- start_ignore --- -- text types. We should support the following updates. --- -- --- CREATE TEMP TABLE ttab1 (a varchar(15), b integer) DISTRIBUTED BY (a); --- CREATE TEMP TABLE ttab2 (a varchar(15), b integer) DISTRIBUTED BY (a); --- UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; --- DROP TABLE ttab1; --- DROP TABLE ttab2; --- CREATE TEMP TABLE ttab1 (a text, b integer) DISTRIBUTED BY (a); --- CREATE TEMP TABLE ttab2 (a text, b integer) DISTRIBUTED BY (a); --- UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; --- DROP TABLE ttab1; --- DROP TABLE ttab2; --- CREATE TEMP TABLE ttab1 (a varchar, b integer) DISTRIBUTED BY (a); --- CREATE TEMP TABLE ttab2 (a varchar, b integer) DISTRIBUTED BY (a); --- UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; --- DROP TABLE ttab1; --- DROP TABLE ttab2; --- CREATE TEMP TABLE ttab1 (a char(15), b integer) DISTRIBUTED BY (a); --- CREATE TEMP TABLE ttab2 (a char(15), b integer) DISTRIBUTED BY (a); --- UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; --- DROP TABLE IF EXISTS update_distr_key; --- CREATE TEMP TABLE update_distr_key (a int, b int) DISTRIBUTED BY (a); --- INSERT INTO update_distr_key select i, i* 10 from generate_series(0, 9) i; 
--- UPDATE update_distr_key SET a = 5 WHERE b = 10; --- SELECT * from update_distr_key; --- DROP TABLE update_distr_key; --- end_ignore +-- +-- text types. We should support the following updates. +-- +CREATE TEMP TABLE ttab1 (a varchar(15), b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a varchar(15), b integer) DISTRIBUTED BY (a); +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +DROP TABLE ttab1; +DROP TABLE ttab2; +CREATE TEMP TABLE ttab1 (a text, b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a text, b integer) DISTRIBUTED BY (a); +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +DROP TABLE ttab1; +DROP TABLE ttab2; +CREATE TEMP TABLE ttab1 (a varchar, b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a varchar, b integer) DISTRIBUTED BY (a); +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +DROP TABLE ttab1; +DROP TABLE ttab2; +CREATE TEMP TABLE ttab1 (a char(15), b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a char(15), b integer) DISTRIBUTED BY (a); +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +DROP TABLE IF EXISTS update_distr_key; +NOTICE: table "update_distr_key" does not exist, skipping +CREATE TEMP TABLE update_distr_key (a int, b int) DISTRIBUTED BY (a); +INSERT INTO update_distr_key select i, i* 10 from generate_series(0, 9) i; +UPDATE update_distr_key SET a = 5 WHERE b = 10; +SELECT * from update_distr_key; + a | b +---+---- + 0 | 0 + 2 | 20 + 4 | 40 + 6 | 60 + 8 | 80 + 5 | 10 + 3 | 30 + 5 | 50 + 7 | 70 + 9 | 90 +(10 rows) + +DROP TABLE update_distr_key; -- below cases is to test multi-hash-cols CREATE TABLE tab3(c1 int, c2 int, c3 int, c4 int, c5 int) DISTRIBUTED BY (c1, c2, c3); CREATE TABLE tab5(c1 int, c2 int, c3 int, c4 int, c5 int) DISTRIBUTED BY (c1, c2, c3, c4, c5); @@ -233,14 +239,14 @@ SELECT gp_segment_id, * FROM tab3; ---------------+----+----+----+----+---- 0 | 5 | 5 | 5 | 5 | 5 0 | 8 | 8 | 8 | 8 | 8 - 2 | 1 | 1 | 1 | 1 | 1 - 2 
| 2 | 2 | 2 | 2 | 2 - 2 | 6 | 6 | 6 | 6 | 6 - 2 | 7 | 7 | 7 | 7 | 7 1 | 3 | 3 | 3 | 3 | 3 1 | 4 | 4 | 4 | 4 | 4 1 | 9 | 9 | 9 | 9 | 9 1 | 10 | 10 | 10 | 10 | 10 + 2 | 1 | 1 | 1 | 1 | 1 + 2 | 2 | 2 | 2 | 2 | 2 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 (10 rows) UPDATE tab3 set c1 = 9 where c4 = 1; @@ -267,66 +273,69 @@ SELECT gp_segment_id, * FROM tab3; 1 | 4 | 4 | 4 | 4 | 4 1 | 9 | 9 | 9 | 9 | 9 1 | 10 | 10 | 10 | 10 | 10 + 0 | 5 | 5 | 5 | 5 | 5 + 0 | 8 | 8 | 8 | 8 | 8 2 | 2 | 2 | 2 | 2 | 2 2 | 6 | 6 | 6 | 6 | 6 2 | 7 | 7 | 7 | 7 | 7 2 | 5 | 6 | 1 | 1 | 1 - 0 | 5 | 5 | 5 | 5 | 5 - 0 | 8 | 8 | 8 | 8 | 8 (10 rows) UPDATE tab3 set (c1,c2,c3) = (3,2,1) where c4 = 1; SELECT gp_segment_id, * FROM tab3; gp_segment_id | c1 | c2 | c3 | c4 | c5 ---------------+----+----+----+----+---- - 0 | 5 | 5 | 5 | 5 | 5 - 0 | 8 | 8 | 8 | 8 | 8 - 1 | 3 | 3 | 3 | 3 | 3 - 1 | 4 | 4 | 4 | 4 | 4 - 1 | 9 | 9 | 9 | 9 | 9 - 1 | 10 | 10 | 10 | 10 | 10 2 | 2 | 2 | 2 | 2 | 2 2 | 6 | 6 | 6 | 6 | 6 2 | 7 | 7 | 7 | 7 | 7 2 | 3 | 2 | 1 | 1 | 1 + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 4 | 4 | 4 | 4 | 4 + 1 | 9 | 9 | 9 | 9 | 9 + 1 | 10 | 10 | 10 | 10 | 10 + 0 | 5 | 5 | 5 | 5 | 5 + 0 | 8 | 8 | 8 | 8 | 8 (10 rows) UPDATE tab3 set c1 = 11 where c2 = 10 and c2 < 1; SELECT gp_segment_id, * FROM tab3; gp_segment_id | c1 | c2 | c3 | c4 | c5 ---------------+----+----+----+----+---- - 1 | 3 | 3 | 3 | 3 | 3 - 1 | 4 | 4 | 4 | 4 | 4 - 1 | 9 | 9 | 9 | 9 | 9 - 1 | 10 | 10 | 10 | 10 | 10 0 | 5 | 5 | 5 | 5 | 5 0 | 8 | 8 | 8 | 8 | 8 2 | 2 | 2 | 2 | 2 | 2 2 | 6 | 6 | 6 | 6 | 6 2 | 7 | 7 | 7 | 7 | 7 2 | 3 | 2 | 1 | 1 | 1 + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 4 | 4 | 4 | 4 | 4 + 1 | 9 | 9 | 9 | 9 | 9 + 1 | 10 | 10 | 10 | 10 | 10 (10 rows) -- test tab5 SELECT gp_segment_id, * FROM tab5; gp_segment_id | c1 | c2 | c3 | c4 | c5 ---------------+----+----+----+----+---- - 0 | 4 | 4 | 4 | 4 | 4 - 0 | 9 | 9 | 9 | 9 | 9 - 0 | 10 | 10 | 10 | 10 | 10 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 2 | 8 | 8 | 8 | 8 | 8 1 | 1 | 1 | 1 | 1 | 1 1 | 
2 | 2 | 2 | 2 | 2 1 | 3 | 3 | 3 | 3 | 3 1 | 5 | 5 | 5 | 5 | 5 - 2 | 6 | 6 | 6 | 6 | 6 - 2 | 7 | 7 | 7 | 7 | 7 - 2 | 8 | 8 | 8 | 8 | 8 + 0 | 4 | 4 | 4 | 4 | 4 + 0 | 9 | 9 | 9 | 9 | 9 + 0 | 10 | 10 | 10 | 10 | 10 (10 rows) UPDATE tab5 set c1 = 1000 where c4 = 1; SELECT gp_segment_id, * FROM tab5; gp_segment_id | c1 | c2 | c3 | c4 | c5 ---------------+------+----+----+----+---- + 0 | 4 | 4 | 4 | 4 | 4 + 0 | 9 | 9 | 9 | 9 | 9 + 0 | 10 | 10 | 10 | 10 | 10 1 | 2 | 2 | 2 | 2 | 2 1 | 3 | 3 | 3 | 3 | 3 1 | 5 | 5 | 5 | 5 | 5 @@ -334,9 +343,6 @@ SELECT gp_segment_id, * FROM tab5; 2 | 6 | 6 | 6 | 6 | 6 2 | 7 | 7 | 7 | 7 | 7 2 | 8 | 8 | 8 | 8 | 8 - 0 | 4 | 4 | 4 | 4 | 4 - 0 | 9 | 9 | 9 | 9 | 9 - 0 | 10 | 10 | 10 | 10 | 10 (10 rows) UPDATE tab5 set (c1,c2) = (9,10) where c4 = 1; @@ -359,12 +365,12 @@ UPDATE tab5 set (c1,c2,c4) = (5,8,6) where c4 = 1; SELECT gp_segment_id, * FROM tab5; gp_segment_id | c1 | c2 | c3 | c4 | c5 ---------------+----+----+----+----+---- - 2 | 6 | 6 | 6 | 6 | 6 - 2 | 7 | 7 | 7 | 7 | 7 - 2 | 8 | 8 | 8 | 8 | 8 1 | 2 | 2 | 2 | 2 | 2 1 | 3 | 3 | 3 | 3 | 3 1 | 5 | 5 | 5 | 5 | 5 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 2 | 8 | 8 | 8 | 8 | 8 0 | 4 | 4 | 4 | 4 | 4 0 | 9 | 9 | 9 | 9 | 9 0 | 10 | 10 | 10 | 10 | 10 @@ -375,6 +381,9 @@ UPDATE tab5 set (c1,c2,c3,c4,c5) = (1,2,3,0,6) where c5 = 1; SELECT gp_segment_id, * FROM tab5; gp_segment_id | c1 | c2 | c3 | c4 | c5 ---------------+----+----+----+----+---- + 0 | 4 | 4 | 4 | 4 | 4 + 0 | 9 | 9 | 9 | 9 | 9 + 0 | 10 | 10 | 10 | 10 | 10 1 | 2 | 2 | 2 | 2 | 2 1 | 3 | 3 | 3 | 3 | 3 1 | 5 | 5 | 5 | 5 | 5 @@ -382,15 +391,15 @@ SELECT gp_segment_id, * FROM tab5; 2 | 6 | 6 | 6 | 6 | 6 2 | 7 | 7 | 7 | 7 | 7 2 | 8 | 8 | 8 | 8 | 8 - 0 | 4 | 4 | 4 | 4 | 4 - 0 | 9 | 9 | 9 | 9 | 9 - 0 | 10 | 10 | 10 | 10 | 10 (10 rows) UPDATE tab5 set c1 = 11 where c3 = 10 and c3 < 1; SELECT gp_segment_id, * FROM tab5; gp_segment_id | c1 | c2 | c3 | c4 | c5 ---------------+----+----+----+----+---- + 0 | 4 | 4 | 4 | 4 | 4 + 0 | 9 | 9 | 9 
| 9 | 9 + 0 | 10 | 10 | 10 | 10 | 10 1 | 2 | 2 | 2 | 2 | 2 1 | 3 | 3 | 3 | 3 | 3 1 | 5 | 5 | 5 | 5 | 5 @@ -398,12 +407,8 @@ SELECT gp_segment_id, * FROM tab5; 2 | 6 | 6 | 6 | 6 | 6 2 | 7 | 7 | 7 | 7 | 7 2 | 8 | 8 | 8 | 8 | 8 - 0 | 4 | 4 | 4 | 4 | 4 - 0 | 9 | 9 | 9 | 9 | 9 - 0 | 10 | 10 | 10 | 10 | 10 (10 rows) --- start_ignore EXPLAIN (COSTS OFF ) UPDATE tab3 SET C1 = C1 + 1, C5 = C5+1; QUERY PLAN --------------------------------------------------------------- @@ -414,7 +419,6 @@ EXPLAIN (COSTS OFF ) UPDATE tab3 SET C1 = C1 + 1, C5 = C5+1; Optimizer: Postgres query optimizer (5 rows) --- end_ignore -- clean up drop table tab3; drop table tab5; @@ -430,6 +434,12 @@ drop table if exists update_ao_table; NOTICE: table "update_ao_table" does not exist, skipping drop table if exists update_aoco_table; NOTICE: table "update_aoco_table" does not exist, skipping +drop table if exists p_1; +NOTICE: table "p_1" does not exist, skipping +drop table if exists p_2; +NOTICE: table "p_2" does not exist, skipping +drop table if exists subpartition_1; +NOTICE: table "subpartition_1" does not exist, skipping -- end_ignore -- Update normal table distribution key create table update_dist(a int) distributed by (a); @@ -449,43 +459,43 @@ insert into s select generate_series(1, 5), generate_series(1, 5) * 2; select * from r; a | b ---+---- - 1 | 2 - 5 | 10 - 2 | 4 3 | 6 4 | 8 + 5 | 10 + 1 | 2 + 2 | 4 (5 rows) select * from s; a | b ---+---- + 1 | 2 2 | 4 3 | 6 4 | 8 5 | 10 - 1 | 2 (5 rows) update r set a = r.a + 1 from s where r.a = s.a; select * from r; a | b ---+---- - 3 | 4 4 | 6 - 2 | 2 5 | 8 6 | 10 + 3 | 4 + 2 | 2 (5 rows) update r set a = r.a + 1 where a in (select a from s); select * from r; a | b ---+---- - 4 | 4 - 3 | 2 6 | 10 5 | 6 6 | 8 + 4 | 4 + 3 | 2 (5 rows) -- Update redistribution @@ -496,32 +506,32 @@ insert into s select generate_series(1, 5), generate_series(1, 5) * 2; select * from r; a | b ---+--- + 3 | 3 + 4 | 4 5 | 5 1 | 1 2 | 2 - 3 | 3 - 4 | 4 (5 rows) select * 
from s; a | b ---+---- - 1 | 2 - 5 | 10 - 2 | 4 3 | 6 4 | 8 + 5 | 10 + 1 | 2 + 2 | 4 (5 rows) update r set a = r.a + 1 from s where r.b = s.b; select * from r; a | b ---+--- - 3 | 3 - 3 | 2 1 | 1 + 3 | 3 5 | 5 5 | 4 + 3 | 2 (5 rows) update r set a = r.a + 1 where b in (select b from s); @@ -529,9 +539,9 @@ select * from r; a | b ---+--- 1 | 1 + 3 | 3 5 | 5 6 | 4 - 3 | 3 4 | 2 (5 rows) @@ -553,48 +563,82 @@ select * from r; select * from s; a | b ---+--- - 2 | 2 3 | 3 4 | 4 - 1 | 1 5 | 5 + 1 | 1 + 2 | 2 (5 rows) update s set a = s.a + 1 where exists (select 1 from r where s.a = r.b); select * from s; a | b ---+--- - 5 | 5 - 5 | 4 1 | 1 3 | 3 + 5 | 5 3 | 2 + 5 | 4 (5 rows) --- start_ignore -- Update ao table distribution key --- create table update_ao_table (a int, b int) WITH (appendonly=true) distributed by (a); --- insert into update_ao_table select g, g from generate_series(1, 5) g; --- select * from update_ao_table; --- update update_ao_table set a = a + 1 where b = 3; --- select * from update_ao_table; +create table update_ao_table (a int, b int) WITH (appendonly=true) distributed by (a); +insert into update_ao_table select g, g from generate_series(1, 5) g; +select * from update_ao_table; + a | b +---+--- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 +(5 rows) + +update update_ao_table set a = a + 1 where b = 3; +select * from update_ao_table; + a | b +---+--- + 1 | 1 + 2 | 2 + 4 | 4 + 5 | 5 + 4 | 3 +(5 rows) + -- Update aoco table distribution key --- create table update_aoco_table (a int, b int) WITH (appendonly=true, orientation=column) distributed by (a); --- insert into update_aoco_table select g,g from generate_series(1, 5) g; --- select * from update_aoco_table; --- update update_aoco_table set a = a + 1 where b = 3; --- select * from update_aoco_table; --- end_ignore +create table update_aoco_table (a int, b int) WITH (appendonly=true, orientation=column) distributed by (a); +insert into update_aoco_table select g,g from generate_series(1, 5) g; +select * 
from update_aoco_table; + a | b +---+--- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 +(5 rows) + +update update_aoco_table set a = a + 1 where b = 3; +select * from update_aoco_table; + a | b +---+--- + 1 | 1 + 2 | 2 + 4 | 4 + 5 | 5 + 4 | 3 +(5 rows) + -- Update prepare delete from s; insert into s select generate_series(1, 5), generate_series(1, 5); select * from r; a | b ---+---- + 1 | 2 2 | 4 3 | 6 4 | 8 - 1 | 2 5 | 10 (5 rows) @@ -602,10 +646,10 @@ select * from s; a | b ---+--- 1 | 1 - 5 | 5 2 | 2 3 | 3 4 | 4 + 5 | 5 (5 rows) prepare update_s(int) as update s set a = s.a + $1 where exists (select 1 from r where s.a = r.b); @@ -613,27 +657,26 @@ execute update_s(10); select * from s; a | b ----+--- - 3 | 3 1 | 1 - 12 | 2 - 5 | 5 14 | 4 + 3 | 3 + 5 | 5 + 12 | 2 (5 rows) -- Confirm that a split update is not created for a table excluded by -- constraints in the planner. create table nosplitupdate (a int) distributed by (a); --- start_ignore explain update nosplitupdate set a=0 where a=1 and a<1; QUERY PLAN ----------------------------------------------------------- - Update on nosplitupdate (cost=0.00..0.01 rows=0 width=0) - -> Result (cost=0.00..0.00 rows=0 width=46) + Update on nosplitupdate (cost=0.00..0.01 rows=1 width=0) + -> Result (cost=0.00..0.01 rows=1 width=0) One-Time Filter: false + Planning time: 0.271 ms Optimizer: Postgres query optimizer -(4 rows) +(5 rows) --- end_ignore -- test split-update when split-node's flow is entry create table tsplit_entry (c int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c' as the Cloudberry Database data distribution key for this table. @@ -641,53 +684,56 @@ HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. 
Make sur insert into tsplit_entry values (1), (2); analyze tsplit_entry; -- start_ignore +-- gp_segment_configuration scan is different when using different FTS explain update tsplit_entry set c = s.a from (select count(*) as a from gp_segment_configuration) s; QUERY PLAN ------------------------------------------------------------------------------------------------------------------ - Update on tsplit_entry (cost=10000000001.01..10000000002.17 rows=0 width=0) - -> Explicit Redistribute Motion 1:3 (slice1) (cost=10000000001.01..10000000002.17 rows=2 width=74) - -> Split (cost=10000000001.01..10000000002.08 rows=7 width=74) - -> Nested Loop (cost=10000000001.01..10000000002.08 rows=3 width=74) - -> Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..1.03 rows=1 width=38) - -> Seq Scan on tsplit_entry (cost=0.00..1.01 rows=1 width=38) - -> Materialize (cost=1.01..1.04 rows=1 width=40) - -> Subquery Scan on s (cost=1.01..1.03 rows=1 width=40) - -> Aggregate (cost=1.01..1.02 rows=1 width=8) - -> Seq Scan on gp_segment_configuration (cost=0.00..1.01 rows=1 width=0) + Update on tsplit_entry (cost=10000000001.00..10000000003.18 rows=3 width=54) + -> Explicit Redistribute Motion 1:3 (slice) (cost=10000000001.00..10000000003.18 rows=7 width=54) + -> Split (cost=10000000001.00..10000000003.18 rows=7 width=54) + -> Nested Loop (cost=10000000001.00..10000000003.12 rows=4 width=54) + -> Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..2.06 rows=2 width=14) + -> Seq Scan on tsplit_entry (cost=0.00..2.02 rows=1 width=14) + -> Materialize (cost=1.00..1.03 rows=1 width=40) + -> Subquery Scan on s (cost=1.00..1.02 rows=1 width=40) + -> Aggregate (cost=1.00..1.01 rows=1 width=8) + -> Function Scan on gp_get_segment_configuration (cost=0.00..1.00 rows=1 width=0) Optimizer: Postgres query optimizer (11 rows) - -- end_ignore update tsplit_entry set c = s.a from (select count(*) as a from gp_segment_configuration) s; --- start_ignore --- CREATE TABLE update_gp_foo ( --- a_dist int, 
--- b int, --- c_part int, --- d int --- ) --- WITH (appendonly=false) DISTRIBUTED BY (a_dist) PARTITION BY RANGE(c_part) --- ( --- PARTITION p20190305 START (1) END (2) WITH (tablename='update_gp_foo_1_prt_p20190305', appendonly=false) --- ); --- CREATE TABLE update_gp_foo1 ( --- a_dist int, --- b int, --- c_part int, --- d int --- ) --- WITH (appendonly=false) DISTRIBUTED BY (a_dist) PARTITION BY RANGE(c_part) --- ( --- PARTITION p20190305 START (1) END (2) WITH (tablename='update_gp_foo1_1_prt_p20190305', appendonly=false) --- ); --- INSERT INTO update_gp_foo VALUES (12, 40, 1, 50); --- INSERT INTO update_gp_foo1 VALUES (12, 3, 1, 50); --- UPDATE update_gp_foo --- SET b = update_gp_foo.c_part, --- d = update_gp_foo1.a_dist --- FROM update_gp_foo1; --- SELECT * from update_gp_foo; --- end_ignore +CREATE TABLE update_gp_foo ( + a_dist int, + b int, + c_part int, + d int +) +WITH (appendonly=false) DISTRIBUTED BY (a_dist) PARTITION BY RANGE(c_part) + ( + PARTITION p20190305 START (1) END (2) WITH (tablename='update_gp_foo_1_prt_p20190305', appendonly=false) + ); +CREATE TABLE update_gp_foo1 ( + a_dist int, + b int, + c_part int, + d int +) +WITH (appendonly=false) DISTRIBUTED BY (a_dist) PARTITION BY RANGE(c_part) + ( + PARTITION p20190305 START (1) END (2) WITH (tablename='update_gp_foo1_1_prt_p20190305', appendonly=false) + ); +INSERT INTO update_gp_foo VALUES (12, 40, 1, 50); +INSERT INTO update_gp_foo1 VALUES (12, 3, 1, 50); +UPDATE update_gp_foo +SET b = update_gp_foo.c_part, + d = update_gp_foo1.a_dist +FROM update_gp_foo1; +SELECT * from update_gp_foo; + a_dist | b | c_part | d +--------+---+--------+---- + 12 | 1 | 1 | 12 +(1 row) + -- Test insert on conflict do update -- Insert on conflict do update is an insert statement but might -- invoke ExecUpdate on segments, but updating distkeys of a table @@ -695,31 +741,24 @@ update tsplit_entry set c = s.a from (select count(*) as a from gp_segment_confi -- planning, if a `insert on conflict do update` statement 
set the -- dist keys of the table, it will raise an error. -- See github issue: https://github.com/greenplum-db/gpdb/issues/9444 --- start_ignore create table t_insert_on_conflict_update_distkey(a int, b int) distributed by (a); create unique index uidx_t_insert_on_conflict_update_distkey on t_insert_on_conflict_update_distkey(a, b); -ERROR: not supported on pax relations: IndexBuildRangeScan -- the following statement should error out because the on conflict update want to -- modify the tuple's distkey which might lead to wrong data distribution insert into t_insert_on_conflict_update_distkey values (1, 1) on conflict(a, b) do update set a = 1; ERROR: modification of distribution columns in OnConflictUpdate is not supported drop index uidx_t_insert_on_conflict_update_distkey; -ERROR: index "uidx_t_insert_on_conflict_update_distkey" does not exist drop table t_insert_on_conflict_update_distkey; -- randomly distributed table cannot add unique constrain, so next we test replicated table create table t_insert_on_conflict_update_distkey(a int, b int) distributed replicated; create unique index uidx_t_insert_on_conflict_update_distkey on t_insert_on_conflict_update_distkey(a, b); -ERROR: not supported on pax relations: IndexBuildRangeScan -- the following statement should succeed because replicated table does not contain distkey insert into t_insert_on_conflict_update_distkey values (1, 1) on conflict(a, b) do update set a = 1; -ERROR: there is no unique or exclusion constraint matching the ON CONFLICT specification --- end_ignore +ERROR: not implemented yet on pax relations: TupleInsertSpeculative -- Some tests on a partitioned table. 
CREATE TABLE update_gp_rangep (a int, b int, orig_a int) DISTRIBUTED BY (b) PARTITION BY RANGE (a); CREATE TABLE update_gp_rangep_1_to_10 PARTITION OF update_gp_rangep FOR VALUES FROM (1) TO (10); -NOTICE: table has parent, setting distribution columns to match parent table CREATE TABLE update_gp_rangep_10_to_20 PARTITION OF update_gp_rangep FOR VALUES FROM (10) TO (20); -NOTICE: table has parent, setting distribution columns to match parent table INSERT INTO update_gp_rangep SELECT g, g, g FROM generate_series(1, 4) g; -- Simple case: Same partition, same node. UPDATE update_gp_rangep SET a = 9 WHERE a = 1; @@ -729,24 +768,23 @@ UPDATE update_gp_rangep SET b = 1 WHERE a = 2; UPDATE update_gp_rangep SET a = 10 WHERE a = 3; -- Move row to different partition and also change distribution key UPDATE update_gp_rangep SET a = 11, b = 1 WHERE a = 4; --- start_ignore SELECT tableoid::regclass, * FROM update_gp_rangep ORDER BY orig_a; tableoid | a | b | orig_a ---------------------------+----+---+-------- update_gp_rangep_1_to_10 | 9 | 1 | 1 update_gp_rangep_1_to_10 | 2 | 1 | 2 + update_gp_rangep_10_to_20 | 10 | 3 | 3 update_gp_rangep_10_to_20 | 11 | 1 | 4 -(3 rows) +(4 rows) --- end_ignore -- Also do a lookup with specific distribution key. If the rows were not -- correctly moved across segments, this would fail to find them, assuming -- that direct dispatch is effective. 
SELECT tableoid::regclass, * FROM update_gp_rangep WHERE b = 1; tableoid | a | b | orig_a ---------------------------+----+---+-------- - update_gp_rangep_1_to_10 | 9 | 1 | 1 update_gp_rangep_1_to_10 | 2 | 1 | 2 + update_gp_rangep_1_to_10 | 9 | 1 | 1 update_gp_rangep_10_to_20 | 11 | 1 | 4 (3 rows) @@ -755,9 +793,7 @@ drop table r; drop table s; drop table update_dist; drop table update_ao_table; -ERROR: table "update_ao_table" does not exist drop table update_aoco_table; -ERROR: table "update_aoco_table" does not exist drop table nosplitupdate; drop table tsplit_entry; -- end_ignore diff --git a/contrib/pax_storage/expected/update_gp_optimizer.out b/contrib/pax_storage/expected/update_gp_optimizer.out new file mode 100644 index 00000000000..d335f4c3f61 --- /dev/null +++ b/contrib/pax_storage/expected/update_gp_optimizer.out @@ -0,0 +1,793 @@ +set default_table_access_method = pax; +-- Test DELETE and UPDATE on an inherited table. +-- The special aspect of this table is that the inherited table has +-- a different distribution key. 'p' table's distribution key matches +-- that of 'r', but 'p2's doesn't. Test that the planner adds a Motion +-- node correctly for p2. 
+create table todelete (a int) distributed by (a); +create table parent (a int, b int, c int) distributed by (a); +create table child (a int, b int, c int) inherits (parent) distributed by (b); +NOTICE: merging column "a" with inherited definition +NOTICE: merging column "b" with inherited definition +NOTICE: merging column "c" with inherited definition +insert into parent select g, g, g from generate_series(1,5) g; +insert into child select g, g, g from generate_series(6,10) g; +insert into todelete select generate_series(3,4); +delete from parent using todelete where parent.a = todelete.a; +insert into todelete select generate_series(5,7); +update parent set c=c+100 from todelete where parent.a = todelete.a; +select * from parent; + a | b | c +----+----+----- + 1 | 1 | 1 + 5 | 5 | 105 + 9 | 9 | 9 + 7 | 7 | 107 + 2 | 2 | 2 + 8 | 8 | 8 + 10 | 10 | 10 + 6 | 6 | 106 +(8 rows) + +drop table todelete; +drop table child; +drop table parent; +-- This is similar to the above, but with a partitioned table (which is +-- implemented by inheritance) rather than an explicitly inherited table. +-- The scans on some of the partitions degenerate into Result nodes with +-- False one-time filter, which don't need a Motion node. 
+create table todelete (a int, b int) distributed by (a); +create table target (a int, b int, c int) + distributed by (a) + partition by range (c) (start(1) end(5) every(1), default partition extra); +insert into todelete select g, g % 4 from generate_series(1, 10) g; +insert into target select g, 0, 3 from generate_series(1, 5) g; +insert into target select g, 0, 1 from generate_series(1, 5) g; +delete from target where c = 3 and a in (select b from todelete); +insert into todelete values (1, 5); +update target set b=target.b+100 where c = 3 and a in (select b from todelete); +select * from target; + a | b | c +---+-----+--- + 2 | 0 | 1 + 4 | 0 | 1 + 4 | 0 | 3 + 1 | 0 | 1 + 3 | 0 | 1 + 5 | 0 | 1 + 5 | 100 | 3 +(7 rows) + +-- Also test an update with a qual that doesn't match any partition. The +-- Append degenerates into a dummy Result with false One-Time Filter. +alter table target drop default partition; +update target set b = 10 where c = 10; +drop table todelete; +drop table target; +-- +-- Test updated on inheritance parent table, where some child tables need a +-- Split Update, but not all. +-- +create table base_tbl (a int4, b int4) distributed by (a); +create table child_a (a int4, b int4) inherits (base_tbl) distributed by (a); +NOTICE: merging column "a" with inherited definition +NOTICE: merging column "b" with inherited definition +create table child_b (a int4, b int4) inherits (base_tbl) distributed by (b); +NOTICE: merging column "a" with inherited definition +NOTICE: merging column "b" with inherited definition +insert into base_tbl select g, g from generate_series(1, 5) g; +explain (costs off) update base_tbl set a=a+1; +ERROR: can't split update for inherit table: base_tbl +update base_tbl set a = 5; +ERROR: can't split update for inherit table: base_tbl +-- +-- Explicit Distribution motion must be added if any of the child nodes +-- contains any motion excluding the motions in initplans. 
+-- These test cases and expectation are applicable for GPDB planner not for ORCA. +-- +SET gp_autostats_mode = NONE; +CREATE TABLE keo1 ( user_vie_project_code_pk character varying(24), user_vie_fiscal_year_period_sk character varying(24), user_vie_act_cntr_marg_cum character varying(24)) DISTRIBUTED RANDOMLY; +INSERT INTO keo1 VALUES ('1', '1', '1'); +CREATE TABLE keo2 ( projects_pk character varying(24)) DISTRIBUTED RANDOMLY; +INSERT INTO keo2 VALUES ('1'); +CREATE TABLE keo3 ( sky_per character varying(24), bky_per character varying(24)) DISTRIBUTED BY (sky_per); +INSERT INTO keo3 VALUES ('1', '1'); +CREATE TABLE keo4 ( keo_para_required_period character varying(6), keo_para_budget_date character varying(24)) DISTRIBUTED RANDOMLY; +INSERT INTO keo4 VALUES ('1', '1'); +ANALYZE keo1, keo2, keo3, keo4; +-- Explicit Redistribution motion should be added in case of GPDB Planner (test case not applicable for ORCA) +EXPLAIN (COSTS OFF) UPDATE keo1 SET user_vie_act_cntr_marg_cum = 234.682 FROM + ( SELECT a.user_vie_project_code_pk FROM keo1 a INNER JOIN keo2 b + ON b.projects_pk=a.user_vie_project_code_pk + WHERE a.user_vie_fiscal_year_period_sk = + (SELECT MAX (sky_per) FROM keo3 WHERE bky_per = + (SELECT keo4.keo_para_required_period FROM keo4 WHERE keo_para_budget_date = + (SELECT min (keo4.keo_para_budget_date) FROM keo4))) + ) t1 +WHERE t1.user_vie_project_code_pk = keo1.user_vie_project_code_pk; + QUERY PLAN +------------------------------------------------------------------------------------------------------- + Update on keo1 + InitPlan 3 (returns $2) (slice3) + -> Aggregate + InitPlan 2 (returns $1) (slice5) + -> Gather Motion 3:1 (slice6; segments: 3) + InitPlan 1 (returns $0) (slice7) + -> Aggregate + -> Gather Motion 3:1 (slice8; segments: 3) + -> Seq Scan on keo4 + -> Seq Scan on keo4 keo4_1 + Filter: ((keo_para_budget_date)::text = $0) + -> Gather Motion 3:1 (slice4; segments: 3) + -> Seq Scan on keo3 + Filter: ((bky_per)::text = ($1)::text) + -> Hash 
Join + Hash Cond: ((a.user_vie_project_code_pk)::text = (b.projects_pk)::text) + -> Hash Join + Hash Cond: ((a.user_vie_project_code_pk)::text = (keo1.user_vie_project_code_pk)::text) + -> Broadcast Motion 3:3 (slice1; segments: 3) + -> Seq Scan on keo1 a + Filter: ((user_vie_fiscal_year_period_sk)::text = $2) + -> Hash + -> Seq Scan on keo1 + -> Hash + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Seq Scan on keo2 b + Optimizer: Postgres query optimizer +(27 rows) + +UPDATE keo1 SET user_vie_act_cntr_marg_cum = 234.682 FROM + ( SELECT a.user_vie_project_code_pk FROM keo1 a INNER JOIN keo2 b + ON b.projects_pk=a.user_vie_project_code_pk + WHERE a.user_vie_fiscal_year_period_sk = + (SELECT MAX (sky_per) FROM keo3 WHERE bky_per = + (SELECT keo4.keo_para_required_period FROM keo4 WHERE keo_para_budget_date = + (SELECT min (keo4.keo_para_budget_date) FROM keo4))) + ) t1 +WHERE t1.user_vie_project_code_pk = keo1.user_vie_project_code_pk; +SELECT user_vie_act_cntr_marg_cum FROM keo1; + user_vie_act_cntr_marg_cum +---------------------------- + 234.682 +(1 row) + +-- Explicit Redistribution motion should not be added in case of GPDB Planner (test case not applicable to ORCA) +CREATE TABLE keo5 (x int, y int) DISTRIBUTED BY (x); +INSERT INTO keo5 VALUES (1,1); +EXPLAIN (COSTS OFF) DELETE FROM keo5 WHERE x IN (SELECT x FROM keo5 WHERE EXISTS (SELECT x FROM keo5 WHERE x < 2)); + QUERY PLAN +------------------------------------------------------- + Delete on keo5 + InitPlan 1 (returns $0) (slice1) + -> Gather Motion 3:1 (slice2; segments: 3) + -> Seq Scan on keo5 keo5_2 + Filter: (x < 2) + -> Result + One-Time Filter: $0 + -> Hash Join + Hash Cond: (keo5.x = keo5_1.x) + -> Seq Scan on keo5 + -> Hash + -> HashAggregate + Group Key: keo5_1.x + -> Seq Scan on keo5 keo5_1 + Optimizer: Postgres query optimizer +(15 rows) + +DELETE FROM keo5 WHERE x IN (SELECT x FROM keo5 WHERE EXISTS (SELECT x FROM keo5 WHERE x < 2)); +SELECT x FROM keo5; + x +--- +(0 rows) + +RESET 
gp_autostats_mode; +DROP TABLE keo1; +DROP TABLE keo2; +DROP TABLE keo3; +DROP TABLE keo4; +DROP TABLE keo5; +-- +-- text types. We should support the following updates. +-- +CREATE TEMP TABLE ttab1 (a varchar(15), b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a varchar(15), b integer) DISTRIBUTED BY (a); +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +DROP TABLE ttab1; +DROP TABLE ttab2; +CREATE TEMP TABLE ttab1 (a text, b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a text, b integer) DISTRIBUTED BY (a); +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +DROP TABLE ttab1; +DROP TABLE ttab2; +CREATE TEMP TABLE ttab1 (a varchar, b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a varchar, b integer) DISTRIBUTED BY (a); +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +DROP TABLE ttab1; +DROP TABLE ttab2; +CREATE TEMP TABLE ttab1 (a char(15), b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a char(15), b integer) DISTRIBUTED BY (a); +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +DROP TABLE IF EXISTS update_distr_key; +NOTICE: table "update_distr_key" does not exist, skipping +CREATE TEMP TABLE update_distr_key (a int, b int) DISTRIBUTED BY (a); +INSERT INTO update_distr_key select i, i* 10 from generate_series(0, 9) i; +UPDATE update_distr_key SET a = 5 WHERE b = 10; +SELECT * from update_distr_key; + a | b +---+---- + 0 | 0 + 2 | 20 + 4 | 40 + 6 | 60 + 8 | 80 + 5 | 10 + 3 | 30 + 5 | 50 + 7 | 70 + 9 | 90 +(10 rows) + +DROP TABLE update_distr_key; +-- below cases is to test multi-hash-cols +CREATE TABLE tab3(c1 int, c2 int, c3 int, c4 int, c5 int) DISTRIBUTED BY (c1, c2, c3); +CREATE TABLE tab5(c1 int, c2 int, c3 int, c4 int, c5 int) DISTRIBUTED BY (c1, c2, c3, c4, c5); +INSERT INTO tab3 SELECT i, i, i, i, i FROM generate_series(1, 10)i; +INSERT INTO tab5 SELECT i, i, i, i, i FROM generate_series(1, 10)i; +-- test tab3 +SELECT gp_segment_id, * FROM tab3; + 
gp_segment_id | c1 | c2 | c3 | c4 | c5 +---------------+----+----+----+----+---- + 0 | 5 | 5 | 5 | 5 | 5 + 0 | 8 | 8 | 8 | 8 | 8 + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 4 | 4 | 4 | 4 | 4 + 1 | 9 | 9 | 9 | 9 | 9 + 1 | 10 | 10 | 10 | 10 | 10 + 2 | 1 | 1 | 1 | 1 | 1 + 2 | 2 | 2 | 2 | 2 | 2 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 +(10 rows) + +UPDATE tab3 set c1 = 9 where c4 = 1; +SELECT gp_segment_id, * FROM tab3; + gp_segment_id | c1 | c2 | c3 | c4 | c5 +---------------+----+----+----+----+---- + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 4 | 4 | 4 | 4 | 4 + 1 | 9 | 9 | 9 | 9 | 9 + 1 | 10 | 10 | 10 | 10 | 10 + 2 | 2 | 2 | 2 | 2 | 2 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 0 | 5 | 5 | 5 | 5 | 5 + 0 | 8 | 8 | 8 | 8 | 8 + 0 | 9 | 1 | 1 | 1 | 1 +(10 rows) + +UPDATE tab3 set (c1,c2) = (5,6) where c4 = 1; +SELECT gp_segment_id, * FROM tab3; + gp_segment_id | c1 | c2 | c3 | c4 | c5 +---------------+----+----+----+----+---- + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 4 | 4 | 4 | 4 | 4 + 1 | 9 | 9 | 9 | 9 | 9 + 1 | 10 | 10 | 10 | 10 | 10 + 0 | 5 | 5 | 5 | 5 | 5 + 0 | 8 | 8 | 8 | 8 | 8 + 2 | 2 | 2 | 2 | 2 | 2 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 2 | 5 | 6 | 1 | 1 | 1 +(10 rows) + +UPDATE tab3 set (c1,c2,c3) = (3,2,1) where c4 = 1; +SELECT gp_segment_id, * FROM tab3; + gp_segment_id | c1 | c2 | c3 | c4 | c5 +---------------+----+----+----+----+---- + 2 | 2 | 2 | 2 | 2 | 2 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 2 | 3 | 2 | 1 | 1 | 1 + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 4 | 4 | 4 | 4 | 4 + 1 | 9 | 9 | 9 | 9 | 9 + 1 | 10 | 10 | 10 | 10 | 10 + 0 | 5 | 5 | 5 | 5 | 5 + 0 | 8 | 8 | 8 | 8 | 8 +(10 rows) + +UPDATE tab3 set c1 = 11 where c2 = 10 and c2 < 1; +SELECT gp_segment_id, * FROM tab3; + gp_segment_id | c1 | c2 | c3 | c4 | c5 +---------------+----+----+----+----+---- + 0 | 5 | 5 | 5 | 5 | 5 + 0 | 8 | 8 | 8 | 8 | 8 + 2 | 2 | 2 | 2 | 2 | 2 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 2 | 3 | 2 | 1 | 1 | 1 + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 4 | 4 | 4 | 4 | 4 + 1 | 9 | 9 | 9 | 9 | 9 + 1 | 
10 | 10 | 10 | 10 | 10 +(10 rows) + +-- test tab5 +SELECT gp_segment_id, * FROM tab5; + gp_segment_id | c1 | c2 | c3 | c4 | c5 +---------------+----+----+----+----+---- + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 2 | 8 | 8 | 8 | 8 | 8 + 1 | 1 | 1 | 1 | 1 | 1 + 1 | 2 | 2 | 2 | 2 | 2 + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 5 | 5 | 5 | 5 | 5 + 0 | 4 | 4 | 4 | 4 | 4 + 0 | 9 | 9 | 9 | 9 | 9 + 0 | 10 | 10 | 10 | 10 | 10 +(10 rows) + +UPDATE tab5 set c1 = 1000 where c4 = 1; +SELECT gp_segment_id, * FROM tab5; + gp_segment_id | c1 | c2 | c3 | c4 | c5 +---------------+------+----+----+----+---- + 0 | 4 | 4 | 4 | 4 | 4 + 0 | 9 | 9 | 9 | 9 | 9 + 0 | 10 | 10 | 10 | 10 | 10 + 1 | 2 | 2 | 2 | 2 | 2 + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 5 | 5 | 5 | 5 | 5 + 1 | 1000 | 1 | 1 | 1 | 1 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 2 | 8 | 8 | 8 | 8 | 8 +(10 rows) + +UPDATE tab5 set (c1,c2) = (9,10) where c4 = 1; +SELECT gp_segment_id, * FROM tab5; + gp_segment_id | c1 | c2 | c3 | c4 | c5 +---------------+----+----+----+----+---- + 1 | 2 | 2 | 2 | 2 | 2 + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 5 | 5 | 5 | 5 | 5 + 0 | 4 | 4 | 4 | 4 | 4 + 0 | 9 | 9 | 9 | 9 | 9 + 0 | 10 | 10 | 10 | 10 | 10 + 0 | 9 | 10 | 1 | 1 | 1 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 2 | 8 | 8 | 8 | 8 | 8 +(10 rows) + +UPDATE tab5 set (c1,c2,c4) = (5,8,6) where c4 = 1; +SELECT gp_segment_id, * FROM tab5; + gp_segment_id | c1 | c2 | c3 | c4 | c5 +---------------+----+----+----+----+---- + 1 | 2 | 2 | 2 | 2 | 2 + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 5 | 5 | 5 | 5 | 5 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 2 | 8 | 8 | 8 | 8 | 8 + 0 | 4 | 4 | 4 | 4 | 4 + 0 | 9 | 9 | 9 | 9 | 9 + 0 | 10 | 10 | 10 | 10 | 10 + 0 | 5 | 8 | 1 | 6 | 1 +(10 rows) + +UPDATE tab5 set (c1,c2,c3,c4,c5) = (1,2,3,0,6) where c5 = 1; +SELECT gp_segment_id, * FROM tab5; + gp_segment_id | c1 | c2 | c3 | c4 | c5 +---------------+----+----+----+----+---- + 0 | 4 | 4 | 4 | 4 | 4 + 0 | 9 | 9 | 9 | 9 | 9 + 0 | 10 | 10 | 10 | 10 | 10 + 1 | 2 | 2 | 2 | 2 | 2 + 1 | 3 | 3 
| 3 | 3 | 3 + 1 | 5 | 5 | 5 | 5 | 5 + 1 | 1 | 2 | 3 | 0 | 6 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 2 | 8 | 8 | 8 | 8 | 8 +(10 rows) + +UPDATE tab5 set c1 = 11 where c3 = 10 and c3 < 1; +SELECT gp_segment_id, * FROM tab5; + gp_segment_id | c1 | c2 | c3 | c4 | c5 +---------------+----+----+----+----+---- + 0 | 4 | 4 | 4 | 4 | 4 + 0 | 9 | 9 | 9 | 9 | 9 + 0 | 10 | 10 | 10 | 10 | 10 + 1 | 2 | 2 | 2 | 2 | 2 + 1 | 3 | 3 | 3 | 3 | 3 + 1 | 5 | 5 | 5 | 5 | 5 + 1 | 1 | 2 | 3 | 0 | 6 + 2 | 6 | 6 | 6 | 6 | 6 + 2 | 7 | 7 | 7 | 7 | 7 + 2 | 8 | 8 | 8 | 8 | 8 +(10 rows) + +EXPLAIN (COSTS OFF ) UPDATE tab3 SET C1 = C1 + 1, C5 = C5+1; + QUERY PLAN +--------------------------------------------------------------- + Update on tab3 + -> Explicit Redistribute Motion 3:3 (slice1; segments: 3) + -> Split + -> Seq Scan on tab3 + Optimizer: Postgres query optimizer +(5 rows) + +-- clean up +drop table tab3; +drop table tab5; +-- Update distribution key +-- start_ignore +drop table if exists r; +NOTICE: table "r" does not exist, skipping +drop table if exists s; +NOTICE: table "s" does not exist, skipping +drop table if exists update_dist; +NOTICE: table "update_dist" does not exist, skipping +drop table if exists update_ao_table; +NOTICE: table "update_ao_table" does not exist, skipping +drop table if exists update_aoco_table; +NOTICE: table "update_aoco_table" does not exist, skipping +-- end_ignore +-- Update normal table distribution key +create table update_dist(a int) distributed by (a); +insert into update_dist values(1); +update update_dist set a=0 where a=1; +select * from update_dist; + a +--- + 0 +(1 row) + +-- Update distribution key with join +create table r (a int, b int) distributed by (a); +create table s (a int, b int) distributed by (a); +insert into r select generate_series(1, 5), generate_series(1, 5) * 2; +insert into s select generate_series(1, 5), generate_series(1, 5) * 2; +select * from r; + a | b +---+---- + 3 | 6 + 4 | 8 + 5 | 10 + 1 | 2 + 2 | 4 +(5 rows) 
+ +select * from s; + a | b +---+---- + 1 | 2 + 2 | 4 + 3 | 6 + 4 | 8 + 5 | 10 +(5 rows) + +update r set a = r.a + 1 from s where r.a = s.a; +select * from r; + a | b +---+---- + 4 | 6 + 5 | 8 + 6 | 10 + 3 | 4 + 2 | 2 +(5 rows) + +update r set a = r.a + 1 where a in (select a from s); +select * from r; + a | b +---+---- + 6 | 10 + 5 | 6 + 6 | 8 + 4 | 4 + 3 | 2 +(5 rows) + +-- Update redistribution +delete from r; +delete from s; +insert into r select generate_series(1, 5), generate_series(1, 5); +insert into s select generate_series(1, 5), generate_series(1, 5) * 2; +select * from r; + a | b +---+--- + 3 | 3 + 4 | 4 + 5 | 5 + 1 | 1 + 2 | 2 +(5 rows) + +select * from s; + a | b +---+---- + 3 | 6 + 4 | 8 + 5 | 10 + 1 | 2 + 2 | 4 +(5 rows) + +update r set a = r.a + 1 from s where r.b = s.b; +select * from r; + a | b +---+--- + 1 | 1 + 3 | 3 + 5 | 5 + 5 | 4 + 3 | 2 +(5 rows) + +update r set a = r.a + 1 where b in (select b from s); +select * from r; + a | b +---+--- + 1 | 1 + 3 | 3 + 5 | 5 + 6 | 4 + 4 | 2 +(5 rows) + +-- Update hash aggreate group by +delete from r; +delete from s; +insert into r select generate_series(1, 5), generate_series(1, 5) * 2; +insert into s select generate_series(1, 5), generate_series(1, 5); +select * from r; + a | b +---+---- + 1 | 2 + 2 | 4 + 3 | 6 + 4 | 8 + 5 | 10 +(5 rows) + +select * from s; + a | b +---+--- + 3 | 3 + 4 | 4 + 5 | 5 + 1 | 1 + 2 | 2 +(5 rows) + +update s set a = s.a + 1 where exists (select 1 from r where s.a = r.b); +select * from s; + a | b +---+--- + 1 | 1 + 3 | 3 + 5 | 5 + 3 | 2 + 5 | 4 +(5 rows) + +-- Update ao table distribution key +create table update_ao_table (a int, b int) WITH (appendonly=true) distributed by (a); +insert into update_ao_table select g, g from generate_series(1, 5) g; +select * from update_ao_table; + a | b +---+--- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 +(5 rows) + +update update_ao_table set a = a + 1 where b = 3; +select * from update_ao_table; + a | b +---+--- + 1 | 1 + 2 | 2 + 4 | 4 + 5 | 
5 + 4 | 3 +(5 rows) + +-- Update aoco table distribution key +create table update_aoco_table (a int, b int) WITH (appendonly=true, orientation=column) distributed by (a); +insert into update_aoco_table select g,g from generate_series(1, 5) g; +select * from update_aoco_table; + a | b +---+--- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 +(5 rows) + +update update_aoco_table set a = a + 1 where b = 3; +select * from update_aoco_table; + a | b +---+--- + 1 | 1 + 2 | 2 + 4 | 4 + 5 | 5 + 4 | 3 +(5 rows) + +-- Update prepare +delete from s; +insert into s select generate_series(1, 5), generate_series(1, 5); +select * from r; + a | b +---+---- + 1 | 2 + 2 | 4 + 3 | 6 + 4 | 8 + 5 | 10 +(5 rows) + +select * from s; + a | b +---+--- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 +(5 rows) + +prepare update_s(int) as update s set a = s.a + $1 where exists (select 1 from r where s.a = r.b); +execute update_s(10); +select * from s; + a | b +----+--- + 1 | 1 + 14 | 4 + 3 | 3 + 5 | 5 + 12 | 2 +(5 rows) + +-- Confirm that a split update is not created for a table excluded by +-- constraints in the planner. +create table nosplitupdate (a int) distributed by (a); +explain update nosplitupdate set a=0 where a=1 and a<1; + QUERY PLAN +----------------------------------------------------------- + Update on nosplitupdate (cost=0.00..0.01 rows=1 width=0) + -> Result (cost=0.00..0.01 rows=1 width=0) + One-Time Filter: false + Planning time: 0.271 ms + Optimizer: Postgres query optimizer +(5 rows) + +-- test split-update when split-node's flow is entry +create table tsplit_entry (c int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
+insert into tsplit_entry values (1), (2); +analyze tsplit_entry; +-- start_ignore +-- gp_segment_configuration scan is different when using different FTS +explain update tsplit_entry set c = s.a from (select count(*) as a from gp_segment_configuration) s; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------ + Update on tsplit_entry (cost=10000000001.00..10000000003.18 rows=3 width=54) + -> Explicit Redistribute Motion 1:3 (slice) (cost=10000000001.00..10000000003.18 rows=7 width=54) + -> Split (cost=10000000001.00..10000000003.18 rows=7 width=54) + -> Nested Loop (cost=10000000001.00..10000000003.12 rows=4 width=54) + -> Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..2.06 rows=2 width=14) + -> Seq Scan on tsplit_entry (cost=0.00..2.02 rows=1 width=14) + -> Materialize (cost=1.00..1.03 rows=1 width=40) + -> Subquery Scan on s (cost=1.00..1.02 rows=1 width=40) + -> Aggregate (cost=1.00..1.01 rows=1 width=8) + -> Function Scan on gp_get_segment_configuration (cost=0.00..1.00 rows=1 width=0) + Optimizer: Postgres query optimizer +(11 rows) +-- end_ignore +update tsplit_entry set c = s.a from (select count(*) as a from gp_segment_configuration) s; +CREATE TABLE update_gp_foo ( + a_dist int, + b int, + c_part int, + d int +) +WITH (appendonly=false) DISTRIBUTED BY (a_dist) PARTITION BY RANGE(c_part) + ( + PARTITION p20190305 START (1) END (2) WITH (tablename='update_gp_foo_1_prt_p20190305', appendonly=false) + ); +CREATE TABLE update_gp_foo1 ( + a_dist int, + b int, + c_part int, + d int +) +WITH (appendonly=false) DISTRIBUTED BY (a_dist) PARTITION BY RANGE(c_part) + ( + PARTITION p20190305 START (1) END (2) WITH (tablename='update_gp_foo1_1_prt_p20190305', appendonly=false) + ); +INSERT INTO update_gp_foo VALUES (12, 40, 1, 50); +INSERT INTO update_gp_foo1 VALUES (12, 3, 1, 50); +UPDATE update_gp_foo +SET b = update_gp_foo.c_part, + d = update_gp_foo1.a_dist +FROM update_gp_foo1; +SELECT * 
from update_gp_foo; + a_dist | b | c_part | d +--------+---+--------+---- + 12 | 1 | 1 | 12 +(1 row) + +-- Test insert on conflict do update +-- Insert on conflict do update is an insert statement but might +-- invoke ExecUpdate on segments, but updating distkeys of a table +-- may lead to wrong data distribution. We will check this before +-- planning, if a `insert on conflict do update` statement set the +-- dist keys of the table, it will raise an error. +-- See github issue: https://github.com/greenplum-db/gpdb/issues/9444 +create table t_insert_on_conflict_update_distkey(a int, b int) distributed by (a); +create unique index uidx_t_insert_on_conflict_update_distkey on t_insert_on_conflict_update_distkey(a, b); +-- the following statement should error out because the on conflict update want to +-- modify the tuple's distkey which might lead to wrong data distribution +insert into t_insert_on_conflict_update_distkey values (1, 1) on conflict(a, b) do update set a = 1; +ERROR: modification of distribution columns in OnConflictUpdate is not supported +drop index uidx_t_insert_on_conflict_update_distkey; +drop table t_insert_on_conflict_update_distkey; +-- randomly distributed table cannot add unique constrain, so next we test replicated table +create table t_insert_on_conflict_update_distkey(a int, b int) distributed replicated; +create unique index uidx_t_insert_on_conflict_update_distkey on t_insert_on_conflict_update_distkey(a, b); +-- the following statement should succeed because replicated table does not contain distkey +insert into t_insert_on_conflict_update_distkey values (1, 1) on conflict(a, b) do update set a = 1; +ERROR: not implemented yet on pax relations: TupleInsertSpeculative +-- Some tests on a partitioned table. 
+CREATE TABLE update_gp_rangep (a int, b int, orig_a int) DISTRIBUTED BY (b) PARTITION BY RANGE (a); +CREATE TABLE update_gp_rangep_1_to_10 PARTITION OF update_gp_rangep FOR VALUES FROM (1) TO (10); +CREATE TABLE update_gp_rangep_10_to_20 PARTITION OF update_gp_rangep FOR VALUES FROM (10) TO (20); +INSERT INTO update_gp_rangep SELECT g, g, g FROM generate_series(1, 4) g; +-- Simple case: Same partition, same node. +UPDATE update_gp_rangep SET a = 9 WHERE a = 1; +-- Distribution key update, same partition. +UPDATE update_gp_rangep SET b = 1 WHERE a = 2; +-- Move row to different partition, but no change in distribution key +UPDATE update_gp_rangep SET a = 10 WHERE a = 3; +-- Move row to different partition and also change distribution key +UPDATE update_gp_rangep SET a = 11, b = 1 WHERE a = 4; +SELECT tableoid::regclass, * FROM update_gp_rangep ORDER BY orig_a; + tableoid | a | b | orig_a +---------------------------+----+---+-------- + update_gp_rangep_1_to_10 | 9 | 1 | 1 + update_gp_rangep_1_to_10 | 2 | 1 | 2 + update_gp_rangep_10_to_20 | 10 | 3 | 3 + update_gp_rangep_10_to_20 | 11 | 1 | 4 +(4 rows) + +-- Also do a lookup with specific distribution key. If the rows were not +-- correctly moved across segments, this would fail to find them, assuming +-- that direct dispatch is effective. 
+SELECT tableoid::regclass, * FROM update_gp_rangep WHERE b = 1; + tableoid | a | b | orig_a +---------------------------+----+---+-------- + update_gp_rangep_1_to_10 | 2 | 1 | 2 + update_gp_rangep_1_to_10 | 9 | 1 | 1 + update_gp_rangep_10_to_20 | 11 | 1 | 4 +(3 rows) + +-- start_ignore +drop table r; +drop table s; +drop table update_dist; +drop table update_ao_table; +drop table update_aoco_table; +drop table nosplitupdate; +drop table tsplit_entry; +-- end_ignore diff --git a/contrib/pax_storage/hd-ci/clang_tidy_pax.bash b/contrib/pax_storage/hd-ci/clang_tidy_pax.bash index 6d94247c41d..108c4bdfaab 100644 --- a/contrib/pax_storage/hd-ci/clang_tidy_pax.bash +++ b/contrib/pax_storage/hd-ci/clang_tidy_pax.bash @@ -16,7 +16,7 @@ function do_git_diff() { exit 0 fi - modified_files=$(git diff --name-only $CBDB_PAX_DEV_BRANCH) + modified_files=$(git diff --name-only $CBDB_PAX_DEV_BRANCH -- ':!icw_test') for extension in "${CBDB_PAX_EXT[@]}"; do if echo "$modified_files" | grep -E -e "$extension" | grep -q -v "$CBDB_PAXC_GREP"; then files=$(echo "$modified_files" | grep -E -e "$extension" | grep -v "$CBDB_PAXC_GREP") diff --git a/contrib/pax_storage/sql/ddl.sql b/contrib/pax_storage/sql/ddl.sql new file mode 100644 index 00000000000..fbb390e12ae --- /dev/null +++ b/contrib/pax_storage/sql/ddl.sql @@ -0,0 +1,51 @@ +set default_table_access_method = 'pax'; + +create table pax_test.t1( + id int, + name text not null, + height float not null, + decimal_col decimal(10, 2) not null, + created_at timestamp with time zone not null, + updated_at timestamp with time zone not null +) using pax distributed BY (id); +\d+ pax_test.t1 + +create table pax_test.t2( + id int, + name text not null, + height float not null, + decimal_col decimal(10, 2) not null, + created_at timestamp with time zone not null, + updated_at timestamp with time zone not null +); +\d+ pax_test.t2 + +insert into pax_test.t1 (id, name, height, decimal_col, created_at, updated_at) values + (1, 'Alice', 1.65, 
1.23, '2023-05-17 17:56:49.633664+08', '2023-05-17 17:56:49.633664+08'), + (2, 'Bob', 1.75, 2.34, '2023-05-17 17:56:49.633664+08', '2023-05-17 17:56:49.633664+08'), + (3, 'Carol', 1.85, 3.45, '2023-05-17 17:56:49.633664+08', '2023-05-17 17:56:49.633664+08'); + +alter table pax_test.t1 add column new_col1 int; +alter table pax_test.t1 add column new_col2 int default null; +alter table pax_test.t1 add column new_col3 int default 0; +alter table pax_test.t1 add column new_col4 int default 12; + +select * from pax_test.t1; + +alter table pax_test.t1 drop column new_col2; +alter table pax_test.t1 drop column new_col3; + +vacuum pax_test.t1; +vacuum full pax_test.t1; + +drop table pax_test.t1; +drop table pax_test.t2; + +-- alter column with options +create table pax_test.t3 (v1 numeric(100,1)) with(compresstype=zstd, compresslevel=1); +alter table pax_test.t3 alter column v1 type numeric; +drop table pax_test.t3; +-- add column with options +create table pax_test.t4 (v1 text) with(compresstype=zstd, compresslevel=1); +alter table pax_test.t4 add column v2 text; +drop table pax_test.t4; diff --git a/contrib/pax_storage/sql/detoast.sql b/contrib/pax_storage/sql/detoast.sql new file mode 100644 index 00000000000..0af5be392e7 --- /dev/null +++ b/contrib/pax_storage/sql/detoast.sql @@ -0,0 +1,51 @@ +CREATE TABLE toasttest_external(f1 text); +-- The storage `EXTERNAL` allows out-of-line storage but not compression. +alter table toasttest_external alter column f1 set storage external; +-- These tests are sensitive to block size. In CBDB, the block +-- size is 32 kB, whereas in PostgreSQL it's 8kB. Therefore make +-- the data 4x larger here. 
+INSERT INTO toasttest_external values (repeat('1234567890',300*4)); +INSERT INTO toasttest_external values (repeat('1234567890',300*4)); +INSERT INTO toasttest_external values (repeat('1234567890',300*4)); +INSERT INTO toasttest_external values (repeat('1234567890',300*4)); +-- expect >0 blocks +SELECT pg_relation_size(reltoastrelid) = 0 AS is_empty + FROM pg_class where relname = 'toasttest_external'; + +create table toasttest_external_pax(f1 text) using pax; +insert into toasttest_external_pax select * from toasttest_external; +drop table toasttest_external; +-- If pax insert toast here, Then after drop toasttest_external, toast +-- will not get the source data. +select length(f1) from toasttest_external_pax; +drop table toasttest_external_pax; + + +CREATE TABLE toasttest_compress(f1 text); +-- The storage `MAIN` allows compression but not out-of-line storage. +alter table toasttest_compress alter column f1 set storage main; +-- about 1M +INSERT INTO toasttest_compress values (repeat('1234567890123456',1024 * 64)); +-- should be true, becase it's not store in toast table +SELECT pg_relation_size(reltoastrelid) = 0 AS is_empty FROM pg_class where relname = 'toasttest_compress'; + +create table toasttest_compress_pax(f1 text) using pax; +insert into toasttest_compress_pax select * from toasttest_compress; +drop table toasttest_compress; +select length(f1) from toasttest_compress_pax; +drop table toasttest_compress_pax; + +CREATE TABLE toasttest_extended(f1 text); +-- The storage `EXTENDED` allows both compression and out-of-line storage. 
+alter table toasttest_extended alter column f1 set storage EXTENDED; +-- about 1M, will use out-of-line storage +INSERT INTO toasttest_extended values (repeat('1234567890123456',1024 * 64)); +-- about 80k , will use compression storage +INSERT INTO toasttest_extended values (repeat('1234567890123456',1024 * 5)); +SELECT pg_relation_size(reltoastrelid) = 0 AS is_empty FROM pg_class where relname = 'toasttest_extended'; + +create table toasttest_extended_pax(f1 text) using pax; +insert into toasttest_extended_pax select * from toasttest_extended; +drop table toasttest_extended; +select length(f1) from toasttest_extended_pax; +drop table toasttest_extended_pax; \ No newline at end of file diff --git a/contrib/pax_storage/sql/setup.sql b/contrib/pax_storage/sql/setup.sql new file mode 100644 index 00000000000..0966d0946a8 --- /dev/null +++ b/contrib/pax_storage/sql/setup.sql @@ -0,0 +1,3 @@ +-- start_ignore +create schema pax_test; +-- end_ignore diff --git a/contrib/pax_storage/sql/teardown.sql b/contrib/pax_storage/sql/teardown.sql new file mode 100644 index 00000000000..641380076f8 --- /dev/null +++ b/contrib/pax_storage/sql/teardown.sql @@ -0,0 +1,3 @@ +-- start_ignore +drop schema if exists pax_test; +-- end_ignore diff --git a/contrib/pax_storage/src/data/sql/types.sql b/contrib/pax_storage/sql/types.sql similarity index 54% rename from contrib/pax_storage/src/data/sql/types.sql rename to contrib/pax_storage/sql/types.sql index 6eb3e09f895..1509c634889 100644 --- a/contrib/pax_storage/src/data/sql/types.sql +++ b/contrib/pax_storage/sql/types.sql @@ -1,9 +1,5 @@ --- start_ignore -create extension pax; -drop table if exists all_typbyval_pg_types; --- end_ignore -CREATE TABLE all_typbyval_pg_types ( +CREATE TABLE pax_test.all_typbyval_pg_types ( id int, bool_col bool, char_col char, @@ -26,16 +22,12 @@ CREATE TABLE all_typbyval_pg_types ( pg_lsn_col pg_lsn ) USING pax distributed by (id); -insert into all_typbyval_pg_types values(1, 
true,'c',2,'cid',4.2,5,'2023-05-17 17:56:49',7,'2023-05-17 17:56:49',10,11.1111,12,'2023-05-17 17:56:49','2023-05-17 17:56:49', '16/0'), +insert into pax_test.all_typbyval_pg_types values(1, true,'c',2,'cid',4.2,5,'2023-05-17 17:56:49',7,'2023-05-17 17:56:49',10,11.1111,12,'2023-05-17 17:56:49','2023-05-17 17:56:49', '16/0'), (1, true,'c',2,'cid',4.2,5,'2023-05-17 17:56:49',7,'2023-05-17 17:56:49',10,11.1111,12,'2023-05-17 17:56:49','2023-05-17 17:56:49', '16/0'), (1, true,'c',2,'cid',4.2,5,'2023-05-17 17:56:49',7,'2023-05-17 17:56:49',10,11.1111,12,'2023-05-17 17:56:49','2023-05-17 17:56:49', '16/0'); -select * from all_typbyval_pg_types; +select * from pax_test.all_typbyval_pg_types; --- start_ignore -drop table if exists all_typlen_lt_0_pg_type; --- end_ignore - -create table all_typlen_lt_0_pg_type ( +create table pax_test.all_typlen_lt_0_pg_type ( id int, name_col name, numeric_col numeric, @@ -44,9 +36,8 @@ create table all_typlen_lt_0_pg_type ( point_col point ) USING pax distributed by (id); -insert into all_typlen_lt_0_pg_type values(1,'hello', 1.23, 'text', 'varchar', point(1,2)); -select * from all_typlen_lt_0_pg_type; +insert into pax_test.all_typlen_lt_0_pg_type values(1,'hello', 1.23, 'text', 'varchar', point(1,2)); +select * from pax_test.all_typlen_lt_0_pg_type; --- start_ignore -drop table if exists all_typbyval_pg_types; --- end_ignore \ No newline at end of file +drop table pax_test.all_typbyval_pg_types; +drop table pax_test.all_typlen_lt_0_pg_type; diff --git a/contrib/pax_storage/sql/update.sql b/contrib/pax_storage/sql/update.sql new file mode 100644 index 00000000000..669ba110f4c --- /dev/null +++ b/contrib/pax_storage/sql/update.sql @@ -0,0 +1,386 @@ +set default_table_access_method = pax; +set pax.enable_filter = off; + +-- +-- UPDATE ... 
SET = DEFAULT; +-- + +CREATE TABLE update_test ( + a INT DEFAULT 10, + b INT, + c TEXT +); + +CREATE TABLE upsert_test ( + a INT PRIMARY KEY, + b TEXT +); + +INSERT INTO update_test VALUES (5, 10, 'foo'); +INSERT INTO update_test(b, a) VALUES (15, 10); + +SELECT a,b,c FROM update_test ORDER BY a,b,c; + +UPDATE update_test SET a = DEFAULT, b = DEFAULT; + +SELECT a,b,c FROM update_test ORDER BY a,b,c; + +-- aliases for the UPDATE target table +UPDATE update_test AS t SET b = 10 WHERE t.a = 10; + +SELECT a,b,c FROM update_test ORDER BY a,b,c; + +UPDATE update_test t SET b = t.b + 10 WHERE t.a = 10; + +SELECT a,b,c FROM update_test ORDER BY a,b,c; + +-- +-- Test VALUES in FROM +-- + +UPDATE update_test SET a=v.i FROM (VALUES(100, 20)) AS v(i, j) + WHERE update_test.b = v.j; + +SELECT a,b,c FROM update_test ORDER BY a,b,c; + +-- fail, wrong data type: +UPDATE update_test SET a = v.* FROM (VALUES(100, 20)) AS v(i, j) + WHERE update_test.b = v.j; + +-- +-- Test multiple-set-clause syntax +-- + +INSERT INTO update_test SELECT a,b+1,c FROM update_test; +SELECT * FROM update_test; + +UPDATE update_test SET (c,b,a) = ('bugle', b+11, DEFAULT) WHERE c = 'foo'; +SELECT a,b,c FROM update_test ORDER BY a,b,c; +UPDATE update_test SET (c,b) = ('car', a+b), a = a + 1 WHERE a = 10; +SELECT a,b,c FROM update_test ORDER BY a,b,c; +-- fail, multi assignment to same column: +UPDATE update_test SET (c,b) = ('car', a+b), b = a + 1 WHERE a = 10; + +-- uncorrelated sub-select: +UPDATE update_test + SET (b,a) = (select a,b from update_test where b = 41 and c = 'car') + WHERE a = 100 AND b = 20; +SELECT * FROM update_test; +-- correlated sub-select: +UPDATE update_test o + SET (b,a) = (select a+1,b from update_test i + where i.a=o.a and i.b=o.b and i.c is not distinct from o.c); +SELECT * FROM update_test; +-- fail, multiple rows supplied: +UPDATE update_test SET (b,a) = (select a+1,b from update_test); +-- set to null if no rows supplied: +UPDATE update_test SET (b,a) = (select a+1,b from 
update_test where a = 1000) + WHERE a = 11; +SELECT * FROM update_test; +-- *-expansion should work in this context: +UPDATE update_test SET (a,b) = ROW(v.*) FROM (VALUES(21, 100)) AS v(i, j) + WHERE update_test.a = v.i; +-- you might expect this to work, but syntactically it's not a RowExpr: +UPDATE update_test SET (a,b) = (v.*) FROM (VALUES(21, 101)) AS v(i, j) + WHERE update_test.a = v.i; + +-- if an alias for the target table is specified, don't allow references +-- to the original table name +UPDATE update_test AS t SET b = update_test.b + 10 WHERE t.a = 10; + +-- Make sure that we can update to a TOASTed value. +UPDATE update_test SET c = repeat('x', 10000) WHERE c = 'car'; +SELECT a, b, char_length(c) FROM update_test; + +-- Check multi-assignment with a Result node to handle a one-time filter. +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE update_test t + SET (a, b) = (SELECT b, a FROM update_test s WHERE s.a = t.a) + WHERE CURRENT_USER = SESSION_USER; +UPDATE update_test t + SET (a, b) = (SELECT b, a FROM update_test s WHERE s.a = t.a) + WHERE CURRENT_USER = SESSION_USER; +SELECT a, b, char_length(c) FROM update_test; + +-- Test ON CONFLICT DO UPDATE + +INSERT INTO upsert_test VALUES(1, 'Boo'), (3, 'Zoo'); +-- uncorrelated sub-select: +WITH aaa AS (SELECT 1 AS a, 'Foo' AS b) INSERT INTO upsert_test + VALUES (1, 'Bar') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b, a FROM aaa) RETURNING *; +-- correlated sub-select: +INSERT INTO upsert_test VALUES (1, 'Baz'), (3, 'Zaz') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b || ', Correlated', a from upsert_test i WHERE i.a = upsert_test.a) + RETURNING *; +-- correlated sub-select (EXCLUDED.* alias): +INSERT INTO upsert_test VALUES (1, 'Bat'), (3, 'Zot') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b || ', Excluded', a from upsert_test i WHERE i.a = excluded.a) + RETURNING *; + +-- ON CONFLICT using system attributes in RETURNING, testing both the +-- inserting and updating paths. 
See bug report at: +-- https://www.postgresql.org/message-id/73436355-6432-49B1-92ED-1FE4F7E7E100%40finefun.com.au +INSERT INTO upsert_test VALUES (2, 'Beeble') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b || ', Excluded', a from upsert_test i WHERE i.a = excluded.a) + RETURNING tableoid::regclass, xmin = pg_current_xact_id()::xid AS xmin_correct, xmax = 0 AS xmax_correct; +-- currently xmax is set after a conflict - that's probably not good, +-- but it seems worthwhile to have to be explicit if that changes. +INSERT INTO upsert_test VALUES (2, 'Brox') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b || ', Excluded', a from upsert_test i WHERE i.a = excluded.a) + RETURNING tableoid::regclass, xmin = pg_current_xact_id()::xid AS xmin_correct, xmax = pg_current_xact_id()::xid AS xmax_correct; + +DROP TABLE update_test; +DROP TABLE upsert_test; + +-- Test ON CONFLICT DO UPDATE with partitioned table and non-identical children + +CREATE TABLE upsert_test ( + a INT PRIMARY KEY, + b TEXT +) PARTITION BY LIST (a); + +CREATE TABLE upsert_test_1 PARTITION OF upsert_test FOR VALUES IN (1); +CREATE TABLE upsert_test_2 (b TEXT, a INT PRIMARY KEY); +ALTER TABLE upsert_test ATTACH PARTITION upsert_test_2 FOR VALUES IN (2); + +INSERT INTO upsert_test VALUES(1, 'Boo'), (2, 'Zoo'); +-- uncorrelated sub-select: +WITH aaa AS (SELECT 1 AS a, 'Foo' AS b) INSERT INTO upsert_test + VALUES (1, 'Bar') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT b, a FROM aaa) RETURNING *; +-- correlated sub-select: +WITH aaa AS (SELECT 1 AS ctea, ' Foo' AS cteb) INSERT INTO upsert_test + VALUES (1, 'Bar'), (2, 'Baz') ON CONFLICT(a) + DO UPDATE SET (b, a) = (SELECT upsert_test.b||cteb, upsert_test.a FROM aaa) RETURNING *; + +DROP TABLE upsert_test; + + +--------------------------- +-- UPDATE with row movement +--------------------------- + +-- When a partitioned table receives an UPDATE to the partitioned key and the +-- new values no longer meet the partition's bound, the row must be moved to +-- 
the correct partition for the new partition key (if one exists). We must +-- also ensure that updatable views on partitioned tables properly enforce any +-- WITH CHECK OPTION that is defined. The situation with triggers in this case +-- also requires thorough testing as partition key updates causing row +-- movement convert UPDATEs into DELETE+INSERT. + +CREATE TABLE range_parted ( + a text, + b bigint, + c numeric, + d int, + e varchar +) PARTITION BY RANGE (a, b); + +-- Create partitions intentionally in descending bound order, so as to test +-- that update-row-movement works with the leaf partitions not in bound order. +CREATE TABLE part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +-- GPDB: distribution policy must match the parent table. +alter table part_b_20_b_30 set distributed by (a); +ALTER TABLE range_parted ATTACH PARTITION part_b_20_b_30 FOR VALUES FROM ('b', 20) TO ('b', 30); +CREATE TABLE part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY RANGE (c); +alter table part_b_10_b_20 set distributed by (a); +CREATE TABLE part_b_1_b_10 PARTITION OF range_parted FOR VALUES FROM ('b', 1) TO ('b', 10); +ALTER TABLE range_parted ATTACH PARTITION part_b_10_b_20 FOR VALUES FROM ('b', 10) TO ('b', 20); +CREATE TABLE part_a_10_a_20 PARTITION OF range_parted FOR VALUES FROM ('a', 10) TO ('a', 20); +CREATE TABLE part_a_1_a_10 PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('a', 10); + +-- Check that partition-key UPDATE works sanely on a partitioned table that +-- does not have any child partitions. +UPDATE part_b_10_b_20 set b = b - 6; + +-- Create some more partitions following the above pattern of descending bound +-- order, but let's make the situation a bit more complex by having the +-- attribute numbers of the columns vary from their parent partition. 
+CREATE TABLE part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY range (abs(d)); +ALTER TABLE part_c_100_200 DROP COLUMN e, DROP COLUMN c, DROP COLUMN a; +ALTER TABLE part_c_100_200 ADD COLUMN c numeric, ADD COLUMN e varchar, ADD COLUMN a text; +ALTER TABLE part_c_100_200 DROP COLUMN b; +ALTER TABLE part_c_100_200 ADD COLUMN b bigint; +CREATE TABLE part_d_1_15 PARTITION OF part_c_100_200 FOR VALUES FROM (1) TO (15); +CREATE TABLE part_d_15_20 PARTITION OF part_c_100_200 FOR VALUES FROM (15) TO (20); + +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); + +-- GPDB: distribution policy must match the parent table, so the previous command fails. +-- Change the distribution key and try again. +alter table part_c_100_200 set distributed by (a); +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); + +CREATE TABLE part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +alter table part_c_1_100 set distributed by (a); +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO (100); + +\set init_range_parted 'truncate range_parted; insert into range_parted VALUES (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' +\set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted ORDER BY 1, 2, 3, 4, 5, 6' +:init_range_parted; +:show_data; + +-- The order of subplans should be in bound order +EXPLAIN (costs off) UPDATE range_parted set c = c - 50 WHERE c > 97; + +-- fail, row movement happens only within the partition subtree. 
+UPDATE part_c_100_200 set c = c - 20, d = c WHERE c = 105; +-- fail, no partition key update, so no attempt to move tuple, +-- but "a = 'a'" violates partition constraint enforced by root partition) +UPDATE part_b_10_b_20 set a = 'a'; +-- ok, partition key update, no constraint violation +UPDATE range_parted set d = d - 10 WHERE d > 10; +-- ok, no partition key update, no constraint violation +UPDATE range_parted set e = d; +-- No row found +UPDATE part_c_1_100 set c = c + 20 WHERE c = 98; +-- ok, row movement +UPDATE part_b_10_b_20 set c = c + 20 returning c, b, a; +:show_data; + +-- fail, row movement happens only within the partition subtree. +UPDATE part_b_10_b_20 set b = b - 6 WHERE c > 116 returning *; +-- ok, row movement, with subset of rows moved into different partition. +UPDATE range_parted set b = b - 6 WHERE c > 116 returning a, b + c; + +:show_data; + +-- Common table needed for multiple test scenarios. +CREATE TABLE mintab(c1 int); +INSERT into mintab VALUES (120); + +-- update partition key using updatable view. +CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 FROM mintab) WITH CHECK OPTION; +-- ok +UPDATE upview set c = 199 WHERE b = 4; +-- fail, check option violation +UPDATE upview set c = 120 WHERE b = 4; +-- fail, row movement with check option violation +UPDATE upview set a = 'b', b = 15, c = 120 WHERE b = 4; +-- ok, row movement, check option passes +UPDATE upview set a = 'b', b = 15 WHERE b = 4; + +:show_data; + +-- cleanup +DROP VIEW upview; + +-- RETURNING having whole-row vars. 
+:init_range_parted; +UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (range_parted), *; +:show_data; + + +-- Creating default partition for range +:init_range_parted; +create table part_def partition of range_parted default; +\d+ part_def +insert into range_parted values ('c', 9); +-- ok +update part_def set a = 'd' where a = 'c'; +-- fail +update part_def set a = 'a' where a = 'd'; + +:show_data; + +-- Update row movement from non-default to default partition. +-- fail, default partition is not under part_a_10_a_20; +UPDATE part_a_10_a_20 set a = 'ad' WHERE a = 'a'; +-- ok +UPDATE range_parted set a = 'ad' WHERE a = 'a'; +UPDATE range_parted set a = 'bd' WHERE a = 'b'; +:show_data; +-- Update row movement from default to non-default partitions. +-- ok +UPDATE range_parted set a = 'a' WHERE a = 'ad'; +UPDATE range_parted set a = 'b' WHERE a = 'bd'; +:show_data; + +-- Cleanup: range_parted no longer needed. +DROP TABLE range_parted; + +CREATE TABLE list_parted ( + a text, + b int +) PARTITION BY list (a); +CREATE TABLE list_part1 PARTITION OF list_parted for VALUES in ('a', 'b'); +CREATE TABLE list_default PARTITION OF list_parted default; +INSERT into list_part1 VALUES ('a', 1); +INSERT into list_default VALUES ('d', 10); + +-- fail +UPDATE list_default set a = 'a' WHERE a = 'd'; +-- ok +UPDATE list_default set a = 'x' WHERE a = 'd'; + +DROP TABLE list_parted; + +-------------- +-- Some more update-partition-key test scenarios below. This time use list +-- partitions. +-------------- + +-- Setup for list partitions +CREATE TABLE list_parted (a numeric, b int, c int8) PARTITION BY list (a); +CREATE TABLE sub_parted PARTITION OF list_parted for VALUES in (1) PARTITION BY list (b); + +CREATE TABLE sub_part1(b int, c int8, a numeric); +alter table sub_part1 set distributed by (a); -- GPDB: distribution policy must match the parent table. 
+ALTER TABLE sub_parted ATTACH PARTITION sub_part1 for VALUES in (1); +CREATE TABLE sub_part2(b int, c int8, a numeric); +alter table sub_part2 set distributed by (a); -- GPDB: distribution policy must match the parent table. +ALTER TABLE sub_parted ATTACH PARTITION sub_part2 for VALUES in (2); + +CREATE TABLE list_part1(a numeric, b int, c int8); +ALTER TABLE list_parted ATTACH PARTITION list_part1 for VALUES in (2,3); + +INSERT into list_parted VALUES (2,5,50); +INSERT into list_parted VALUES (3,6,60); +INSERT into sub_parted VALUES (1,1,60); +INSERT into sub_parted VALUES (1,2,10); + +-- Test partition constraint violation when intermediate ancestor is used and +-- constraint is inherited from upper root. +UPDATE sub_parted set a = 2 WHERE c = 10; + +-- Test update-partition-key, where the unpruned partitions do not have their +-- partition keys updated. +SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; +UPDATE list_parted set b = c + a WHERE a = 2; +SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; + + +-- Cleanup: list_parted no longer needed. 
+DROP TABLE list_parted; + +-- create custom operator class and hash function, for the same reason +-- explained in alter_table.sql +create or replace function dummy_hashint4(a int4, seed int8) returns int8 as +$$ begin return (a + seed); end; $$ language 'plpgsql' immutable; +create operator class custom_opclass for type int4 using hash as +operator 1 = , function 2 dummy_hashint4(int4, int8); + +create table hash_parted ( + a int, + b int +) partition by hash (a custom_opclass, b custom_opclass); +create table hpart1 partition of hash_parted for values with (modulus 2, remainder 1); +create table hpart2 partition of hash_parted for values with (modulus 4, remainder 2); +create table hpart3 partition of hash_parted for values with (modulus 8, remainder 0); +create table hpart4 partition of hash_parted for values with (modulus 8, remainder 4); +insert into hpart1 values (1, 1); +insert into hpart2 values (2, 5); +insert into hpart4 values (3, 4); + +-- fail +update hpart1 set a = 3, b=4 where a = 1; +-- ok, row movement +update hash_parted set b = b - 1 where b = 1; +-- ok +update hash_parted set b = b + 8 where b = 1; + +-- cleanup +drop table hash_parted; +drop operator class custom_opclass using hash; +drop function dummy_hashint4(a int4, seed int8); diff --git a/contrib/pax_storage/src/data/sql/update_gp.sql b/contrib/pax_storage/sql/update_gp.sql similarity index 78% rename from contrib/pax_storage/src/data/sql/update_gp.sql rename to contrib/pax_storage/sql/update_gp.sql index 3fe87351ac5..30efc73f679 100644 --- a/contrib/pax_storage/src/data/sql/update_gp.sql +++ b/contrib/pax_storage/sql/update_gp.sql @@ -1,9 +1,10 @@ +set default_table_access_method = pax; + -- Test DELETE and UPDATE on an inherited table. -- The special aspect of this table is that the inherited table has -- a different distribution key. 'p' table's distribution key matches -- that of 'r', but 'p2's doesn't. Test that the planner adds a Motion -- node correctly for p2. 
-set default_table_access_method = 'pax'; create table todelete (a int) distributed by (a); create table parent (a int, b int, c int) distributed by (a); create table child (a int, b int, c int) inherits (parent) distributed by (b); @@ -62,9 +63,8 @@ create table base_tbl (a int4, b int4) distributed by (a); create table child_a (a int4, b int4) inherits (base_tbl) distributed by (a); create table child_b (a int4, b int4) inherits (base_tbl) distributed by (b); insert into base_tbl select g, g from generate_series(1, 5) g; --- start_ignore + explain (costs off) update base_tbl set a=a+1; --- end_ignore update base_tbl set a = 5; -- @@ -84,8 +84,8 @@ INSERT INTO keo3 VALUES ('1', '1'); CREATE TABLE keo4 ( keo_para_required_period character varying(6), keo_para_budget_date character varying(24)) DISTRIBUTED RANDOMLY; INSERT INTO keo4 VALUES ('1', '1'); +ANALYZE keo1, keo2, keo3, keo4; -- Explicit Redistribution motion should be added in case of GPDB Planner (test case not applicable for ORCA) --- start_ignore EXPLAIN (COSTS OFF) UPDATE keo1 SET user_vie_act_cntr_marg_cum = 234.682 FROM ( SELECT a.user_vie_project_code_pk FROM keo1 a INNER JOIN keo2 b ON b.projects_pk=a.user_vie_project_code_pk @@ -95,7 +95,6 @@ EXPLAIN (COSTS OFF) UPDATE keo1 SET user_vie_act_cntr_marg_cum = 234.682 FROM (SELECT min (keo4.keo_para_budget_date) FROM keo4))) ) t1 WHERE t1.user_vie_project_code_pk = keo1.user_vie_project_code_pk; --- end_ignore UPDATE keo1 SET user_vie_act_cntr_marg_cum = 234.682 FROM ( SELECT a.user_vie_project_code_pk FROM keo1 a INNER JOIN keo2 b ON b.projects_pk=a.user_vie_project_code_pk @@ -110,9 +109,7 @@ SELECT user_vie_act_cntr_marg_cum FROM keo1; -- Explicit Redistribution motion should not be added in case of GPDB Planner (test case not applicable to ORCA) CREATE TABLE keo5 (x int, y int) DISTRIBUTED BY (x); INSERT INTO keo5 VALUES (1,1); --- start_ignore EXPLAIN (COSTS OFF) DELETE FROM keo5 WHERE x IN (SELECT x FROM keo5 WHERE EXISTS (SELECT x FROM keo5 
WHERE x < 2)); --- end_ignore DELETE FROM keo5 WHERE x IN (SELECT x FROM keo5 WHERE EXISTS (SELECT x FROM keo5 WHERE x < 2)); SELECT x FROM keo5; @@ -123,54 +120,52 @@ DROP TABLE keo3; DROP TABLE keo4; DROP TABLE keo5; --- start_ignore --- -- text types. We should support the following updates. --- -- +-- +-- text types. We should support the following updates. +-- --- CREATE TEMP TABLE ttab1 (a varchar(15), b integer) DISTRIBUTED BY (a); --- CREATE TEMP TABLE ttab2 (a varchar(15), b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab1 (a varchar(15), b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a varchar(15), b integer) DISTRIBUTED BY (a); --- UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; --- DROP TABLE ttab1; --- DROP TABLE ttab2; +DROP TABLE ttab1; +DROP TABLE ttab2; --- CREATE TEMP TABLE ttab1 (a text, b integer) DISTRIBUTED BY (a); --- CREATE TEMP TABLE ttab2 (a text, b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab1 (a text, b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a text, b integer) DISTRIBUTED BY (a); --- UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; --- DROP TABLE ttab1; --- DROP TABLE ttab2; +DROP TABLE ttab1; +DROP TABLE ttab2; --- CREATE TEMP TABLE ttab1 (a varchar, b integer) DISTRIBUTED BY (a); --- CREATE TEMP TABLE ttab2 (a varchar, b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab1 (a varchar, b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a varchar, b integer) DISTRIBUTED BY (a); --- UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; --- DROP TABLE ttab1; --- DROP TABLE ttab2; +DROP TABLE ttab1; +DROP TABLE ttab2; --- CREATE TEMP TABLE ttab1 (a char(15), b integer) DISTRIBUTED BY (a); --- CREATE TEMP TABLE ttab2 (a char(15), b integer) DISTRIBUTED BY 
(a); +CREATE TEMP TABLE ttab1 (a char(15), b integer) DISTRIBUTED BY (a); +CREATE TEMP TABLE ttab2 (a char(15), b integer) DISTRIBUTED BY (a); --- UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; +UPDATE ttab1 SET b = ttab2.b FROM ttab2 WHERE ttab1.a = ttab2.a; --- DROP TABLE IF EXISTS update_distr_key; +DROP TABLE IF EXISTS update_distr_key; --- CREATE TEMP TABLE update_distr_key (a int, b int) DISTRIBUTED BY (a); --- INSERT INTO update_distr_key select i, i* 10 from generate_series(0, 9) i; +CREATE TEMP TABLE update_distr_key (a int, b int) DISTRIBUTED BY (a); +INSERT INTO update_distr_key select i, i* 10 from generate_series(0, 9) i; --- UPDATE update_distr_key SET a = 5 WHERE b = 10; +UPDATE update_distr_key SET a = 5 WHERE b = 10; --- SELECT * from update_distr_key; +SELECT * from update_distr_key; --- DROP TABLE update_distr_key; - --- end_ignore +DROP TABLE update_distr_key; -- below cases is to test multi-hash-cols CREATE TABLE tab3(c1 int, c2 int, c3 int, c4 int, c5 int) DISTRIBUTED BY (c1, c2, c3); @@ -202,9 +197,8 @@ UPDATE tab5 set (c1,c2,c3,c4,c5) = (1,2,3,0,6) where c5 = 1; SELECT gp_segment_id, * FROM tab5; UPDATE tab5 set c1 = 11 where c3 = 10 and c3 < 1; SELECT gp_segment_id, * FROM tab5; --- start_ignore + EXPLAIN (COSTS OFF ) UPDATE tab3 SET C1 = C1 + 1, C5 = C5+1; --- end_ignore -- clean up drop table tab3; @@ -260,21 +254,20 @@ select * from r; select * from s; update s set a = s.a + 1 where exists (select 1 from r where s.a = r.b); select * from s; --- start_ignore + -- Update ao table distribution key --- create table update_ao_table (a int, b int) WITH (appendonly=true) distributed by (a); --- insert into update_ao_table select g, g from generate_series(1, 5) g; --- select * from update_ao_table; --- update update_ao_table set a = a + 1 where b = 3; --- select * from update_ao_table; +create table update_ao_table (a int, b int) WITH (appendonly=true) distributed by (a); +insert into update_ao_table select g, g from 
generate_series(1, 5) g; +select * from update_ao_table; +update update_ao_table set a = a + 1 where b = 3; +select * from update_ao_table; -- Update aoco table distribution key --- create table update_aoco_table (a int, b int) WITH (appendonly=true, orientation=column) distributed by (a); --- insert into update_aoco_table select g,g from generate_series(1, 5) g; --- select * from update_aoco_table; --- update update_aoco_table set a = a + 1 where b = 3; --- select * from update_aoco_table; --- end_ignore +create table update_aoco_table (a int, b int) WITH (appendonly=true, orientation=column) distributed by (a); +insert into update_aoco_table select g,g from generate_series(1, 5) g; +select * from update_aoco_table; +update update_aoco_table set a = a + 1 where b = 3; +select * from update_aoco_table; -- Update prepare delete from s; @@ -288,9 +281,7 @@ select * from s; -- Confirm that a split update is not created for a table excluded by -- constraints in the planner. create table nosplitupdate (a int) distributed by (a); --- start_ignore explain update nosplitupdate set a=0 where a=1 and a<1; --- end_ignore -- test split-update when split-node's flow is entry create table tsplit_entry (c int); @@ -298,42 +289,42 @@ insert into tsplit_entry values (1), (2); analyze tsplit_entry; -- start_ignore +-- gp_segment_configuration scan is different when using different FTS explain update tsplit_entry set c = s.a from (select count(*) as a from gp_segment_configuration) s; -- end_ignore update tsplit_entry set c = s.a from (select count(*) as a from gp_segment_configuration) s; --- start_ignore --- CREATE TABLE update_gp_foo ( --- a_dist int, --- b int, --- c_part int, --- d int --- ) --- WITH (appendonly=false) DISTRIBUTED BY (a_dist) PARTITION BY RANGE(c_part) --- ( --- PARTITION p20190305 START (1) END (2) WITH (tablename='update_gp_foo_1_prt_p20190305', appendonly=false) --- ); - --- CREATE TABLE update_gp_foo1 ( --- a_dist int, --- b int, --- c_part int, --- d int 
--- ) --- WITH (appendonly=false) DISTRIBUTED BY (a_dist) PARTITION BY RANGE(c_part) --- ( --- PARTITION p20190305 START (1) END (2) WITH (tablename='update_gp_foo1_1_prt_p20190305', appendonly=false) --- ); - --- INSERT INTO update_gp_foo VALUES (12, 40, 1, 50); --- INSERT INTO update_gp_foo1 VALUES (12, 3, 1, 50); - --- UPDATE update_gp_foo --- SET b = update_gp_foo.c_part, --- d = update_gp_foo1.a_dist --- FROM update_gp_foo1; - --- SELECT * from update_gp_foo; --- end_ignore + +CREATE TABLE update_gp_foo ( + a_dist int, + b int, + c_part int, + d int +) +WITH (appendonly=false) DISTRIBUTED BY (a_dist) PARTITION BY RANGE(c_part) + ( + PARTITION p20190305 START (1) END (2) WITH (tablename='update_gp_foo_1_prt_p20190305', appendonly=false) + ); + +CREATE TABLE update_gp_foo1 ( + a_dist int, + b int, + c_part int, + d int +) +WITH (appendonly=false) DISTRIBUTED BY (a_dist) PARTITION BY RANGE(c_part) + ( + PARTITION p20190305 START (1) END (2) WITH (tablename='update_gp_foo1_1_prt_p20190305', appendonly=false) + ); + +INSERT INTO update_gp_foo VALUES (12, 40, 1, 50); +INSERT INTO update_gp_foo1 VALUES (12, 3, 1, 50); + +UPDATE update_gp_foo +SET b = update_gp_foo.c_part, + d = update_gp_foo1.a_dist +FROM update_gp_foo1; + +SELECT * from update_gp_foo; -- Test insert on conflict do update -- Insert on conflict do update is an insert statement but might @@ -342,7 +333,6 @@ update tsplit_entry set c = s.a from (select count(*) as a from gp_segment_confi -- planning, if a `insert on conflict do update` statement set the -- dist keys of the table, it will raise an error. 
-- See github issue: https://github.com/greenplum-db/gpdb/issues/9444 --- start_ignore create table t_insert_on_conflict_update_distkey(a int, b int) distributed by (a); create unique index uidx_t_insert_on_conflict_update_distkey on t_insert_on_conflict_update_distkey(a, b); @@ -358,7 +348,6 @@ create table t_insert_on_conflict_update_distkey(a int, b int) distributed repli create unique index uidx_t_insert_on_conflict_update_distkey on t_insert_on_conflict_update_distkey(a, b); -- the following statement should succeed because replicated table does not contain distkey insert into t_insert_on_conflict_update_distkey values (1, 1) on conflict(a, b) do update set a = 1; --- end_ignore -- Some tests on a partitioned table. CREATE TABLE update_gp_rangep (a int, b int, orig_a int) DISTRIBUTED BY (b) PARTITION BY RANGE (a); @@ -379,9 +368,8 @@ UPDATE update_gp_rangep SET a = 10 WHERE a = 3; -- Move row to different partition and also change distribution key UPDATE update_gp_rangep SET a = 11, b = 1 WHERE a = 4; --- start_ignore + SELECT tableoid::regclass, * FROM update_gp_rangep ORDER BY orig_a; --- end_ignore -- Also do a lookup with specific distribution key. If the rows were not -- correctly moved across segments, this would fail to find them, assuming -- that direct dispatch is effective. 
diff --git a/contrib/pax_storage/src/cpp/CMakeLists.txt b/contrib/pax_storage/src/cpp/CMakeLists.txt index 8ac5f846c7a..8b6cbe0fd29 100644 --- a/contrib/pax_storage/src/cpp/CMakeLists.txt +++ b/contrib/pax_storage/src/cpp/CMakeLists.txt @@ -1,242 +1,37 @@ cmake_minimum_required (VERSION 3.11.0) # protobuf -include(ExternalProject) -option(ORC_PREFER_STATIC_PROTOBUF "Prefer static protobuf library, if available" ON) -set(THIRDPARTY_CONFIGURE_COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}") -set(THIRDPARTY_DIR "${CMAKE_BINARY_DIR}/src/cpp/contrib") -set(THIRDPARTY_LOG_OPTIONS LOG_CONFIGURE 1 - LOG_BUILD 1 - LOG_INSTALL 1 - LOG_DOWNLOAD 1) -set(PROTOBUF_PREFIX "${THIRDPARTY_DIR}/protobuf_ep-install") -set(PROTOBUF_INCLUDE_DIR "${PROTOBUF_PREFIX}/include") -set(PROTOBUF_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PROTOBUF_PREFIX} - -DCMAKE_INSTALL_LIBDIR=lib - -DBUILD_SHARED_LIBS=OFF - -Dprotobuf_BUILD_TESTS=OFF) - -set(PROTOBUF_CMAKE_ARGS ${PROTOBUF_CMAKE_ARGS} -DCMAKE_POSITION_INDEPENDENT_CODE=ON) -set(PROTOBUF_STATIC_LIB_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX}) -set(PROTOBUF_STATIC_LIB "${PROTOBUF_PREFIX}/lib/${PROTOBUF_STATIC_LIB_PREFIX}protobuf${CMAKE_STATIC_LIBRARY_SUFFIX}") -message(STATUS "${PROTOBUF_STATIC_LIB}") -set(PROTOC_STATIC_LIB "${PROTOBUF_PREFIX}/lib/${PROTOBUF_STATIC_LIB_PREFIX}protoc${CMAKE_STATIC_LIBRARY_SUFFIX}") -set(PROTOBUF_EXECUTABLE "${PROTOBUF_PREFIX}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}") - -set(PROTOBUF_CONFIGURE CONFIGURE_COMMAND "${THIRDPARTY_CONFIGURE_COMMAND}" ${PROTOBUF_CMAKE_ARGS} - "${CMAKE_CURRENT_BINARY_DIR}/protobuf_ep-prefix/src/protobuf_ep/cmake") - -ExternalProject_Add(protobuf_ep - URL "https://artifactory.hashdata.xyz/artifactory/utility/protobuf-3.6.1.tar.gz" - ${PROTOBUF_CONFIGURE} - ${THIRDPARTY_LOG_OPTIONS} - BUILD_BYPRODUCTS "${PROTOBUF_STATIC_LIB}" "${PROTOC_STATIC_LIB}") - -set(PROTOBUF_LIBRARY ${PROTOBUF_STATIC_LIB}) -set(PROTOC_LIBRARY ${PROTOC_STATIC_LIB}) -set(PROTOBUF_VENDORED ON) -set(INSTALL_VENDORED_LIBS OFF) - 
-add_library (orc_protobuf INTERFACE) -add_library (orc::protobuf ALIAS orc_protobuf) -add_library (orc_protoc INTERFACE) -add_library (orc::protoc ALIAS orc_protoc) - -if (ORC_PREFER_STATIC_PROTOBUF AND ${PROTOBUF_STATIC_LIB}) - target_link_libraries (orc_protobuf INTERFACE ${PROTOBUF_STATIC_LIB}) -else () - target_link_libraries (orc_protobuf INTERFACE ${PROTOBUF_LIBRARY}) -endif() - -target_include_directories (orc_protobuf SYSTEM INTERFACE ${PROTOBUF_INCLUDE_DIR}) - -if (ORC_PREFER_STATIC_PROTOBUF AND ${PROTOC_STATIC_LIB}) - target_link_libraries (orc_protoc INTERFACE ${PROTOC_STATIC_LIB}) -else () - target_link_libraries (orc_protoc INTERFACE ${PROTOC_LIBRARY}) -endif() - -target_include_directories (orc_protoc SYSTEM INTERFACE ${PROTOBUF_INCLUDE_DIR}) - -if (PROTOBUF_VENDORED) - add_dependencies (orc_protoc protobuf_ep) - add_dependencies (orc_protobuf protobuf_ep) - if (INSTALL_VENDORED_LIBS) - install(FILES "${PROTOBUF_STATIC_LIB}" "${PROTOC_STATIC_LIB}" - DESTINATION "lib") - endif () -endif () - -set(orc_proto_file "${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/orc_proto.proto") -set(orc_proto_src "${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/orc_proto.pb.h" "${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/orc_proto.pb.cc") - -set(pax_proto_file "${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/pax.proto") -set(pax_proto_src "${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/pax.pb.h" "${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/pax.pb.cc") - -set(catalog_proto_file "${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/micro_partition_stats.proto") -set(stats_proto_src "${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/micro_partition_stats.pb.h" "${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/micro_partition_stats.pb.cc") - -add_custom_command(OUTPUT ${orc_proto_src} - COMMAND ${PROTOBUF_EXECUTABLE} - -I ${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/ - --cpp_out="${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/" - ${orc_proto_file}) - -add_custom_command(OUTPUT ${pax_proto_src} - COMMAND ${PROTOBUF_EXECUTABLE} - -I 
${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/ - --cpp_out="${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/" - ${pax_proto_file}) - -add_custom_command(OUTPUT ${stats_proto_src} - COMMAND ${PROTOBUF_EXECUTABLE} - -I ${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/ - --cpp_out="${CMAKE_CURRENT_SOURCE_DIR}/storage/proto" - ${catalog_proto_file}) - -add_custom_target(generate_protobuf DEPENDS ${orc_proto_src} ${pax_proto_src} ${stats_proto_src}) - -if (BUILD_GTEST AND NOT BUILD_PAX_FORMAT) - add_subdirectory(contrib/googletest) - ADD_DEFINITIONS(-DRUN_GTEST) - file(GLOB TEST_CASE_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/*/*_test.cc - ${CMAKE_CURRENT_SOURCE_DIR}/*/*/*_test.cc) - - link_directories($ENV{GPHOME}/lib) - add_executable(test_main ${TEST_CASE_SOURCES}) - add_dependencies(test_main gtest gmock gtest_main) - target_include_directories(test_main PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${gtest_SOURCE_DIR}/include) - target_link_libraries(test_main gtest gmock gtest_main postgres pax) -endif(BUILD_GTEST AND NOT BUILD_PAX_FORMAT) - -# ztsd -set(ZSTD_BUILD_PROGRAMS OFF) -set(ZSTD_BUILD_TESTS OFF) -set(ZSTD_BUILD_CONTRIB) -add_subdirectory(contrib/zstd/build/cmake/) -set(ZTSD_HEADER contrib/zstd/lib) - -set(pax_comm_src - comm/bitmap.cc - comm/paxc_wrappers.cc - comm/cbdb_wrappers.cc) - -set(pax_exceptions_src - exceptions/CException.cc) - -set(pax_storage_src - storage/columns/pax_column.cc - storage/columns/pax_column_int.cc - storage/columns/pax_compress.cc - storage/columns/pax_columns.cc - storage/columns/pax_encoding_utils.cc - storage/columns/pax_encoding_non_fixed_column.cc - storage/columns/pax_encoding_column.cc - storage/columns/pax_decoding.cc - storage/columns/pax_encoding.cc - storage/columns/pax_rlev2_decoding.cc - storage/columns/pax_rlev2_encoding.cc - storage/file_system.cc - storage/pax_filter.cc - storage/local_file_system.cc - storage/micro_partition.cc - storage/micro_partition_file_factory.cc - storage/micro_partition_iterator.cc - 
storage/micro_partition_metadata.cc - storage/pax_buffer.cc - storage/proto/protobuf_stream.cc - storage/pax_filter.cc - storage/strategy.cc - storage/paxc_block_map_manager.cc - storage/orc/orc.cc - storage/strategy.cc) - -if(NOT BUILD_PAX_FORMAT) - set(pax_storage_src ${pax_storage_src} storage/pax.cc) -endif(NOT BUILD_PAX_FORMAT) - -set(pax_access_src - access/pax_access_handle.cc - access/pax_deleter.cc - access/pax_dml_state.cc - access/pax_inserter.cc - access/pax_updater.cc - access/pax_scanner.cc) - -set(pax_catalog_src - catalog/micro_partition_stats.cc - catalog/pax_aux_table.cc) - -set(pax_vec_src - storage/vec/pax_vec_adapter.cc - storage/vec/pax_vec_reader.cc) - -link_directories($ENV{GPHOME}/lib) - -if(BUILD_PAX_FORMAT) - # paxformat.so - ADD_DEFINITIONS(-DBUILD_PAX_FORMAT) - add_library(paxformat SHARED ${orc_proto_src} ${pax_proto_src} ${pax_storage_src} ${pax_exceptions_src} ${pax_comm_src} ) - target_include_directories(paxformat PUBLIC ${ZTSD_HEADER} ${CMAKE_CURRENT_SOURCE_DIR} ${CBDB_INCLUDE_DIR}) - target_link_libraries(paxformat PUBLIC uuid orc_protobuf zstd z) - set_target_properties(paxformat PROPERTIES - OUTPUT_NAME paxformat) - add_dependencies(paxformat generate_protobuf) - - # export headers - set(PAX_COMM_HEADERS - comm/cbdb_api.h - ) - - ## install dynamic libraray - install(TARGETS paxformat - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) - - # TODO(gongxun): - # We should explicitly specify the headers - # that need to be exported, and use the syntax of - # install(FILES,...) 
to install the header files - install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/storage - DESTINATION ${CMAKE_INSTALL_PREFIX}/include/pax - FILES_MATCHING - PATTERN "*.h" -) - -install(FILES ${PAX_COMM_HEADERS} - DESTINATION ${CMAKE_INSTALL_PREFIX}/include/pax/comm +set(protobuf_files + ${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/micro_partition_stats.proto + ${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/pax.proto + ${CMAKE_CURRENT_SOURCE_DIR}/storage/proto/orc_proto.proto ) -else() - add_library(pax SHARED ${orc_proto_src} ${pax_proto_src} ${pax_storage_src} ${stats_proto_src} ${pax_exceptions_src} - ${pax_access_src} ${pax_comm_src} ${pax_catalog_src} ${pax_vec_src}) - set_target_properties(pax PROPERTIES OUTPUT_NAME pax) - target_include_directories(pax PUBLIC ${ZTSD_HEADER} ${CMAKE_CURRENT_SOURCE_DIR} ${CBDB_INCLUDE_DIR}) - target_link_libraries(pax PUBLIC uuid orc_protobuf zstd z postgres) - add_dependencies(pax generate_protobuf) - add_custom_command(TARGET pax POST_BUILD - COMMAND ${CMAKE_COMMAND} -E - copy_if_different $ ${CMAKE_CURRENT_SOURCE_DIR}/../data/pax.so) -endif(BUILD_PAX_FORMAT) +set(PROTO_DIR ${CMAKE_CURRENT_SOURCE_DIR}/storage/proto) +set(PROTO_OUTPUT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/storage/proto) -# vec build -if (VEC_BUILD) - set(VEC_HEADER ${VEC_HOME}/src/include/) +## we generate these files in the cmake -B build phase. +## when we start the make phase, all dependent files exist, and we can compile with multiple threads. 
+FOREACH(FIL ${protobuf_files}) + GET_FILENAME_COMPONENT(FIL_WE ${FIL} NAME_WE) + string(REGEX REPLACE ".+/(.+)\\..*" "\\1" FILE_NAME ${FIL}) + string(REGEX REPLACE "(.+)\\${FILE_NAME}.*" "\\1" FILE_PATH ${FIL}) - find_package(PkgConfig REQUIRED) - pkg_check_modules(GLIB REQUIRED glib-2.0) + set(PROTO_SRCS ${PROTO_SRCS} "${PROTO_OUTPUT_DIR}/${FIL_WE}.pb.cc") + set(PROTO_HDRS ${PROTO_SRCS} "${PROTO_OUTPUT_DIR}/${FIL_WE}.pb.h") - target_include_directories(pax PRIVATE - ${VEC_HEADER} # for utils/tuptable_vec.h - ${CBDB_ROOT_INCLUDE_DIR} # for arrow-glib/arrow-glib.h and otehr arrow interface - ${GLIB_INCLUDE_DIRS} # for glib-object.h - ) + EXECUTE_PROCESS( + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} -I ${PROTO_DIR} --cpp_out=${PROTO_OUTPUT_DIR} ${FIL} + ) +ENDFOREACH() +add_custom_target(generate_protobuf DEPENDS ${PROTO_SRCS} ${PROTO_HDRS}) - if(BUILD_GTEST) - target_include_directories(test_main PRIVATE ${VEC_HEADER} ${CBDB_ROOT_INCLUDE_DIR} ${GLIB_INCLUDE_DIRS}) - endif(BUILD_GTEST) +link_directories($ENV{GPHOME}/lib) - target_link_libraries(pax PRIVATE arrow) -endif(VEC_BUILD) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +## build pax_format.so +include(pax_format) +## build pax.so +include(pax) diff --git a/contrib/pax_storage/src/cpp/access/pax_access_handle.cc b/contrib/pax_storage/src/cpp/access/pax_access_handle.cc index be34ff3aecc..a6bc4ab74be 100644 --- a/contrib/pax_storage/src/cpp/access/pax_access_handle.cc +++ b/contrib/pax_storage/src/cpp/access/pax_access_handle.cc @@ -3,11 +3,18 @@ #include "comm/cbdb_api.h" #include "access/pax_dml_state.h" +#include "access/pax_partition.h" #include "access/pax_scanner.h" #include "access/pax_updater.h" +#include "access/paxc_rel_options.h" +#include "access/paxc_scanner.h" #include "catalog/pax_aux_table.h" +#include "catalog/pax_fastsequence.h" +#include "catalog/pg_pax_tables.h" +#include "comm/guc.h" +#include "comm/pax_memory.h" #include "exceptions/CException.h" -#include 
"storage/paxc_block_map_manager.h" +#include "storage/local_file_system.h" #define NOT_IMPLEMENTED_YET \ ereport(ERROR, \ @@ -18,14 +25,8 @@ ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ errmsg("not supported on pax relations: %s", __func__))) -#define PAX_DEFAULT_COMPRESSLEVEL AO_DEFAULT_COMPRESSLEVEL -#define PAX_MIN_COMPRESSLEVEL AO_MIN_COMPRESSLEVEL -#define PAX_MAX_COMPRESSLEVEL AO_MAX_COMPRESSLEVEL - -#define PAX_DEFAULT_COMPRESSTYPE AO_DEFAULT_COMPRESSTYPE - #define RELATION_IS_PAX(rel) \ - (OidIsValid((rel)->rd_rel->relam) && AMOidIsPax((rel)->rd_rel->relam)) + (OidIsValid((rel)->rd_rel->relam) && RelationIsPAX(rel)) // CBDB_TRY(); // { @@ -41,6 +42,7 @@ // // CBDB_CATCH_MATCH() is optional and can have several match pattern. +char *global_pg_error_message = nullptr; cbdb::CException global_exception(cbdb::CException::kExTypeInvalid); // being of a try block w/o explicit handler @@ -63,6 +65,7 @@ cbdb::CException global_exception(cbdb::CException::kExTypeInvalid); catch (cbdb::CException & e) { \ internal_cbdb_try_throw_error_ = true; \ internal_cbdb_try_throw_error_with_stack_ = true; \ + global_pg_error_message = elog_message(); \ elog(LOG, "\npax stack trace: \n%s", e.Stack()); \ global_exception = e; \ } \ @@ -79,44 +82,29 @@ cbdb::CException global_exception(cbdb::CException::kExTypeInvalid); } while (0); // end of a try-catch block -#define CBDB_END_TRY() \ - } \ - if (internal_cbdb_try_throw_error_) { \ - if (internal_cbdb_try_throw_error_with_stack_) { \ - elog(LOG, "\npax stack trace: \n%s", global_exception.Stack()); \ - ereport(ERROR, errmsg("%s", global_exception.What().c_str())); \ - } \ - if (error_message.Length() == 0) \ - error_message.Append("ERROR: %s", __func__); \ - ereport(ERROR, errmsg("%s", error_message.Message())); \ - } \ - } \ +#define CBDB_END_TRY() \ + } \ + if (internal_cbdb_try_throw_error_) { \ + if (global_pg_error_message) { \ + elog(LOG, "\npg error message:%s", global_pg_error_message); \ + } \ + if 
(internal_cbdb_try_throw_error_with_stack_) { \ + elog(LOG, "\npax stack trace: \n%s", global_exception.Stack()); \ + ereport( \ + ERROR, \ + errmsg("%s (PG message: %s)", global_exception.What().c_str(), \ + !global_pg_error_message ? "" : global_pg_error_message)); \ + } \ + if (error_message.Length() == 0) \ + error_message.Append("ERROR: %s", __func__); \ + ereport(ERROR, errmsg("%s", error_message.Message())); \ + } \ + } \ while (0) -bool AMOidIsPax(Oid am_oid) { - HeapTuple tuple; - Form_pg_am form; - bool is_pax; - - tuple = SearchSysCache1(AMOID, ObjectIdGetDatum(am_oid)); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for pg_am.oid = %u", am_oid); - - form = (Form_pg_am)GETSTRUCT(tuple); - is_pax = strcmp(NameStr(form->amname), "pax") == 0; - ReleaseSysCache(tuple); - - return is_pax; -} - -// reloptions structure and variables. -static relopt_kind self_relopt_kind; -static const relopt_parse_elt kSelfReloptTab[] = { - {"compresslevel", RELOPT_TYPE_INT, offsetof(PaxOptions, compress_level)}, - {"compresstype", RELOPT_TYPE_STRING, offsetof(PaxOptions, compress_type)}, - {"storage_format", RELOPT_TYPE_STRING, - offsetof(PaxOptions, storage_format)}, -}; +#define PAX_SCAN_REUSE_BUFFER_DEFAULT_SIZE 8 * 1024 * 1024 +#define PAX_SCAN_REUSE_BUFFER_MIN_SIZE 1 * 1024 * 1024 +#define PAX_SCAN_REUSE_BUFFER_MAX_SIZE 32 * 1024 * 1024 // access methods that are implemented in C++ namespace pax { @@ -128,7 +116,7 @@ TableScanDesc CCPaxAccessMethod::ScanBegin(Relation relation, Snapshot snapshot, CBDB_TRY(); { return PaxScanDesc::BeginScan(relation, snapshot, nkeys, key, pscan, flags, - nullptr); + nullptr, true); } CBDB_CATCH_DEFAULT(); CBDB_END_TRY(); @@ -138,21 +126,22 @@ TableScanDesc CCPaxAccessMethod::ScanBegin(Relation relation, Snapshot snapshot, void CCPaxAccessMethod::ScanEnd(TableScanDesc scan) { CBDB_TRY(); - { PaxScanDesc::EndScan(scan); } + { + auto desc = PaxScanDesc::ToDesc(scan); + desc->EndScan(); + } CBDB_CATCH_DEFAULT(); - 
CBDB_FINALLY({ - // FIXME: destroy PaxScanDesc? - }); + CBDB_FINALLY({}); CBDB_END_TRY(); } TableScanDesc CCPaxAccessMethod::ScanExtractColumns( - Relation rel, Snapshot snapshot, ParallelTableScanDesc parallel_scan, - List *targetlist, List *qual, uint32 flags) { + Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key, + ParallelTableScanDesc parallel_scan, struct PlanState *ps, uint32 flags) { CBDB_TRY(); { - return pax::PaxScanDesc::BeginScanExtractColumns( - rel, snapshot, parallel_scan, targetlist, qual, flags); + return pax::PaxScanDesc::BeginScanExtractColumns(rel, snapshot, nkeys, key, + parallel_scan, ps, flags); } CBDB_CATCH_DEFAULT(); CBDB_FINALLY({}); @@ -160,16 +149,107 @@ TableScanDesc CCPaxAccessMethod::ScanExtractColumns( pg_unreachable(); } +struct IndexFetchTableData *CCPaxAccessMethod::IndexFetchBegin(Relation rel) { + CBDB_TRY(); + { + auto desc = PAX_NEW(rel); + return desc->ToBase(); + } + CBDB_CATCH_DEFAULT(); + CBDB_FINALLY({}); + CBDB_END_TRY(); + return nullptr; // keep compiler quiet +} + +void CCPaxAccessMethod::IndexFetchEnd(IndexFetchTableData *scan) { + CBDB_TRY(); + { + auto desc = PaxIndexScanDesc::FromBase(scan); + PAX_DELETE(desc); + } + CBDB_CATCH_DEFAULT(); + CBDB_FINALLY({}); + CBDB_END_TRY(); +} + +bool CCPaxAccessMethod::IndexFetchTuple(struct IndexFetchTableData *scan, + ItemPointer tid, Snapshot snapshot, + TupleTableSlot *slot, bool *call_again, + bool *all_dead) { + CBDB_TRY(); + { + auto desc = PaxIndexScanDesc::FromBase(scan); + return desc->FetchTuple(tid, snapshot, slot, call_again, all_dead); + } + CBDB_CATCH_DEFAULT(); + CBDB_FINALLY({}); + CBDB_END_TRY(); + return false; // keep compiler quiet +} + +void CCPaxAccessMethod::IndexFetchReset(IndexFetchTableData * /*scan*/) {} + void CCPaxAccessMethod::RelationSetNewFilenode(Relation rel, const RelFileNode *newrnode, char persistence, TransactionId *freeze_xid, MultiXactId *minmulti) { + Relation pax_tables_rel; + ScanKeyData scan_key[1]; + SysScanDesc 
scan; + HeapTuple tuple; + Oid pax_relid; + bool exists; + + *freeze_xid = *minmulti = InvalidTransactionId; + + pax_tables_rel = table_open(PAX_TABLES_RELATION_ID, RowExclusiveLock); + pax_relid = RelationGetRelid(rel); + + ScanKeyInit(&scan_key[0], ANUM_PG_PAX_TABLES_RELID, BTEqualStrategyNumber, + F_OIDEQ, ObjectIdGetDatum(pax_relid)); + scan = systable_beginscan(pax_tables_rel, PAX_TABLES_RELID_INDEX_ID, true, + NULL, 1, scan_key); + tuple = systable_getnext(scan); + exists = HeapTupleIsValid(tuple); + if (exists) { + Oid aux_relid; + + // set new filenode, not create new table + // + // 1. truncate aux table by new relfilenode + aux_relid = ::paxc::GetPaxAuxRelid(pax_relid); + Assert(OidIsValid(aux_relid)); + paxc::PaxAuxRelationSetNewFilenode(aux_relid); + } else { + // create new table + // + // 1. create aux table + // 2. initialize fast sequence in pg_pax_fastsequence + // 3. setup dependency + paxc::CPaxCreateMicroPartitionTable(rel); + } + + // initialize or reset the fast sequence number + paxc::CPaxInitializeFastSequenceEntry( + pax_relid, + exists ? 
FASTSEQUENCE_INIT_TYPE_UPDATE : FASTSEQUENCE_INIT_TYPE_CREATE); + + systable_endscan(scan); + table_close(pax_tables_rel, NoLock); + + // create relfilenode file for pax table + auto srel = RelationCreateStorage(*newrnode, persistence, SMGR_MD, rel); + smgrclose(srel); + + // create data directory CBDB_TRY(); { - *freeze_xid = *minmulti = InvalidTransactionId; - pax::CCPaxAuxTable::PaxAuxRelationSetNewFilenode(rel, newrnode, - persistence); + FileSystem *fs = pax::Singleton::GetInstance(); + auto path = cbdb::BuildPaxDirectoryPath(*newrnode, rel->rd_backend); + Assert(!path.empty()); + CBDB_CHECK((fs->CreateDirectory(path) == 0), + cbdb::CException::ExType::kExTypeIOError); } CBDB_CATCH_DEFAULT(); CBDB_FINALLY({}); @@ -233,12 +313,14 @@ void CCPaxAccessMethod::RelationFileUnlink(RelFileNodeBackend rnode) { CBDB_END_TRY(); } -void CCPaxAccessMethod::ScanRescan(TableScanDesc scan, ScanKey /*key*/, - bool /*set_params*/, bool /*allow_strat*/, - bool /*allow_sync*/, - bool /*allow_pagemode*/) { +void CCPaxAccessMethod::ScanRescan(TableScanDesc scan, ScanKey key, + bool set_params, bool allow_strat, + bool allow_sync, bool allow_pagemode) { CBDB_TRY(); - { pax::PaxScanDesc::ReScan(scan); } + { + auto desc = PaxScanDesc::ToDesc(scan); + desc->ReScan(key, set_params, allow_strat, allow_sync, allow_pagemode); + } CBDB_CATCH_DEFAULT(); CBDB_FINALLY({}); CBDB_END_TRY(); @@ -248,11 +330,12 @@ bool CCPaxAccessMethod::ScanGetNextSlot(TableScanDesc scan, ScanDirection /*direction*/, TupleTableSlot *slot) { CBDB_TRY(); - { return PaxScanDesc::ScanGetNextSlot(scan, slot); } + { + auto desc = PaxScanDesc::ToDesc(scan); + return desc->GetNextSlot(slot); + } CBDB_CATCH_DEFAULT(); - CBDB_FINALLY({ - // FIXME: destroy PaxScanDesc? 
- }); + CBDB_FINALLY({}); CBDB_END_TRY(); pg_unreachable(); @@ -315,11 +398,14 @@ TM_Result CCPaxAccessMethod::TupleUpdate(Relation relation, ItemPointer otid, pg_unreachable(); } -bool CCPaxAccessMethod::ScanAnalyzeNextBlock( - TableScanDesc scan, BlockNumber blockno, - BufferAccessStrategy /*bstrategy*/) { +bool CCPaxAccessMethod::ScanAnalyzeNextBlock(TableScanDesc scan, + BlockNumber blockno, + BufferAccessStrategy bstrategy) { CBDB_TRY(); - { return PaxScanDesc::ScanAnalyzeNextBlock(scan, blockno); } + { + auto desc = PaxScanDesc::ToDesc(scan); + return desc->ScanAnalyzeNextBlock(blockno, bstrategy); + } CBDB_CATCH_DEFAULT(); CBDB_FINALLY({}); CBDB_END_TRY(); @@ -327,34 +413,54 @@ bool CCPaxAccessMethod::ScanAnalyzeNextBlock( } bool CCPaxAccessMethod::ScanAnalyzeNextTuple(TableScanDesc scan, - TransactionId /*oldest_xmin*/, + TransactionId oldest_xmin, double *liverows, double *deadrows, TupleTableSlot *slot) { CBDB_TRY(); - { return PaxScanDesc::ScanAnalyzeNextTuple(scan, liverows, deadrows, slot); } + { + auto desc = PaxScanDesc::ToDesc(scan); + return desc->ScanAnalyzeNextTuple(oldest_xmin, liverows, deadrows, slot); + } CBDB_CATCH_DEFAULT(); CBDB_FINALLY({}); CBDB_END_TRY(); pg_unreachable(); } -bool CCPaxAccessMethod::ScanBitmapNextBlock(TableScanDesc /*scan*/, - TBMIterateResult * /*tbmres*/) { - NOT_IMPLEMENTED_YET; - return false; +bool CCPaxAccessMethod::ScanBitmapNextBlock(TableScanDesc scan, + TBMIterateResult *tbmres) { + CBDB_TRY(); + { + auto desc = PaxScanDesc::ToDesc(scan); + return desc->BitmapNextBlock(tbmres); + } + CBDB_CATCH_DEFAULT(); + CBDB_FINALLY({}); + CBDB_END_TRY(); + pg_unreachable(); } -bool CCPaxAccessMethod::ScanBitmapNextTuple(TableScanDesc /*scan*/, - TBMIterateResult * /*tbmres*/, - TupleTableSlot * /*slot*/) { - NOT_IMPLEMENTED_YET; - return false; +bool CCPaxAccessMethod::ScanBitmapNextTuple(TableScanDesc scan, + TBMIterateResult *tbmres, + TupleTableSlot *slot) { + CBDB_TRY(); + { + auto desc = PaxScanDesc::ToDesc(scan); + 
return desc->BitmapNextTuple(tbmres, slot); + } + CBDB_CATCH_DEFAULT(); + CBDB_FINALLY({}); + CBDB_END_TRY(); + pg_unreachable(); } bool CCPaxAccessMethod::ScanSampleNextBlock(TableScanDesc scan, SampleScanState *scanstate) { CBDB_TRY(); - { return PaxScanDesc::ScanSampleNextBlock(scan, scanstate); } + { + auto desc = PaxScanDesc::ToDesc(scan); + return desc->ScanSampleNextBlock(scanstate); + } CBDB_CATCH_DEFAULT(); CBDB_FINALLY({}); CBDB_END_TRY(); @@ -362,10 +468,13 @@ bool CCPaxAccessMethod::ScanSampleNextBlock(TableScanDesc scan, } bool CCPaxAccessMethod::ScanSampleNextTuple(TableScanDesc scan, - SampleScanState * /*scanstate*/, + SampleScanState *scanstate, TupleTableSlot *slot) { CBDB_TRY(); - { return PaxScanDesc::ScanSampleNextTuple(scan, slot); } + { + auto desc = PaxScanDesc::ToDesc(scan); + return desc->ScanSampleNextTuple(scanstate, slot); + } CBDB_CATCH_DEFAULT(); CBDB_FINALLY({}); CBDB_END_TRY(); @@ -409,9 +518,7 @@ void CCPaxAccessMethod::FinishBulkInsert(Relation relation, int options) { } void CCPaxAccessMethod::ExtDmlInit(Relation rel, CmdType operation) { - if (!RELATION_IS_PAX(rel)) { - return; - } + if (!RELATION_IS_PAX(rel)) return; CBDB_TRY(); { pax::CPaxDmlStateLocal::Instance()->InitDmlState(rel, operation); } @@ -421,9 +528,7 @@ void CCPaxAccessMethod::ExtDmlInit(Relation rel, CmdType operation) { } void CCPaxAccessMethod::ExtDmlFini(Relation rel, CmdType operation) { - if (!RELATION_IS_PAX(rel)) { - return; - } + if (!RELATION_IS_PAX(rel)) return; CBDB_TRY(); { pax::CPaxDmlStateLocal::Instance()->FinishDmlState(rel, operation); } @@ -458,29 +563,6 @@ void PaxAccessMethod::ParallelscanReinitialize( NOT_IMPLEMENTED_YET; } -struct IndexFetchTableData *PaxAccessMethod::IndexFetchBegin(Relation /*rel*/) { - NOT_SUPPORTED_YET; - return nullptr; -} - -void PaxAccessMethod::IndexFetchEnd(IndexFetchTableData * /*data*/) { - NOT_SUPPORTED_YET; -} - -void PaxAccessMethod::IndexFetchReset(IndexFetchTableData * /*data*/) { - NOT_SUPPORTED_YET; -} - 
-bool PaxAccessMethod::IndexFetchTuple(struct IndexFetchTableData * /*scan*/, - ItemPointer /*tid*/, - Snapshot /*snapshot*/, - TupleTableSlot * /*slot*/, - bool * /*call_again*/, - bool * /*all_dead*/) { - NOT_SUPPORTED_YET; - return false; -} - void PaxAccessMethod::TupleInsertSpeculative(Relation /*relation*/, TupleTableSlot * /*slot*/, CommandId /*cid*/, int /*options*/, @@ -556,10 +638,10 @@ uint64 PaxAccessMethod::RelationSize(Relation rel, ForkNumber fork_number) { if (fork_number != MAIN_FORKNUM) return 0; // Get the oid of pg_pax_blocks_xxx from pg_pax_tables - GetPaxTablesEntryAttributes(rel->rd_id, &pax_aux_oid, NULL, NULL); + pax_aux_oid = ::paxc::GetPaxAuxRelid(rel->rd_id); // Scan pg_pax_blocks_xxx to calculate size of micro partition - pax_aux_rel = heap_open(pax_aux_oid, AccessShareLock); + pax_aux_rel = table_open(pax_aux_oid, AccessShareLock); aux_tup_desc = RelationGetDescr(pax_aux_rel); aux_scan = systable_beginscan(pax_aux_rel, InvalidOid, false, NULL, 0, NULL); @@ -576,7 +658,7 @@ uint64 PaxAccessMethod::RelationSize(Relation rel, ForkNumber fork_number) { } systable_endscan(aux_scan); - heap_close(pax_aux_rel, AccessShareLock); + table_close(pax_aux_rel, AccessShareLock); return pax_size; } @@ -599,7 +681,7 @@ void PaxAccessMethod::EstimateRelSize(Relation rel, int32 * /*attr_widths*/, TupleDesc aux_tup_desc; HeapTuple aux_tup; SysScanDesc aux_scan; - uint32 total_tuples = 0; + uint64 total_tuples = 0; uint64 pax_size = 0; // Even an empty table takes at least one page, @@ -610,10 +692,10 @@ void PaxAccessMethod::EstimateRelSize(Relation rel, int32 * /*attr_widths*/, *allvisfrac = 0; // Get the oid of pg_pax_blocks_xxx from pg_pax_tables - GetPaxTablesEntryAttributes(rel->rd_id, &pax_aux_oid, NULL, NULL); + pax_aux_oid = ::paxc::GetPaxAuxRelid(rel->rd_id); // Scan pg_pax_blocks_xxx to get attributes - pax_aux_rel = heap_open(pax_aux_oid, AccessShareLock); + pax_aux_rel = table_open(pax_aux_oid, AccessShareLock); aux_tup_desc = 
RelationGetDescr(pax_aux_rel); aux_scan = systable_beginscan(pax_aux_rel, InvalidOid, false, NULL, 0, NULL); @@ -640,20 +722,122 @@ void PaxAccessMethod::EstimateRelSize(Relation rel, int32 * /*attr_widths*/, } systable_endscan(aux_scan); - heap_close(pax_aux_rel, AccessShareLock); + table_close(pax_aux_rel, AccessShareLock); *tuples = static_cast(total_tuples); *pages = RelationGuessNumberOfBlocksFromSize(pax_size); } double PaxAccessMethod::IndexBuildRangeScan( - Relation /*heap_relation*/, Relation /*index_relation*/, - IndexInfo * /*index_info*/, bool /*allow_sync*/, bool /*anyvisible*/, - bool /*progress*/, BlockNumber /*start_blockno*/, BlockNumber /*numblocks*/, - IndexBuildCallback /*callback*/, void * /*callback_state*/, - TableScanDesc /*scan*/) { - NOT_SUPPORTED_YET; - return 0.0; + Relation heap_relation, Relation index_relation, IndexInfo *index_info, + bool /*allow_sync*/, bool anyvisible, bool progress, + BlockNumber start_blockno, BlockNumber numblocks, + IndexBuildCallback callback, void *callback_state, TableScanDesc scan) { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + double reltuples = 0; + ExprState *predicate; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + Snapshot snapshot; + + bool checking_uniqueness; + bool need_unregister_snapshot; + BlockNumber previous_blkno = InvalidBlockNumber; + + Assert(OidIsValid(index_relation->rd_rel->relam)); + Assert(!IsSystemRelation(heap_relation)); + + checking_uniqueness = + (index_info->ii_Unique || index_info->ii_ExclusionOps != NULL); + // "Any visible" mode is not compatible with uniqueness checks; make sure + // only one of those is requested. 
+ (void)anyvisible; // keep compiler quiet for release version + Assert(!(anyvisible && checking_uniqueness)); + + slot = table_slot_create(heap_relation, NULL); + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = slot; + predicate = ExecPrepareQual(index_info->ii_Predicate, estate); + + if (!scan) { + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + scan = table_beginscan(heap_relation, snapshot, 0, NULL); + need_unregister_snapshot = true; + } else { + snapshot = scan->rs_snapshot; + need_unregister_snapshot = false; + } + + // FIXME: Only brin index uses partial index now. setup start_blockno + // and numblocks is too late after beginscan is called now, because + // the current micro partition is opened. The workaround is ugly to + // check and close the current micro partition and open another one. + if (start_blockno != 0 || numblocks != InvalidBlockNumber) + elog(ERROR, "PAX doesn't support partial index scan now"); + + while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { + CHECK_FOR_INTERRUPTS(); + + if (progress) { + BlockNumber blkno = pax::GetBlockNumber(slot->tts_tid); + if (previous_blkno == InvalidBlockNumber) + previous_blkno = blkno; + else if (previous_blkno != blkno) { + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, + blkno - start_blockno); + previous_blkno = blkno; + } + } + reltuples += 1; + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + /* + * In a partial index, discard tuples that don't satisfy the + * predicate. + */ + if (predicate && !ExecQual(predicate, econtext)) continue; + + /* + * For the current heap tuple, extract all the attributes we use in + * this index, and note which are null. This also performs evaluation + * of any expressions needed. 
+ */ + FormIndexDatum(index_info, slot, estate, values, isnull); + + /* + * You'd think we should go ahead and build the index tuple here, but + * some index AMs want to do further processing on the data first. So + * pass the values[] and isnull[] arrays, instead. + */ + callback(index_relation, &slot->tts_tid, values, isnull, true, + callback_state); + } + + /* Report scan progress one last time. */ + if (progress && previous_blkno != InvalidBlockNumber) + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, + previous_blkno + 1 - start_blockno); + + table_endscan(scan); + if (need_unregister_snapshot) UnregisterSnapshot(snapshot); + + ExecDropSingleTupleTableSlot(slot); + FreeExecutorState(estate); + + /* These may have been pointing to the now-gone estate */ + index_info->ii_ExpressionsState = NIL; + index_info->ii_PredicateState = NULL; + + return reltuples; +} + +bool PaxAccessMethod::IndexUniqueCheck(Relation rel, ItemPointer tid, + Snapshot snapshot, bool *all_dead) { + return paxc::IndexUniqueCheck(rel, tid, snapshot, all_dead); } void PaxAccessMethod::IndexValidateScan(Relation /*heap_relation*/, @@ -664,83 +848,107 @@ void PaxAccessMethod::IndexValidateScan(Relation /*heap_relation*/, NOT_IMPLEMENTED_YET; } -#define PAX_COPY_OPT(pax_opts_, pax_opt_name_) \ - do { \ - PaxOptions *pax_opts = reinterpret_cast(pax_opts_); \ - int pax_name_offset_ = *reinterpret_cast(pax_opts->pax_opt_name_); \ - if (pax_name_offset_) \ - strlcpy(pax_opts->pax_opt_name_, \ - reinterpret_cast(pax_opts) + pax_name_offset_, \ - sizeof(pax_opts->pax_opt_name_)); \ - } while (0) -bytea *PaxAccessMethod::Amoptions(Datum reloptions, char /*relkind*/, - bool validate) { - void *rdopts; - - rdopts = build_reloptions(reloptions, validate, self_relopt_kind, - sizeof(PaxOptions), kSelfReloptTab, - lengthof(kSelfReloptTab)); - // adjust string values - PAX_COPY_OPT(rdopts, storage_format); - PAX_COPY_OPT(rdopts, compress_type); - - return reinterpret_cast(rdopts); -} -#undef 
PAX_COPY_OPT - +// Swap data between two pax tables, but not swap oids +// 1. swap partition-spec in pg_pax_tables +// 2. swap relation content for aux table and toast void PaxAccessMethod::SwapRelationFiles(Oid relid1, Oid relid2, TransactionId frozen_xid, MultiXactId cutoff_multi) { - HeapTuple tuple1; - HeapTuple tuple2; + HeapTuple old_tuple1; + HeapTuple old_tuple2; Relation pax_rel; + TupleDesc desc; + ScanKeyData key[1]; + SysScanDesc scan; - Oid b_relid1; - Oid b_relid2; + Oid aux_relid1; + Oid aux_relid2; - pax_rel = table_open(PaxTablesRelationId, RowExclusiveLock); + pax_rel = table_open(PAX_TABLES_RELATION_ID, RowExclusiveLock); + desc = RelationGetDescr(pax_rel); - tuple1 = SearchSysCacheCopy1(PAXTABLESID, relid1); - if (!HeapTupleIsValid(tuple1)) - ereport(ERROR, (errcode(ERRCODE_UNDEFINED_SCHEMA), - errmsg("cache lookup failed with relid=%u for aux relation " - "in pg_pax_tables.", - relid1))); + // save ctid, auxrelid and partition-spec for the first pax relation + ScanKeyInit(&key[0], ANUM_PG_PAX_TABLES_RELID, BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid1)); - tuple2 = SearchSysCacheCopy1(PAXTABLESID, relid2); - if (!HeapTupleIsValid(tuple2)) - ereport(ERROR, (errcode(ERRCODE_UNDEFINED_SCHEMA), - errmsg("cache lookup failed with relid=%u for aux relation " - "in pg_pax_tables.", - relid2))); + scan = systable_beginscan(pax_rel, PAX_TABLES_RELID_INDEX_ID, true, nullptr, + 1, key); + old_tuple1 = systable_getnext(scan); + if (!HeapTupleIsValid(old_tuple1)) + ereport(ERROR, (errmsg("relid=%u is not a pax relation", relid1))); - // swap the entries - { - Form_pg_pax_tables form1; - Form_pg_pax_tables form2; + old_tuple1 = heap_copytuple(old_tuple1); + systable_endscan(scan); - int16 temp_compresslevel; - NameData temp_compresstype; + // save ctid, auxrelid and partition-spec for the second pax relation + ScanKeyInit(&key[0], ANUM_PG_PAX_TABLES_RELID, BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid2)); + scan = 
systable_beginscan(pax_rel, PAX_TABLES_RELID_INDEX_ID, true, nullptr, + 1, key); + old_tuple2 = systable_getnext(scan); + if (!HeapTupleIsValid(old_tuple2)) + ereport(ERROR, (errmsg("relid=%u is not a pax relation", relid2))); - form1 = (Form_pg_pax_tables)GETSTRUCT(tuple1); - form2 = (Form_pg_pax_tables)GETSTRUCT(tuple2); + old_tuple2 = heap_copytuple(old_tuple2); + systable_endscan(scan); - Assert(((Form_pg_pax_tables)GETSTRUCT(tuple1))->relid == relid1); - Assert(((Form_pg_pax_tables)GETSTRUCT(tuple2))->relid == relid2); + // swap the entries + { + HeapTuple tuple1; + HeapTuple tuple2; + Datum values[NATTS_PG_PAX_TABLES]; + bool nulls[NATTS_PG_PAX_TABLES]; + Datum datum; + bool isnull; + + datum = + heap_getattr(old_tuple1, ANUM_PG_PAX_TABLES_AUXRELID, desc, &isnull); + Assert(!isnull); + aux_relid1 = DatumGetObjectId(datum); + + values[ANUM_PG_PAX_TABLES_RELID - 1] = ObjectIdGetDatum(relid1); + values[ANUM_PG_PAX_TABLES_AUXRELID - 1] = datum; + nulls[ANUM_PG_PAX_TABLES_RELID - 1] = false; + nulls[ANUM_PG_PAX_TABLES_AUXRELID - 1] = false; + + datum = heap_getattr(old_tuple2, ANUM_PG_PAX_TABLES_PARTITIONSPEC, desc, + &isnull); + if (!isnull) { + auto vl = reinterpret_cast(DatumGetPointer(datum)); + vl = pg_detoast_datum_packed(vl); + values[ANUM_PG_PAX_TABLES_PARTITIONSPEC - 1] = PointerGetDatum(vl); + } + nulls[ANUM_PG_PAX_TABLES_PARTITIONSPEC - 1] = isnull; - b_relid1 = form1->blocksrelid; - b_relid2 = form2->blocksrelid; + tuple1 = heap_form_tuple(desc, values, nulls); + tuple1->t_data->t_ctid = old_tuple1->t_data->t_ctid; + tuple1->t_self = old_tuple1->t_self; + tuple1->t_tableOid = old_tuple1->t_tableOid; - memcpy(&temp_compresstype, &form1->compresstype, sizeof(NameData)); - memcpy(&form1->compresstype, &form2->compresstype, sizeof(NameData)); - memcpy(&form2->compresstype, &temp_compresstype, sizeof(NameData)); + datum = + heap_getattr(old_tuple2, ANUM_PG_PAX_TABLES_AUXRELID, desc, &isnull); + Assert(!isnull); + aux_relid2 = DatumGetObjectId(datum); + + 
values[ANUM_PG_PAX_TABLES_RELID - 1] = ObjectIdGetDatum(relid2); + values[ANUM_PG_PAX_TABLES_AUXRELID - 1] = datum; + nulls[ANUM_PG_PAX_TABLES_RELID - 1] = false; + nulls[ANUM_PG_PAX_TABLES_AUXRELID - 1] = false; + + datum = heap_getattr(old_tuple1, ANUM_PG_PAX_TABLES_PARTITIONSPEC, desc, + &isnull); + if (!isnull) { + auto vl = reinterpret_cast(DatumGetPointer(datum)); + vl = pg_detoast_datum_packed(vl); + values[ANUM_PG_PAX_TABLES_PARTITIONSPEC - 1] = PointerGetDatum(vl); + } + nulls[ANUM_PG_PAX_TABLES_PARTITIONSPEC - 1] = isnull; - temp_compresslevel = form1->compresslevel; - form1->compresslevel = form2->compresslevel; - form2->compresslevel = temp_compresslevel; - } + tuple2 = heap_form_tuple(desc, values, nulls); + tuple2->t_data->t_ctid = old_tuple2->t_data->t_ctid; + tuple2->t_self = old_tuple2->t_self; + tuple2->t_tableOid = old_tuple2->t_tableOid; - { CatalogIndexState indstate; indstate = CatalogOpenIndexes(pax_rel); @@ -753,23 +961,55 @@ void PaxAccessMethod::SwapRelationFiles(Oid relid1, Oid relid2, /* swap relation files for aux table */ { - Relation b_rel1; - Relation b_rel2; - - b_rel1 = relation_open(b_relid1, AccessExclusiveLock); - b_rel2 = relation_open(b_relid2, AccessExclusiveLock); - - swap_relation_files(b_relid1, b_relid2, false, /* target_is_pg_class */ - true, /* swap_toast_by_content */ - true, /*swap_stats */ - true, /* is_internal */ + Relation aux_rel1; + Relation aux_rel2; + ReindexParams reindex_params = {0}; + Relation toast_rel1 = nullptr; + Relation toast_rel2 = nullptr; + + aux_rel1 = relation_open(aux_relid1, AccessExclusiveLock); + aux_rel2 = relation_open(aux_relid2, AccessExclusiveLock); + + if (OidIsValid(aux_rel1->rd_rel->reltoastrelid)) + toast_rel1 = + relation_open(aux_rel1->rd_rel->reltoastrelid, AccessExclusiveLock); + if (OidIsValid(aux_rel2->rd_rel->reltoastrelid)) + toast_rel2 = + relation_open(aux_rel2->rd_rel->reltoastrelid, AccessExclusiveLock); + + swap_relation_files(aux_relid1, aux_relid2, false, /* 
target_is_pg_class */ + true, /* swap_toast_by_content */ + true, /*swap_stats */ + true, /* is_internal */ frozen_xid, cutoff_multi, NULL); - relation_close(b_rel1, NoLock); - relation_close(b_rel2, NoLock); + if (toast_rel1) relation_close(toast_rel1, NoLock); + if (toast_rel2) relation_close(toast_rel2, NoLock); + relation_close(aux_rel1, NoLock); + relation_close(aux_rel2, NoLock); + + reindex_relation(aux_relid1, 0, &reindex_params); + reindex_relation(aux_relid2, 0, &reindex_params); } } +bytea *PaxAccessMethod::AmOptions(Datum reloptions, char relkind, + bool validate) { + return paxc_default_rel_options(reloptions, relkind, validate); +} + +void PaxAccessMethod::ValidateColumnEncodingClauses(List *encoding_opts) { + paxc_validate_column_encoding_clauses(encoding_opts); +} + +List *PaxAccessMethod::TransformColumnEncodingClauses(Relation /*rel*/, + List *encoding_opts, + bool validate, + bool from_type) { + return paxc_transform_column_encoding_clauses(encoding_opts, validate, + from_type); +} + } // namespace paxc // END of C implementation @@ -789,10 +1029,11 @@ static const TableAmRoutine kPaxColumnMethods = { .parallelscan_reinitialize = paxc::PaxAccessMethod::ParallelscanReinitialize, - .index_fetch_begin = paxc::PaxAccessMethod::IndexFetchBegin, - .index_fetch_reset = paxc::PaxAccessMethod::IndexFetchReset, - .index_fetch_end = paxc::PaxAccessMethod::IndexFetchEnd, - .index_fetch_tuple = paxc::PaxAccessMethod::IndexFetchTuple, + .index_fetch_begin = pax::CCPaxAccessMethod::IndexFetchBegin, + .index_fetch_reset = pax::CCPaxAccessMethod::IndexFetchReset, + .index_fetch_end = pax::CCPaxAccessMethod::IndexFetchEnd, + .index_fetch_tuple = pax::CCPaxAccessMethod::IndexFetchTuple, + .index_unique_check = paxc::PaxAccessMethod::IndexUniqueCheck, .tuple_fetch_row_version = paxc::PaxAccessMethod::TupleFetchRowVersion, .tuple_tid_valid = paxc::PaxAccessMethod::TupleTidValid, @@ -831,8 +1072,12 @@ static const TableAmRoutine kPaxColumnMethods = { 
.scan_sample_next_block = pax::CCPaxAccessMethod::ScanSampleNextBlock, .scan_sample_next_tuple = pax::CCPaxAccessMethod::ScanSampleNextTuple, - .amoptions = paxc::PaxAccessMethod::Amoptions, + .amoptions = paxc::PaxAccessMethod::AmOptions, .swap_relation_files = paxc::PaxAccessMethod::SwapRelationFiles, + .validate_column_encoding_clauses = + paxc::PaxAccessMethod::ValidateColumnEncodingClauses, + .transform_column_encoding_clauses = + paxc::PaxAccessMethod::TransformColumnEncodingClauses, }; PG_MODULE_MAGIC; @@ -841,107 +1086,168 @@ Datum pax_tableam_handler(PG_FUNCTION_ARGS) { // NOLINT PG_RETURN_POINTER(&kPaxColumnMethods); } -static void PaxValidateStorageFormat(const char *value) { - size_t i; - static const char *storage_formats[] = { - "orc", - "ppt", - }; - - for (i = 0; i < lengthof(storage_formats); i++) { - if (strcmp(value, storage_formats[i]) == 0) return; +static object_access_hook_type prev_object_access_hook = NULL; + +static void PaxObjectAccessHook(ObjectAccessType access, Oid class_id, + Oid object_id, int sub_id, void *arg) { + Relation rel; + PartitionKey pkey; + List *part; + List *pby; + paxc::PaxOptions *options; + + if (prev_object_access_hook) + prev_object_access_hook(access, class_id, object_id, sub_id, arg); + + if (access != OAT_POST_CREATE || class_id != RelationRelationId) return; + + CommandCounterIncrement(); + rel = relation_open(object_id, RowExclusiveLock); + auto ok = ((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW) && + rel->rd_options && RelationIsPAX(rel)); + if (!ok) goto out; + + options = reinterpret_cast(rel->rd_options); + if (!options->partition_by()) { + if (options->partition_ranges()) { + elog(ERROR, "set '%s', but partition_by not specified", + options->partition_ranges()); + } + goto out; } - ereport(ERROR, (errmsg("unsupported storage format: '%s'", value))); -} -static void PaxValidateCompresstype(const char *value) { - size_t i; - static const char *compress_types[] = { - 
"none", - "zlib", - }; + pby = paxc_raw_parse(options->partition_by()); + pkey = paxc::PaxRelationBuildPartitionKey(rel, pby); + if (pkey->partnatts > 1) elog(ERROR, "pax only support 1 partition key now"); - for (i = 0; i < lengthof(compress_types); i++) { - if (strcmp(value, compress_types[i]) == 0) return; - } - ereport(ERROR, (errmsg("unsupported compress type: '%s'", value))); -} + part = lappend(NIL, pby); + if (options->partition_ranges()) { + List *ranges; -static shmem_startup_hook_type prev_shmem_startup_hook = NULL; -static ExecutorStart_hook_type prev_executor_start = NULL; -static ExecutorEnd_hook_type prev_executor_end = NULL; -static uint32 executor_run_ref_count = 0; + ranges = paxc_parse_partition_ranges(options->partition_ranges()); + ranges = paxc::PaxValidatePartitionRanges(rel, pkey, ranges); + part = lappend(part, ranges); + } + // Currently, partition_ranges must be set to partition pax tables. + // We hope this option be removed and automatically partition data set. 
+ else + elog(ERROR, "partition_ranges must be set for partition_by='%s'", + options->partition_by()); -void PaxShmemInit() { - if (prev_shmem_startup_hook) prev_shmem_startup_hook(); + ::paxc::PaxInitializePartitionSpec(rel, reinterpret_cast(part)); - paxc::paxc_shmem_startup(); +out: + relation_close(rel, NoLock); } -static void PaxExecutorStart(QueryDesc *query_desc, int eflags) { - if (prev_executor_start) - prev_executor_start(query_desc, eflags); - else - standard_ExecutorStart(query_desc, eflags); - - executor_run_ref_count++; +static void DefineGUCs() { + DefineCustomBoolVariable("pax_enable_debug", "enable pax debug", NULL, + &pax::pax_enable_debug, true, PGC_USERSET, 0, NULL, + NULL, NULL); + + DefineCustomBoolVariable("pax_enable_filter", "enable pax filter", NULL, + &pax::pax_enable_filter, true, PGC_USERSET, 0, NULL, + NULL, NULL); + + DefineCustomIntVariable( + "pax_max_tuples_per_group", + "the default value for the limit on the number of tuples in a group", + NULL, &pax::pax_max_tuples_per_group, VEC_BATCH_LENGTH, 0, + VEC_BATCH_LENGTH * 100, PGC_USERSET, 0, NULL, NULL, NULL); + +#ifdef ENABLE_PLASMA + DefineCustomBoolVariable( + "pax_enable_plasma", "Enable plasma cache the set of columns", NULL, + &pax::pax_enable_plasma_in_mem, true, PGC_USERSET, 0, NULL, NULL, NULL); +#endif + + DefineCustomIntVariable( + "pax_scan_reuse_buffer_size", "set the reuse buffer size", NULL, + &pax::pax_scan_reuse_buffer_size, PAX_SCAN_REUSE_BUFFER_DEFAULT_SIZE, + PAX_SCAN_REUSE_BUFFER_MIN_SIZE, PAX_SCAN_REUSE_BUFFER_MAX_SIZE, + PGC_USERSET, 0, NULL, NULL, NULL); } -static void PaxExecutorEnd(QueryDesc *query_desc) { - if (prev_executor_end) - prev_executor_end(query_desc); - else - standard_ExecutorEnd(query_desc); +struct PaxObjectProperty { + const char *name; + Oid class_oid; + Oid index_oid; + AttrNumber attnum_oid; +}; - executor_run_ref_count--; - Assert(executor_run_ref_count >= 0); - if (executor_run_ref_count == 0) { - paxc::release_command_resource(); - } 
-} +static const struct PaxObjectProperty kPaxObjectProperties[] = { + {"fast-sequence", PAX_FASTSEQUENCE_OID, PAX_FASTSEQUENCE_INDEX_OID, + ANUM_PG_PAX_FAST_SEQUENCE_OBJID}, + {"pg_pax_tables", PAX_TABLES_RELATION_ID, PAX_TABLES_RELID_INDEX_ID, + ANUM_PG_PAX_TABLES_RELID}, + // add pg_pax_tables here +}; -static void PaxXactCallback(XactEvent event, void * /*arg*/) { - if (event == XACT_EVENT_COMMIT || event == XACT_EVENT_ABORT || - event == XACT_EVENT_PARALLEL_ABORT || - event == XACT_EVENT_PARALLEL_COMMIT) { - if (executor_run_ref_count > 0) { - executor_run_ref_count = 0; - paxc::release_command_resource(); - } +static const struct PaxObjectProperty *FindPaxObjectProperty(Oid class_id) { + for (const auto &property : kPaxObjectProperties) { + const auto p = &property; + if (p->class_oid == class_id) return p; } + return NULL; } -void _PG_init(void) { // NOLINT - if (!process_shared_preload_libraries_in_progress) { - ereport(ERROR, (errmsg("pax must be loaded via shared_preload_libraries"))); - return; - } +static void PaxDeleteObject(struct CustomObjectClass * /*self*/, + const ObjectAddress *object, int /*flags*/) { + Relation rel; + HeapTuple tup; + SysScanDesc scan; + ScanKeyData skey[1]; + + const auto object_property = FindPaxObjectProperty(object->classId); + Assert(object_property); + Assert(object_property->class_oid == object->classId); + + rel = table_open(object->classId, RowExclusiveLock); + ScanKeyInit(&skey[0], object_property->attnum_oid, BTEqualStrategyNumber, + F_OIDEQ, ObjectIdGetDatum(object->objectId)); + + scan = + systable_beginscan(rel, object_property->index_oid, true, NULL, 1, skey); - paxc::paxc_shmem_request(); + /* we expect exactly one match */ + tup = systable_getnext(scan); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "could not find tuple for %s %u", object_property->name, + object->objectId); - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = PaxShmemInit; + CatalogTupleDelete(rel, &tup->t_self); - 
prev_executor_start = ExecutorStart_hook; - ExecutorStart_hook = PaxExecutorStart; + systable_endscan(scan); - prev_executor_end = ExecutorEnd_hook; - ExecutorEnd_hook = PaxExecutorEnd; + table_close(rel, RowExclusiveLock); +} + +static struct CustomObjectClass pax_fastsequence_coc = { + .class_id = PAX_FASTSEQUENCE_OID, + .do_delete = PaxDeleteObject, +}; + +static struct CustomObjectClass pax_tables_coc = { + .class_id = PAX_TABLES_RELATION_ID, + .do_delete = PaxDeleteObject, +}; + +void _PG_init(void) { // NOLINT + prev_object_access_hook = object_access_hook; + object_access_hook = PaxObjectAccessHook; ext_dml_init_hook = pax::CCPaxAccessMethod::ExtDmlInit; ext_dml_finish_hook = pax::CCPaxAccessMethod::ExtDmlFini; file_unlink_hook = pax::CCPaxAccessMethod::RelationFileUnlink; - RegisterXactCallback(PaxXactCallback, NULL); - - self_relopt_kind = add_reloption_kind(); - add_string_reloption(self_relopt_kind, "storage_format", "pax storage format", - "orc", PaxValidateStorageFormat, AccessExclusiveLock); - add_string_reloption(self_relopt_kind, "compresstype", "pax compress type", - PAX_DEFAULT_COMPRESSTYPE, PaxValidateCompresstype, - AccessExclusiveLock); - add_int_reloption(self_relopt_kind, "compresslevel", "pax compress level", - PAX_DEFAULT_COMPRESSLEVEL, AO_MIN_COMPRESSLEVEL, - AO_MAX_COMPRESSLEVEL, AccessExclusiveLock); + register_custom_object_class(&pax_fastsequence_coc); + register_custom_object_class(&pax_tables_coc); + + DefineGUCs(); + + RegisterResourceReleaseCallback(paxc::FdHandleAbortCallback, NULL); + + paxc::paxc_reg_rel_options(); } } // extern "C" diff --git a/contrib/pax_storage/src/cpp/access/pax_access_handle.h b/contrib/pax_storage/src/cpp/access/pax_access_handle.h index 2bca0ba15a7..d88afbb2917 100644 --- a/contrib/pax_storage/src/cpp/access/pax_access_handle.h +++ b/contrib/pax_storage/src/cpp/access/pax_access_handle.h @@ -3,6 +3,7 @@ #include "comm/cbdb_api.h" namespace paxc { + class PaxAccessMethod final { private: PaxAccessMethod() 
= default; @@ -22,14 +23,6 @@ class PaxAccessMethod final { static void ParallelscanReinitialize(Relation rel, ParallelTableScanDesc pscan); - /* Index Scan Callbacks, unsupported yet */ - static struct IndexFetchTableData *IndexFetchBegin(Relation rel); - static void IndexFetchEnd(struct IndexFetchTableData *data); - static void IndexFetchReset(struct IndexFetchTableData *data); - static bool IndexFetchTuple(struct IndexFetchTableData *scan, ItemPointer tid, - Snapshot snapshot, TupleTableSlot *slot, - bool *call_again, bool *all_dead); - /* Callbacks for non-modifying operations on individual tuples */ static bool TupleFetchRowVersion(Relation relation, ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot); @@ -66,15 +59,18 @@ class PaxAccessMethod final { bool allow_sync, bool anyvisible, bool progress, BlockNumber start_blockno, BlockNumber numblocks, IndexBuildCallback callback, void *callback_state, TableScanDesc scan); + static bool IndexUniqueCheck(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead); static void IndexValidateScan(Relation heap_relation, Relation index_relation, IndexInfo *index_info, Snapshot snapshot, ValidateIndexState *state); - - static bytea *Amoptions(Datum reloptions, char relkind, bool validate); - static void SwapRelationFiles(Oid relid1, Oid relid2, TransactionId frozen_xid, MultiXactId cutoff_multi); + + static bytea *AmOptions(Datum reloptions, char relkind, bool validate); + static void ValidateColumnEncodingClauses(List *encoding_opts); + static List *TransformColumnEncodingClauses(Relation rel, List *encoding_opts, + bool validate, bool from_type); }; } // namespace paxc @@ -96,9 +92,17 @@ class CCPaxAccessMethod final { TupleTableSlot *slot); static TableScanDesc ScanExtractColumns(Relation rel, Snapshot snapshot, + int nkeys, struct ScanKeyData *key, ParallelTableScanDesc parallel_scan, - List *targetlist, List *qual, - uint32 flags); + struct PlanState *ps, uint32 flags); + + /* Index Scan Callbacks */ + 
static struct IndexFetchTableData *IndexFetchBegin(Relation rel); + static void IndexFetchEnd(struct IndexFetchTableData *scan); + static void IndexFetchReset(struct IndexFetchTableData *scan); + static bool IndexFetchTuple(struct IndexFetchTableData *scan, ItemPointer tid, + Snapshot snapshot, TupleTableSlot *slot, + bool *call_again, bool *all_dead); /* Manipulations of physical tuples. */ static void TupleInsert(Relation relation, TupleTableSlot *slot, @@ -162,11 +166,3 @@ class CCPaxAccessMethod final { extern ext_dml_func_hook_type ext_dml_init_hook; extern ext_dml_func_hook_type ext_dml_finish_hook; - -// plain structure used by reloptions, can be accessed from C++ code. -struct PaxOptions { - int32 vl_len; /* varlena header (do not touch directly!) */ - char storage_format[16]; - char compress_type[16]; - int compress_level; -}; diff --git a/contrib/pax_storage/src/cpp/access/pax_deleter.cc b/contrib/pax_storage/src/cpp/access/pax_deleter.cc index 594bc221eb0..3af63a7e6bb 100644 --- a/contrib/pax_storage/src/cpp/access/pax_deleter.cc +++ b/contrib/pax_storage/src/cpp/access/pax_deleter.cc @@ -5,18 +5,15 @@ #include #include "access/pax_dml_state.h" +#include "catalog/pax_aux_table.h" #include "comm/singleton.h" #include "storage/pax_itemptr.h" -#include "storage/paxc_block_map_manager.h" namespace pax { -CPaxDeleter::CPaxDeleter(const Relation rel, const Snapshot snapshot) +CPaxDeleter::CPaxDeleter(Relation rel, Snapshot snapshot) : rel_(rel), snapshot_(snapshot) {} -CPaxDeleter::~CPaxDeleter() = default; - -TM_Result CPaxDeleter::DeleteTuple(const Relation relation, - const ItemPointer tid, const CommandId cid, - const Snapshot snapshot, +TM_Result CPaxDeleter::DeleteTuple(Relation relation, ItemPointer tid, + CommandId cid, Snapshot snapshot, TM_FailureData *tmfd) { CPaxDeleter *deleter = CPaxDmlStateLocal::Instance()->GetDeleter(relation, snapshot); @@ -29,63 +26,64 @@ TM_Result CPaxDeleter::DeleteTuple(const Relation relation, } return result; } +// used 
for delete tuples +TM_Result CPaxDeleter::MarkDelete(ItemPointer tid) { + uint32 tuple_offset = pax::GetTupleOffset(*tid); -TM_Result CPaxDeleter::MarkDelete(const ItemPointer tid) { - PaxItemPointer pax_tid(reinterpret_cast(tid)); - uint8 table_no = pax_tid.GetTableNo(); - uint32 block_number = pax_tid.GetBlockNumber(); - uint32 tuple_number = pax_tid.GetTupleNumber(); - - std::string block_id = - cbdb::GetBlockId(rel_->rd_id, table_no, block_number).ToStr(); + std::string block_id = MapToBlockNumber(rel_, *tid); if (block_bitmap_map_.find(block_id) == block_bitmap_map_.end()) { - // TODO(gongxun): bitmap should support dynamic raise size block_bitmap_map_[block_id] = - std::unique_ptr(new DynamicBitmap()); // NOLINT - } - DynamicBitmap *bitmap = block_bitmap_map_[block_id].get(); - if (bitmap->NumBits() <= tuple_number) { - bitmap->Resize(bitmap->NumBits() * 2); + pax_unique_ptr(PAX_NEW()); // NOLINT + cbdb::DeleteMicroPartitionEntry(RelationGetRelid(rel_), snapshot_, + block_id); } - - if (bitmap->Test(tuple_number)) { + auto bitmap = block_bitmap_map_[block_id].get(); + if (bitmap->Test(tuple_offset)) { return TM_SelfModified; } - - bitmap->Set(tuple_number); + bitmap->Set(tuple_offset); return TM_Ok; } -void CPaxDeleter::ExecDelete() { - if (block_bitmap_map_.empty()) { - return; +// used for merge remaining partition files, no tuple needs to delete +void CPaxDeleter::MarkDelete(BlockNumber pax_block_id) { + std::string block_id = std::to_string(pax_block_id); + + if (block_bitmap_map_.find(block_id) == block_bitmap_map_.end()) { + block_bitmap_map_[block_id] = pax_unique_ptr(PAX_NEW()); + cbdb::DeleteMicroPartitionEntry(RelationGetRelid(rel_), snapshot_, + block_id); } +} + +void CPaxDeleter::ExecDelete() { + if (block_bitmap_map_.empty()) return; - TableDeleter table_deleter(rel_, buildDeleteIterator(), + TableDeleter table_deleter(rel_, BuildDeleteIterator(), std::move(block_bitmap_map_), snapshot_); table_deleter.Delete(); } -std::unique_ptr> 
-CPaxDeleter::buildDeleteIterator() { +pax_unique_ptr> +CPaxDeleter::BuildDeleteIterator() { std::vector micro_partitions; + auto rel_path = cbdb::BuildPaxDirectoryPath(rel_->rd_node, rel_->rd_backend); for (auto &it : block_bitmap_map_) { std::string block_id = it.first; - DynamicBitmap *bitmap_ptr = it.second.get(); - BitmapIterator bitmap_it(bitmap_ptr); - int32 tuple_number = bitmap_it.Next(true); - if (tuple_number != -1) { + { pax::MicroPartitionMetadata meta_info; - meta_info.SetFileName(cbdb::BuildPaxFilePath(rel_, block_id)); + meta_info.SetFileName(cbdb::BuildPaxFilePath(rel_path, block_id)); meta_info.SetMicroPartitionId(std::move(block_id)); micro_partitions.push_back(std::move(meta_info)); } } - IteratorBase *iter = new VectorIterator(std::move(micro_partitions)); + IteratorBase *iter = + PAX_NEW>( + std::move(micro_partitions)); - return std::unique_ptr>(iter); + return pax_unique_ptr>(iter); } } // namespace pax diff --git a/contrib/pax_storage/src/cpp/access/pax_deleter.h b/contrib/pax_storage/src/cpp/access/pax_deleter.h index 7d94ffe6efd..0312e63a742 100644 --- a/contrib/pax_storage/src/cpp/access/pax_deleter.h +++ b/contrib/pax_storage/src/cpp/access/pax_deleter.h @@ -7,27 +7,26 @@ #include #include "comm/bitmap.h" +#include "comm/pax_memory.h" #include "storage/pax.h" namespace pax { class CPaxDeleter { public: - explicit CPaxDeleter(const Relation rel, const Snapshot snapshot); - - static TM_Result DeleteTuple(const Relation relation, const ItemPointer tid, - const CommandId cid, const Snapshot snapshot, + explicit CPaxDeleter(Relation rel, Snapshot snapshot); + ~CPaxDeleter() = default; + static TM_Result DeleteTuple(Relation relation, ItemPointer tid, + CommandId cid, Snapshot snapshot, TM_FailureData *tmfd); - TM_Result MarkDelete(const ItemPointer tid); - - ~CPaxDeleter(); - + TM_Result MarkDelete(ItemPointer tid); + void MarkDelete(BlockNumber pax_block_id); void ExecDelete(); private: - std::unique_ptr> buildDeleteIterator(); - std::map> 
block_bitmap_map_; - const Relation rel_; - const Snapshot snapshot_; + std::unique_ptr> BuildDeleteIterator(); + std::map> block_bitmap_map_; + Relation rel_; + Snapshot snapshot_; }; // class CPaxDeleter } // namespace pax diff --git a/contrib/pax_storage/src/cpp/access/pax_dml_state.cc b/contrib/pax_storage/src/cpp/access/pax_dml_state.cc index 335eb659197..91cd996339f 100644 --- a/contrib/pax_storage/src/cpp/access/pax_dml_state.cc +++ b/contrib/pax_storage/src/cpp/access/pax_dml_state.cc @@ -8,10 +8,6 @@ void CPaxDmlStateLocal::DmlStateResetCallback(void * /*arg*/) { } void CPaxDmlStateLocal::InitDmlState(Relation rel, CmdType operation) { - if (operation == CMD_UPDATE || operation == CMD_DELETE) { - cbdb::InitCommandResource(); - } - if (!dml_descriptor_tab_) { HASHCTL hash_ctl; Assert(!cbdb::pax_memory_context); @@ -44,7 +40,7 @@ void CPaxDmlStateLocal::FinishDmlState(Relation rel, CmdType /*operation*/) { // TODO(gongxun): deleter finish state->deleter->ExecDelete(); - delete state->deleter; + PAX_DELETE(state->deleter); state->deleter = nullptr; // FIXME: it's update operation, maybe we should do something here } @@ -55,7 +51,7 @@ void CPaxDmlStateLocal::FinishDmlState(Relation rel, CmdType /*operation*/) { old_ctx = MemoryContextSwitchTo(cbdb::pax_memory_context); state->inserter->FinishInsert(); - delete state->inserter; + PAX_DELETE(state->inserter); state->inserter = nullptr; MemoryContextSwitchTo(old_ctx); } @@ -66,7 +62,7 @@ CPaxInserter *CPaxDmlStateLocal::GetInserter(Relation rel) { state = FindDmlState(cbdb::RelationGetRelationId(rel)); // TODO(gongxun): switch memory context?? if (state->inserter == nullptr) { - state->inserter = new CPaxInserter(rel); + state->inserter = PAX_NEW(rel); } return state->inserter; } @@ -76,7 +72,7 @@ CPaxDeleter *CPaxDmlStateLocal::GetDeleter(Relation rel, Snapshot snapshot) { state = FindDmlState(cbdb::RelationGetRelationId(rel)); // TODO(gongxun): switch memory context?? 
if (state->deleter == nullptr) { - state->deleter = new CPaxDeleter(rel, snapshot); + state->deleter = PAX_NEW(rel, snapshot); } return state->deleter; } diff --git a/contrib/pax_storage/src/cpp/access/pax_inserter.cc b/contrib/pax_storage/src/cpp/access/pax_inserter.cc index 2584efdb3d9..68d1da880b6 100644 --- a/contrib/pax_storage/src/cpp/access/pax_inserter.cc +++ b/contrib/pax_storage/src/cpp/access/pax_inserter.cc @@ -4,18 +4,32 @@ #include #include "access/pax_dml_state.h" -#include "catalog/micro_partition_stats.h" +#include "access/pax_partition.h" +#include "access/paxc_rel_options.h" #include "catalog/pax_aux_table.h" #include "comm/cbdb_wrappers.h" +#include "storage/micro_partition_stats.h" #include "storage/strategy.h" namespace pax { -CPaxInserter::CPaxInserter(Relation rel) : rel_(rel), insert_count_(0) { - writer_ = new TableWriter(rel); - writer_->SetWriteSummaryCallback(&cbdb::AddMicroPartitionEntry) - ->SetFileSplitStrategy(new PaxDefaultSplitStrategy()) - ->SetStatsCollector(new MicroPartitionStats()) +CPaxInserter::CPaxInserter(Relation rel) + : rel_(rel), insert_count_(0), part_obj_(nullptr), writer_(nullptr) { + part_obj_ = PAX_NEW(); + auto ok = part_obj_->Initialize(rel_); + if (ok) { + writer_ = PAX_NEW(rel, part_obj_); + } else { + // fallback to TableWriter + writer_ = PAX_NEW(rel); + part_obj_->Release(); + PAX_DELETE(part_obj_); + part_obj_ = nullptr; + } + + writer_->SetWriteSummaryCallback(&cbdb::InsertOrUpdateMicroPartitionEntry) + ->SetFileSplitStrategy(PAX_NEW()) + ->SetStatsCollector(PAX_NEW()) ->Open(); } @@ -29,8 +43,7 @@ void CPaxInserter::InsertTuple(Relation relation, TupleTableSlot *slot, slot_getallattrs(slot); } - CTupleSlot cslot(slot); - writer_->WriteTuple(&cslot); + writer_->WriteTuple(slot); } void CPaxInserter::MultiInsert(Relation relation, TupleTableSlot **slots, @@ -51,8 +64,14 @@ void CPaxInserter::FinishBulkInsert(Relation relation, int /*options*/) { void CPaxInserter::FinishInsert() { writer_->Close(); - 
delete writer_; + PAX_DELETE(writer_); writer_ = nullptr; + + if (part_obj_) { + part_obj_->Release(); + PAX_DELETE(part_obj_); + part_obj_ = nullptr; + } } void CPaxInserter::TupleInsert(Relation relation, TupleTableSlot *slot, diff --git a/contrib/pax_storage/src/cpp/access/pax_inserter.h b/contrib/pax_storage/src/cpp/access/pax_inserter.h index abd191981bf..92300769755 100644 --- a/contrib/pax_storage/src/cpp/access/pax_inserter.h +++ b/contrib/pax_storage/src/cpp/access/pax_inserter.h @@ -4,9 +4,9 @@ #include "storage/micro_partition_metadata.h" #include "storage/pax.h" - +#include "storage/pax_table_partition_writer.h" namespace pax { - +class PartitionObject; class CPaxInserter { public: explicit CPaxInserter(Relation rel); @@ -29,6 +29,7 @@ class CPaxInserter { Relation rel_; uint32 insert_count_; + PartitionObject *part_obj_; TableWriter *writer_; }; // class CPaxInserter diff --git a/contrib/pax_storage/src/cpp/access/pax_partition.cc b/contrib/pax_storage/src/cpp/access/pax_partition.cc new file mode 100644 index 00000000000..9533677a969 --- /dev/null +++ b/contrib/pax_storage/src/cpp/access/pax_partition.cc @@ -0,0 +1,745 @@ +#include "access/pax_partition.h" + +#include "comm/cbdb_api.h" + +#include "access/pax_access_handle.h" +#include "catalog/pg_pax_tables.h" +#include "comm/cbdb_wrappers.h" + +namespace paxc { +// support optional `EVERY` syntax: +// FROM(start_value) TO(end_value) [ EVERY(interval) ] +struct PaxPartitionEveryIterator { + PartitionKey key; + Datum from_value; + Datum to_value; + + ExprState *plus_expr_state; + ParamListInfo plus_expr_params; + EState *estate; + + Datum current_start; + Datum current_end; + bool ended; + + ParseState *pstate; +}; + +static int PartitionCheckBound(PartitionKey key, PartitionBoundSpec *spec); + +static void PaxPartitionDestroyEveryIterator( + struct PaxPartitionEveryIterator *iter) { + if (iter->estate) FreeExecutorState(iter->estate); + pfree(iter); +} + +// See the implementation in 
PartEveryIterator +static struct PaxPartitionEveryIterator *PaxPartitionInitEveryIterator( + ParseState *pstate, PartitionKey key, Node *from, Node *to, Node *every) { + Assert(key->partnatts == 1); + auto part_col_typid = get_partition_col_typid(key, 0); + auto part_col_typmod = get_partition_col_typmod(key, 0); + auto part_col_collation = get_partition_col_collation(key, 0); + Datum from_value; + Datum to_value; + Const *c; + + auto iter = + (PaxPartitionEveryIterator *)palloc0(sizeof(PaxPartitionEveryIterator)); + Assert(from && to && every); + + c = castNode(Const, from); + if (c->constisnull) + elog(ERROR, "can't use NULL with range partition specification"); + from_value = c->constvalue; + + c = castNode(Const, to); + if (c->constisnull) + elog(ERROR, "can't use NULL with range partition specification"); + to_value = c->constvalue; + + auto param = makeNode(Param); + param->paramid = 1; + param->paramtype = part_col_typid; + param->paramtypmod = part_col_typmod; + param->paramcollid = part_col_collation; + param->location = -1; + + auto plus_expr = (Node *)make_op( + pstate, + list_make2(makeString((char *)"pg_catalog"), makeString((char *)"+")), + (Node *)param, (Node *)every, pstate->p_last_srf, -1); + + if (IsA(plus_expr, CollateExpr)) { + auto expr_collation = exprCollation(plus_expr); + if (OidIsValid(expr_collation) && expr_collation != part_col_collation) + elog(ERROR, + "collation of partition bound value for column %d doesn't match " + "partition key collation \"%s\"", + get_partition_col_attnum(key, 0), + get_collation_name(part_col_collation)); + } + plus_expr = coerce_to_target_type( + pstate, plus_expr, exprType(plus_expr), part_col_typid, part_col_typmod, + COERCION_ASSIGNMENT, COERCE_IMPLICIT_CAST, -1); + if (plus_expr == NULL) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("specified value cannot be cast to type %s for column %d", + format_type_be(part_col_typid), + get_partition_col_attnum(key, 0)))); + + iter->key = key; 
+ iter->from_value = from_value; + iter->to_value = to_value; + + iter->plus_expr_params = makeParamList(1); + iter->plus_expr_params->params[0].value = (Datum)0; + iter->plus_expr_params->params[0].isnull = true; + iter->plus_expr_params->params[0].pflags = 0; + iter->plus_expr_params->params[0].ptype = part_col_typid; + iter->estate = CreateExecutorState(); + iter->estate->es_param_list_info = iter->plus_expr_params; + + iter->plus_expr_state = + ExecInitExprWithParams((Expr *)plus_expr, iter->plus_expr_params); + + iter->current_end = iter->from_value; + iter->current_start = (Datum)0; + iter->ended = false; + + iter->pstate = pstate; + + return iter; +} + +static List *PaxPartitionBuildDatums(PartitionKey key, Datum *datums) { + List *result = NIL; + for (int i = 0; i < key->partnatts; i++) { + Const *c; + PartitionRangeDatum *prd; + c = makeConst( + key->parttypid[i], key->parttypmod[i], key->parttypcoll[i], + key->parttyplen[i], + datumCopy(datums[i], key->parttypbyval[i], key->parttyplen[i]), false, + key->parttypbyval[i]); + + prd = makeNode(PartitionRangeDatum); + prd->kind = PARTITION_RANGE_DATUM_VALUE; + prd->value = (Node *)c; + result = lappend(result, prd); + } + return result; +} + +static PartitionBoundSpec *PaxPartitionNextPartBound( + struct PaxPartitionEveryIterator *iter) { + if (iter->ended) return nullptr; + + bool isnull; + + iter->plus_expr_params->params[0].isnull = false; + iter->plus_expr_params->params[0].value = iter->current_end; + + auto next_start = iter->current_end; + auto next_end = ExecEvalExprSwitchContext( + iter->plus_expr_state, GetPerTupleExprContext(iter->estate), &isnull); + if (isnull) + ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg("could not compute next partition boundary with " + "EVERY, plus-operator returned NULL"), + parser_errposition(iter->pstate, -1))); + + auto cmpval = DatumGetInt32(FunctionCall2Coll(&iter->key->partsupfunc[0], + iter->key->partcollation[0], + next_end, iter->to_value)); 
+ if (cmpval >= 0) { + iter->ended = true; + next_end = iter->to_value; + } + // sanity check in case next_start >= next_end + cmpval = DatumGetInt32(FunctionCall2Coll(&iter->key->partsupfunc[0], + iter->key->partcollation[0], + next_start, next_end)); + if (cmpval >= 0) elog(ERROR, "invalid range bound with EVERY"); + + iter->current_start = next_start; + iter->current_end = next_end; + + // build PartitionBoundSpec for [iter->current_start, iter->current_end) + PartitionBoundSpec *boundspec; + + boundspec = makeNode(PartitionBoundSpec); + boundspec->strategy = PARTITION_STRATEGY_RANGE; + boundspec->is_default = false; + boundspec->lowerdatums = + PaxPartitionBuildDatums(iter->key, &iter->current_start); + boundspec->upperdatums = + PaxPartitionBuildDatums(iter->key, &iter->current_end); + + return boundspec; +} + +static Node *GetConstValue(List *datums) { + auto prd = (PartitionRangeDatum *)linitial(datums); + Assert(IsA(prd, PartitionRangeDatum)); + Assert(prd->kind == PARTITION_RANGE_DATUM_VALUE); + + auto c = (Const *)prd->value; + Assert(c && IsA(c, Const) && !c->constisnull); + return (Node *)c; +} + +// generate a list of partition bound specs +static List *TransformPartitionExtension(ParseState *pstate, Relation relation, + PartitionKey key, + PartitionRangeExtension *range_ext) { + List *result = NIL; + PartitionBoundSpec *range; + + auto every = range_ext->every; + auto spec = transformPartitionBound(pstate, relation, key, &range_ext->spec); + if (!every) return list_make1(spec); + + if (PartitionCheckBound(key, spec) >= 0) + elog(ERROR, "invalid range bound: from %s to %s every(X)", + get_range_partbound_string(spec->lowerdatums), + get_range_partbound_string(spec->upperdatums)); + + // calculate partition by every expression + if (key->partnatts != 1 || key->partnatts != list_length(every)) + elog(ERROR, "pax partition EVERY only support one column"); + + auto ev = (Node *)linitial(every); + auto iter = PaxPartitionInitEveryIterator( + pstate, key, 
GetConstValue(spec->lowerdatums), + GetConstValue(spec->upperdatums), + (Node *)transformExpr(pstate, ev, EXPR_KIND_PARTITION_BOUND)); + + while ((range = PaxPartitionNextPartBound(iter))) { + result = lappend(result, range); + } + PaxPartitionDestroyEveryIterator(iter); + return result; +} + +static bool PaxLoadPartitionSpec(Oid relid, List **partparams_list, + List **partboundspec_list) { + Node *part; + List *list; + + ::paxc::GetPaxTablesEntryAttributes(relid, NULL, &part); + if (!part) return false; + + list = castNode(List, part); + Assert(list_length(list) == 2); + *partparams_list = castNode(List, list_nth(list, 0)); + *partboundspec_list = castNode(List, list_nth(list, 1)); + return true; +} + +static inline PartitionRangeDatumKind RangeDatumToKind(List *datums, int i) { + PartitionRangeDatum *rd = castNode(PartitionRangeDatum, list_nth(datums, i)); + return rd->kind; +} +static inline Datum RangeDatumToValue(List *datums, int i) { + PartitionRangeDatum *rd = castNode(PartitionRangeDatum, list_nth(datums, i)); + Const *c = castNode(Const, rd->value); + Assert(c && !c->constisnull); + return c->constvalue; +} +// Reference: partition_rbound_cmp() +int PartitionComparePartitionKeys(PartitionKey key, List *datums1, + List *datums2) { + Assert(key->partnatts == list_length(datums1)); + Assert(key->partnatts == list_length(datums2)); + FmgrInfo *partsupfunc = key->partsupfunc; + Oid *partcollation = key->partcollation; + int natts = key->partnatts; + int i; + int32 colnum = 0; + int32 cmpval = 0; + for (i = 0; i < natts; i++) { + colnum++; + auto kind1 = RangeDatumToKind(datums1, i); + auto kind2 = RangeDatumToKind(datums2, i); + + if (kind1 < kind2) return -colnum; + if (kind1 > kind2) return colnum; + if (kind1 != PARTITION_RANGE_DATUM_VALUE) { + /* + * The column bounds are both MINVALUE or both MAXVALUE. No later + * columns should be considered, but we still need to compare + * whether they are upper or lower bounds. 
+ */ + break; + } + cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[i], partcollation[i], + RangeDatumToValue(datums1, i), + RangeDatumToValue(datums2, i))); + if (cmpval != 0) break; + } + return cmpval == 0 ? 0 : (cmpval < 0 ? -colnum : colnum); +} + +static int PartitionCheckBound(PartitionKey key, PartitionBoundSpec *spec) { + return PartitionComparePartitionKeys(key, spec->lowerdatums, + spec->upperdatums); +} + +int PartitionBoundSpecCmp(const ListCell *a, const ListCell *b, void *arg) { + auto spec1 = lfirst_node(PartitionBoundSpec, a); + auto spec2 = lfirst_node(PartitionBoundSpec, b); + auto key = static_cast(arg); + return PartitionComparePartitionKeys(key, spec1->lowerdatums, + spec2->lowerdatums); +} + +bool PartitionCheckBounds(PartitionKey key, List *spec_list) { + ListCell *lc; + int i; + int nparts = list_length(spec_list); + bool ok = true; + + Assert(nparts > 0); + + // self bound check + foreach (lc, spec_list) { + auto spec = lfirst_node(PartitionBoundSpec, lc); + + if (spec->strategy != key->strategy) + elog(ERROR, "strategy not match with partition key"); + if (spec->is_default) elog(ERROR, "unexpected default partition"); + if (list_length(spec->lowerdatums) != key->partnatts) + elog(ERROR, + "number of lower bound values mismatches the number of partition " + "keys"); + if (list_length(spec->upperdatums) != key->partnatts) + elog(ERROR, + "number of upper bound values mismatches the number of partition " + "keys"); + + ok = PartitionCheckBound(key, spec) < 0; + if (!ok) goto out; + } + + // cross bound check, only checks whether prev.upper <= cur.lower + list_sort_arg(spec_list, PartitionBoundSpecCmp, key); + for (i = 1; i < nparts; i++) { + auto spec1 = castNode(PartitionBoundSpec, list_nth(spec_list, i - 1)); + auto spec2 = castNode(PartitionBoundSpec, list_nth(spec_list, i)); + // the upper value should be less than or equal to the lower value of the + // next part + ok = PartitionComparePartitionKeys(key, spec1->upperdatums, + 
spec2->lowerdatums) <= 0; + if (!ok) break; + } +out: + return ok; +} + +List *PaxValidatePartitionRanges(Relation relation, PartitionKey key, + List *raw_partbound_list) { + ParseState *pstate = make_parsestate(NULL); + List *spec_list = NIL; + int nparts; + bool ok; + + nparts = list_length(raw_partbound_list); + Assert(nparts > 0); + + for (int i = 0; i < nparts; i++) { + auto spec = + static_cast(list_nth(raw_partbound_list, i)); + Assert(IsA(spec, PartitionBoundSpec)); + auto part_list = TransformPartitionExtension(pstate, relation, key, spec); + spec_list = list_concat(spec_list, part_list); + pfree(part_list); + } + + // check whether bounds overlaps + ok = paxc::PartitionCheckBounds(key, spec_list); + if (!ok) elog(ERROR, "partition bounds have overlaps"); + + list_free_deep(raw_partbound_list); + free_parsestate(pstate); + + return spec_list; +} + +// Reference: RelationBuildPartitionKey +PartitionKey PaxRelationBuildPartitionKey(Relation relation, + List *partparams_list) { + int i; + PartitionKey key; + Oid *partopclass; + ListCell *partexprs_item; + int16 procnum; + + Assert(RelationIsPAX(relation)); + + key = (PartitionKey)palloc0(sizeof(PartitionKeyData)); + key->strategy = PARTITION_STRATEGY_RANGE; + key->partnatts = list_length(partparams_list); + key->partattrs = (AttrNumber *)palloc(key->partnatts * sizeof(AttrNumber)); + key->partopfamily = (Oid *)palloc(key->partnatts * sizeof(Oid)); + key->partopcintype = (Oid *)palloc(key->partnatts * sizeof(Oid)); + key->partsupfunc = (FmgrInfo *)palloc0(key->partnatts * sizeof(FmgrInfo)); + + key->partcollation = (Oid *)palloc(key->partnatts * sizeof(Oid)); + key->parttypid = (Oid *)palloc(key->partnatts * sizeof(Oid)); + key->parttypmod = (int32 *)palloc(key->partnatts * sizeof(int32)); + key->parttyplen = (int16 *)palloc(key->partnatts * sizeof(int16)); + key->parttypbyval = (bool *)palloc(key->partnatts * sizeof(bool)); + key->parttypalign = (char *)palloc(key->partnatts * sizeof(char)); + 
key->parttypcoll = (Oid *)palloc(key->partnatts * sizeof(Oid)); + + partopclass = (Oid *)palloc(key->partnatts * sizeof(Oid)); + ComputePartitionAttrs(NULL, relation, partparams_list, key->partattrs, NULL, + partopclass, key->partcollation, key->strategy); + + /* determine support function number to search for */ + procnum = (key->strategy == PARTITION_STRATEGY_HASH) ? HASHEXTENDED_PROC + : BTORDER_PROC; + + // We don't have expressions as our partition keys, but keep + // the code the same as the kernel. + partexprs_item = list_head(key->partexprs); + for (i = 0; i < key->partnatts; i++) { + AttrNumber attno = key->partattrs[i]; + HeapTuple opclasstup; + Form_pg_opclass opclassform; + Oid funcid; + + /* Collect opfamily information */ + opclasstup = SearchSysCache1(CLAOID, ObjectIdGetDatum(partopclass[i])); + if (!HeapTupleIsValid(opclasstup)) + elog(ERROR, "cache lookup failed for opclass %u", partopclass[i]); + + opclassform = (Form_pg_opclass)GETSTRUCT(opclasstup); + key->partopfamily[i] = opclassform->opcfamily; + key->partopcintype[i] = opclassform->opcintype; + + /* Get a support function for the specified opfamily and datatypes */ + funcid = get_opfamily_proc(opclassform->opcfamily, opclassform->opcintype, + opclassform->opcintype, procnum); + if (!OidIsValid(funcid)) + ereport( + ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing " + "support function %d for type %s", + NameStr(opclassform->opcname), + (key->strategy == PARTITION_STRATEGY_HASH) ? 
"hash" : "btree", + procnum, format_type_be(opclassform->opcintype)))); + + fmgr_info_cxt(funcid, &key->partsupfunc[i], CurrentMemoryContext); + + /* Collect type information */ + if (attno != 0) { + Form_pg_attribute att = TupleDescAttr(relation->rd_att, attno - 1); + + key->parttypid[i] = att->atttypid; + key->parttypmod[i] = att->atttypmod; + key->parttypcoll[i] = att->attcollation; + } else { + if (partexprs_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + + key->parttypid[i] = exprType(static_cast(lfirst(partexprs_item))); + key->parttypmod[i] = + exprTypmod(static_cast(lfirst(partexprs_item))); + key->parttypcoll[i] = + exprCollation(static_cast(lfirst(partexprs_item))); + + partexprs_item = lnext(key->partexprs, partexprs_item); + } + get_typlenbyvalalign(key->parttypid[i], &key->parttyplen[i], + &key->parttypbyval[i], &key->parttypalign[i]); + + ReleaseSysCache(opclasstup); + } + pfree(partopclass); + return key; +} + +static PartitionDesc PaxRelationBuildPartitionDesc(PartitionKey key, + List *partboundspec_list, + MemoryContext tmp_ctx, + MemoryContext target_ctx) { + PartitionDesc partdesc; + PartitionBoundInfo boundinfo; + PartitionBoundSpec **boundspecs = NULL; + int nparts; + MemoryContext saved_cxt; + int *mapping; + + saved_cxt = MemoryContextSwitchTo(tmp_ctx); + nparts = list_length(partboundspec_list); + boundspecs = + (PartitionBoundSpec **)palloc(nparts * sizeof(PartitionBoundSpec *)); + for (int i = 0; i < nparts; i++) + boundspecs[i] = + static_cast(list_nth(partboundspec_list, i)); + + /* + * Create PartitionBoundInfo and mapping, working in the caller's context. + * This could fail, but we haven't done any damage if so. 
+ */ + boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping); + pfree(boundspecs); + + MemoryContextSwitchTo(target_ctx); + partdesc = (PartitionDescData *)palloc0(sizeof(PartitionDescData)); + partdesc->nparts = nparts; + partdesc->detached_exist = false; + partdesc->boundinfo = partition_bounds_copy(boundinfo, key); + pfree(boundinfo); + + // PAX doesn't have child partition tables + partdesc->oids = NULL; + partdesc->is_leaf = NULL; + /* Return to caller's context, and blow away the temporary context. */ + MemoryContextSwitchTo(saved_cxt); + return partdesc; +} + +static void PaxFormPartitionKeyDatum(PartitionKey key, TupleTableSlot *slot, + Datum *values, bool *isnull) { + for (int i = 0; i < key->partnatts; i++) { + AttrNumber keycol = key->partattrs[i]; + + Assert(keycol > 0); + values[i] = slot_getattr(slot, keycol, &isnull[i]); + } +} + +bool PartitionObjectInternal::Initialize(Relation pax_rel) { + MemoryContext tmp_ctx; + MemoryContext saved_ctx; + List *partparams_list; + List *partboundspec_list; + PartitionKey key = NULL; + PartitionDesc desc = NULL; + bool ok; + + Assert(pax_rel); + pax_rel_ = pax_rel; + + tmp_ctx = AllocSetContextCreate(CurrentMemoryContext, "tmp pax partition ctx", + ALLOCSET_DEFAULT_SIZES); + mctx_ = AllocSetContextCreate(CurrentMemoryContext, "pax partition ctx", + ALLOCSET_DEFAULT_SIZES); + MemoryContextCopyAndSetIdentifier(mctx_, RelationGetRelationName(pax_rel)); + + saved_ctx = MemoryContextSwitchTo(tmp_ctx); + ok = PaxLoadPartitionSpec(RelationGetRelid(pax_rel), &partparams_list, + &partboundspec_list); + if (!ok) goto out; + + MemoryContextSwitchTo(mctx_); + + // The partition keys have no strict constraint for DDLs. + // The column names/types may be changed later by the user, but the PAX code + // is not aware of it. So, we ignore these inconsistent changes for partition + // writer. 
+ PG_TRY(); + { + key = PaxRelationBuildPartitionKey(pax_rel, partparams_list); + InitializeMergeInfo(key, partboundspec_list, tmp_ctx, mctx_); + + desc = + PaxRelationBuildPartitionDesc(key, partboundspec_list, tmp_ctx, mctx_); + partition_bound_spec_ = static_cast(copyObject(partboundspec_list)); + } + PG_CATCH(); + { + // fall back to not use the partition writer + ok = false; + FlushErrorState(); + } + PG_END_TRY(); + partition_key_ = key; + partition_desc_ = desc; +out: + MemoryContextSwitchTo(saved_ctx); + MemoryContextDelete(tmp_ctx); + return ok; +} + +void PartitionObjectInternal::InitializeMergeInfo(PartitionKey key, + List *partboundspec_list, + MemoryContext tmp_ctx, + MemoryContext target_ctx) { + // gather whether the adjacent bounds are continuous + // NOTE: the bounds are already sorted. + MemoryContext saved_ctx; + int *merge_index; + int nparts; + int merge_len; + + saved_ctx = MemoryContextSwitchTo(tmp_ctx); + nparts = list_length(partboundspec_list); + merge_index = (int *)palloc(2 * nparts * sizeof(int)); + merge_index[0] = 0; + merge_len = 1; + for (int i = 1; i < nparts; i++) { + PartitionBoundSpec *spec1 = + castNode(PartitionBoundSpec, list_nth(partboundspec_list, i - 1)); + PartitionBoundSpec *spec2 = + castNode(PartitionBoundSpec, list_nth(partboundspec_list, i)); + + auto cmpval = PartitionComparePartitionKeys(key, spec1->upperdatums, + spec2->lowerdatums); + Assert(cmpval <= 0); + if (cmpval != 0) { + merge_index[merge_len++] = i - 1; + merge_index[merge_len++] = i; + } + } + merge_index[merge_len++] = nparts - 1; + + Assert(merge_len % 2 == 0); + MemoryContextSwitchTo(target_ctx); + merge_len_ = merge_len; + merge_index_ = (int *)palloc(merge_len * sizeof(int)); + memcpy(merge_index_, merge_index, merge_len * sizeof(int)); + pfree(merge_index); + + MemoryContextSwitchTo(saved_ctx); +} + +void PartitionObjectInternal::Release() { + pax_rel_ = nullptr; + partition_key_ = nullptr; + partition_desc_ = nullptr; + partition_bound_spec_ = 
nullptr; + if (mctx_) { + MemoryContextDelete(mctx_); + mctx_ = nullptr; + } +} + +int PartitionObjectInternal::NumPartitions() const { + Assert(pax_rel_ && partition_key_ && partition_desc_ && mctx_); + return list_length(partition_bound_spec_); +} + +int PartitionObjectInternal::NumPartitionKeys() const { + Assert(pax_rel_ && partition_key_ && partition_desc_ && mctx_); + return get_partition_natts(partition_key_); +} + +int PartitionObjectInternal::FindPartition(TupleTableSlot *slot) { + Datum values[PARTITION_MAX_KEYS]; + bool isnull[PARTITION_MAX_KEYS]; + + Assert(pax_rel_ && partition_key_ && partition_desc_ && mctx_); + PaxFormPartitionKeyDatum(partition_key_, slot, values, isnull); + return get_partition_for_tuple(partition_key_, partition_desc_, values, + isnull); +} + +} // namespace paxc + +namespace pax { +bool PartitionObject::Initialize(Relation pax_rel) { + // FIXME: We MUST catch some types of exceptions and assumes + // the partition should be ignored. Because the partition constraint + // may be broken by: + // 1. rename column name + // 2. change column type + // 3. 
drop one or more columns in the partition keys + CBDB_WRAP_START; + { return stub_.Initialize(pax_rel); } + CBDB_WRAP_END; +} +void PartitionObject::Release() { + CBDB_WRAP_START; + { stub_.Release(); } + CBDB_WRAP_END; +} + +int PartitionObject::FindPartition(TupleTableSlot *slot) { + CBDB_WRAP_START; + { return stub_.FindPartition(slot); } + CBDB_WRAP_END; +} +std::pair PartitionObject::GetMergeListInfo() { + return {stub_.merge_index_, stub_.merge_len_}; +} + +} // namespace pax + +extern "C" { +// CREATE FUNCTION pax_dump_ranges(relid Oid) RETURNS SETOF TEXT AS +// '$libdir/pax', 'PaxPartitionDumpRanges' +// LANGUAGE C STRICT; +// UDF about partition +PG_FUNCTION_INFO_V1(PaxPartitionDumpRanges); +struct PartitionRangeDumpContext { + List *boundspec_list; + MemoryContext mctx; + int index; +}; + +Datum PaxPartitionDumpRanges(PG_FUNCTION_ARGS) { + PartitionRangeDumpContext *ctx; + FuncCallContext *funcctx; + + if (SRF_IS_FIRSTCALL()) { + Oid relid = PG_GETARG_OID(0); + MemoryContext tmp_ctx; + MemoryContext old_ctx; + List *partparams; + List *partboundspecs; + bool ok; + + funcctx = SRF_FIRSTCALL_INIT(); + tmp_ctx = + AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "tmp pax partition ctx", ALLOCSET_DEFAULT_SIZES); + old_ctx = MemoryContextSwitchTo(tmp_ctx); + + ok = paxc::PaxLoadPartitionSpec(relid, &partparams, &partboundspecs); + if (!ok) partboundspecs = nullptr; + + ctx = + (PartitionRangeDumpContext *)palloc(sizeof(PartitionRangeDumpContext)); + ctx->boundspec_list = partboundspecs; + ctx->mctx = tmp_ctx; + ctx->index = 0; + funcctx->user_fctx = (void *)ctx; + MemoryContextSwitchTo(old_ctx); + } + + funcctx = SRF_PERCALL_SETUP(); + ctx = (PartitionRangeDumpContext *)funcctx->user_fctx; + while (ctx->index < list_length(ctx->boundspec_list)) { + StringInfoData str; + char *value_list; + text *range; + PartitionBoundSpec *spec = + castNode(PartitionBoundSpec, list_nth(ctx->boundspec_list, ctx->index)); + ++ctx->index; + + initStringInfo(&str); + 
appendStringInfoString(&str, "from"); + value_list = get_range_partbound_string(spec->lowerdatums); + appendStringInfoString(&str, value_list); + pfree(value_list); + + appendStringInfoString(&str, " to"); + value_list = get_range_partbound_string(spec->upperdatums); + appendStringInfoString(&str, value_list); + pfree(value_list); + + range = cstring_to_text(str.data); + pfree(str.data); + SRF_RETURN_NEXT(funcctx, PointerGetDatum(range)); + } + + MemoryContextDelete(ctx->mctx); + SRF_RETURN_DONE(funcctx); +} +} diff --git a/contrib/pax_storage/src/cpp/access/pax_partition.h b/contrib/pax_storage/src/cpp/access/pax_partition.h new file mode 100644 index 00000000000..2e7ed5ea05e --- /dev/null +++ b/contrib/pax_storage/src/cpp/access/pax_partition.h @@ -0,0 +1,75 @@ +#pragma once + +#include "comm/cbdb_api.h" + +#include + +namespace pax { +class PartitionObject; +} + +struct PartitionRangeExtension { + struct PartitionBoundSpec spec; + List *every; +}; +namespace paxc { + +extern PartitionKey PaxRelationBuildPartitionKey(Relation relation, + List *partparams_list); +extern bool PartitionCheckBounds(PartitionKey key, int nparts, + PartitionBoundSpec **partboundspecs); +extern List *PaxValidatePartitionRanges(Relation relation, PartitionKey key, + List *raw_partbound_list); + +class PartitionObjectInternal { + public: + bool Initialize(Relation pax_rel); + void Release(); + + // Get number of partitions, excluding the default partition + int NumPartitions() const; + int NumPartitionKeys() const; + // -1 if default partition, >=0 leaf partition + int FindPartition(TupleTableSlot *slot); + PartitionKey GetPartitionKey() { return partition_key_; } + PartitionDesc GetPartitionDesc() { return partition_desc_; } + + private: + void InitializeMergeInfo(PartitionKey key, List *partboundspec_list, + MemoryContext tmp_ctx, MemoryContext target_ctx); + + friend class pax::PartitionObject; + Relation pax_rel_ = nullptr; + PartitionKey partition_key_ = nullptr; + PartitionDesc 
partition_desc_ = nullptr; + int *merge_index_ = nullptr; + size_t merge_len_ = 0; + List *partition_bound_spec_ = nullptr; + MemoryContext mctx_ = nullptr; +}; +} // namespace paxc + +namespace pax { +class PartitionObject { + public: + bool Initialize(Relation pax_rel); + void Release(); + + PartitionKey GetPartitionKey() { return stub_.GetPartitionKey(); } + PartitionDesc GetPartitionDesc() { return stub_.GetPartitionDesc(); } + + // Get number of partitions, excluding the default partition + int NumPartitions() const { return stub_.NumPartitions(); } + // Get number of the partition keys + int NumPartitionKeys() const { return stub_.NumPartitionKeys(); } + + // -1 if default partition, >= 0 leaf partition + int FindPartition(TupleTableSlot *slot); + + std::pair GetMergeListInfo(); + + private: + paxc::PartitionObjectInternal stub_; +}; + +} // namespace pax diff --git a/contrib/pax_storage/src/cpp/access/pax_scanner.cc b/contrib/pax_storage/src/cpp/access/pax_scanner.cc index 9abd53ade28..3b4054b4bf2 100644 --- a/contrib/pax_storage/src/cpp/access/pax_scanner.cc +++ b/contrib/pax_storage/src/cpp/access/pax_scanner.cc @@ -1,19 +1,147 @@ #include "access/pax_scanner.h" #include "access/pax_access_handle.h" +#include "catalog/pax_aux_table.h" +#include "catalog/pg_pax_tables.h" +#include "comm/guc.h" +#include "comm/pax_memory.h" #include "storage/local_file_system.h" #include "storage/micro_partition.h" #include "storage/micro_partition_iterator.h" +#include "storage/micro_partition_stats.h" #include "storage/orc/orc.h" #include "storage/pax.h" #include "storage/pax_buffer.h" +#include "storage/pax_defined.h" + +#ifdef ENABLE_PLASMA +#include "storage/cache/pax_plasma_cache.h" +#endif + +#ifdef VEC_BUILD +#include "utils/am_vec.h" +#endif + +namespace paxc { +bool IndexUniqueCheck(Relation rel, ItemPointer tid, Snapshot snapshot, + bool * /*all_dead*/) { + paxc::ScanAuxContext context; + HeapTuple tuple; + char block_name[NAMEDATALEN]; + Oid aux_relid; + bool 
exists; + + aux_relid = ::paxc::GetPaxAuxRelid(RelationGetRelid(rel)); + snprintf(block_name, sizeof(block_name), "%u", pax::GetBlockNumber(*tid)); + context.BeginSearchMicroPartition(aux_relid, InvalidOid, snapshot, + AccessShareLock, block_name); + tuple = context.SearchMicroPartitionEntry(); + exists = HeapTupleIsValid(tuple); + context.EndSearchMicroPartition(AccessShareLock); + return exists; +} +} // namespace paxc namespace pax { +PaxIndexScanDesc::PaxIndexScanDesc(Relation rel) : base_{.rel = rel} { + Assert(rel); + Assert(&base_ == reinterpret_cast(this)); + rel_path_ = cbdb::BuildPaxDirectoryPath(rel->rd_node, rel->rd_backend); +} + +PaxIndexScanDesc::~PaxIndexScanDesc() { + if (reader_) { + reader_->Close(); + PAX_DELETE(reader_); + } +} + +bool PaxIndexScanDesc::FetchTuple(ItemPointer tid, Snapshot snapshot, + TupleTableSlot *slot, bool *call_again, + bool *all_dead) { + BlockNumber block = pax::GetBlockNumber(*tid); + if (block != current_block_ || !reader_) { + if (!OpenMicroPartition(block, snapshot)) return false; + } + + Assert(current_block_ == block && reader_); + if (call_again) *call_again = false; + if (all_dead) *all_dead = false; + + auto ok = reader_->GetTuple(slot, pax::GetTupleOffset(*tid)); + if (ok) { + SetBlockNumber(&slot->tts_tid, block); + ExecStoreVirtualTuple(slot); + } + + return ok; +} + +bool PaxIndexScanDesc::OpenMicroPartition(BlockNumber block, + Snapshot snapshot) { + bool ok; + + Assert(block != current_block_); + + ok = cbdb::IsMicroPartitionVisible(base_.rel, block, snapshot); + if (ok) { + MicroPartitionReader::ReaderOptions options; + + auto block_name = std::to_string(block); + auto file_name = cbdb::BuildPaxFilePath(rel_path_, block_name); + options.block_id = block_name; + auto file = Singleton::GetInstance()->Open( + file_name, fs::kReadMode); + auto reader = PAX_NEW(file); + reader->Open(options); + if (reader_) { + reader_->Close(); + PAX_DELETE(reader_); + } + reader_ = reader; + current_block_ = block; + } + + 
return ok; +} + +bool PaxScanDesc::BitmapNextBlock(struct TBMIterateResult *tbmres) { + cindex_ = 0; + if (!index_desc_) { + index_desc_ = PAX_NEW(rs_base_.rs_rd); + } + return true; +} + +bool PaxScanDesc::BitmapNextTuple(struct TBMIterateResult *tbmres, + TupleTableSlot *slot) { + ItemPointerData tid; + if (tbmres->ntuples < 0) { + // lossy bitmap. The maximum value of the last 16 bits in CTID is + // 0x7FFF + 1, i.e. 0x8000. See layout of ItemPointerData in PAX + if (cindex_ > 0X8000) elog(ERROR, "unexpected offset in pax"); + + ItemPointerSet(&tid, tbmres->blockno, cindex_); + } else if (cindex_ < tbmres->ntuples) { + // The maximum value of the last 16 bits in CTID is 0x7FFF + 1, + // i.e. 0x8000. See layout of ItemPointerData in PAX + if (tbmres->offsets[cindex_] > 0X8000) + elog(ERROR, "unexpected offset in pax"); + + ItemPointerSet(&tid, tbmres->blockno, tbmres->offsets[cindex_]); + } else { + return false; + } + ++cindex_; + return index_desc_->FetchTuple(&tid, rs_base_.rs_snapshot, slot, nullptr, + nullptr); +} + TableScanDesc PaxScanDesc::BeginScan(Relation relation, Snapshot snapshot, - int nkeys, struct ScanKeyData *key, + int nkeys, struct ScanKeyData * /*key*/, ParallelTableScanDesc pscan, uint32 flags, - PaxFilter *filter) { + PaxFilter *filter, bool build_bitmap) { PaxScanDesc *desc; MemoryContext old_ctx; TableReader::ReaderOptions reader_options{}; @@ -22,7 +150,7 @@ TableScanDesc PaxScanDesc::BeginScan(Relation relation, Snapshot snapshot, offsetof(PaxScanDesc, rs_base_) == 0, "rs_base should be the first field and aligned to the object address"); - desc = new PaxScanDesc(); + desc = PAX_NEW(); desc->memory_context_ = cbdb::AllocSetCtxCreate( CurrentMemoryContext, "Pax Storage", PAX_ALLOCSET_DEFAULT_SIZES); @@ -33,157 +161,224 @@ TableScanDesc PaxScanDesc::BeginScan(Relation relation, Snapshot snapshot, desc->rs_base_.rs_nkeys = nkeys; desc->rs_base_.rs_flags = flags; desc->rs_base_.rs_parallel = pscan; - desc->key_ = key; - desc->reused_buffer_ 
= new DataBuffer(32 * 1024 * 1024); // 32mb + desc->reused_buffer_ = PAX_NEW>(pax_scan_reuse_buffer_size); desc->filter_ = filter; + if (!desc->filter_) { + desc->filter_ = PAX_NEW(); + } + + if (!desc->filter_->GetColumnProjection().first) { + auto natts = cbdb::RelationGetAttributesNumber(relation); + auto cols = PAX_NEW_ARRAY(natts); + memset(cols, true, natts); + desc->filter_->SetColumnProjection(cols, natts); + } + #ifdef VEC_BUILD - if (flags & (1 << 12)) { - desc->vec_adapter_ = new VecAdapter(cbdb::RelationGetTupleDesc(relation)); + if (flags & SO_TYPE_VECTOR) { + desc->vec_adapter_ = + PAX_NEW(cbdb::RelationGetTupleDesc(relation), build_bitmap); reader_options.is_vec = true; reader_options.adapter = desc->vec_adapter_; } -#endif +#endif // VEC_BUILD + +#ifdef ENABLE_PLASMA + if (pax_enable_plasma_in_mem) { + std::string plasma_socket_path = + std::string(desc->plasma_socket_path_prefix_); + plasma_socket_path.append(std::to_string(PostPortNumber)); + plasma_socket_path.append("\0"); + PaxPlasmaCache::CacheOptions cache_options; + cache_options.domain_socket = plasma_socket_path; + cache_options.memory_quota = 0; + cache_options.waitting_ms = 0; + + desc->pax_cache_ = PAX_NEW(std::move(cache_options)); + auto status = desc->pax_cache_->Initialize(); + if (!status.Ok()) { + elog(WARNING, "Plasma cache client init failed, message: %s", + status.Error().c_str()); + PAX_DELETE(desc->pax_cache_); + desc->pax_cache_ = nullptr; + } + + reader_options.pax_cache = desc->pax_cache_; + } - // init shared memory - cbdb::InitCommandResource(); +#endif // ENABLE_PLASMA old_ctx = MemoryContextSwitchTo(desc->memory_context_); // build reader - reader_options.build_bitmap = true; + reader_options.build_bitmap = build_bitmap; reader_options.reused_buffer = desc->reused_buffer_; reader_options.rel_oid = desc->rs_base_.rs_rd->rd_id; reader_options.filter = filter; auto iter = MicroPartitionInfoIterator::New(relation, snapshot); if (filter && filter->HasMicroPartitionFilter()) 
{ - auto wrap = new FilterIterator( + auto wrap = PAX_NEW>( std::move(iter), [filter, relation](const auto &x) { - return filter->TestMicroPartitionScan(x.GetStats(), - RelationGetDescr(relation)); + MicroPartitionStatsProvider provider(x.GetStats()); + auto ok = filter->TestScan(provider, RelationGetDescr(relation), + PaxFilterStatisticsKind::kFile); + return ok; }); iter = std::unique_ptr>(wrap); } - desc->reader_ = new TableReader(std::move(iter), reader_options); + desc->reader_ = PAX_NEW(std::move(iter), reader_options); desc->reader_->Open(); MemoryContextSwitchTo(old_ctx); return &desc->rs_base_; } -void PaxScanDesc::EndScan(TableScanDesc scan) { - PaxScanDesc *desc = ScanToDesc(scan); +void PaxScanDesc::EndScan() { + if (pax_enable_debug && filter_) { + filter_->LogStatistics(); + } - Assert(desc->reader_); - desc->reader_->Close(); + Assert(reader_); + reader_->Close(); - delete desc->reused_buffer_; - delete desc->reader_; - delete desc->filter_; + PAX_DELETE(reused_buffer_); + PAX_DELETE(reader_); + PAX_DELETE(filter_); #ifdef VEC_BUILD - delete desc->vec_adapter_; + PAX_DELETE(vec_adapter_); #endif + +#ifdef ENABLE_PLASMA + if (pax_cache_) { + pax_cache_->Destroy(); + PAX_DELETE(pax_cache_); + } +#endif + + PAX_DELETE(index_desc_); + // TODO(jiaqizho): please double check with abort transaction @gongxun - Assert(desc->memory_context_); - cbdb::MemoryCtxDelete(desc->memory_context_); - delete desc; + Assert(memory_context_); + cbdb::MemoryCtxDelete(memory_context_); + auto self = this; + PAX_DELETE(self); } TableScanDesc PaxScanDesc::BeginScanExtractColumns( - Relation rel, Snapshot snapshot, ParallelTableScanDesc parallel_scan, - List *targetlist, List *qual, uint32 flags) { + Relation rel, Snapshot snapshot, int /*nkeys*/, + struct ScanKeyData * /*key*/, ParallelTableScanDesc parallel_scan, + struct PlanState *ps, uint32 flags) { TableScanDesc paxscan; PaxFilter *filter; + List *targetlist = ps->plan->targetlist; + List *qual = ps->plan->qual; auto 
natts = cbdb::RelationGetAttributesNumber(rel); bool *cols; bool found = false; + bool build_bitmap = true; + PaxcExtractcolumnContext extract_column; + + filter = PAX_NEW(); - filter = new PaxFilter(); + Assert(natts >= 0); - cols = new bool[natts]; + cols = PAX_NEW_ARRAY(natts); memset(cols, false, natts); + extract_column.cols = cols; + extract_column.natts = natts; + found = cbdb::ExtractcolumnsFromNode(reinterpret_cast(targetlist), - cols, natts); + &extract_column); found = cbdb::ExtractcolumnsFromNode(reinterpret_cast(qual), cols, natts) || found; + build_bitmap = cbdb::IsSystemAttrNumExist(&extract_column, + SelfItemPointerAttributeNumber); // In some cases (for example, count(*)), targetlist and qual may be null, // extractcolumns_walker will return immediately, so no columns are specified. // We always scan the first column. - if (!found) cols[0] = true; + if (!found && !build_bitmap && natts > 0) cols[0] = true; // The `cols` life cycle will be bound to `PaxFilter` filter->SetColumnProjection(cols, natts); - { + if (pax_enable_filter) { ScanKey scan_keys = nullptr; int n_scan_keys = 0; auto ok = pax::BuildScanKeys(rel, qual, false, &scan_keys, &n_scan_keys); if (ok) filter->SetScanKeys(scan_keys, n_scan_keys); + + if (gp_enable_predicate_pushdown +#ifdef VEC_BUILD + && !(flags & SO_TYPE_VECTOR) +#endif + ) + filter->BuildExecutionFilterForColumns(rel, ps); } - paxscan = BeginScan(rel, snapshot, 0, nullptr, parallel_scan, flags, filter); + paxscan = BeginScan(rel, snapshot, 0, nullptr, parallel_scan, flags, filter, + build_bitmap); return paxscan; } // FIXME: shall we take these parameters into account? 
-void PaxScanDesc::ReScan(TableScanDesc scan) { - PaxScanDesc *desc = ScanToDesc(scan); +void PaxScanDesc::ReScan(ScanKey /*key*/, bool /*set_params*/, + bool /*allow_strat*/, bool /*allow_sync*/, + bool /*allow_pagemode*/) { MemoryContext old_ctx; - Assert(desc && desc->reader_); + Assert(reader_); - old_ctx = MemoryContextSwitchTo(desc->memory_context_); - desc->reader_->ReOpen(); + old_ctx = MemoryContextSwitchTo(memory_context_); + reader_->ReOpen(); MemoryContextSwitchTo(old_ctx); } -bool PaxScanDesc::ScanGetNextSlot(TableScanDesc scan, TupleTableSlot *slot) { - PaxScanDesc *desc = ScanToDesc(scan); +bool PaxScanDesc::GetNextSlot(TupleTableSlot *slot) { MemoryContext old_ctx; bool ok = false; - CTupleSlot cslot(slot); - old_ctx = MemoryContextSwitchTo(desc->memory_context_); + old_ctx = MemoryContextSwitchTo(memory_context_); - ok = desc->reader_->ReadTuple(&cslot); + Assert(reader_); + ok = reader_->ReadTuple(slot); MemoryContextSwitchTo(old_ctx); return ok; } -bool PaxScanDesc::ScanAnalyzeNextBlock(TableScanDesc scan, - BlockNumber blockno) { - PaxScanDesc *desc = ScanToDesc(scan); - desc->target_tuple_id_ = blockno; - +bool PaxScanDesc::ScanAnalyzeNextBlock(BlockNumber blockno, + BufferAccessStrategy /*bstrategy*/) { + target_tuple_id_ = blockno; return true; } -bool PaxScanDesc::ScanAnalyzeNextTuple(TableScanDesc scan, double *liverows, - const double *deadrows, +bool PaxScanDesc::ScanAnalyzeNextTuple(TransactionId /*oldest_xmin*/, + double *liverows, + const double * /* deadrows */, TupleTableSlot *slot) { - PaxScanDesc *desc = ScanToDesc(scan); MemoryContext old_ctx; bool ok = false; - old_ctx = MemoryContextSwitchTo(desc->memory_context_); - Assert(*deadrows == 0); // not dead rows in pax latest snapshot - while (desc->next_tuple_id_ < desc->target_tuple_id_) { - ok = PaxScanDesc::ScanGetNextSlot(scan, slot); + old_ctx = MemoryContextSwitchTo(memory_context_); + while (next_tuple_id_ < target_tuple_id_) { + ok = GetNextSlot(slot); if (!ok) break; - 
desc->next_tuple_id_++; + next_tuple_id_++; + } + if (next_tuple_id_ == target_tuple_id_) { + ok = GetNextSlot(slot); + next_tuple_id_++; + if (ok) *liverows += 1; } MemoryContextSwitchTo(old_ctx); - if (ok) *liverows += 1; return ok; } -bool PaxScanDesc::ScanSampleNextBlock(TableScanDesc scan, - SampleScanState *scanstate) { - PaxScanDesc *desc = ScanToDesc(scan); +bool PaxScanDesc::ScanSampleNextBlock(SampleScanState *scanstate) { MemoryContext old_ctx; TsmRoutine *tsm = scanstate->tsmroutine; BlockNumber blockno = 0; @@ -193,39 +388,36 @@ bool PaxScanDesc::ScanSampleNextBlock(TableScanDesc scan, double allvisfrac = 0; bool ok = false; - old_ctx = MemoryContextSwitchTo(desc->memory_context_); + old_ctx = MemoryContextSwitchTo(memory_context_); - if (desc->total_tuples_ == 0) { - paxc::PaxAccessMethod::EstimateRelSize(scan->rs_rd, &attrwidths, &pages, + if (total_tuples_ == 0) { + paxc::PaxAccessMethod::EstimateRelSize(rs_base_.rs_rd, &attrwidths, &pages, &total_tuples, &allvisfrac); - desc->total_tuples_ = total_tuples; + total_tuples_ = total_tuples; } if (tsm->NextSampleBlock) - blockno = tsm->NextSampleBlock(scanstate, desc->total_tuples_); + blockno = tsm->NextSampleBlock(scanstate, total_tuples_); else - blockno = system_nextsampleblock(scanstate, desc->total_tuples_); + blockno = system_nextsampleblock(scanstate, total_tuples_); ok = BlockNumberIsValid(blockno); - if (ok) { - desc->fetch_tuple_id_ = blockno; - } + if (ok) fetch_tuple_id_ = blockno; MemoryContextSwitchTo(old_ctx); return ok; } -bool PaxScanDesc::ScanSampleNextTuple(TableScanDesc scan, +bool PaxScanDesc::ScanSampleNextTuple(SampleScanState * /*scanstate*/, TupleTableSlot *slot) { - PaxScanDesc *desc = ScanToDesc(scan); MemoryContext old_ctx; bool ok = false; - old_ctx = MemoryContextSwitchTo(desc->memory_context_); - while (desc->next_tuple_id_ < desc->fetch_tuple_id_) { - ok = PaxScanDesc::ScanGetNextSlot(scan, slot); + old_ctx = MemoryContextSwitchTo(memory_context_); + while 
(next_tuple_id_ < fetch_tuple_id_) { + ok = GetNextSlot(slot); if (!ok) break; - desc->next_tuple_id_++; + next_tuple_id_++; } MemoryContextSwitchTo(old_ctx); return ok; diff --git a/contrib/pax_storage/src/cpp/access/pax_scanner.h b/contrib/pax_storage/src/cpp/access/pax_scanner.h index f06ab6c9fb8..33f6754855c 100644 --- a/contrib/pax_storage/src/cpp/access/pax_scanner.h +++ b/contrib/pax_storage/src/cpp/access/pax_scanner.h @@ -2,52 +2,84 @@ #include "comm/cbdb_api.h" +#include + #include "storage/pax.h" #include "storage/pax_filter.h" #ifdef VEC_BUILD #include "storage/vec/pax_vec_adapter.h" #endif + +namespace paxc { +bool IndexUniqueCheck(Relation rel, ItemPointer tid, Snapshot snapshot, + bool *all_dead); +} + namespace pax { +class PaxIndexScanDesc final { + public: + explicit PaxIndexScanDesc(Relation rel); + ~PaxIndexScanDesc(); + bool FetchTuple(ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot, + bool *call_again, bool *all_dead); + inline IndexFetchTableData *ToBase() { return &base_; } + static inline PaxIndexScanDesc *FromBase(IndexFetchTableData *base) { + return reinterpret_cast(base); + } + + private: + bool OpenMicroPartition(BlockNumber block, Snapshot snapshot); + + IndexFetchTableData base_; + BlockNumber current_block_ = InvalidBlockNumber; + MicroPartitionReader *reader_ = nullptr; + std::string rel_path_; +}; class PaxScanDesc { public: static TableScanDesc BeginScan(Relation relation, Snapshot snapshot, int nkeys, struct ScanKeyData *key, ParallelTableScanDesc pscan, uint32 flags, - PaxFilter *filter); - - static void ReScan(TableScanDesc scan); - static void EndScan(TableScanDesc scan); + PaxFilter *filter, bool build_bitmap); static TableScanDesc BeginScanExtractColumns( - Relation rel, Snapshot snapshot, ParallelTableScanDesc parallel_scan, - List *targetlist, List *qual, uint32 flags); + Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key, + ParallelTableScanDesc parallel_scan, struct PlanState *ps, uint32 
flags); - static bool ScanGetNextSlot(TableScanDesc scan, TupleTableSlot *slot); + void EndScan(); + void ReScan(ScanKey key, bool set_params, bool allow_strat, bool allow_sync, + bool allow_pagemode); - static bool ScanAnalyzeNextBlock(TableScanDesc scan, BlockNumber blockno); - static bool ScanAnalyzeNextTuple(TableScanDesc scan, double *liverows, - const double *deadrows, - TupleTableSlot *slot); + bool GetNextSlot(TupleTableSlot *slot); - static bool ScanSampleNextBlock(TableScanDesc scan, - SampleScanState *scanstate); + bool ScanAnalyzeNextBlock(BlockNumber blockno, + BufferAccessStrategy bstrategy); + bool ScanAnalyzeNextTuple(TransactionId oldest_xmin, double *liverows, + const double *deadrows, TupleTableSlot *slot); - static bool ScanSampleNextTuple(TableScanDesc scan, TupleTableSlot *slot); + bool ScanSampleNextBlock(SampleScanState *scanstate); - ~PaxScanDesc() = default; + bool ScanSampleNextTuple(SampleScanState *scanstate, TupleTableSlot *slot); - private: - PaxScanDesc() = default; + bool BitmapNextBlock(struct TBMIterateResult *tbmres); + bool BitmapNextTuple(struct TBMIterateResult *tbmres, TupleTableSlot *slot); + + ~PaxScanDesc() = default; - static inline PaxScanDesc *ScanToDesc(TableScanDesc scan) { + static inline PaxScanDesc *ToDesc(TableScanDesc scan) { auto desc = reinterpret_cast(scan); return desc; } + private: + template + friend T *PAX_NEW(Args &&...args); + PaxScanDesc() = default; + private: TableScanDescData rs_base_{}; - const ScanKeyData *key_ = nullptr; + TableReader *reader_ = nullptr; DataBuffer *reused_buffer_ = nullptr; @@ -67,6 +99,15 @@ class PaxScanDesc { #ifdef VEC_BUILD VecAdapter *vec_adapter_ = nullptr; #endif + +#ifdef ENABLE_PLASMA + const std::string plasma_socket_path_prefix_ = "/tmp/.s.plasma."; + PaxCache *pax_cache_ = nullptr; +#endif + + // used only by bitmap index scan + PaxIndexScanDesc *index_desc_ = nullptr; + int cindex_ = 0; }; // class PaxScanDesc } // namespace pax diff --git 
a/contrib/pax_storage/src/cpp/access/pax_updater.cc b/contrib/pax_storage/src/cpp/access/pax_updater.cc index e5f79c23ee2..3fbb8787c54 100644 --- a/contrib/pax_storage/src/cpp/access/pax_updater.cc +++ b/contrib/pax_storage/src/cpp/access/pax_updater.cc @@ -8,19 +8,29 @@ namespace pax { TM_Result CPaxUpdater::UpdateTuple( const Relation relation, const ItemPointer otid, TupleTableSlot *slot, const CommandId cid, const Snapshot snapshot, const Snapshot /*crosscheck*/, - const bool /*wait*/, TM_FailureData * /*tmfd*/, - LockTupleMode * /*lockmode*/, bool * /*update_indexes*/) { + const bool /*wait*/, TM_FailureData * tmfd, + LockTupleMode * lockmode, bool *update_indexes) { TM_Result result; - CPaxDeleter *deleter = - CPaxDmlStateLocal::Instance()->GetDeleter(relation, snapshot); + + auto dml_state = CPaxDmlStateLocal::Instance(); + auto deleter = dml_state->GetDeleter(relation, snapshot); + auto inserter = dml_state->GetInserter(relation); + Assert(deleter != nullptr); - CPaxInserter *inserter = CPaxDmlStateLocal::Instance()->GetInserter(relation); Assert(inserter != nullptr); + *lockmode = LockTupleExclusive; result = deleter->MarkDelete(otid); - // FIXME(gongxun): check result and return TM_SelfModified if needed - inserter->InsertTuple(relation, slot, cid, 0, nullptr); + if (result == TM_Ok) { + inserter->InsertTuple(relation, slot, cid, 0, nullptr); + *update_indexes = true; + } else { + // FIXME: set tmfd correctly. 
+ // FYI, ao ignores both tmfd and lockmode + tmfd->ctid = *otid; + *update_indexes = false; + } // TODO(gongxun): update pgstat info return result; } diff --git a/contrib/pax_storage/src/cpp/access/paxc_gram.y b/contrib/pax_storage/src/cpp/access/paxc_gram.y new file mode 100644 index 00000000000..84ca498fd99 --- /dev/null +++ b/contrib/pax_storage/src/cpp/access/paxc_gram.y @@ -0,0 +1,575 @@ +%{ +#include "postgres.h" + + +#include "nodes/pg_list.h" +#include "parser/parser.h" +#include "parser/parse_type.h" +#include "parser/scanner.h" +#include "parser/scansup.h" +#include "utils/builtins.h" +#include "utils/datetime.h" + +#include "access/paxc_scanner.h" + +/* Location tracking support --- simpler than bison's default */ +#define YYLLOC_DEFAULT(Current, Rhs, N) \ + do { \ + if (N) \ + (Current) = (Rhs)[1]; \ + else \ + (Current) = (Rhs)[0]; \ + } while (0) + +#define parser_errposition(pos) scanner_errposition(pos, yyscanner) +#define parser_yyerror(msg) scanner_yyerror(yyscanner, msg) + +/* + * Bison doesn't allocate anything that needs to live across parser calls, + * so we can easily have it use palloc instead of malloc. This prevents + * memory leaks if we error out during parsing. Note this only works with + * bison >= 2.0. However, in bison 1.875 the default is to use alloca() + * if possible, so there's not really much problem anyhow, at least if + * you're building with gcc. 
+ */ +#define YYMALLOC palloc +#define YYFREE pfree + +static void paxc_yyerror(core_yyscan_t yyscanner, const char *message); +static int paxc_yylex(core_yyscan_t yyscanner); +static int paxc_scanner_errposition(int location); +static List *paxc_result; + +%} + +/* %pure-parser */ +%expect 0 +%name-prefix="paxc_yy" +%locations +%parse-param {core_yyscan_t yyscanner} +%lex-param {core_yyscan_t yyscanner} + +%union +{ + core_YYSTYPE core_yystype; + /* these fields must match core_YYSTYPE: */ + int ival; + char *str; + const char *keyword; + + bool boolean; + List *list; + Node *node; + TypeName *typnam; + PartitionElem *partelem; + PartitionSpec *partspec; + PartitionBoundSpec *partboundspec; +} + +/* %type top_level_stmt */ +%type partition_by part_params any_name opt_collate attrs opt_qualified_name +%type part_elem +%type ColId attr_name + +// FIXME: types for partition ranges +//%type partition_ranges expr_list opt_type_modifiers +//%type partition_range +//%type AexprConst a_expr c_expr +//%type Numeric opt_float ConstTypename ConstDatetime ConstInterval ConstCharacter CharacterWithLength CharacterWithoutLength ConstBit BitWithLength BitWithoutLength +//%type opt_varying opt_timezone +//%type opt_interval interval_second +//%type Sconst character +//%type Iconst + +%token IDENT +//%token FCONST SCONST BCONST XCONST +//%token ICONST + +%token COLLATE +//%token TRUE_P FALSE_P HOUR_P YEAR_P NULL_P MONTH_P TO VARYING VARCHAR TIMESTAMP BIT TIME INTERVAL DAY_P MINUTE_P SECOND_P CHARACTER NATIONAL NCHAR CHAR_P ZONE INT_P INTEGER SMALLINT BIGINT REAL FLOAT_P DOUBLE_P PRECISION DECIMAL_P DEC NUMERIC BOOLEAN_P FROM + +%token WITH_LA WITHOUT_LA + + +%% + +top_level_stmt: + partition_by { paxc_result = $1; } +// | partition_ranges { paxc_result = $1; } + ; + +partition_by: part_params { $$ = $1; } + ; +part_params: + part_elem { $$ = list_make1($1); } + | part_params ',' part_elem { $$ = lappend($1, $3); } + ; + +part_elem: ColId opt_collate opt_qualified_name + { + 
PartitionElem *n = makeNode(PartitionElem); + + n->name = $1; + n->expr = NULL; + n->collation = $2; + n->opclass = $3; + n->location = @1; + $$ = n; + } +/* + | func_expr_windowless opt_collate opt_qualified_name + { + PartitionElem *n = makeNode(PartitionElem); + + n->name = NULL; + n->expr = $1; + n->collation = $2; + n->opclass = $3; + n->location = @1; + $$ = n; + } + | '(' a_expr ')' opt_collate opt_qualified_name + { + PartitionElem *n = makeNode(PartitionElem); + + n->name = NULL; + n->expr = $2; + n->collation = $4; + n->opclass = $5; + n->location = @1; + $$ = n; + } +*/ + ; + +/* Column identifier --- names that can be column, table, etc names. + */ +ColId: IDENT { $$ = $1; } + ; +opt_collate: COLLATE any_name { $$ = $2; } + | /*EMPTY*/ { $$ = NIL; } + ; + +any_name: + ColId { $$ = list_make1(makeString($1)); } + | ColId attrs { $$ = lcons(makeString($1), $2); } + ; + +attrs: '.' attr_name { $$ = list_make1(makeString($2)); } + | attrs '.' attr_name { $$ = lappend($1, makeString($3)); } + ; + +attr_name: IDENT { $$ = $1; } + ; + +/* opclass */ +opt_qualified_name: any_name { $$ = $1; } + | /*EMPTY*/ { $$ = NIL; } + ; + +//partition_ranges: partition_ranges ',' partition_range { $$ = lappend($1, $3); } +// | partition_range { $$ = list_make1($1); } +// ; +// +//partition_range: FROM '(' expr_list ')' TO '(' expr_list ')' +// { +// PartitionBoundSpec *n = makeNode(PartitionBoundSpec); +// +// n->strategy = PARTITION_STRATEGY_RANGE; +// n->is_default = false; +// n->lowerdatums = $3; +// n->upperdatums = $7; +// +// $$ = n; +// } +// ; +// +//expr_list: a_expr { $$ = list_make1($1); } +// | expr_list ',' a_expr { $$ = lappend($1, $3); } +// ; +// +//a_expr: c_expr { $$ = $1; } +// ; +//c_expr: AexprConst { $$ = $1; } +// ; +// +///* +// * Constants +// */ +//AexprConst: Iconst { $$ = makeIntConst($1, @1); } +// | FCONST { $$ = makeFloatConst($1, @1); } +// | Sconst { $$ = makeStringConst($1, @1); } +// | BCONST { $$ = makeBitStringConst($1, @1); } +// | 
XCONST +// { +// /* This is a bit constant per SQL99: +// * Without Feature F511, "BIT data type", +// * a shall not be a +// * or a . +// */ +// $$ = makeBitStringConst($1, @1); +// } +// | ConstTypename Sconst { $$ = makeStringConstCast($2, @2, $1); } +// | ConstInterval Sconst opt_interval +// { +// TypeName *t = $1; +// +// t->typmods = $3; +// $$ = makeStringConstCast($2, @2, t); +// } +// | ConstInterval '(' Iconst ')' Sconst +// { +// TypeName *t = $1; +// +// t->typmods = list_make2(makeIntConst(INTERVAL_FULL_RANGE, -1), +// makeIntConst($3, @3)); +// $$ = makeStringConstCast($5, @5, t); +// } +// | TRUE_P { $$ = makeBoolAConst(true, @1); } +// | FALSE_P { $$ = makeBoolAConst(false, @1); } +// | NULL_P { $$ = makeNullAConst(@1); } +// ; +// +//Iconst: ICONST { $$ = $1; }; +//Sconst: SCONST { $$ = $1; }; +// +//ConstTypename: +// Numeric { $$ = $1; } +// | ConstBit { $$ = $1; } +// | ConstCharacter { $$ = $1; } +// | ConstDatetime { $$ = $1; } +// ; +// +///* ConstBit is like Bit except "BIT" defaults to unspecified length */ +///* See notes for ConstCharacter, which addresses same issue for "CHAR" */ +//ConstBit: BitWithLength { $$ = $1; } +// | BitWithoutLength +// { +// $$ = $1; +// $$->typmods = NIL; +// } +// ; +// +//BitWithLength: BIT opt_varying '(' expr_list ')' +// { +// char *typname; +// +// typname = $2 ? "varbit" : "bit"; +// $$ = SystemTypeName(typname); +// $$->typmods = $4; +// $$->location = @1; +// } +// ; +// +//BitWithoutLength: BIT opt_varying +// { +// /* bit defaults to bit(1), varbit to no limit */ +// if ($2) +// { +// $$ = SystemTypeName("varbit"); +// } +// else +// { +// $$ = SystemTypeName("bit"); +// $$->typmods = list_make1(makeIntConst(1, -1)); +// } +// $$->location = @1; +// } +// ; +// +//ConstCharacter: CharacterWithLength +// { +// $$ = $1; +// } +// | CharacterWithoutLength +// { +// /* Length was not specified so allow to be unrestricted. 
+// * This handles problems with fixed-length (bpchar) strings +// * which in column definitions must default to a length +// * of one, but should not be constrained if the length +// * was not specified. +// */ +// $$ = $1; +// $$->typmods = NIL; +// } +// ; +// +//CharacterWithLength: character '(' Iconst ')' +// { +// $$ = SystemTypeName($1); +// $$->typmods = list_make1(makeIntConst($3, @3)); +// $$->location = @1; +// } +// ; +// +//CharacterWithoutLength: character +// { +// $$ = SystemTypeName($1); +// /* char defaults to char(1), varchar to no limit */ +// if (strcmp($1, "bpchar") == 0) +// $$->typmods = list_make1(makeIntConst(1, -1)); +// $$->location = @1; +// } +// ; +// +//character: CHARACTER opt_varying { $$ = $2 ? "varchar": "bpchar"; } +// | CHAR_P opt_varying { $$ = $2 ? "varchar": "bpchar"; } +// | VARCHAR { $$ = "varchar"; } +// | NATIONAL CHARACTER opt_varying { $$ = $3 ? "varchar": "bpchar"; } +// | NATIONAL CHAR_P opt_varying { $$ = $3 ? "varchar": "bpchar"; } +// | NCHAR opt_varying { $$ = $2 ? 
"varchar": "bpchar"; } +// ; +// +//opt_varying: VARYING { $$ = true; } +// | /*EMPTY*/ { $$ = false; } +// ; +// +///* +// * SQL date/time types +// */ +//ConstDatetime: +// TIMESTAMP '(' Iconst ')' opt_timezone +// { +// if ($5) +// $$ = SystemTypeName("timestamptz"); +// else +// $$ = SystemTypeName("timestamp"); +// $$->typmods = list_make1(makeIntConst($3, @3)); +// $$->location = @1; +// } +// | TIMESTAMP opt_timezone +// { +// if ($2) +// $$ = SystemTypeName("timestamptz"); +// else +// $$ = SystemTypeName("timestamp"); +// $$->location = @1; +// } +// | TIME '(' Iconst ')' opt_timezone +// { +// if ($5) +// $$ = SystemTypeName("timetz"); +// else +// $$ = SystemTypeName("time"); +// $$->typmods = list_make1(makeIntConst($3, @3)); +// $$->location = @1; +// } +// | TIME opt_timezone +// { +// if ($2) +// $$ = SystemTypeName("timetz"); +// else +// $$ = SystemTypeName("time"); +// $$->location = @1; +// } +// ; +// +//ConstInterval: INTERVAL +// { +// $$ = SystemTypeName("interval"); +// $$->location = @1; +// } +// ; +// +//opt_timezone: WITH_LA TIME ZONE { $$ = true; } +// | WITHOUT_LA TIME ZONE { $$ = false; } +// | /*EMPTY*/ { $$ = false; } +// ; +// +//opt_interval: +// YEAR_P { $$ = list_make1(makeIntConst(INTERVAL_MASK(YEAR), @1)); } +// | MONTH_P { $$ = list_make1(makeIntConst(INTERVAL_MASK(MONTH), @1)); } +// | DAY_P { $$ = list_make1(makeIntConst(INTERVAL_MASK(DAY), @1)); } +// | HOUR_P { $$ = list_make1(makeIntConst(INTERVAL_MASK(HOUR), @1)); } +// | MINUTE_P { $$ = list_make1(makeIntConst(INTERVAL_MASK(MINUTE), @1)); } +// | interval_second { $$ = $1; } +// | YEAR_P TO MONTH_P +// { +// $$ = list_make1(makeIntConst(INTERVAL_MASK(YEAR) | +// INTERVAL_MASK(MONTH), @1)); +// } +// | DAY_P TO HOUR_P +// { +// $$ = list_make1(makeIntConst(INTERVAL_MASK(DAY) | +// INTERVAL_MASK(HOUR), @1)); +// } +// | DAY_P TO MINUTE_P +// { +// $$ = list_make1(makeIntConst(INTERVAL_MASK(DAY) | +// INTERVAL_MASK(HOUR) | +// INTERVAL_MASK(MINUTE), @1)); +// } +// | 
DAY_P TO interval_second +// { +// $$ = $3; +// linitial($$) = makeIntConst(INTERVAL_MASK(DAY) | +// INTERVAL_MASK(HOUR) | +// INTERVAL_MASK(MINUTE) | +// INTERVAL_MASK(SECOND), @1); +// } +// | HOUR_P TO MINUTE_P +// { +// $$ = list_make1(makeIntConst(INTERVAL_MASK(HOUR) | +// INTERVAL_MASK(MINUTE), @1)); +// } +// | HOUR_P TO interval_second +// { +// $$ = $3; +// linitial($$) = makeIntConst(INTERVAL_MASK(HOUR) | +// INTERVAL_MASK(MINUTE) | +// INTERVAL_MASK(SECOND), @1); +// } +// | MINUTE_P TO interval_second +// { +// $$ = $3; +// linitial($$) = makeIntConst(INTERVAL_MASK(MINUTE) | +// INTERVAL_MASK(SECOND), @1); +// } +// | /*EMPTY*/ { $$ = NIL; } +// ; +// +//interval_second: +// SECOND_P +// { +// $$ = list_make1(makeIntConst(INTERVAL_MASK(SECOND), @1)); +// } +// | SECOND_P '(' Iconst ')' +// { +// $$ = list_make2(makeIntConst(INTERVAL_MASK(SECOND), @1), +// makeIntConst($3, @3)); +// } +// ; +// +//opt_type_modifiers: '(' expr_list ')' { $$ = $2; } +// | /* EMPTY */ { $$ = NIL; } +// ; +// +///* +// * SQL numeric data types +// */ +//Numeric: +// INT_P +// { +// $$ = SystemTypeName("int4"); +// $$->location = @1; +// } +// | INTEGER +// { +// $$ = SystemTypeName("int4"); +// $$->location = @1; +// } +// | SMALLINT +// { +// $$ = SystemTypeName("int2"); +// $$->location = @1; +// } +// | BIGINT +// { +// $$ = SystemTypeName("int8"); +// $$->location = @1; +// } +// | REAL +// { +// $$ = SystemTypeName("float4"); +// $$->location = @1; +// } +// | FLOAT_P opt_float +// { +// $$ = $2; +// $$->location = @1; +// } +// | DOUBLE_P PRECISION +// { +// $$ = SystemTypeName("float8"); +// $$->location = @1; +// } +// | DECIMAL_P opt_type_modifiers +// { +// $$ = SystemTypeName("numeric"); +// $$->typmods = $2; +// $$->location = @1; +// } +// | DEC opt_type_modifiers +// { +// $$ = SystemTypeName("numeric"); +// $$->typmods = $2; +// $$->location = @1; +// } +// | NUMERIC opt_type_modifiers +// { +// $$ = SystemTypeName("numeric"); +// $$->typmods = $2; +// 
$$->location = @1; +// } +// | BOOLEAN_P +// { +// $$ = SystemTypeName("bool"); +// $$->location = @1; +// } +// ; +// +//opt_float: '(' Iconst ')' +// { +// /* +// * Check FLOAT() precision limits assuming IEEE floating +// * types - thomas 1997-09-18 +// */ +// if ($2 < 1) +// ereport(ERROR, +// (errcode(ERRCODE_INVALID_PARAMETER_VALUE), +// errmsg("precision for type float must be at least 1 bit"), +// parser_errposition(@2))); +// else if ($2 <= 24) +// $$ = SystemTypeName("float4"); +// else if ($2 <= 53) +// $$ = SystemTypeName("float8"); +// else +// ereport(ERROR, +// (errcode(ERRCODE_INVALID_PARAMETER_VALUE), +// errmsg("precision for type float must be less than 54 bits"), +// parser_errposition(@2))); +// } +// | /*EMPTY*/ { $$ = SystemTypeName("float8"); } +// ; +// + +%% + +static int paxc_scanner_errposition(int location) { + return location; +} + +static void paxc_yyerror(core_yyscan_t yyscanner, const char *message) { + ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("%s", _(message)))); +} +static int paxc_yylex(core_yyscan_t yyscanner) { + return core_yylex(&paxc_yylval.core_yystype, &paxc_yylloc, yyscanner); +} + +static core_yyscan_t paxc_scanner_init(const char *str, core_yy_extra_type *extra) { + paxc_result = NIL; + return scanner_init(str, extra, &ScanKeywords, ScanKeywordTokens); +} + +static void paxc_scanner_finish(core_yyscan_t yyscanner) { + scanner_finish(yyscanner); + paxc_result = NIL; +} + +List *paxc_raw_parse(const char *str) { + core_yyscan_t yyscanner; + core_yy_extra_type extra; + List *result; + int yyresult; + + yyscanner = paxc_scanner_init(str, &extra); + yyresult = paxc_yyparse(yyscanner); + if (yyresult != 0) + elog(ERROR, "pacx_yyparse returned %d", yyresult); + + result = paxc_result; + paxc_scanner_finish(yyscanner); + return result; +} + diff --git a/contrib/pax_storage/src/cpp/access/paxc_rel_options.cc b/contrib/pax_storage/src/cpp/access/paxc_rel_options.cc new file mode 100644 index 
00000000000..fc230b874d3 --- /dev/null +++ b/contrib/pax_storage/src/cpp/access/paxc_rel_options.cc @@ -0,0 +1,270 @@ +#include "access/paxc_rel_options.h" + +namespace paxc { + +typedef struct { + const char *optname; /* option's name */ + const pax::ColumnEncoding_Kind kind; +} relopt_compress_type_mapping; + +static const relopt_compress_type_mapping kSelfRelCompressMap[] = { + {ColumnEncoding_Kind_NO_ENCODED_STR, + pax::ColumnEncoding_Kind::ColumnEncoding_Kind_NO_ENCODED}, + {ColumnEncoding_Kind_RLE_V2_STR, + pax::ColumnEncoding_Kind::ColumnEncoding_Kind_RLE_V2}, + {ColumnEncoding_Kind_DIRECT_DELTA_STR, + pax::ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA}, + {ColumnEncoding_Kind_COMPRESS_ZSTD_STR, + pax::ColumnEncoding_Kind::ColumnEncoding_Kind_COMPRESS_ZSTD}, + {ColumnEncoding_Kind_COMPRESS_ZLIB_STR, + pax::ColumnEncoding_Kind::ColumnEncoding_Kind_COMPRESS_ZLIB}, +}; + +typedef struct { + const char *optname; /* option's name */ + const pax::PaxStorageFormat format; +} relopt_format_type_mapping; + +static const relopt_format_type_mapping kSelfRelFormatMap[] = { + {STORAGE_FORMAT_TYPE_ORC, pax::PaxStorageFormat::kTypeStorageOrcNonVec}, + {STORAGE_FORMAT_TYPE_ORC_VEC, pax::PaxStorageFormat::kTypeStorageOrcVec}, +}; + +// reloptions structure and variables. 
+static relopt_kind self_relopt_kind; + +#define PAX_COPY_STR_OPT(pax_opts_, pax_opt_name_) \ + do { \ + PaxOptions *pax_opts = reinterpret_cast(pax_opts_); \ + int pax_name_offset_ = *reinterpret_cast(pax_opts->pax_opt_name_); \ + if (pax_name_offset_) \ + strlcpy(pax_opts->pax_opt_name_, \ + reinterpret_cast(pax_opts) + pax_name_offset_, \ + sizeof(pax_opts->pax_opt_name_)); \ + } while (0) + +static const char *kSelfColumnEncodingClauseWhiteList[] = { + PAX_SOPT_COMPTYPE, + PAX_SOPT_COMPLEVEL, +}; + +static const relopt_parse_elt kSelfReloptTab[] = { + // no allow set with encoding + {PAX_SOPT_STORAGE_FORMAT, RELOPT_TYPE_STRING, + offsetof(PaxOptions, storage_format)}, + // allow with encoding + {PAX_SOPT_COMPTYPE, RELOPT_TYPE_STRING, + offsetof(PaxOptions, compress_type)}, + {PAX_SOPT_COMPLEVEL, RELOPT_TYPE_INT, offsetof(PaxOptions, compress_level)}, + {PAX_SOPT_PARTITION_BY, RELOPT_TYPE_STRING, + offsetof(PaxOptions, partition_by_offset)}, + {PAX_SOPT_PARTITION_RANGES, RELOPT_TYPE_STRING, + offsetof(PaxOptions, partition_ranges_offset)}, +}; + +static void paxc_validate_rel_options_storage_format(const char *value) { + size_t i; + + for (i = 0; i < lengthof(kSelfRelFormatMap); i++) { + if (strcmp(value, kSelfRelFormatMap[i].optname) == 0) return; + } + ereport(ERROR, (errmsg("unsupported storage format: '%s'", value))); +} + +static void paxc_validate_rel_options_compress_type(const char *value) { + size_t i; + + for (i = 0; i < lengthof(kSelfRelCompressMap); i++) { + if (strcmp(value, kSelfRelCompressMap[i].optname) == 0) return; + } + ereport(ERROR, (errmsg("unsupported compress type: '%s'", value))); +} + +static void paxc_validate_rel_option(PaxOptions *options) { + Assert(options); + if (strcmp(ColumnEncoding_Kind_NO_ENCODED_STR, options->compress_type) == 0 || + strcmp(ColumnEncoding_Kind_RLE_V2_STR, options->compress_type) == 0 || + strcmp(ColumnEncoding_Kind_DIRECT_DELTA_STR, options->compress_type) == + 0) { + if (options->compress_level != 0) { + 
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("compresslevel=%d should setting is not work for " + "current encoding.", + options->compress_level))); + } + } +} + +bytea *paxc_default_rel_options(Datum reloptions, char /*relkind*/, + bool validate) { + Assert(self_relopt_kind != 0); + bytea *rdopts = (bytea *)build_reloptions( + reloptions, validate, self_relopt_kind, sizeof(PaxOptions), + kSelfReloptTab, lengthof(kSelfReloptTab)); + + PAX_COPY_STR_OPT(rdopts, storage_format); + PAX_COPY_STR_OPT(rdopts, compress_type); + return rdopts; +} + +PaxOptions **paxc_relation_get_attribute_options(Relation rel) { + Datum *dats; + PaxOptions **opts; + int i; + + Assert(rel && OidIsValid(RelationGetRelid(rel))); + + opts = (PaxOptions **)palloc0(RelationGetNumberOfAttributes(rel) * + sizeof(PaxOptions *)); + + dats = get_rel_attoptions(RelationGetRelid(rel), + RelationGetNumberOfAttributes(rel)); + + for (i = 0; i < RelationGetNumberOfAttributes(rel); i++) { + if (DatumGetPointer(dats[i]) != NULL) { + opts[i] = (PaxOptions *)paxc_default_rel_options(dats[i], 0, false); + pfree(DatumGetPointer(dats[i])); + } + } + pfree(dats); + + return opts; +} + +static void paxc_validate_single_column_encoding_clauses( + List *single_column_encoding) { + ListCell *cell = NULL; + Datum d; + PaxOptions *option = NULL; + /* not allow caller pass the `PAX_SOPT_STORAGE_FORMAT` + */ + foreach (cell, single_column_encoding) { + DefElem *def = (DefElem *)lfirst(cell); + bool not_in_white_list = true; + + if (!def->defname) { + continue; + } + + for (size_t i = 0; i < lengthof(kSelfColumnEncodingClauseWhiteList); i++) { + if (strcmp(kSelfColumnEncodingClauseWhiteList[i], def->defname) == 0) { + not_in_white_list = false; + break; + } + } + + if (not_in_white_list) { + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("%s not allow setting in ENCODING CLAUSES.", + def->defname))); + } + } + + d = transformRelOptions(PointerGetDatum(NULL), 
single_column_encoding, NULL, + NULL, true, false); + + option = (PaxOptions *)paxc_default_rel_options(d, 0, true); + paxc_validate_rel_option(option); +} + +void paxc_validate_column_encoding_clauses(List *encoding_opts) { + ListCell *lc; + foreach (lc, encoding_opts) { + ColumnReferenceStorageDirective *crsd = + (ColumnReferenceStorageDirective *)lfirst(lc); + paxc_validate_single_column_encoding_clauses(crsd->encoding); + } +} + +List *paxc_transform_column_encoding_clauses(List *encoding_opts, bool validate, + bool fromType) { + List *ret_list = NIL; + + if (fromType) { + return NIL; + } + + ret_list = list_copy(encoding_opts); + /* there are no need to do column encoding clauses transform in pax + * because pax will setting default encoding inside + */ + if (validate) { + paxc_validate_single_column_encoding_clauses(encoding_opts); + } + + /* if column no setting the encoding clauses + * in transformColumnEncoding will pass the relation option + * to column encoding clauses, should remove the + * `PAX_SOPT_STORAGE_FORMAT` from it. 
+ */ + ListCell *cell = NULL; + foreach (cell, ret_list) { + DefElem *def = (DefElem *)lfirst(cell); + bool not_in_white_list = true; + if (!def->defname) { + continue; + } + + for (size_t i = 0; i < lengthof(kSelfColumnEncodingClauseWhiteList); i++) { + if (strcmp(kSelfColumnEncodingClauseWhiteList[i], def->defname) == 0) { + not_in_white_list = false; + break; + } + } + + if (not_in_white_list) { + ret_list = foreach_delete_current(ret_list, cell); + } + } + + return ret_list; +} + +void paxc_reg_rel_options() { + self_relopt_kind = add_reloption_kind(); + add_string_reloption( + self_relopt_kind, PAX_SOPT_STORAGE_FORMAT, "pax storage format", "orc", + paxc_validate_rel_options_storage_format, AccessExclusiveLock); + add_string_reloption(self_relopt_kind, PAX_SOPT_COMPTYPE, "pax compress type", + PAX_DEFAULT_COMPRESSTYPE, + paxc_validate_rel_options_compress_type, + AccessExclusiveLock); + add_int_reloption(self_relopt_kind, PAX_SOPT_COMPLEVEL, "pax compress level", + PAX_DEFAULT_COMPRESSLEVEL, PAX_MIN_COMPRESSLEVEL, + PAX_MAX_COMPRESSLEVEL, AccessExclusiveLock); + add_string_reloption(self_relopt_kind, PAX_SOPT_PARTITION_BY, "partition by", + NULL, NULL, AccessExclusiveLock); + add_string_reloption(self_relopt_kind, PAX_SOPT_PARTITION_RANGES, + "partition ranges", NULL, NULL, AccessExclusiveLock); +} + +} // namespace paxc + +namespace pax { + +ColumnEncoding_Kind CompressKeyToColumnEncodingKind(const char *encoding_str) { + Assert(encoding_str); + + for (size_t i = 0; i < lengthof(paxc::kSelfRelCompressMap); i++) { + if (strcmp(paxc::kSelfRelCompressMap[i].optname, encoding_str) == 0) { + return paxc::kSelfRelCompressMap[i].kind; + } + } + + CBDB_RAISE(cbdb::CException::kExTypeLogicError); +} + +PaxStorageFormat StorageFormatKeyToPaxStorageFormat( + const char *storage_format_str) { + Assert(storage_format_str); + + for (size_t i = 0; i < lengthof(paxc::kSelfRelFormatMap); i++) { + if (strcmp(paxc::kSelfRelFormatMap[i].optname, storage_format_str) == 0) { + 
return paxc::kSelfRelFormatMap[i].format; + } + } + + CBDB_RAISE(cbdb::CException::kExTypeLogicError); +} + +} // namespace pax diff --git a/contrib/pax_storage/src/cpp/access/paxc_rel_options.h b/contrib/pax_storage/src/cpp/access/paxc_rel_options.h new file mode 100644 index 00000000000..cc66575610b --- /dev/null +++ b/contrib/pax_storage/src/cpp/access/paxc_rel_options.h @@ -0,0 +1,110 @@ +#pragma once + +#include "comm/cbdb_api.h" + +#include "exceptions/CException.h" +#include "storage/pax_defined.h" +#include "storage/proto/proto_wrappers.h" // for ColumnEncoding_Kind + +namespace paxc { + +#define ColumnEncoding_Kind_NO_ENCODED_STR "none" +#define ColumnEncoding_Kind_RLE_V2_STR "rle" +#define ColumnEncoding_Kind_DIRECT_DELTA_STR "delta" +#define ColumnEncoding_Kind_COMPRESS_ZSTD_STR "zstd" +#define ColumnEncoding_Kind_COMPRESS_ZLIB_STR "zlib" + +#define STORAGE_FORMAT_TYPE_ORC "orc" +#define STORAGE_FORMAT_TYPE_ORC_VEC "orc_vec" +#define STORAGE_FORMAT_TYPE_DEFAULT STORAGE_FORMAT_TYPE_ORC + +#define PAX_DEFAULT_COMPRESSLEVEL AO_DEFAULT_COMPRESSLEVEL +#define PAX_MIN_COMPRESSLEVEL AO_MIN_COMPRESSLEVEL +#define PAX_MAX_COMPRESSLEVEL AO_MAX_COMPRESSLEVEL +#define PAX_DEFAULT_COMPRESSTYPE ColumnEncoding_Kind_NO_ENCODED_STR + +#define PAX_SOPT_STORAGE_FORMAT "storage_format" +#define PAX_SOPT_COMPTYPE SOPT_COMPTYPE +#define PAX_SOPT_COMPLEVEL SOPT_COMPLEVEL +#define PAX_SOPT_PARTITION_BY "partition_by" +#define PAX_SOPT_PARTITION_RANGES "partition_ranges" + +// plain structure used by reloptions, can be accessed from C++ code. +struct PaxOptions { + // Pax needs to define the StdRdOptions instead of just vl_len. + // This is because many places in the CBDB assume that option in + // relation can be cast into StdRdOptions. + StdRdOptions rd_options; + char storage_format[16]; + char compress_type[16]; + int compress_level; + int partition_by_offset = 0; + int partition_ranges_offset = 0; + + char *partition_by() { + return partition_by_offset == 0 + ? 
NULL + : reinterpret_cast(this) + partition_by_offset; + } + char *partition_ranges() { + return partition_ranges_offset == 0 + ? NULL + : reinterpret_cast(this) + partition_ranges_offset; + } +}; + +#define RelationGetOptions(relation, field_name, default_opt) \ + ((relation)->rd_options \ + ? ((paxc::PaxOptions *)(relation)->rd_options)->field_name \ + : (default_opt)) + +/* + * used to register pax rel options + */ +void paxc_reg_rel_options(); + +/* + * parse the rel options in `pg_attribute_encoding` and relation + * if no ENCODING setting in `pg_attribute_encoding` will fill with + * the default one + */ +bytea *paxc_default_rel_options(Datum reloptions, char /*relkind*/, + bool validate); + +/* + * parse the attr options from `pg_attribute_encoding` + * if no ENCODING setting in `pg_attribute_encoding` will fill with + * the default one + */ +PaxOptions **paxc_relation_get_attribute_options(Relation rel); + +/* + * validate the ENCODING CLAUSES + * like `CREATE TABLE t1 (c1 int, COLUMN c1 ENCODING (key=value)) using + * pax` + */ +void paxc_validate_column_encoding_clauses(List *encoding_opts); + +/* + * transform the ENCODING options if key no setting + * validate will become true only when the encoding syntax is true + * like `CREATE TABLE t1 (c1 int ENCODING (key=value)) using pax` + * + * pax no need transform the ENCODING options if key no setting + * it will deal the default value inside pax colomn + */ +List *paxc_transform_column_encoding_clauses(List *encoding_opts, bool validate, + bool fromType); + +} // namespace paxc + +namespace pax { + +// use to transform compress type str to encoding kind +extern ColumnEncoding_Kind CompressKeyToColumnEncodingKind( + const char *encoding_str); + +extern PaxStorageFormat StorageFormatKeyToPaxStorageFormat( + const char *storage_format_str); + +} // namespace pax diff --git a/contrib/pax_storage/src/cpp/access/paxc_scanner.cc b/contrib/pax_storage/src/cpp/access/paxc_scanner.cc new file mode 100644 index 
00000000000..43d628ee25f --- /dev/null +++ b/contrib/pax_storage/src/cpp/access/paxc_scanner.cc @@ -0,0 +1,131 @@ +#include "access/paxc_scanner.h" + +#include "access/pax_partition.h" + +#define blank_char(ch) ((ch) == ' ' || (ch) == '\t' || (ch) == '\n') +#define ident_char(ch) (((ch) >= 'a' && (ch) <= 'z') || \ + ((ch) >= 'A' && (ch) <= 'Z') || \ + ((ch) >= '0' && (ch) <= '9') || \ + (ch) == '_') + +static inline const char *paxc_eat_blank(const char *s) { + while (blank_char(*s)) + s++; + return s; +} + +static inline const char *paxc_expect_char(const char *s, char ch) { + const char *p = paxc_eat_blank(s); + if (*p != ch) + elog(ERROR, "invalid syntax for partition range:'%s' at '%s'", s, p); + + return p + 1; +} + +static const char *paxc_expect_ident(const char *s, const char *ident) { + const char *p = s; + const char *q; + size_t n; + + n = strlen(ident); + p = paxc_eat_blank(s); + if (strncasecmp(p, ident, n) != 0) + elog(ERROR, "unexpected ident: %s, want %s", s, ident); + q = p + n; + if (ident_char(*q)) + elog(ERROR, "unexpected ident: %s, want %s", s, ident); + + return q; +} + +static const char *paxc_parse_single_integer(const char *expr, Node **result) { + const char *p; + char *endptr; + int val; + + p = paxc_eat_blank(expr); + val = strtol(p, &endptr, 10); + A_Const *n = makeNode(A_Const); + + n->val.type = T_Integer; + n->val.val.ival = val; + n->location = -1; + *result = (Node *)n; + + return endptr; +} + +static const char *paxc_parse_expr_list(const char *expr_list, List **result) { + const char *p = expr_list; + + *result = NIL; + p = paxc_eat_blank(expr_list); + while (*p) { + Node *value = NULL; + p = paxc_parse_single_integer(p, &value); + Assert(value); + + *result = lappend(*result, value); + + p = paxc_eat_blank(p); + if (*p != ',') break; + p++; + } + return p; +} + +List *paxc_parse_partition_ranges(const char *ranges) { + const char *p = ranges; + List *result = NIL; + if (!p || *p == '\0') return NIL; + + while (*p && (p = 
paxc_expect_ident(p, "from"))) { + List *from_list = NIL; + List *to_list = NIL; + List *every_list = NIL; + + p = paxc_expect_char(p, '('); + p = paxc_parse_expr_list(p, &from_list); + p = paxc_expect_char(p, ')'); + Assert(from_list); + + p = paxc_expect_ident(p, "to"); + p = paxc_expect_char(p, '('); + p = paxc_parse_expr_list(p, &to_list); + p = paxc_expect_char(p, ')'); + Assert(to_list); + + p = paxc_eat_blank(p); + if (strncasecmp(p, "every", 5) == 0) { + // from(X) to(Y) every(Z) + p += 5; + p = paxc_expect_char(p, '('); + p = paxc_parse_expr_list(p, &every_list); + p = paxc_expect_char(p, ')'); + Assert(every_list); + p = paxc_eat_blank(p); + } + if (*p == ',') { + p++; + } else if (*p != '\0') { + elog(ERROR, "unexpected range delimiter: %s", p); + } + + if (list_length(from_list) == 0 || + list_length(from_list) != list_length(to_list)) { + elog(ERROR, "the lengths of expr_list are not equal in from and to: %d %d", + list_length(from_list), list_length(to_list)); + } + + PartitionRangeExtension *ext = (PartitionRangeExtension *)palloc0(sizeof(PartitionRangeExtension)); + PartitionBoundSpec *n = &ext->spec; + n->type = T_PartitionBoundSpec; + n->strategy = PARTITION_STRATEGY_RANGE; + n->is_default = false; + n->lowerdatums = from_list; + n->upperdatums = to_list; + ext->every = every_list; + result = lappend(result, ext); + } + return result; +} diff --git a/contrib/pax_storage/src/cpp/access/paxc_scanner.h b/contrib/pax_storage/src/cpp/access/paxc_scanner.h new file mode 100644 index 00000000000..79ca99f9537 --- /dev/null +++ b/contrib/pax_storage/src/cpp/access/paxc_scanner.h @@ -0,0 +1,13 @@ +#pragma once +#include "comm/cbdb_api.h" + +#ifdef __cplusplus +extern "C" { +#endif +struct List; +extern struct List *paxc_raw_parse(const char *str); +extern struct List *paxc_parse_partition_ranges(const char *ranges); + +#ifdef __cplusplus +} +#endif diff --git a/contrib/pax_storage/src/cpp/catalog/micro_partition_stats.cc 
b/contrib/pax_storage/src/cpp/catalog/micro_partition_stats.cc deleted file mode 100644 index 3e0bd53d103..00000000000 --- a/contrib/pax_storage/src/cpp/catalog/micro_partition_stats.cc +++ /dev/null @@ -1,341 +0,0 @@ -#include "catalog/micro_partition_stats.h" - -#include "comm/cbdb_api.h" - -#include "comm/cbdb_wrappers.h" -#include "storage/micro_partition_metadata.h" -#include "storage/proto/proto_wrappers.h" - -namespace pax { -// SetStatsMessage may be called several times in a write, -// one for each micro partition, so all members need to reset. -// Some metainfo like typid, collation, oids for less/greater, -// fmgr should be exactly consistent. -MicroPartitionStats *MicroPartitionStats::SetStatsMessage( - pax::stats::MicroPartitionStatisticsInfo *stats, int natts) { - FmgrInfo finfo; - std::tuple zero_oids = {InvalidOid, InvalidOid, InvalidOid, InvalidOid}; - - Assert(natts > 0); - Assert(stats && stats->columnstats_size() == 0); - initial_check_ = false; - stats_ = stats; - - memset(&finfo, 0, sizeof(finfo)); - procs_.clear(); - finfos_.clear(); - status_.clear(); - for (int i = 0; i < natts; i++) { - procs_.emplace_back(zero_oids); - finfos_.emplace_back(std::pair({finfo, finfo})); - status_.emplace_back('u'); - auto columnstats = stats_->add_columnstats(); - Assert(columnstats->allnull()); - Assert(!columnstats->hasnull()); - } - Assert(stats_->columnstats_size() == natts); - return this; -} - -void MicroPartitionStats::AddRow(TupleTableSlot *slot) { - auto desc = slot->tts_tupleDescriptor; - auto n = desc->natts; - - if (!initial_check_) { - DoInitialCheck(desc); - initial_check_ = true; - } - CBDB_CHECK(status_.size() == static_cast(n), - cbdb::CException::ExType::kExTypeSchemaNotMatch); - for (auto i = 0; i < n; i++) { - auto att = &desc->attrs[i]; - - AssertImply(att->attisdropped, slot->tts_isnull[i]); - if (slot->tts_isnull[i]) - AddNullColumn(i); - else - AddNonNullColumn(i, slot->tts_values[i], desc); - } -} - -void 
MicroPartitionStats::AddNullColumn(int column_index) { - Assert(column_index >= 0); - Assert(column_index < static_cast(procs_.size())); - - auto column_stats = stats_->mutable_columnstats(column_index); - column_stats->set_hasnull(true); -} - -void MicroPartitionStats::AddNonNullColumn(int column_index, Datum value, - TupleDesc desc) { - Assert(column_index >= 0); - Assert(column_index < static_cast(procs_.size())); - - auto att = TupleDescAttr(desc, column_index); - auto collation = att->attcollation; - auto typlen = att->attlen; - auto typbyval = att->attbyval; - auto column_stats = stats_->mutable_columnstats(column_index); - column_stats->set_allnull(false); - - // update min/max - switch (status_[column_index]) { - case 'x': - break; - case 'y': - Assert(column_stats->minmaxstats().has_typid()); - Assert(column_stats->minmaxstats().has_minimal()); - Assert(column_stats->minmaxstats().has_maximum()); - Assert(column_stats->minmaxstats().has_proclt()); - Assert(column_stats->minmaxstats().has_procgt()); - Assert(column_stats->minmaxstats().has_procle()); - Assert(column_stats->minmaxstats().has_procge()); - Assert(column_stats->minmaxstats().typid() == att->atttypid); - Assert(column_stats->minmaxstats().collation() == collation); - - UpdateMinMaxValue(column_index, value, collation, typlen, typbyval); - break; - case 'n': { - auto minmax = column_stats->mutable_minmaxstats(); - - Assert(!minmax->has_proclt()); - Assert(!minmax->has_procgt()); - Assert(!minmax->has_procle()); - Assert(!minmax->has_procge()); - Assert(!minmax->has_typid()); - Assert(!minmax->has_minimal()); - Assert(!minmax->has_maximum()); - - minmax->set_typid(att->atttypid); - minmax->set_collation(collation); - minmax->set_proclt(std::get<0>(procs_[column_index])); - minmax->set_procgt(std::get<1>(procs_[column_index])); - minmax->set_procle(std::get<2>(procs_[column_index])); - minmax->set_procge(std::get<3>(procs_[column_index])); - minmax->set_minimal(ToValue(value, typlen, typbyval)); - 
minmax->set_maximum(ToValue(value, typlen, typbyval)); - status_[column_index] = 'y'; - break; - } - default: - Assert(false); - } -} - -void MicroPartitionStats::UpdateMinMaxValue(int column_index, Datum datum, - Oid collation, int typlen, - bool typbyval) { - Assert(initial_check_); - Assert(column_index >= 0 && static_cast(column_index) < status_.size()); - Assert(status_[column_index] == 'y'); - - auto &finfos = finfos_[column_index]; - auto minmax = - stats_->mutable_columnstats(column_index)->mutable_minmaxstats(); - bool ok; - - { - const auto &min = minmax->minimal(); - auto val = FromValue(min, typlen, typbyval, &ok); - CBDB_CHECK(ok, cbdb::CException::kExTypeLogicError); - auto update = - DatumGetBool(cbdb::FunctionCall2Coll(&finfos.first, collation, datum, val)); - if (update) minmax->set_minimal(ToValue(datum, typlen, typbyval)); - } - { - const auto &max = minmax->maximum(); - auto val = FromValue(max, typlen, typbyval, &ok); - CBDB_CHECK(ok, cbdb::CException::kExTypeLogicError); - auto update = - DatumGetBool(cbdb::FunctionCall2Coll(&finfos.second, collation, datum, val)); - if (update) minmax->set_maximum(ToValue(datum, typlen, typbyval)); - } -} - -bool MicroPartitionStats::GetStrategyProcinfo( - Oid typid, std::tuple &procids, - std::pair &finfos) { - return cbdb::MinMaxGetStrategyProcinfo(typid, &std::get<0>(procids), &finfos.first, - BTLessStrategyNumber) && - cbdb::MinMaxGetStrategyProcinfo(typid, &std::get<1>(procids), &finfos.second, - BTGreaterStrategyNumber) && - cbdb::MinMaxGetStrategyProcinfo(typid, &std::get<2>(procids), nullptr, - BTLessEqualStrategyNumber) && - cbdb::MinMaxGetStrategyProcinfo(typid, &std::get<3>(procids), nullptr, - BTGreaterEqualStrategyNumber); -} - -void MicroPartitionStats::DoInitialCheck(TupleDesc desc) { - auto natts = desc->natts; - - Assert(natts == static_cast(status_.size())); - Assert(natts == stats_->columnstats_size()); - Assert(status_.size() == procs_.size()); - Assert(status_.size() == finfos_.size()); - 
- for (int i = 0; i < natts; i++) { - auto att = TupleDescAttr(desc, i); - if (att->attisdropped || - !GetStrategyProcinfo(att->atttypid, procs_[i], finfos_[i])) { - status_[i] = 'x'; - continue; - } - status_[i] = 'n'; - } -} - -Datum MicroPartitionStats::FromValue(const std::string &s, int typlen, - bool typbyval, bool *ok) { - const char *p = s.data(); - *ok = true; - if (typbyval) { - Assert(typlen > 0); - switch (typlen) { - case 1: { - int8 i = *reinterpret_cast(p); - return cbdb::Int8ToDatum(i); - } - case 2: { - int16 i = *reinterpret_cast(p); - return cbdb::Int16ToDatum(i); - } - case 4: { - int32 i = *reinterpret_cast(p); - return cbdb::Int32ToDatum(i); - } - case 8: { - int64 i = *reinterpret_cast(p); - return cbdb::Int64ToDatum(i); - } - default: - Assert(!"unexpected typbyval, len not in 1,2,4,8"); - *ok = false; - break; - } - return 0; - } - - Assert(typlen == -1 || typlen > 0); - return PointerGetDatum(p); -} - -std::string MicroPartitionStats::ToValue(Datum datum, int typlen, - bool typbyval) { - if (typbyval) { - Assert(typlen > 0); - switch (typlen) { - case 1: { - int8 i = cbdb::DatumToInt8(datum); - return std::string(reinterpret_cast(&i), sizeof(i)); - } - case 2: { - int16 i = cbdb::DatumToInt16(datum); - return std::string(reinterpret_cast(&i), sizeof(i)); - } - case 4: { - int32 i = cbdb::DatumToInt32(datum); - return std::string(reinterpret_cast(&i), sizeof(i)); - } - case 8: { - int64 i = cbdb::DatumToInt64(datum); - return std::string(reinterpret_cast(&i), sizeof(i)); - } - default: - Assert(!"unexpected typbyval, len not in 1,2,4,8"); - break; - } - CBDB_RAISE(cbdb::CException::kExTypeLogicError); - } - - if (typlen == -1) { - void *v; - int len; - - v = cbdb::PointerAndLenFromDatum(datum, &len); - Assert(v && len != -1); - return std::string(reinterpret_cast(v), len); - } - // byref but fixed size - Assert(typlen > 0); - return std::string(reinterpret_cast(cbdb::DatumToPointer(datum)), - typlen); -} -} // namespace pax - -static inline 
const char *BoolToString(bool b) { return b ? "true" : "false"; } - -static char *TypeValueToCString(Oid typid, Oid collation, - const std::string &value) { - FmgrInfo finfo; - HeapTuple tuple; - Form_pg_type form; - Datum datum; - bool ok; - - tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typid)); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for type %u", typid); - - form = (Form_pg_type)GETSTRUCT(tuple); - Assert(OidIsValid(form->typoutput)); - - datum = pax::MicroPartitionStats::FromValue(value, form->typlen, - form->typbyval, &ok); - if (!ok) - elog(ERROR, "unexpected typlen: %d\n", form->typlen); - - fmgr_info_cxt(form->typoutput, &finfo, CurrentMemoryContext); - datum = FunctionCall1Coll(&finfo, collation, datum); - ReleaseSysCache(tuple); - - return DatumGetCString(datum); -} - -// define stat type for custom output -extern "C" { -extern Datum MicroPartitionStatsInput(PG_FUNCTION_ARGS); -extern Datum MicroPartitionStatsOutput(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(MicroPartitionStatsInput); -PG_FUNCTION_INFO_V1(MicroPartitionStatsOutput); -} - -Datum MicroPartitionStatsInput(PG_FUNCTION_ARGS) { - ereport(ERROR, (errmsg("unsupport MicroPartitionStatsInput"))); - (void)fcinfo; - PG_RETURN_POINTER(NULL); -} - -Datum MicroPartitionStatsOutput(PG_FUNCTION_ARGS) { - struct varlena *v = PG_GETARG_VARLENA_PP(0); - pax::stats::MicroPartitionStatisticsInfo stats; - StringInfoData str; - - bool ok = stats.ParseFromArray(VARDATA_ANY(v), VARSIZE_ANY_EXHDR(v)); - if (!ok) ereport(ERROR, (errmsg("micropartition stats is corrupt"))); - - initStringInfo(&str); - for (int i = 0, n = stats.columnstats_size(); i < n; i++) { - const auto &column = stats.columnstats(i); - - if (i > 0) appendStringInfoChar(&str, ','); - - appendStringInfo(&str, "[(%s,%s)", BoolToString(column.allnull()), - BoolToString(column.hasnull())); - - if (!column.has_minmaxstats()) { - appendStringInfoString(&str, ",None]"); - continue; - } - - const auto &minmax = 
column.minmaxstats(); - appendStringInfo(&str, ",(%u,%u,%u,%u,%s,%s)]", minmax.typid(), - minmax.collation(), minmax.proclt(), - minmax.procgt(), - TypeValueToCString(minmax.typid(), minmax.collation(), - minmax.minimal()), - TypeValueToCString(minmax.typid(), minmax.collation(), - minmax.maximum())); - } - - PG_RETURN_CSTRING(str.data); -} diff --git a/contrib/pax_storage/src/cpp/catalog/micro_partition_stats.h b/contrib/pax_storage/src/cpp/catalog/micro_partition_stats.h deleted file mode 100644 index 56be8e8cddf..00000000000 --- a/contrib/pax_storage/src/cpp/catalog/micro_partition_stats.h +++ /dev/null @@ -1,51 +0,0 @@ -#pragma once -#include "comm/cbdb_api.h" - -#include -#include -#include - -namespace pax { -namespace stats { -class MicroPartitionStatisticsInfo; -} - -class MicroPartitionStats final { - public: - MicroPartitionStats() = default; - MicroPartitionStats *SetStatsMessage( - pax::stats::MicroPartitionStatisticsInfo *stats, int natts); - - void AddRow(TupleTableSlot *slot); - - static std::string ToValue(Datum datum, int typlen, bool typbyval); - static Datum FromValue(const std::string &s, int typlen, bool typbyval, bool *ok); - - private: - void AddNullColumn(int column_index); - void AddNonNullColumn(int column_index, Datum value, TupleDesc desc); - void DoInitialCheck(TupleDesc desc); - void UpdateMinMaxValue(int column_index, Datum datum, Oid collation, - int typlen, bool typbyval); - static bool GetStrategyProcinfo(Oid typid, std::tuple &procids, - std::pair &finfos); - - // stats_: only references the info object by pointer - pax::stats::MicroPartitionStatisticsInfo *stats_ = nullptr; - - // less: tuple[0], greater: tuple[1], le: tuple[2], ge: tuple[3] - std::vector> procs_; - // less: pair[0], greater: pair[1] - std::vector> finfos_; - - // status to indicate whether the oids are initialized - // or the min-max values are initialized - // 'u': all is uninitialized - // 'x': column doesn't support min-max - // 'n': oids are initialized, but 
min-max value is missing - // 'y': min-max is set, needs update. - std::vector status_; - bool initial_check_ = false; -}; - -} // namespace pax diff --git a/contrib/pax_storage/src/cpp/catalog/pax_aux_table.cc b/contrib/pax_storage/src/cpp/catalog/pax_aux_table.cc index 906d8d9fd22..44d97f25694 100644 --- a/contrib/pax_storage/src/cpp/catalog/pax_aux_table.cc +++ b/contrib/pax_storage/src/cpp/catalog/pax_aux_table.cc @@ -3,14 +3,14 @@ #include "comm/cbdb_api.h" #include - #include +#include "catalog/pax_fastsequence.h" +#include "catalog/pg_pax_tables.h" #include "comm/cbdb_wrappers.h" #include "storage/file_system.h" #include "storage/local_file_system.h" #include "storage/micro_partition_metadata.h" -#include "storage/paxc_block_map_manager.h" namespace paxc { @@ -46,26 +46,20 @@ static void CPaxTransactionalTruncateTable(Oid aux_relid) { // 2.create table outside transactional block, insert data // and truncate table inside transactional block. static void CPaxNontransactionalTruncateTable(Relation rel) { - HeapTuple tuple; Relation aux_rel; Oid aux_relid; - tuple = SearchSysCache1(PAXTABLESID, RelationGetRelid(rel)); - if (!HeapTupleIsValid(tuple)) - ereport(ERROR, (errcode(ERRCODE_UNDEFINED_SCHEMA), - errmsg("cache lookup failed with relid=%u for aux relation " - "in pg_pax_tables.", - RelationGetRelid(rel)))); - aux_relid = ((Form_pg_pax_tables)GETSTRUCT(tuple))->blocksrelid; - ReleaseSysCache(tuple); + aux_relid = ::paxc::GetPaxAuxRelid(RelationGetRelid(rel)); Assert(OidIsValid(aux_relid)); aux_rel = relation_open(aux_relid, AccessExclusiveLock); heap_truncate_one_rel(aux_rel); relation_close(aux_rel, NoLock); + + paxc::CPaxInitializeFastSequenceEntry(RelationGetRelid(rel), FASTSEQUENCE_INIT_TYPE_INPLACE); } -static void CPaxCreateMicroPartitionTable(const Relation rel) { +void CPaxCreateMicroPartitionTable(Relation rel) { Relation pg_class_desc; char aux_relname[32]; Oid relid; @@ -79,7 +73,7 @@ static void CPaxCreateMicroPartitionTable(const Relation rel) 
{ // 1. create blocks table. snprintf(aux_relname, sizeof(aux_relname), "pg_pax_blocks_%u", pax_relid); - aux_namespace_id = PG_PAXAUX_NAMESPACE; + aux_namespace_id = PG_EXTAUX_NAMESPACE; aux_relid = GetNewOidForRelation(pg_class_desc, ClassOidIndexId, Anum_pg_class_oid, // new line aux_relname, aux_namespace_id); @@ -91,12 +85,18 @@ static void CPaxCreateMicroPartitionTable(const Relation rel) { // TODO(chenhongjie): uncompressed and compressed ptblocksize are needed. TupleDescInitEntry(tupdesc, (AttrNumber)ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKSIZE, "ptblocksize", INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber)ANUM_PG_PAX_BLOCK_TABLES_PTSTATISITICS, + TupleDescInitEntry(tupdesc, + (AttrNumber)ANUM_PG_PAX_BLOCK_TABLES_PTSTATISITICS, "ptstatistics", PAX_AUX_STATS_TYPE_OID, -1, 0); + { + // Add constraints for the aux table + auto attr = TupleDescAttr(tupdesc, ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKNAME - 1); + attr->attnotnull = true; + } relid = heap_create_with_catalog( aux_relname, aux_namespace_id, InvalidOid, aux_relid, InvalidOid, InvalidOid, rel->rd_rel->relowner, HEAP_TABLE_AM_OID, tupdesc, NIL, - RELKIND_RELATION, rel->rd_rel->relpersistence, rel->rd_rel->relisshared, + RELKIND_RELATION, RELPERSISTENCE_PERMANENT, rel->rd_rel->relisshared, RelationIsMapped(rel), ONCOMMIT_NOOP, NULL, /* GP Policy */ (Datum)0, false, /* use _user_acl */ true, true, InvalidOid, NULL, /* typeaddress */ @@ -104,8 +104,10 @@ static void CPaxCreateMicroPartitionTable(const Relation rel) { Assert(relid == aux_relid); table_close(pg_class_desc, NoLock); + NewRelationCreateToastTable(relid, (Datum)0); + // 2. insert entry into pg_pax_tables. - InsertPaxTablesEntry(pax_relid, aux_relid, "", 0); + ::paxc::InsertPaxTablesEntry(pax_relid, aux_relid, NULL); // 3. record pg_depend, pg_pax_blocks_ depends relation. 
{ @@ -123,84 +125,115 @@ static void CPaxCreateMicroPartitionTable(const Relation rel) { base.classId = RelationRelationId; base.objectId = pax_relid; base.objectSubId = 0; - aux.classId = PaxTablesRelationId; + aux.classId = PAX_TABLES_RELATION_ID; aux.objectId = pax_relid; aux.objectSubId = 0; recordDependencyOn(&aux, &base, DEPENDENCY_INTERNAL); } -} - -static void CPaxDeletePaxBlockEntry(Oid relid, Snapshot pax_meta_data_snapshot, - const char *blockname) { - Relation rel; - ScanKeyData key[1]; - SysScanDesc scan; - HeapTuple tuple; - NameData ptblockname; + CommandCounterIncrement(); - rel = table_open(relid, RowExclusiveLock); - namestrcpy(&ptblockname, blockname); - ScanKeyInit(&key[0], ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKNAME, - BTEqualStrategyNumber, F_NAMEEQ, NameGetDatum(&ptblockname)); - - // should add snapshot support - scan = systable_beginscan(rel, InvalidOid, false, pax_meta_data_snapshot, 1, - key); - - tuple = systable_getnext(scan); - if (HeapTupleIsValid(tuple)) { - CatalogTupleDelete(rel, &tuple->t_self); + // 4. create index on ptblockname dynamically, the index name should be pg_paxaux.pg_pax_blocks_index_xxx. 
+ { + char aux_index_name[NAMEDATALEN]; + IndexInfo *indexInfo; + List *indexColNames; + Relation aux_rel; + int16 coloptions[1]; + Oid classObjectId[1]; + Oid collationObjectId[1]; + + snprintf(aux_index_name, sizeof(aux_index_name), "%s_idx", aux_relname); + + indexInfo = makeNode(IndexInfo); + indexInfo->ii_NumIndexAttrs = 1; + indexInfo->ii_NumIndexKeyAttrs = 1; + indexInfo->ii_IndexAttrNumbers[0] = ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKNAME; + indexInfo->ii_Expressions = NIL; + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_Predicate = NIL; + indexInfo->ii_PredicateState = NULL; + indexInfo->ii_Unique = true; + indexInfo->ii_ReadyForInserts = true; + indexInfo->ii_Concurrent = false; + indexInfo->ii_Am = BTREE_AM_OID; + indexInfo->ii_Context = CurrentMemoryContext; + + collationObjectId[0] = C_COLLATION_OID; + classObjectId[0] = GetDefaultOpClass(NAMEOID, BTREE_AM_OID); + coloptions[0] = 0; + + auto attr = TupleDescAttr(tupdesc, ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKNAME - 1); + indexColNames = list_make1(NameStr(attr->attname)); + + // ShareLock is not really needed here, but take it anyway. 
+ aux_rel = table_open(aux_relid, ShareLock); + + index_create(aux_rel, + aux_index_name, + InvalidOid, + InvalidOid, + InvalidOid, + InvalidOid, + indexInfo, + indexColNames, + BTREE_AM_OID, + rel->rd_rel->reltablespace, + collationObjectId, classObjectId, coloptions, (Datum) 0, + INDEX_CREATE_IS_PRIMARY, 0, true, true, NULL); + + // Unlock target table -- no one can see it + table_close(aux_rel, ShareLock); + + // Unlock the index -- no one can see it anyway + //UnlockRelationOid(paxauxiliary_idxid, AccessExclusiveLock); + + CommandCounterIncrement(); } - systable_endscan(scan); - table_close(rel, RowExclusiveLock); } -static void CPaxCopyPaxBlockEntry(Relation old_relation, Relation new_relation) { +void DeleteMicroPartitionEntry(Oid pax_relid, Snapshot snapshot, + const char *blockname) { + ScanAuxContext context; HeapTuple tuple; - SysScanDesc pax_scan; - Relation old_aux_rel, new_aux_rel; - Oid old_aux_relid = 0, new_aux_relid = 0; + Oid aux_relid; - HeapTuple tupcache; - tupcache = SearchSysCache1(PAXTABLESID, RelationGetRelid(old_relation)); - Assert(HeapTupleIsValid(tupcache)); - old_aux_relid = ((Form_pg_pax_tables)GETSTRUCT(tupcache))->blocksrelid; - ReleaseSysCache(tupcache); + aux_relid = ::paxc::GetPaxAuxRelid(pax_relid); - tupcache = SearchSysCache1(PAXTABLESID, RelationGetRelid(new_relation)); - Assert(HeapTupleIsValid(tupcache)); - new_aux_relid = ((Form_pg_pax_tables)GETSTRUCT(tupcache))->blocksrelid; - ReleaseSysCache(tupcache); + context.BeginSearchMicroPartition(aux_relid, InvalidOid, snapshot, RowExclusiveLock, blockname); + tuple = context.SearchMicroPartitionEntry(); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "delete micro partition \"%s\" failed for relation(%u)", blockname, pax_relid); - old_aux_rel = table_open(old_aux_relid, RowExclusiveLock); - new_aux_rel = table_open(new_aux_relid, RowExclusiveLock); + Assert(context.GetRelation()); + CatalogTupleDelete(context.GetRelation(), &tuple->t_self); - pax_scan = 
systable_beginscan(old_aux_rel, InvalidOid, false, - NULL, 0, NULL); - while ((tuple = systable_getnext(pax_scan)) != NULL) { - CatalogTupleInsert(new_aux_rel, tuple); - } - systable_endscan(pax_scan); - table_close(old_aux_rel, RowExclusiveLock); - table_close(new_aux_rel, RowExclusiveLock); + context.EndSearchMicroPartition(NoLock); } -} // namespace paxc +void InsertMicroPartitionPlaceHolder(Oid aux_relid, const char *blockname) { + NameData ptblockname; + Datum values[NATTS_PG_PAX_BLOCK_TABLES]; + bool nulls[NATTS_PG_PAX_BLOCK_TABLES]; -namespace cbdb { -Oid GetPaxAuxRelid(Oid relid) { - Oid aux_relid = InvalidOid; - CBDB_WRAP_START; - { - GetPaxTablesEntryAttributes(relid, &aux_relid, NULL, NULL); - return aux_relid; - } - CBDB_WRAP_END; + Assert(blockname && strlen(blockname) < NAMEDATALEN); + namestrcpy(&ptblockname, blockname); + + values[ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKNAME - 1] = NameGetDatum(&ptblockname); + nulls[ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKNAME - 1] = false; + + nulls[ANUM_PG_PAX_BLOCK_TABLES_PTTUPCOUNT - 1] = true; + nulls[ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKSIZE - 1] = true; + nulls[ANUM_PG_PAX_BLOCK_TABLES_PTSTATISITICS - 1] = true; + + InsertTuple(aux_relid, values, nulls); + CommandCounterIncrement(); } -static void InsertPaxBlockEntry(Oid relid, const char *blockname, int pttupcount, - int ptblocksize, const ::pax::stats::MicroPartitionStatisticsInfo &mp_stats) { +void InsertOrUpdateMicroPartitionPlaceHolder(Oid aux_relid, + const char *blockname, + int num_tuples, int file_size, + const ::pax::stats::MicroPartitionStatisticsInfo &mp_stats) { int stats_length = mp_stats.ByteSize(); uint32 len = VARHDRSZ + stats_length; void *output; @@ -209,58 +242,221 @@ static void InsertPaxBlockEntry(Oid relid, const char *blockname, int pttupcount Datum values[NATTS_PG_PAX_BLOCK_TABLES]; bool nulls[NATTS_PG_PAX_BLOCK_TABLES]; - output = cbdb::Palloc(len); + output = palloc(len); SET_VARSIZE(output, len); mp_stats.SerializeToArray(VARDATA(output), 
stats_length); Assert(blockname); namestrcpy(&ptblockname, blockname); - values[ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKNAME - 1] = - NameGetDatum(&ptblockname); + values[ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKNAME - 1] = NameGetDatum(&ptblockname); nulls[ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKNAME - 1] = false; - values[ANUM_PG_PAX_BLOCK_TABLES_PTTUPCOUNT - 1] = Int32GetDatum(pttupcount); + values[ANUM_PG_PAX_BLOCK_TABLES_PTTUPCOUNT - 1] = Int32GetDatum(num_tuples); nulls[ANUM_PG_PAX_BLOCK_TABLES_PTTUPCOUNT - 1] = false; - values[ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKSIZE - 1] = - Int32GetDatum(ptblocksize); + values[ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKSIZE - 1] = Int32GetDatum(file_size); nulls[ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKSIZE - 1] = false; - // Serialize catalog statitics information into PG bytea format and saved in aux table ptstatitics column. + // Serialize catalog statitics information into PG bytea format and saved in + // aux table ptstatitics column. values[ANUM_PG_PAX_BLOCK_TABLES_PTSTATISITICS - 1] = PointerGetDatum(output); nulls[ANUM_PG_PAX_BLOCK_TABLES_PTSTATISITICS - 1] = false; - CBDB_WRAP_START; - { - paxc::InsertTuple(relid, values, nulls); + ScanAuxContext context; + context.BeginSearchMicroPartition(aux_relid, InvalidOid, NULL, RowExclusiveLock, blockname); + auto aux_rel = context.GetRelation(); + auto oldtuple = context.SearchMicroPartitionEntry(); + if (!HeapTupleIsValid(oldtuple)) + elog(ERROR, "micro partition doesn't exist before inserting tuples"); + + if (num_tuples > 0) { + auto newtuple = heap_form_tuple(RelationGetDescr(aux_rel), values, nulls); + + newtuple->t_data->t_ctid = oldtuple->t_data->t_ctid; + newtuple->t_self = oldtuple->t_self; + newtuple->t_tableOid = oldtuple->t_tableOid; + CatalogTupleUpdate(aux_rel, &newtuple->t_self, newtuple); + heap_freetuple(newtuple); + } else { + CatalogTupleDelete(aux_rel, &oldtuple->t_self); + } + context.EndSearchMicroPartition(NoLock); + + pfree(output); + + CommandCounterIncrement(); +} + +Oid FindAuxIndexOid(Oid 
aux_relid, Snapshot snapshot) { + ScanKeyData scankey[1]; + Relation indrel; + SysScanDesc scan; + HeapTuple tuple; + Oid index_oid; + int index_count = 0; + + ScanKeyInit(&scankey[0], Anum_pg_index_indrelid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(aux_relid)); + indrel = table_open(IndexRelationId, AccessShareLock); + scan = systable_beginscan(indrel, IndexIndrelidIndexId, true, snapshot, 1, scankey); + + index_oid = InvalidOid; + while (HeapTupleIsValid(tuple = systable_getnext(scan))) { + auto index = (Form_pg_index) GETSTRUCT(tuple); + index_count++; + if (!index->indislive || !index->indisvalid) continue; + index_oid = index->indexrelid; + } + systable_endscan(scan); + table_close(indrel, NoLock); + + if (index_count != 1 || !OidIsValid(index_oid)) + elog(ERROR, "unexpected number of index of aux table: %d", index_count); + + return index_oid; +} + +static inline Oid GetAuxIndexOid(Oid aux_relid, Oid *aux_index_relid, Snapshot snapshot) { + if (aux_index_relid) { + if (OidIsValid(*aux_index_relid)) + return *aux_index_relid; + else + return *aux_index_relid = FindAuxIndexOid(aux_relid, snapshot); + } else { + return FindAuxIndexOid(aux_relid, snapshot); + } +} + +void ScanAuxContext::BeginSearchMicroPartition(Oid aux_relid, Oid aux_index_relid, Snapshot snapshot, LOCKMODE lockmode, const char *blockname) { + Assert(aux_relid); + if (!OidIsValid(aux_index_relid) && blockname) + aux_index_relid = FindAuxIndexOid(aux_relid, snapshot); + + aux_rel_ = table_open(aux_relid, lockmode); + if (blockname) { + ScanKeyData scankey[1]; + + ScanKeyInit(&scankey[0], ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKNAME, BTEqualStrategyNumber, F_NAMEEQ, CStringGetDatum(blockname)); + scan_ = systable_beginscan(aux_rel_, aux_index_relid, true, snapshot, 1, scankey); + } else { + scan_ = systable_beginscan(aux_rel_, aux_index_relid, false, snapshot, 0, nullptr); + } +} + +HeapTuple ScanAuxContext::SearchMicroPartitionEntry() { + Assert(aux_rel_ && scan_); + return 
systable_getnext(scan_); +} + +void ScanAuxContext::EndSearchMicroPartition(LOCKMODE lockmode) { + Assert(aux_rel_ && scan_); + + systable_endscan(scan_); + table_close(aux_rel_, lockmode); + scan_ = nullptr; + aux_rel_ = nullptr; +} + +void PaxAuxRelationSetNewFilenode(Oid aux_relid) { + Relation aux_rel; + Oid toastrelid; + ReindexParams reindex_params = {0}; + + aux_rel = relation_open(aux_relid, AccessExclusiveLock); + RelationSetNewRelfilenode(aux_rel, aux_rel->rd_rel->relpersistence); + toastrelid = aux_rel->rd_rel->reltoastrelid; + if (OidIsValid(toastrelid)) { + Relation toast_rel; + toast_rel = relation_open(toastrelid, AccessExclusiveLock); + RelationSetNewRelfilenode(toast_rel, toast_rel->rd_rel->relpersistence); + relation_close(toast_rel, NoLock); } + if (aux_rel->rd_rel->relhasindex) + reindex_relation(aux_relid, REINDEX_REL_PROCESS_TOAST, &reindex_params); + pgstat_count_truncate(aux_rel); + relation_close(aux_rel, NoLock); +} + +bool IsMicroPartitionVisible(Relation pax_rel, BlockNumber block, Snapshot snapshot) { + struct ScanAuxContext context; + HeapTuple tuple; + Oid aux_relid; + char block_name[NAMEDATALEN]; + bool ok; + + aux_relid = ::paxc::GetPaxAuxRelid(RelationGetRelid(pax_rel)); + snprintf(block_name, sizeof(block_name), "%u", block); + + context.BeginSearchMicroPartition(aux_relid, InvalidOid, snapshot, AccessShareLock, block_name); + tuple = context.SearchMicroPartitionEntry(); + ok = HeapTupleIsValid(tuple); + context.EndSearchMicroPartition(NoLock); + + return ok; +} + +static void CPaxCopyPaxBlockEntry(Relation old_relation, + Relation new_relation) { + HeapTuple tuple; + SysScanDesc pax_scan; + Relation old_aux_rel, new_aux_rel; + Oid old_aux_relid = 0, new_aux_relid = 0; + + old_aux_relid = ::paxc::GetPaxAuxRelid(RelationGetRelid(old_relation)); + new_aux_relid = ::paxc::GetPaxAuxRelid(RelationGetRelid(new_relation)); + old_aux_rel = table_open(old_aux_relid, RowExclusiveLock); + new_aux_rel = table_open(new_aux_relid, 
RowExclusiveLock); + + pax_scan = systable_beginscan(old_aux_rel, InvalidOid, false, NULL, 0, NULL); + while ((tuple = systable_getnext(pax_scan)) != NULL) { + CatalogTupleInsert(new_aux_rel, tuple); + } + systable_endscan(pax_scan); + table_close(old_aux_rel, RowExclusiveLock); + table_close(new_aux_rel, RowExclusiveLock); +} + +} // namespace paxc + +namespace cbdb { +Oid GetPaxAuxRelid(Oid relid) { + CBDB_WRAP_START; + { return ::paxc::GetPaxAuxRelid(relid); } CBDB_WRAP_END; +} - cbdb::Pfree(output); +void DeleteMicroPartitionEntry(Oid pax_relid, Snapshot snapshot, + const std::string &blockname) { + CBDB_WRAP_START; + { paxc::DeleteMicroPartitionEntry(pax_relid, snapshot, blockname.c_str()); } + CBDB_WRAP_END; } -static void DeletePaxBlockEntry(Oid relid, Snapshot snapshot, - const char *blockname) { +void InsertMicroPartitionPlaceHolder(Oid pax_relid, const std::string &blockname) { CBDB_WRAP_START; { - paxc::CPaxDeletePaxBlockEntry(relid, snapshot, blockname); + Oid aux_relid; + + aux_relid = ::paxc::GetPaxAuxRelid(pax_relid); + paxc::InsertMicroPartitionPlaceHolder(aux_relid, blockname.c_str()); } CBDB_WRAP_END; } +void InsertOrUpdateMicroPartitionEntry(const pax::WriteSummary &summary) { + CBDB_WRAP_START; + { + Oid aux_relid; -void DeleteMicroPartitionEntry(Oid pax_relid, - Snapshot snapshot, - const std::string &block_id) { - Oid aux_relid = GetPaxAuxRelid(pax_relid); - cbdb::DeletePaxBlockEntry(aux_relid, snapshot, - block_id.c_str()); + aux_relid = ::paxc::GetPaxAuxRelid(summary.rel_oid); + paxc::InsertOrUpdateMicroPartitionPlaceHolder(aux_relid, summary.block_id.c_str(), + summary.num_tuples, summary.file_size, summary.mp_stats); + } + CBDB_WRAP_END; } -void AddMicroPartitionEntry(const pax::WriteSummary &summary) { - Oid aux_relid; - aux_relid = GetPaxAuxRelid(summary.rel_oid); - cbdb::InsertPaxBlockEntry(aux_relid, summary.block_id.c_str(), - summary.num_tuples, summary.file_size, summary.mp_stats); +bool IsMicroPartitionVisible(Relation pax_rel, 
BlockNumber block, Snapshot snapshot) { + CBDB_WRAP_START; + { return paxc::IsMicroPartitionVisible(pax_rel, block, snapshot); } + CBDB_WRAP_END; } static void PaxTransactionalTruncateTable(Oid aux_relid) { @@ -289,31 +485,6 @@ static void PaxCopyPaxBlockEntry(Relation old_relation, Relation new_relation) { } // namespace cbdb namespace pax { -void CCPaxAuxTable::PaxAuxRelationSetNewFilenode(Relation rel, - const RelFileNode *newrnode, - char persistence) { - HeapTuple tupcache; - std::string path; - FileSystem *fs = pax::Singleton::GetInstance(); - - tupcache = cbdb::SearchSysCache(rel, PAXTABLESID); - if (cbdb::TupleIsValid(tupcache)) { - Oid aux_relid = ((Form_pg_pax_tables)GETSTRUCT(tupcache))->blocksrelid; - cbdb::PaxTransactionalTruncateTable(aux_relid); - cbdb::ReleaseTupleCache(tupcache); - } else { - // create pg_pax_blocks_ - cbdb::PaxCreateMicroPartitionTable(rel); - } - - // Create pax table relfilenode file and database directory under path base/, - // The relfilenode created here is to be compatible with PG normal process - // logic instead of being used by pax storage. - cbdb::RelationCreateStorageDirectory(*newrnode, persistence, SMGR_MD, rel); - path = cbdb::BuildPaxDirectoryPath(*newrnode, rel->rd_backend); - Assert(!path.empty()); - CBDB_CHECK((fs->CreateDirectory(path) == 0), cbdb::CException::ExType::kExTypeIOError); -} void CCPaxAuxTable::PaxAuxRelationNontransactionalTruncate(Relation rel) { cbdb::PaxNontransactionalTruncateTable(rel); @@ -337,43 +508,52 @@ void CCPaxAuxTable::PaxAuxRelationCopyData(Relation rel, src_path = cbdb::BuildPaxDirectoryPath(rel->rd_node, rel->rd_backend); Assert(!src_path.empty()); - // get micropatition file source folder filename list for copying. 
- filelist = fs->ListDirectory(src_path); - if (filelist.empty()) return; - dst_path = cbdb::BuildPaxDirectoryPath(*newrnode, rel->rd_backend); Assert(!dst_path.empty()); if (src_path.empty() || dst_path.empty()) CBDB_RAISE(cbdb::CException::ExType::kExTypeFileOperationError); - // createnewpath is used to indicate if creating destination micropartition file directory and storage file for copying or not. - // 1. For RelationCopyData case, createnewpath should be set as true to explicitly create a new destination directory under + // createnewpath is used to indicate if creating destination micropartition + // file directory and storage file for copying or not. + // 1. For RelationCopyData case, createnewpath should be set as true to + // explicitly create a new destination directory under // new tablespace path pg_tblspc/. - // 2. For RelationCopyDataForCluster case, createnewpath should be set as false cause the destination directory was already - // created with a new temp table by previously calling PaxAuxRelationSetNewFilenode. + // 2. For RelationCopyDataForCluster case, createnewpath should be set as + // false cause the destination directory was already + // created with a new temp table by previously calling + // PaxAuxRelationSetNewFilenode. if (createnewpath) { // create pg_pax_table relfilenode file and dbid directory. cbdb::RelationCreateStorageDirectory(*newrnode, rel->rd_rel->relpersistence, - SMGR_MD, rel); + SMGR_MD, rel); // create micropartition file destination folder for copying. - CBDB_CHECK((fs->CreateDirectory(dst_path) == 0), cbdb::CException::ExType::kExTypeIOError); + CBDB_CHECK((fs->CreateDirectory(dst_path) == 0), + cbdb::CException::ExType::kExTypeIOError); } + // Get micropatition file source folder filename list for copying, if file + // list is empty then skip copying file directly. 
+ filelist = fs->ListDirectory(src_path); + if (filelist.empty()) return; + for (auto &iter : filelist) { Assert(!iter.empty()); - src_path.append("/"); - src_path.append(iter); - dst_path.append("/"); - dst_path.append(iter); - fs->CopyFile(src_path, dst_path); + std::string src_file = src_path; + std::string dst_file = dst_path; + src_file.append("/"); + src_file.append(iter); + dst_file.append("/"); + dst_file.append(iter); + fs->CopyFile(src_file, dst_file); } // TODO(Tony) : here need to implement pending delete srcPath after set new // tablespace. } -void CCPaxAuxTable::PaxAuxRelationCopyDataForCluster(Relation old_rel, Relation new_rel) { +void CCPaxAuxTable::PaxAuxRelationCopyDataForCluster(Relation old_rel, + Relation new_rel) { PaxAuxRelationCopyData(old_rel, &new_rel->rd_node, false); cbdb::PaxCopyPaxBlockEntry(old_rel, new_rel); // TODO(Tony) : here need to implement PAX re-organize semantics logic. @@ -388,5 +568,5 @@ void CCPaxAuxTable::PaxAuxRelationFileUnlink(RelFileNode node, relpath = cbdb::BuildPaxDirectoryPath(node, backend); fs->DeleteDirectory(relpath, delete_topleveldir); } -} // namespace pax +} // namespace pax diff --git a/contrib/pax_storage/src/cpp/catalog/pax_aux_table.h b/contrib/pax_storage/src/cpp/catalog/pax_aux_table.h index 7d79f36f863..3d39ef67e4d 100644 --- a/contrib/pax_storage/src/cpp/catalog/pax_aux_table.h +++ b/contrib/pax_storage/src/cpp/catalog/pax_aux_table.h @@ -1,10 +1,9 @@ #pragma once -#include "catalog/pax_aux_table.h" - #include "comm/cbdb_api.h" #include +#include "catalog/pax_aux_table.h" #include "storage/micro_partition_metadata.h" #define ANUM_PG_PAX_BLOCK_TABLES_PTBLOCKNAME 1 @@ -13,6 +12,37 @@ #define ANUM_PG_PAX_BLOCK_TABLES_PTSTATISITICS 4 #define NATTS_PG_PAX_BLOCK_TABLES 4 +namespace paxc { +void CPaxCreateMicroPartitionTable(Relation rel); + +Oid FindAuxIndexOid(Oid aux_relid, Snapshot snapshot); + +void InsertMicroPartitionPlaceHolder(Oid aux_relid, const char *blockname); +void 
DeleteMicroPartitionEntry(Oid pax_relid, Snapshot snapshot, const char *blockname); +// Scan aux table +// seqscan: MicroPartitionInfoIterator +// index scan +struct ScanAuxContext { + public: + void BeginSearchMicroPartition(Oid aux_relid, Oid aux_index_relid, + Snapshot snapshot, LOCKMODE lockmode, const char *blockname); + void BeginSearchMicroPartition(Oid aux_relid, Snapshot snapshot, LOCKMODE lockmode) { + BeginSearchMicroPartition(aux_relid, InvalidOid, snapshot, lockmode, nullptr); + } + HeapTuple SearchMicroPartitionEntry(); + void EndSearchMicroPartition(LOCKMODE lockmode); + + Relation GetRelation() { return aux_rel_; } + + private: + Relation aux_rel_ = nullptr; + SysScanDesc scan_ = nullptr; +}; + +void PaxAuxRelationSetNewFilenode(Oid aux_relid); +bool IsMicroPartitionVisible(Relation pax_rel, BlockNumber block, Snapshot snapshot); +} + namespace pax { class CCPaxAuxTable final { public: @@ -25,26 +55,27 @@ class CCPaxAuxTable final { static void PaxAuxRelationNontransactionalTruncate(Relation rel); - static void PaxAuxRelationCopyData(Relation rel, - const RelFileNode *newrnode, + static void PaxAuxRelationCopyData(Relation rel, const RelFileNode *newrnode, bool createnewpath = true); - static void PaxAuxRelationCopyDataForCluster(Relation old_rel, Relation new_rel); + static void PaxAuxRelationCopyDataForCluster(Relation old_rel, + Relation new_rel); static void PaxAuxRelationFileUnlink(RelFileNode node, BackendId backend, bool delete_topleveldir); }; + } // namespace pax namespace cbdb { Oid GetPaxAuxRelid(Oid relid); -void AddMicroPartitionEntry(const pax::WriteSummary &summary); +void InsertMicroPartitionPlaceHolder(Oid pax_relid, const std::string &blockname); +void InsertOrUpdateMicroPartitionEntry(const pax::WriteSummary &summary); -void DeleteMicroPartitionEntry(Oid pax_relid, - Snapshot snapshot, - const std::string &block_id); +void DeleteMicroPartitionEntry(Oid pax_relid, Snapshot snapshot, + const std::string &blockname); +bool 
IsMicroPartitionVisible(Relation pax_rel, BlockNumber block, Snapshot snapshot); } // namespace cbdb - diff --git a/contrib/pax_storage/src/cpp/catalog/pax_fastsequence.cc b/contrib/pax_storage/src/cpp/catalog/pax_fastsequence.cc new file mode 100644 index 00000000000..a78cdf9c235 --- /dev/null +++ b/contrib/pax_storage/src/cpp/catalog/pax_fastsequence.cc @@ -0,0 +1,175 @@ +#include "catalog/pax_fastsequence.h" + +#include "comm/cbdb_api.h" + +namespace paxc { + +// Get the required objid Tuple from pg_pax_fastsequence system table. +// objid indicates single pax micro-partition table oid. +// lock_mode indicates the lock level used when retrive data from system table. +static HeapTuple CPaxOpenFastSequenceTable(Oid objid, + Relation *pax_fastsequence_rel, + SysScanDesc *pax_fastsequece_scan, + LOCKMODE lock_mode) { + ScanKeyData scankey[1]; + HeapTuple tuple; + Relation rel; + SysScanDesc scan; + + rel = table_open(PAX_FASTSEQUENCE_OID, lock_mode); + + /* SELECT * FROM paxaux.pg_pax_fastsequence WHERE objid = :1 FOR UPDATE */ + ScanKeyInit(&scankey[0], ANUM_PG_PAX_FAST_SEQUENCE_OBJID, + BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(objid)); + + scan = systable_beginscan(rel, PAX_FASTSEQUENCE_INDEX_OID, true, NULL, 1, + scankey); + + tuple = systable_getnext(scan); + + *pax_fastsequence_rel = rel; + *pax_fastsequece_scan = scan; + + return tuple; +} + +static inline void CPaxCloseFastSequenceTable(Relation pax_fastsequence_rel, + SysScanDesc pax_fastsequece_scan, + LOCKMODE lock_mode) { + systable_endscan(pax_fastsequece_scan); + table_close(pax_fastsequence_rel, lock_mode); +} + +// update the existing fast sequence number for (objid). +// This tuple is updated with the new value. Otherwise, a new tuple is inserted +// into the table. 
+static void CPaxUpdateFastsequence(Relation pax_fastsequence_rel, + HeapTuple old_tuple, TupleDesc tuple_desc, + Oid objid, int32 new_seqno) { + HeapTuple new_tuple; + Datum values[NATTS_PG_PAX_FAST_SEQUENCE_TABLES]; + bool nulls[NATTS_PG_PAX_FAST_SEQUENCE_TABLES]; + + // If such a tuple does not exist, insert a new one. + Assert(HeapTupleIsValid(old_tuple)); + + values[ANUM_PG_PAX_FAST_SEQUENCE_OBJID - 1] = ObjectIdGetDatum(objid); + values[ANUM_PG_PAX_FAST_SEQUENCE_LASTSEQUENCE - 1] = + Int32GetDatum(new_seqno); + nulls[ANUM_PG_PAX_FAST_SEQUENCE_OBJID - 1] = false; + nulls[ANUM_PG_PAX_FAST_SEQUENCE_LASTSEQUENCE - 1] = false; + + new_tuple = heap_form_tuple(tuple_desc, values, nulls); + Assert(HeapTupleIsValid(new_tuple)); + + new_tuple->t_data->t_ctid = old_tuple->t_data->t_ctid; + new_tuple->t_self = old_tuple->t_self; + + heap_inplace_update(pax_fastsequence_rel, new_tuple); + heap_freetuple(new_tuple); +} + +// InitializeFastSequenceEntry is used to generate and keep track of allocated +// micropartition file number. objid indicates single pax micro-partition table +// oid. lastsequence indicates the current allocated file number by using +// fastsequence allocation. +void CPaxInitializeFastSequenceEntry(Oid objid, char init_type) { + Relation pax_fastsequence_rel; + SysScanDesc scan; + TupleDesc desc; + HeapTuple tuple; + HeapTuple new_tuple; + Datum values[NATTS_PG_PAX_FAST_SEQUENCE_TABLES]; + bool nulls[NATTS_PG_PAX_FAST_SEQUENCE_TABLES]; + + Assert(init_type == FASTSEQUENCE_INIT_TYPE_CREATE || + init_type == FASTSEQUENCE_INIT_TYPE_INPLACE || + init_type == FASTSEQUENCE_INIT_TYPE_UPDATE); + // Initilize a new object id and use row-based exclusive lock to avoid + // concurrency issue. 
+ tuple = CPaxOpenFastSequenceTable(objid, &pax_fastsequence_rel, &scan, + RowExclusiveLock); + + desc = RelationGetDescr(pax_fastsequence_rel); + values[ANUM_PG_PAX_FAST_SEQUENCE_OBJID - 1] = ObjectIdGetDatum(objid); + values[ANUM_PG_PAX_FAST_SEQUENCE_LASTSEQUENCE - 1] = Int32GetDatum(0); + nulls[ANUM_PG_PAX_FAST_SEQUENCE_OBJID - 1] = false; + nulls[ANUM_PG_PAX_FAST_SEQUENCE_LASTSEQUENCE - 1] = false; + new_tuple = heap_form_tuple(desc, values, nulls); + + if (init_type == FASTSEQUENCE_INIT_TYPE_CREATE) { + ObjectAddress base; + ObjectAddress aux; + + if (HeapTupleIsValid(tuple)) + elog(ERROR, "existing tuple in pg_pax_fastsequence when creating pax table"); + + CatalogTupleInsert(pax_fastsequence_rel, new_tuple); + + base.classId = RelationRelationId; + base.objectId = objid; + base.objectSubId = 0; + aux.classId = PAX_FASTSEQUENCE_OID; + aux.objectId = objid; + aux.objectSubId = 0; + recordDependencyOn(&aux, &base, DEPENDENCY_INTERNAL); + } else { + // exists, set to 0 in-place, or update + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "no tuple found in pg_pax_fastsequence for existing pax table"); + + new_tuple->t_data->t_ctid = tuple->t_data->t_ctid; + new_tuple->t_self = tuple->t_self; + if (init_type == FASTSEQUENCE_INIT_TYPE_INPLACE) + heap_inplace_update(pax_fastsequence_rel, new_tuple); + else if (init_type == FASTSEQUENCE_INIT_TYPE_UPDATE) + CatalogTupleUpdate(pax_fastsequence_rel, &new_tuple->t_self, new_tuple); + } + + heap_freetuple(new_tuple); + CPaxCloseFastSequenceTable(pax_fastsequence_rel, scan, RowExclusiveLock); +} + +// GetFastSequences +// Get consecutive sequence numbers, the returned sequence number is the +// lastsequence + 1 +int32 CPaxGetFastSequences(Oid objid) { + Relation pax_fastsequence_rel = NULL; + SysScanDesc scan = NULL; + TupleDesc tuple_desc; + HeapTuple tuple; + Datum seqno_datum; + int32 seqno; + bool isnull = false; + + // Increase and read sequence number base on objid and use row-based exclusive + // lock to avoid 
concurrency issue. + tuple = CPaxOpenFastSequenceTable(objid, &pax_fastsequence_rel, &scan, + RowExclusiveLock); + + Assert(HeapTupleIsValid(tuple)); + + tuple_desc = RelationGetDescr(pax_fastsequence_rel); + + seqno_datum = heap_getattr(tuple, ANUM_PG_PAX_FAST_SEQUENCE_LASTSEQUENCE, + tuple_desc, &isnull); + if (isnull) { + ereport( + ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg( + "CPaxGetFastSequences got an invalid lastsequence number: NULL"))); + } + seqno = DatumGetInt32(seqno_datum); + if (seqno < 0) + elog(ERROR, "sequence number out of range: %d", seqno); + + CPaxUpdateFastsequence(pax_fastsequence_rel, tuple, tuple_desc, objid, + seqno + 1); + + CPaxCloseFastSequenceTable(pax_fastsequence_rel, scan, RowExclusiveLock); + + return seqno; +} + +} // namespace paxc diff --git a/contrib/pax_storage/src/cpp/catalog/pax_fastsequence.h b/contrib/pax_storage/src/cpp/catalog/pax_fastsequence.h new file mode 100644 index 00000000000..b5269dfa3df --- /dev/null +++ b/contrib/pax_storage/src/cpp/catalog/pax_fastsequence.h @@ -0,0 +1,32 @@ +//------------------------------------------------------------------------- +// Cloudberry Database +// Copyright (c) 2023, HashData Technology Limited. +// pax_fastsequence.h +// provide a system table maintaining a light-weight fast sequence number for a +// unique object. 
+// +// IDENTIFICATION +// src/catalog/pax_fastsequence.h +// Author: Tony Ying +//-------------------------------------------------------------------------- + +#pragma once +#include "comm/cbdb_api.h" + +#define ANUM_PG_PAX_FAST_SEQUENCE_OBJID 1 +#define ANUM_PG_PAX_FAST_SEQUENCE_LASTSEQUENCE 2 +#define NATTS_PG_PAX_FAST_SEQUENCE_TABLES 2 + +// CREATE: initialize seqno by INSERT, no tuple exists before +// INPLACE: inplace update when grow the seqno or non-transactional truncate +// UPDATE: transactional truncate, needs to preserve the old seqno +// after rollback +#define FASTSEQUENCE_INIT_TYPE_CREATE 'C' +#define FASTSEQUENCE_INIT_TYPE_INPLACE 'I' +#define FASTSEQUENCE_INIT_TYPE_UPDATE 'U' + +namespace paxc { +void CPaxInitializeFastSequenceEntry(Oid objid, char init_type); +int32 CPaxGetFastSequences(Oid objid); + +} // namespace paxc diff --git a/contrib/pax_storage/src/cpp/catalog/pg_pax_tables.cc b/contrib/pax_storage/src/cpp/catalog/pg_pax_tables.cc new file mode 100644 index 00000000000..c3d7b98f5c9 --- /dev/null +++ b/contrib/pax_storage/src/cpp/catalog/pg_pax_tables.cc @@ -0,0 +1,131 @@ +#include "catalog/pg_pax_tables.h" + +#include "comm/cbdb_api.h" + +namespace paxc { + +void InsertPaxTablesEntry(Oid relid, Oid blocksrelid, Node *partitionspec) { + Relation rel; + TupleDesc desc; + HeapTuple tuple; + bool nulls[NATTS_PG_PAX_TABLES]; + Datum values[NATTS_PG_PAX_TABLES]; + + rel = table_open(PAX_TABLES_RELATION_ID, RowExclusiveLock); + desc = RelationGetDescr(rel); + Assert(desc->natts == NATTS_PG_PAX_TABLES); + + values[ANUM_PG_PAX_TABLES_RELID - 1] = ObjectIdGetDatum(relid); + values[ANUM_PG_PAX_TABLES_AUXRELID - 1] = ObjectIdGetDatum(blocksrelid); + nulls[ANUM_PG_PAX_TABLES_RELID - 1] = false; + nulls[ANUM_PG_PAX_TABLES_AUXRELID - 1] = false; + + if (partitionspec) { + values[ANUM_PG_PAX_TABLES_PARTITIONSPEC - 1] = + CStringGetTextDatum(nodeToString(partitionspec)); + nulls[ANUM_PG_PAX_TABLES_PARTITIONSPEC - 1] = false; + } else { + 
values[ANUM_PG_PAX_TABLES_PARTITIONSPEC - 1] = 0; + nulls[ANUM_PG_PAX_TABLES_PARTITIONSPEC - 1] = true; + } + tuple = heap_form_tuple(desc, values, nulls); + + /* insert a new tuple */ + CatalogTupleInsert(rel, tuple); + + table_close(rel, NoLock); +} + +void GetPaxTablesEntryAttributes(Oid relid, Oid *blocksrelid, + Node **partitionspec) { + Relation rel; + ScanKeyData key[1]; + SysScanDesc scan; + HeapTuple tuple; + bool isnull; + + rel = table_open(PAX_TABLES_RELATION_ID, RowExclusiveLock); + + ScanKeyInit(&key[0], ANUM_PG_PAX_TABLES_RELID, BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + + scan = systable_beginscan(rel, PAX_TABLES_RELID_INDEX_ID, true, NULL, 1, key); + tuple = systable_getnext(scan); + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("pax table relid \"%d\" does not exist in " + "pg_pax_tables", + relid))); + + if (partitionspec) { + Datum v; + v = heap_getattr(tuple, ANUM_PG_PAX_TABLES_PARTITIONSPEC, + RelationGetDescr(rel), &isnull); + *partitionspec = NULL; + if (!isnull) { + char *str = TextDatumGetCString(v); + *partitionspec = (Node *)stringToNode(str); + pfree(str); + } + } + + if (blocksrelid) { + *blocksrelid = heap_getattr(tuple, ANUM_PG_PAX_TABLES_AUXRELID, + RelationGetDescr(rel), &isnull); + if (isnull) ereport(ERROR, (errmsg("pg_pax_tables.auxrelid is null"))); + } + + /* Finish up scan and close pg_pax_tables catalog. 
*/ + systable_endscan(scan); + table_close(rel, NoLock); +} + +void PaxInitializePartitionSpec(Relation paxrel, Node *part) { + Relation rel; + ScanKeyData key[1]; + SysScanDesc scan; + HeapTuple oldtuple; + TupleDesc desc; + bool isnull; + + Assert(paxrel->rd_rel->relkind == RELKIND_RELATION || + paxrel->rd_rel->relkind == RELKIND_MATVIEW); + Assert(paxrel->rd_options); + + rel = table_open(PAX_TABLES_RELATION_ID, RowExclusiveLock); + desc = RelationGetDescr(rel); + ScanKeyInit(&key[0], ANUM_PG_PAX_TABLES_RELID, BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(paxrel))); + + scan = systable_beginscan(rel, PAX_TABLES_RELID_INDEX_ID, true, NULL, 1, key); + oldtuple = systable_getnext(scan); + if (!HeapTupleIsValid(oldtuple)) elog(ERROR, "only support pax tables"); + + (void)heap_getattr(oldtuple, ANUM_PG_PAX_TABLES_PARTITIONSPEC, desc, &isnull); + if (isnull) { + HeapTuple newtup; + Datum values[NATTS_PG_PAX_TABLES]; + bool repl[NATTS_PG_PAX_TABLES]; + bool isnull[NATTS_PG_PAX_TABLES]; + + memset(repl, false, sizeof(repl)); + values[ANUM_PG_PAX_TABLES_PARTITIONSPEC - 1] = + CStringGetTextDatum(nodeToString(part)); + repl[ANUM_PG_PAX_TABLES_PARTITIONSPEC - 1] = true; + isnull[ANUM_PG_PAX_TABLES_PARTITIONSPEC - 1] = false; + + newtup = heap_modify_tuple(oldtuple, desc, values, isnull, repl); + CatalogTupleUpdate(rel, &oldtuple->t_self, newtup); + heap_freetuple(newtup); + + CommandCounterIncrement(); + } else { + elog(ERROR, "existing pax table update partition spec?"); + } + + /* Finish up scan and close pg_pax_tables catalog. 
*/ + systable_endscan(scan); + table_close(rel, NoLock); +} + +} // namespace paxc diff --git a/contrib/pax_storage/src/cpp/catalog/pg_pax_tables.h b/contrib/pax_storage/src/cpp/catalog/pg_pax_tables.h new file mode 100644 index 00000000000..f4d138b0a4b --- /dev/null +++ b/contrib/pax_storage/src/cpp/catalog/pg_pax_tables.h @@ -0,0 +1,24 @@ +#pragma once +#include "comm/cbdb_api.h" + +#define NATTS_PG_PAX_TABLES 3 +#define ANUM_PG_PAX_TABLES_RELID 1 +#define ANUM_PG_PAX_TABLES_AUXRELID 2 +#define ANUM_PG_PAX_TABLES_PARTITIONSPEC 3 + +namespace paxc { + +void InsertPaxTablesEntry(Oid relid, Oid blocksrelid, Node *partitionspec); + +void GetPaxTablesEntryAttributes(Oid relid, Oid *blocksrelid, + Node **partitionspec); + +void PaxInitializePartitionSpec(Relation paxrel, Node *part); + +static inline Oid GetPaxAuxRelid(Oid pax_relid) { + Oid aux_relid; + GetPaxTablesEntryAttributes(pax_relid, &aux_relid, nullptr); + return aux_relid; +} + +} // namespace paxc diff --git a/contrib/pax_storage/src/cpp/cmake/pax.cmake b/contrib/pax_storage/src/cpp/cmake/pax.cmake new file mode 100644 index 00000000000..6c7ef92b60c --- /dev/null +++ b/contrib/pax_storage/src/cpp/cmake/pax.cmake @@ -0,0 +1,187 @@ + +## generate_sql +add_executable(generate_sql_script_program "${CMAKE_CURRENT_SOURCE_DIR}/../../tools/gen_sql.c") +target_include_directories(generate_sql_script_program PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CBDB_INCLUDE_DIR}) +add_custom_command(OUTPUT generate_sql_file + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/generate_sql_script_program > "${CMAKE_CURRENT_SOURCE_DIR}/../../pax-cdbinit--1.0.sql" + DEPENDS generate_sql_script_program + COMMENT "dynamically generate sql script file" +) +add_custom_target(create_sql_script DEPENDS generate_sql_script_program generate_sql_file) + +# bison +bison_target(paxc_gram access/paxc_gram.y ${CMAKE_CURRENT_BINARY_DIR}/paxc_gram.c) + + +set(pax_comm_src + comm/bitmap.cc + comm/guc.cc + comm/paxc_wrappers.cc + comm/pax_memory.cc + 
comm/cbdb_wrappers.cc) + +set(pax_exceptions_src + exceptions/CException.cc) + +set(pax_storage_src + storage/cache/pax_cache.cc + storage/cache/pax_plasma_cache.cc + storage/columns/pax_column_cache.cc + storage/columns/pax_column_traits.cc + storage/columns/pax_column.cc + storage/columns/pax_compress.cc + storage/columns/pax_columns.cc + storage/columns/pax_encoding_utils.cc + storage/columns/pax_encoding_non_fixed_column.cc + storage/columns/pax_encoding_column.cc + storage/columns/pax_decoding.cc + storage/columns/pax_encoding.cc + storage/columns/pax_rlev2_decoding.cc + storage/columns/pax_rlev2_encoding.cc + storage/columns/pax_vec_column.cc + storage/columns/pax_vec_encoding_column.cc + storage/oper/pax_oper.cc + storage/oper/pax_stats.cc + storage/file_system.cc + storage/local_file_system.cc + storage/micro_partition.cc + storage/micro_partition_file_factory.cc + storage/micro_partition_metadata.cc + storage/micro_partition_row_filter_reader.cc + storage/micro_partition_stats.cc + storage/orc/orc_format_reader.cc + storage/orc/orc_group.cc + storage/orc/orc_vec_group.cc + storage/orc/orc_reader.cc + storage/orc/orc_writer.cc + storage/pax_buffer.cc + storage/pax_filter.cc + storage/pax_itemptr.cc + storage/proto/protobuf_stream.cc + storage/pax.cc + storage/pax_table_partition_writer.cc + storage/strategy.cc + storage/micro_partition_iterator.cc + ) + + +set(pax_access_src + ${BISON_paxc_gram_OUTPUTS} # BISON output file + access/paxc_rel_options.cc + access/paxc_scanner.cc + access/pax_access_handle.cc + access/pax_deleter.cc + access/pax_dml_state.cc + access/pax_inserter.cc + access/pax_partition.cc + access/pax_updater.cc + access/pax_scanner.cc) + +set(pax_catalog_src + catalog/pax_aux_table.cc + catalog/pg_pax_tables.cc + catalog/pax_fastsequence.cc + ) + +set(pax_vec_src + storage/vec/pax_vec_adapter.cc + storage/vec/pax_vec_reader.cc) + + +#### pax.so +set(pax_target_src ${PROTO_SRCS} ${pax_storage_src} ${pax_exceptions_src} + ${pax_access_src} 
${pax_comm_src} ${pax_catalog_src} ${pax_vec_src}) +set(pax_target_include ${ZTSD_HEADER} ${CMAKE_CURRENT_SOURCE_DIR} ${CBDB_INCLUDE_DIR}) +set(pax_target_link_libs protobuf zstd z postgres) +set(pax_target_link_directories ${PROJECT_SOURCE_DIR}/../../src/backend/) +set(pax_target_dependencies generate_protobuf create_sql_script) + +# enable plasma +if (ENABLE_PLASMA) + set(pax_target_link_libs ${pax_target_link_libs} uuid plasma) +endif() + +add_library(pax SHARED ${pax_target_src}) +set_target_properties(pax PROPERTIES OUTPUT_NAME pax) + +# vec build +if (VEC_BUILD) + find_package(PkgConfig REQUIRED) + pkg_check_modules(GLIB REQUIRED glib-2.0) + set(pax_target_include + ${pax_target_include} + ${VEC_HOME}/src/include # for utils/tuptable_vec.h + ${VEC_HOME}/arrow/include # for arrow-glib/arrow-glib.h and otehr arrow interface + ${GLIB_INCLUDE_DIRS} # for glib-object.h + ) + set(pax_target_link_directories + ${pax_target_link_directories} + ${VEC_HOME}/arrow/lib) + set(pax_target_link_libs + ${pax_target_link_libs} + arrow) +endif(VEC_BUILD) + +target_include_directories(pax PUBLIC ${pax_target_include}) +target_link_directories(pax PUBLIC ${pax_target_link_directories}) +target_link_libraries(pax PUBLIC ${pax_target_link_libs}) +set_target_properties(pax PROPERTIES + BUILD_RPATH_USE_ORIGIN ON + BUILD_WITH_INSTALL_RPATH ON + INSTALL_RPATH "$ORIGIN:$ORIGIN/.." 
+ LINK_FLAGS "-Wl,--enable-new-dtags" +) + +add_dependencies(pax ${pax_target_dependencies}) +add_custom_command(TARGET pax POST_BUILD + COMMAND ${CMAKE_COMMAND} -E + copy_if_different $ ${CMAKE_CURRENT_SOURCE_DIR}/../../pax.so) + +if (BUILD_GTEST) + add_subdirectory(contrib/googletest) + ADD_DEFINITIONS(-DRUN_GTEST) + file(GLOB test_case_sources + pax_gtest_helper.cc + pax_gtest.cc + ${CMAKE_CURRENT_SOURCE_DIR}/*/*_test.cc + ${CMAKE_CURRENT_SOURCE_DIR}/*/*/*_test.cc) + + add_executable(test_main ${pax_target_src} ${test_case_sources}) + add_dependencies(test_main ${pax_target_dependencies} gtest gmock) + target_include_directories(test_main PUBLIC ${pax_target_include} ${CMAKE_CURRENT_SOURCE_DIR} ${gtest_SOURCE_DIR}/include contrib/cpp-stub/src/ contrib/cpp-stub/src_linux/) + + target_link_directories(test_main PUBLIC ${pax_target_link_directories}) + target_link_libraries(test_main PUBLIC ${pax_target_link_libs} gtest gmock postgres) +endif(BUILD_GTEST) + +if(BUILD_GBENCH) + add_subdirectory(contrib/googlebench) + ADD_DEFINITIONS(-DRUN_GBENCH) + file(GLOB bench_sources + pax_gbench.cc + ${CMAKE_CURRENT_SOURCE_DIR}/*/*_bench.cc + ${CMAKE_CURRENT_SOURCE_DIR}/*/*/*_bench.cc) + + add_executable(bench_main ${pax_target_src} ${bench_sources}) + add_dependencies(bench_main ${pax_target_dependencies} gtest gmock) + target_include_directories(bench_main PUBLIC ${pax_target_include} ${CMAKE_CURRENT_SOURCE_DIR} contrib/googlebench/include contrib/cpp-stub/src/ contrib/cpp-stub/src_linux/) + link_directories(contrib/googlebench/src) + target_link_libraries(bench_main PUBLIC ${pax_target_link_libs} gtest gmock benchmark postgres) + if (VEC_BUILD) + target_link_libraries(bench_main PRIVATE arrow) + endif(VEC_BUILD) +endif(BUILD_GBENCH) + +if (BUILD_TOOLS) + add_subdirectory(contrib/tabulate) + link_directories($ENV{GPHOME}/lib) + + add_executable(pax_dump storage/tools/pax_dump.cpp storage/tools/pax_dump_reader.cpp) + target_include_directories(pax_dump PUBLIC 
${pax_target_include} ${CMAKE_CURRENT_SOURCE_DIR} contrib/tabulate/include) + add_dependencies(pax_dump ${pax_target_dependencies}) + target_link_libraries(pax_dump PUBLIC pax protobuf) +endif(BUILD_TOOLS) + +## install dynamic libraray +install(TARGETS pax + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) \ No newline at end of file diff --git a/contrib/pax_storage/src/cpp/cmake/pax_format.cmake b/contrib/pax_storage/src/cpp/cmake/pax_format.cmake new file mode 100644 index 00000000000..55206384e91 --- /dev/null +++ b/contrib/pax_storage/src/cpp/cmake/pax_format.cmake @@ -0,0 +1,93 @@ +# paxformat.so + +set(pax_comm_src + comm/bitmap.cc + comm/guc.cc + comm/paxc_wrappers.cc + comm/pax_memory.cc + comm/cbdb_wrappers.cc) + +set(pax_exceptions_src + exceptions/CException.cc) + +set(pax_storage_src + storage/cache/pax_cache.cc + storage/cache/pax_plasma_cache.cc + storage/columns/pax_column_cache.cc + storage/columns/pax_column_traits.cc + storage/columns/pax_column.cc + storage/columns/pax_compress.cc + storage/columns/pax_columns.cc + storage/columns/pax_encoding_utils.cc + storage/columns/pax_encoding_non_fixed_column.cc + storage/columns/pax_encoding_column.cc + storage/columns/pax_decoding.cc + storage/columns/pax_encoding.cc + storage/columns/pax_rlev2_decoding.cc + storage/columns/pax_rlev2_encoding.cc + storage/columns/pax_vec_column.cc + storage/columns/pax_vec_encoding_column.cc + storage/oper/pax_oper.cc + storage/oper/pax_stats.cc + storage/file_system.cc + storage/local_file_system.cc + storage/micro_partition.cc + storage/micro_partition_file_factory.cc + storage/micro_partition_metadata.cc + storage/micro_partition_row_filter_reader.cc + storage/micro_partition_stats.cc + storage/orc/orc_format_reader.cc + storage/orc/orc_group.cc + storage/orc/orc_vec_group.cc + storage/orc/orc_reader.cc + storage/orc/orc_writer.cc + storage/pax_buffer.cc + storage/pax_filter.cc + storage/proto/protobuf_stream.cc + ) + +add_library(paxformat SHARED ${PROTO_SRCS} 
${pax_storage_src} ${pax_exceptions_src} ${pax_comm_src} ) +target_compile_definitions(paxformat PRIVATE BUILD_PAX_FORMAT) +target_include_directories(paxformat PUBLIC ${ZTSD_HEADER} ${CMAKE_CURRENT_SOURCE_DIR} ${CBDB_INCLUDE_DIR}) +target_link_libraries(paxformat PUBLIC uuid protobuf zstd z) +set_target_properties(paxformat PROPERTIES + OUTPUT_NAME paxformat) +add_dependencies(paxformat generate_protobuf) + +# export headers +set(PAX_COMM_HEADERS + comm/bitmap.h + comm/cbdb_api.h + comm/log.h + comm/cbdb_wrappers.h + comm/pax_rel.h + comm/pax_memory.h + comm/guc.h +) + +set(PAX_EXCEPTION_HEADERS + exceptions/CException.h +) + +# TODO(gongxun): +# We should explicitly specify the headers +# that need to be exported, and use the syntax of +# install(FILES,...) to install the header files +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/storage + DESTINATION ${CMAKE_INSTALL_PREFIX}/include/pax + FILES_MATCHING + PATTERN "*.h" +) + +install(FILES ${PAX_COMM_HEADERS} + DESTINATION ${CMAKE_INSTALL_PREFIX}/include/pax/comm +) + +install(FILES ${PAX_EXCEPTION_HEADERS} + DESTINATION ${CMAKE_INSTALL_PREFIX}/include/pax/exceptions +) + +## install dynamic libraray +install(TARGETS paxformat + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) + diff --git a/contrib/pax_storage/src/cpp/comm/bitmap.cc b/contrib/pax_storage/src/cpp/comm/bitmap.cc index 7009323445f..eeb2b404cfc 100644 --- a/contrib/pax_storage/src/cpp/comm/bitmap.cc +++ b/contrib/pax_storage/src/cpp/comm/bitmap.cc @@ -1,150 +1,23 @@ #include "comm/bitmap.h" -#include "exceptions/CException.h" - namespace pax { +const uint8 kNumBits[] = { + 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, 
+ 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8, +}; -DynamicBitmap::DynamicBitmap() { bitmap_.resize(1024); } -DynamicBitmap::DynamicBitmap(uint32 size) { bitmap_.resize(size); } - -DynamicBitmap::~DynamicBitmap() { bitmap_.clear(); } - -void DynamicBitmap::Set(uint32 index) { - CBDB_CHECK(index >= 0 && index < bitmap_.size(), - cbdb::CException::ExType::kExTypeOutOfRange); - bitmap_[index] = true; -} - -bool DynamicBitmap::Test(uint32 index) const { - CBDB_CHECK(index >= 0 && index < bitmap_.size(), - cbdb::CException::ExType::kExTypeOutOfRange); - return bitmap_[index]; -} - -void DynamicBitmap::Clear(uint32 index) { - CBDB_CHECK(index >= 0 && index < bitmap_.size(), - cbdb::CException::ExType::kExTypeOutOfRange); - bitmap_[index] = false; -} - -void DynamicBitmap::Reset() { bitmap_.clear(); } - -void DynamicBitmap::Resize(int size) { bitmap_.resize(size); } - -// TODO(gongxun): need to do optimization for this function -bool DynamicBitmap::BitmapFindFirst(uint32 offset, bool value, - uint32 *idx) const { - auto it = std::find(bitmap_.begin() + offset, bitmap_.end(), value); - if (it == bitmap_.end()) { - return false; - } - *idx = it - bitmap_.begin(); - return true; -} - -uint32 DynamicBitmap::NumBits() const { return bitmap_.size(); } - -FixedBitmap::FixedBitmap(uint32 size) { - byte_size_ = (size >> 3) + (size & 7 ? 
1 : 0); - bitmap_ = new uint8[byte_size_]; - - num_bits_ = size; - memset(bitmap_, 0, byte_size_); -} - -FixedBitmap::~FixedBitmap() { delete[] bitmap_; } - -void FixedBitmap::Set(uint32 index) { - CBDB_CHECK(index >= 0 && index < num_bits_, - cbdb::CException::ExType::kExTypeOutOfRange); - bitmap_[index >> 3] |= 1 << (index & 7); -} - -bool FixedBitmap::Test(uint32 index) const { - CBDB_CHECK(index >= 0 && index < num_bits_, - cbdb::CException::ExType::kExTypeOutOfRange); - return bitmap_[index >> 3] & (1 << (index & 7)); } - -void FixedBitmap::Reset() { std::memset(bitmap_, 0, byte_size_); } - -void FixedBitmap::Clear(uint32 index) { - CBDB_CHECK(index >= 0 && index < num_bits_, - cbdb::CException::ExType::kExTypeOutOfRange); - bitmap_[index >> 3] &= ~(1 << (index & 7)); -} - -uint32 FixedBitmap::Size() const { return byte_size_; } -uint32 FixedBitmap::NumBits() const { return num_bits_; } -bool FixedBitmap::BitmapFindFirst(uint32 offset, bool value, - uint32 *idx) const { - const uint64 pattern64[2] = {0xffffffffffffffff, 0x0000000000000000}; - const uint8 pattern8[2] = {0xff, 0x00}; - uint32 bit; - - if (offset >= num_bits_) { - return false; - } - - // Jump to the byte at specified offset - const uint8 *p = bitmap_ + (offset >> 3); - uint32 num_bits = num_bits_ - offset; - - // Find a 'value' bit at the end of the first byte - if ((bit = offset & 0x7)) { - for (; bit < 8 && num_bits > 0; ++bit) { - if (Test(((p - bitmap_) << 3) + bit) == value) { - *idx = ((p - bitmap_) << 3) + bit; - return true; - } - - num_bits--; - } - p++; - } - - // check 64bit at the time for a 'value' bit - const uint64 *u64 = (const uint64 *)p; - while (num_bits >= 64 && *u64 == pattern64[value]) { - num_bits -= 64; - u64++; - } - - // check 8bit at the time for a 'value' bit - p = (const uint8 *)u64; - while (num_bits >= 8 && *p == pattern8[value]) { - num_bits -= 8; - p++; - } - - // Find a 'value' bit at the beginning of the last byte - for (bit = 0; num_bits > 0; ++bit) { - if 
(Test(((p - bitmap_) << 3) + bit) == value) { - *idx = ((p - bitmap_) << 3) + bit; - return true; - } - num_bits--; - } - - return false; -} - -BitmapIterator::BitmapIterator(Bitmap *map) : offset_(0), bitmap_(map) {} - -void BitmapIterator::SeekTo(size_t bit) { - Assert(bit < bitmap_->NumBits()); - offset_ = bit; -} - -int32 BitmapIterator::Next(bool value) { - int32 len = bitmap_->NumBits() - offset_; - if (len <= 0) return -1; - uint32 index; - if (bitmap_->BitmapFindFirst(offset_, value, &index)) { - offset_ = index + 1; - return index; - } - return -1; -} - -} // namespace pax diff --git a/contrib/pax_storage/src/cpp/comm/bitmap.h b/contrib/pax_storage/src/cpp/comm/bitmap.h index daa5819a3a1..0d503be48f2 100644 --- a/contrib/pax_storage/src/cpp/comm/bitmap.h +++ b/contrib/pax_storage/src/cpp/comm/bitmap.h @@ -4,97 +4,261 @@ #include -#include #include -#include -#include -#include -#include -#include -namespace pax { +#include "comm/pax_memory.h" +#include "exceptions/CException.h" -class Bitmap { - public: - virtual ~Bitmap() {} - virtual void Set(uint32 index) = 0; - virtual bool Test(uint32 index) const = 0; - virtual void Clear(uint32 index) = 0; - virtual void Reset() = 0; - virtual bool BitmapFindFirst(uint32 offset, bool value, - uint32 *idx) const = 0; - virtual uint32 NumBits() const = 0; -}; - -class DynamicBitmap : public Bitmap { +namespace pax { +extern const uint8 kNumBits[]; +#define BM_WORD_BITS (sizeof(T) << 3) +// log2(BM_WORD_BITS) +#define BM_WORD_SHIFTS \ + (sizeof(T) == 1 ? 3 : (sizeof(T) == 2 ? 4 : (sizeof(T) == 4 ? 
5 : 6))) +#define BM_INDEX_WORD_OFF(index) ((index) >> BM_WORD_SHIFTS) +#define BM_INDEX_BIT_OFF(index) ((index) & (BM_WORD_BITS - 1)) +#define BM_INDEX_BIT(index) (1ULL << BM_INDEX_BIT_OFF(index)) +template +struct BitmapRaw final { public: - friend class BitmapIterator; - DynamicBitmap(); - explicit DynamicBitmap(uint32 size); + inline void Set(uint32 index) { + bitmap[BM_INDEX_WORD_OFF(index)] |= BM_INDEX_BIT(index); + } + // set first the bits [0, index] to 1 + inline void SetN(uint32 index) { + memset(&bitmap[0], -1, sizeof(T) * BM_INDEX_WORD_OFF(index)); + bitmap[BM_INDEX_WORD_OFF(index)] |= (BM_INDEX_BIT(index) << 1) - 1; + } + inline void Clear(uint32 index) { + bitmap[BM_INDEX_WORD_OFF(index)] &= ~BM_INDEX_BIT(index); + } + inline void ClearN(uint32 index) { + memset(&bitmap[0], 0, sizeof(T) * BM_INDEX_WORD_OFF(index)); + bitmap[BM_INDEX_WORD_OFF(index)] &= ~((BM_INDEX_BIT(index) << 1) - 1); + } + inline void ClearAll() { + AssertImply(size > 0, bitmap); + if (size > 0) memset(&bitmap[0], 0, sizeof(T) * size); + } + inline bool Test(uint32 index) const { + return (bitmap[BM_INDEX_WORD_OFF(index)] & BM_INDEX_BIT(index)) != 0; + } + // invert the bit and return the old value. 
+ inline bool Toggle(uint32 index) { + return !((bitmap[BM_INDEX_WORD_OFF(index)] ^= BM_INDEX_BIT(index)) & + BM_INDEX_BIT(index)); + } + inline size_t WordBits(T v) const { + if (sizeof(T) == 1) + return kNumBits[v]; + else if (sizeof(T) == 2) + return kNumBits[v & 0xff] + kNumBits[v >> 8]; + else if (sizeof(T) == 4) + return kNumBits[v & 0xff] + kNumBits[(v >> 8) & 0xff] + + kNumBits[(v >> 16) & 0xff] + kNumBits[(v >> 24) & 0xff]; + else if (sizeof(T) == 8) + return kNumBits[v & 0xff] + kNumBits[(v >> 8) & 0xff] + + kNumBits[(v >> 16) & 0xff] + kNumBits[(v >> 24) & 0xff] + + kNumBits[(v >> 32) & 0xff] + kNumBits[(v >> 40) & 0xff] + + kNumBits[(v >> 48) & 0xff] + kNumBits[(v >> 56) & 0xff]; + return 0; + } + // count bits in range [0, index] + inline size_t CountBits(uint32 index) const { + size_t nbits = 0; + for (uint32 i = 0; i < BM_INDEX_WORD_OFF(index); i++) + nbits += WordBits(bitmap[i]); + { + auto w = bitmap[BM_INDEX_WORD_OFF(index)]; + nbits += WordBits(w & ((BM_INDEX_BIT(index) << 1) - 1)); + } - virtual ~DynamicBitmap(); + return nbits; + } + // count bits in range [start, end] + inline size_t CountBits(uint32 start_index, uint32 end_index) const { + size_t nbits = 0; + uint32 word_off = BM_INDEX_WORD_OFF(start_index); - void Set(uint32 index) override; + Assert(start_index <= end_index); - bool Test(uint32 index) const override; + if (BM_INDEX_WORD_OFF(end_index) == word_off) { + uint32 w = bitmap[word_off] >> BM_INDEX_BIT_OFF(start_index); + return WordBits(w & ((1ULL << (end_index - start_index + 1)) - 1)); + } + { + uint32 w = bitmap[BM_INDEX_WORD_OFF(start_index)]; + nbits += WordBits(w >> BM_INDEX_BIT_OFF(start_index)); + } + for (uint32 i = BM_INDEX_WORD_OFF(start_index + BM_WORD_BITS), + n = BM_INDEX_WORD_OFF(end_index); + i < n; i++) + nbits += WordBits(bitmap[i]); + { + auto w = bitmap[BM_INDEX_WORD_OFF(end_index)]; + nbits += WordBits(w & ((BM_INDEX_BIT(end_index) << 1) - 1)); + } + return nbits; + } - void Clear(uint32 index) override; + 
inline bool HasEnoughSpace(uint32 index) const { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || + sizeof(T) == 8); + static_assert(BM_WORD_BITS == (1 << BM_WORD_SHIFTS)); + return (index >> BM_WORD_SHIFTS) < size; + } + inline bool Empty() const { + if (!bitmap) return true; + for (size_t i = 0; i < size; i++) + if (bitmap[i]) return false; + return true; + } + BitmapRaw() = default; + BitmapRaw(T *buffer, size_t size) : bitmap(buffer), size(size) {} + BitmapRaw(const BitmapRaw &) = delete; + BitmapRaw(BitmapRaw &&raw) : bitmap(raw.bitmap), size(raw.size) { + raw.bitmap = nullptr; + raw.size = 0; + } + BitmapRaw &operator=(BitmapRaw) = delete; + BitmapRaw &operator=(BitmapRaw &) = delete; + BitmapRaw &operator=(const BitmapRaw &) = delete; + BitmapRaw &operator=(BitmapRaw &&raw) { + if (this != &raw) { + PAX_DELETE_ARRAY(bitmap); + bitmap = raw.bitmap; + size = raw.size; + raw.bitmap = nullptr; + raw.size = 0; + } + return *this; + } - void Reset() override; + ~BitmapRaw() = default; - void Resize(int size); - - // TODO(gongxun): need to do optimization for this function - bool BitmapFindFirst(uint32 offset, bool value, uint32 *idx) const override; - - uint32 NumBits() const override; - - private: - std::vector bitmap_; + T *bitmap = nullptr; + size_t size = 0; }; -class FixedBitmap : public Bitmap { +template +class BitmapTpl final { public: - friend class BitmapIterator; - explicit FixedBitmap(uint32 size); + using BitmapMemoryPolicy = void (*)(BitmapRaw &, uint32); + explicit BitmapTpl(uint32 initial_size = 16, + BitmapMemoryPolicy policy = DefaultBitmapMemoryPolicy) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || + sizeof(T) == 8); + static_assert(BM_WORD_BITS == (1 << BM_WORD_SHIFTS)); + policy_ = policy; + policy(raw_, Max(initial_size, 16)); + } + explicit BitmapTpl(const BitmapRaw &raw, BitmapMemoryPolicy policy) { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || + sizeof(T) == 8); + 
static_assert(BM_WORD_BITS == (1 << BM_WORD_SHIFTS)); + Assert(policy == ReadOnlyRefBitmap || policy == ReadOnlyOwnBitmap); + policy_ = policy; + raw_.bitmap = raw.bitmap; + raw_.size = raw.size; + } + BitmapTpl(const BitmapTpl &tpl) = delete; + BitmapTpl(BitmapTpl &&tpl) + : raw_(std::move(tpl.raw_)), policy_(tpl.policy_) {} + BitmapTpl &operator=(const BitmapTpl &tpl) = delete; + BitmapTpl &operator=(BitmapTpl &&tpl) = delete; + ~BitmapTpl() { + // Reference doesn't free the memory + if (policy_ == ReadOnlyRefBitmap) raw_.bitmap = nullptr; + } + inline size_t WordBits() const { return BM_WORD_BITS; } + inline void Set(uint32 index) { + if (unlikely(!raw_.HasEnoughSpace(index))) policy_(raw_, index); + raw_.Set(index); + } + inline void SetN(uint32 index) { + if (unlikely(!raw_.HasEnoughSpace(index))) policy_(raw_, index); + raw_.SetN(index); + } + inline void Clear(uint32 index) { + if (likely(raw_.HasEnoughSpace(index))) raw_.Clear(index); + } + inline void ClearN(uint32 index) { + if (raw_.HasEnoughSpace(index)) + raw_.ClearN(index); + else + raw_.ClearAll(); + } + inline void ClearAll() { raw_.ClearAll(); } + inline bool Test(uint32 index) const { + if (likely(raw_.HasEnoughSpace(index))) return raw_.Test(index); + return false; + } + // invert the bit and return the old value. 
+ inline bool Toggle(uint32 index) { + if (unlikely(!raw_.HasEnoughSpace(index))) policy_(raw_, index); + return raw_.Toggle(index); + } + // count bits in range [0, index] + inline size_t CountBits(uint32 index) const { + if (raw_.size == 0) return 0; + if ((raw_.size << BM_WORD_SHIFTS) <= index) + index = (raw_.size << BM_WORD_SHIFTS) - 1; + return raw_.CountBits(index); + } + inline size_t CountBits(uint32 start_index, uint32 end_index) const { + if ((raw_.size << BM_WORD_SHIFTS) <= start_index) return 0; + if ((raw_.size << BM_WORD_SHIFTS) <= end_index) + end_index = (raw_.size << BM_WORD_SHIFTS) - 1; + Assert(start_index <= end_index); + return raw_.CountBits(start_index, end_index); + } - virtual ~FixedBitmap(); + inline bool Empty() const { return raw_.Empty(); } - void Set(uint32 index) override; + BitmapMemoryPolicy Policy() const { return policy_; } - bool Test(uint32 index) const override; + const BitmapRaw &Raw() const { return raw_; } + BitmapRaw &Raw() { return raw_; } - void Reset() override; + static void DefaultBitmapMemoryPolicy(BitmapRaw &raw, uint32 index) { + auto old_bitmap = raw.bitmap; + auto old_size = raw.size; + auto size = Max(BM_INDEX_WORD_OFF(index) + 1, old_size * 2); + auto p = PAX_NEW_ARRAY(size); + if (old_size > 0) memcpy(p, old_bitmap, sizeof(T) * old_size); + memset(&p[old_size], 0, sizeof(T) * (size - old_size)); + raw.bitmap = p; + raw.size = size; + PAX_DELETE_ARRAY(old_bitmap); + } + static void ReadOnlyRefBitmap(BitmapRaw &/*raw*/, uint32 /*index*/) { + // raise + CBDB_RAISE(cbdb::CException::kExTypeInvalidMemoryOperation); + } + static void ReadOnlyOwnBitmap(BitmapRaw &/*raw*/, uint32 /*index*/) { + CBDB_RAISE(cbdb::CException::kExTypeInvalidMemoryOperation); + } - void Clear(uint32 index) override; - - uint32 Size() const; - - uint32 NumBits() const override; - - bool BitmapFindFirst(uint32 offset, bool value, uint32 *idx) const; + static inline size_t RequireWords(size_t nbits) { + return nbits ? 
((nbits - 1) >> BM_WORD_SHIFTS) + 1 : 0; + } + inline size_t CurrentBytes() const { return sizeof(T) * raw_.size; } + inline size_t MinimalStoredBytes(size_t nbits) { + auto nwords = RequireWords(nbits); + if (nwords > raw_.size) nwords = raw_.size; + while (nwords > 0 && raw_.bitmap[nwords - 1] == 0) nwords--; + return nwords * sizeof(T); + } private: - FixedBitmap(const FixedBitmap &other) = delete; - FixedBitmap(FixedBitmap &&other) = delete; - FixedBitmap &operator=(const FixedBitmap &other) = delete; - FixedBitmap &operator=(FixedBitmap &&other) = delete; - - uint32 byte_size_; - uint32 num_bits_; - uint8 *bitmap_; + inline bool HasEnoughSpace(uint32 index) const { + return raw_.HasEnoughSpace(index); + } + BitmapRaw raw_; + BitmapMemoryPolicy policy_; }; -class BitmapIterator { - public: - explicit BitmapIterator(Bitmap *map); - - void SeekTo(size_t bit); - - int32 Next(bool value); +using Bitmap8 = BitmapTpl; +using Bitmap64 = BitmapTpl; - private: - uint32 offset_; - Bitmap *bitmap_; -}; } // namespace pax diff --git a/contrib/pax_storage/src/cpp/comm/bitmap_test.cc b/contrib/pax_storage/src/cpp/comm/bitmap_test.cc index b92b18b63ca..f5f7f164725 100644 --- a/contrib/pax_storage/src/cpp/comm/bitmap_test.cc +++ b/contrib/pax_storage/src/cpp/comm/bitmap_test.cc @@ -5,68 +5,162 @@ namespace pax::tests { class BitMapTest : public ::testing::Test {}; -TEST_F(BitMapTest, test) { - FixedBitmap bit_map(100); - ASSERT_EQ(bit_map.Test(0), false); - ASSERT_EQ(bit_map.Test(99), false); - bit_map.Set(0); - ASSERT_EQ(bit_map.Test(0), true); - ASSERT_EQ(bit_map.Test(99), false); - bit_map.Set(99); - ASSERT_EQ(bit_map.Test(0), true); - ASSERT_EQ(bit_map.Test(99), true); - bit_map.Clear(0); - ASSERT_EQ(bit_map.Test(0), false); - ASSERT_EQ(bit_map.Test(99), true); - bit_map.Clear(99); - ASSERT_EQ(bit_map.Test(0), false); - ASSERT_EQ(bit_map.Test(99), false); - - ASSERT_EQ(bit_map.Size(), 13); +TEST_F(BitMapTest, Bitmap8) { + Bitmap8 bm(20); + + ASSERT_TRUE(bm.Empty()); + for 
(auto i = 0; i <= 128; i++) { + ASSERT_FALSE(bm.Test(i)); // zeros + ASSERT_FALSE(bm.Toggle(i)); + ASSERT_TRUE(bm.Test(i)); + ASSERT_TRUE(bm.Toggle(i)); + ASSERT_FALSE(bm.Test(i)); + + ASSERT_FALSE(bm.Test(i)); // zeros + bm.Set(i); + ASSERT_TRUE(bm.Test(i)); + bm.Set(i); + ASSERT_TRUE(bm.Test(i)); + + bm.Clear(i); + ASSERT_FALSE(bm.Test(i)); + bm.Clear(i); + ASSERT_FALSE(bm.Test(i)); + + bm.Set(i); + ASSERT_TRUE(bm.Test(i)); + } +} + +TEST_F(BitMapTest, Bitmap8SetN) { + Bitmap8 bm(10); + const auto nbits = 128; + + ASSERT_TRUE(bm.Empty()); + for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + + auto fn = [&bm, nbits](uint32 index) { + bm.ClearAll(); + for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + bm.SetN(index); + for (uint32 i = 0; i <= index; i++) ASSERT_TRUE(bm.Test(i)); + for (uint32 i = index + 1; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + }; + for (uint32 i = 0; i <= nbits; i++) fn(i); +} + +TEST_F(BitMapTest, Bitmap8ClearN) { + Bitmap8 bm(10); + const auto nbits = 128; + + ASSERT_TRUE(bm.Empty()); + for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + + auto fn = [&bm, nbits](uint32 index) { + for (auto i = 0; i <= nbits; i++) { + bm.Set(i); + ASSERT_TRUE(bm.Test(i)); + } + bm.ClearN(index); + for (uint32 i = 0; i <= index; i++) ASSERT_FALSE(bm.Test(i)); + for (uint32 i = index + 1; i <= nbits; i++) ASSERT_TRUE(bm.Test(i)); + }; + for (uint32 i = 0; i <= nbits; i++) fn(i); } -TEST_F(BitMapTest, FixedBitmap) { - FixedBitmap bit_map(100); - bit_map.Set(0); - bit_map.Set(50); - bit_map.Set(99); +TEST_F(BitMapTest, Bitmap64) { + Bitmap64 bm(100); - BitmapIterator it(&bit_map); + ASSERT_TRUE(bm.Empty()); + for (auto i = 0; i <= 128; i++) { + ASSERT_FALSE(bm.Test(i)); // zeros + ASSERT_FALSE(bm.Toggle(i)); + ASSERT_TRUE(bm.Test(i)); + ASSERT_TRUE(bm.Toggle(i)); + ASSERT_FALSE(bm.Test(i)); - ASSERT_EQ(it.Next(true), 0); - ASSERT_EQ(it.Next(true), 50); - ASSERT_EQ(it.Next(true), 99); + bm.Set(i); + ASSERT_TRUE(bm.Test(i)); + 
bm.Set(i); + ASSERT_TRUE(bm.Test(i)); - it.SeekTo(0); - ASSERT_EQ(it.Next(false), 1); - ASSERT_EQ(it.Next(false), 2); - ASSERT_EQ(it.Next(false), 3); + bm.Clear(i); + ASSERT_FALSE(bm.Test(i)); + bm.Clear(i); + ASSERT_FALSE(bm.Test(i)); + + bm.Set(i); + ASSERT_TRUE(bm.Test(i)); + } } +TEST_F(BitMapTest, Bitmap64SetN) { + Bitmap64 bm(1); + const auto nbits = 512; -TEST_F(BitMapTest, DynamicBitmap) { - DynamicBitmap bit_map(100); - bit_map.Set(0); - bit_map.Set(50); - bit_map.Set(99); + ASSERT_TRUE(bm.Empty()); + for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); - BitmapIterator it(&bit_map); + auto fn = [&bm, nbits](uint32 index) { + bm.ClearAll(); + for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + bm.SetN(index); + for (uint32 i = 0; i <= index; i++) ASSERT_TRUE(bm.Test(i)); + for (uint32 i = index + 1; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + }; + for (uint32 i = 0; i <= nbits; i++) fn(i); +} - ASSERT_EQ(it.Next(true), 0); - ASSERT_EQ(it.Next(true), 50); - ASSERT_EQ(it.Next(true), 99); +TEST_F(BitMapTest, Bitmap64ClearN) { + Bitmap64 bm(1); + const auto nbits = 512; - bit_map.Resize(200); - bit_map.Set(100); - bit_map.Set(150); - bit_map.Set(199); + ASSERT_TRUE(bm.Empty()); + for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); - ASSERT_EQ(it.Next(true), 100); - ASSERT_EQ(it.Next(true), 150); - ASSERT_EQ(it.Next(true), 199); + auto fn = [&bm, &nbits](uint32 index) { + for (auto i = 0; i <= nbits; i++) { + bm.Set(i); + ASSERT_TRUE(bm.Test(i)); + } + bm.ClearN(index); + for (uint32 i = 0; i <= index; i++) ASSERT_FALSE(bm.Test(i)); + for (uint32 i = index + 1; i <= nbits; i++) ASSERT_TRUE(bm.Test(i)); + }; + for (uint32 i = 0; i <= nbits; i++) fn(i); +} + +TEST_F(BitMapTest, CountBits) { + const uint32 starts[] = {0, 1, 3, 7}; + const uint32 ends[] = {0, 1, 7, 8, 9, 15, 16, 17}; + Bitmap8 bm(11); - it.SeekTo(0); - ASSERT_EQ(it.Next(false), 1); - ASSERT_EQ(it.Next(false), 2); - ASSERT_EQ(it.Next(false), 3); + auto fill_bits = 
[&bm](uint32 bits) { + uint32 k = 0; + bm.ClearAll(); + while (bits) { + if (bits & 1) bm.Set(k); + bits = bits >> 1; + k++; + } + }; + auto plain_count = [](uint32 bits, uint32 start, uint32 end) { + size_t nbits = 0; + for (auto i = start; i <= end; i++) { + if (bits & (1ULL << i)) nbits++; + } + return nbits; + }; + + for (uint32 i = 0; i < 0x3ffff; i++) { + fill_bits(i); + for (auto start : starts) { + for (auto end : ends) { + if (end < start) continue; + ASSERT_EQ(bm.CountBits(start, end), plain_count(i, start, end)); + ASSERT_EQ(bm.CountBits(end), plain_count(i, 0, end)); + } + } + } } + } // namespace pax::tests diff --git a/contrib/pax_storage/src/cpp/comm/cbdb_api.h b/contrib/pax_storage/src/cpp/comm/cbdb_api.h index b97800252ee..dcf074ed37b 100644 --- a/contrib/pax_storage/src/cpp/comm/cbdb_api.h +++ b/contrib/pax_storage/src/cpp/comm/cbdb_api.h @@ -1,11 +1,19 @@ #ifndef SRC_CPP_COMM_CBDB_API_H_ #define SRC_CPP_COMM_CBDB_API_H_ +#include "comm/pax_rel.h" + #ifdef __cplusplus extern "C" { #endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wregister" + #include "postgres.h" // NOLINT +#include "postmaster/postmaster.h" +#include "access/detoast.h" #include "access/genam.h" #include "access/heapam.h" #include "access/relscan.h" @@ -14,17 +22,39 @@ extern "C" { #include "access/tsmapi.h" #include "access/tupdesc.h" #include "access/tupdesc_details.h" +#include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/heap.h" +#include "catalog/gp_indexing.h" +#include "catalog/index.h" #include "catalog/indexing.h" +#include "catalog/objectaccess.h" #include "catalog/oid_dispatch.h" #include "catalog/pg_am.h" #include "catalog/pg_amop.h" #include "catalog/pg_amproc.h" +#include "catalog/pg_attribute_encoding.h" +#include "catalog/pg_collation.h" #include "catalog/pg_namespace.h" +#include "catalog/pg_opclass.h" +#include "catalog/toasting.h" +#include "commands/progress.h" +#include "commands/tablecmds.h" +#include 
"nodes/execnodes.h" +#include "funcapi.h" +#include "partitioning/partdesc.h" +#include "partitioning/partbounds.h" +#include "pgstat.h" +#include "utils/partcache.h" +#include "utils/ruleutils.h" +#include "access/nbtree.h" +#include "access/hash.h" +#include "parser/parse_utilcmd.h" +#include "nodes/makefuncs.h" +#include "parser/parse_oper.h" +#include "parser/parse_expr.h" #ifndef BUILD_PAX_FORMAT #include "access/reloptions.h" -#include "catalog/pg_pax_tables.h" #endif #include "catalog/storage.h" #include "cdb/cdbvars.h" @@ -41,33 +71,32 @@ extern "C" { #include "storage/lwlock.h" #include "storage/relfilenode.h" #include "storage/smgr.h" +#include "utils/backend_progress.h" #include "utils/builtins.h" +#include "utils/date.h" #include "utils/datum.h" #include "utils/elog.h" #include "utils/hsearch.h" #include "utils/lsyscache.h" #include "utils/memutils.h" +#include "utils/numeric.h" #include "utils/relcache.h" #include "utils/snapshot.h" #include "utils/syscache.h" #include "utils/wait_event.h" // no header file in cbdb -extern BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks); // NOLINT -extern bool extractcolumns_from_node(Node *expr, bool *cols, AttrNumber natts); // NOLINT +extern BlockNumber system_nextsampleblock(SampleScanState *node, // NOLINT + BlockNumber nblocks); +extern bool extractcolumns_from_node(Node *expr, bool *cols, // NOLINT + AttrNumber natts); +extern int get_partition_for_tuple(PartitionKey key, PartitionDesc partdesc, // NOLINT + Datum *values, bool *isnull); extern Oid GetDefaultOpClass(Oid type_id, Oid am_id); + +#pragma GCC diagnostic pop #ifdef __cplusplus } #endif -#define PAX_TABLE_AM_OID 7014 -#define PAX_AMNAME "pax" -#define PAX_AM_HANDLER_OID 7600 -#define PAX_AM_HANDLER_NAME "pax_tableam_handler" - -#define PAX_AUX_STATS_IN_OID 7601 -#define PAX_AUX_STATS_OUT_OID 7602 -#define PAX_AUX_STATS_TYPE_OID 7603 -#define PAX_AUX_STATS_TYPE_NAME "paxauxstats" - #endif // SRC_CPP_COMM_CBDB_API_H_ diff 
--git a/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.cc b/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.cc index fb7c4beced4..cb683fdfea6 100644 --- a/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.cc +++ b/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.cc @@ -1,6 +1,6 @@ #include "comm/cbdb_wrappers.h" + #include "comm/paxc_wrappers.h" -#include "storage/paxc_block_map_manager.h" extern "C" { const char *progname; } @@ -83,22 +83,6 @@ void Pfree(void *ptr) { } // namespace cbdb -void *operator new(std::size_t size) { return cbdb::Palloc(size); } - -void *operator new[](std::size_t size) { return cbdb::Palloc(size); } - -void *operator new(std::size_t size, MemoryContext ctx) { - return cbdb::MemCtxAlloc(ctx, size); -} - -void *operator new[](std::size_t size, MemoryContext ctx) { - return cbdb::MemCtxAlloc(ctx, size); -} - -void operator delete(void *ptr) { if (ptr) cbdb::Pfree(ptr); } - -void operator delete[](void *ptr) { if (ptr) cbdb::Pfree(ptr); } - HTAB *cbdb::HashCreate(const char *tabname, int64 nelem, const HASHCTL *info, int flags) { CBDB_WRAP_START; @@ -173,6 +157,14 @@ Datum cbdb::DatumFromPointer(const void *p, int16 typlen) { } #endif + +struct varlena *cbdb::PgDeToastDatum(struct varlena *datum) { + CBDB_WRAP_START; + { return detoast_attr(datum); } + CBDB_WRAP_END; + return nullptr; +} + struct varlena *cbdb::PgDeToastDatumPacked(struct varlena *datum) { CBDB_WRAP_START; { return pg_detoast_datum_packed(datum); } @@ -191,39 +183,10 @@ void *cbdb::PointerAndLenFromDatum(Datum d, int *len) { CBDB_WRAP_END; } -// pax ctid mapping functions - -void cbdb::InitCommandResource() { +void cbdb::SlotGetMissingAttrs(TupleTableSlot *slot, int start_attno, + int last_attno) { CBDB_WRAP_START; - { paxc::init_command_resource(); } - CBDB_WRAP_END; -} -void cbdb::ReleaseCommandResource() { - CBDB_WRAP_START; - { paxc::release_command_resource(); } - CBDB_WRAP_END; -} - -void cbdb::GetTableIndexAndTableNumber(Oid table_rel_oid, uint8 *table_no, - uint32 
*table_index) { - CBDB_WRAP_START; - { - paxc::get_table_index_and_table_number(table_rel_oid, table_no, - table_index); - } - CBDB_WRAP_END; -} - -uint32 cbdb::GetBlockNumber(Oid table_rel_oid, uint32 table_index, - paxc::PaxBlockId block_id) { - CBDB_WRAP_START; - { return paxc::get_block_number(table_rel_oid, table_index, block_id); } - CBDB_WRAP_END; -} -paxc::PaxBlockId cbdb::GetBlockId(Oid table_rel_oid, uint8 table_no, - uint32 block_number) { - CBDB_WRAP_START; - { return paxc::get_block_id(table_rel_oid, table_no, block_number); } + { slot_getmissingattrs(slot, start_attno, last_attno); } CBDB_WRAP_END; } @@ -299,16 +262,10 @@ std::string cbdb::BuildPaxDirectoryPath(RelFileNode rd_node, CBDB_WRAP_END; } -std::string cbdb::BuildPaxFilePath(const Relation rel, +std::string cbdb::BuildPaxFilePath(const std::string &rel_path, const std::string &block_id) { - CBDB_WRAP_START; - { - char *tmp_str = paxc::BuildPaxFilePath(rel, block_id.c_str()); - std::string ret_str(tmp_str); - pfree(tmp_str); - return ret_str; - } - CBDB_WRAP_END; + Assert(!rel_path.empty()); + return rel_path + "/" + block_id; } int cbdb::RelationGetAttributesNumber(Relation rel) { @@ -317,56 +274,124 @@ int cbdb::RelationGetAttributesNumber(Relation rel) { CBDB_WRAP_END; } +StdRdOptions **cbdb::RelGetAttributeOptions(Relation rel) { + CBDB_WRAP_START; + { return RelationGetAttributeOptions(rel); } + CBDB_WRAP_END; +} + TupleDesc cbdb::RelationGetTupleDesc(Relation rel) { CBDB_WRAP_START; { return RelationGetDescr(rel); } CBDB_WRAP_END; } -bool cbdb::ExtractcolumnsFromNode(Node *expr, bool *cols, AttrNumber natts) { +bool cbdb::IsSystemAttrNumExist(struct PaxcExtractcolumnContext *context, + AttrNumber number) { + Assert(number < 0 && number > FirstLowInvalidHeapAttributeNumber && context); + return context->system_attr_number_mask[~number]; +} + +extern "C" { + +static bool paxc_extractcolumns_walker( // NOLINT + Node *node, struct PaxcExtractcolumnContext *ec_ctx) { + if (node == NULL) { + 
return false; + } + + if (IsA(node, Var)) { + Var *var = (Var *)node; + + if (IS_SPECIAL_VARNO(var->varno)) return false; + + if (var->varattno < 0) { + Assert(var->varattno > FirstLowInvalidHeapAttributeNumber); + ec_ctx->system_attr_number_mask[~var->varattno] = true; + } else if (ec_ctx->cols) { + if (var->varattno == 0) { + // If all attributes are included, + // set all entries in mask to true. + for (int attno = 0; attno < ec_ctx->natts; attno++) + ec_ctx->cols[attno] = true; + ec_ctx->found = true; + } else if (var->varattno <= ec_ctx->natts) { + ec_ctx->cols[var->varattno - 1] = true; + ec_ctx->found = true; + } + // Still need fill `system_attr_number_mask` + // Let this case return false + } + + return false; + } + + return expression_tree_walker(node, (bool (*)())paxc_extractcolumns_walker, + (void *)ec_ctx); +} + +}; // extern "C" + +bool cbdb::ExtractcolumnsFromNode(Node *expr, + struct PaxcExtractcolumnContext *ec_ctx) { + CBDB_WRAP_START; + { + paxc_extractcolumns_walker(expr, ec_ctx); + return ec_ctx->found; + } + CBDB_WRAP_END; +} + +bool cbdb::ExtractcolumnsFromNode(Node *expr, bool *cols, int natts) { CBDB_WRAP_START; { return extractcolumns_from_node(expr, cols, natts); } CBDB_WRAP_END; } -bool cbdb::MinMaxGetStrategyProcinfo(Oid atttypid, Oid *procid, FmgrInfo *finfo, StrategyNumber strategynum) -{ +bool cbdb::MinMaxGetStrategyProcinfo(Oid atttypid, Oid subtype, Oid *opfamily, FmgrInfo *finfo, + StrategyNumber strategynum) { CBDB_WRAP_START; - { return paxc::MinMaxGetStrategyProcinfo(atttypid, procid, finfo, strategynum); } + { + return paxc::MinMaxGetStrategyProcinfo(atttypid, subtype, opfamily, finfo, + strategynum); + } CBDB_WRAP_END; } -Datum cbdb::FunctionCall1Coll(FmgrInfo *flinfo, Oid collation, Datum arg1) -{ +Datum cbdb::FunctionCall1Coll(FmgrInfo *flinfo, Oid collation, Datum arg1) { CBDB_WRAP_START; - { return ::FunctionCall1Coll(flinfo, collation, arg1); } + { return ::FunctionCall1Coll(flinfo, collation, arg1); } CBDB_WRAP_END; } 
-Datum cbdb::FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2) -{ +Datum cbdb::FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, + Datum arg2) { CBDB_WRAP_START; - { return ::FunctionCall2Coll(flinfo, collation, arg1, arg2); } + { return ::FunctionCall2Coll(flinfo, collation, arg1, arg2); } CBDB_WRAP_END; } -Datum cbdb::FunctionCall3Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3) -{ +Datum cbdb::FunctionCall3Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, + Datum arg2, Datum arg3) { CBDB_WRAP_START; - { return ::FunctionCall3Coll(flinfo, collation, arg1, arg2, arg3); } + { return ::FunctionCall3Coll(flinfo, collation, arg1, arg2, arg3); } CBDB_WRAP_END; } -Datum cbdb::FunctionCall4Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3, Datum arg4) -{ +Datum cbdb::FunctionCall4Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, + Datum arg2, Datum arg3, Datum arg4) { CBDB_WRAP_START; - { return ::FunctionCall4Coll(flinfo, collation, arg1, arg2, arg3, arg4); } + { return ::FunctionCall4Coll(flinfo, collation, arg1, arg2, arg3, arg4); } CBDB_WRAP_END; } -SysScanDesc cbdb::SystableBeginScan(Relation rel, Oid index_id, bool index_ok, Snapshot snapshot, int n_keys, ScanKey keys) { +SysScanDesc cbdb::SystableBeginScan(Relation rel, Oid index_id, bool index_ok, + Snapshot snapshot, int n_keys, + ScanKey keys) { CBDB_WRAP_START; - { return systable_beginscan(rel, index_id, index_ok, snapshot, n_keys, keys); } + { + return systable_beginscan(rel, index_id, index_ok, snapshot, n_keys, keys); + } CBDB_WRAP_END; } @@ -382,7 +407,8 @@ void cbdb::SystableEndScan(SysScanDesc desc) { CBDB_WRAP_END; } -Datum cbdb::HeapGetAttr(HeapTuple tup, int attnum, TupleDesc tuple_desc, bool *isnull) { +Datum cbdb::HeapGetAttr(HeapTuple tup, int attnum, TupleDesc tuple_desc, + bool *isnull) { CBDB_WRAP_START; { return heap_getattr(tup, attnum, tuple_desc, isnull); } CBDB_WRAP_END; diff --git 
a/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.h b/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.h index 9f80b5e614c..02b10f2d007 100644 --- a/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.h +++ b/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.h @@ -6,13 +6,28 @@ #include #include "exceptions/CException.h" -#include "storage/pax_block_id.h" + +struct PaxcExtractcolumnContext { + // If cols set and call ExtractcolumnsFromNode with + // `target list`. Then the cols will fill with projection mask. + bool *cols = nullptr; + int natts = 0; + bool found = false; + + // This mask use to filter system attribute number. + // (~AttrNumber) will be index, mapping the [0, + // FirstLowInvalidHeapAttributeNumber) call `IsSystemAttrNumExist` to check + // system-defined attributes set + bool system_attr_number_mask[~FirstLowInvalidHeapAttributeNumber] = { + 0}; // NOLINT +}; namespace cbdb { #define PAX_ALLOCSET_DEFAULT_MINSIZE ALLOCSET_DEFAULT_MINSIZE #define PAX_ALLOCSET_DEFAULT_INITSIZE (8 * 1024) #define PAX_ALLOCSET_DEFAULT_MAXSIZE (3 * 64 * 1024 * 1024) + #define PAX_ALLOCSET_DEFAULT_SIZES \ PAX_ALLOCSET_DEFAULT_MINSIZE, PAX_ALLOCSET_DEFAULT_INITSIZE, \ PAX_ALLOCSET_DEFAULT_MAXSIZE @@ -98,23 +113,17 @@ static inline Datum Int64ToDatum(int64 d) noexcept { return Int64GetDatum(d); } void *PointerAndLenFromDatum(Datum d, int *len); +void SlotGetMissingAttrs(TupleTableSlot *slot, int start_attno, int last_attno); + #ifdef RUN_GTEST Datum DatumFromCString(const char *src, size_t length); Datum DatumFromPointer(const void *p, int16 typlen); #endif -struct varlena *PgDeToastDatumPacked(struct varlena *datum); +struct varlena *PgDeToastDatum(struct varlena *datum); -// pax ctid mapping functions -void InitCommandResource(); -void ReleaseCommandResource(); -void GetTableIndexAndTableNumber(Oid table_rel_oid, uint8 *table_no, - uint32 *table_index); -uint32 GetBlockNumber(Oid table_rel_oid, uint32 table_index, - paxc::PaxBlockId block_id); -paxc::PaxBlockId GetBlockId(Oid 
table_rel_oid, uint8 table_no, - uint32 block_number); +struct varlena *PgDeToastDatumPacked(struct varlena *datum); void RelationCreateStorageDirectory(RelFileNode rnode, char relpersistence, SMgrImpl smgr_which, Relation rel); @@ -137,31 +146,44 @@ void MakedirRecursive(const char *path); std::string BuildPaxDirectoryPath(RelFileNode rd_node, BackendId rd_backend); +std::string BuildPaxFilePath(const std::string &rel_path, const std::string &block_id); + int RelationGetAttributesNumber(Relation rel); +StdRdOptions **RelGetAttributeOptions(Relation rel); TupleDesc RelationGetTupleDesc(Relation rel); -bool ExtractcolumnsFromNode(Node *expr, bool *cols, AttrNumber natts); +bool ExtractcolumnsFromNode(Node *expr, + struct PaxcExtractcolumnContext *ec_ctx); + +bool IsSystemAttrNumExist(struct PaxcExtractcolumnContext *context, + AttrNumber number); -std::string BuildPaxFilePath(Relation rel, const std::string &block_id); +bool ExtractcolumnsFromNode(Node *expr, bool *cols, int natts); -bool MinMaxGetStrategyProcinfo(Oid atttypid, Oid *procid, FmgrInfo *finfo, StrategyNumber strategynum); +bool MinMaxGetStrategyProcinfo(Oid atttypid, Oid subtype, Oid *opfamily, FmgrInfo *finfo, + StrategyNumber strategynum); Datum FunctionCall1Coll(FmgrInfo *flinfo, Oid collation, Datum arg1); -Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2); +Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, + Datum arg2); -Datum FunctionCall3Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3); +Datum FunctionCall3Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, + Datum arg3); -Datum FunctionCall4Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3, Datum arg4); +Datum FunctionCall4Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, + Datum arg3, Datum arg4); -SysScanDesc SystableBeginScan(Relation rel, Oid index_id, bool index_ok, Snapshot snapshot, int n_keys, ScanKey keys); 
+SysScanDesc SystableBeginScan(Relation rel, Oid index_id, bool index_ok, + Snapshot snapshot, int n_keys, ScanKey keys); HeapTuple SystableGetNext(SysScanDesc desc); void SystableEndScan(SysScanDesc desc); -Datum HeapGetAttr(HeapTuple tup, int attnum, TupleDesc tuple_desc, bool *isnull); +Datum HeapGetAttr(HeapTuple tup, int attnum, TupleDesc tuple_desc, + bool *isnull); Relation TableOpen(Oid relid, LOCKMODE lockmode); @@ -188,13 +210,3 @@ void TableClose(Relation rel, LOCKMODE lockmode); } \ } // clang-format on - -// override the default new/delete to use current memory context -extern void *operator new(std::size_t size); -extern void *operator new[](std::size_t size); -extern void operator delete(void *ptr); -extern void operator delete[](void *ptr); - -// specify memory context for this allocation without switching memory context -extern void *operator new(std::size_t size, MemoryContext ctx); -extern void *operator new[](std::size_t size, MemoryContext ctx); diff --git a/contrib/pax_storage/src/cpp/comm/comm_test.cc b/contrib/pax_storage/src/cpp/comm/comm_test.cc index da867ecae32..3ff39adab1a 100644 --- a/contrib/pax_storage/src/cpp/comm/comm_test.cc +++ b/contrib/pax_storage/src/cpp/comm/comm_test.cc @@ -11,8 +11,6 @@ class CommTest : public ::testing::Test { 1 * 1024 * 1024, 1 * 1024 * 1024); MemoryContextSwitchTo(comm_test_memory_context); } - - void TearDown() override {} }; TEST_F(CommTest, TestDeleteOperator) { @@ -32,4 +30,19 @@ TEST_F(CommTest, TestDeleteOperator) { delete[] array_obj; } + +TEST_F(CommTest, TestNewOperator) { + auto obj = new bool[0]; + ASSERT_NE(obj, nullptr); + delete[] obj; + + auto obj2 = cbdb::Palloc(0); + ASSERT_NE(obj2, nullptr); + cbdb::Pfree(obj2); + + auto obj3 = cbdb::Palloc0(0); + ASSERT_NE(obj3, nullptr); + cbdb::Pfree(obj3); +} + } // namespace pax::tests \ No newline at end of file diff --git a/contrib/pax_storage/src/cpp/comm/guc.cc b/contrib/pax_storage/src/cpp/comm/guc.cc new file mode 100644 index 
00000000000..7ae3334ac50 --- /dev/null +++ b/contrib/pax_storage/src/cpp/comm/guc.cc @@ -0,0 +1,15 @@ +#include "comm/guc.h" + +#include "storage/pax_defined.h" + +namespace pax { +bool pax_enable_debug = true; +bool pax_enable_filter = true; +int pax_scan_reuse_buffer_size = 0; +int pax_max_tuples_per_group = VEC_BATCH_LENGTH; + +#ifdef ENABLE_PLASMA +bool pax_enable_plasma_in_mem = true; +#endif + +} // namespace pax diff --git a/contrib/pax_storage/src/cpp/comm/guc.h b/contrib/pax_storage/src/cpp/comm/guc.h new file mode 100644 index 00000000000..c53678dc0dc --- /dev/null +++ b/contrib/pax_storage/src/cpp/comm/guc.h @@ -0,0 +1,13 @@ +#pragma once + +namespace pax { +extern bool pax_enable_debug; +extern bool pax_enable_filter; +extern int pax_scan_reuse_buffer_size; +extern int pax_max_tuples_per_group; + +#ifdef ENABLE_PLASMA +extern bool pax_enable_plasma_in_mem; +#endif + +} // namespace pax diff --git a/contrib/pax_storage/src/cpp/comm/log.h b/contrib/pax_storage/src/cpp/comm/log.h new file mode 100644 index 00000000000..21f2853b98e --- /dev/null +++ b/contrib/pax_storage/src/cpp/comm/log.h @@ -0,0 +1,12 @@ +#pragma once + +// Should never call PAX_LOG* without PAX_ENABLE_DEBUG +#define PAX_LOG_IF(ok, ...) \ + do { \ + if (ok) elog(LOG, __VA_ARGS__); \ + } while (0) + +#define PAX_LOG(...) 
\ + do { \ + elog(LOG, __VA_ARGS__); \ + } while (0) diff --git a/contrib/pax_storage/src/cpp/comm/pax_defer.h b/contrib/pax_storage/src/cpp/comm/pax_defer.h deleted file mode 100644 index ad39ba76bbd..00000000000 --- a/contrib/pax_storage/src/cpp/comm/pax_defer.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include - -namespace pax { - -template -class Defer { - public: - const F function; - - public: - constexpr explicit Defer(const F &function) : function{function} {} - constexpr explicit Defer(F &&function) : function{std::move(function)} {} - ~Defer() { function(); } -}; - -template -inline Defer make_defer(F &&function) { - return Defer(std::forward(function)); -} - -} // namespace pax - -#define DEFER_CONCAT(n, ...) \ - const auto defer##n = pax::make_defer([&] { __VA_ARGS__; }) -#define DEFER_FORWARD(n, ...) DEFER_CONCAT(n, __VA_ARGS__) -#define DEFER(...) DEFER_FORWARD(__LINE__, __VA_ARGS__) diff --git a/contrib/pax_storage/src/cpp/comm/pax_memory.cc b/contrib/pax_storage/src/cpp/comm/pax_memory.cc new file mode 100644 index 00000000000..b9c05660709 --- /dev/null +++ b/contrib/pax_storage/src/cpp/comm/pax_memory.cc @@ -0,0 +1,25 @@ +#include "comm/pax_memory.h" + +#include "comm/cbdb_wrappers.h" + +void *operator new(std::size_t size) { return cbdb::Palloc(size); } + +void *operator new[](std::size_t size) { return cbdb::Palloc(size); } + +void *operator new(std::size_t size, MemoryContext ctx) { + return cbdb::MemCtxAlloc(ctx, size); +} + +void *operator new[](std::size_t size, MemoryContext ctx) { + return cbdb::MemCtxAlloc(ctx, size); +} + +void operator delete(void *ptr) { + if (ptr) cbdb::Pfree(ptr); +} + +void operator delete[](void *ptr) { + if (ptr) cbdb::Pfree(ptr); +} + + diff --git a/contrib/pax_storage/src/cpp/comm/pax_memory.h b/contrib/pax_storage/src/cpp/comm/pax_memory.h new file mode 100644 index 00000000000..03a81c036b1 --- /dev/null +++ b/contrib/pax_storage/src/cpp/comm/pax_memory.h @@ -0,0 +1,60 @@ +#pragma once +#include 
"comm/cbdb_api.h" +#include + +//#include "memory_allocator.h" + +namespace pax { + +template +static inline T* PAX_NEW(Args&&... args) { + return new T(std::forward(args)...); +} + +template +static inline T* PAX_NEW_ARRAY(size_t N) { + return new T[N]; +} + +template +static inline void PAX_DELETE(T *&obj) { + delete obj; + obj = nullptr; +} + +template +static inline void PAX_DELETE_ARRAY(T *&obj) { + delete []obj; + obj = nullptr; +} + +struct PaxMemoryDeleter { + template + inline void operator()(T* p) const { + delete p; + } +}; + +template +using pax_unique_ptr = std::unique_ptr; + +template +using pax_shared_ptr = std::shared_ptr; + +//template +//using pax_unique_ptr = std::unique_ptr; + +//template +//using pax_shared_ptr = std::shared_ptr; + +} + +// override the default new/delete to use current memory context +extern void *operator new(std::size_t size); +extern void *operator new[](std::size_t size); +extern void operator delete(void *ptr); +extern void operator delete[](void *ptr); + +// specify memory context for this allocation without switching memory context +extern void *operator new(std::size_t size, MemoryContext ctx); +extern void *operator new[](std::size_t size, MemoryContext ctx); diff --git a/contrib/pax_storage/src/cpp/comm/pax_rel.h b/contrib/pax_storage/src/cpp/comm/pax_rel.h new file mode 100644 index 00000000000..43934b475ff --- /dev/null +++ b/contrib/pax_storage/src/cpp/comm/pax_rel.h @@ -0,0 +1,29 @@ + +#ifndef SRC_CPP_COMM_PAX_REL_H_ +#define SRC_CPP_COMM_PAX_REL_H_ + +// Oid of pg_ext_aux.pg_pax_tables +#define PAX_TABLES_RELATION_ID 7061 +#define PAX_TABLES_RELID_INDEX_ID 7047 + +#define PAX_TABLE_AM_OID 7047 +#define PAX_AMNAME "pax" +#define PAX_AM_HANDLER_OID 7600 +#define PAX_AM_HANDLER_NAME "pax_tableam_handler" + +#define PAX_AUX_STATS_IN_OID 7601 +#define PAX_AUX_STATS_OUT_OID 7602 +#define PAX_AUX_STATS_TYPE_OID 7603 +#define PAX_AUX_STATS_TYPE_NAME "paxauxstats" + +#define PAX_FASTSEQUENCE_OID 7604 +#define 
PAX_FASTSEQUENCE_INDEX_OID 7605 + +#define PG_PAX_FASTSEQUENCE_NAMESPACE "pg_ext_aux" +#define PG_PAX_FASTSEQUENCE_TABLE "pg_pax_fastsequence" +#define PG_PAX_FASTSEQUENCE_INDEX_NAME "pg_pax_fastsequence_objid_idx" + +#define AMHandlerIsPAX(amhandler) ((amhandler) == PAX_AM_HANDLER_OID) +#define RelationIsPAX(relation) AMHandlerIsPAX((relation)->rd_amhandler) + +#endif // SRC_CPP_COMM_PAX_REL_H_ \ No newline at end of file diff --git a/contrib/pax_storage/src/cpp/comm/paxc_wrappers.cc b/contrib/pax_storage/src/cpp/comm/paxc_wrappers.cc index c968ae70940..219b7897259 100644 --- a/contrib/pax_storage/src/cpp/comm/paxc_wrappers.cc +++ b/contrib/pax_storage/src/cpp/comm/paxc_wrappers.cc @@ -155,21 +155,6 @@ char *BuildPaxDirectoryPath(RelFileNode rd_node, BackendId rd_backend) { return paxrelpath; } -// BuildPaxFilePath: function used to build pax storage directory path following -// pg convension, for example base/{database_oid}/{blocks_relid}_pax. parameter -// rel IN Relation information. parameter block_id IN micro-partition block id. -// return palloc'd pax storage directory path. 
-char *BuildPaxFilePath(Relation rel, const char *block_id) { - char *relpath = NULL; - char *filepath = NULL; - - relpath = BuildPaxDirectoryPath(rel->rd_node, rel->rd_backend); - Assert(relpath[0] != '\0'); - filepath = psprintf("%s/%s", relpath, block_id); - pfree(relpath); - return filepath; -} - static void UnlinkIfExistsFname(const char *fname, bool isdir, int elevel) { if (isdir) { if (rmdir(fname) != 0 && errno != ENOENT) @@ -224,12 +209,11 @@ static void DeletePaxDirectoryPathRecursive( } } -bool MinMaxGetStrategyProcinfo(Oid atttypid, Oid *procid, FmgrInfo *finfo, StrategyNumber strategynum) +bool MinMaxGetStrategyProcinfo(Oid atttypid, Oid subtype, Oid *opfamily, FmgrInfo *finfo, StrategyNumber strategynum) { FmgrInfo dummy; HeapTuple tuple; Oid opclass; - Oid opfamily; Oid oprid; RegProcedure opcode; bool isNull; @@ -238,10 +222,10 @@ bool MinMaxGetStrategyProcinfo(Oid atttypid, Oid *procid, FmgrInfo *finfo, Strat if (!OidIsValid(opclass)) return false; - opfamily = get_opclass_family(opclass); - tuple = SearchSysCache4(AMOPSTRATEGY, ObjectIdGetDatum(opfamily), - ObjectIdGetDatum(atttypid), + *opfamily = get_opclass_family(opclass); + tuple = SearchSysCache4(AMOPSTRATEGY, ObjectIdGetDatum(*opfamily), ObjectIdGetDatum(atttypid), + ObjectIdGetDatum(subtype), Int16GetDatum(strategynum)); if (!HeapTupleIsValid(tuple)) @@ -257,8 +241,6 @@ bool MinMaxGetStrategyProcinfo(Oid atttypid, Oid *procid, FmgrInfo *finfo, Strat return false; fmgr_info_cxt(opcode, finfo ? 
finfo : &dummy, CurrentMemoryContext); - *procid = opcode; - return true; } diff --git a/contrib/pax_storage/src/cpp/comm/paxc_wrappers.h b/contrib/pax_storage/src/cpp/comm/paxc_wrappers.h index fc0315a0b22..d1f21f402b1 100644 --- a/contrib/pax_storage/src/cpp/comm/paxc_wrappers.h +++ b/contrib/pax_storage/src/cpp/comm/paxc_wrappers.h @@ -8,6 +8,5 @@ void CopyFile(const char *srcsegpath, const char *dstsegpath); void DeletePaxDirectoryPath(const char *dirname, bool delete_topleveldir); void MakedirRecursive(const char *path); char *BuildPaxDirectoryPath(RelFileNode rd_node, BackendId rd_backend); -char *BuildPaxFilePath(Relation rel, const char *block_id); -bool MinMaxGetStrategyProcinfo(Oid atttypid, Oid *procid, FmgrInfo *finfo, StrategyNumber strategynum); +bool MinMaxGetStrategyProcinfo(Oid atttypid, Oid subtype, Oid *opfamily, FmgrInfo *finfo, StrategyNumber strategynum); } // namespace paxc diff --git a/contrib/pax_storage/src/cpp/comm/singleton.h b/contrib/pax_storage/src/cpp/comm/singleton.h index f53922f7b37..ef019b51683 100644 --- a/contrib/pax_storage/src/cpp/comm/singleton.h +++ b/contrib/pax_storage/src/cpp/comm/singleton.h @@ -3,6 +3,9 @@ #include #include #include + +#include "comm/pax_memory.h" + namespace pax { template diff --git a/contrib/pax_storage/src/cpp/contrib/cpp-stub b/contrib/pax_storage/src/cpp/contrib/cpp-stub new file mode 160000 index 00000000000..93d20c639a9 --- /dev/null +++ b/contrib/pax_storage/src/cpp/contrib/cpp-stub @@ -0,0 +1 @@ +Subproject commit 93d20c639a99fe93068692803aeb1982ea10dd6c diff --git a/contrib/pax_storage/src/cpp/contrib/googlebench b/contrib/pax_storage/src/cpp/contrib/googlebench new file mode 160000 index 00000000000..c2de5261302 --- /dev/null +++ b/contrib/pax_storage/src/cpp/contrib/googlebench @@ -0,0 +1 @@ +Subproject commit c2de5261302fa307ebe06b24c0fc30653bed5e17 diff --git a/contrib/pax_storage/src/cpp/contrib/tabulate b/contrib/pax_storage/src/cpp/contrib/tabulate new file mode 160000 index 
00000000000..59f1c648070 --- /dev/null +++ b/contrib/pax_storage/src/cpp/contrib/tabulate @@ -0,0 +1 @@ +Subproject commit 59f1c6480705bae8e83800914e6ede4fb077b435 diff --git a/contrib/pax_storage/src/cpp/contrib/zstd b/contrib/pax_storage/src/cpp/contrib/zstd deleted file mode 160000 index 1e6651126b5..00000000000 --- a/contrib/pax_storage/src/cpp/contrib/zstd +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1e6651126b5a0daf860c94d81cef019fb12283d7 diff --git a/contrib/pax_storage/src/cpp/pax_gbench.cc b/contrib/pax_storage/src/cpp/pax_gbench.cc new file mode 100644 index 00000000000..d9e72f48e26 --- /dev/null +++ b/contrib/pax_storage/src/cpp/pax_gbench.cc @@ -0,0 +1,9 @@ +#include + +static void example_benchmark(benchmark::State &state) { + for (auto _ : state) { + } +} +BENCHMARK(example_benchmark); + +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/contrib/pax_storage/src/cpp/pax_gtest.cc b/contrib/pax_storage/src/cpp/pax_gtest.cc new file mode 100644 index 00000000000..6cb6c0c3a8f --- /dev/null +++ b/contrib/pax_storage/src/cpp/pax_gtest.cc @@ -0,0 +1,24 @@ +#include + +#include "stub.h" +#include "comm/gtest_wrappers.h" +#include "comm/cbdb_wrappers.h" + +bool MockMinMaxGetStrategyProcinfo(Oid, Oid, Oid *, FmgrInfo *, + StrategyNumber) { + return false; +} + +// Mock global method which is not link from another libarays +void GlobalMock(Stub *stub) { + stub->set(cbdb::MinMaxGetStrategyProcinfo, MockMinMaxGetStrategyProcinfo); +} + +int main(int argc, char **argv) { + Stub *stub_global; + stub_global = new Stub(); + testing::InitGoogleTest(&argc, argv); + GlobalMock(stub_global); + + return RUN_ALL_TESTS(); +} diff --git a/contrib/pax_storage/src/cpp/pax_gtest_helper.cc b/contrib/pax_storage/src/cpp/pax_gtest_helper.cc new file mode 100644 index 00000000000..a497986848e --- /dev/null +++ b/contrib/pax_storage/src/cpp/pax_gtest_helper.cc @@ -0,0 +1,168 @@ +#include "pax_gtest_helper.h" + +#include "storage/micro_partition.h" + +namespace pax::tests { 
+ +void GenTextBuffer(char *buffer, size_t length) { + for (size_t i = 0; i < length; i++) { + buffer[i] = static_cast(i); + } +} + +void CreateMemoryContext() { + MemoryContext test_memory_context = AllocSetContextCreate( + (MemoryContext)NULL, "TestMemoryContext", 80 * 1024 * 1024, + 80 * 1024 * 1024, 80 * 1024 * 1024); + MemoryContextSwitchTo(test_memory_context); +} + +void CreateTestResourceOwner() { + CurrentResourceOwner = ResourceOwnerCreate(NULL, "TestResourceOwner"); +} + +void ReleaseTestResourceOwner() { + ResourceOwner tmp_resource_owner = CurrentResourceOwner; + CurrentResourceOwner = NULL; + ResourceOwnerRelease(tmp_resource_owner, RESOURCE_RELEASE_BEFORE_LOCKS, false, + true); + ResourceOwnerRelease(tmp_resource_owner, RESOURCE_RELEASE_LOCKS, false, true); + ResourceOwnerRelease(tmp_resource_owner, RESOURCE_RELEASE_AFTER_LOCKS, false, + true); + ResourceOwnerDelete(tmp_resource_owner); +} + +static TupleDesc CreateTestTupleDesc() { + auto tuple_desc = reinterpret_cast(cbdb::Palloc0( + sizeof(TupleDescData) + sizeof(FormData_pg_attribute) * COLUMN_NUMS)); + + tuple_desc->natts = COLUMN_NUMS; + tuple_desc->attrs[0] = {.atttypid = TEXTOID, + .attlen = -1, + .attbyval = false, + .attalign = TYPALIGN_DOUBLE, + .attisdropped = false, + .attcollation = DEFAULT_COLLATION_OID}; + + tuple_desc->attrs[1] = {.atttypid = TEXTOID, + .attlen = -1, + .attbyval = false, + .attalign = TYPALIGN_DOUBLE, + .attisdropped = false, + .attcollation = DEFAULT_COLLATION_OID}; + + tuple_desc->attrs[2] = {.atttypid = INT4OID, + .attlen = 4, + .attbyval = true, + .attalign = TYPALIGN_INT, + .attisdropped = false, + .attcollation = InvalidOid}; + return tuple_desc; +} + +TupleTableSlot *CreateTestTupleTableSlot(bool with_value) { + TupleTableSlot *tuple_slot = nullptr; + TupleDesc tuple_desc = nullptr; + + tuple_desc = CreateTestTupleDesc(); + + tuple_slot = MakeTupleTableSlot(tuple_desc, &TTSOpsVirtual); + + if (with_value) { + char column_buff[COLUMN_SIZE * 2]; + 
GenTextBuffer(column_buff, COLUMN_SIZE); + GenTextBuffer(column_buff + COLUMN_SIZE, COLUMN_SIZE); + + tuple_slot->tts_values[0] = + cbdb::DatumFromCString(column_buff, COLUMN_SIZE); + tuple_slot->tts_values[1] = + cbdb::DatumFromCString(column_buff + COLUMN_SIZE, COLUMN_SIZE); + tuple_slot->tts_values[2] = cbdb::Int32ToDatum(INT32_COLUMN_VALUE); + tuple_slot->tts_isnull[0] = false; + tuple_slot->tts_isnull[1] = false; + tuple_slot->tts_isnull[2] = false; + } + + return tuple_slot; +} + +static bool VerifyTestNonFixed(Datum datum, bool is_null) { + struct varlena *vl, *tunpacked; + int read_len; + char *read_data; + char column_buff[COLUMN_SIZE]; + + GenTextBuffer(column_buff, COLUMN_SIZE); + + if (is_null) { + return false; + } + + vl = (struct varlena *)DatumGetPointer(datum); + tunpacked = pg_detoast_datum_packed(vl); + if ((Pointer)vl != (Pointer)tunpacked) { + return false; + } + + read_len = VARSIZE(tunpacked); + read_data = VARDATA_ANY(tunpacked); + + if (read_len != COLUMN_SIZE + VARHDRSZ) { + return false; + } + + if (std::memcmp(read_data, column_buff, COLUMN_SIZE) != 0) { + return false; + } + return true; +} + +static bool VerifyTestFixed(Datum datum, bool is_null) { + return !is_null && cbdb::DatumToInt32(datum) == INT32_COLUMN_VALUE; +} + +bool VerifyTestTupleTableSlot(TupleTableSlot *tuple_slot) { + bool ok = true; + + if (!tuple_slot) { + return false; + } + + ok &= + VerifyTestNonFixed(tuple_slot->tts_values[0], tuple_slot->tts_isnull[0]); + ok &= + VerifyTestNonFixed(tuple_slot->tts_values[1], tuple_slot->tts_isnull[1]); + ok &= VerifyTestFixed(tuple_slot->tts_values[2], tuple_slot->tts_isnull[2]); + return ok; +} + +bool VerifyTestTupleTableSlot(TupleTableSlot *tuple_slot, int attrno) { + Assert(attrno <= 3 && attrno > 0); + + if (!tuple_slot) { + return false; + } + + if (attrno <= 2) { + return VerifyTestNonFixed(tuple_slot->tts_values[attrno - 1], + tuple_slot->tts_isnull[attrno - 1]); + } else { + return 
VerifyTestFixed(tuple_slot->tts_values[attrno - 1], + tuple_slot->tts_isnull[attrno - 1]); + } +} + +void DeleteTestTupleTableSlot(TupleTableSlot *tuple_slot) { + cbdb::Pfree(tuple_slot->tts_tupleDescriptor); + cbdb::Pfree(tuple_slot); +} + +std::vector CreateTestSchemaTypes() { + std::vector types; + types.emplace_back(pax::orc::proto::Type_Kind::Type_Kind_STRING); + types.emplace_back(pax::orc::proto::Type_Kind::Type_Kind_STRING); + types.emplace_back(pax::orc::proto::Type_Kind::Type_Kind_INT); + return types; +} + +} // namespace pax::tests diff --git a/contrib/pax_storage/src/cpp/pax_gtest_helper.h b/contrib/pax_storage/src/cpp/pax_gtest_helper.h new file mode 100644 index 00000000000..b82bf97e037 --- /dev/null +++ b/contrib/pax_storage/src/cpp/pax_gtest_helper.h @@ -0,0 +1,26 @@ +#pragma once +#include "comm/cbdb_api.h" + +#include + +#include "storage/proto/proto_wrappers.h" + +namespace pax::tests { + +// 3 clomun - string(len 100), string(len 100), int(len 4) +#define COLUMN_NUMS 3 +#define COLUMN_SIZE 100 +#define INT32_COLUMN_VALUE 0x123 +#define INT32_COLUMN_VALUE_DEFAULT 0x001 + +extern void CreateMemoryContext(); +extern void CreateTestResourceOwner(); +extern void ReleaseTestResourceOwner(); +extern TupleTableSlot *CreateTestTupleTableSlot(bool with_value = true); +extern bool VerifyTestTupleTableSlot(TupleTableSlot *tuple_slot); +extern bool VerifyTestTupleTableSlot(TupleTableSlot *tuple_slot, int attrno); +extern void DeleteTestTupleTableSlot(TupleTableSlot *tuple_slot); + +extern void GenTextBuffer(char *buffer, size_t length); +extern std::vector CreateTestSchemaTypes(); +} // namespace pax::tests diff --git a/contrib/pax_storage/src/cpp/storage/cache/pax_cache.cc b/contrib/pax_storage/src/cpp/storage/cache/pax_cache.cc new file mode 100644 index 00000000000..9b135ee5a75 --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/cache/pax_cache.cc @@ -0,0 +1,14 @@ +#include "storage/cache/pax_cache.h" + +namespace pax { + +bool PaxCache::Status::Ok() 
const { return ok_; } + +std::string PaxCache::Status::Error() { return error_msg_; } + +void PaxCache::Status::SetError(const std::string &error_msg) { + ok_ = false; + error_msg_ = error_msg; +} + +}; // namespace pax \ No newline at end of file diff --git a/contrib/pax_storage/src/cpp/storage/cache/pax_cache.h b/contrib/pax_storage/src/cpp/storage/cache/pax_cache.h new file mode 100644 index 00000000000..121c78342a0 --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/cache/pax_cache.h @@ -0,0 +1,64 @@ +#pragma once + +#include +#include + +namespace pax { + +class PaxCache { + public: + struct Status { + friend class PaxCache; + + bool Ok() const; + + std::string Error(); + + void SetError(const std::string &error_msg); + + private: + bool ok_ = true; + std::string error_msg_; + }; + + struct BatchBuffer { + const char *buffer = nullptr; + size_t buffer_len = 0; + const char *meta = nullptr; + size_t meta_len = 0; + + bool not_exist = false; + }; + + virtual ~PaxCache() = default; + + virtual Status Initialize() = 0; + + virtual Status Put(const std::string &key, + const BatchBuffer &batch_buffer) = 0; + + virtual Status Put(const std::string &key, + const std::vector> &buffers, + const std::pair &meta) = 0; + + virtual Status Exists(const std::string &key, bool *has) = 0; + + virtual Status Get(const std::string &key, BatchBuffer &batch_buffer) = 0; + + virtual Status Get(const std::vector &keys, + std::vector &batchs) = 0; + + virtual Status Release(const std::string &key) = 0; + + virtual Status Release(const std::vector &keys) = 0; + + virtual Status Delete(const std::string &key) = 0; + + virtual Status Delete(const std::vector &key) = 0; + + virtual Status Destroy() = 0; + + virtual size_t KeySizeLimit() = 0; +}; + +} // namespace pax diff --git a/contrib/pax_storage/src/cpp/storage/cache/pax_cache_test.cc b/contrib/pax_storage/src/cpp/storage/cache/pax_cache_test.cc new file mode 100644 index 00000000000..15ac2015dbb --- /dev/null +++ 
b/contrib/pax_storage/src/cpp/storage/cache/pax_cache_test.cc @@ -0,0 +1,347 @@ +#ifdef ENABLE_PLASMA +#include "plasma/store.h" +#endif + +#include + +#include + +#include + +// #include "comm/gtest_wrappers.h" +#include "pax_gtest_helper.h" +#include "storage/cache/pax_cache.h" +#include "storage/cache/pax_plasma_cache.h" +#ifdef ENABLE_PLASMA + +namespace pax::tests { + +#define CACHE_DATA_LEN 100 +#define CACHE_META_LEN 20 + +class PaxCacheTest : public ::testing::Test { + void SetUp() override { + plasma_server_ = std::thread([this] { + plasma::StartServer( + plasma_socket_ /* socket_name */, "" /* plasma_directory */, + false /* hugepages_enabled */, nullptr /* external_store */, + 10 * 1024 * 1024 /* system_memory */, + PLASMA_INFO /* plasmaLogSeverity */); + plasma::ShutdownServer(); + }); + sleep(1); + } + + void TearDown() override { + plasma::StopServer(); + plasma_server_.join(); + } + + protected: + static void PutKey(PaxCache *pax_cache, const std::string &key, + const PaxCache::BatchBuffer &input) { + auto status = pax_cache->Put(key, input); + ASSERT_TRUE(status.Ok()) << "fail to put key: " << key << status.Error(); + } + + static void Exist(PaxCache *pax_cache, const std::string &key, bool exist) { + bool exist_rc = false; + auto status = pax_cache->Exists(key, &exist_rc); + ASSERT_TRUE(status.Ok()) << status.Error(); + ASSERT_TRUE(exist ? 
exist_rc : !exist_rc) << "key: " << key << " exist"; + }; + + protected: + const int64_t client_memory_quota_ = 5 * 1024 * 1024; + char plasma_socket_[1024] = "/tmp/plasma"; + std::thread plasma_server_; +}; + +TEST_F(PaxCacheTest, TestCacheInterface) { + PaxCache *pax_cache; + PaxPlasmaCache::CacheOptions cache_options; + PaxCache::Status status; + PaxCache::BatchBuffer batch_buffer{0}; + + cache_options.domain_socket = std::string(plasma_socket_); + cache_options.client_name = "CLI1"; + cache_options.memory_quota = client_memory_quota_; + cache_options.waitting_ms = 0; + + pax_cache = new PaxPlasmaCache(cache_options); + status = pax_cache->Initialize(); + ASSERT_TRUE(status.Ok()) << status.Error(); + + // create 3 key + char data[CACHE_DATA_LEN]; + char meta[CACHE_META_LEN]; + GenTextBuffer(data, CACHE_DATA_LEN); + GenTextBuffer(meta, CACHE_META_LEN); + + batch_buffer.buffer = data; + batch_buffer.buffer_len = CACHE_DATA_LEN; + batch_buffer.meta = nullptr; + batch_buffer.meta = 0; + + PutKey(pax_cache, "key1", batch_buffer); + batch_buffer.meta = meta; + batch_buffer.meta_len = CACHE_META_LEN; + + PutKey(pax_cache, "key2", batch_buffer); + PutKey(pax_cache, "key3", batch_buffer); + + Exist(pax_cache, "key1", true); + Exist(pax_cache, "key2", true); + Exist(pax_cache, "key3", true); + + batch_buffer.buffer = nullptr; + batch_buffer.buffer_len = 0; + batch_buffer.meta = nullptr; + batch_buffer.meta_len = 0; + + // get + release + status = pax_cache->Get("key1", batch_buffer); + ASSERT_TRUE(status.Ok()) << status.Error(); + EXPECT_EQ(CACHE_DATA_LEN, batch_buffer.buffer_len); + EXPECT_EQ(0, batch_buffer.meta_len); + EXPECT_EQ(0, std::memcmp(batch_buffer.buffer, data, CACHE_DATA_LEN)); + // still will alloc a meta address with size 0 + EXPECT_NE(nullptr, batch_buffer.meta); + + status = pax_cache->Get("key2", batch_buffer); + ASSERT_TRUE(status.Ok()) << status.Error(); + EXPECT_EQ(CACHE_DATA_LEN, batch_buffer.buffer_len); + EXPECT_EQ(CACHE_META_LEN, 
batch_buffer.meta_len); + EXPECT_EQ(0, std::memcmp(batch_buffer.buffer, data, CACHE_DATA_LEN)); + EXPECT_EQ(0, std::memcmp(batch_buffer.meta, meta, CACHE_META_LEN)); + + status = pax_cache->Get("key3", batch_buffer); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Release("key1"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + std::vector release_list = {"key2", "key3"}; + status = pax_cache->Release(release_list); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Delete("key1"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + std::vector delete_list = {"key2", "key3"}; + status = pax_cache->Delete(delete_list); + ASSERT_TRUE(status.Ok()) << status.Error(); + + Exist(pax_cache, "key1", false); + Exist(pax_cache, "key2", false); + Exist(pax_cache, "key3", false); + + status = pax_cache->Destroy(); + ASSERT_TRUE(status.Ok()) << status.Error(); + + delete pax_cache; +} + +TEST_F(PaxCacheTest, TestLRUReplace) { + PaxCache *pax_cache; + PaxPlasmaCache::CacheOptions cache_options; + PaxCache::Status status; + PaxCache::BatchBuffer batch_buffer{0}; + + cache_options.domain_socket = std::string(plasma_socket_); + cache_options.client_name = "CLI1"; + cache_options.memory_quota = CACHE_DATA_LEN * 3; + cache_options.waitting_ms = 0; + + pax_cache = new PaxPlasmaCache(cache_options); + status = pax_cache->Initialize(); + ASSERT_TRUE(status.Ok()) << status.Error(); + + char data[CACHE_DATA_LEN]; + GenTextBuffer(data, CACHE_DATA_LEN); + + batch_buffer.buffer = data; + batch_buffer.buffer_len = CACHE_DATA_LEN; + batch_buffer.meta = nullptr; + batch_buffer.meta_len = 0; + + PutKey(pax_cache, "key1", batch_buffer); + PutKey(pax_cache, "key2", batch_buffer); + PutKey(pax_cache, "key3", batch_buffer); + PutKey(pax_cache, "key4", batch_buffer); + PutKey(pax_cache, "key5", batch_buffer); + + status = pax_cache->Get("key1", batch_buffer); + ASSERT_TRUE(status.Ok()) << status.Error(); + ASSERT_TRUE(batch_buffer.not_exist); + + status = 
pax_cache->Get("key2", batch_buffer); + ASSERT_TRUE(status.Ok()) << status.Error(); + ASSERT_TRUE(batch_buffer.not_exist); + + status = pax_cache->Get("key3", batch_buffer); + ASSERT_TRUE(status.Ok()) << status.Error(); + ASSERT_FALSE(batch_buffer.not_exist); + + status = pax_cache->Release("key3"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Delete("key3"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Get("key4", batch_buffer); + ASSERT_TRUE(status.Ok()) << status.Error(); + ASSERT_FALSE(batch_buffer.not_exist); + + status = pax_cache->Release("key4"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Delete("key4"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Get("key5", batch_buffer); + ASSERT_TRUE(status.Ok()) << status.Error(); + ASSERT_FALSE(batch_buffer.not_exist); + + status = pax_cache->Release("key5"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Delete("key5"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Destroy(); + ASSERT_TRUE(status.Ok()) << status.Error(); + + delete pax_cache; +} + +TEST_F(PaxCacheTest, TestGetNoExist) { + PaxCache *pax_cache; + PaxPlasmaCache::CacheOptions cache_options; + PaxCache::Status status; + PaxCache::BatchBuffer batch_buffer{0}; + + cache_options.domain_socket = std::string(plasma_socket_); + cache_options.client_name = "CLI1"; + cache_options.memory_quota = client_memory_quota_; + cache_options.waitting_ms = 0; + + pax_cache = new PaxPlasmaCache(cache_options); + status = pax_cache->Initialize(); + ASSERT_TRUE(status.Ok()) << status.Error(); + + char data[CACHE_DATA_LEN]; + char meta[CACHE_META_LEN]; + GenTextBuffer(data, CACHE_DATA_LEN); + GenTextBuffer(meta, CACHE_META_LEN); + + batch_buffer.buffer = data; + batch_buffer.buffer_len = CACHE_DATA_LEN; + batch_buffer.meta = meta; + batch_buffer.meta_len = CACHE_META_LEN; + + PutKey(pax_cache, "key1", batch_buffer); + 
Exist(pax_cache, "key1", true); + + status = pax_cache->Get("key1", batch_buffer); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Release("key1"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Get("abc", batch_buffer); + ASSERT_TRUE(status.Ok()); + ASSERT_TRUE(batch_buffer.not_exist); + + std::vector batch_buffers; + status = pax_cache->Get({"key1", "abc"}, batch_buffers); + ASSERT_TRUE(status.Ok()) << status.Error(); + ASSERT_FALSE(batch_buffers[0].not_exist); + ASSERT_TRUE(batch_buffers[1].not_exist); + + status = pax_cache->Release("key1"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Delete("key1"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + status = pax_cache->Destroy(); + ASSERT_TRUE(status.Ok()) << status.Error(); + + delete pax_cache; +} + +TEST_F(PaxCacheTest, TestDifferentClientDelete) { + PaxCache *pax_cache; + PaxPlasmaCache::CacheOptions cache_options; + PaxCache::Status status; + PaxCache::BatchBuffer batch_buffer{0}; + + cache_options.domain_socket = std::string(plasma_socket_); + cache_options.client_name = "CLI1"; + cache_options.memory_quota = client_memory_quota_; + cache_options.waitting_ms = 0; + + pax_cache = new PaxPlasmaCache(cache_options); + status = pax_cache->Initialize(); + ASSERT_TRUE(status.Ok()) << status.Error(); + + char data[CACHE_DATA_LEN]; + char meta[CACHE_META_LEN]; + GenTextBuffer(data, CACHE_DATA_LEN); + GenTextBuffer(meta, CACHE_META_LEN); + + batch_buffer.buffer = data; + batch_buffer.buffer_len = CACHE_DATA_LEN; + batch_buffer.meta = meta; + batch_buffer.meta_len = CACHE_META_LEN; + + PutKey(pax_cache, "key1", batch_buffer); + Exist(pax_cache, "key1", true); + + // CLI1 destroy + status = pax_cache->Destroy(); + ASSERT_TRUE(status.Ok()) << status.Error(); + delete pax_cache; + + // create CLI2 + cache_options.client_name = "CLI2"; + pax_cache = new PaxPlasmaCache(cache_options); + + status = pax_cache->Initialize(); + ASSERT_TRUE(status.Ok()) 
<< status.Error(); + + // check exist + Exist(pax_cache, "key1", true); + + // get key1 + batch_buffer.buffer = nullptr; + batch_buffer.buffer_len = 0; + batch_buffer.meta = nullptr; + batch_buffer.meta_len = 0; + + status = pax_cache->Get("key1", batch_buffer); + ASSERT_TRUE(status.Ok()) << status.Error(); + EXPECT_EQ(CACHE_DATA_LEN, batch_buffer.buffer_len); + EXPECT_EQ(CACHE_META_LEN, batch_buffer.meta_len); + EXPECT_EQ(0, std::memcmp(batch_buffer.buffer, data, CACHE_DATA_LEN)); + EXPECT_EQ(0, std::memcmp(batch_buffer.meta, meta, CACHE_META_LEN)); + + status = pax_cache->Release("key1"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + // delete key1 + status = pax_cache->Delete("key1"); + ASSERT_TRUE(status.Ok()) << status.Error(); + + // should delete success + Exist(pax_cache, "key1", false); + + status = pax_cache->Destroy(); + ASSERT_TRUE(status.Ok()) << status.Error(); + delete pax_cache; +} + +} // namespace pax::tests + +#endif // ENABLE_PLASMA diff --git a/contrib/pax_storage/src/cpp/storage/cache/pax_plasma_cache.cc b/contrib/pax_storage/src/cpp/storage/cache/pax_plasma_cache.cc new file mode 100644 index 00000000000..2d9e941a1d2 --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/cache/pax_plasma_cache.cc @@ -0,0 +1,276 @@ +#include "storage/cache/pax_plasma_cache.h" + +#ifdef ENABLE_PLASMA +#include +#include +#endif // ENABLE_PLASMA + +#include + +#include "comm/cbdb_wrappers.h" + +#ifdef ENABLE_PLASMA + +namespace pax { + +static inline plasma::ObjectID KeyToPlasmaId(const std::string &key, + size_t key_size_limit) { + plasma::ObjectID key_id; + + Assert(key.length() <= key_size_limit); + memcpy(key_id.mutable_data(), key.c_str(), key.length()); + memset(key_id.mutable_data() + key.length(), 0, + key_size_limit - key.length()); + + return key_id; +} + +static inline std::string PlasmaIdToKey(const plasma::ObjectID &key_id) { + std::string key; + key = key_id.binary(); + return key; +} + +PaxPlasmaCache::PaxPlasmaCache(const CacheOptions 
&option) + : PaxCache(), + options_(option), + is_initialized_(false), + plasma_client_(PAX_NEW()) {} + +PaxPlasmaCache::~PaxPlasmaCache() { PAX_DELETE(plasma_client_); }; + +PaxCache::Status PaxPlasmaCache::Initialize() { + PaxCache::Status status; + if (is_initialized_) { + status.SetError("Don't initialize twice."); + return status; + } + + auto plasma_status = plasma_client_->Connect( + options_.domain_socket /*store_socket_name*/, "" /*manager_socket_name*/, + 0 /*release_delay*/, 3 /*num_retries*/); + CHECK_PLASMA_STATUS(plasma_status, status); + + if (options_.memory_quota != 0) { + plasma_status = plasma_client_->SetClientOptions(options_.client_name, + options_.memory_quota); + CHECK_PLASMA_STATUS(plasma_status, status); + } + + is_initialized_ = true; + return status; +} + +PaxCache::Status PaxPlasmaCache::Put(const std::string &key, + const BatchBuffer &batch_buffer) { + PaxCache::Status status; + plasma::ObjectID key_id; + std::shared_ptr plasma_buffer; + + assert(is_initialized_); + assert(key.length() <= KeySizeLimit()); + key_id = KeyToPlasmaId(key, KeySizeLimit()); + + plasma::Status plasma_status = plasma_client_->Create( + key_id, batch_buffer.buffer_len, (const uint8_t *)batch_buffer.meta, + batch_buffer.meta_len, &plasma_buffer); + CHECK_PLASMA_STATUS(plasma_status, status); + + assert((size_t)plasma_buffer->size() == batch_buffer.buffer_len); + + memcpy(plasma_buffer->mutable_data(), batch_buffer.buffer, + batch_buffer.buffer_len); + + plasma_status = plasma_client_->Seal(key_id); + CHECK_PLASMA_STATUS(plasma_status, status); + + plasma_status = plasma_client_->Release(key_id); + CHECK_PLASMA_STATUS(plasma_status, status); + + return status; +} + +PaxCache::Status PaxPlasmaCache::Put( + const std::string &key, + const std::vector> &buffers, + const std::pair &meta) { + PaxCache::Status status; + plasma::ObjectID key_id; + std::shared_ptr plasma_buffer; + size_t total_size = 0; + size_t data_offset = 0; + + assert(is_initialized_); + 
assert(key.length() <= KeySizeLimit()); + key_id = KeyToPlasmaId(key, KeySizeLimit()); + + for (auto &pair : buffers) { + total_size += pair.second; + } + + plasma::Status plasma_status = + plasma_client_->Create(key_id, total_size, (const uint8_t *)meta.first, + meta.second, &plasma_buffer); + CHECK_PLASMA_STATUS(plasma_status, status); + + assert((size_t)plasma_buffer->size() == total_size); + + for (auto &pair : buffers) { + memcpy(plasma_buffer->mutable_data() + data_offset, pair.first, + pair.second); + data_offset += pair.second; + } + Assert(data_offset == total_size); + + plasma_status = plasma_client_->Seal(key_id); + CHECK_PLASMA_STATUS(plasma_status, status); + + plasma_status = plasma_client_->Release(key_id); + CHECK_PLASMA_STATUS(plasma_status, status); + return status; +} + +PaxCache::Status PaxPlasmaCache::Exists(const std::string &key, bool *has) { + PaxCache::Status status; + plasma::ObjectID key_id; + + assert(is_initialized_); + assert(key.length() <= KeySizeLimit()); + key_id = KeyToPlasmaId(key, KeySizeLimit()); + + plasma::Status plasma_status = plasma_client_->Contains(key_id, has); + CHECK_PLASMA_STATUS(plasma_status, status); + + return status; +} + +PaxCache::Status PaxPlasmaCache::Get(const std::string &key, + BatchBuffer &batch_buffer) { + PaxCache::Status status; + plasma::ObjectID key_id; + plasma::ObjectBuffer obj_buffer; + + assert(is_initialized_); + assert(key.length() <= KeySizeLimit()); + key_id = KeyToPlasmaId(key, KeySizeLimit()); + auto plasma_status = + plasma_client_->Get(&key_id, 1, options_.waitting_ms, &obj_buffer); + CHECK_PLASMA_STATUS(plasma_status, status); + + if (!obj_buffer.data) { + // not exist in server + batch_buffer.not_exist = true; + return status; + } + + batch_buffer.buffer = (const char *)obj_buffer.data->data(); + batch_buffer.buffer_len = obj_buffer.data->size(); + batch_buffer.meta = (const char *)obj_buffer.metadata->data(); + batch_buffer.meta_len = obj_buffer.metadata->size(); + 
batch_buffer.not_exist = false; + + return status; +} + +PaxCache::Status PaxPlasmaCache::Get(const std::vector &keys, + std::vector &batchs) { + PaxCache::Status status; + plasma::ObjectID key_ids[keys.size()]; + plasma::ObjectBuffer obj_buffers[keys.size()]; + + assert(is_initialized_); + + for (size_t i = 0; i < keys.size(); i++) { + assert(keys[i].length() <= KeySizeLimit()); + key_ids[i] = KeyToPlasmaId(keys[i], KeySizeLimit()); + } + + auto plasma_status = plasma_client_->Get(key_ids, keys.size(), + options_.waitting_ms, obj_buffers); + CHECK_PLASMA_STATUS(plasma_status, status); + + for (size_t i = 0; i < keys.size(); i++) { + BatchBuffer batch_buffer; + if (!obj_buffers[i].data) { + batch_buffer.not_exist = true; + } else { + batch_buffer.not_exist = false; + batch_buffer.buffer = (const char *)obj_buffers[i].data->data(); + batch_buffer.buffer_len = obj_buffers[i].data->size(); + batch_buffer.meta = (const char *)obj_buffers[i].metadata->data(); + batch_buffer.meta_len = obj_buffers[i].metadata->size(); + } + batchs.emplace_back(batch_buffer); + } + + return status; +} + +PaxCache::Status PaxPlasmaCache::Release(const std::string &key) { + PaxCache::Status status; + plasma::ObjectID key_id; + + assert(is_initialized_); + assert(key.length() <= KeySizeLimit()); + key_id = KeyToPlasmaId(key, KeySizeLimit()); + auto plasma_status = plasma_client_->Release(key_id); + CHECK_PLASMA_STATUS(plasma_status, status); + return status; +} + +PaxCache::Status PaxPlasmaCache::Release(const std::vector &keys) { + PaxCache::Status status; + plasma::ObjectID key_id; + + assert(is_initialized_); + for (const auto &key : keys) { + key_id = KeyToPlasmaId(key, KeySizeLimit()); + auto plasma_status = plasma_client_->Release(key_id); + CHECK_PLASMA_STATUS(plasma_status, status); + } + return status; +} + +PaxCache::Status PaxPlasmaCache::Delete(const std::string &key) { + PaxCache::Status status; + plasma::ObjectID key_id; + + assert(is_initialized_); + assert(key.length() <= 
KeySizeLimit()); + key_id = KeyToPlasmaId(key, KeySizeLimit()); + plasma::Status plasma_status = plasma_client_->Delete(key_id); + CHECK_PLASMA_STATUS(plasma_status, status); + + return status; +} + +PaxCache::Status PaxPlasmaCache::Delete(const std::vector &keys) { + PaxCache::Status status; + std::vector key_ids; + std::vector obj_buffers; + + assert(is_initialized_); + for (const auto &key : keys) { + key_ids.emplace_back(KeyToPlasmaId(key, KeySizeLimit())); + } + + plasma::Status plasma_status = plasma_client_->Delete(key_ids); + CHECK_PLASMA_STATUS(plasma_status, status); + + return status; +} + +PaxCache::Status PaxPlasmaCache::Destroy() { + PaxCache::Status status; + plasma::Status plasma_status = plasma_client_->Disconnect(); + assert(is_initialized_); + is_initialized_ = false; + CHECK_PLASMA_STATUS(plasma_status, status); + return status; +} + +size_t PaxPlasmaCache::KeySizeLimit() { return plasma::kUniqueIDSize; } + +} // namespace pax + +#endif // ENABLE_PLASMA diff --git a/contrib/pax_storage/src/cpp/storage/cache/pax_plasma_cache.h b/contrib/pax_storage/src/cpp/storage/cache/pax_plasma_cache.h new file mode 100644 index 00000000000..ab3555a4461 --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/cache/pax_plasma_cache.h @@ -0,0 +1,82 @@ +#pragma once + +#ifdef ENABLE_PLASMA + +#include +#include + +#include "storage/cache/pax_cache.h" + +namespace plasma { +class PlasmaClient; +} + +namespace pax { + +#define CHECK_PLASMA_STATUS(plasma_status, status_rc) \ + do { \ + if (!(plasma_status).ok()) { \ + (status_rc).SetError((plasma_status).ToString()); \ + return (status_rc); \ + } \ + } while (0); + +class PaxPlasmaCache : public PaxCache { + public: + struct CacheOptions { + std::string domain_socket; + // client name + memory quota will limit current client memory used + // if memory_quota_ is 0 means no limit + // Notice that: if current plasma server capcity LT memory quota + // Then it will make Initialize failed + std::string client_name; + 
size_t memory_quota = 0; + + // the waitting time after `Get` call failed + // during this period, if the same `key` is put, + // the data will be obtained + size_t waitting_ms = 0; + }; + + explicit PaxPlasmaCache(const CacheOptions &option); + + ~PaxPlasmaCache() override; + + PaxCache::Status Initialize() override; + + PaxCache::Status Put(const std::string &key, + const BatchBuffer &batch_buffer) override; + + PaxCache::Status Put(const std::string &key, + const std::vector> &buffers, + const std::pair &meta) override; + + PaxCache::Status Exists(const std::string &key, bool *has) override; + + PaxCache::Status Get(const std::string &key, + BatchBuffer &batch_buffer) override; + + PaxCache::Status Get(const std::vector &keys, + std::vector &batchs) override; + + PaxCache::Status Release(const std::string &key) override; + + PaxCache::Status Release(const std::vector &keys) override; + + PaxCache::Status Delete(const std::string &key) override; + + PaxCache::Status Delete(const std::vector &keys) override; + + PaxCache::Status Destroy() override; + + size_t KeySizeLimit() override; + + private: + CacheOptions options_; + bool is_initialized_; + plasma::PlasmaClient *plasma_client_; +}; + +} // namespace pax + +#endif // ENABLE_PLASMA diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_column.cc index 04eba6feb42..2ad0a7bfa57 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_column.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_column.cc @@ -6,125 +6,87 @@ #include #include -#include "comm/pax_defer.h" +#include "storage/columns/pax_column_traits.h" +#include "storage/pax_defined.h" namespace pax { PaxColumn::PaxColumn() : null_bitmap_(nullptr), + total_rows_(0), + non_null_rows_(0), encoded_type_(ColumnEncoding_Kind::ColumnEncoding_Kind_NO_ENCODED), - storage_type_(PaxColumnStorageType::kTypeStorageNonVec) {} + compress_level_(0), + type_align_size_(PAX_DATA_NO_ALIGN) {} 
-PaxColumn::~PaxColumn() { - if (null_bitmap_) { - delete null_bitmap_; - } -} +PaxColumn::~PaxColumn() { PAX_DELETE(null_bitmap_); } PaxColumnTypeInMem PaxColumn::GetPaxColumnTypeInMem() const { return PaxColumnTypeInMem::kTypeInvalid; } -void PaxColumn::Clear() { - if (null_bitmap_) { - delete null_bitmap_; - null_bitmap_ = nullptr; - } -} - bool PaxColumn::HasNull() { return null_bitmap_ != nullptr; } -void PaxColumn::SetNulls(DataBuffer *null_bitmap) { +bool PaxColumn::AllNull() const { + return null_bitmap_ && null_bitmap_->Empty(); +} + +void PaxColumn::SetBitmap(Bitmap8 *null_bitmap) { Assert(!null_bitmap_); null_bitmap_ = null_bitmap; } -DataBuffer *PaxColumn::GetNulls() const { return null_bitmap_; } - -std::pair PaxColumn::GetRangeNulls(size_t start_pos, - size_t len) { - Assert(null_bitmap_); - CBDB_CHECK((start_pos + len) <= GetRows(), - cbdb::CException::ExType::kExTypeOutOfRange); +size_t PaxColumn::GetRows() const { return total_rows_; } - static_assert(sizeof(char) == sizeof(bool)); - return std::make_pair(null_bitmap_->GetBuffer() + start_pos, len); -} +size_t PaxColumn::GetNonNullRows() const { return non_null_rows_; } -size_t PaxColumn::GetRows() { - return null_bitmap_ ? 
null_bitmap_->Used() : GetNonNullRows(); -} +void PaxColumn::SetRows(size_t total_rows) { total_rows_ = total_rows; } size_t PaxColumn::GetRangeNonNullRows(size_t start_pos, size_t len) { CBDB_CHECK((start_pos + len) <= GetRows(), cbdb::CException::ExType::kExTypeOutOfRange); - if (null_bitmap_) { - size_t total_non_null = 0; - for (size_t i = start_pos; i < (start_pos + len); i++) { - if ((*null_bitmap_)[i]) { - total_non_null++; - } - } - - return total_non_null; - } else { - return len; + if (!null_bitmap_) return len; + if (len == 0) { + return 0; } + return null_bitmap_->CountBits(start_pos, start_pos + len - 1); +} + +void PaxColumn::CreateNulls(size_t cap) { + Assert(!null_bitmap_); + null_bitmap_ = PAX_NEW(cap); + null_bitmap_->SetN(total_rows_); } void PaxColumn::AppendNull() { if (!null_bitmap_) { - size_t current_rows = GetNonNullRows(); - size_t size = current_rows > DEFAULT_CAPACITY - ? (current_rows / DEFAULT_CAPACITY + 1) * DEFAULT_CAPACITY - : DEFAULT_CAPACITY; - null_bitmap_ = new DataBuffer(size); - null_bitmap_->Brush(current_rows * sizeof(bool)); - memset(null_bitmap_->GetBuffer(), 1, null_bitmap_->Capacity()); + CreateNulls(DEFAULT_CAPACITY); } - - if (null_bitmap_->Available() == 0) { - size_t old_cap = null_bitmap_->Capacity(); - null_bitmap_->ReSize(old_cap * 2); - memset(null_bitmap_->GetAvailableBuffer(), 1, old_cap); - } - - null_bitmap_->Write(false); - null_bitmap_->Brush(sizeof(bool)); + null_bitmap_->Clear(total_rows_); + ++total_rows_; } -void PaxColumn::Append([[maybe_unused]] char *buffer, - [[maybe_unused]] size_t size) { - if (null_bitmap_) { - if (null_bitmap_->Available() == 0) { - size_t old_cap = null_bitmap_->Capacity(); - null_bitmap_->ReSize(old_cap * 2); - memset(null_bitmap_->GetAvailableBuffer(), 1, old_cap); - } - null_bitmap_->Brush(sizeof(bool)); - } +void PaxColumn::Append(char * /*buffer*/, size_t /*size*/) { + if (null_bitmap_) null_bitmap_->Set(total_rows_); + ++total_rows_; + ++non_null_rows_; } -PaxColumn 
*PaxColumn::SetColumnEncodeType(ColumnEncoding_Kind encoding_type) { - encoded_type_ = encoding_type; - return this; -} +size_t PaxColumn::GetAlignSize() const { return type_align_size_; } -PaxColumn *PaxColumn::SetColumnStorageType(PaxColumnStorageType storage_type) { - storage_type_ = storage_type; - return this; +void PaxColumn::SetAlignSize(size_t align_size) { + Assert(align_size > 0 && (align_size & (align_size - 1)) == 0); + type_align_size_ = align_size; } -ColumnEncoding_Kind PaxColumn::GetEncodingType() const { return encoded_type_; } - template -PaxCommColumn::PaxCommColumn(uint64 capacity) : capacity_(capacity) { - data_ = new DataBuffer(capacity * sizeof(T)); +PaxCommColumn::PaxCommColumn(uint32 capacity) { + data_ = PAX_NEW>(capacity * sizeof(T)); } template PaxCommColumn::~PaxCommColumn() { - delete data_; + PAX_DELETE(data_); } template // NOLINT: redirect constructor @@ -132,7 +94,7 @@ PaxCommColumn::PaxCommColumn() : PaxCommColumn(DEFAULT_CAPACITY) {} template void PaxCommColumn::Set(DataBuffer *data) { - delete data_; + PAX_DELETE(data_); data_ = data; } @@ -145,10 +107,10 @@ void PaxCommColumn::Append(char *buffer, size_t size) { // TODO(jiaqizho): Is it necessary to support multiple buffer insertions for // bulk insert push to mirco partition? 
Assert(size == sizeof(T)); - Assert(GetNonNullRows() <= capacity_); + Assert(data_->Capacity() >= sizeof(T)); - if (GetNonNullRows() == capacity_) { - ReSize(capacity_ * 2); + if (data_->Available() == 0) { + data_->ReSize(data_->Used() + size, 2); } data_->Write(buffer_t, sizeof(T)); @@ -156,22 +118,13 @@ void PaxCommColumn::Append(char *buffer, size_t size) { } template -PaxColumnTypeInMem PaxCommColumn::GetPaxColumnTypeInMem() const { - return PaxColumnTypeInMem::kTypeFixed; -} - -template -void PaxCommColumn::Clear() { - PaxColumn::Clear(); - data_->BrushBackAll(); +PaxStorageFormat PaxCommColumn::GetStorageFormat() const { + return PaxStorageFormat::kTypeStorageOrcNonVec; } template -void PaxCommColumn::ReSize(uint64 cap) { - if (capacity_ < cap) { - data_->ReSize(cap * sizeof(T)); - capacity_ = cap; - } +PaxColumnTypeInMem PaxCommColumn::GetPaxColumnTypeInMem() const { + return PaxColumnTypeInMem::kTypeFixed; } template @@ -223,36 +176,30 @@ template class PaxCommColumn; template class PaxCommColumn; template class PaxCommColumn; -PaxNonFixedColumn::PaxNonFixedColumn(uint64 capacity) : estimated_size_(0) { - data_ = new DataBuffer(capacity * sizeof(char) * 100); - lengths_ = new DataBuffer(capacity * sizeof(char)); +PaxNonFixedColumn::PaxNonFixedColumn(uint32 capacity) : estimated_size_(0) { + data_ = PAX_NEW>(capacity * sizeof(char)); + lengths_ = PAX_NEW>(capacity * sizeof(char)); } PaxNonFixedColumn::PaxNonFixedColumn() : PaxNonFixedColumn(DEFAULT_CAPACITY) {} PaxNonFixedColumn::~PaxNonFixedColumn() { - if (data_) { - delete data_; - } - - if (lengths_) { - delete lengths_; - } + PAX_DELETE(data_); + PAX_DELETE(lengths_); } -void PaxNonFixedColumn::Set(DataBuffer *data, DataBuffer *lengths, +void PaxNonFixedColumn::Set(DataBuffer *data, DataBuffer *lengths, size_t total_size) { - if (data_) { - delete data_; - } - - if (lengths_) { - delete lengths_; - } + PAX_DELETE(data_); + PAX_DELETE(lengths_); estimated_size_ = total_size; data_ = data; lengths_ = 
lengths; + BuildOffsets(); +} + +void PaxNonFixedColumn::BuildOffsets() { offsets_.clear(); for (size_t i = 0; i < lengths_->GetSize(); i++) { offsets_.emplace_back(i == 0 ? 0 : offsets_[i - 1] + (*lengths_)[i - 1]); @@ -260,30 +207,30 @@ void PaxNonFixedColumn::Set(DataBuffer *data, DataBuffer *lengths, } void PaxNonFixedColumn::Append(char *buffer, size_t size) { - Assert(likely(reinterpret_cast MAXALIGN(data_->Position()) == - data_->Position())); - size_t origin_size; origin_size = size; - // FIMXE(gongxun): maybe it should be aligned base on the typalign? - size = MAXALIGN(size); + if (!COLUMN_STORAGE_FORMAT_IS_VEC(this)) { + Assert(likely(reinterpret_cast MAXALIGN(data_->Position()) == + data_->Position())); + size = MAXALIGN(size); + } PaxColumn::Append(buffer, origin_size); - while (data_->Available() < size) { - data_->ReSize(data_->Capacity() * 2); + if (data_->Available() < size) { + data_->ReSize(data_->Used() + size, 2); } if (lengths_->Available() == 0) { - lengths_->ReSize(lengths_->Capacity() * 2); + lengths_->ReSize(lengths_->Used() + sizeof(int32), 2); } estimated_size_ += size; data_->Write(buffer, origin_size); data_->Brush(size); - lengths_->Write(reinterpret_cast(&size), sizeof(int64)); - lengths_->Brush(sizeof(int64)); + lengths_->Write(reinterpret_cast(&size), sizeof(int32)); + lengths_->Brush(sizeof(int32)); offsets_.emplace_back(offsets_.empty() ? 
0 @@ -292,7 +239,7 @@ void PaxNonFixedColumn::Append(char *buffer, size_t size) { Assert(offsets_.size() == lengths_->GetSize()); } -DataBuffer *PaxNonFixedColumn::GetLengthBuffer() const { +DataBuffer *PaxNonFixedColumn::GetLengthBuffer() const { return lengths_; } @@ -300,13 +247,8 @@ PaxColumnTypeInMem PaxNonFixedColumn::GetPaxColumnTypeInMem() const { return PaxColumnTypeInMem::kTypeNonFixed; } -void PaxNonFixedColumn::Clear() { - PaxColumn::Clear(); - - data_->BrushBackAll(); - lengths_->BrushBackAll(); - - offsets_.clear(); +PaxStorageFormat PaxNonFixedColumn::GetStorageFormat() const { + return PaxStorageFormat::kTypeStorageOrcNonVec; } std::pair PaxNonFixedColumn::GetBuffer() { @@ -350,14 +292,4 @@ std::pair PaxNonFixedColumn::GetRangeBuffer(size_t start_pos, return std::make_pair(data_->GetBuffer() + offsets_[start_pos], range_len); } -bool PaxNonFixedColumn::IsMemTakeOver() const { - Assert(data_->IsMemTakeOver() == lengths_->IsMemTakeOver()); - return data_->IsMemTakeOver(); -} - -void PaxNonFixedColumn::SetMemTakeOver(bool take_over) { - data_->SetMemTakeOver(take_over); - lengths_->SetMemTakeOver(take_over); -} - }; // namespace pax diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column.h b/contrib/pax_storage/src/cpp/storage/columns/pax_column.h index f6de8f4f567..69592fa8876 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_column.h +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_column.h @@ -8,9 +8,13 @@ #include #include +#include "comm/bitmap.h" #include "storage/columns/pax_compress.h" +#include "storage/columns/pax_decoding.h" +#include "storage/columns/pax_encoding.h" #include "storage/columns/pax_encoding_utils.h" #include "storage/pax_buffer.h" +#include "storage/pax_defined.h" #include "storage/proto/proto_wrappers.h" namespace pax { @@ -21,47 +25,115 @@ namespace pax { // Used to mapping pg_type enum PaxColumnTypeInMem { kTypeInvalid = 1, kTypeFixed = 2, kTypeNonFixed = 3 }; -enum PaxColumnStorageType { - // 
default non-vec store - // which split null field and null bitmap - kTypeStorageNonVec, - // vec storage format - // spec the storage format - kTypeStorageVec, -}; - class PaxColumn { public: PaxColumn(); virtual ~PaxColumn(); - virtual PaxColumn *SetColumnEncodeType(ColumnEncoding_Kind encoding_type); - - virtual PaxColumn *SetColumnStorageType(PaxColumnStorageType storage_type); - // Get the column in memory type virtual PaxColumnTypeInMem GetPaxColumnTypeInMem() const; - // Empties the vector from all its elements, recursively. - // Do not alter the current capacity. - virtual void Clear(); - // Get column buffer from current column virtual std::pair GetBuffer() = 0; - // Get buffer by position + // The interface `GetBuffer(size_t position)` and + // `GetRangeBuffer(size_t start_pos, size_t len)` + // will return the different values in different + // `ColumnStorageType` + `ColumnTypeInMem` + // + // Also they should NEVER call in write path with encoding option!!! + // But without encoding option, still can direct call it. + // + // If `storage_type_` is kTypeStorageOrcVec + // Then data part contains `null field` which means no need use + // `row index - null counts` to get the data. + // + // But If `storage_type_` is not kTypeStorageOrcVec + // Then position should be `row index - null counts`, because + // data part will not contains `null field`. + // + // Also it is kind different in fixed-length column and non-fixed-length + // column when `storage_type_` is kTypeStorageOrcVec. For the fixed-length + // column, If we got a `null field`, then it will return the buffer with zero + // fill. But in non-fixed-length column, once we got `null field`, the buffer + // will be nullptr. 
+ // + // A example to explain: + // std::tuple GetBufferWithNull( + // size_t row_index, + // size_t null_counts) { + // + // PaxColumn *column = source(); + // char * buffer = nullptr; + // size_t length = 0; + // switch (GetPaxColumnTypeInMem()) { + // case kTypeFixed: { + // if (COLUMN_STORAGE_FORMAT_IS_VEC(column)) { + // std::tie(buffer, length) = column->GetBuffer(row_index); + // assert(buffer); // different return in different ColumnTypeInMem + // if (!length) { + // return {nullptr, 0, true}; + // } + // } else { + // std::tie(buffer, length) = column->GetBuffer( + // row_index - null_counts); + // } + // assert(buffer && length); + // return {buffer, length, false}; + // } + // case kTypeNonFixed: { + // if (COLUMN_STORAGE_FORMAT_IS_VEC(column)) { + // std::tie(buffer, length) = column->GetBuffer(row_index); + // // different return in different ColumnTypeInMem + // assert((!buffer && !length) || (buffer && length)); + // if (!buffer && !length) { + // return {nullptr, 0, true}; + // } + // } else { + // std::tie(buffer, length) = column->GetBuffer( + // row_index - null_counts); + // } + // return {buffer, length, false}; + // break; + // } + // default: + // // nothing + // } + // // should not react here! 
+ // assert(false); + // } + // + // A simplest example: + // std::tuple GetBufferWithNull(size_t row_index, + // size_t null_counts) { + // PaxColumn *column = source(); + // char * buffer = nullptr; + // size_t length = 0; + // if (COLUMN_STORAGE_FORMAT_IS_VEC(column)) { + // std::tie(buffer, length) = column->GetBuffer(row_index); + // if (!length) { + // return {nullptr, 0, true}; + // } + // } else { + // std::tie(buffer, length) = column->GetBuffer(row_index - null_counts); + // } + // assert(buffer && length); + // return {buffer, length, false}; + // } + // virtual std::pair GetBuffer(size_t position) = 0; // Get buffer by range [start_pos, start_pos + len) + // Should never call in write path with encoding option virtual std::pair GetRangeBuffer(size_t start_pos, size_t len) = 0; // Get all rows number(contain null) from column - virtual size_t GetRows(); + virtual size_t GetRows() const; // Get rows number(not null) from column - virtual size_t GetNonNullRows() const = 0; + virtual size_t GetNonNullRows() const; // Get all rows number(not null) from column by range [start_pos, start_pos + // len) @@ -76,36 +148,86 @@ class PaxColumn { // Estimated memory size from current column virtual size_t PhysicalSize() const = 0; - // Get current encoding type - virtual ColumnEncoding_Kind GetEncodingType() const; + // Get current storage type + virtual PaxStorageFormat GetStorageFormat() const = 0; // Get the data size without encoding/compress virtual int64 GetOriginLength() const = 0; - // Get the type length, if non-fixed, will return -1 + // Get the type length, used to identify sub-class + // - `PaxCommColumn` will return the length + // - `PaxNonFixedColumn` will return -1 virtual int32 GetTypeLength() const = 0; // Contain null filed or not bool HasNull(); + // Are all values null? 
+ bool AllNull() const; + // Set null bitmap - void SetNulls(DataBuffer *null_bitmap); + void SetBitmap(Bitmap8 *null_bitmap); + + // Get Bitmap + Bitmap8 *GetBitmap() { return null_bitmap_; } + + void SetRows(size_t total_rows); + + virtual size_t GetAlignSize() const; + + virtual void SetAlignSize(size_t align_size); + + // Get current encoding type + inline ColumnEncoding_Kind GetEncodingType() const { return encoded_type_; } + + // Get current compress level + inline int GetCompressLevel() const { return compress_level_; } + + protected: + // The encoding option should pass in sub-class + inline void SetEncodeType(ColumnEncoding_Kind encoding_type) { + encoded_type_ = encoding_type; + } - // Get null bitmaps - DataBuffer *GetNulls() const; + inline void SetCompressLevel(int compress_level) { + compress_level_ = compress_level; + } - // Get bull bitmaps by range [start_pos, start_pos + len) - std::pair GetRangeNulls(size_t start_pos, size_t len); + private: + void CreateNulls(size_t cap); protected: // null field bit map - DataBuffer *null_bitmap_; + Bitmap8 *null_bitmap_; + + // Writer: write pointer + // Reader: total rows + uint32 total_rows_; - // the column is encoded type + // some of subclass will not implements the not null logic, + // but can direct get not null rows by data part. + size_t non_null_rows_; + + // the column encoded type ColumnEncoding_Kind encoded_type_; - // whether the column is storage - PaxColumnStorageType storage_type_; + // the column compress level + int compress_level_; + + // data part align size. + // This field only takes effect when current column is no encoding/compress. + // + // About `type_align` in `pg_type` what you need to know: + // 1. address alignment: the datum which return need alignment with + // `type_align` + // 2. datum padding: the datum need padding with `type_align` + // + // The align logic in pax: + // 1. 
address alignment: + // - write will make sure address alignment(data stream) in disk + // - `ReadTuple` with/without memcpy should get a alignment datum + // 2. datum padding: deal it in column `Append` + size_t type_align_size_; private: PaxColumn(const PaxColumn &); @@ -115,7 +237,7 @@ class PaxColumn { template class PaxCommColumn : public PaxColumn { public: - explicit PaxCommColumn(uint64 capacity); + explicit PaxCommColumn(uint32 capacity); ~PaxCommColumn() override; @@ -125,6 +247,8 @@ class PaxCommColumn : public PaxColumn { PaxColumnTypeInMem GetPaxColumnTypeInMem() const override; + PaxStorageFormat GetStorageFormat() const override; + void Append(char *buffer, size_t size) override; std::pair GetBuffer(size_t position) override; @@ -134,8 +258,6 @@ class PaxCommColumn : public PaxColumn { size_t GetNonNullRows() const override; - void Clear() override; - size_t PhysicalSize() const override; int64 GetOriginLength() const override; @@ -145,10 +267,6 @@ class PaxCommColumn : public PaxColumn { int32 GetTypeLength() const override; protected: - virtual void ReSize(uint64 capacity); - - protected: - uint64 capacity_; DataBuffer *data_; }; @@ -162,20 +280,20 @@ extern template class PaxCommColumn; class PaxNonFixedColumn : public PaxColumn { public: - explicit PaxNonFixedColumn(uint64 capacity); + explicit PaxNonFixedColumn(uint32 capacity); PaxNonFixedColumn(); ~PaxNonFixedColumn() override; - virtual void Set(DataBuffer *data, DataBuffer *lengths, + virtual void Set(DataBuffer *data, DataBuffer *lengths, size_t total_size); void Append(char *buffer, size_t size) override; PaxColumnTypeInMem GetPaxColumnTypeInMem() const override; - void Clear() override; + PaxStorageFormat GetStorageFormat() const override; std::pair GetBuffer() override; @@ -192,18 +310,22 @@ class PaxNonFixedColumn : public PaxColumn { size_t GetNonNullRows() const override; - DataBuffer *GetLengthBuffer() const; + DataBuffer *GetLengthBuffer() const; - bool IsMemTakeOver() const; + 
DataBuffer *GetOffsetBuffer(bool append_last = false); - void SetMemTakeOver(bool take_over); + protected: + void BuildOffsets(); protected: size_t estimated_size_; DataBuffer *data_; - // orc needs to serialize int64 array - DataBuffer *lengths_; + // orc needs to serialize int32 array + // the length of a single tuple field will not exceed 2GB, + // so a variable-length element of the lengths stream can use int32 + // to represent the length + DataBuffer *lengths_; std::vector offsets_; }; diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column_cache.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_column_cache.cc new file mode 100644 index 00000000000..6b2335dd09c --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_column_cache.cc @@ -0,0 +1,236 @@ +#include "storage/columns/pax_column_cache.h" + +#include + +#ifdef ENABLE_PLASMA +namespace pax { + +struct PaxColumnsMeta { + int16 type_len : 8; + uint32 null_size : 32; + uint32 data_size : 32; + uint32 len_size : 32; + uint32 rows : 32; +} __attribute__((__aligned__(8))); + +static std::string BuildCacheKey(const std::string &file_name, + const uint16 column_index, + const uint16 group_index) { + unsigned char key_str[20]; + + CBDB_CHECK(uuid_parse(file_name.c_str(), key_str) == 0, + cbdb::CException::ExType::kExTypeCError); + + static_assert(sizeof(uuid_t) == 16, "Invalid uuid_t length"); + memcpy(key_str + 16, &column_index, sizeof(uint16)); + memcpy(key_str + 18, &group_index, sizeof(uint16)); + + return std::string((char *)key_str, 20); +} + +PaxColumnCache::PaxColumnCache(PaxCache *cache, const std::string &file_name, + bool *proj, size_t proj_num) + : pax_cache_(cache), + file_name_(file_name), + proj_(proj), + proj_num_(proj_num) { + Assert(pax_cache_ && proj_); +} + +template +static PaxColumn *NewFixColumn(const char *buffer, size_t buffer_len) { + auto column = PAX_NEW>(0); + Assert(buffer_len % sizeof(T) == 0); + auto data_buffer = PAX_NEW>((T *)(buffer), buffer_len, 
false, false); + data_buffer->BrushAll(); + column->Set(data_buffer); + return column; +} + +std::tuple, bool *> +PaxColumnCache::ReadCache(size_t group_index) { + PaxColumns *columns = PAX_NEW(); + std::vector keys; + std::vector batchs; + size_t cache_index = 0; + int64 rows = -1; + bool *proj_copy = PAX_NEW_ARRAY(proj_num_); + size_t no_proj_num = 0; + + memcpy(proj_copy, proj_, proj_num_); + + for (size_t i = 0; i < proj_num_; i++) { + if (!proj_copy[i]) { + continue; + } + keys.emplace_back(BuildCacheKey(file_name_, i, group_index)); + } + + auto status = pax_cache_->Get(keys, batchs); + if (!status.Ok()) { + keys.clear(); + // TODO(jiaqizho): add log here + return std::make_tuple(nullptr, keys, proj_copy); + } + + for (size_t i = 0; i < proj_num_; i++) { + if (!proj_copy[i]) { + no_proj_num++; + columns->Append(nullptr); + continue; + } + auto batch_buffer = batchs[cache_index++]; + + if (batch_buffer.not_exist) { + keys[i - no_proj_num] = ""; + columns->Append(nullptr); + continue; + } + + Assert(batch_buffer.meta_len == sizeof(PaxColumnsMeta)); + PaxColumnsMeta *meta = (PaxColumnsMeta *)batch_buffer.meta; + + AssertImply(rows != -1, (size_t)rows == meta->rows); + rows = meta->rows; + + Assert(batch_buffer.buffer_len == + (size_t)(meta->null_size + meta->data_size + meta->len_size)); + + PaxColumn *column = nullptr; + switch (meta->type_len) { + case -1: { + auto non_fixed_column = PAX_NEW(0); + Assert(meta->len_size % sizeof(int64) == 0); + auto data_buffer = PAX_NEW>( + (char *)(batch_buffer.buffer + meta->null_size), meta->data_size, + false, false); + auto len_data_buffer = PAX_NEW>( + (int64 *)(batch_buffer.buffer + meta->null_size + meta->data_size), + meta->len_size, false, false); + data_buffer->BrushAll(); + len_data_buffer->BrushAll(); + + non_fixed_column->Set(data_buffer, len_data_buffer, + batch_buffer.buffer_len); + column = non_fixed_column; + break; + } + case 1: { + column = NewFixColumn(batch_buffer.buffer + meta->null_size, + 
meta->data_size); + break; + } + case 2: { + column = NewFixColumn(batch_buffer.buffer + meta->null_size, + meta->data_size); + break; + } + case 4: { + column = NewFixColumn(batch_buffer.buffer + meta->null_size, + meta->data_size); + break; + } + case 8: { + column = NewFixColumn(batch_buffer.buffer + meta->null_size, + meta->data_size); + break; + } + default: { + Assert(false); + } + } + + if (meta->null_size != 0) { + auto null_bitmap = PAX_NEW( + BitmapRaw((uint8 *)(batch_buffer.buffer), meta->null_size), + BitmapTpl::ReadOnlyRefBitmap); + column->SetBitmap(null_bitmap); + } else { + column->SetBitmap(nullptr); + } + + columns->Append(column); + proj_copy[i] = false; + } + + if (rows != -1) { + columns->AddRows(rows); + } + + return std::make_tuple(columns, keys, proj_copy); +} + +void PaxColumnCache::ReleaseCache(std::vector keys) { + for (auto &key : keys) { + if (key.length() != 0) pax_cache_->Release(key); + } +} + +void PaxColumnCache::WriteCache(PaxColumns *columns, size_t group_index) { + std::string key; + PaxColumnsMeta meta{}; + int64 rows = -1; + + for (size_t i = 0; i < proj_num_; i++) { + auto column = (*columns)[i]; + if (!proj_[i] || !column) { + continue; + } + + key = BuildCacheKey(file_name_, i, group_index); + + AssertImply(rows != -1, (size_t)rows == column->GetRows()); + rows = column->GetRows(); + + std::vector> buffers; + + if (column->HasNull()) { + auto bm = column->GetBitmap(); + Assert(bm); + auto nbytes = bm->MinimalStoredBytes(column->GetRows()); + Assert(nbytes <= bm->Raw().size); + + meta.null_size = nbytes; + buffers.emplace_back( + std::make_pair(reinterpret_cast(bm->Raw().bitmap), nbytes)); + } else { + meta.null_size = 0; + } + + char *buffer = nullptr; + size_t buffer_len = 0; + + if (column->GetPaxColumnTypeInMem() == kTypeNonFixed) { + auto non_fixed_column = (PaxNonFixedColumn *)column; + std::tie(buffer, buffer_len) = non_fixed_column->GetBuffer(); + auto len_buffer = non_fixed_column->GetLengthBuffer(); + + 
buffers.emplace_back(std::make_pair(buffer, buffer_len)); + buffers.emplace_back( + std::make_pair((char *)len_buffer->GetBuffer(), len_buffer->Used())); + + meta.type_len = non_fixed_column->GetTypeLength(); + meta.data_size = buffer_len; + meta.len_size = len_buffer->Used(); + meta.rows = rows; + } else if (column->GetPaxColumnTypeInMem() == kTypeFixed) { + std::tie(buffer, buffer_len) = column->GetBuffer(); + buffers.emplace_back(std::make_pair(buffer, buffer_len)); + + meta.type_len = column->GetTypeLength(); + meta.data_size = buffer_len; + meta.len_size = 0; + meta.rows = rows; + + } else { + Assert(false); + } + + pax_cache_->Put(key, buffers, + std::make_pair((char *)&meta, sizeof(PaxColumnsMeta))); + } +} + +} // namespace pax + +#endif // ENABLE_PLASMA diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column_cache.h b/contrib/pax_storage/src/cpp/storage/columns/pax_column_cache.h new file mode 100644 index 00000000000..ee56de5a1b1 --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_column_cache.h @@ -0,0 +1,38 @@ +#pragma once +#include +#include +#include + +#include "storage/columns/pax_columns.h" + +#ifdef ENABLE_PLASMA +#include "storage/cache/pax_plasma_cache.h" +namespace pax { + +class PaxColumnCache final { + public: + PaxColumnCache(PaxCache *cache, const std::string &file_name, bool *proj, + size_t proj_num); + + ~PaxColumnCache() = default; + + // Read the cache from current group + // Return the tuple with + // + std::tuple, bool *> ReadCache( + size_t group_index); + + void ReleaseCache(std::vector keys); + + void WriteCache(PaxColumns *columns, size_t group_index); + + private: + PaxCache *pax_cache_; + std::string file_name_; + bool *proj_; + size_t proj_num_; +}; + +}; // namespace pax + +#endif // ENABLE_PLASMA diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column_int.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_column_int.cc deleted file mode 100644 index 51419bd6ffd..00000000000 --- 
a/contrib/pax_storage/src/cpp/storage/columns/pax_column_int.cc +++ /dev/null @@ -1,43 +0,0 @@ - -#include "storage/columns/pax_column_int.h" - -namespace pax { - -template -PaxIntColumn::PaxIntColumn(const PaxEncoder::EncodingOption &encoding_option) - : PaxEncodingColumn(DEFAULT_CAPACITY, encoding_option) { - PaxEncodingColumn::InitEncoder(); -} - -template -PaxIntColumn::PaxIntColumn(uint64 capacity, - const PaxEncoder::EncodingOption &encoding_option) - : PaxEncodingColumn(capacity, encoding_option) { - PaxEncodingColumn::InitEncoder(); -} - -template -PaxIntColumn::PaxIntColumn(const PaxDecoder::DecodingOption &decoding_option) - : PaxEncodingColumn(DEFAULT_CAPACITY, decoding_option) { - PaxEncodingColumn::InitDecoder(); -} - -template -PaxIntColumn::PaxIntColumn(uint64 capacity, - const PaxDecoder::DecodingOption &decoding_option) - : PaxEncodingColumn(capacity, decoding_option) { - PaxEncodingColumn::InitDecoder(); -} - -template -ColumnEncoding_Kind PaxIntColumn::GetDefaultColumnType() { - return sizeof(T) >= 4 ? 
ColumnEncoding_Kind::ColumnEncoding_Kind_ORC_RLE_V2 - : ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA; -} - -template class PaxIntColumn; -template class PaxIntColumn; -template class PaxIntColumn; -template class PaxIntColumn; - -} // namespace pax diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column_int.h b/contrib/pax_storage/src/cpp/storage/columns/pax_column_int.h deleted file mode 100644 index e38b8c3f0cd..00000000000 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_column_int.h +++ /dev/null @@ -1,31 +0,0 @@ - -#pragma once -#include "storage/columns/pax_encoding_column.h" - -namespace pax { - -template -class PaxIntColumn final : public PaxEncodingColumn { - public: - explicit PaxIntColumn(const PaxEncoder::EncodingOption &encoding_option); - - PaxIntColumn(uint64 capacity, - const PaxEncoder::EncodingOption &encoding_option); - - explicit PaxIntColumn(const PaxDecoder::DecodingOption &decoding_option); - - PaxIntColumn(uint64 capacity, - const PaxDecoder::DecodingOption &decoding_option); - - ~PaxIntColumn() override = default; - - protected: - ColumnEncoding_Kind GetDefaultColumnType() override; -}; - -extern template class PaxIntColumn; -extern template class PaxIntColumn; -extern template class PaxIntColumn; -extern template class PaxIntColumn; - -} // namespace pax diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column_test.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_column_test.cc index 640975de062..96128117b29 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_column_test.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_column_test.cc @@ -5,12 +5,11 @@ #include "comm/cbdb_wrappers.h" #include "comm/gtest_wrappers.h" #include "exceptions/CException.h" -#include "storage/columns/pax_column_int.h" -#include "storage/columns/pax_encoding_column.h" -#include "storage/columns/pax_encoding_non_fixed_column.h" +#include "pax_gtest_helper.h" +#include "storage/columns/pax_column_traits.h" 
namespace pax::tests { - +using namespace pax::traits; static void AppendInt4All(PaxColumn *pax_column, size_t bits) { int64 data; for (int16 i = INT16_MIN; i <= INT16_MAX; ++i) { // dead loop @@ -55,123 +54,168 @@ static void VerifyInt4All(char *verify_buff, size_t verify_len, size_t bits) { } static PaxColumn *CreateEncodeColumn( - uint8 bits, const PaxEncoder::EncodingOption &encoding_option) { + uint8 bits, const PaxEncoder::EncodingOption &encoding_option, + PaxStorageFormat storage_type = PaxStorageFormat::kTypeStorageOrcNonVec) { PaxColumn *int_column; switch (bits) { case 16: - int_column = new PaxIntColumn(1024, std::move(encoding_option)); + if (storage_type == PaxStorageFormat::kTypeStorageOrcNonVec) { + int_column = + ColumnOptCreateTraits::create_encoding( + 1024, std::move(encoding_option)); + } else { + int_column = + ColumnOptCreateTraits::create_encoding( + 1024, std::move(encoding_option)); + } break; case 32: - int_column = new PaxIntColumn(1024, std::move(encoding_option)); + if (storage_type == PaxStorageFormat::kTypeStorageOrcNonVec) { + int_column = + ColumnOptCreateTraits::create_encoding( + 1024, std::move(encoding_option)); + } else { + int_column = + ColumnOptCreateTraits::create_encoding( + 1024, std::move(encoding_option)); + } break; case 64: - int_column = new PaxIntColumn(1024, std::move(encoding_option)); + if (storage_type == PaxStorageFormat::kTypeStorageOrcNonVec) { + int_column = + ColumnOptCreateTraits::create_encoding( + 1024, std::move(encoding_option)); + } else { + int_column = + ColumnOptCreateTraits::create_encoding( + 1024, std::move(encoding_option)); + } break; default: int_column = nullptr; break; } + return int_column; } static PaxColumn *CreateDecodeColumn( - uint8 bits, size_t origin_lem, + uint8 bits, size_t origin_len, size_t origin_rows, const PaxDecoder::DecodingOption &decoding_option, char *encoded_buff, - size_t encoded_len) { + size_t encoded_len, + PaxStorageFormat storage_type = 
PaxStorageFormat::kTypeStorageOrcNonVec, + size_t column_not_nulls = 0) { + PaxColumn *column_rc = nullptr; switch (bits) { case 16: { auto *buffer_for_read = new DataBuffer( reinterpret_cast(encoded_buff), encoded_len, false, false); buffer_for_read->Brush(encoded_len); - auto int_column = new PaxIntColumn(origin_lem / sizeof(int16), - std::move(decoding_option)); - int_column->Set(buffer_for_read); - - return int_column; + if (storage_type == PaxStorageFormat::kTypeStorageOrcNonVec) { + auto int_column = + ColumnOptCreateTraits::create_decoding( + origin_len / sizeof(int16), std::move(decoding_option)); + int_column->Set(buffer_for_read); + column_rc = int_column; + } else { + auto int_column = + ColumnOptCreateTraits::create_decoding( + origin_len / sizeof(int16), std::move(decoding_option)); + int_column->Set(buffer_for_read, column_not_nulls); + column_rc = int_column; + } + break; } case 32: { auto *buffer_for_read = new DataBuffer( reinterpret_cast(encoded_buff), encoded_len, false, false); buffer_for_read->Brush(encoded_len); - auto int_column = new PaxIntColumn(origin_lem / sizeof(int32), - std::move(decoding_option)); - int_column->Set(buffer_for_read); - return int_column; + if (storage_type == PaxStorageFormat::kTypeStorageOrcNonVec) { + auto int_column = + ColumnOptCreateTraits::create_decoding( + origin_len / sizeof(int32), std::move(decoding_option)); + int_column->Set(buffer_for_read); + column_rc = int_column; + } else { + auto int_column = + ColumnOptCreateTraits::create_decoding( + origin_len / sizeof(int32), std::move(decoding_option)); + int_column->Set(buffer_for_read, column_not_nulls); + column_rc = int_column; + } + break; } case 64: { auto *buffer_for_read = new DataBuffer( reinterpret_cast(encoded_buff), encoded_len, false, false); buffer_for_read->Brush(encoded_len); - auto int_column = new PaxIntColumn(origin_lem / sizeof(int64), - std::move(decoding_option)); - int_column->Set(buffer_for_read); - return int_column; + if (storage_type 
== PaxStorageFormat::kTypeStorageOrcNonVec) { + auto int_column = + ColumnOptCreateTraits::create_decoding( + origin_len / sizeof(int64), std::move(decoding_option)); + int_column->Set(buffer_for_read); + column_rc = int_column; + } else { + auto int_column = + ColumnOptCreateTraits::create_decoding( + origin_len / sizeof(int64), std::move(decoding_option)); + int_column->Set(buffer_for_read, column_not_nulls); + column_rc = int_column; + } + break; } default: { return nullptr; } } - return nullptr; + + if (column_rc) { + column_rc->SetRows(origin_rows); + } + return column_rc; } -class PaxColumnTest : public ::testing::Test { +class PaxColumnTest : public ::testing::TestWithParam { public: - void SetUp() override { - MemoryContext orc_test_memory_context = AllocSetContextCreate( - (MemoryContext)NULL, "PaxColumn memory context", 80 * 1024 * 1024, - 80 * 1024 * 1024, 80 * 1024 * 1024); - - MemoryContextSwitchTo(orc_test_memory_context); - } + void SetUp() override { CreateMemoryContext(); } }; -class PaxColumnEncodingTest : public ::testing::TestWithParam { +class PaxColumnEncodingTest : public ::testing::TestWithParam< + ::testing::tuple> { public: - void SetUp() override { - MemoryContext orc_test_memory_context = AllocSetContextCreate( - (MemoryContext)NULL, "PaxColumn memory context", 80 * 1024 * 1024, - 80 * 1024 * 1024, 80 * 1024 * 1024); - - MemoryContextSwitchTo(orc_test_memory_context); - } + void SetUp() override { CreateMemoryContext(); } }; class PaxColumnCompressTest : public ::testing::TestWithParam< ::testing::tuple> { public: - void SetUp() override { - MemoryContext orc_test_memory_context = AllocSetContextCreate( - (MemoryContext)NULL, "PaxColumn memory context", 800 * 1024 * 1024, - 800 * 1024 * 1024, 800 * 1024 * 1024); - - MemoryContextSwitchTo(orc_test_memory_context); - } + void SetUp() override { CreateMemoryContext(); } }; class PaxNonFixedColumnCompressTest : public ::testing::TestWithParam< ::testing::tuple> { public: - void SetUp() 
override { - MemoryContext orc_test_memory_context = AllocSetContextCreate( - (MemoryContext)NULL, "PaxColumn memory context", 800 * 1024 * 1024, - 800 * 1024 * 1024, 800 * 1024 * 1024); - - MemoryContextSwitchTo(orc_test_memory_context); - } + void SetUp() override { CreateMemoryContext(); } }; -TEST_F(PaxColumnTest, FixColumnGetRangeBufferTest) { +TEST_P(PaxColumnTest, FixColumnGetRangeBufferTest) { PaxColumn *column; + auto storage_type = GetParam(); char *buffer = nullptr; size_t buffer_len = 0; - column = new PaxCommColumn(200); + if (storage_type == PaxStorageFormat::kTypeStorageOrcNonVec) { + column = ColumnCreateTraits::create(200); + } else { + column = ColumnCreateTraits::create(200); + } + for (int32 i = 0; i < 16; i++) { column->Append(reinterpret_cast(&i), sizeof(int32)); } @@ -186,7 +230,12 @@ TEST_F(PaxColumnTest, FixColumnGetRangeBufferTest) { ASSERT_EQ(column->GetRows(), 16); ASSERT_EQ(column->GetRangeNonNullRows(0, column->GetRows()), 16); - column->Clear(); + delete column; + if (storage_type == PaxStorageFormat::kTypeStorageOrcNonVec) { + column = ColumnCreateTraits::create(200); + } else { + column = ColumnCreateTraits::create(200); + } for (int32 i = 0; i < 16; i++) { if (i % 3 == 0) { @@ -195,12 +244,37 @@ TEST_F(PaxColumnTest, FixColumnGetRangeBufferTest) { column->Append(reinterpret_cast(&i), sizeof(int32)); } - std::tie(buffer, buffer_len) = column->GetRangeBuffer(5, 10); - ASSERT_EQ(buffer_len, 10 * sizeof(int32)); + switch (storage_type) { + case kTypeStorageOrcNonVec: { + std::tie(buffer, buffer_len) = column->GetRangeBuffer(5, 10); + ASSERT_EQ(buffer_len, 10 * sizeof(int32)); - for (size_t i = 5; i < 16; i++) { - auto *i_32 = reinterpret_cast(buffer + ((i - 5) * sizeof(int32))); - ASSERT_EQ(*i_32, (int32)i); + for (size_t i = 5; i < 16; i++) { + auto *i_32 = + reinterpret_cast(buffer + ((i - 5) * sizeof(int32))); + ASSERT_EQ(*i_32, (int32)i); + } + break; + } + case kTypeStorageOrcVec: { + std::tie(buffer, buffer_len) = 
column->GetRangeBuffer(0, 10); + ASSERT_EQ(buffer_len, 10 * sizeof(int32)); + + size_t nulls_count = 0; + for (size_t i = 0; i < 10; i++) { + auto *i_32 = reinterpret_cast(buffer + (i * sizeof(int32))); + if (i % 4 == 0) { + nulls_count++; + ASSERT_EQ(*i_32, 0); + } else { + ASSERT_EQ(*i_32, (int32)i - nulls_count); + } + } + + break; + } + default: + break; } ASSERT_EQ(column->GetRows(), 16 + 6); @@ -209,12 +283,18 @@ TEST_F(PaxColumnTest, FixColumnGetRangeBufferTest) { delete column; } -TEST_F(PaxColumnTest, NonFixColumnGetRangeBufferTest) { +TEST_P(PaxColumnTest, NonFixColumnGetRangeBufferTest) { PaxColumn *column; + auto storage_type = GetParam(); char *buffer = nullptr; size_t buffer_len = 0; - column = new PaxNonFixedColumn(200); + if (storage_type == PaxStorageFormat::kTypeStorageOrcNonVec) { + column = ColumnCreateTraits2::create(200); + } else { + column = ColumnCreateTraits2::create(200); + } + for (int64 i = 0; i < 16; i++) { column->Append(reinterpret_cast(&i), sizeof(int64)); } @@ -223,13 +303,19 @@ TEST_F(PaxColumnTest, NonFixColumnGetRangeBufferTest) { ASSERT_EQ(buffer_len, 10 * sizeof(int64)); for (size_t i = 5; i < 16; i++) { - auto *i_32 = reinterpret_cast(buffer + ((i - 5) * sizeof(int64))); - ASSERT_EQ(*i_32, (int64)i); + auto *i_64 = reinterpret_cast(buffer + ((i - 5) * sizeof(int64))); + ASSERT_EQ(*i_64, (int64)i); } ASSERT_EQ(column->GetRows(), 16); ASSERT_EQ(column->GetRangeNonNullRows(0, column->GetRows()), 16); - column->Clear(); + delete column; + + if (storage_type == PaxStorageFormat::kTypeStorageOrcNonVec) { + column = ColumnCreateTraits2::create(200); + } else { + column = ColumnCreateTraits2::create(200); + } for (int64 i = 0; i < 16; i++) { if (i % 3 == 0) { @@ -238,12 +324,49 @@ TEST_F(PaxColumnTest, NonFixColumnGetRangeBufferTest) { column->Append(reinterpret_cast(&i), sizeof(int64)); } - std::tie(buffer, buffer_len) = column->GetRangeBuffer(5, 10); - ASSERT_EQ(buffer_len, 10 * sizeof(int64)); + switch (storage_type) { + case 
kTypeStorageOrcNonVec: { + std::tie(buffer, buffer_len) = column->GetRangeBuffer(5, 10); + ASSERT_EQ(buffer_len, 10 * sizeof(int64)); - for (size_t i = 5; i < 16; i++) { - auto *i_32 = reinterpret_cast(buffer + ((i - 5) * sizeof(int64))); - ASSERT_EQ(*i_32, (int64)i); + for (size_t i = 5; i < 16; i++) { + auto *i_64 = + reinterpret_cast(buffer + ((i - 5) * sizeof(int64))); + ASSERT_EQ(*i_64, (int64)i); + } + break; + } + case kTypeStorageOrcVec: { + size_t nulls_count = 0; + for (size_t i = 0; i < 10; i++) { + std::tie(buffer, buffer_len) = column->GetBuffer(i); + if (buffer) { + ASSERT_EQ(i - nulls_count, *reinterpret_cast(buffer)); + } else { + nulls_count++; + } + } + + std::tie(buffer, buffer_len) = column->GetRangeBuffer(0, 10); + + // 0 4 8 is null + ASSERT_EQ(buffer_len, 7 * sizeof(int64)); + + nulls_count = 0; + for (size_t i = 0; i < 10; i++) { + auto *i_64 = reinterpret_cast( + buffer + ((i - nulls_count) * sizeof(int64))); + if (i % 4 == 0) { + nulls_count++; + } else { + ASSERT_EQ(*i_64, (int32)i - nulls_count); + } + } + + break; + } + default: + break; } ASSERT_EQ(column->GetRows(), 16 + 6); @@ -254,7 +377,8 @@ TEST_F(PaxColumnTest, NonFixColumnGetRangeBufferTest) { TEST_P(PaxColumnEncodingTest, GetRangeEncodingColumnTest) { PaxColumn *int_column; - auto bits = GetParam(); + auto bits = ::testing::get<0>(GetParam()); + auto storage_type = ::testing::get<1>(GetParam()); if (bits < 32) { return; } @@ -264,7 +388,8 @@ TEST_P(PaxColumnEncodingTest, GetRangeEncodingColumnTest) { ColumnEncoding_Kind::ColumnEncoding_Kind_DEF_ENCODED; encoding_option.is_sign = true; - int_column = CreateEncodeColumn(bits, std::move(encoding_option)); + int_column = + CreateEncodeColumn(bits, std::move(encoding_option), storage_type); ASSERT_TRUE(int_column); int64 data; @@ -280,16 +405,19 @@ TEST_P(PaxColumnEncodingTest, GetRangeEncodingColumnTest) { ASSERT_LT(encoded_len, UINT16_MAX); auto origin_len = int_column->GetOriginLength(); + auto origin_rows = 
int_column->GetRows(); ASSERT_EQ(origin_len, (100) * bits / 8); PaxDecoder::DecodingOption decoding_option; decoding_option.column_encode_type = - ColumnEncoding_Kind::ColumnEncoding_Kind_ORC_RLE_V2; + ColumnEncoding_Kind::ColumnEncoding_Kind_RLE_V2; decoding_option.is_sign = true; auto int_column_for_read = CreateDecodeColumn( - bits, origin_len, std::move(decoding_option), encoded_buff, encoded_len); + bits, origin_len, origin_rows, std::move(decoding_option), encoded_buff, + encoded_len, storage_type, 100); + ASSERT_EQ(int_column_for_read->GetCompressLevel(), 0); char *verify_buff; size_t verify_len; std::tie(verify_buff, verify_len) = @@ -317,7 +445,7 @@ TEST_P(PaxColumnCompressTest, FixedCompressColumnGetRangeTest) { PaxEncoder::EncodingOption encoding_option; encoding_option.column_encode_type = kind; - encoding_option.compress_lvl = 5; + encoding_option.compress_level = 5; encoding_option.is_sign = true; int_column = CreateEncodeColumn(bits, std::move(encoding_option)); @@ -336,6 +464,7 @@ TEST_P(PaxColumnCompressTest, FixedCompressColumnGetRangeTest) { ASSERT_LT(encoded_len, UINT16_MAX); auto origin_len = int_column->GetOriginLength(); + auto origin_rows = int_column->GetRows(); ASSERT_EQ(origin_len, kind != ColumnEncoding_Kind_NO_ENCODED ? 
(100) * bits / 8 : NO_ENCODE_ORIGIN_LEN); @@ -343,11 +472,13 @@ TEST_P(PaxColumnCompressTest, FixedCompressColumnGetRangeTest) { PaxDecoder::DecodingOption decoding_option; decoding_option.column_encode_type = kind; decoding_option.is_sign = true; + decoding_option.compress_level = 5; auto int_column_for_read = - CreateDecodeColumn(bits, (100) * bits / 8, std::move(decoding_option), - encoded_buff, encoded_len); + CreateDecodeColumn(bits, (100) * bits / 8, origin_rows, + std::move(decoding_option), encoded_buff, encoded_len); + ASSERT_EQ(int_column_for_read->GetCompressLevel(), 5); char *verify_buff; size_t verify_len; std::tie(verify_buff, verify_len) = @@ -370,7 +501,8 @@ TEST_P(PaxColumnCompressTest, FixedCompressColumnGetRangeTest) { TEST_P(PaxColumnEncodingTest, PaxEncodingColumnDefault) { PaxColumn *int_column; - auto bits = GetParam(); + auto bits = ::testing::get<0>(GetParam()); + auto storage_type = ::testing::get<1>(GetParam()); if (bits < 32) { return; } @@ -380,7 +512,8 @@ TEST_P(PaxColumnEncodingTest, PaxEncodingColumnDefault) { ColumnEncoding_Kind::ColumnEncoding_Kind_DEF_ENCODED; encoding_option.is_sign = true; - int_column = CreateEncodeColumn(bits, std::move(encoding_option)); + int_column = + CreateEncodeColumn(bits, std::move(encoding_option), storage_type); ASSERT_TRUE(int_column); AppendInt4All(int_column, bits); @@ -392,16 +525,19 @@ TEST_P(PaxColumnEncodingTest, PaxEncodingColumnDefault) { ASSERT_LT(encoded_len, UINT16_MAX); auto origin_len = int_column->GetOriginLength(); + auto origin_rows = int_column->GetRows(); ASSERT_EQ(origin_len, (UINT16_MAX + 1) * bits / 8); PaxDecoder::DecodingOption decoding_option; decoding_option.column_encode_type = - ColumnEncoding_Kind::ColumnEncoding_Kind_ORC_RLE_V2; + ColumnEncoding_Kind::ColumnEncoding_Kind_RLE_V2; decoding_option.is_sign = true; auto int_column_for_read = CreateDecodeColumn( - bits, origin_len, std::move(decoding_option), encoded_buff, encoded_len); + bits, origin_len, origin_rows, 
std::move(decoding_option), encoded_buff, + encoded_len, storage_type); + ASSERT_EQ(int_column_for_read->GetCompressLevel(), 0); char *verify_buff; size_t verify_len; std::tie(verify_buff, verify_len) = int_column_for_read->GetBuffer(); @@ -413,14 +549,16 @@ TEST_P(PaxColumnEncodingTest, PaxEncodingColumnDefault) { TEST_P(PaxColumnEncodingTest, PaxEncodingColumnSpecType) { PaxColumn *int_column; - auto bits = GetParam(); + auto bits = ::testing::get<0>(GetParam()); + auto storage_type = ::testing::get<1>(GetParam()); PaxEncoder::EncodingOption encoding_option; encoding_option.column_encode_type = - ColumnEncoding_Kind::ColumnEncoding_Kind_ORC_RLE_V2; + ColumnEncoding_Kind::ColumnEncoding_Kind_RLE_V2; encoding_option.is_sign = true; - int_column = CreateEncodeColumn(bits, std::move(encoding_option)); + int_column = + CreateEncodeColumn(bits, std::move(encoding_option), storage_type); ASSERT_TRUE(int_column); AppendInt4All(int_column, bits); @@ -432,15 +570,18 @@ TEST_P(PaxColumnEncodingTest, PaxEncodingColumnSpecType) { ASSERT_LT(encoded_len, UINT16_MAX); auto origin_len = int_column->GetOriginLength(); + auto origin_rows = int_column->GetRows(); ASSERT_EQ(origin_len, (UINT16_MAX + 1) * bits / 8); PaxDecoder::DecodingOption decoding_option; decoding_option.column_encode_type = - ColumnEncoding_Kind::ColumnEncoding_Kind_ORC_RLE_V2; + ColumnEncoding_Kind::ColumnEncoding_Kind_RLE_V2; decoding_option.is_sign = true; auto int_column_for_read = CreateDecodeColumn( - bits, origin_len, std::move(decoding_option), encoded_buff, encoded_len); + bits, origin_len, origin_rows, std::move(decoding_option), encoded_buff, + encoded_len, storage_type); + ASSERT_EQ(int_column_for_read->GetCompressLevel(), 0); char *verify_buff; size_t verify_len; @@ -453,14 +594,16 @@ TEST_P(PaxColumnEncodingTest, PaxEncodingColumnSpecType) { TEST_P(PaxColumnEncodingTest, PaxEncodingColumnNoEncoding) { PaxColumn *int_column; - auto bits = GetParam(); + auto bits = ::testing::get<0>(GetParam()); + 
auto storage_type = ::testing::get<1>(GetParam()); PaxEncoder::EncodingOption encoding_option; encoding_option.column_encode_type = ColumnEncoding_Kind::ColumnEncoding_Kind_NO_ENCODED; encoding_option.is_sign = true; - int_column = CreateEncodeColumn(bits, std::move(encoding_option)); + int_column = + CreateEncodeColumn(bits, std::move(encoding_option), storage_type); ASSERT_TRUE(int_column); AppendInt4All(int_column, bits); @@ -471,6 +614,7 @@ TEST_P(PaxColumnEncodingTest, PaxEncodingColumnNoEncoding) { ASSERT_NE(encoded_buff, nullptr); auto origin_len = int_column->GetOriginLength(); + auto origin_rows = int_column->GetRows(); ASSERT_EQ(origin_len, NO_ENCODE_ORIGIN_LEN); PaxDecoder::DecodingOption decoding_option; @@ -479,8 +623,9 @@ TEST_P(PaxColumnEncodingTest, PaxEncodingColumnNoEncoding) { decoding_option.is_sign = true; auto int_column_for_read = CreateDecodeColumn( - bits, encoded_len, std::move(decoding_option), encoded_buff, encoded_len); - + bits, encoded_len, origin_rows, std::move(decoding_option), encoded_buff, + encoded_len, storage_type); + ASSERT_EQ(int_column_for_read->GetCompressLevel(), 0); char *verify_buff; size_t verify_len; std::tie(verify_buff, verify_len) = int_column_for_read->GetBuffer(); @@ -497,7 +642,7 @@ TEST_P(PaxColumnCompressTest, PaxEncodingColumnCompressDecompress) { PaxEncoder::EncodingOption encoding_option; encoding_option.column_encode_type = kind; - encoding_option.compress_lvl = 5; + encoding_option.compress_level = 5; encoding_option.is_sign = true; int_column = CreateEncodeColumn(bits, std::move(encoding_option)); @@ -511,6 +656,7 @@ TEST_P(PaxColumnCompressTest, PaxEncodingColumnCompressDecompress) { ASSERT_NE(encoded_buff, nullptr); auto origin_len = int_column->GetOriginLength(); + auto origin_rows = int_column->GetRows(); ASSERT_EQ(origin_len, kind != ColumnEncoding_Kind_NO_ENCODED ? 
(UINT16_MAX + 1) * bits / 8 : NO_ENCODE_ORIGIN_LEN); @@ -518,11 +664,13 @@ TEST_P(PaxColumnCompressTest, PaxEncodingColumnCompressDecompress) { PaxDecoder::DecodingOption decoding_option; decoding_option.column_encode_type = kind; decoding_option.is_sign = true; + decoding_option.compress_level = 5; auto int_column_for_read = - CreateDecodeColumn(bits, (UINT16_MAX + 1) * bits / 8, + CreateDecodeColumn(bits, (UINT16_MAX + 1) * bits / 8, origin_rows, std::move(decoding_option), encoded_buff, encoded_len); + ASSERT_EQ(int_column_for_read->GetCompressLevel(), 5); char *verify_buff; size_t verify_len; std::tie(verify_buff, verify_len) = int_column_for_read->GetBuffer(); @@ -543,7 +691,7 @@ TEST_P(PaxNonFixedColumnCompressTest, PaxEncoder::EncodingOption encoding_option; encoding_option.column_encode_type = kind; - encoding_option.compress_lvl = 5; + encoding_option.compress_level = 5; encoding_option.is_sign = true; non_fixed_column = @@ -577,16 +725,17 @@ TEST_P(PaxNonFixedColumnCompressTest, PaxDecoder::DecodingOption decoding_option; decoding_option.column_encode_type = kind; decoding_option.is_sign = true; + decoding_option.compress_level = 5; auto non_fixed_column_for_read = new PaxNonFixedEncodingColumn( buffer_len * number, std::move(decoding_option)); auto data_buffer_for_read = new DataBuffer(encoded_buff, encoded_len, false, false); data_buffer_for_read->Brush(encoded_len); - auto length_buffer_cpy = new DataBuffer(*length_buffer); + auto length_buffer_cpy = new DataBuffer(*length_buffer); non_fixed_column_for_read->Set(data_buffer_for_read, length_buffer_cpy, origin_len); - + ASSERT_EQ(non_fixed_column_for_read->GetCompressLevel(), 5); char *verify_buff; size_t verify_len; @@ -612,8 +761,15 @@ TEST_P(PaxNonFixedColumnCompressTest, delete non_fixed_column_for_read; } -INSTANTIATE_TEST_CASE_P(PaxColumnEncodingTestCombine, PaxColumnEncodingTest, - testing::Values(16, 32, 64)); +INSTANTIATE_TEST_CASE_P(PaxColumnTestCombine, PaxColumnTest, + 
testing::Values(PaxStorageFormat::kTypeStorageOrcNonVec, + PaxStorageFormat::kTypeStorageOrcVec)); + +INSTANTIATE_TEST_CASE_P( + PaxColumnEncodingTestCombine, PaxColumnEncodingTest, + testing::Combine(testing::Values(16, 32, 64), + testing::Values(PaxStorageFormat::kTypeStorageOrcNonVec, + PaxStorageFormat::kTypeStorageOrcVec))); INSTANTIATE_TEST_CASE_P( PaxColumnEncodingTestCombine, PaxColumnCompressTest, diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column_traits.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_column_traits.cc new file mode 100644 index 00000000000..e23364ddd3a --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_column_traits.cc @@ -0,0 +1,114 @@ +#include "storage/columns/pax_column_traits.h" + +namespace pax::traits { + + +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; + +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; +Impl::CreateFunc> + ColumnCreateTraits::create = + Impl::CreateImpl>; +Impl::CreateFunc + ColumnCreateTraits2::create = + Impl::CreateImpl; +Impl::CreateFunc + ColumnCreateTraits2::create = + Impl::CreateImpl; + + + +Impl::CreateEncodingFunc> + ColumnOptCreateTraits::create_encoding = + Impl::CreateEncodingImpl>; +Impl::CreateEncodingFunc> + ColumnOptCreateTraits::create_encoding = + Impl::CreateEncodingImpl>; 
+Impl::CreateEncodingFunc> + ColumnOptCreateTraits::create_encoding = + Impl::CreateEncodingImpl>; +Impl::CreateEncodingFunc> + ColumnOptCreateTraits::create_encoding = + Impl::CreateEncodingImpl>; +Impl::CreateDecodingFunc> + ColumnOptCreateTraits::create_decoding = + Impl::CreateDecodingImpl>; +Impl::CreateDecodingFunc> + ColumnOptCreateTraits::create_decoding = + Impl::CreateDecodingImpl>; +Impl::CreateDecodingFunc> + ColumnOptCreateTraits::create_decoding = + Impl::CreateDecodingImpl>; +Impl::CreateDecodingFunc> + ColumnOptCreateTraits::create_decoding = + Impl::CreateDecodingImpl>; + +Impl::CreateEncodingFunc> + ColumnOptCreateTraits::create_encoding = + Impl::CreateEncodingImpl>; +Impl::CreateEncodingFunc> + ColumnOptCreateTraits::create_encoding = + Impl::CreateEncodingImpl>; +Impl::CreateEncodingFunc> + ColumnOptCreateTraits::create_encoding = + Impl::CreateEncodingImpl>; +Impl::CreateEncodingFunc> + ColumnOptCreateTraits::create_encoding = + Impl::CreateEncodingImpl>; +Impl::CreateDecodingFunc> + ColumnOptCreateTraits::create_decoding = + Impl::CreateDecodingImpl>; +Impl::CreateDecodingFunc> + ColumnOptCreateTraits::create_decoding = + Impl::CreateDecodingImpl>; +Impl::CreateDecodingFunc> + ColumnOptCreateTraits::create_decoding = + Impl::CreateDecodingImpl>; +Impl::CreateDecodingFunc> + ColumnOptCreateTraits::create_decoding = + Impl::CreateDecodingImpl>; + +Impl::CreateEncodingFunc + ColumnOptCreateTraits2::create_encoding = + Impl::CreateEncodingImpl; +Impl::CreateDecodingFunc + ColumnOptCreateTraits2::create_decoding = + Impl::CreateDecodingImpl; +Impl::CreateEncodingFunc + ColumnOptCreateTraits2::create_encoding = + Impl::CreateEncodingImpl; +Impl::CreateDecodingFunc + ColumnOptCreateTraits2::create_decoding = + Impl::CreateDecodingImpl; +} // namespace pax::traits diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column_traits.h b/contrib/pax_storage/src/cpp/storage/columns/pax_column_traits.h new file mode 100644 index 
00000000000..d992a70ec91 --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_column_traits.h @@ -0,0 +1,115 @@ +#pragma once +#include "comm/pax_memory.h" +#include "storage/columns/pax_column.h" +#include "storage/columns/pax_encoding_column.h" +#include "storage/columns/pax_encoding_non_fixed_column.h" +#include "storage/columns/pax_vec_column.h" +#include "storage/columns/pax_vec_encoding_column.h" + +namespace pax::traits { + +namespace Impl { + +template +using CreateFunc = std::function; + +template +static T *CreateImpl(uint64 cap) { + auto t = PAX_NEW(cap); + return t; +} + +template +using CreateEncodingFunc = + std::function; + +template +using CreateDecodingFunc = + std::function; + +template +static T *CreateEncodingImpl(uint64 cap, + const PaxEncoder::EncodingOption &encoding_opt) { + auto t = PAX_NEW(cap, encoding_opt); + return t; +} + +template +static T *CreateDecodingImpl(uint64 cap, + const PaxDecoder::DecodingOption &decoding_opt) { + auto t = PAX_NEW(cap, decoding_opt); + return t; +} + +} // namespace Impl + +template