Skip to content

Commit 68917da

Browse files
authored
Introduce AMD GPU support with ROCm HIP (#1989)
1 parent 57c053a commit 68917da

24 files changed

Lines changed: 611 additions & 19 deletions

.github/workflows/ci.yml

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,39 @@ jobs:
228228
name: python-wheels-${{ runner.os }}-${{ matrix.arch }}
229229
path: python/wheelhouse
230230

231+
build-python-wheels-rocm:
232+
runs-on: ${{ matrix.os }}
233+
strategy:
234+
matrix:
235+
os: [ubuntu-24.04, windows-2025]
236+
237+
steps:
238+
- uses: actions/checkout@v6
239+
with:
240+
submodules: recursive
241+
242+
- name: Build wheels
243+
uses: pypa/cibuildwheel@v3.2.1
244+
with:
245+
package-dir: python
246+
output-dir: python/wheelhouse
247+
env:
248+
CIBW_ENVIRONMENT_LINUX: ROCM_PATH=/opt/rocm LD_LIBRARY_PATH=/opt/rocm/lib/llvm/lib:$LD_LIBRARY_PATH
249+
CIBW_ENVIRONMENT_WINDOWS: CTRANSLATE2_ROOT='${{ github.workspace }}\install'
250+
CIBW_BEFORE_ALL_LINUX: python/tools/prepare_build_environment_linux_rocm.sh
251+
CIBW_BEFORE_ALL_WINDOWS: bash python/tools/prepare_build_environment_windows_rocm.sh
252+
CIBW_BEFORE_BUILD: pip install -r python/install_requirements.txt
253+
CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
254+
CIBW_ARCHS: auto64
255+
CIBW_SKIP: "*-musllinux_*"
256+
CIBW_REPAIR_WHEEL_COMMAND_LINUX: 'auditwheel repair -w {dest_dir} --exclude "/opt/rocm/lib/lib*" {wheel}'
257+
258+
- name: Upload Python wheels
259+
uses: actions/upload-artifact@v6
260+
with:
261+
name: rocm-python-wheels-${{ runner.os }}
262+
path: python/wheelhouse
263+
231264

232265
# We could test the Python wheels using cibuildwheel but we prefer to run the tests outside
233266
# the build environment to ensure wheels correctly embed all dependencies.
@@ -334,6 +367,10 @@ jobs:
334367

335368
build-and-push-docker-images:
336369
runs-on: ubuntu-22.04
370+
strategy:
371+
matrix:
372+
gpu: [cuda, rocm]
373+
337374
steps:
338375
- uses: actions/checkout@v4
339376
with:
@@ -364,7 +401,7 @@ jobs:
364401
365402
- name: Build Docker images
366403
run: |
367-
./docker/build_all.sh
404+
./docker/build_all.sh latest 0 ${{ matrix.gpu }}
368405
369406
- name: Login to DockerHub
370407
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
@@ -377,7 +414,7 @@ jobs:
377414
- name: Push Docker images
378415
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
379416
run: |
380-
./docker/build_all.sh ${GITHUB_REF##*/v} 1
417+
./docker/build_all.sh ${GITHUB_REF##*/v} 1 ${{ matrix.gpu }}
381418
382419
383420
build-and-deploy-docs:

CMakeLists.txt

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ option(WITH_OPENBLAS "Compile with OpenBLAS backend" OFF)
1414
option(WITH_RUY "Compile with Ruy backend" OFF)
1515
option(WITH_CUDA "Compile with CUDA backend" OFF)
1616
option(WITH_CUDNN "Compile with cuDNN backend" OFF)
17+
option(WITH_HIP "Compile with HIP backend" OFF)
1718
option(CUDA_DYNAMIC_LOADING "Dynamically load CUDA libraries at runtime" OFF)
1819
option(ENABLE_CPU_DISPATCH "Compile CPU kernels for multiple ISA and dispatch at runtime" ON)
1920
option(ENABLE_PROFILING "Compile with profiling support" OFF)
@@ -491,6 +492,9 @@ ELSEIF (ENABLE_ADDRESS_SANITIZER)
491492
ENDIF ()
492493

493494
if (WITH_CUDA)
495+
if(WITH_HIP)
496+
message(FATAL_ERROR "WITH_CUDA=ON incompatible with WITH_HIP=ON")
497+
endif()
494498
find_package(CUDA 11.0 REQUIRED)
495499
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
496500
if (WITH_TENSOR_PARALLEL)
@@ -679,6 +683,94 @@ if (WITH_CUDA)
679683
)
680684

681685

686+
elseif(WITH_HIP)
  # AMD GPU backend built with the CMake HIP language.
  # The files listed in CUDA_SOURCES are compiled as HIP, the AWQ ops are
  # removed from the build, and CT2_WITH_CUDA is still defined in addition to
  # CT2_USE_HIP.

  # Reject unsupported option combinations before probing the toolchain.
  if(WITH_TENSOR_PARALLEL)
    message(FATAL_ERROR "WITH_HIP=ON incompatible with WITH_TENSOR_PARALLEL=ON")
  endif()
  if(WITH_FLASH_ATTN)
    message(FATAL_ERROR "WITH_HIP=ON incompatible with WITH_FLASH_ATTN=ON")
  endif()

  enable_language(HIP)
  set(CMAKE_CXX_STANDARD_REQUIRED ON)
  message(STATUS "HIP Compiler: ${CMAKE_HIP_COMPILER}")
  message(STATUS "CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")

  # ROCm installation prefix: the ROCM_PATH environment variable when set,
  # /opt/rocm otherwise. The environment value is converted to a CMake-style
  # path (forward slashes) because it is concatenated into include and library
  # paths below and may be a native Windows path.
  if(NOT DEFINED ENV{ROCM_PATH})
    set(ROCM_PATH /opt/rocm)
  else()
    file(TO_CMAKE_PATH "$ENV{ROCM_PATH}" ROCM_PATH)
  endif()
  list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}")

  find_package(hiprand REQUIRED)
  find_package(hipblas REQUIRED)
  find_package(rocprim REQUIRED)
  find_package(rocthrust REQUIRED)
  find_package(hipcub REQUIRED)

  # The AWQ ops are not built with HIP: drop their generic, CPU and GPU
  # sources (include/ctranslate2/ops/ops.h skips the matching headers when
  # CT2_USE_HIP is defined).
  list(REMOVE_ITEM SOURCES
    src/ops/awq/dequantize.cc
    src/ops/awq/dequantize_cpu.cc
    src/ops/awq/gemm.cc
    src/ops/awq/gemm_cpu.cc
    src/ops/awq/gemv.cc
    src/ops/awq/gemv_cpu.cc
    )
  list(REMOVE_ITEM CUDA_SOURCES
    src/ops/awq/gemm_gpu.cu
    src/ops/awq/gemv_gpu.cu
    src/ops/awq/dequantize_gpu.cu
    )

  # Compile the .cu files as HIP and pin the CPU-only sources to plain C++.
  set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE HIP)
  set_source_files_properties(
    src/cpu/allocator.cc
    src/cpu/backend.cc
    src/cpu/cpu_info.cc
    src/cpu/cpu_isa.cc
    src/cpu/kernels.cc
    src/cpu/parallel.cc
    src/cpu/primitives.cc
    src/ops/alibi_add_cpu.cc
    src/ops/bias_add_cpu.cc
    src/ops/concat_split_slide_cpu.cc
    src/ops/conv1d_cpu.cc
    src/ops/dequantize_cpu.cc
    src/ops/gather_cpu.cc
    src/ops/gumbel_max_cpu.cc
    src/ops/layer_norm_cpu.cc
    src/ops/mean_cpu.cc
    src/ops/median_filter_cpu.cc
    src/ops/multinomial_cpu.cc
    src/ops/quantize_cpu.cc
    src/ops/rms_norm_cpu.cc
    src/ops/rotary_cpu.cc
    src/ops/softmax_cpu.cc
    src/ops/tile_cpu.cc
    src/ops/topk_cpu.cc
    src/ops/topp_mask_cpu.cc
    src/ops/nccl_ops_cpu.cc
    PROPERTIES LANGUAGE CXX
    )

  # NOTE(review): directory-scoped search path kept from the original in case
  # other targets in this directory rely on it; the library below no longer
  # needs it because every ROCm dependency is linked as an imported target.
  link_directories(${ROCM_PATH}/lib)

  add_definitions(-DCT2_WITH_CUDA)
  add_definitions(-DCT2_USE_HIP)

  add_library(${PROJECT_NAME}
    SHARED
    ${SOURCES}
    ${CUDA_SOURCES}
    )

  add_compile_definitions(__HIP_PLATFORM_AMD__)
  add_compile_definitions(__HIP_PLATFORM_HCC__)

  # PROJECT_SOURCE_DIR (not CMAKE_SOURCE_DIR) keeps these paths valid when the
  # project is added as a subdirectory of another build. The stray absolute
  # "/include" entry of the original list is dropped.
  target_include_directories(${PROJECT_NAME}
    PRIVATE
    ${PROJECT_SOURCE_DIR}
    ${PROJECT_SOURCE_DIR}/include
    ${ROCM_PATH}/include
    )

  # hiprand is linked through its imported target, like the other ROCm
  # libraries, instead of by bare library name.
  target_link_libraries(${PROJECT_NAME}
    PRIVATE
    hip::hiprand
    roc::hipblas
    roc::rocprim
    roc::rocthrust
    hip::hipcub
    )

  set_target_properties(${PROJECT_NAME} PROPERTIES LINKER_LANGUAGE CXX)
772+
773+
682774
elseif(WITH_CUDNN)
683775
message(FATAL_ERROR "WITH_CUDNN=ON requires WITH_CUDA=ON")
684776
else()

docker/Dockerfile_rocm

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
FROM rocm/dev-ubuntu-22.04:7.2 AS builder
2+
3+
RUN apt-get update && \
4+
apt-get install -y --no-install-recommends \
5+
rocm-hip-runtime-dev \
6+
hipblas-common-dev \
7+
hipblas-dev \
8+
hipcub-dev \
9+
hiprand-dev \
10+
rocprim-dev \
11+
rocrand-dev \
12+
rocthrust-dev \
13+
python3-dev \
14+
python3-pip \
15+
wget \
16+
&& \
17+
apt-get clean && \
18+
rm -rf /var/lib/apt/lists/*
19+
20+
WORKDIR /root
21+
22+
ENV ONEAPI_VERSION=2025.3
23+
RUN wget -q https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \
24+
apt-key add *.PUB && \
25+
rm *.PUB && \
26+
echo "deb https://apt.repos.intel.com/oneapi all main" > /etc/apt/sources.list.d/oneAPI.list && \
27+
apt-get update && \
28+
apt-get install -y --no-install-recommends \
29+
intel-oneapi-mkl-devel-$ONEAPI_VERSION \
30+
&& \
31+
apt-get clean && \
32+
rm -rf /var/lib/apt/lists/*
33+
34+
RUN python3 -m pip --no-cache-dir install cmake==3.22.*
35+
36+
ENV ONEDNN_VERSION=3.10.2
37+
RUN wget -q https://github.com/oneapi-src/oneDNN/archive/refs/tags/v${ONEDNN_VERSION}.tar.gz && \
38+
tar xf *.tar.gz && \
39+
rm *.tar.gz && \
40+
cd oneDNN-* && \
41+
cmake -DCMAKE_BUILD_TYPE=Release -DONEDNN_LIBRARY_TYPE=STATIC -DONEDNN_BUILD_EXAMPLES=OFF -DONEDNN_BUILD_TESTS=OFF -DONEDNN_ENABLE_WORKLOAD=INFERENCE -DONEDNN_ENABLE_PRIMITIVE="CONVOLUTION;REORDER" -DONEDNN_BUILD_GRAPH=OFF . && \
42+
make -j$(nproc) install && \
43+
cd .. && \
44+
rm -r oneDNN-*
45+
46+
ENV OPENMPI_VERSION=4.1.6
47+
RUN wget -q https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.bz2 && \
48+
tar xf *.tar.bz2 && \
49+
rm *.tar.bz2 && \
50+
cd openmpi-* && \
51+
./configure && \
52+
make -j$(nproc) install && \
53+
cd .. && \
54+
rm -r openmpi-*
55+
56+
COPY third_party third_party
57+
COPY cli cli
58+
COPY include include
59+
COPY src src
60+
COPY cmake cmake
61+
COPY python python
62+
COPY CMakeLists.txt .
63+
64+
ARG CXX_FLAGS
65+
ENV CXX_FLAGS=${CXX_FLAGS:-"-msse4.1 -O3 -Wno-deprecated-literal-operator"}
66+
ARG HIP_FLAGS
67+
ENV HIP_FLAGS=${HIP_FLAGS:-"-O3 -Wno-deprecated-literal-operator"}
68+
ARG HIP_ARCHITECTURES
69+
ENV HIP_ARCHITECTURES=${HIP_ARCHITECTURES:-"gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"}
70+
ENV CTRANSLATE2_ROOT=/opt/ctranslate2
71+
ENV LD_LIBRARY_PATH=/usr/local/lib/:${LD_LIBRARY_PATH}
72+
73+
RUN mkdir build_tmp && \
74+
cd build_tmp && \
75+
cmake -DCMAKE_INSTALL_PREFIX=${CTRANSLATE2_ROOT} -DCMAKE_C_COMPILER=amdclang -DCMAKE_CXX_COMPILER=amdclang++ \
76+
-DWITH_HIP=ON -DWITH_MKL=ON -DWITH_DNNL=ON -DOPENMP_RUNTIME=COMP \
77+
-DCMAKE_HIP_ARCHITECTURES="${HIP_ARCHITECTURES}" \
78+
-DGPU_TARGETS="${HIP_ARCHITECTURES}" -DCMAKE_BUILD_TYPE=Release \
79+
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DCMAKE_HIP_FLAGS="${HIP_FLAGS}" \
80+
.. && \
81+
VERBOSE=1 make -j$(nproc) install
82+
83+
ENV LANG=en_US.UTF-8
84+
COPY README.md .
85+
86+
RUN cd python && \
87+
python3 -m pip --no-cache-dir install -r install_requirements.txt && \
88+
python3 setup.py bdist_wheel --dist-dir $CTRANSLATE2_ROOT
89+
90+
FROM rocm/dev-ubuntu-22.04:7.2
91+
92+
RUN apt-get update && \
93+
apt-get install -y --no-install-recommends \
94+
rocm-hip-libraries \
95+
openmpi-bin \
96+
libgomp1 \
97+
python3-pip \
98+
&& \
99+
apt-get clean && \
100+
rm -rf /var/lib/apt/lists/*
101+
102+
ENV CTRANSLATE2_ROOT=/opt/ctranslate2
103+
ENV ROCM_ROOT=/opt/rocm
104+
ENV LD_LIBRARY_PATH=$CTRANSLATE2_ROOT/lib:$ROCM_ROOT/lib/llvm/lib:$LD_LIBRARY_PATH
105+
106+
COPY --from=builder $CTRANSLATE2_ROOT $CTRANSLATE2_ROOT
107+
RUN python3 -m pip --no-cache-dir install $CTRANSLATE2_ROOT/*.whl && \
108+
rm $CTRANSLATE2_ROOT/*.whl
109+
110+
ENTRYPOINT ["/opt/ctranslate2/bin/ct2-translator"]

docker/build_all.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ cd $ROOT_DIR
2020

2121
VERSION=${1:-latest}
2222
PUSH=${2:-0}
23+
GPU=${3:-cuda}
2324
IMAGE=ghcr.io/opennmt/ctranslate2
2425

2526
build()
@@ -42,4 +43,8 @@ build()
4243
fi
4344
}
4445

45-
build Dockerfile ubuntu22.04-cuda12.2
46+
if [ "$GPU" == "rocm" ]; then
47+
build Dockerfile_rocm ubuntu22.04-rocm7.2
48+
else
49+
build Dockerfile ubuntu22.04-cuda12.8
50+
fi

include/ctranslate2/ops/ops.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@
4040
#include "slide.h"
4141
#include "nccl_ops.h"
4242
#include "flash_attention.h"
43+
#ifndef CT2_USE_HIP
4344
#include "awq/gemm.h"
4445
#include "awq/gemv.h"
4546
#include "awq/dequantize_awq.h"
47+
#endif
4648
#include "sum.h"

python/ctranslate2/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
add_dll_directory = getattr(os, "add_dll_directory", None)
2222
if add_dll_directory is not None:
    add_dll_directory(package_dir)
    # The ROCm runtime DLLs are looked up in directories that sit next to the
    # package directory. NOTE(review): presumably these only exist when the
    # ROCm SDK Python packages are installed - confirm. add_dll_directory
    # raises on a path that does not exist, so only register the directories
    # that are actually present; otherwise importing the package would fail
    # on Windows installations without ROCm.
    for rocm_package in ("_rocm_sdk_core", "_rocm_sdk_libraries_custom"):
        rocm_bin_dir = os.path.normpath(
            os.path.join(package_dir, os.pardir, rocm_package, "bin")
        )
        if os.path.isdir(rocm_bin_dir):
            add_dll_directory(rocm_bin_dir)
2426

2527
for library in glob.glob(os.path.join(package_dir, "*.dll")):
2628
ctypes.CDLL(library)

0 commit comments

Comments
 (0)