@@ -15,6 +15,15 @@ namespace at {
                                                const T* beta,
                                                T* Y);

+    template <typename T, typename SizeT>
+    __global__ void LayerNormAxisForwardCUDAKernel(SizeT N,
+                                                   SizeT inner_size,
+                                                   float eps,
+                                                   const T* X,
+                                                   const T* gamma,
+                                                   const T* beta,
+                                                   T* Y);
+
   }
 }

@@ -30,19 +39,29 @@ namespace ctranslate2 {
                             const dim_t axis,
                             const dim_t outer_size,
                             const dim_t axis_size,
-                            const dim_t,
+                            const dim_t inner_size,
                             StorageView& output) const {
-      if (axis != input.rank() - 1 || !beta || !gamma)
-        throw std::invalid_argument("Generalized LayerNorm is currently not implemented on GPU");
-
-      at::native::LayerNormForwardCUDAKernel<cuda::device_type<T>, cuda::index_t>
-        <<<outer_size, CUDA_NUM_THREADS, 0, cuda::get_cuda_stream()>>>(
-          axis_size,
-          _epsilon,
-          cuda::device_cast(input.data<T>()),
-          cuda::device_cast(gamma->data<T>()),
-          cuda::device_cast(beta->data<T>()),
-          cuda::device_cast(output.data<T>()));
+      if (axis == input.rank() - 1) {
+        at::native::LayerNormForwardCUDAKernel<cuda::device_type<T>, cuda::index_t>
+          <<<outer_size, CUDA_NUM_THREADS, 0, cuda::get_cuda_stream()>>>(
+            axis_size,
+            _epsilon,
+            cuda::device_cast(input.data<T>()),
+            gamma ? cuda::device_cast(gamma->data<T>()) : nullptr,
+            beta ? cuda::device_cast(beta->data<T>()) : nullptr,
+            cuda::device_cast(output.data<T>()));
+      } else {
+        const dim_t blocks = std::min(outer_size * inner_size, cuda::max_blocks);
+        at::native::LayerNormAxisForwardCUDAKernel<cuda::device_type<T>, cuda::index_t>
+          <<<blocks, CUDA_NUM_THREADS, 0, cuda::get_cuda_stream()>>>(
+            axis_size,
+            inner_size,
+            _epsilon,
+            cuda::device_cast(input.data<T>()),
+            gamma ? cuda::device_cast(gamma->data<T>()) : nullptr,
+            beta ? cuda::device_cast(beta->data<T>()) : nullptr,
+            cuda::device_cast(output.data<T>()));
+      }
     }

 #define DECLARE_IMPL(T) \
@@ -181,7 +200,55 @@ namespace at {

       for (SizeT j = threadIdx.x; j < N; j += blockDim.x) {
         const SizeT index = i * N + j;
-        Y[index] = (float(X[index]) - s_mean) * s_variance * float(gamma[j]) + float(beta[j]);
+        const float gamma_v = gamma == nullptr ? float(1) : float(gamma[j]);
+        const float beta_v = beta == nullptr ? float(0) : float(beta[j]);
+        Y[index] = T((float(X[index]) - s_mean) * s_variance * gamma_v + beta_v);
+      }
+    }
+
+    template <typename T, typename SizeT>
+    __global__ void LayerNormAxisForwardCUDAKernel(SizeT N,
+                                                   SizeT inner_size,
+                                                   float eps,
+                                                   const T* X,
+                                                   const T* gamma,
+                                                   const T* beta,
+                                                   T* Y) {
+      typedef cub::BlockReduce<float, CUDA_NUM_THREADS> BlockReduce;
+      __shared__ typename BlockReduce::TempStorage m_temp_storage;
+      __shared__ typename BlockReduce::TempStorage v_temp_storage;
+      __shared__ float s_mean;
+      __shared__ float s_variance;
+
+      // Each block normalizes one (outer, inner) position over the axis of length N.
+      const SizeT i = blockIdx.x / inner_size;
+      const SizeT j = blockIdx.x % inner_size;
+
+      float sum1 = 0;
+      float sum2 = 0;
+      for (SizeT k = threadIdx.x; k < N; k += blockDim.x) {
+        const SizeT index = (i * N + k) * inner_size + j;
+        sum1 += float(X[index]);
+        sum2 += float(X[index]) * float(X[index]);
+      }
+      sum1 = BlockReduce(m_temp_storage).Sum(sum1);
+      sum2 = BlockReduce(v_temp_storage).Sum(sum2);
+      if (threadIdx.x == 0) {
+        const float scale = float(1) / float(N);
+        sum1 *= scale;
+        sum2 = fmaxf(sum2 * scale - sum1 * sum1, float(0));
+        s_mean = sum1;
+        s_variance = rsqrtf(sum2 + eps);
+      }
+
+      __syncthreads();
+
+      for (SizeT k = threadIdx.x; k < N; k += blockDim.x) {
+        const SizeT index = (i * N + k) * inner_size + j;
+        // gamma and beta hold one value per position along the normalized axis, so index by k.
+        const float gamma_v = gamma == nullptr ? float(1) : float(gamma[k]);
+        const float beta_v = beta == nullptr ? float(0) : float(beta[k]);
+        Y[index] = T((float(X[index]) - s_mean) * s_variance * gamma_v + beta_v);
       }
     }

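For reference, below is a minimal CPU sketch (not part of this commit; the helper name layer_norm_axis_reference and its signature are made up for illustration) of the normalization and indexing scheme the new axis kernel implements: the tensor is viewed as [outer_size, axis_size, inner_size], each (outer, inner) pair is normalized independently over the axis dimension, and gamma/beta carry one value per axis position.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical CPU reference for LayerNorm over an arbitrary axis.
// x and y are flat buffers of size outer_size * axis_size * inner_size.
// gamma and beta have axis_size elements each and may be nullptr.
void layer_norm_axis_reference(std::size_t outer_size,
                               std::size_t axis_size,
                               std::size_t inner_size,
                               float eps,
                               const std::vector<float>& x,
                               const float* gamma,
                               const float* beta,
                               std::vector<float>& y) {
  for (std::size_t i = 0; i < outer_size; ++i) {
    for (std::size_t j = 0; j < inner_size; ++j) {
      // Accumulate sum and sum of squares over the normalized axis.
      float sum = 0.f;
      float sum_sq = 0.f;
      for (std::size_t k = 0; k < axis_size; ++k) {
        const float v = x[(i * axis_size + k) * inner_size + j];
        sum += v;
        sum_sq += v * v;
      }
      const float mean = sum / axis_size;
      const float variance = std::max(sum_sq / axis_size - mean * mean, 0.f);
      const float rstd = 1.f / std::sqrt(variance + eps);
      // Normalize, then apply the optional per-axis scale and shift.
      for (std::size_t k = 0; k < axis_size; ++k) {
        const std::size_t index = (i * axis_size + k) * inner_size + j;
        const float g = gamma ? gamma[k] : 1.f;
        const float b = beta ? beta[k] : 0.f;
        y[index] = (x[index] - mean) * rstd * g + b;
      }
    }
  }
}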