
Commit 444b38b

QuentinFuxa and claude committed
Fix self-attention for decoder-only models and add set_alignment_heads API
Self-attention was not returning attention weights for decoder-only models (Generator) because the attention pointer was always nullptr in TransformerDecoderLayer. The attention pointer is now passed to self-attention when there is no encoder attention (the decoder-only case).

Also adds set_alignment_heads() to the Generator Python API, allowing users to select specific (layer, head) pairs instead of the default (last layer, head 0). The attention from the selected heads is concatenated in the output and can be reshaped to (num_heads, context_length).

Fixes multi-head attention handling in decoding.cc to support variable-rank attention tensors (rank 3 for multi-head vs. rank 2 for averaged).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

1 parent 948f8f2 commit 444b38b

8 files changed

Lines changed: 76 additions & 11 deletions
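
A rough usage sketch of the API described in the commit message (not part of this diff; it assumes a build of this fork where generate_batch exposes a return_attention option and the generation result carries an attention field, as the commit describes — the model path, prompt tokens, and option names below are placeholders):

    import numpy as np
    import ctranslate2

    generator = ctranslate2.Generator("model_dir", device="cpu")  # placeholder model path

    # Collect three specific (layer, head) pairs instead of the default (last layer, head 0).
    alignment_heads = [(31, 0), (31, 3), (33, 7)]
    generator.set_alignment_heads(alignment_heads)

    results = generator.generate_batch(
        [["<s>", "Hello"]],      # placeholder prompt tokens
        max_length=16,
        return_attention=True,   # option assumed to exist in this fork
    )

    # Each generated step yields num_heads * context_length floats
    # (the selected heads concatenated); reshape to (num_heads, context_length).
    for step_attention in results[0].attention[0]:
        per_head = np.asarray(step_attention).reshape(len(alignment_heads), -1)
        print(per_head.shape)
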


include/ctranslate2/layers/decoder.h

Lines changed: 5 additions & 0 deletions
@@ -20,6 +20,11 @@ namespace ctranslate2 {
     public:
       Decoder(Device device);
 
+      // Configure which attention heads to collect when return_attention is enabled.
+      virtual void set_alignment_heads(const std::vector<std::pair<dim_t, dim_t>>& alignment_heads) {
+        (void)alignment_heads;
+      }
+
       virtual DecoderState initial_state(bool iterative_decoding = true) const = 0;
 
       // Forwards one step.

include/ctranslate2/layers/transformer.h

Lines changed: 1 addition & 1 deletion
@@ -185,7 +185,7 @@ namespace ctranslate2 {
                  StorageView* attention = nullptr) override;
 
       void set_alignment_heads(const dim_t layer, const dim_t num_heads_to_average);
-      void set_alignment_heads(const std::vector<std::pair<dim_t, dim_t>>& alignment_heads);
+      void set_alignment_heads(const std::vector<std::pair<dim_t, dim_t>>& alignment_heads) override;
 
       std::unique_ptr<StorageView>
       get_layer_alignment_heads(const dim_t layer, const dim_t batch_size) const;

include/ctranslate2/models/language_model.h

Lines changed: 8 additions & 0 deletions
@@ -58,6 +58,12 @@ namespace ctranslate2 {
                         const StorageView& lengths,
                         const bool return_log_probs);
 
+      // Configure which attention heads to collect when return_attention is enabled.
+      // Each pair is (layer_index, head_index).
+      virtual void set_alignment_heads(const std::vector<std::pair<dim_t, dim_t>>& alignment_heads) {
+        (void)alignment_heads;
+      }
+
     protected:
       virtual bool skip_scoring(const std::vector<std::string>& tokens,
                                 const ScoringOptions& options,
@@ -89,6 +95,8 @@ namespace ctranslate2 {
       DecoderReplica(const std::shared_ptr<const LanguageModel>& model,
                      std::unique_ptr<layers::Decoder> decoder);
 
+      void set_alignment_heads(const std::vector<std::pair<dim_t, dim_t>>& alignment_heads) override;
+
     protected:
       bool skip_scoring(const std::vector<std::string>& tokens,
                         const ScoringOptions& options,

include/ctranslate2/replica_pool.h

Lines changed: 9 additions & 0 deletions
@@ -152,6 +152,15 @@ namespace ctranslate2 {
       return worker.replica();
     }
 
+    // Apply a function to each replica. Not thread-safe.
+    template <typename Func>
+    void for_each_replica(Func func) {
+      for (size_t i = 0; i < num_replicas(); ++i) {
+        auto& worker = static_cast<ReplicaWorker<Replica>&>(_thread_pool->get_worker(i));
+        func(worker.replica());
+      }
+    }
+
   protected:
     template <typename Result, typename Func>
     std::vector<std::future<Result>>

python/cpp/generator.cc

Lines changed: 23 additions & 0 deletions
@@ -11,6 +11,12 @@ namespace ctranslate2 {
     public:
       using ReplicaPoolHelper::ReplicaPoolHelper;
 
+      void set_alignment_heads(const std::vector<std::pair<dim_t, dim_t>>& alignment_heads) {
+        _pool->for_each_replica([&](models::SequenceGeneratorReplica& replica) {
+          replica.set_alignment_heads(alignment_heads);
+        });
+      }
+
       std::variant<std::vector<GenerationResult>,
                    std::vector<AsyncResult<GenerationResult>>>
       generate_batch(const BatchTokens& tokens,
@@ -185,6 +191,23 @@ namespace ctranslate2 {
         .def_property_readonly("num_active_batches", &GeneratorWrapper::num_active_batches,
                                "Number of batches waiting to be processed or currently processed.")
 
+        .def("set_alignment_heads", &GeneratorWrapper::set_alignment_heads,
+             py::arg("alignment_heads"),
+             R"pbdoc(
+                 Configure which attention heads to collect when ``return_attention=True``.
+
+                 By default, only head 0 of the last layer is returned (averaged).
+                 Use this method to select specific (layer, head) pairs. The attention
+                 from the selected heads will be concatenated in the output.
+
+                 Arguments:
+                     alignment_heads: List of (layer_index, head_index) pairs to collect.
+
+                 Example:
+
+                     >>> generator.set_alignment_heads([(31, 0), (31, 3), (33, 7)])
+             )pbdoc")
+
         .def("generate_batch", &GeneratorWrapper::generate_batch,
              py::arg("start_tokens"),
             py::kw_only(),

src/decoding.cc

Lines changed: 22 additions & 7 deletions
@@ -146,13 +146,23 @@ namespace ctranslate2 {
       if (!history)
         return {};
 
-      const auto source_length = history.dim(-1);
+      // For averaged attention: history is (batch, beam, steps, ctx)
+      // For per-head attention: history is (batch, beam, steps, heads, ctx)
+      // Compute total floats per time step (ctx or heads*ctx).
+      dim_t step_size = 1;
+      for (dim_t d = 3; d < history.rank(); ++d)
+        step_size *= history.dim(d);
 
       std::vector<std::vector<float>> attention;
       attention.reserve(end - start);
+      // Compute stride for the steps dimension: step_size floats per step.
+      // Base offset for (batch, beam) = batch * (beam_stride) + beam * (steps * step_size).
+      const dim_t steps = history.dim(2);
+      const dim_t beam_stride = steps * step_size;
+      const float* base = history.data<float>() + batch * history.dim(1) * beam_stride + beam * beam_stride;
       for (dim_t t = start; t < end; ++t) {
-        const auto* vector = history.index<float>({batch, beam, t, 0});
-        attention.emplace_back(vector, vector + source_length);
+        const float* vector = base + t * step_size;
+        attention.emplace_back(vector, vector + step_size);
       }
       return attention;
     }
@@ -911,8 +921,11 @@ namespace ctranslate2 {
             && (return_prefix || step >= prefix_length)) {
           results[batch_id].hypotheses[0].push_back(word_id);
           if (attention_step) {
-            const auto* attn = attention_step.index<float>({i, 0});
-            results[batch_id].attention[0].emplace_back(attn, attn + attention_step.dim(-1));
+            // For averaged attention: shape (batch, ctx) -> take ctx floats
+            // For per-head attention: shape (batch, heads, ctx) -> take heads*ctx floats
+            const dim_t attn_size = attention_step.size() / attention_step.dim(0);
+            const auto* attn = attention_step.data<float>() + i * attn_size;
+            results[batch_id].attention[0].emplace_back(attn, attn + attn_size);
           }
         }
 
@@ -1166,9 +1179,11 @@ namespace ctranslate2 {
         if (options.return_attention) {
           if (attention.device() != Device::CPU)
             attention = attention.to_float32().to(Device::CPU);
+          // Compute floats per time step (ctx or heads*ctx for multi-head).
+          const dim_t step_size = attention.size() / (attention.dim(0) * attention.dim(1));
          for (dim_t t = 0; t < prefix_length; ++t) {
-            const float* vector = attention.index<float>({0, t, 0});
-            result.attention[i].emplace_back(vector, vector + attention.dim(-1));
+            const float* vector = attention.data<float>() + t * step_size;
+            result.attention[i].emplace_back(vector, vector + step_size);
          }
        }
      }
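
The change above replaces history.index<float>() with explicit strides so that a trailing (heads, ctx) block is copied per time step whatever the tensor rank. A toy numpy check of that arithmetic (not part of the repository; shapes are arbitrary, and a row-major layout like StorageView's is assumed):

    import numpy as np

    # Per-head attention history: (batch, beam, steps, heads, ctx).
    batch, beam, steps, heads, ctx = 2, 1, 4, 3, 5
    history = np.random.rand(batch, beam, steps, heads, ctx).astype(np.float32)

    step_size = heads * ctx          # floats per time step (just ctx in the averaged case)
    beam_stride = steps * step_size  # floats per beam entry
    flat = history.reshape(-1)

    b, k, t = 1, 0, 2
    base = b * beam * beam_stride + k * beam_stride
    vector = flat[base + t * step_size : base + (t + 1) * step_size]

    # The flat slice matches the (heads, ctx) block for that (batch, beam, step).
    assert np.array_equal(vector, history[b, k, t].reshape(-1))
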

src/layers/transformer.cc

Lines changed: 3 additions & 3 deletions
@@ -222,7 +222,7 @@ namespace ctranslate2 {
                        context,
                        cached_self_attn_keys,
                        cached_self_attn_values,
-                       nullptr,
+                       _encoder_attention ? nullptr : attention,
                        input_padder,
                        input_padder,
                        true,
@@ -291,7 +291,7 @@ namespace ctranslate2 {
                        attn,
                        cached_self_attn_keys,
                        cached_self_attn_values,
-                       nullptr,
+                       _encoder_attention ? nullptr : attention,
                        input_padder,
                        input_padder,
                        true,
@@ -315,7 +315,7 @@ namespace ctranslate2 {
                        output,
                        cached_self_attn_keys,
                        cached_self_attn_values,
-                       nullptr,
+                       _encoder_attention ? nullptr : attention,
                        input_padder,
                        input_padder,
                        true,

src/models/language_model.cc

Lines changed: 5 additions & 0 deletions
@@ -110,6 +110,11 @@ namespace ctranslate2 {
     {
     }
 
+    void DecoderReplica::set_alignment_heads(
+        const std::vector<std::pair<dim_t, dim_t>>& alignment_heads) {
+      _decoder->set_alignment_heads(alignment_heads);
+    }
+
     std::vector<ScoringResult>
     DecoderReplica::run_scoring(const std::vector<std::vector<std::string>>& tokens,
                                 const ScoringOptions& options) {
