@@ -866,8 +866,8 @@ def __init__(self, N, d_in, d_out, d=1, activation='id', debug=False):
         self.f_input = self.Y
 
     def __repr__(self):
-        return '%s(%s, %s, %s, activation=%s)' % \
-            (type(self).__name__, self.N, self.d_in,
+        return '%s(%s, %s, %s, %s, activation=%s)' % \
+            (type(self).__name__, self.N, self.d_in, self.d,
             self.d_out, repr(self.activation))
 
     def reset(self):
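Note: Dense.__repr__ now also reports d, the number of positions the layer iterates over (1 by default). In plaintext terms, a Dense with d > 1 applies the same weight matrix at every position of an extra middle dimension; a minimal numpy sketch (illustrative names, not the MP-SPDZ API):

    import numpy as np

    N, d, d_in, d_out = 2, 16, 64, 32      # batch, loop dim (e.g. seq_len), features
    X = np.random.randn(N, d, d_in)
    W = np.random.randn(d_in, d_out)
    b = np.random.randn(d_out)
    Y = X @ W + b                          # same W applied at each of the d positions
    assert Y.shape == (N, d, d_out)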
@@ -1755,7 +1755,7 @@ class LayerNorm(Layer): # Changed class name
     thetas = lambda self: (self.weights, self.bias)
     nablas = lambda self: (self.nabla_weights, self.nabla_bias)
 
-    def __init__(self, shape, approx=False, layernorm_eps=None, args=None):
+    def __init__(self, shape, approx=True, layernorm_eps=None, args=None):
         if len(shape) == 2:
             shape = [shape[0], 1, shape[1]]  # Not sure why this extra dimension is added
         tensors = (Tensor(shape, sfix) for i in range(4))
@@ -1810,6 +1810,7 @@ def _(*arg):
             tmp = self.weights[:] * (sel_X[:] - mu_sel) * fac_sel  # Removed self.mu reference
             sel_Y[:] = self.bias[:] + tmp
 
+    @_layer_method_call_tape
     def forward(self, batch, training=False):
         d = self.X.sizes[1]
         d_in = self.X.sizes[2]
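Note: approx=True switches the inverse square root inside LayerNorm to the cheaper approximation by default. For reference, the operation implemented by the surrounding code is, in plaintext numpy terms:

    import numpy as np

    def layer_norm(x, weights, bias, eps=1e-12):
        mu = x.mean(axis=-1, keepdims=True)
        var = ((x - mu) ** 2).mean(axis=-1, keepdims=True)
        fac = 1.0 / np.sqrt(var + eps)   # the rsqrt that approx=True approximates
        return weights * (x - mu) * fac + bias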
@@ -2670,7 +2671,7 @@ class BertBase(BaseLayer, FixBase):
 class BertPooler(BertBase):
 
     thetas = lambda self: self.dense.thetas()
-    nablas = lambda self: self.dense.nablas() # refer to downstream layers?
+    nablas = lambda self: self.dense.nablas()
 
     def __init__(self, n_examples, seq_len, hidden_state):
         input_shape = [n_examples, seq_len, hidden_state]
@@ -2679,28 +2680,19 @@ def __init__(self, n_examples, seq_len, hidden_state):
         self.dense = Dense(n_examples, hidden_state, hidden_state)
         self.activation = Tanh(output_shape)
 
-        self.d_out = hidden_state
-
-
-    def _forward(self, batch):
-        # self.dense.X.address = self.X.address
         self.activation.X.address = self.dense.Y.address
         self.activation.Y.address = self.Y.address
 
-        # grab the first repr?
+        self.d_out = hidden_state
+
+    def _forward(self, batch):
         # batch contains [n_batch, n_heads, n_dim]
         @for_range(len(batch))
         def _(j):
             self.dense.X[j][:] = self.X[batch[j]][0][:]
 
-        # if self.debug_output:
-        #     print_ln("forward layer pooler.dense X %s", self.dense.X.reveal_nested())
-
         self.dense.forward(batch)
-        # print_ln("LINEAR Layer weights after bertpooler.dense: %s", self.opt.layers[-2].W.reveal_nested())
-
-        self.activation._forward(batch)
-        # print_ln("LINEAR Layer weights after bertpooler.activation: %s", self.opt.layers[-2].W.reveal_nested())
+        self.activation.forward(batch)
 
     def reset(self):
         self.dense.reset()
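Note: the pooler uses only the first token ([CLS]) of each sequence, which is what self.X[batch[j]][0] selects, followed by a dense layer and tanh. A plaintext numpy equivalent (illustrative names):

    import numpy as np

    def bert_pooler(hidden_states, W, b):     # hidden_states: (batch, seq_len, hidden)
        first_token = hidden_states[:, 0, :]  # [CLS] position only
        return np.tanh(first_token @ W + b)   # (batch, hidden)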
@@ -2767,44 +2759,28 @@ def __init__(self, n_examples, seq_len, hidden_state, intermediate_size, num_att
         self.intermediate = BertIntermediate(internal_shape, hidden_state, intermediate_size, seq_len)
         self.output = BertOutput(internal_shape, intermediate_size, hidden_state, seq_len, dropout, layernorm_eps, rsqrt_approx)
 
-        self.hidden_state = sfix.Tensor(input_shape)  # TODO: Could also make this smaller
-        # self.nabla_hidden_state = sfix.Tensor(input_shape)
-        # self.nabla_hidden_state.alloc()
-
-        # self.X.address = self.multi_head_attention.X.address
-        # self.Y.address = self.output.Y.address
-
         self.d_out = hidden_state
 
-        print("Init BertLayer", input_shape, output_shape)
-
+    @_layer_method_call_tape
     def forward(self, batch, training=False):
         if batch is None:
             batch = Array.create_from(regint(0))
 
         self.multi_head_attention._X.address = self.X.address
         self.output.Y.address = self.Y.address
-        self.hidden_state.address = self.X.address
-        # self.multi_head_attention.Y.address = self.Y.address
-
-        self.multi_head_attention.forward(batch, self.hidden_state, training)
-        # if self.debug_output:
-        #     print_ln("our layer X %s %s", self.X[0][0][0].reveal(), self.output.X[0][0][0].reveal())
 
+        self.multi_head_attention.forward(batch, self.X, training)
         if self.debug_output:
             print_ln("forward layer multi_head_attention %s %s", self.multi_head_attention.Y[0][1][0].reveal(), sum(sum(self.multi_head_attention.Y[0].reveal())))
             # print_ln("forward layer multi_head_attention full %s", self.multi_head_attention.Y.reveal())
 
-        print("Forward Attention")
-
         batch_inc = regint.Array(len(batch))
         batch_inc.assign(regint.inc(len(batch)))
         self.intermediate.X.address = self.multi_head_attention.Y.address
         self.intermediate.forward(batch_inc)
 
         if self.debug_output:
             print_ln("forward layer intermediate %s %s %s", self.intermediate.Y.shape, self.intermediate.Y[0][1][0:20].reveal(), sum(sum(self.intermediate.Y[0].reveal())))
-
             print_ln(" ")
 
         self.output.X.address = self.intermediate.Y.address
@@ -2813,14 +2789,7 @@ def forward(self, batch, training=False):
 
         if self.debug_output:
             print_ln("our output %s %s %s %s", self.Y.address, len(self.Y[0].reveal()), self.Y[0][0][0:20].reveal(), sum(sum(self.Y[0].reveal())))
-            # print_ln("our output %s %s %s %s", self.Y.address, len(self.Y[0].reveal()), self.Y[0][0][0:20].reveal(), sum(sum(self.Y[0].reveal())))
-            # print_ln("our output %s %s %s %s", self.Y.address, len(self.Y[0].reveal()), self.Y[0][0][0:20].reveal(), sum(sum(self.Y[0].reveal())))
-
             print_ln("our layer output %s %s %s %s", self.output.Y.address, len(self.Y[0].reveal()), self.output.Y[0][0][0:20].reveal(), sum(sum(self.output.Y[0].reveal())))
-            # print_ln("shapes %s %s", self.Y.sizes, self.output.Y.sizes)
-            # print_ln("types %s %s %s %s %s %s", self.Y.value_type, self.output.Y.value_type, type(self.Y), type(self.output.Y), self, self.output)
-
-        print("Forward BertLayer")
 
     def reset(self):
         self.multi_head_attention.reset()
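Note: the layer now passes self.X directly as the residual input instead of first copying it into a separate hidden_state tensor. The data flow wired up by the address assignments is, as plaintext pseudocode with illustrative names:

    def bert_layer_forward(x):
        attn = multi_head_attention(x)            # intermediate.X aliases attention's Y
        inter = gelu(dense_intermediate(attn))
        return layer_norm(dropout(dense_output(inter)) + attn)  # residual add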
@@ -2864,10 +2833,8 @@ def backward(self, compute_nabla_X=True, batch=None):
         self.output.nabla_Y.address = self.nabla_Y.address
         self.intermediate.nabla_Y.address = self.output.nabla_X.address
         self.multi_head_attention.nabla_Y.address = self.intermediate.nabla_X.address
-        # self.multi_head_attention.nabla_X.address = self.nabla_X.address
 
         nabla_y_multi_head_attention_from_layernorm = self.output.backward(True, batch)
-        # print_ln("Backward BertLayer.output.nabla_X %s", self.output.nabla_X.reveal_nested()[:8])
         self.intermediate.backward(True, batch)
 
         # residual: add it to nabla_Y because the output of multi-head attention also fed the output sub-layer
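Note on the residual handled here: with out = layer_norm(g(a) + a), the attention output a receives gradients along two paths, so the value returned by output.backward for the skip connection must be added to the gradient flowing back through the intermediate layer. As a plaintext sketch:

    def residual_backward(nabla_sum, backward_through_g):
        # gradient of a in out = g(a) + a: branch gradient plus skip gradient
        return backward_through_g(nabla_sum) + nabla_sum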
@@ -2904,7 +2871,11 @@ def __init__(self, n_examples, hidden_size, intermediate_size, seq_len):
         self.dense = Dense(n_examples, hidden_size, intermediate_size, seq_len)
         self.activation = Gelu([n_examples, seq_len, intermediate_size])
 
+        self.dense.X.address = self.X.address
+        self.activation.X.address = self.dense.Y.address
+        self.activation.Y.address = self.Y.address
 
+    @_layer_method_call_tape
     def forward(self, batch=None, training=None):
         self.dense.X.address = self.X.address
         self.activation.X.address = self.dense.Y.address
@@ -2914,16 +2885,14 @@ def forward(self, batch=None, training=None):
 
         if self.debug_output:
             print_ln("forward layer intermediate.dense %s", self.dense.Y[0][0][0:20].reveal())
 
-        self.activation._forward(batch)
+        self.activation.forward(batch)
 
     def reset(self):
         self.dense.reset()
 
     def backward(self, compute_nabla_X=True, batch=None):
         self.activation.nabla_X.alloc()
 
-        # print_ln("Backward BertIntermediate.nabla_X %s", self.nabla_X.reveal_nested()[:8])
-
         self.activation.nabla_Y.address = self.nabla_Y.address
         self.dense.nabla_Y.address = self.activation.nabla_X.address
         self.dense.nabla_X.address = self.nabla_X.address
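Note: the .address assignments used throughout these layers alias two tensors to the same backing memory, so one sub-layer writes its output directly into the next sub-layer's input without a copy. A rough plaintext analogue using numpy aliasing rather than MP-SPDZ memory addresses:

    import numpy as np

    buf = np.empty((3, 4))
    dense_Y = buf                      # "dense.Y"
    activation_X = buf                 # "activation.X" shares the same storage
    dense_Y[:] = 1.0
    assert activation_X[0, 0] == 1.0   # visible through the alias, no copy made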
@@ -2941,13 +2910,13 @@ def __init__(self, n_examples, intermediate_size, hidden_size, seq_len, dropout=
         input_shape = [n_examples, seq_len, intermediate_size]
         output_shape = [n_examples, seq_len, hidden_size]
         self.input_shape = input_shape
-        print("INSTANTIATING BERTOUTPUT with ", input_shape, output_shape, intermediate_size, hidden_size, rsqrt_approx)
         super(BertOutput, self).__init__(input_shape, output_shape)
         self.dense = Dense(n_examples, intermediate_size, hidden_size, seq_len)
         self.layer_norm = LayerNorm(output_shape, layernorm_eps=layernorm_eps, approx=rsqrt_approx)
         self.dropout = Dropout([n_examples, seq_len, hidden_size], alpha=dropout)
 
 
+    @_layer_method_call_tape
     def forward(self, batch, input_tensor, training=False, input_tensor_batch=None):
         # Because input_tensor might be the full training data shape
         self.dense.X.address = self.X.address
@@ -2975,18 +2944,12 @@ def _(base, size):
             self.layer_norm.X.assign_part_vector(
                 self.layer_norm.X.get_part_vector(base, size) +
                 input_tensor.get_part_vector(base, size), base)
-            # if self.debug_output:
-            #     print_ln("input tensor %s", input_tensor.reveal())
-
-        # self.layer_norm.X[:] += input_tensor[:]  # TODO: is it maybe this addition since we take the last value? would be strange
 
         if self.debug_output:
             print_ln("forward layer layer_norm_add %s", self.layer_norm.X[0][0][0:20].reveal())
             print_ln("")
         self.layer_norm.forward(batch)
 
-
-
     def reset(self):
         self.dense.reset()
 
@@ -3049,6 +3012,7 @@ def __init__(self, n_examples, seq_len, hidden_size, num_attention_heads, dropou
         self.nabla_attention_scores = MultiArray([internal_shape, self.num_attention_heads, self.seq_len, self.seq_len], sfix)
         self.nabla_preattention_scores = MultiArray([internal_shape, self.num_attention_heads, self.seq_len, self.seq_len], sfix)
 
+    @_layer_method_call_tape
    def forward(self, batch=None, hidden_state=None, training=None):
        N = len(batch)
 
@@ -3068,32 +3032,24 @@ def forward(self, batch=None, hidden_state=None, training=None):
         inc_batch.assign(regint.inc(N))
 
         if self.debug_output:
-            # print_ln('forward layer wq full %s', self.wq.X.reveal())
             print_ln('forward layer wv %s %s', self.wv.Y[0][0][0:10].reveal(), sum(self.wv.Y[0][0].reveal()))
             print_ln('forward layer hidden_state %s', hidden_state[0][1][0:10].reveal())
-            # print_ln('forward layer wv full %s', self.wv.Y.reveal())
 
-        # max_size = program.budget // self.attention_head_size
         @for_range_opt_multithread(self.n_threads, [N, self.num_attention_heads])
         def _(i, j):
-            # for j in range(self.num_attention_heads):
             query_sub = sfix.Matrix(self.seq_len, self.attention_head_size)  # this is mem inefficient?
             key_sub = sfix.Matrix(self.seq_len, self.attention_head_size)
-            # print(self.wq.Y.shape, "wk Y shape", i, self.attention_head_size, j, self.wq.Y[i], self.wq.Y[i][:])
 
             @for_range_opt(self.seq_len)
             def _(k):
-                # for k in range(self.seq_len):
                 query_sub[k] = self.wq.Y[i][k].get_part_vector(j * self.attention_head_size, self.attention_head_size)
                 key_sub[k] = self.wk.Y[i][k].get_part_vector(j * self.attention_head_size, self.attention_head_size)
 
-            # print_ln("query_sub %s %s", i, j)
             res = query_sub.direct_mul_trans(key_sub)
             self.attention_scores[i].assign_part_vector(res, j)
 
         if self.debug_output:
             print_ln('forward layer attention_scores %s', self.attention_scores[0][0].reveal())
-            # print_ln('forward layer attention_scores full %s', self.attention_scores.reveal())
 
         @for_range_opt_multithread(self.n_threads, [N, self.num_attention_heads, self.seq_len])
         def _(i, j, k):
@@ -3113,7 +3069,6 @@ def _(i, j):
             @for_range_opt([self.seq_len])
             def _(k):
                 value_sub[k] = self.wv.Y[i][k].get_part_vector(j * self.attention_head_size, self.attention_head_size)
-                # value_sub[k] = self.wv.Y[i][k][j * self.attention_head_size:(j + 1) * self.attention_head_size]
 
             res = sfix.Matrix(self.seq_len, self.attention_head_size)
             res.assign_vector(self.dropout.Y[i][j].direct_mul(value_sub))
@@ -3124,13 +3079,7 @@ def _(k):
                 self.context[i][k].assign_part_vector(res[k],
                     j * self.attention_head_size
                 )
-            # for k in range(self.seq_len):
-            #     self.context[i][k][j * self.attention_head_size:(j + 1) * self.attention_head_size] = res[k * self.attention_head_size:(k + 1) * self.attention_head_size]
-
-        # How to transfer to forward?
 
-        # missing half of the values?
-        # print_ln('forward layer old_context %s', self.old_context[0].get_vector().reveal())
         if self.debug_output:
             print_ln('forward layer multiheadattention before internal output %s', self.context[0][0][0:20].get_vector().reveal())
 
@@ -3142,8 +3091,6 @@ def _(k):
             print_ln('forward multiheadattention output %s', self.output.Y[0][0][0:20].reveal())
             print_ln("")
 
-        # return context
-
     def reset(self):
         self.wq.reset()
         self.wk.reset()
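Note: per example i and head j, the loops above slice the head's columns out of the Q/K/V projections, compute scores with direct_mul_trans, apply softmax and dropout (in code between the hunks shown), multiply by the value slice, and write the result into the matching columns of context. One head in plaintext numpy (scaling and dropout omitted, as they happen in code this diff does not show):

    import numpy as np

    def attention_head(Q, K, V, j, head_size):       # Q, K, V: (seq_len, hidden)
        q = Q[:, j * head_size:(j + 1) * head_size]  # query_sub
        k = K[:, j * head_size:(j + 1) * head_size]  # key_sub
        v = V[:, j * head_size:(j + 1) * head_size]  # value_sub
        scores = q @ k.T                             # direct_mul_trans
        e = np.exp(scores - scores.max(-1, keepdims=True))
        probs = e / e.sum(-1, keepdims=True)         # softmax over key positions
        return probs @ v                             # one head's slice of context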
@@ -3199,8 +3146,6 @@ def _(k):
                 nabla_value_sub[k],
                 j * self.attention_head_size)
 
-        print("RES MULTI BACK", self.dropout.Y, res, self.num_attention_heads, self.attention_head_size)
-
         self.dropout.nabla_X.alloc()
         self.dropout.backward(True, batch)
 
@@ -4413,10 +4358,20 @@ def process(item, inputs, input_shape, args, kwargs={}):
         raise CompilerError('multi-input layer %s not supported' % item)
     name = type(item).__name__
     if name == 'Linear':
-        assert mul(input_shape[1:]) == item.in_features
+        # Precondition: the item must have a bias
         assert item.bias is not None
-        layers.append(Dense(input_shape[0], item.in_features,
-                            item.out_features))
+        if mul(input_shape[1:]) == item.in_features:
+            layers.append(Dense(input_shape[0], item.in_features,
+                                item.out_features))
+        elif input_shape[-1] == item.in_features:
+            # we loop over all but the last dimension
+            assert len(input_shape) == 3, "Dense only supports one extra dimension to loop over"
+            d = input_shape[1]
+            layers.append(Dense(input_shape[0], item.in_features,
+                                item.out_features, d))
+        else:
+            assert False, f"input shape {input_shape} incompatible with in_features {item.in_features}"
+
         if input_via is not None:
             shapes = [x.shape for x in (layers[-1].W, layers[-1].b)]
             import numpy
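Note: with the new branch, a torch Linear applied to a rank-3 input such as [batch, seq_len, features] maps to a Dense that loops over the middle dimension instead of being rejected. A hedged usage sketch (shapes illustrative; assumes the surrounding layers_from_torch entry point):

    import torch.nn as nn
    from Compiler import ml

    net = nn.Sequential(nn.Linear(64, 32))
    # an input shape like (8, 16, 64) now yields Dense(8, 64, 32, 16);
    # previously only shapes with mul(shape[1:]) == 64 were accepted
    layers = ml.layers_from_torch(net, (8, 16, 64), batch_size=8)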
@@ -4487,6 +4442,15 @@ def process(item, inputs, input_shape, args, kwargs={}):
         input_shape = layers[-1].shape
     elif name == 'ReLU' or item == torch.nn.functional.relu:
         layers.append(Relu(input_shape))
+    elif name == 'GELU' or item == torch.nn.functional.gelu:
+        layers.append(Gelu(input_shape))
+    elif name == 'LayerNorm':
+        layers.append(LayerNorm(input_shape, True, item.eps))
+        if input_via is not None:
+            layers[-1].weights = sfix.input_tensor_via(
+                input_via, item.weight.detach())
+            layers[-1].beta = sfix.input_tensor_via(
+                input_via, item.bias.detach())
     elif name == 'Flatten':
         return
     elif name == 'BatchNorm2d' or name == 'BatchNorm1d':
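Note: for reference, the tanh approximation of GELU common in BERT-style models is shown below in plaintext numpy; whether the Gelu layer uses exactly this variant is not visible in this diff:

    import numpy as np

    def gelu_tanh(x):
        return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)))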
@@ -4539,7 +4503,7 @@ def process(item, inputs, input_shape, args, kwargs={}):
         num_attention_heads = config.num_attention_heads
         layernorm_eps = config.layer_norm_eps
         seq_len = input_shape[1]
-        rsqrt_approx = False
+        rsqrt_approx = True
         layer = BertLayer(input_shape[0], seq_len, hidden_state, intermediate_size, num_attention_heads,
                           layernorm_eps, 0.125, rsqrt_approx, batch_size=batch_size)
         if input_via is not None:
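Note: rsqrt_approx=True makes the LayerNorm inside BertLayer use the approximate inverse square root, which is much cheaper under MPC than an exact computation. A classic plaintext analogue is Newton-Raphson refinement of an initial guess (the exact method used here is not shown in this diff):

    def rsqrt_newton(x, y0, iters=3):
        y = y0                             # initial approximation of 1/sqrt(x)
        for _ in range(iters):
            y = y * (1.5 - 0.5 * x * y * y)
        return y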