data61 · AraxTheCoder · Apr 17, 2026
diff --git a/Compiler/library.py b/Compiler/library.py
@@ -24,19 +24,82 @@ def get_block():
     return get_program().curr_block
 
 def vectorize(function):
+    """Run *function* with the global vector size set from its arguments.
+
+    The size is the ``size`` attribute of the first positional argument
+    or, failing that, the ``size`` keyword argument, which is consumed
+    rather than forwarded. Without either, *function* is called as is.
+
+    The ``active_length`` keyword argument is always consumed. If it is
+    given and the vector size is greater than one, its negation is stored
+    with ``starg`` during the call (the previous value is restored
+    afterwards) and every vector in the result is multiplied by
+    ``regint.inc(size) < active_length``.
+    """
+    # Sentinel: tells "no size given" apart from an explicit size=None.
+    no_size = object()
+
+    def mask_output(value, active: regint):
+        """Scale vectors in *value* by *active*, recursing into tuples/lists."""
+        if value is None:
+            return None
+        if isinstance(value, tuple):
+            return tuple(mask_output(x, active) for x in value)
+        if isinstance(value, list):
+            return [mask_output(x, active) for x in value]
+
+        try:
+            size = value.size
+        except AttributeError:
+            # Values without a size carry no vector to mask.
+            return value
+
+        if size == 1:
+            return value
+
+        return value * active
+
+    def get_vector_size(call_args, call_kwargs):
+        """Return the requested vector size, or ``no_size`` if there is none."""
+        if len(call_args) > 0 and 'size' in dir(call_args[0]):
+            return call_args[0].size
+        # Consume the keyword only when it is the source of the size, so a
+        # ``size`` keyword next to a sized first argument is still forwarded.
+        return call_kwargs.pop('size', no_size)
+
     def vectorized_function(*args, **kwargs):
-        if len(args) > 0 and 'size' in dir(args[0]):
-            instructions_base.set_global_vector_size(args[0].size)
-            res = function(*args, **kwargs)
-            instructions_base.reset_global_vector_size()
-        elif 'size' in kwargs:
-            instructions_base.set_global_vector_size(kwargs['size'])
-            del kwargs['size']
-            res = function(*args, **kwargs)
+        # Convert only a supplied length: regint.conv(None) need not return
+        # None, which would switch masking on for every vectorized call.
+        active_vector_size = kwargs.pop('active_length', None)
+        if active_vector_size is not None:
+            active_vector_size = regint.conv(active_vector_size)
+
+        size = get_vector_size(args, kwargs)
+        if size is no_size:
+            return function(*args, **kwargs)
+
+        instructions_base.set_global_vector_size(size)
+        try:
+            set_active_vector_size = (active_vector_size is not None
+                                      and size is not None and size > 1)
+            context_saved_arg = None
+            if set_active_vector_size:
+                context_saved_arg = get_arg()
+                starg(-active_vector_size)
+
+            res = function(*args, **kwargs)
+
+            if set_active_vector_size:
+                starg(context_saved_arg)
+                active = regint.inc(size) < active_vector_size
+                res = mask_output(res, active)
+        finally:
+            # Reset even if the wrapped function raises, so the global
+            # vector size does not leak into later code.
             instructions_base.reset_global_vector_size()
-        else:
-            res = function(*args, **kwargs)
+
         return res
+
     vectorized_function.__name__ = function.__name__
     copy_doc(vectorized_function, function)
     return vectorized_function

diff --git a/GC/ShareThread.hpp b/GC/ShareThread.hpp
@@ -167,24 +167,50 @@ template<class T>
 void ShareThread<T>::and_(Processor<T>& processor,
         const vector<int>& args, bool repeat)
 {
+    // Validate before the loop below reads args in groups of four.
+    processor.check_args(args, 4);
+
+    // A negative processor argument carries the negated active length.
+    int active_limit = -1;
+    long prefix = processor.get_arg().get();
+    if (prefix < 0)
+        active_limit = int(-prefix);
+
+    // Filter into a copy only when a limit applies; else alias args.
+    vector<int> limited_args;
+    if (active_limit >= 0)
+    {
+        limited_args.reserve(args.size());
+        for (auto it = args.begin(); it < args.end(); it += 4)
+        {
+            int active_bits = min(*it, active_limit);
+            // Tuples with no active bits are dropped.
+            if (active_bits > 0)
+            {
+                limited_args.push_back(active_bits);
+                limited_args.insert(limited_args.end(), it + 1, it + 4);
+            }
+        }
+    }
+    const vector<int>& active_args = active_limit >= 0 ? limited_args : args;
+
     auto& protocol = this->protocol;
     auto& S = processor.S;
-    processor.check_args(args, 4);
     protocol->init_mul();
     T x_ext, y_ext;
 
     size_t total_bits = 0;
-    for (auto it = args.begin(); it < args.end(); it += 4)
+    for (auto it = active_args.begin(); it < active_args.end(); it += 4)
         total_bits += *it;
 
     // accept 10 % waste
-    bool fast_mode = 0.1 * total_bits > args.size() / 4 * T::default_length;
+    bool fast_mode = 0.1 * total_bits > active_args.size() / 4 * T::default_length;
     if (fast_mode)
     {
         protocol->set_fast_mode(true);
     }
 
-    ArgList<BitOpTuple<T>> infos(args);
+    ArgList<BitOpTuple<T>> infos(active_args);
 
     if (repeat)
         for (auto info : infos)
@@ -268,11 +294,43 @@ void ShareThread<T>::andrsvec(Processor<T>& processor, const vector<int>& args)
     int N_BITS = T::default_length;
     auto& protocol = this->protocol;
     assert(protocol);
+
+    // A negative processor argument carries the negated active length.
+    int active_limit = -1;
+    long prefix = processor.get_arg().get();
+    if (prefix < 0)
+        active_limit = int(-prefix);
+
+    // Filter into a copy only when a limit applies; else alias args.
+    vector<int> limited_args;
+    if (active_limit >= 0)
+    {
+        limited_args.reserve(args.size());
+        auto it = args.begin();
+        while (it < args.end())
+        {
+            // The first entry fixes the group length, the second is the
+            // vector size, which is capped at the active length.
+            int n_args = (*it - 3) / 2;
+            int group_size = 2 * n_args + 3;
+            int active_size = min(*(it + 1), active_limit);
+            if (active_size > 0)
+            {
+                limited_args.push_back(*it);
+                limited_args.push_back(active_size);
+                for (int i = 2; i < group_size; i++)
+                    limited_args.push_back(*(it + i));
+            }
+            it += group_size;
+        }
+    }
+    const vector<int>& active_args = active_limit >= 0 ? limited_args : args;
+
     protocol->init_mul();
-    auto it = args.begin();
+    auto it = active_args.begin();
     T x_ext, y_ext;
     int total_bits = 0;
-    while (it < args.end())
+    while (it < active_args.end())
     {
         int n_args = (*it++ - 3) / 2;
         int size = *it++;
@@ -297,8 +355,8 @@ void ShareThread<T>::andrsvec(Processor<T>& processor, const vector<int>& args)
 
     protocol->exchange();
 
-    it = args.begin();
-    while (it < args.end())
+    it = active_args.begin();
+    while (it < active_args.end())
     {
         int n_args = (*it++ - 3) / 2;
         int size = *it++;

diff --git a/Processor/Instruction.cpp b/Processor/Instruction.cpp
@@ -21,10 +21,11 @@ void Instruction::execute_clear_gf2n(StackedVector<cgf2n>& registers,
 {
     auto& C2 = registers;
     auto& M2C = memory;
+    int active_size = get_effective_vector_size(Proc, size);
     switch (opcode)
     {
 #define X(NAME, PRE, CODE) \
-        case NAME: { PRE; for (int i = 0; i < size; i++) { CODE; } } break;
+        case NAME: { PRE; for (int i = 0; i < active_size; i++) { CODE; } } break;
         CLEAR_GF2N_INSTRUCTIONS
 #undef X
     }
@@ -62,29 +63,32 @@ void Instruction::execute_regint(ArithmeticProcessor& Proc, MemoryPart<Integer>&
 {
     (void) Mi;
     auto& Ci = Proc.get_Ci();
+    int active_size = get_effective_vector_size(Proc, size);
     switch (opcode)
     {
 #define X(NAME, PRE, CODE) \
-        case NAME: { PRE; for (int i = 0; i < size; i++) { CODE; } } break;
+        case NAME: { PRE; for (int i = 0; i < active_size; i++) { CODE; } } break;
         REGINT_INSTRUCTIONS
 #undef X
     }
 }
 
 void Instruction::shuffle(ArithmeticProcessor& Proc) const
 {
-    for (int i = 0; i < size; i++)
+    int active_size = get_effective_vector_size(Proc, size);
+    for (int i = 0; i < active_size; i++)
         Proc.write_Ci(r[0] + i, Proc.read_Ci(r[1] + i));
-    for (int i = 0; i < size; i++)
+    for (int i = 0; i < active_size; i++)
     {
-        int j = Proc.shared_prng.get_uint(size - i);
+        int j = Proc.shared_prng.get_uint(active_size - i);
         swap(Proc.get_Ci_ref(r[0] + i), Proc.get_Ci_ref(r[0] + i + j));
     }
 }
 
 void Instruction::bitdecint(ArithmeticProcessor& Proc) const
 {
-    for (int j = 0; j < size; j++)
+    int active_size = get_effective_vector_size(Proc, size);
+    for (int j = 0; j < active_size; j++)
     {
         long a = Proc.read_Ci(r[0] + j);
         for (unsigned int i = 0; i < start.size(); i++)