Optimizations to Frobenius automorphisms

Optimize the function which maps only non zero elements to 1, and zero elements to 0, by implementing an alternative algorithm which takes O(logd) Frobenius automorphisms rather than O(d). Co-authored-by: Jack Crawford <jack.crawford@sky.com>
homenc · Oct 4, 2022 · 3b0b9e5 · 3b0b9e5
1 parent 3e3b910
commit 3b0b9e5
Show file tree

Hide file tree

Showing 2 changed files with 50 additions and 11 deletions.
diff --git a/include/helib/EncryptedArray.h b/include/helib/EncryptedArray.h
@@ -2651,7 +2651,7 @@ inline void totalSums(Ctxt& ctxt)
 
 //! @brief Map all non-zero slots to 1, leaving zero slots as zero.
 //! Assumes that r=1, and that all the slots contain elements from GF(p^d).
-void mapTo01(const EncryptedArray& ea, Ctxt& ctxt);
+void mapTo01(const EncryptedArray& ea, Ctxt& ctxt, bool multithread = true);
 // Implemented in eqtesting.cpp. We compute
 //             x^{p^d-1} = x^{(1+p+...+p^{d-1})*(p-1)}
 // by setting y=x^{p-1} and then outputting y * y^p * ... * y^{p^{d-1}},

diff --git a/src/eqtesting.cpp b/src/eqtesting.cpp
@@ -9,6 +9,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License. See accompanying LICENSE file.
  */
+
+/* Copyright (C) 2022 Intel Corporation
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Modifying HElib to optimize the 01 map.
+ * Contributions include
+ * Modified:
+ *   mapTo01
+ *     added parallelism to existing logic for norm calculation
+ *     added alternative logic for norm calculation which uses log(d) 
+ *     automorphisms on a single core
+ *     added an additional optional argument `multithread` which determines 
+ *     which version to run
+ *      
+ */
 /**
  * @file eqtesting.cpp
  * @brief Useful functions for equality testing...
@@ -17,6 +32,7 @@
 #include <helib/timing.h>
 #include <helib/EncryptedArray.h>
 #include <helib/Ptxt.h>
+#include <NTL/BasicThreadPool.h>
 
 #include <cstdio>
 
@@ -29,24 +45,47 @@ namespace helib {
 // and then outputting y * y^p * ... * y^{p^{d-1}}, with exponentiation to
 // powers of p done via Frobenius.
 
-// FIXME: the computation of the "norm" y * y^p * ... * y^{p^{d-1}}
-// can be done using O(log d) automorphisms, rather than O(d).
-
-void mapTo01(const EncryptedArray& ea, Ctxt& ctxt)
+void mapTo01(const EncryptedArray& ea, Ctxt& ctxt, bool multithread)
 {
   long p = ctxt.getPtxtSpace();
   if (p != ea.getPAlgebra().getP()) // ptxt space is p^r for r>1
     throw LogicError("mapTo01 not implemented for r>1");
 
   if (p > 2)
     ctxt.power(p - 1); // set y = x^{p-1}
-
   long d = ea.getDegree();
-  if (d > 1) { // compute the product of the d automorphisms
-    std::vector<Ctxt> v(d, ctxt);
-    for (long i = 1; i < d; i++)
-      v[i].frobeniusAutomorph(i);
-    totalProduct(ctxt, v);
+  // TODO: investigate this trade off more thoroughly
+  // Computing in parallel over t threads has runtime approximately
+  // (d - 1)/t, whereas single thread has runtime approx log(d)
+  if ((NTL::AvailableThreads() > 1) && multithread) {
+    // Compute O(d) Frobenius automorphisms in parallel    
+    if (d > 1) {
+      // compute the d - 1 automorphisms in parallel
+      std::vector<Ctxt> v(d, ctxt);
+      NTL_EXEC_RANGE(d - 1, first, last)
+      for (long i = first; i < last; i++)
+        v[i + 1].frobeniusAutomorph(i + 1);
+      NTL_EXEC_RANGE_END
+      // and compute the product of the d automorphisms
+      totalProduct(ctxt, v);
+    }
+  } else {
+    // Compute of the "norm" y * y^p * ... * y^{p^{d-1}}
+    //  using O(log d) automorphisms, rather than O(d).
+    long e = 1;
+    long b = NTL::NumBits(d);
+    Ctxt orig = ctxt;
+    for (long i = b - 2; i >= 0; i--) {
+      Ctxt tmp = ctxt;
+      tmp.frobeniusAutomorph(e);
+      ctxt *= tmp;
+      e *= 2;
+      if (NTL::bit(d, i)) {
+        ctxt.frobeniusAutomorph(1);
+        ctxt *= orig;
+        e++;
+      }
+    }
   }
 }