Fast constant-time gcd computation and modular inversion

Niels Möller nisse at lysator.liu.se
Wed Aug 24 17:07:17 CEST 2022


nisse at lysator.liu.se (Niels Möller) writes:

> If we require the largest matrix element, 2^k, to fit in a 64-bit
> signed, we can have at most k = 62.

I've implemented a first version doing GMP_LIMB_BITS-2 bits at a time.
For a start, gcd only, not cofactors/inverse. Code appended below.

It's rather neat. Compare it to the Lehmer loop in
mpn/generic/gcd.c, which is an iteration around mpn_hgcd2 and
mpn_gcd_subdiv_step.

The analogue of hgcd2 is the "steps" function below, a straight loop
using masking tricks for the conditions, with no unpredictable branches.
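
Not part of the code below, just to illustrate the idiom: each condition
is a 0/1 value that gets stretched into an all-zeros/all-ones mask, so
the "conditional" operation is always executed and the mask decides
whether it has any effect. A minimal standalone sketch on plain uint64_t
(needs <stdint.h>):

  /* Branch-free select: returns a if cnd == 1, b if cnd == 0. */
  static uint64_t
  cnd_select (uint64_t cnd, uint64_t a, uint64_t b)
  {
    uint64_t mask = -cnd;          /* 0 -> 0, 1 -> all ones */
    return b ^ ((a ^ b) & mask);
  }

The CND_NEG_SWAP_LIMB macro and the delta update in steps() use the same
trick.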

The most subtle part is the line

  count = (49 * bits + 57) / 17;

Proving that this many steps suffice for the algorithm to terminate with
the gcd is what the bulk of the proofs in the paper is about.
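
Just plugging numbers into the formula (assuming 64-bit limbs, so
STEPS = 62 below): a pair of 4-limb operands gives n = 5, bits = 320, and

  count = (49 * 320 + 57) / 17 = 925

so the outer loop runs ceil(925 / 62) = 15 times, independent of the
actual operand values.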

It's about a factor of 3 slower than plain mpz_gcd on my machine. It will
be interesting to see whether the slowdown for modular inversion is about
the same.

Regards,
/Niels

------8<------------
/* Side channel silent gcd (work in progress)

Copyright 2022 Niels Möller

This is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <gmp.h>

#if GMP_NUMB_BITS < GMP_LIMB_BITS
#error Nails not supported.
#endif

#define BITCNT_BITS (sizeof(mp_bitcnt_t) * CHAR_BIT)

#define STEPS (GMP_LIMB_BITS - 2)

#define MAX(a,b) ((a) >= (b) ? (a) : (b))

struct matrix {
  /* Matrix elements interpreted as signed two's complement. Absolute
     value of elements is at most 2^STEPS. */
  mp_limb_t a[2][2];
};

/* Conditionally set (a, b) <-- (b, -a) */
#define CND_NEG_SWAP_LIMB(cnd, a, b) do {\
    mp_limb_t __cnd_sum = (a) + (b);	 \
    mp_limb_t __cnd_diff = (a) - (b);	 \
    (a) -= __cnd_diff & -(cnd);		 \
    (b) -= __cnd_sum & -(cnd);		 \
  } while (0)

/* Perform STEPS elementary reduction steps on (delta, f, g). Only the
   least significant STEPS bits of f and g matter. Note that mp_bitcnt_t
   is an unsigned type, but is used as two's complement. */
static mp_bitcnt_t
steps(struct matrix *M, mp_bitcnt_t delta, mp_limb_t f, mp_limb_t g)
{
  mp_limb_t a00, a01, a10, a11;
  unsigned i;
  assert (f & 1);

  /* Identity matrix. */
  a00 = a11 = 1;
  a01 = a10 = 0;

  /* Preserve the invariant (f; g) = 2^{-i} M (f_orig; g_orig) */
  for (i = 0; i < STEPS; i++, delta++)
    {
      mp_limb_t odd = g & 1;
      mp_limb_t swap = odd & ~(delta >> (BITCNT_BITS-1));

      /* Swap; (f'; g') = (g; -f) = (0,1;-1,0) (f;g) */
      CND_NEG_SWAP_LIMB(swap, f, g);
      CND_NEG_SWAP_LIMB(swap, a00, a10);
      CND_NEG_SWAP_LIMB(swap, a01, a11);

      /* Conditional negation. */
      delta = (delta ^ - (mp_bitcnt_t) swap) + swap;

      /* Cancel low bit and shift. */
      g += f & -odd;
      a10 += a00 & -odd;
      a11 += a01 & -odd;

      g >>= 1;
      a00 <<= 1;
      a01 <<= 1;
    }
  M->a[0][0] = a00; M->a[0][1] = a01;
  M->a[1][0] = a10; M->a[1][1] = a11;
  return delta;
}

/* Set R = (u * F + v * G) >> STEPS, treating all numbers as two's
   complement. No overlap allowed. */
static void
add_add_mul_shift (mp_ptr rp, const mp_limb_t *gp, const mp_limb_t *fp, mp_size_t n,
		   mp_limb_t u, mp_limb_t v) {
  mp_limb_t f_sign = fp[n-1] >> (GMP_LIMB_BITS - 1);
  mp_limb_t g_sign = gp[n-1] >> (GMP_LIMB_BITS - 1);
  mp_limb_t u_sign = u >> (GMP_LIMB_BITS - 1);
  mp_limb_t v_sign = v >> (GMP_LIMB_BITS - 1);
  mp_limb_t hi = mpn_mul_1 (rp, fp, n, u) - ((-f_sign) & u);
  mp_limb_t lo;
  hi -= ((-u_sign) & fp[n-1]) + mpn_cnd_sub_n (u_sign, rp + 1, rp + 1, fp, n-1);

  hi += mpn_addmul_1 (rp, gp, n, v) - ((-g_sign) & v);
  hi -= ((-v_sign) & gp[n-1]) + mpn_cnd_sub_n (v_sign, rp + 1, rp + 1, gp, n-1);

  lo = mpn_rshift (rp, rp, n, STEPS);
  assert (lo == 0);
  rp[n-1] += (hi << (GMP_LIMB_BITS - STEPS));
}

/* Update (f'; g') = 2^{-STEPS} M (f; g), where all numbers are two's complement. */
static void
matrix_vector_mul (const struct matrix *M, mp_size_t n, mp_limb_t *fp, mp_limb_t *gp, mp_limb_t *tp)
{
  add_add_mul_shift (tp, gp, fp, n, M->a[0][0], M->a[0][1]);
  add_add_mul_shift (tp + n, gp, fp, n, M->a[1][0], M->a[1][1]);
  mpn_copyi (fp, tp, n);
  mpn_copyi (gp, tp + n, n);
}

static void __attribute__((noreturn))
die (const char *msg)
{
  fprintf(stderr, "%s\n", msg);
  abort();
}

static mp_limb_t *
xalloc_limbs(mp_size_t n)
{
  mp_limb_t *p = malloc(n * sizeof(*p));
  if (!p)
    die("Out of memory");

  mpn_zero (p, n);
  return p;
}

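/* If cnd is non-zero, set rp = -ap (two's complement negation), otherwise
   rp = ap. Computed as ap - 2*ap via a conditional subtraction, so there
   is no data-dependent branch; tp is n limbs of scratch. */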
static void
cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n,
	 mp_limb_t* tp)
{
  mpn_lshift (tp, ap, n, 1);
  mpn_cnd_sub_n (cnd, rp, ap, tp, n);
}

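/* Compute g = gcd (a, b), for odd a. The number of iterations, and hence
   the memory access pattern, depends only on the sizes of a and b, not on
   their values. */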
static void
djb_gcd (mpz_t g, const mpz_t a, const mpz_t b)
{
  /* Extra limb needed for sign bit. */
  mp_size_t n = 1 + MAX(mpz_size(a), mpz_size(b));
  mp_bitcnt_t bits = GMP_LIMB_BITS * n;
  mp_bitcnt_t count;
  mp_limb_t *fp, *gp, *tp;
  mp_bitcnt_t delta;

  assert (mpz_odd_p (a));
  assert (bits >= 46);
  count = (49 * bits + 57) / 17;

  fp = xalloc_limbs (n);
  gp = xalloc_limbs (n);
  tp = xalloc_limbs (2*n);

  mpn_copyi (fp, mpz_limbs_read (a), mpz_size (a));
  mpn_copyi (gp, mpz_limbs_read (b), mpz_size (b));

  for (delta = 1;;count -= STEPS)
    {
      struct matrix M;
      delta = steps (&M, delta, fp[0], gp[0]);
      assert (((M.a[0][0] * fp[0] + M.a[0][1] * gp[0])
	       & (((mp_limb_t)1 << STEPS) - 1)) == 0);
      assert (((M.a[1][0] * fp[0] + M.a[1][1] * gp[0])
	       & (((mp_limb_t)1 << STEPS) - 1)) == 0);
      matrix_vector_mul (&M, n, fp, gp, tp);
      if (count <= STEPS)
	break;
    }
  assert (mpn_zero_p (gp, n));
  cnd_neg (fp[n-1] >> (GMP_LIMB_BITS - 1), fp, fp, n, tp);
  mpn_copyi (mpz_limbs_write (g, n), fp, n);
  mpz_limbs_finish (g, n);

  free (fp);
  free (gp);
  free (tp);
}

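/* Test driver: check djb_gcd against mpz_gcd on random operands of 1 to
   19 limbs, and report average times per call. */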
int
main (int argc, char **argv)
{
  mp_size_t n;
  mpz_t a, b, g, ref;
  gmp_randstate_t rands;

  mpz_inits (a, b, g, ref, NULL);
  gmp_randinit_default (rands);

  for (n = 1; n < 20; n++)
    {
      unsigned count;
      clock_t clocks_ref = 0;
      clock_t clocks_djb = 0;
      for (count = 0; count < 1000; count++)
	{
	  clock_t time_start, time_ref, time_djb;
	  if (count & 1)
	    {
	      mpz_rrandomb (a, rands, GMP_LIMB_BITS * n);
	      mpz_rrandomb (b, rands, GMP_LIMB_BITS * n);
	    }
	  else
	    {
	      mpz_urandomb (a, rands, GMP_LIMB_BITS * n);
	      mpz_urandomb (b, rands, GMP_LIMB_BITS * n);
	    }
	  mpz_setbit (a, 0);

	  time_start = clock();
	  mpz_gcd (ref, a, b);
	  time_ref = clock();
	  djb_gcd (g, a, b);
	  time_djb = clock();

	  if (mpz_cmp (g, ref) != 0)
	    {
	      gmp_fprintf (stderr,
			   "size = %zd\na   = 0x%Zx\nb   = 0x%Zx\nref = %0xZx\ng   = 0x%Zx\n",
			   (size_t) n, a, b, ref, g);
	      die("Test failed");
	    }
	  clocks_ref += (time_ref - time_start);
	  clocks_djb += (time_djb - time_ref);
	}
      fprintf (stderr, "size %zu, ref %.3f (us), djb %.3f (us)\n", (size_t) n,
	       clocks_ref * (1e6 / CLOCKS_PER_SEC) / count,
	       clocks_djb * (1e6 / CLOCKS_PER_SEC) / count);

    }
  mpz_clears (a, b, g, ref, NULL);
  gmp_randclear (rands);
  return 0;
}

-- 
Niels Möller. PGP key CB4962D070D77D7FCB8BA36271D8F1FF368C6677.
Internet email is subject to wholesale government surveillance.

