[Gmp-commit] /home/hgfiles/gmp: mpn_mulmod_bnm1 changes, to reduce allocation...

Thu Jan 21 11:26:04 CET 2010

details:   /home/hgfiles/gmp/rev/85730915e18f
changeset: 13387:85730915e18f
user:      Niels M?ller <nisse at lysator.liu.se>
date:      Thu Jan 21 11:19:52 2010 +0100
description:
mpn_mulmod_bnm1 changes, to reduce allocation in mpn_nussbaumer_mul.

diffstat:

 ChangeLog                    |  14 +++++++++++++
 mpn/generic/mulmod_bnm1.c    |  46 +++++++++++++++++++++++++++++++++++++------
 mpn/generic/nussbaumer_mul.c |   9 +++----
 mpn/generic/sqrmod_bnm1.c    |  34 +++++++++++++++++++++++++------
 tests/mpn/t-mulmod_bnm1.c    |  35 +++++++++++++++++----------------
 5 files changed, 102 insertions(+), 36 deletions(-)

diffs (truncated from 311 to 300 lines):

diff -r 630c72a64734 -r 85730915e18f ChangeLog

--- a/ChangeLog	Wed Jan 20 09:21:25 2010 +0100
+++ b/ChangeLog	Thu Jan 21 11:19:52 2010 +0100
@@ -1,3 +1,17 @@
+2010-01-21  Niels Möller  <nisse at lysator.liu.se>
+
+	* mpn/generic/nussbaumer_mul.c (mpn_nussbaumer_mul): Take
+	advantage of new mpn_mulmod_bnm1 interface, to reduce allocation.
+
+	* tests/mpn/t-mulmod_bnm1.c (ref_mulmod_bnm1, main): Adapted to
+	mpn_mulmod_bnm1 interface change.
+
+	* mpn/generic/mulmod_bnm1.c (mpn_mulmod_bnm1): Interface change,
+	in case an + bn < rn, only write an + bn output limbs. New input
+	requirement, an + bn > rn/2.
+	* mpn/generic/sqrmod_bnm1.c (mpn_sqrmod_bnm1): Corresponding
+	changes.
+
 2010-01-19  Torbjorn Granlund  <tege at gmplib.org>
 
 	* tune/tuneup.c (fftmes): Round up initial n according to initial k.
diff -r 630c72a64734 -r 85730915e18f mpn/generic/mulmod_bnm1.c
--- a/mpn/generic/mulmod_bnm1.c	Wed Jan 20 09:21:25 2010 +0100
+++ b/mpn/generic/mulmod_bnm1.c	Thu Jan 21 11:19:52 2010 +0100
@@ -70,7 +70,7 @@
 }
 
 
-/* Computes {rp,rn} <- {ap,an}*{bp,bn} Mod(B^rn-1)
+/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
  *
  * The result is expected to be ZERO if and only if one of the operand
  * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
@@ -81,7 +81,7 @@
  * compute the full product with an+bn <= rn, because this condition
  * implies (B^an-1)(B^bn-1) < (B^rn-1) .
  *
- * Requires both 0 < bn <= an <= rn
+ * Requires 0 < bn <= an <= rn and an + bn > rn/2
  * Scratch need: rn + 2 + (need for recursive call OR rn + 2). This gives
  *
  * S(n) <= rn + 2 + MAX (rn + 2, S(n/2)) <= 2rn + 2 log2 rn + 2
@@ -100,7 +100,6 @@
 	  if (UNLIKELY (an + bn <= rn))
 	    {
 	      mpn_mul (rp, ap, an, bp, bn);
-	      MPN_ZERO (rp + an + bn, rn - (an + bn));
 	    }
 	  else
 	    {
@@ -121,6 +120,14 @@
 
       n = rn >> 1;
 
+      /* We need at least an + bn >= n, to be able to fit one of the
+	 recursive products at rp. Requiring strict inequality makes
+	 the coded slightly simpler. If desired, we could avoid this
+	 restriction by initially halving rn as long as rn is even and
+	 an + bn <= rn/2. */
+      
+      ASSERT (an + bn > n);
+
       /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
 	 and crt together as
 
@@ -132,6 +139,10 @@
 #define b0 bp
 #define b1 (bp + n)
 
+      /* FIXME: See if the need for temporary storage can be reduced
+       * in case an <= n or bn <= n. This matters for calls from
+       * mpn_nussbaumer_mul, since then bn <= n always and an <= n for
+       * balanced products. */
 #define xp  tp	/* 2n + 2 */
       /* am1  maybe in {xp, n} */
       /* bm1  maybe in {xp + n, n} */
@@ -275,11 +286,32 @@
       /* Compute the highest half:
 	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
        */
-      cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
-      /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
-	 DECR will affect _at most_ the lowest n limbs. */
-      MPN_DECR_U (rp, 2*n, cy);
+      if (UNLIKELY (an + bn < rn))
+	{
+	  /* Note that in this case, the only way the result can equal
+	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
+	     then the output of both the recursive calls and this CRT
+	     reconstruction is zero, not B^{rn} - 1. Which is good,
+	     since the latter representation doesn't fit in the output
+	     area.*/
+	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);
 
+	  /* FIXME: This subtraction of the high parts is not really
+	     necessary, we do it to get the carry out, and for sanity
+	     checking. */
+	  cy = xp[n] + mpn_sub_nc (so, rp + an + bn - n, xp + an + bn - n,
+				   rn - (an + bn), cy);
+	  ASSERT (an + bn == rn - 1 || mpn_zero_p (so+1, rn - 1 - (an + bn)));
+	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
+	  ASSERT (cy == so[0]);
+ 	}
+      else
+	{
+	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
+	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
+	     DECR will affect _at most_ the lowest n limbs. */
+	  MPN_DECR_U (rp, 2*n, cy);
+	}
 #undef a0
 #undef a1
 #undef b0
diff -r 630c72a64734 -r 85730915e18f mpn/generic/nussbaumer_mul.c
--- a/mpn/generic/nussbaumer_mul.c	Wed Jan 20 09:21:25 2010 +0100
+++ b/mpn/generic/nussbaumer_mul.c	Thu Jan 21 11:19:52 2010 +0100
@@ -35,7 +35,7 @@
 		    mp_srcptr bp, mp_size_t bn)
 {
   mp_size_t rn;
-  mp_ptr rp, tp;
+  mp_ptr tp;
   TMP_DECL;
 
   ASSERT (an >= bn);
@@ -44,13 +44,12 @@
   rn = mpn_mulmod_bnm1_next_size (an + bn);
 
   TMP_MARK;
-  TMP_ALLOC_LIMBS_2(rp, rn, tp, mpn_mulmod_bnm1_itch (rn));
+  tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (rn));
 
   if ((ap == bp) && (an == bn))
-    mpn_sqrmod_bnm1 (rp, rn, ap, an, tp);
+    mpn_sqrmod_bnm1 (pp, rn, ap, an, tp);
   else
-    mpn_mulmod_bnm1 (rp, rn, ap, an, bp, bn, tp);
+    mpn_mulmod_bnm1 (pp, rn, ap, an, bp, bn, tp);
 
-  MPN_COPY (pp, rp, an + bn);
   TMP_FREE;
 }
diff -r 630c72a64734 -r 85730915e18f mpn/generic/sqrmod_bnm1.c
--- a/mpn/generic/sqrmod_bnm1.c	Wed Jan 20 09:21:25 2010 +0100
+++ b/mpn/generic/sqrmod_bnm1.c	Thu Jan 21 11:19:52 2010 +0100
@@ -68,7 +68,7 @@
 }
 
 
-/* Computes {rp,rn} <- {ap,an}^2 Mod(B^rn-1)
+/* Computes {rp,MIN(rn,2an)} <- {ap,an}^2 Mod(B^rn-1)
  *
  * The result is expected to be ZERO if and only if the operand
  * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
@@ -77,7 +77,7 @@
  * compute the full square with an <= 2*rn, because this condition
  * implies (B^an-1)^2 < (B^rn-1) .
  *
- * Requires 0 < an <= rn
+ * Requires rn/4 < an <= rn
  * Scratch need: rn + 2 + (need for recursive call OR rn + 2). This gives
  *
  * S(n) <= rn + 2 + MAX (rn + 2, S(n/2)) <= 2rn + 2 log2 rn + 2
@@ -95,7 +95,6 @@
 	  if (UNLIKELY (2*an <= rn))
 	    {
 	      mpn_sqr (rp, ap, an);
-	      MPN_ZERO (rp + 2*an, rn - 2*an);
 	    }
 	  else
 	    {
@@ -116,6 +115,8 @@
 
       n = rn >> 1;
 
+      ASSERT (2*an > n);
+      
       /* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1)
 	 and crt together as
 
@@ -240,11 +241,30 @@
       /* Compute the highest half:
 	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
        */
-      cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
-      /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
-	 DECR will affect _at most_ the lowest n limbs. */
-      MPN_DECR_U (rp, 2*n, cy);
+      if (UNLIKELY (2*an < rn))
+	{
+	  /* Note that in this case, the only way the result can equal
+	     zero mod B^{rn} - 1 is if the input is zero, and
+	     then the output of both the recursive calls and this CRT
+	     reconstruction is zero, not B^{rn} - 1. */
+	  cy = mpn_sub_n (rp + n, rp, xp, 2*an - n);
 
+	  /* FIXME: This subtraction of the high parts is not really
+	     necessary, we do it to get the carry out, and for sanity
+	     checking. */
+	  cy = xp[n] + mpn_sub_nc (so, rp + 2*an - n, xp + 2*an - n,
+				   rn - 2*an, cy);
+	  ASSERT (mpn_zero_p (so+1, rn - 1 - 2*an));
+	  cy = mpn_sub_1 (rp, rp, 2*an, cy);
+	  ASSERT (cy == so[0]);
+	}
+      else
+	{
+	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
+	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
+	     DECR will affect _at most_ the lowest n limbs. */
+	  MPN_DECR_U (rp, 2*n, cy);
+	}
 #undef a0
 #undef a1
 #undef xp
diff -r 630c72a64734 -r 85730915e18f tests/mpn/t-mulmod_bnm1.c
--- a/tests/mpn/t-mulmod_bnm1.c	Wed Jan 20 09:21:25 2010 +0100
+++ b/tests/mpn/t-mulmod_bnm1.c	Thu Jan 21 11:19:52 2010 +0100
@@ -62,9 +62,7 @@
   else
     refmpn_mul (rp, bp, bn, ap, an);
   an += bn;
-  if( UNLIKELY(an <= rn) )
-    MPN_ZERO (rp + an, rn - an);
-  else {
+  if (an > rn) {
     cy = mpn_add (rp, rp, rn, rp + rn, an - rn);
     /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
      * be no overflow when adding in the carry. */
@@ -114,7 +112,7 @@
     {
       unsigned size_min;
       unsigned size_range;
-      mp_size_t an,bn,n;
+      mp_size_t an,bn,rn,n;
       mp_size_t itch;
       mp_limb_t p_before, p_after, s_before, s_after;
 
@@ -129,17 +127,19 @@
 	+ gmp_urandomm_ui (rands, (1L << size_range) + 1 - MIN_N);
       n = mpn_mulmod_bnm1_next_size (n);
 
-      if (test & 1) {
+      if ( (test & 1) || n == 1) {
 	/* Half of the tests are done with the main scenario in mind:
 	   both an and bn >= rn/2 */
 	an = ((n+1) >> 1) + gmp_urandomm_ui (rands, (n+1) >> 1);
 	bn = ((n+1) >> 1) + gmp_urandomm_ui (rands, (n+1) >> 1);
       } else {
 	/* Second half of the tests are done using mulmod to compute a
-	   full product and an >= bn > 0; recursion make it eventually
-	   fall in the case above. */
-	an = ((n+3) >> 2) + gmp_urandomm_ui (rands, n - (n >> 2));
-	bn = 1 + ((an == 1) ? 0 : gmp_urandomm_ui (rands, an - 1));
+	   full product with n/2 < an+bn <= n. */
+	an = 1 + gmp_urandomm_ui (rands, n - 1);
+	if (an >= n/2)
+	  bn = 1 + gmp_urandomm_ui (rands, n - an);
+	else
+	  bn = n/2 + 1 - an + gmp_urandomm_ui (rands, (n+1)/2);
       }
 
       /* Make sure an >= bn */
@@ -165,9 +165,10 @@
 	/* We don't propagate carry, this means that the desired condition
 	   is not triggered all the times. A few times are enough anyway. */
       }
-      mpn_random2 (pp-1, n + 2);
+      rn = MIN(n, an + bn);
+      mpn_random2 (pp-1, rn + 2);
       p_before = pp[-1];
-      p_after = pp[n];
+      p_after = pp[rn];
 
       itch = mpn_mulmod_bnm1_itch (n);
       ASSERT_ALWAYS (itch <= mpn_mulmod_bnm1_itch (MAX_N));
@@ -177,9 +178,9 @@
 
       mpn_mulmod_bnm1 (  pp, n, ap, an, bp, bn, scratch);
       ref_mulmod_bnm1 (refp, n, ap, an, bp, bn);
-      if (pp[-1] != p_before || pp[n] != p_after
+      if (pp[-1] != p_before || pp[rn] != p_after
 	  || scratch[-1] != s_before || scratch[itch] != s_after
-	  || mpn_cmp (refp, pp, n) != 0)
+	  || mpn_cmp (refp, pp, rn) != 0)
 	{
 	  printf ("ERROR in test %d, an = %d, bn = %d, n = %d\n",
 		  test, (int) an, (int) bn, (int) n);
@@ -188,9 +189,9 @@
 	      printf ("before pp:"); mpn_dump (pp -1, 1);
 	      printf ("keep:   "); mpn_dump (&p_before, 1);
 	    }
-	  if (pp[n] != p_after)
+	  if (pp[rn] != p_after)
 	    {
-	      printf ("after pp:"); mpn_dump (pp + n, 1);
+	      printf ("after pp:"); mpn_dump (pp + rn, 1);
 	      printf ("keep:   "); mpn_dump (&p_after, 1);
 	    }
 	  if (scratch[-1] != s_before)