qsieve

qsieve Mercurial Source Tree


Root/src/modulo.cc

// Modulo-Operations for unsigned int (and signed int)
// written by Thorsten Reinecke (1999-03-12)
// last change: 2007-06-22
 
// some of the following functions are necessary because a*b%m will
// return false results if a*b overflows unsigned-int-range.
// these routines are okay for unsigned-int-range/2
 
/*! @file
 * @brief
 * This is the implementation part of some number theoretic functions.
 *
 * The functions implemented in this file provide support for fast modulo
 * operations (multiplication, exponentiation,
 * computation of the modular inverse, quadratic residues)
 * and some integer functions (greatest common divisor, prime numbers).
 *
 * Most of these functions contain also optimized assembler code for
 * Intel Pentium and AMD Athlon processors.
 */
 
 
#ifndef MODULO_CC_INCLUDED
#define MODULO_CC_INCLUDED
 
#include "modulo.H"
#include <iostream>
#include "sqrt_modulo.cc"
 
//! number theoretic concepts
namespace numtheory
{
 
/*!
 * @param p unsigned integer to test for primality
 * @param base base for the strong probable prime test
 * @returns whether @p p is a strong probable prime regarding to @p base .
 */
bool strong_probab_prime(const unsigned int p, const unsigned int base)
{
  // assumes p>base>1 !!
  // Miller-Rabin strong probable prime test: write p-1 = d*2^s with d odd;
  // p passes for "base" if base^d = 1 (mod p) or some square in the chain
  // base^(d*2^r) reaches -1 (mod p).
  register unsigned int d=p-1;
  register unsigned int s=0;
  while (!(d&1U)) { d>>=1; ++s; }
  //cout << "d,s: " << d << "," << s << endl;
  if (s==0) return false; // p-1 odd => p is even; with p>2 that means composite
  //cout << "p,d,s: " << p << "," << d << "," << s << endl;
  d=powmod(base,d,p); // d now holds base^d mod p (start of the squaring chain)
  unsigned int x = d;
  // square x repeatedly, stopping early once x == p-1 (i.e. -1 mod p)
  // NOTE(review): the loop permits up to s squarings, one more than the
  // textbook test (which stops after s-1) -- confirm this is intended.
  while (s && x!=p-1) { x=squaremod(x,p); --s; }
  //cout << "p,d,s,x: " << p << "," << d << "," << s << "," << x << endl;
  // pass iff base^d was 1 (x then stays 1, never p-1) or the chain hit -1
  return (d==1 && x!=p-1) || (d!=1 && x==p-1);
}
 
/*!
 * @param p unsigned integer to check for primality
 * @result whether @p p is probably prime to the bases 2, 7 and 61.
 *
 * @remark actually the result returns, whether @p p is prime,
 *         iff 61 < @p p < 4.759.123.141 (and no integer overflow occurs),
 *         as someone has checked these conditions beforehand.
 */
bool probab_prime(const unsigned int p)
{
 // assumes p>61 !!
 // actually p is prime if 61<p<4.759.123.141 and p is a probable prime for 2,7 and 61
 // (this has been checked by someone before).
 //
 // problem: mulmod(a,b,n) produces false results if highest
 //          significant bit in unsigned int overflows!
 return strong_probab_prime(p,2)
        && strong_probab_prime(p,7)
        && strong_probab_prime(p,61);
}
 
 
/*!
 * @param p unsigned integer to check for primality
 * @result whether @p p is a prime number
 *
 * @remark This function is slow for large numbers, but safe for all numbers, since
 *         it is based on trial division.
 */
bool is_prime(const int p)
{
  // Trial division: slow for large p's, but safe for all p's.
  // After handling 2 and 3, only candidate divisors of the form
  // 6k-1 / 6k+1 are tried.
  // Fixes vs. the previous version:
  //  - p<2 (including 1, 0 and negatives) now correctly reports "not prime"
  //  - the loop condition t<=p/t replaces t*t<=p, which overflowed
  //    (undefined behaviour) for p close to INT_MAX
  //  - the removed "register" specifier is ill-formed since C++17
  if (p<2) return false;          // 0, 1 and negative numbers are not prime
  if (p==2 || p==3) return true;
  if ((p%2==0) || (p%3==0)) return false;
  int t = 5;
  // t<=p/t is equivalent to t*t<=p but cannot overflow signed int
  while (t<=p/t)
    {
      if ((p%t)==0) return false; // candidate divisor 6k-1
      t+=2;
      if ((p%t)==0) return false; // candidate divisor 6k+1
      t+=4;
    }
  return true;
}
 
 
/*!
 * @param a an integer value
 * @param b an odd unsigned integer value
 * @result Jacobi symbol (a/b) (which is undefined for even @p b)
 */
signed int jacobi (int a, unsigned int b)
{
  // returns jacobi(a/b), assumes b odd!
  // computed iteratively via quadratic reciprocity and the two
  // supplementary laws for (-1/b) and (2/b).
 
  if (a%b)
   // NOTE(review): for a<0 the % promotes a to unsigned, so this test can
   // be nonzero even when b divides a; such cases appear to be caught by
   // the a%=b inside the loop (which then returns 0) -- verify.
   {
     signed int x = 1; // accumulated sign of the result
     if (a<0)
      {
        a=-a;
        if ((b-1)&2) x=-x; // (-1/b)=(-1)^((b-1)/2)
      }
     while (a>1 && b>1)
      {
        a%=b; if (a==0) return 0; // b divides a -> symbol is 0
        unsigned int twos = 0;
        while ((a&1)==0) { a>>=1; twos++; } // strip the factors of 2 from a
        if (twos&1) // only odd two's in a can change the sign!
         {
          // (2/b)=(-1)^((b^2-1)/8)
          twos=(b&15)*(b&15); // but note that overflow wouldn't hurt here...
          if (twos&8) x=-x;
         }
    // a and b odd natural numbers -> (a/b)=(-1)^( (a-1)/2 * (b-1)/2 ) * (b/a)
        if (2&a&b) x=-x;
        twos=a; a=b; b=twos; // swap loop again...
      }
     return x;
   }
  else return 0;  
}
 
 
/*!
 * @param a an integer value
 * @param p an odd prime number (as unsigned integer)
 * @result Legendre symbol (a/p), which is
 *          - 1, if @p a is a quadratic residue modulo @p p
 *          - -1, if @p a is not a quadratic residue modulo @p p
 *          - 0, if @p a and @p p are not coprime
 * @remark
 *         - The result is only defined, if @p p is an odd prime number. You cannot rely on
 *           on any specific behaviour on this function, if this condition is not met!
 *         - For valid values the result is identical to that of the jacobi function.
 */
signed int legendre (signed int a, const unsigned int p)
{
  // returns legendre(a/p), assumes p being an odd prime!
   
  // warning: legendre is only defined for odd prime p.
  //          this function works for all odd p:
  //          it will therefore return jacobi(a/p)
  // remember: undefined results can be runerrors, but
  //           an undefined result can be anything else, too.
  //           So, undefined means, you can't rely on a specific
  //           behaviour.
 
  //           For debugging reasons one may wish to check for
  //           odd prime input p.
#if 0
  if ( (p%2==0) || !is_prime(p) )
   {
     cerr << "error in legendre: " << p << " is not an odd prime!" << endl;
     exit(1);
   }
#endif
  return jacobi(a,p);
}
 
 
/*!
 * @param x an unsigned integer value
 * @param m an unsigned integer value
 * @result the modular inverse of @p x mod @p m
 * @remark
 *         - Let @p y be the modular inverse of @p x mod @p m, then @p x * @p y = 1 (mod @p m)
 *         - The result is only defined, if @p x and @p m are coprime.
 */
unsigned int invmod(const unsigned int x, const unsigned int m)
{
  // returns 1/x mod m (or error if no inverse exists)
  // Strategy: extended Euclidean algorithm; aborts the program with a
  // warning if gcd(x,m)!=1, i.e. if x is not invertible modulo m.
  unsigned int a;   // ends up holding gcd(x,m)
  signed int inv;   // Bezout coefficient of x; the inverse on success
 
#ifndef ASM_386
  unsigned int b = x; //(x>=m) ? x%m : x;
  signed int sw=1;
  a=m; inv=0;
  // classic extended gcd; invariant (mod m): inv*x == a and sw*x == b
  while (b)
   {
//     std::cout << "b=" <<  b << ", a=" << a;
     unsigned int q=a/b; unsigned int r=a%b;
     a=b; b=r;
     signed h=inv-sw*q; inv=sw; sw=h;
//     std::cout << ", q=" << q << ", r=" << r << ", inv=" << inv << std::endl;
   }
  inv = (inv>=0) ? inv : m+inv; // normalize a negative coefficient into [0,m)
 
#else
 
#ifdef ASM_CMOV
// inline assembler code for AMD Athlon (and other x86 processors providing conditional mov operations)
 
unsigned int temp;
 
#ifdef DEBUG
#warning "using cmov for invmod-function"
#endif
 
// utilize, that division is faster for smaller operands
// also use conditional mov instructions where possible
  asm volatile ( \
   "movl %[M],%%eax \n\t" \
   "movl %[X],%%ebx \n\t" \
   "movl $0,%[inv] # initialize inv \n\t" \
   "movl $1,%%ecx \n\t" \
   "1: # calc modular inverse\n\t" \
   "xorl %%edx,%%edx # prepare long div \n\t" \
   "divl %%ebx \n\t" \
   "movl %%ebx,%[tmp] \n\t" \
   "movl %%edx,%%ebx \n\t" \
   "imull %%ecx \n\t" \
   "subl %[inv],%%eax \n\t" \
   "movl %%ecx,%[inv] \n\t" \
   "movl %%eax,%%ecx \n\t" \
   "movl %[tmp],%%eax \n\t" \
   "negl %%ecx \n\t" \
   "testl $0xffff0000,%%ebx \n\t" \
   "jnz 1b \n\t" \
   "movl %%eax,%%edx \n\t" \
   "testl %%ebx,%%ebx \n\t" \
   "jz 9f \n\t" \
   "shrl $16,%%edx \n\t" \
   "cmpl %%ebx,%%edx \n\t" \
   "jae 1b # ext gcd loop (big operands 0:32/32) \n\t" \
   "movzx %%ax,%%eax \n\t" \
   "2: # calc modular inverse (small operands 16:16/16) \n\t" \
   "divw %%bx \n\t" \
   "movl %%ebx,%[tmp] \n\t" \
   "movl %%edx,%%ebx \n\t" \
   "imull %%ecx \n\t" \
   "subl %[inv],%%eax \n\t" \
   "movl %%ecx,%[inv] \n\t" \
   "xorl %%edx,%%edx \n\t" \
   "movl %%eax,%%ecx \n\t" \
   "movl %[tmp],%%eax \n\t" \
   "negl %%ecx \n\t" \
   "testb $0xff,%%bh \n\t" \
   "jnz 2b # ext gcd loop (small ops) \n\t" \
   "testb %%bl,%%bl \n\t" \
   "jz 9f \n\t" \
   "cmpb %%bl,%%ah \n\t" \
   "jae 2b \n\t" \
   "3: # calc modular inverse (byte operands 8:8/8 \n\t" \
   "divb %%bl \n\t" \
   "movb %%bl,%%bh \n\t" \
   "movb %%ah,%%bl \n\t" \
   "movzx %%al,%%eax \n\t" \
   "imull %%ecx \n\t" \
   "subl %[inv],%%eax \n\t" \
   "movl %%ecx,%[inv] \n\t" \
   "movl %%eax,%%ecx \n\t" \
   "movzx %%bh,%%eax \n\t" \
   "negl %%ecx \n\t" \
   "cmpb $1,%%bl \n\t" \
   "ja 3b # ext gcd loop (byte ops) \n\t" \
   "movzx %%bl,%%ebx \n\t" \
   "cmovel %%ecx,%[inv] \n\t" \
   "cmovel %%ebx,%%eax \n\t" \
   "9: # normalize result \n\t" \
   "movl %[M],%%edx \n\t" \
   "addl %[inv],%%edx \n\t" \
   "cmpl $0,%[inv] \n\t" \
   "cmovng %%edx,%[inv] \n"
   : "=&a" (a), [inv] "=&D" (inv), [tmp] "=&g" (temp) : [M] "g" (m), [X] "g" (x) : "ebx", "edx", "ecx", "cc");
 
#else /* "standard" ASM_386 handoptimized inline assembler */
  asm volatile ( \
   "movl %[M],%%eax \n\t" \
   "movl %[X],%%ebx \n\t" \
   "movl $0,%[inv] # initialize inv \n\t" \
   "movl $1,%%ecx \n\t" \
   "1: # calc modular inverse\n\t" \
   "xorl %%edx,%%edx # prepare long div \n\t" \
   "divl %%ebx \n\t" \
   "pushl %%ebx \n\t" \
   "movl %%edx,%%ebx \n\t" \
   "imull %%ecx \n\t" \
   "movl %[inv],%%edx \n\t" \
   "subl %%eax,%%edx \n\t" \
   "movl %%ecx,%[inv] \n\t" \
   "movl %%edx,%%ecx \n\t" \
   "popl %%eax \n\t" \
   "testl %%ebx,%%ebx \n\t" \
   "jz 9f # done \n\t" \
   "testl $0xffff0000,%%eax \n\t" \
   "jnz 1b # ext gcd loop (big operands) \n\t" \
   "2: # calc modular inverse (small operands) \n\t" \
   "xorl %%edx,%%edx # prepare (word) div \n\t" \
   "divw %%bx \n\t" \
   "pushl %%ebx \n\t" \
   "movl %%edx,%%ebx \n\t" \
   "imull %%ecx \n\t" \
   "movl %[inv],%%edx \n\t" \
   "subl %%eax,%%edx \n\t" \
   "movl %%ecx,%[inv] \n\t" \
   "movl %%edx,%%ecx \n\t" \
   "popl %%eax \n\t" \
   "cmpl $1,%%ebx \n\t" \
   "ja 2b # ext gcd loop (small ops) \n\t" \
   "jb 9f \n\t" \
   "movl %%ecx,%[inv] \n\t" \
   "movl %%ebx,%%eax \n\t" \
   "9: # normalize result \n\t" \
   "cmpl $0,%[inv] \n\t" \
   "jg 2f \n\t" \
   "addl %[M],%[inv] \n\t" \
   "2: \n" \
   : "=&a" (a), [inv] "=&D" (inv) : [M] "g" (m), [X] "g" (x) : "ebx", "edx", "ecx", "cc");
#endif
#endif
 
  if (a!=1) // gcd(x,m)!=1 -> no modular inverse exists
   {
     std::cerr << "WARNING! invmod: 1/" << x << " (mod " << m << ") does not exist" << std::endl;
     exit(1);
   }
 
#if 0 /* for debugging */
  if   (mulmod(x%m,inv,m)!=1)
   {
     std::cerr << "error in invmod: buggy?" << std::endl;
//     std::cout << "a= " << a << " inv= " << inv <<  std::endl;
//     std::cout << "x= " << x << " m= " << m <<  std::endl;
     exit(1);
   }
#endif
 
  return inv;
}
 
 
/*!
 * @param x an odd unsigned integer value
 * @param m an unsigned integer value
 * @result the modular inverse of @p x mod @p m
 * @remark
 *         - Let @p y be the modular inverse of @p x mod @p m, then @p x * @p y = 1 (mod @p m)
 *         - The result is only defined, if @p x and @p m are coprime,
 *           0 < @p x < @p m and @p m is odd.
 *         - x>m for @p x =x'*2^k and 0 < x' < @p m is also allowed
 */
unsigned int bininvmod(register unsigned int x, register const unsigned int m)
{
  // returns 1/x mod m (or error if no inverse exists)
  // crafted binary extended euclidean gcd algorithm
 
  // additional constraints:
  // 1. m=1 (mod 2),  that is: m is odd
  // 2.  0 < x < m
 
#if 1 && defined(ASM_CMOV)
  register unsigned int r=0;
  // assembler variant of the generic algorithm below;
  // %[v]=v, %[r]=r, %[s]=s (seeded with 1).
  // NOTE(review): edx appears to play the role of u (seeded from m) -- verify.
  asm( \
   "mov %[m],%%edx \n\t" \
   "1: test $1,%[v] \n\t" \
   "jnz 2f \n\t" \
   "mov %[m],%%eax \n\t" \
   "shr $1,%[v] \n\t" \
   "add %[s],%%eax \n\t" \
   "shr $1,%%eax \n\t" \
   "shr $1,%[s] \n\t" \
   "cmovc %%eax,%[s] \n\t" \
   "jmp 1b \n\t" \
   "2: sub %[v],%%edx \n\t" \
   "xor %%eax,%%eax \n\t" \
   "sub %[s],%[r] \n\t" \
   "cmovb %[m],%%eax \n\t" \
   "add %%eax,%[r] \n\t" \
   "3: mov %[m],%%eax \n\t" \
   "shr $1,%%edx \n\t" \
   "add %[r],%%eax \n\t" \
   "shr $1,%%eax \n\t" \
   "shr $1,%[r] \n\t" \
   "cmovc %%eax,%[r] \n\t" \
   "test $1,%%edx \n\t" \
   "jz 3b \n\t" \
   "cmp %[v],%%edx \n\t" \
   "ja 2b \n\t" \
   "cmp $1,%%edx \n\t" \
   "je 9f \n\t" \
   "sub %%edx,%[v] \n\t" \
   "#jz 9f \n\t" \
   "4: xor %%eax,%%eax \n\t" \
   "sub %[r],%[s] \n\t" \
   "cmovb %[m],%%eax \n\t" \
   "add %%eax,%[s] \n\t" \
   "5: mov %[m],%%eax \n\t" \
   "shr $1,%[v] \n\t" \
   "add %[s],%%eax \n\t" \
   "shr $1,%%eax \n\t" \
   "shr $1,%[s] \n\t" \
   "cmovc %%eax,%[s] \n\t" \
   "test $1,%[v] \n\t" \
   "jz 5b \n\t" \
   "sub %%edx,%[v] \n\t" \
   "ja 4b \n\t" \
   "#je 9f \n\t" \
   "add %%edx,%[v] \n\t" \
   "cmp $1,%[v] \n\t" \
   "jne 2b \n\t" \
   "mov %[s],%[r] \n\t" \
   "9: # %[r] is the inverse \n" \
   : [v] "+r" (x), [r] "+r" (r) /* output list */
   : [m] "r" (m), [s] "r" (1)   /* input list */
   : "cc", "eax", "edx" /* clobber list */  );
   return r;
#else
  // generic version
  // invariants (mod m):  r*x == u  and  s*x == v ;
  // halving an even u or v is matched by a modular halving of r or s
  // (possible because m is odd).
  register unsigned int u=m, v=x;
  register unsigned int r=0, s=1;
 
  // strip the factors of two from v first (cf. the x=x'*2^k remark above)
  while ((v&1)==0)
   {
     v>>=1;
     s+= (s&1) ? m : 0;
     s>>=1;
   }
 
  while (true)
   {
     // now:
     // 1. u,v are odd
     // 2. u>=v
     if (__builtin_expect (v==1, 0)) return s; // we expect many iterations before terminating
 
     do
      {
        u-=v;
        r-= (r>=s) ? s : s-m; // r = r-s (mod m), kept in [0,m)
        do
         {
           u>>=1;
           r+= (r&1) ? m : 0;
           r>>=1;
         } while ((u&1)==0);
      } while (u>v);
 
     // now:
     // 1. u,v are odd
     // 2. v>=u
     if (__builtin_expect (u==1, 0)) return r; // we expect many iterations before terminating
 
     do
      {
        v-=u;
        s-= (s>=r) ? r : r-m; // s = s-r (mod m), kept in [0,m)
        do
         {
           v>>=1;
           s+= (s&1) ? m : 0;
           s>>=1;
         } while ((v&1)==0);
      } while (v>=u);
 
   }
#endif
}
 
 
// a small helper macro to speedup montgomery invmod
#if defined(ASM_386) || defined(ASM_CMOV)
// this array is here to avoid the costly "bit-search-forward" instruction for 6-bit values
// on Athlon: bsf %[reg],%%ecx  [vectorpath, latency: 8]
// replacement code: mov %[reg],%%ecx; and $3f,%%ecx; movzx %[shifts](%ecx),%%ecx
//                  [ directpath, latency: 1+1+3 = 4 ]
// shifts[i] = number of trailing zero bits of i (6-bit index; shifts[0]=6)
static const char shifts[64] __attribute__ ((aligned (64))) // 64-byte aligned for cacheline-optimization!
 = { 6,0,1,0, 2,0,1,0, 3,0,1,0, 2,0,1,0, 4,0,1,0, 2,0,1,0, 3,0,1,0, 2,0,1,0,
     5,0,1,0, 2,0,1,0, 3,0,1,0, 2,0,1,0, 4,0,1,0, 2,0,1,0, 3,0,1,0, 2,0,1,0 };
 // this is fast i386 inline assembler
// nightshift(VV,RR,KK): strip the trailing zero bits from VV, shift RR left
// by the same amount and add the shift count to KK
// (the portable C++ fallback at the bottom defines the exact semantics)
#define nightshift(VV,RR,KK) \
  asm( \
   "mov %[v],%%ecx \n\t" \
   "and $0x3f,%%ecx \n\t" \
   "test $0x7f,%[v] \n\t" \
   "movzx %[shifts](%%ecx),%%ecx \n\t" \
   "jnz 1f \n\t" \
   "bsf %[v],%%ecx \n\t" \
   "1: add %%ecx,%[k] \n\t" \
   "shr %%cl,%[v] \n\t" \
   "shl %%cl,%[r] \n" \
   : [v] "+r" (VV), [r] "+r" (RR), [k] "+rm" (KK) : [shifts] "o" (shifts[0]) : "cc", "ecx");
#elif defined(ASM_X86_64)
// x86-64 variant: uses bsf directly, no lookup table involved
#define nightshift(VV,RR,KK) \
  asm( \
   "bsf %[v],%%ecx \n\t" \
   "add %%ecx,%[k] \n\t" \
   "shr %%cl,%[v] \n\t" \
   "shl %%cl,%[r] \n" \
   : [v] "+r" (VV), [r] "+r" (RR), [k] "+rm" (KK) : : "cc", "ecx");
#else
 // and this is slow C++
#define nightshift(VV,RR,KK) \
  while ((VV&1)==0) { VV>>=1; RR<<=1; ++KK; }
#endif
 // and this is the end of this macro ;-)
 
 
/*!
 * @param x an unsigned integer value
 * @param m an unsigned integer value
 * @result the modular inverse of @p x mod @p m
 * @remark
 *         - Let @p y be the modular inverse of @p x mod @p m, then @p x * @p y = 1 (mod @p m)
 *         - The result is only defined, if @p x and @p m are coprime,
 *           0 < @p x < @p m and @p m is odd.
 */
unsigned int montgominvmod(register unsigned int x, unsigned int m)
{
  // returns 1/x mod m (or error if no inverse exists)
  // crafted montgomery based algorithm for invmod
 
  // additional constraints:
  // 1. m=1 (mod 2),  that is: m is odd
  // 2.  0 < x < m
 
  // Strategy: a binary extended gcd that only *counts* the halvings in a
  // shift counter instead of performing them modulo m immediately; the
  // surplus power of two is removed in a correction phase at the end
  // (cf. the generic C++ version at the bottom of this function).
 
  using std::cout;
  using std::endl;
 
#if 1 && defined(ASM_X86_64)
  // x86-64 assembler variant (bsf used directly)
  //unsigned int h=x;
  register unsigned int r=0;
  register unsigned int s=1;
  asm( \
   "xor %%eax,%%eax \n\t" \
   "#mov %%edx,%%eax \n\t" \
   "#shr $32-26,%%edx \n\t" \
   "#shl $26,%%eax \n\t" \
   "divl %[m] \n\t" \
   "mov %[v],%%ecx \n\t" \
   "mov $-32,%%edi # now r*2^32 mod m \n\t" \
   "mov %[m],%%eax \n\t" \
   "bsf %[v],%%ecx \n\t" \
   "add %%ecx,%%edi # without div transform: mov ecx->edi \n\t" \
   "shr %%cl,%[v] \n\t" \
   "shl %%cl,%[r] \n" \
   ".balign 16,,8 \n\t" \
   "1: # u part\n\t" \
   "sub %[v],%%eax \n\t" \
   "mov %%eax,%%ecx \n\t" \
   "bsf %%eax,%%ecx \n\t" \
   "add %[s],%[r] \n\t" \
   "add %%ecx,%%edi \n\t" \
   "shr %%cl,%%eax \n\t" \
   "shl %%cl,%[s] \n\t" \
   "cmp %[v],%%eax \n\t" \
   "ja 1b \n\t" \
   "cmp $1,%%eax \n\t" \
   "je 9f \n\t" \
   "# v part \n\t" \
   "sub %%eax,%[v] \n\t" \
   "2: bsf %[v],%%ecx \n\t" \
   "add %[r],%[s] \n\t" \
   "shr %%cl,%[v] \n\t" \
   "shl %%cl,%[r] \n\t" \
   "add %%ecx,%%edi \n\t" \
   "cmp %%eax,%[v] \n\t" \
   "jb 1b \n\t" \
   "sub %%eax,%[v] \n\t" \
   "jmp 2b \n\t" \
   "9: #adjust shift \n\t" \
   "mov %[r],%%ecx \n\t" \
   "mov %[m],%%eax \n\t" \
   "bsf %[r],%%ecx \n\t" \
   "shl %%cl,%[r] \n\t" \
   "add %%ecx,%%edi \n\t" \
   "js 4f # negative-k transform \n\t" \
   "jz 5f # zero! \n\t" \
   "neg %[r] \n\t" \
   "add %%eax,%[r] \n\t" \
   "shr %%eax \n\t" \
   "inc %%eax \n\t" \
   "shr $1,%[r] \n\t" \
   "dec %%edi \n\t" \
   "jz 8f \n\t" \
   "# now the correction part \n\t" \
   "# cmov-version \n\t" \
   "3: xorl %%ecx,%%ecx \n\t" \
   "shr $1,%[r] \n\t" \
   "cmovc %%eax,%%ecx \n\t" \
   "add %%ecx,%[r] \n\t" \
   "dec %%edi \n\t" \
   "jnz 3b \n\t" \
   "jmp 8f \n\t" \
   "5: # case k=0 \n\t" \
   "neg %[r] \n\t" \
   "add %%eax,%[r] \n\t" \
   "jmp 8f \n\t" \
   "4: # negative-k transform \n\t" \
   "neg %[r] \n\t" \
   "add %%eax,%[r] \n\t" \
   "0: add %[r],%[r] \n\t" \
   "mov %[r],%%ecx \n\t" \
   "sub %%eax,%[r] \n\t" \
   "cmovb %%ecx,%[r] \n\t" \
   "inc %%edi \n\t" \
   "jnz 0b \n\t" \
   "#jmp 8f \n\t" \
   "# or: i386 version without cmov \n\t" \
   "#3: shr $1,%[r] \n\t" \
   "#sbb %%ecx,%%ecx \n\t" \
   "#and %%eax,%%ecx \n\t" \
   "#add %%ecx,%[r] \n\t" \
   "#dec %%edi \n\t" \
   "#jnz 3b \n\t" \
   "#jmp 8f \n\t" \
   "#5: # case k=0 \n\t" \
   "#neg %[r] \n\t" \
   "#add %%eax,%[r] \n\t" \
   "#jmp 8f \n\t" \
   "#4: # negative-k transform (i386 version)\n\t" \
   "#sub %[r],%%eax \n\t" \
   "#mov %%edi,%%ecx \n\t" \
   "#neg %%cl \n\t" \
   "#mov %%eax,%%edx \n\t" \
   "#add $32,%%edi \n\t" \
   "#shl %%cl,%%eax \n\t" \
   "#mov %%edi,%%ecx \n\t" \
   "#shr %%cl,%%edx \n\t" \
   "#divl %[m] \n\t " \
   "#mov %%edx,%[r] \n\t" \
   "8: \n" \
   : [v] "+d" (x), [r] "+r" (r), [s] "+r" (s) /* output list */
   : [m] "rm" (m) /* input list */
   : "cc", "eax", "ecx", "edi"  /* clobber list */  );
   //x=h; // needed, if you want to enable modulo-check below!
#elif 1 && defined(ASM_CMOV)
  // cmov-capable i386 variant (bsf mostly replaced by the shifts[] table)
  //unsigned int h=x;
  register unsigned int r=0;
  register unsigned int s=1;
  asm( \
   "xor %%eax,%%eax \n\t" \
   "#mov %%edx,%%eax \n\t" \
   "#shr $32-26,%%edx \n\t" \
   "#shl $26,%%eax \n\t" \
   "divl %[m] \n\t" \
   "mov %[v],%%ecx \n\t" \
   "mov $-32,%%edi # now r*2^32 mod m \n\t" \
   "mov %[m],%%eax \n\t" \
   "and $0x3f,%%ecx \n\t" \
   "test $0x7f,%[v] \n\t" \
   "movzx %[shifts](%%ecx),%%ecx \n\t" \
   "jnz 0f \n\t" \
   "bsf %[v],%%ecx \n\t" \
   "0: add %%ecx,%%edi # without div transform: mov ecx->edi \n\t" \
   "shr %%cl,%[v] \n\t" \
   "shl %%cl,%[r] \n" \
   ".balign 16,,8 \n\t" \
   "1: # u part\n\t" \
   "sub %[v],%%eax \n\t" \
   "mov %%eax,%%ecx \n\t" \
   "and $0x3f,%%ecx \n\t" \
   "test $0x7f,%%eax \n\t" \
   "movzx %[shifts](%%ecx),%%ecx \n\t" \
   "jnz 0f \n\t" \
   "bsf %%eax,%%ecx \n\t" \
   "0: add %[s],%[r] \n\t" \
   "add %%ecx,%%edi \n\t" \
   "shr %%cl,%%eax \n\t" \
   "shl %%cl,%[s] \n\t" \
   "cmp %[v],%%eax \n\t" \
   "ja 1b \n\t" \
   "cmp $1,%%eax \n\t" \
   "je 9f \n\t" \
   "# v part \n\t" \
   "sub %%eax,%[v] \n\t" \
   "2: mov %[v],%%ecx \n\t" \
   "and $0x3f,%%ecx \n\t" \
   "test $0x7f,%[v] \n\t" \
   "movzx %[shifts](%%ecx),%%ecx \n\t" \
   "jnz 0f \n\t" \
   "bsf %[v],%%ecx \n\t" \
   "0: add %[r],%[s] \n\t" \
   "shr %%cl,%[v] \n\t" \
   "shl %%cl,%[r] \n\t" \
   "add %%ecx,%%edi \n\t" \
   "cmp %%eax,%[v] \n\t" \
   "jb 1b \n\t" \
   "sub %%eax,%[v] \n\t" \
   "jmp 2b \n\t" \
   "9: #adjust shift \n\t" \
   "mov %[r],%%ecx \n\t" \
   "mov %[m],%%eax \n\t" \
   "and $0x3f,%%ecx \n\t" \
   "test $0x7f,%[r] \n\t" \
   "movzx %[shifts](%%ecx),%%ecx \n\t" \
   "jnz 0f \n\t" \
   "bsf %[r],%%ecx \n\t" \
   "shl %%cl,%[r] \n\t" \
   "0: add %%ecx,%%edi \n\t" \
   "js 4f # negative-k transform \n\t" \
   "jz 5f # zero! \n\t" \
   "neg %[r] \n\t" \
   "add %%eax,%[r] \n\t" \
   "shr %%eax \n\t" \
   "inc %%eax \n\t" \
   "shr $1,%[r] \n\t" \
   "dec %%edi \n\t" \
   "jz 8f \n\t" \
   "# now the correction part \n\t" \
   "# cmov-version \n\t" \
   "3: xorl %%ecx,%%ecx \n\t" \
   "shr $1,%[r] \n\t" \
   "cmovc %%eax,%%ecx \n\t" \
   "add %%ecx,%[r] \n\t" \
   "dec %%edi \n\t" \
   "jnz 3b \n\t" \
   "jmp 8f \n\t" \
   "5: # case k=0 \n\t" \
   "neg %[r] \n\t" \
   "add %%eax,%[r] \n\t" \
   "jmp 8f \n\t" \
   "4: # negative-k transform \n\t" \
   "neg %[r] \n\t" \
   "add %%eax,%[r] \n\t" \
   "0: add %[r],%[r] \n\t" \
   "mov %[r],%%ecx \n\t" \
   "sub %%eax,%[r] \n\t" \
   "cmovb %%ecx,%[r] \n\t" \
   "inc %%edi \n\t" \
   "jnz 0b \n\t" \
   "#jmp 8f \n\t" \
   "# or: i386 version without cmov \n\t" \
   "#3: shr $1,%[r] \n\t" \
   "#sbb %%ecx,%%ecx \n\t" \
   "#and %%eax,%%ecx \n\t" \
   "#add %%ecx,%[r] \n\t" \
   "#dec %%edi \n\t" \
   "#jnz 3b \n\t" \
   "#jmp 8f \n\t" \
   "#5: # case k=0 \n\t" \
   "#neg %[r] \n\t" \
   "#add %%eax,%[r] \n\t" \
   "#jmp 8f \n\t" \
   "#4: # negative-k transform (i386 version)\n\t" \
   "#sub %[r],%%eax \n\t" \
   "#mov %%edi,%%ecx \n\t" \
   "#neg %%cl \n\t" \
   "#mov %%eax,%%edx \n\t" \
   "#add $32,%%edi \n\t" \
   "#shl %%cl,%%eax \n\t" \
   "#mov %%edi,%%ecx \n\t" \
   "#shr %%cl,%%edx \n\t" \
   "#divl %[m] \n\t " \
   "#mov %%edx,%[r] \n\t" \
   "8: \n" \
   : [v] "+d" (x), [r] "+r" (r), [s] "+r" (s) /* output list */
   : [m] "rm" (m), [shifts] "o" (shifts[0])  /* input list */
   : "cc", "eax", "ecx", "edi"  /* clobber list */  );
   //x=h; // needed, if you want to enable modulo-check below!
#elif 1 && defined(ASM_386)
  // version without cmov
  //unsigned int h=x;
  register unsigned int r=0;
  register unsigned int s=1;
  asm( \
   "xor %%eax,%%eax \n\t" \
   "#mov %%edx,%%eax \n\t" \
   "#shr $32-26,%%edx \n\t" \
   "#shl $26,%%eax \n\t" \
   "divl %[m] \n\t" \
   "mov %[v],%%ecx \n\t" \
   "mov $-32,%%edi # now r*2^32 mod m \n\t" \
   "mov %[m],%%eax \n\t" \
   "and $0x3f,%%ecx \n\t" \
   "test $0x7f,%[v] \n\t" \
   "movzx %[shifts](%%ecx),%%ecx \n\t" \
   "jnz 0f \n\t" \
   "bsf %[v],%%ecx \n\t" \
   "0: add %%ecx,%%edi # without div transform: mov ecx->edi \n\t" \
   "shr %%cl,%[v] \n\t" \
   "shl %%cl,%[r] \n" \
   ".balign 32,,12 \n\t"
   "1: # u part\n\t" \
   "sub %[v],%%eax \n\t" \
   "mov %%eax,%%ecx \n\t" \
   "and $0x3f,%%ecx \n\t" \
   "test $0x7f,%%eax \n\t" \
   "movzx %[shifts](%%ecx),%%ecx \n\t" \
   "jnz 0f \n\t" \
   "bsf %%eax,%%ecx \n\t" \
   "0: add %[s],%[r] \n\t" \
   "add %%ecx,%%edi \n\t" \
   "shr %%cl,%%eax \n\t" \
   "shl %%cl,%[s] \n\t" \
   "cmp %[v],%%eax \n\t" \
   "ja 1b \n\t" \
   "cmp $1,%%eax \n\t" \
   "je 9f \n\t" \
   "# v part \n\t" \
   "sub %%eax,%[v] \n\t" \
   "2: mov %[v],%%ecx \n\t" \
   "and $0x3f,%%ecx \n\t" \
   "test $0x7f,%[v] \n\t" \
   "movzx %[shifts](%%ecx),%%ecx \n\t" \
   "jnz 0f \n\t" \
   "bsf %[v],%%ecx \n\t" \
   "0: add %[r],%[s] \n\t" \
   "shr %%cl,%[v] \n\t" \
   "shl %%cl,%[r] \n\t" \
   "add %%ecx,%%edi \n\t" \
   "cmp %%eax,%[v] \n\t" \
   "jb 1b \n\t" \
   "sub %%eax,%[v] \n\t" \
   "jmp 2b \n\t" \
   "9: #adjust shift \n\t" \
   "mov %[r],%%ecx \n\t" \
   "mov %[m],%%eax \n\t" \
   "and $0x3f,%%ecx \n\t" \
   "test $0x7f,%[r] \n\t" \
   "movzx %[shifts](%%ecx),%%ecx \n\t" \
   "jnz 0f \n\t" \
   "bsf %[r],%%ecx \n\t" \
   "shl %%cl,%[r] \n\t" \
   "0: add %%ecx,%%edi \n\t" \
   "js 4f # negative-k transform \n\t" \
   "jz 5f # zero! \n\t" \
   "neg %[r] \n\t" \
   "add %%eax,%[r] \n\t" \
   "shr %%eax \n\t" \
   "inc %%eax \n\t" \
   "shr $1,%[r] \n\t" \
   "dec %%edi \n\t" \
   "jz 8f \n\t" \
   "# now the correction part \n\t" \
   "# cmov-version \n\t" \
   "#3: xorl %%ecx,%%ecx \n\t" \
   "#shr $1,%[r] \n\t" \
   "#cmovc %%eax,%%ecx \n\t" \
   "#add %%ecx,%[r] \n\t" \
   "#dec %%edi \n\t" \
   "#jnz 3b \n\t" \
   "#jmp 8f \n\t" \
   "#5: # case k=0 \n\t" \
   "#neg %[r] \n\t" \
   "#add %%eax,%[r] \n\t" \
   "#jmp 8f \n\t" \
   "#4: # negative-k transform \n\t" \
   "#neg %[r] \n\t" \
   "#add %%eax,%[r] \n\t" \
   "#0: add %[r],%[r] \n\t" \
   "#mov %[r],%%ecx \n\t" \
   "#sub %%eax,%[r] \n\t" \
   "#cmovb %%ecx,%[r] \n\t" \
   "#inc %%edi \n\t" \
   "#jnz 0b \n\t" \
   "#jmp 8f \n\t" \
   "# or: i386 version without cmov \n\t" \
   "3: shr $1,%[r] \n\t" \
   "sbb %%ecx,%%ecx \n\t" \
   "and %%eax,%%ecx \n\t" \
   "add %%ecx,%[r] \n\t" \
   "dec %%edi \n\t" \
   "jnz 3b \n\t" \
   "jmp 8f \n\t" \
   "5: # case k=0 \n\t" \
   "neg %[r] \n\t" \
   "add %%eax,%[r] \n\t" \
   "jmp 8f \n\t" \
   "4: # negative-k transform (i386 version)\n\t" \
   "sub %[r],%%eax \n\t" \
   "mov %%edi,%%ecx \n\t" \
   "neg %%cl \n\t" \
   "mov %%eax,%%edx \n\t" \
   "add $32,%%edi \n\t" \
   "shl %%cl,%%eax \n\t" \
   "mov %%edi,%%ecx \n\t" \
   "shr %%cl,%%edx \n\t" \
   "divl %[m] \n\t " \
   "mov %%edx,%[r] \n\t" \
   "8: \n" \
   : [v] "+d" (x), [r] "+r" (r), [s] "+r" (s) /* output list */
   : [m] "rm" (m), [shifts] "o" (shifts[0])  /* input list */
   : "cc", "eax", "ecx", "edi"  /* clobber list */  );
   //x=h; // needed, if you want to enable modulo-check below!
#else
  // generic version
  register unsigned int u=m, v=x;
  register unsigned int r=0, s=1;
  register signed int k=0; // counts the net power of two accumulated in r/s
 
  nightshift(v,r,k); // = while ((v&1)==0) { v>>=1; r<<=1; ++k; }
 
  while (true)
   {
     // now: u,v odd, u>=v
     do
      {
        u-=v; r+=s;
        nightshift(u,s,k); // --> do { u>>=1; s<<=1; ++k; } while ((u&1)==0);
      } while (u>v);
 
     // now: u,v odd, v>=u
     if (__builtin_expect (u==1, 0)) goto loopdone; // we expect many iterations before terminating
 
     do
      {
        v-=u; s+=r;
        nightshift(v,r,k); // --> do { v>>=1; r<<=1; ++k; } while ((v&1)==0);
      } while (v>=u);
   }
 loopdone:
 
  // correction phase: remove the surplus factor 2^k from r
  while ((r&1)==0) { r>>=1; ++k; }
  r=m-r; // now r is even
 
  // repeatedly halve r modulo m (k times; halving is exact because m is odd)
  do { if (r&1) r+=m; r>>=1; } while(--k);
 
#endif /* of asm-cmov / asm-i386 / generic version */
 
#if 0
  if ((mulmod(r,x,m))!=1)
   {
     std::cerr << "invmod-error!" << endl;
     std::cerr << "check should produce 1, but got " << mulmod(r,x,m) << endl;
     std::cerr << "r=" << r << " x=" << x << " m=" << m << endl;
     std::cerr << "real invmod should be : " << invmod(x,m) << endl;
     exit(1);
   }
#endif
  return r;
}
 
 
 
#ifdef USE_JASONS_INVMOD
 namespace japinvmod
 {
   // pull the external invmod implementation into its own namespace
   #include "invmod.c"
   // helper whose constructor calls init_fast_invmod(); the static
   // instance below makes that initialization run once at startup
   class init { public: init() { init_fast_invmod(); } };
   static init at_startup;
 }
#endif
 
 
} // namespace numtheory
 
#endif /* MODULO_CC_INCLUDED */
Source at commit 01a75632c7d3 created 11 years 9 months ago.
By Nathan Adams, compile

Archive Download this file

Branches

Tags

Page rendered in 0.76057s using 11 queries.