// Modulo-Operations for unsigned int (and signed int)
// written by Thorsten Reinecke (1999-03-12)
// last change: 2007-06-22
// some of the following functions are necessary because a*b%m will
// return incorrect results if a*b overflows the unsigned-int range.
// these routines are safe for operands up to half the unsigned-int range
/*! @file
* @brief
* This is the implementation part of some number theoretic functions.
*
* The functions implemented in this file provide support for fast modulo
* operations (multiplication, exponentiation,
* computation of the modular inverse, quadratic residues)
* and some integer functions (greatest common divisor, prime numbers).
*
* Most of these functions also contain assembler code optimized for
* Intel Pentium and AMD Athlon processors.
*/
#ifndef MODULO_CC_INCLUDED
#define MODULO_CC_INCLUDED
#include "modulo.H"
#include <iostream>
#include "sqrt_modulo.cc"
//! number theoretic concepts
namespace numtheory
{
/*!
* @param p unsigned integer to test for primality
* @param base base for the strong probable prime test
* @returns whether @p p is a strong probable prime with respect to @p base .
*/
bool strong_probab_prime(const unsigned int p, const unsigned int base)
{
// assumes p>base>1 !!
register unsigned int d=p-1;
register unsigned int s=0;
while (!(d&1U)) { d>>=1; ++s; }
//cout << "d,s: " << d << "," << s << endl;
if (s==0) return false;
//cout << "p,d,s: " << p << "," << d << "," << s << endl;
d=powmod(base,d,p);
unsigned int x = d;
while (s && x!=p-1) { x=squaremod(x,p); --s; }
//cout << "p,d,s,x: " << p << "," << d << "," << s << "," << x << endl;
return (d==1 && x!=p-1) || (d!=1 && x==p-1);
}
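// Worked example (for illustration): p=25, base=7. Here p-1 = 24 = 3*2^3,
// so d=3 and s=3. powmod gives 7^3 mod 25 = 343 mod 25 = 18; one squaring
// yields 18^2 mod 25 = 324 mod 25 = 24 = p-1, so the composite 25 passes
// as a strong probable prime to base 7. This is why probab_prime() below
// combines several bases.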
/*!
* @param p unsigned integer to check for primality
* @result whether @p p is a strong probable prime to the bases 2, 7 and 61.
*
* @remark the result is exact, i.e. @p p is prime if and only if this
* function returns true, provided that 61 < @p p < 4,759,123,141 (and no
* integer overflow occurs); this bound has been verified by others.
*/
bool probab_prime(const unsigned int p)
{
// assumes p>61 !!
// actually p is prime iff 61<p<4,759,123,141 and p is a strong probable
// prime to the bases 2, 7 and 61 (this bound has been verified by others).
//
// problem: mulmod(a,b,n) produces incorrect results if the most
// significant bit of unsigned int overflows!
return strong_probab_prime(p,2)
&& strong_probab_prime(p,7)
&& strong_probab_prime(p,61);
}
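// Hedged usage sketch: for 32-bit inputs this base combination is
// deterministic, because the smallest strong pseudoprime to the bases
// 2, 7 and 61 is 4,759,123,141 > 2^32. For example:
// if (n>61 && probab_prime(n)) { /* n is certainly prime for unsigned 32-bit n */ }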
/*!
* @param p integer to check for primality
* @result whether @p p is a prime number
*
* @remark This function is slow for large numbers, but safe for all numbers, since
* it is based on trial division.
*/
bool is_prime(register const int p)
{
// slow for large p's, but safe for all p's
if (p<2) return false; // 0, 1 and negative values are not prime
if (p==2 || p==3) return true;
if ((p%2==0) || (p%3==0)) return false;
register int t = 5;
while (t*t<=p)
{
if ((p%t)==0) return false;
t+=2;
if ((p%t)==0) return false;
t+=4;
}
return true;
}
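// The loop above is a 6k+-1 wheel: every prime > 3 is congruent to 1 or 5
// (mod 6), so it suffices to trial-divide by 5,7, 11,13, 17,19, ...
// A minimal equivalent sketch of the same idea:
// for (int t=5; t*t<=p; t+=6)
// if ((p%t)==0 || (p%(t+2))==0) return false;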
/*!
* @param a an integer value
* @param b an odd unsigned integer value
* @result Jacobi symbol (a/b) (which is undefined for even @p b)
*/
signed int jacobi (int a, unsigned int b)
{
// returns jacobi(a/b), assumes b odd!
signed int x = 1;
if (a<0)
{
a=-a;
if ((b-1)&2) x=-x; // (-1/b)=(-1)^((b-1)/2)
}
// a is non-negative now, so the implicit int->unsigned conversion
// in a%b is safe (for a<0 it could misjudge divisibility by b)
if (a%b==0) return 0; // a and b are not coprime
while (a>1 && b>1)
{
a%=b; if (a==0) return 0;
unsigned int twos = 0;
while ((a&1)==0) { a>>=1; twos++; }
if (twos&1) // only odd two's in a can change the sign!
{
// (2/b)=(-1)^((b^2-1)/8)
twos=(b&15)*(b&15); // but note that overflow wouldn't hurt here...
if (twos&8) x=-x;
}
// a and b odd natural numbers -> (a/b)=(-1)^( (a-1)/2 * (b-1)/2 ) * (b/a)
if (2&a&b) x=-x;
twos=a; a=b; b=twos; // swap a and b (reciprocity), then loop again
}
return x;
}
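// Worked example (for illustration): jacobi(3,7).
// Both are odd and 3 = 3 (mod 4), 7 = 3 (mod 4), so the reciprocity
// step "if (2&a&b) x=-x" flips the sign: (3/7) = -(7/3) = -(1/3) = -1.
// Indeed 3 is not a quadratic residue mod 7 (the residues are 1, 2 and 4).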
/*!
* @param a an integer value
* @param p an odd prime number (as unsigned integer)
* @result Legendre symbol (a/p), which is
* - 1, if @p a is a quadratic residue modulo @p p
* - -1, if @p a is not a quadratic residue modulo @p p
* - 0, if @p a and @p p are not coprime
* @remark
* - The result is only defined if @p p is an odd prime number. You cannot
* rely on any specific behaviour of this function if this condition is not met!
* - For valid values the result is identical to that of the jacobi function.
*/
signed int legendre (signed int a, const unsigned int p)
{
// returns legendre(a/p), assumes p being an odd prime!
// warning: legendre is only defined for odd prime p.
// this function works for all odd p:
// it will therefore return jacobi(a/p)
// remember: undefined results can be runtime errors, but
// an undefined result can be anything else, too.
// So, undefined means you can't rely on a specific
// behaviour.
// For debugging reasons one may wish to check for
// odd prime input p.
#if 0
if ( (p%2==0) || !is_prime(p) )
{
cerr << "error in legendre: " << p << " is not an odd prime!" << endl;
exit(1);
}
#endif
return jacobi(a,p);
}
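// Hypothetical usage sketch:
// legendre(2,17) == 1 (since 6*6 = 36 = 2 (mod 17)),
// legendre(3,17) == -1 (3 is not a quadratic residue mod 17),
// legendre(34,17) == 0 (34 and 17 are not coprime).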
/*!
* @param x an unsigned integer value
* @param m an unsigned integer value
* @result the modular inverse of @p x mod @p m
* @remark
* - Let @p y be the modular inverse of @p x mod @p m; then @p x * @p y = 1 (mod @p m)
* - The result is only defined if @p x and @p m are coprime.
*/
unsigned int invmod(const unsigned int x, const unsigned int m)
{
// returns 1/x mod m (or error if no inverse exists)
unsigned int a;
signed int inv;
#ifndef ASM_386
unsigned int b = x; //(x>=m) ? x%m : x;
signed int sw=1;
a=m; inv=0;
while (b)
{
// std::cout << "b=" << b << ", a=" << a;
unsigned int q=a/b; unsigned int r=a%b;
a=b; b=r;
signed int h=inv-sw*q; inv=sw; sw=h;
// std::cout << ", q=" << q << ", r=" << r << ", inv=" << inv << std::endl;
}
inv = (inv>=0) ? inv : m+inv;
#else
#ifdef ASM_CMOV
// inline assembler code for AMD Athlon (and other x86 processors providing conditional mov operations)
unsigned int temp;
#ifdef DEBUG
#warning "using cmov for invmod-function"
#endif
// utilize, that division is faster for smaller operands
// also use conditional mov instructions where possible
asm volatile ( \
"movl %[M],%%eax \n\t" \
"movl %[X],%%ebx \n\t" \
"movl $0,%[inv] # initialize inv \n\t" \
"movl $1,%%ecx \n\t" \
"1: # calc modular inverse\n\t" \
"xorl %%edx,%%edx # prepare long div \n\t" \
"divl %%ebx \n\t" \
"movl %%ebx,%[tmp] \n\t" \
"movl %%edx,%%ebx \n\t" \
"imull %%ecx \n\t" \
"subl %[inv],%%eax \n\t" \
"movl %%ecx,%[inv] \n\t" \
"movl %%eax,%%ecx \n\t" \
"movl %[tmp],%%eax \n\t" \
"negl %%ecx \n\t" \
"testl $0xffff0000,%%ebx \n\t" \
"jnz 1b \n\t" \
"movl %%eax,%%edx \n\t" \
"testl %%ebx,%%ebx \n\t" \
"jz 9f \n\t" \
"shrl $16,%%edx \n\t" \
"cmpl %%ebx,%%edx \n\t" \
"jae 1b # ext gcd loop (big operands 0:32/32) \n\t" \
"movzx %%ax,%%eax \n\t" \
"2: # calc modular inverse (small operands 16:16/16) \n\t" \
"divw %%bx \n\t" \
"movl %%ebx,%[tmp] \n\t" \
"movl %%edx,%%ebx \n\t" \
"imull %%ecx \n\t" \
"subl %[inv],%%eax \n\t" \
"movl %%ecx,%[inv] \n\t" \
"xorl %%edx,%%edx \n\t" \
"movl %%eax,%%ecx \n\t" \
"movl %[tmp],%%eax \n\t" \
"negl %%ecx \n\t" \
"testb $0xff,%%bh \n\t" \
"jnz 2b # ext gcd loop (small ops) \n\t" \
"testb %%bl,%%bl \n\t" \
"jz 9f \n\t" \
"cmpb %%bl,%%ah \n\t" \
"jae 2b \n\t" \
"3: # calc modular inverse (byte operands 8:8/8 \n\t" \
"divb %%bl \n\t" \
"movb %%bl,%%bh \n\t" \
"movb %%ah,%%bl \n\t" \
"movzx %%al,%%eax \n\t" \
"imull %%ecx \n\t" \
"subl %[inv],%%eax \n\t" \
"movl %%ecx,%[inv] \n\t" \
"movl %%eax,%%ecx \n\t" \
"movzx %%bh,%%eax \n\t" \
"negl %%ecx \n\t" \
"cmpb $1,%%bl \n\t" \
"ja 3b # ext gcd loop (byte ops) \n\t" \
"movzx %%bl,%%ebx \n\t" \
"cmovel %%ecx,%[inv] \n\t" \
"cmovel %%ebx,%%eax \n\t" \
"9: # normalize result \n\t" \
"movl %[M],%%edx \n\t" \
"addl %[inv],%%edx \n\t" \
"cmpl $0,%[inv] \n\t" \
"cmovng %%edx,%[inv] \n"
: "=&a" (a), [inv] "=&D" (inv), [tmp] "=&g" (temp) : [M] "g" (m), [X] "g" (x) : "ebx", "edx", "ecx", "cc");
#else /* "standard" ASM_386 handoptimized inline assembler */
asm volatile ( \
"movl %[M],%%eax \n\t" \
"movl %[X],%%ebx \n\t" \
"movl $0,%[inv] # initialize inv \n\t" \
"movl $1,%%ecx \n\t" \
"1: # calc modular inverse\n\t" \
"xorl %%edx,%%edx # prepare long div \n\t" \
"divl %%ebx \n\t" \
"pushl %%ebx \n\t" \
"movl %%edx,%%ebx \n\t" \
"imull %%ecx \n\t" \
"movl %[inv],%%edx \n\t" \
"subl %%eax,%%edx \n\t" \
"movl %%ecx,%[inv] \n\t" \
"movl %%edx,%%ecx \n\t" \
"popl %%eax \n\t" \
"testl %%ebx,%%ebx \n\t" \
"jz 9f # done \n\t" \
"testl $0xffff0000,%%eax \n\t" \
"jnz 1b # ext gcd loop (big operands) \n\t" \
"2: # calc modular inverse (small operands) \n\t" \
"xorl %%edx,%%edx # prepare (word) div \n\t" \
"divw %%bx \n\t" \
"pushl %%ebx \n\t" \
"movl %%edx,%%ebx \n\t" \
"imull %%ecx \n\t" \
"movl %[inv],%%edx \n\t" \
"subl %%eax,%%edx \n\t" \
"movl %%ecx,%[inv] \n\t" \
"movl %%edx,%%ecx \n\t" \
"popl %%eax \n\t" \
"cmpl $1,%%ebx \n\t" \
"ja 2b # ext gcd loop (small ops) \n\t" \
"jb 9f \n\t" \
"movl %%ecx,%[inv] \n\t" \
"movl %%ebx,%%eax \n\t" \
"9: # normalize result \n\t" \
"cmpl $0,%[inv] \n\t" \
"jg 2f \n\t" \
"addl %[M],%[inv] \n\t" \
"2: \n" \
: "=&a" (a), [inv] "=&D" (inv) : [M] "g" (m), [X] "g" (x) : "ebx", "edx", "ecx", "cc");
#endif
#endif
if (a!=1)
{
std::cerr << "WARNING! invmod: 1/" << x << " (mod " << m << ") does not exist" << std::endl;
exit(1);
}
#if 0 /* for debugging */
if (mulmod(x%m,inv,m)!=1)
{
std::cerr << "error in invmod: buggy?" << std::endl;
// std::cout << "a= " << a << " inv= " << inv << std::endl;
// std::cout << "x= " << x << " m= " << m << std::endl;
exit(1);
}
#endif
return inv;
}
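// Worked example of the euclidean loop above (x=3, m=7), a sketch:
// (a,b) steps: (7,3) -> (3,1) -> (1,0), with inv ending at -2;
// normalization gives inv = m + (-2) = 5, and indeed 3*5 = 15 = 1 (mod 7).
// unsigned int y = invmod(3,7); // y == 5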
/*!
* @param x an unsigned integer value
* @param m an odd unsigned integer value
* @result the modular inverse of @p x mod @p m
* @remark
* - Let @p y be the modular inverse of @p x mod @p m; then @p x * @p y = 1 (mod @p m)
* - The result is only defined if @p x and @p m are coprime,
* 0 < @p x < @p m and @p m is odd.
* - @p x > @p m is also allowed if @p x = x'*2^k with 0 < x' < @p m
* (the leading shift loop handles the even factor 2^k).
*/
unsigned int bininvmod(register unsigned int x, register const unsigned int m)
{
// returns 1/x mod m (or error if no inverse exists)
// crafted binary extended euclidean gcd algorithm
// additional constraints:
// 1. m=1 (mod 2), that is: m is odd
// 2. 0 < x < m
#if 1 && defined(ASM_CMOV)
register unsigned int r=0;
asm( \
"mov %[m],%%edx \n\t" \
"1: test $1,%[v] \n\t" \
"jnz 2f \n\t" \
"mov %[m],%%eax \n\t" \
"shr $1,%[v] \n\t" \
"add %[s],%%eax \n\t" \
"shr $1,%%eax \n\t" \
"shr $1,%[s] \n\t" \
"cmovc %%eax,%[s] \n\t" \
"jmp 1b \n\t" \
"2: sub %[v],%%edx \n\t" \
"xor %%eax,%%eax \n\t" \
"sub %[s],%[r] \n\t" \
"cmovb %[m],%%eax \n\t" \
"add %%eax,%[r] \n\t" \
"3: mov %[m],%%eax \n\t" \
"shr $1,%%edx \n\t" \
"add %[r],%%eax \n\t" \
"shr $1,%%eax \n\t" \
"shr $1,%[r] \n\t" \
"cmovc %%eax,%[r] \n\t" \
"test $1,%%edx \n\t" \
"jz 3b \n\t" \
"cmp %[v],%%edx \n\t" \
"ja 2b \n\t" \
"cmp $1,%%edx \n\t" \
"je 9f \n\t" \
"sub %%edx,%[v] \n\t" \
"#jz 9f \n\t" \
"4: xor %%eax,%%eax \n\t" \
"sub %[r],%[s] \n\t" \
"cmovb %[m],%%eax \n\t" \
"add %%eax,%[s] \n\t" \
"5: mov %[m],%%eax \n\t" \
"shr $1,%[v] \n\t" \
"add %[s],%%eax \n\t" \
"shr $1,%%eax \n\t" \
"shr $1,%[s] \n\t" \
"cmovc %%eax,%[s] \n\t" \
"test $1,%[v] \n\t" \
"jz 5b \n\t" \
"sub %%edx,%[v] \n\t" \
"ja 4b \n\t" \
"#je 9f \n\t" \
"add %%edx,%[v] \n\t" \
"cmp $1,%[v] \n\t" \
"jne 2b \n\t" \
"mov %[s],%[r] \n\t" \
"9: # %[r] is the inverse \n" \
: [v] "+r" (x), [r] "+r" (r) /* output list */
: [m] "r" (m), [s] "r" (1) /* input list */
: "cc", "eax", "edx" /* clobber list */ );
return r;
#else
// generic version
register unsigned int u=m, v=x;
register unsigned int r=0, s=1;
while ((v&1)==0)
{
v>>=1;
s+= (s&1) ? m : 0;
s>>=1;
}
while (true)
{
// now:
// 1. u,v are odd
// 2. u>=v
if (__builtin_expect (v==1, 0)) return s; // we expect many iterations before terminating
do
{
u-=v;
r-= (r>=s) ? s : s-m;
do
{
u>>=1;
r+= (r&1) ? m : 0;
r>>=1;
} while ((u&1)==0);
} while (u>v);
// now:
// 1. u,v are odd
// 2. v>=u
if (__builtin_expect (u==1, 0)) return r; // we expect many iterations before terminating
do
{
v-=u;
s-= (s>=r) ? r : r-m;
do
{
v>>=1;
s+= (s&1) ? m : 0;
s>>=1;
} while ((v&1)==0);
} while (v>=u);
}
#endif
}
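// The generic branch above maintains the invariants x*r = u (mod m) and
// x*s = v (mod m); when u reaches 1 the inverse is r, when v reaches 1 it
// is s. Worked trace for x=3, m=7 (a sketch):
// (u,r): (7,0) -> (4,6) -> (2,3) -> (1,5), so bininvmod(3,7) == 5,
// and indeed 3*5 = 15 = 1 (mod 7).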
// a small helper macro to speed up the montgomery invmod
#if defined(ASM_386) || defined(ASM_CMOV)
// this array avoids the costly "bit scan forward" (bsf) instruction for 6-bit values
// on Athlon: bsf %[reg],%%ecx [vectorpath, latency: 8]
// replacement code: mov %[reg],%%ecx; and $0x3f,%%ecx; movzx %[shifts](%%ecx),%%ecx
// [ directpath, latency: 1+1+3 = 4 ]
static const char shifts[64] __attribute__ ((aligned (64))) // 64-byte aligned for cacheline-optimization!
= { 6,0,1,0, 2,0,1,0, 3,0,1,0, 2,0,1,0, 4,0,1,0, 2,0,1,0, 3,0,1,0, 2,0,1,0,
5,0,1,0, 2,0,1,0, 3,0,1,0, 2,0,1,0, 4,0,1,0, 2,0,1,0, 3,0,1,0, 2,0,1,0 };
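// In other words: shifts[v & 0x3f] is the number of trailing zero bits of v,
// determined from the low six bits alone (e.g. shifts[12] == 2, as 12 = 0b1100).
// The entry shifts[0] == 6 is only used when bit 6 of v is set, which the
// "test $0x7f" in the code below checks before falling back to bsf.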
// this is fast i386 inline assembler
#define nightshift(VV,RR,KK) \
asm( \
"mov %[v],%%ecx \n\t" \
"and $0x3f,%%ecx \n\t" \
"test $0x7f,%[v] \n\t" \
"movzx %[shifts](%%ecx),%%ecx \n\t" \
"jnz 1f \n\t" \
"bsf %[v],%%ecx \n\t" \
"1: add %%ecx,%[k] \n\t" \
"shr %%cl,%[v] \n\t" \
"shl %%cl,%[r] \n" \
: [v] "+r" (VV), [r] "+r" (RR), [k] "+rm" (KK) : [shifts] "o" (shifts[0]) : "cc", "ecx");
#elif defined(ASM_X86_64)
#define nightshift(VV,RR,KK) \
asm( \
"bsf %[v],%%ecx \n\t" \
"add %%ecx,%[k] \n\t" \
"shr %%cl,%[v] \n\t" \
"shl %%cl,%[r] \n" \
: [v] "+r" (VV), [r] "+r" (RR), [k] "+rm" (KK) : : "cc", "ecx");
#else
// and this is slow C++
#define nightshift(VV,RR,KK) \
while ((VV&1)==0) { VV>>=1; RR<<=1; ++KK; }
#endif
// and this is the end of this macro ;-)
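// Hedged usage sketch of the macro (all three variants behave alike):
// unsigned int v = 40, r = 1; signed int k = 0;
// nightshift(v,r,k); // afterwards: v == 5, r == 8, k == 3 (40 = 5 * 2^3)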
/*!
* @param x an unsigned integer value
* @param m an unsigned integer value
* @result the modular inverse of @p x mod @p m
* @remark
* - Let @p y be the modular inverse of @p x mod @p m; then @p x * @p y = 1 (mod @p m)
* - The result is only defined if @p x and @p m are coprime,
* 0 < @p x < @p m and @p m is odd.
*/
unsigned int montgominvmod(register unsigned int x, unsigned int m)
{
// returns 1/x mod m (or error if no inverse exists)
// crafted montgomery based algorithm for invmod
// additional constraints:
// 1. m=1 (mod 2), that is: m is odd
// 2. 0 < x < m
using std::cout;
using std::endl;
#if 1 && defined(ASM_X86_64)
//unsigned int h=x;
register unsigned int r=0;
register unsigned int s=1;
asm( \
"xor %%eax,%%eax \n\t" \
"#mov %%edx,%%eax \n\t" \
"#shr $32-26,%%edx \n\t" \
"#shl $26,%%eax \n\t" \
"divl %[m] \n\t" \
"mov %[v],%%ecx \n\t" \
"mov $-32,%%edi # now r*2^32 mod m \n\t" \
"mov %[m],%%eax \n\t" \
"bsf %[v],%%ecx \n\t" \
"add %%ecx,%%edi # without div transform: mov ecx->edi \n\t" \
"shr %%cl,%[v] \n\t" \
"shl %%cl,%[r] \n" \
".balign 16,,8 \n\t" \
"1: # u part\n\t" \
"sub %[v],%%eax \n\t" \
"mov %%eax,%%ecx \n\t" \
"bsf %%eax,%%ecx \n\t" \
"add %[s],%[r] \n\t" \
"add %%ecx,%%edi \n\t" \
"shr %%cl,%%eax \n\t" \
"shl %%cl,%[s] \n\t" \
"cmp %[v],%%eax \n\t" \
"ja 1b \n\t" \
"cmp $1,%%eax \n\t" \
"je 9f \n\t" \
"# v part \n\t" \
"sub %%eax,%[v] \n\t" \
"2: bsf %[v],%%ecx \n\t" \
"add %[r],%[s] \n\t" \
"shr %%cl,%[v] \n\t" \
"shl %%cl,%[r] \n\t" \
"add %%ecx,%%edi \n\t" \
"cmp %%eax,%[v] \n\t" \
"jb 1b \n\t" \
"sub %%eax,%[v] \n\t" \
"jmp 2b \n\t" \
"9: #adjust shift \n\t" \
"mov %[r],%%ecx \n\t" \
"mov %[m],%%eax \n\t" \
"bsf %[r],%%ecx \n\t" \
"shl %%cl,%[r] \n\t" \
"add %%ecx,%%edi \n\t" \
"js 4f # negative-k transform \n\t" \
"jz 5f # zero! \n\t" \
"neg %[r] \n\t" \
"add %%eax,%[r] \n\t" \
"shr %%eax \n\t" \
"inc %%eax \n\t" \
"shr $1,%[r] \n\t" \
"dec %%edi \n\t" \
"jz 8f \n\t" \
"# now the correction part \n\t" \
"# cmov-version \n\t" \
"3: xorl %%ecx,%%ecx \n\t" \
"shr $1,%[r] \n\t" \
"cmovc %%eax,%%ecx \n\t" \
"add %%ecx,%[r] \n\t" \
"dec %%edi \n\t" \
"jnz 3b \n\t" \
"jmp 8f \n\t" \
"5: # case k=0 \n\t" \
"neg %[r] \n\t" \
"add %%eax,%[r] \n\t" \
"jmp 8f \n\t" \
"4: # negative-k transform \n\t" \
"neg %[r] \n\t" \
"add %%eax,%[r] \n\t" \
"0: add %[r],%[r] \n\t" \
"mov %[r],%%ecx \n\t" \
"sub %%eax,%[r] \n\t" \
"cmovb %%ecx,%[r] \n\t" \
"inc %%edi \n\t" \
"jnz 0b \n\t" \
"#jmp 8f \n\t" \
"# or: i386 version without cmov \n\t" \
"#3: shr $1,%[r] \n\t" \
"#sbb %%ecx,%%ecx \n\t" \
"#and %%eax,%%ecx \n\t" \
"#add %%ecx,%[r] \n\t" \
"#dec %%edi \n\t" \
"#jnz 3b \n\t" \
"#jmp 8f \n\t" \
"#5: # case k=0 \n\t" \
"#neg %[r] \n\t" \
"#add %%eax,%[r] \n\t" \
"#jmp 8f \n\t" \
"#4: # negative-k transform (i386 version)\n\t" \
"#sub %[r],%%eax \n\t" \
"#mov %%edi,%%ecx \n\t" \
"#neg %%cl \n\t" \
"#mov %%eax,%%edx \n\t" \
"#add $32,%%edi \n\t" \
"#shl %%cl,%%eax \n\t" \
"#mov %%edi,%%ecx \n\t" \
"#shr %%cl,%%edx \n\t" \
"#divl %[m] \n\t " \
"#mov %%edx,%[r] \n\t" \
"8: \n" \
: [v] "+d" (x), [r] "+r" (r), [s] "+r" (s) /* output list */
: [m] "rm" (m) /* input list */
: "cc", "eax", "ecx", "edi" /* clobber list */ );
//x=h; // needed, if you want to enable modulo-check below!
#elif 1 && defined(ASM_CMOV)
//unsigned int h=x;
register unsigned int r=0;
register unsigned int s=1;
asm( \
"xor %%eax,%%eax \n\t" \
"#mov %%edx,%%eax \n\t" \
"#shr $32-26,%%edx \n\t" \
"#shl $26,%%eax \n\t" \
"divl %[m] \n\t" \
"mov %[v],%%ecx \n\t" \
"mov $-32,%%edi # now r*2^32 mod m \n\t" \
"mov %[m],%%eax \n\t" \
"and $0x3f,%%ecx \n\t" \
"test $0x7f,%[v] \n\t" \
"movzx %[shifts](%%ecx),%%ecx \n\t" \
"jnz 0f \n\t" \
"bsf %[v],%%ecx \n\t" \
"0: add %%ecx,%%edi # without div transform: mov ecx->edi \n\t" \
"shr %%cl,%[v] \n\t" \
"shl %%cl,%[r] \n" \
".balign 16,,8 \n\t" \
"1: # u part\n\t" \
"sub %[v],%%eax \n\t" \
"mov %%eax,%%ecx \n\t" \
"and $0x3f,%%ecx \n\t" \
"test $0x7f,%%eax \n\t" \
"movzx %[shifts](%%ecx),%%ecx \n\t" \
"jnz 0f \n\t" \
"bsf %%eax,%%ecx \n\t" \
"0: add %[s],%[r] \n\t" \
"add %%ecx,%%edi \n\t" \
"shr %%cl,%%eax \n\t" \
"shl %%cl,%[s] \n\t" \
"cmp %[v],%%eax \n\t" \
"ja 1b \n\t" \
"cmp $1,%%eax \n\t" \
"je 9f \n\t" \
"# v part \n\t" \
"sub %%eax,%[v] \n\t" \
"2: mov %[v],%%ecx \n\t" \
"and $0x3f,%%ecx \n\t" \
"test $0x7f,%[v] \n\t" \
"movzx %[shifts](%%ecx),%%ecx \n\t" \
"jnz 0f \n\t" \
"bsf %[v],%%ecx \n\t" \
"0: add %[r],%[s] \n\t" \
"shr %%cl,%[v] \n\t" \
"shl %%cl,%[r] \n\t" \
"add %%ecx,%%edi \n\t" \
"cmp %%eax,%[v] \n\t" \
"jb 1b \n\t" \
"sub %%eax,%[v] \n\t" \
"jmp 2b \n\t" \
"9: #adjust shift \n\t" \
"mov %[r],%%ecx \n\t" \
"mov %[m],%%eax \n\t" \
"and $0x3f,%%ecx \n\t" \
"test $0x7f,%[r] \n\t" \
"movzx %[shifts](%%ecx),%%ecx \n\t" \
"jnz 0f \n\t" \
"bsf %[r],%%ecx \n\t" \
"shl %%cl,%[r] \n\t" \
"0: add %%ecx,%%edi \n\t" \
"js 4f # negative-k transform \n\t" \
"jz 5f # zero! \n\t" \
"neg %[r] \n\t" \
"add %%eax,%[r] \n\t" \
"shr %%eax \n\t" \
"inc %%eax \n\t" \
"shr $1,%[r] \n\t" \
"dec %%edi \n\t" \
"jz 8f \n\t" \
"# now the correction part \n\t" \
"# cmov-version \n\t" \
"3: xorl %%ecx,%%ecx \n\t" \
"shr $1,%[r] \n\t" \
"cmovc %%eax,%%ecx \n\t" \
"add %%ecx,%[r] \n\t" \
"dec %%edi \n\t" \
"jnz 3b \n\t" \
"jmp 8f \n\t" \
"5: # case k=0 \n\t" \
"neg %[r] \n\t" \
"add %%eax,%[r] \n\t" \
"jmp 8f \n\t" \
"4: # negative-k transform \n\t" \
"neg %[r] \n\t" \
"add %%eax,%[r] \n\t" \
"0: add %[r],%[r] \n\t" \
"mov %[r],%%ecx \n\t" \
"sub %%eax,%[r] \n\t" \
"cmovb %%ecx,%[r] \n\t" \
"inc %%edi \n\t" \
"jnz 0b \n\t" \
"#jmp 8f \n\t" \
"# or: i386 version without cmov \n\t" \
"#3: shr $1,%[r] \n\t" \
"#sbb %%ecx,%%ecx \n\t" \
"#and %%eax,%%ecx \n\t" \
"#add %%ecx,%[r] \n\t" \
"#dec %%edi \n\t" \
"#jnz 3b \n\t" \
"#jmp 8f \n\t" \
"#5: # case k=0 \n\t" \
"#neg %[r] \n\t" \
"#add %%eax,%[r] \n\t" \
"#jmp 8f \n\t" \
"#4: # negative-k transform (i386 version)\n\t" \
"#sub %[r],%%eax \n\t" \
"#mov %%edi,%%ecx \n\t" \
"#neg %%cl \n\t" \
"#mov %%eax,%%edx \n\t" \
"#add $32,%%edi \n\t" \
"#shl %%cl,%%eax \n\t" \
"#mov %%edi,%%ecx \n\t" \
"#shr %%cl,%%edx \n\t" \
"#divl %[m] \n\t " \
"#mov %%edx,%[r] \n\t" \
"8: \n" \
: [v] "+d" (x), [r] "+r" (r), [s] "+r" (s) /* output list */
: [m] "rm" (m), [shifts] "o" (shifts[0]) /* input list */
: "cc", "eax", "ecx", "edi" /* clobber list */ );
//x=h; // needed, if you want to enable modulo-check below!
#elif 1 && defined(ASM_386)
// version without cmov
//unsigned int h=x;
register unsigned int r=0;
register unsigned int s=1;
asm( \
"xor %%eax,%%eax \n\t" \
"#mov %%edx,%%eax \n\t" \
"#shr $32-26,%%edx \n\t" \
"#shl $26,%%eax \n\t" \
"divl %[m] \n\t" \
"mov %[v],%%ecx \n\t" \
"mov $-32,%%edi # now r*2^32 mod m \n\t" \
"mov %[m],%%eax \n\t" \
"and $0x3f,%%ecx \n\t" \
"test $0x7f,%[v] \n\t" \
"movzx %[shifts](%%ecx),%%ecx \n\t" \
"jnz 0f \n\t" \
"bsf %[v],%%ecx \n\t" \
"0: add %%ecx,%%edi # without div transform: mov ecx->edi \n\t" \
"shr %%cl,%[v] \n\t" \
"shl %%cl,%[r] \n" \
".balign 32,,12 \n\t"
"1: # u part\n\t" \
"sub %[v],%%eax \n\t" \
"mov %%eax,%%ecx \n\t" \
"and $0x3f,%%ecx \n\t" \
"test $0x7f,%%eax \n\t" \
"movzx %[shifts](%%ecx),%%ecx \n\t" \
"jnz 0f \n\t" \
"bsf %%eax,%%ecx \n\t" \
"0: add %[s],%[r] \n\t" \
"add %%ecx,%%edi \n\t" \
"shr %%cl,%%eax \n\t" \
"shl %%cl,%[s] \n\t" \
"cmp %[v],%%eax \n\t" \
"ja 1b \n\t" \
"cmp $1,%%eax \n\t" \
"je 9f \n\t" \
"# v part \n\t" \
"sub %%eax,%[v] \n\t" \
"2: mov %[v],%%ecx \n\t" \
"and $0x3f,%%ecx \n\t" \
"test $0x7f,%[v] \n\t" \
"movzx %[shifts](%%ecx),%%ecx \n\t" \
"jnz 0f \n\t" \
"bsf %[v],%%ecx \n\t" \
"0: add %[r],%[s] \n\t" \
"shr %%cl,%[v] \n\t" \
"shl %%cl,%[r] \n\t" \
"add %%ecx,%%edi \n\t" \
"cmp %%eax,%[v] \n\t" \
"jb 1b \n\t" \
"sub %%eax,%[v] \n\t" \
"jmp 2b \n\t" \
"9: #adjust shift \n\t" \
"mov %[r],%%ecx \n\t" \
"mov %[m],%%eax \n\t" \
"and $0x3f,%%ecx \n\t" \
"test $0x7f,%[r] \n\t" \
"movzx %[shifts](%%ecx),%%ecx \n\t" \
"jnz 0f \n\t" \
"bsf %[r],%%ecx \n\t" \
"shl %%cl,%[r] \n\t" \
"0: add %%ecx,%%edi \n\t" \
"js 4f # negative-k transform \n\t" \
"jz 5f # zero! \n\t" \
"neg %[r] \n\t" \
"add %%eax,%[r] \n\t" \
"shr %%eax \n\t" \
"inc %%eax \n\t" \
"shr $1,%[r] \n\t" \
"dec %%edi \n\t" \
"jz 8f \n\t" \
"# now the correction part \n\t" \
"# cmov-version \n\t" \
"#3: xorl %%ecx,%%ecx \n\t" \
"#shr $1,%[r] \n\t" \
"#cmovc %%eax,%%ecx \n\t" \
"#add %%ecx,%[r] \n\t" \
"#dec %%edi \n\t" \
"#jnz 3b \n\t" \
"#jmp 8f \n\t" \
"#5: # case k=0 \n\t" \
"#neg %[r] \n\t" \
"#add %%eax,%[r] \n\t" \
"#jmp 8f \n\t" \
"#4: # negative-k transform \n\t" \
"#neg %[r] \n\t" \
"#add %%eax,%[r] \n\t" \
"#0: add %[r],%[r] \n\t" \
"#mov %[r],%%ecx \n\t" \
"#sub %%eax,%[r] \n\t" \
"#cmovb %%ecx,%[r] \n\t" \
"#inc %%edi \n\t" \
"#jnz 0b \n\t" \
"#jmp 8f \n\t" \
"# or: i386 version without cmov \n\t" \
"3: shr $1,%[r] \n\t" \
"sbb %%ecx,%%ecx \n\t" \
"and %%eax,%%ecx \n\t" \
"add %%ecx,%[r] \n\t" \
"dec %%edi \n\t" \
"jnz 3b \n\t" \
"jmp 8f \n\t" \
"5: # case k=0 \n\t" \
"neg %[r] \n\t" \
"add %%eax,%[r] \n\t" \
"jmp 8f \n\t" \
"4: # negative-k transform (i386 version)\n\t" \
"sub %[r],%%eax \n\t" \
"mov %%edi,%%ecx \n\t" \
"neg %%cl \n\t" \
"mov %%eax,%%edx \n\t" \
"add $32,%%edi \n\t" \
"shl %%cl,%%eax \n\t" \
"mov %%edi,%%ecx \n\t" \
"shr %%cl,%%edx \n\t" \
"divl %[m] \n\t " \
"mov %%edx,%[r] \n\t" \
"8: \n" \
: [v] "+d" (x), [r] "+r" (r), [s] "+r" (s) /* output list */
: [m] "rm" (m), [shifts] "o" (shifts[0]) /* input list */
: "cc", "eax", "ecx", "edi" /* clobber list */ );
//x=h; // needed, if you want to enable modulo-check below!
#else
// generic version
register unsigned int u=m, v=x;
register unsigned int r=0, s=1;
register signed int k=0;
nightshift(v,r,k); // = while ((v&1)==0) { v>>=1; r<<=1; ++k; }
while (true)
{
// now: u,v odd, u>=v
do
{
u-=v; r+=s;
nightshift(u,s,k); // --> do { u>>=1; s<<=1; ++k; } while ((u&1)==0);
} while (u>v);
// now: u,v odd, v>=u
if (__builtin_expect (u==1, 0)) goto loopdone; // we expect many iterations before terminating
do
{
v-=u; s+=r;
nightshift(v,r,k); // --> do { v>>=1; r<<=1; ++k; } while ((v&1)==0);
} while (v>=u);
}
loopdone:
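// invariant here (a sketch, verified for small inputs): x*r = -2^k (mod m),
// e.g. x=3, m=7 leaves the loop with r=1, k=2 and 3*1 = 3 = -4 (mod 7);
// the correction below negates r and divides by 2^k modulo m.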
while ((r&1)==0) { r>>=1; ++k; }
r=m-r; // now r is even
do { if (r&1) r+=m; r>>=1; } while(--k);
#endif /* of asm-cmov / asm-i386 / generic version */
#if 0
if ((mulmod(r,x,m))!=1)
{
std::cerr << "invmod-error!" << endl;
std::cerr << "check should produce 1, but got " << mulmod(r,x,m) << endl;
std::cerr << "r=" << r << " x=" << x << " m=" << m << endl;
std::cerr << "real invmod should be : " << invmod(x,m) << endl;
exit(1);
}
#endif
return r;
}
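// Hypothetical usage sketch (same contract as invmod/bininvmod):
// unsigned int y = montgominvmod(3,7); // y == 5, since 3*5 = 15 = 1 (mod 7)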
#ifdef USE_JASONS_INVMOD
namespace japinvmod
{
#include "invmod.c"
class init { public: init() { init_fast_invmod(); } };
static init at_startup;
}
#endif
} // namespace numtheory
#endif /* MODULO_CC_INCLUDED */