"js 2f \n\t" \␊ |
"decl %[pos] \n\t" \␊ |
"2: # hit detected" \␊ |
: [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");␊ |
: [pos] "=q" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");␊ |
#endif␊ |
#if defined(ASM_X86_64)␊ |
// code containing cmov optimizations for X86_64␊ |
|
"js 2f \n\t" \␊ |
"decl %[pos] \n\t" \␊ |
"2: # hit detected" \␊ |
: [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");␊ |
: [pos] "=q" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");␊ |
#define asm_sieb(lp, d0, d1, P) \␊ |
asm volatile ( \␊ |
"cmpl %[disp1],%[disp0]\n\t" \␊ |
|
"testl $0x80008000,(%[sieb],%[pos],4) \n\t" \␊ |
"jz 1b \n\t" \␊ |
"2: # hit detected" \␊ |
: [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");␊ |
: [pos] "=q" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");␊ |
#define asm_sieb(lp, d0, d1, P) \␊ |
asm volatile ( \␊ |
"cmpl %[disp1],%[disp0]\n\t" \␊ |
|
"pcmpeqd 16(%[Deltas],%[i],8),%%xmm1 \n\t" \␊ |
"packssdw %%xmm1,%%xmm0 \n\t" \␊ |
"pmovmskb %%xmm0,%[pflags] # create bytemask" \␊ |
: [pflags] "=r" (pflags)␊ |
: [pflags] "=q" (pflags)␊ |
: [divisors] "r" (&PrimeNumbers[0]), [Deltas] "r" (&Delta_of_PrimeNumbers[0][0]),␊ |
[FloatRecips] "r" (&PrimeNumberFloatReciprocals[0]),␊ |
[_xmm7] "x" (XMM7), [_xmm6] "x" (XMM6), [i] "r" (nr)␊ |
|
"pcmpeqd %%xmm3,%%xmm1 \n\t" \␊ |
"packssdw %%xmm1,%%xmm0 \n\t" \␊ |
"pmovmskb %%xmm0,%[pflags] # create bytemask" \␊ |
: [pflags] "=r" (pflags)␊ |
: [pflags] "=q" (pflags)␊ |
: [PrimeNumbers] "r" (&PrimeNumbers[0]), [LocDeltas] "r" (&LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),␊ |
[_xmm7] "x" (XMM7), [i] "r" (nr)␊ |
: "xmm0", "xmm1", "xmm2", "xmm3");␊ |
|
"pcmpeqd 16+%[Deltas](,%[i],8),%%xmm1 \n\t" \␊ |
"packssdw %%xmm1,%%xmm0 \n\t" \␊ |
"pmovmskb %%xmm0,%[pflags] # create bytemask" \␊ |
: [pflags] "=r" (pflags)␊ |
: [pflags] "=q" (pflags)␊ |
: [divisors] "o" (PrimeNumbers[0]), [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),␊ |
[FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),␊ |
[_xmm7] "x" (XMM7), [_xmm6] "x" (XMM6), [i] "r" (nr)␊ |
|
"pcmpeqd %%xmm3,%%xmm1 \n\t" \␊ |
"packssdw %%xmm1,%%xmm0 \n\t" \␊ |
"pmovmskb %%xmm0,%[pflags] # create bytemask" \␊ |
: [pflags] "=r" (pflags)␊ |
: [pflags] "=q" (pflags)␊ |
: [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),␊ |
[_xmm7] "x" (XMM7), [i] "r" (nr)␊ |
: "xmm0", "xmm1", "xmm2", "xmm3");␊ |
|
"pcmpeqd 24+%[Deltas](,%[i],8),%%mm2 # L2AM \n\t" \␊ |
"packssdw %%mm2,%%mm1 # L2AM pack comparison results together \n\t" \␊ |
"pmovmskb %%mm1,%[pflags2] # L6+ (VectorPath) into bytemask" \␊ |
: [pflags] "=r" (pflags), [pflags2] "=r" (pflags2)␊ |
: [pflags] "=q" (pflags), [pflags2] "=q" (pflags2)␊ |
: [divisors] "o" (PrimeNumbers[0]), [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),␊ |
[FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),␊ |
[_xmm7] "x" (XMM7), [_xmm6] "x" (XMM6), [i] "r" (nr)␊ |
|
"pcmpeqd 8+%[LocDeltas](,%[i],8),%%mm2 # L2AM \n\t" \␊ |
"packssdw %%mm2,%%mm1 # L2AM pack comparison results together \n\t" \␊ |
"pmovmskb %%mm1,%[pflags] # L6+ (VectorPath) into bytemask" \␊ |
: [pflags] "=r" (pflags)␊ |
: [pflags] "=q" (pflags)␊ |
: [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),␊ |
[_mm7] "y" (MM7), [i] "r" (nr)␊ |
: "mm0", "mm1", "mm2");␊ |
|
"pcmpeqd 8+%[Deltas](,%[i],8),%%mm2 # L2AM \n\t" \␊ |
"packssdw %%mm2,%%mm1 # L2AM pack comparison results together \n\t" \␊ |
"pmovmskb %%mm1,%[pflags] # L6+ (VectorPath) into bytemask" \␊ |
: [pflags] "=r" (pflags)␊ |
: [pflags] "=q" (pflags)␊ |
: [divisors] "o" (PrimeNumbers[0]), [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),␊ |
[Deltas] "o" (Delta_of_PrimeNumbers[0][0]),␊ |
[_mm7] "y" (MM7), [_mm6] "y" (MM6), [i] "r" (nr)␊ |
|
"pcmpeqd 8+%[LocDeltas](,%[i],8),%%mm2 # L2AM \n\t" \␊ |
"packssdw %%mm2,%%mm1 # L2AM pack comparison results together \n\t" \␊ |
"pmovmskb %%mm1,%[pflags] # L6+ (VectorPath) into bytemask" \␊ |
: [pflags] "=r" (pflags)␊ |
: [pflags] "=q" (pflags)␊ |
: [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),␊ |
[_mm7] "y" (MM7), [i] "r" (nr)␊ |
: "mm0", "mm1", "mm2");␊ |
|
"packssdw %%mm1,%%mm5 \n\t" \␊ |
"packsswb %%mm0,%%mm5 # mm0 is not relevant\n\t" \␊ |
"movd %%mm5,%[pflags] # L5+ (VectorPath)" \␊ |
: [pflags] "=r" (pflags)␊ |
: [pflags] "=q" (pflags)␊ |
: [divisors] "o" (PrimeNumbers[0]), [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),␊ |
[Deltas] "o" (Delta_of_PrimeNumbers[0][0]),␊ |
[_mm7] "y" (MM7), [_mm6] "y" (MM6), [i] "r" (nr)␊ |
|
"packssdw %%mm1,%%mm0 \n\t" \␊ |
"packsswb %%mm1,%%mm0 \n\t" \␊ |
"movd %%mm0,%[pflags] # L5+ (VectorPath)" \␊ |
: [pflags] "=r" (pflags)␊ |
: [pflags] "=q" (pflags)␊ |
: [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),␊ |
[_mm7] "y" (MM7), [i] "r" (nr)␊ |
: "mm0", "mm1");␊ |