#ifndef MYBITSTRING_HEADER_
#define MYBITSTRING_HEADER_
/*! @file
* @brief
* x86-64 (based on i386) assembler-optimized implementation of an array-of-bits class (dense representation)
*/
#include <new>
#include <iostream> // cerr, cout, endl
#include <cstdlib>  // exit()
using std::nothrow;
using std::cerr; using std::cout; using std::endl;
#include "utils.H"
#if 0
// use 32 bits
typedef unsigned int BS_unsigned_int;
typedef signed int BS_signed_int;
#define DW_per_BS_unsigned_int 1
#define BS_shifts 5
#define BS_bits 32
#define BS_mask 31
#else
// use 64 bits
typedef unsigned long int BS_unsigned_int;
typedef signed long int BS_signed_int;
#define DW_per_BS_unsigned_int 2
#define BS_shifts 6
#define BS_bits 64
#define BS_mask 63
#endif
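// A bit with index pos lives in data[pos>>BS_shifts] at bit position (pos&BS_mask) within that
// word; the settings above therefore have to satisfy BS_bits == 8*sizeof(BS_unsigned_int),
// BS_mask == BS_bits-1 and BS_bits == (1<<BS_shifts). DW_per_BS_unsigned_int is the number of
// 32-bit doublewords per BS_unsigned_int (used for scaling in the assembler routines below).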
class myBitString
{
protected:
int size;
BS_unsigned_int *data;
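// grow the backing array to newsize words: existing contents are preserved, the new words are
// zero-initialized, and the array is never shrunk here (shrinking is done by optisize())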
inline void resize(int newsize)
{
if (newsize < size) return;
//cerr << "Resize " << size << " to " << newsize << endl;
BS_unsigned_int *olddata = data;
data = new(nothrow) BS_unsigned_int[newsize];
if (!data)
{
cerr << "myBitString.resize(" << newsize << ") out of memory" << endl;
exit(1);
}
for (int i=0; i<size; i++) data[i] = olddata[i];
for (int i=size; i< newsize; i++) data[i]=0;
size=newsize;
if (olddata) delete [] olddata;
}
public:
inline void optisize(void)
{
int i=size-1;
while (i>=0 && data[i]==0) i--;
if (i==size-1) return; // nothing to do...
if (size<128/DW_per_BS_unsigned_int)
{
// to avoid heap pollution for small sizes, simply decrease
// the size without reallocating any heap memory
#ifdef VERBOSE
cout << "MyBitString::Optisize, no reallocate " << size << " to " << i+1 << endl;
#endif
size=i+1; return;
}
#ifdef VERBOSE
cout << "MyBitString::Optisize, reallocate" << size << " to " << i+1 << endl;
#endif
if (i<0) { delete [] data; data=NULL; size=0; return; } // now empty
size=i+1; // new size
BS_unsigned_int *olddata = data;
data = new(nothrow) BS_unsigned_int[size];
if (!data)
{
cerr << "myBitString.optisize(" << size << ") out of memory" << endl;
exit(1);
}
for (; i>=0; --i) data[i] = olddata[i];
if (olddata) delete [] olddata;
}
inline myBitString(void) : size(0), data(NULL) { }
inline ~myBitString(void)
{
if (data) delete [] data;
data=NULL;
size=0;
}
inline myBitString(const myBitString &rhs) : size(rhs.size), data(NULL)
{
data = new(nothrow) BS_unsigned_int[size];
if (!data)
{
cerr << "myBitString.operator=(" << size << ") out of memory" << endl;
exit(1);
}
for (int i=size-1; i>=0; --i) data[i] = rhs.data[i];
}
inline myBitString& operator= (const myBitString &rhs)
{
if (this == &rhs) return *this; // guard against self-assignment: the delete below would otherwise free rhs's data
if (data) delete [] data;
size=rhs.size;
data = new(nothrow) BS_unsigned_int[size];
if (!data)
{
cerr << "myBitString.operator=(" << size << ") out of memory" << endl;
exit(1);
}
for (int i=size-1; i>=0; --i) data[i] = rhs.data[i];
return *this;
}
inline int count(const bool b = true) const
{
int z=0;
if (!b) return -1; // counting zero bits is not implemented
for (int i=0; i<size; i++)
{
register BS_unsigned_int h = data[i];
while (h)
{
if (h&1) z++;
h>>=1;
}
}
return z;
}
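// Note: on GCC/Clang, z += __builtin_popcountl(data[i]) per word would be an equivalent (and
// typically faster) alternative to the bit loop above; the loop is kept as the portable reference.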
inline int first(const bool b = true) const
{
if (b)
{
for (int i=0; i<size; i++)
{
register BS_unsigned_int h = data[i];
if (h)
{
BS_signed_int j;
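// bsf (bit scan forward) writes the index of the least-significant set bit of h into j;
// h != 0 is guaranteed by the test above, so the result is well defined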
asm("bsf %0,%1" : "+g" (h), "=c" (j) );
return BS_bits*i+j;
}
}
//cerr << "first(1)=-1" << endl;
return -1;
}
else
{
cerr << "myBitString.first(0) not implemented" << endl;
exit(1);
}
}
inline int next(int pos, const bool b = true) const
{
int i,j;
if (pos<0) return first(b);
pos++;
if (b)
{
i=pos>>BS_shifts;
j=pos&BS_mask;
if (i>=size) return -1;
register BS_unsigned_int h = data[i];
h>>=j;
if (h)
{
BS_signed_int j;
asm("bsf %0,%1" : "+g" (h), "=c" (j) );
return pos+j;
}
else
{
for (i++;i<size; i++)
{
h = data[i];
if (h)
{
BS_signed_int j;
asm("bsf %0,%1" : "+g" (h), "=c" (j) );
return BS_bits*i+j;
}
}
}
return -1;
}
else
{
cerr << "myBitString.next(0) not implemented" << endl;
exit(1);
}
}
inline int last(const bool b = true) const
{
if (b)
{
for (register int i=size-1; i>=0; i--)
{
//cerr << "i=" << i << endl;
register BS_unsigned_int h = data[i];
if (h)
{
BS_signed_int j;
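// bsr (bit scan reverse) writes the index of the most-significant set bit of h into j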
asm("bsr %0,%1" : "+g" (h), "=c" (j) );
return BS_bits*i+j;
}
}
//cerr << "last(1)=-1"<<endl;
return -1;
}
else
{
cerr << "myBitString.last(0) not implemented" << endl;
exit(1);
}
}
inline int prev(int pos, const bool b = true) const
{
if (pos>BS_bits*size) return last(b);
if (pos<=0) return -1;
pos--;
if (b)
{
int i=pos>>BS_shifts;
int j=pos&BS_mask;
register BS_unsigned_int h = data[i];
h<<=BS_mask-j;
if (h)
{
BS_signed_int j;
asm("bsr %0,%1" : "+g" (h), "=c" (j) );
return pos-(BS_mask-j);
}
else
{
for (i--;i>=0; i--)
{
h = data[i];
if (h)
{
BS_signed_int j;
asm("bsr %0,%1" : "+g" (h), "=c" (j) );
return BS_bits*i+j;
}
}
}
return -1;
}
else
{
cerr << "myBitString.prev(0) not implemented" << endl;
exit(1);
}
}
inline void invert(const int pos)
{
//cerr << "invert("<<pos<<")"<<endl;
int i=pos>>BS_shifts; // BS_bits bits = 2^BS_shifts
if (i>=size) resize(i+1);
// remark: as we modify only one bit, we use btcl instead of btcq
asm volatile ("btcl %1,(%0)" : : "q" (data), "r" (pos) : "cc"); // bit-test and complement
// actually data[pos/BS_bits] gets modified (but we do not inform the compiler about this...)
}
inline bool test_and_invert(const int pos)
{
//cerr << "invert("<<pos<<")"<<endl;
int i=pos>>BS_shifts; // BS_bits bits = 2^BS_shifts
if (i>=size) resize(i+1);
bool ret;
asm ("btcl %[pos],(%[data])\n\tsetc %[flag]" : [flag] "=g" (ret) : [data] "q" (data), [pos] "r" (pos): "cc"); // bit-test and complement
// actually data[pos/BS_bits] gets modified (but we do not inform the compiler about this...)
return ret;
}
inline bool test(const int pos) const
{
if ((pos>>BS_shifts)>=size) return false;
bool r;
asm ("btl %2,(%1)\n\tsetc %0" : "=r" (r) : "q" (data), "r" (pos) : "cc"); // bit-test and r=CarryFlag
return r;
}
inline void set(const int pos, const bool b)
{
//cerr << "set("<<pos<< "," << b << ")" << endl;
register int i=pos>>BS_shifts;
if (i>=size)
{
if (b)
resize(i+1);
else
return;
}
if (b)
asm volatile ("btsl %1,(%0)" : : "q" (data), "r" (pos) : "cc"); //data[i] |= bitmask[j];
else
asm volatile ("btrl %1,(%0)" : : "q" (data), "r" (pos) : "cc"); //data[i] &= ~bitmask[j];
// actually data[pos/BS_bits] gets modified (but we do not inform the compiler about this...)
}
inline void set(const int pos)
{
register int i=pos>>BS_shifts;
if (i>=size) resize(i+1);
asm ("btsl %1,(%0)" : : "q" (data), "r" (pos) : "cc"); //data[i] |= bitmask[j];
// actually data[pos/BS_bits] gets modified (but we do not inform the compiler about this...)
}
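// Usage sketch (hypothetical caller, only for illustration):
//   myBitString bs;
//   bs.set(100);                              // grows the backing array on demand
//   bool was_set = bs.test_and_invert(100);   // true; bit 100 is cleared again
//   for (int p = bs.first(); p >= 0; p = bs.next(p)) { /* visit all set bits */ }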
inline void _xor(const myBitString &s)
{
int i=s.size-1;
while (i>=0 && s.data[i]==0) i--; // ignore trailing zero words of the second bit string
if (i>=size) // <-> i+1>size
{
cerr << "myBitString.xor calls resize" << endl;
resize(i+1);
}
for (; i>=0; --i)
data[i] ^= s.data[i];
}
inline const myBitString& operator ^= (const myBitString &s)
{
_xor(s); return *this;
}
inline void _and(const myBitString &s1, const myBitString &s2)
// this bit string is overwritten by s1 & s2 (componentwise)
{
int i = MIN(s1.size,s2.size)-1;
if (i>=size) // <-> i+1>size
{
// CAUTION: (this == &s1 || this == &s2) -> BOOM!!
// But since (this == &s1 || this == &s2) also
// implies MIN(s1.size,s2.size)-1 < size, we can safely
// destroy the array...
if (data) delete [] data; // discard the old array
size=i+1;
data = new(nothrow) BS_unsigned_int[size];
if (!data) { cerr << "myBitString._and(" << size << ") out of memory" << endl; exit(1); }
// initializing the new array is not necessary (every word is overwritten by the AND below)
}
else
{
register int j=size-1;
while (j>i) data[j--]=0; // clear the surplus high words of the result
}
for (; i>=0; --i)
data[i] = s1.data[i] & s2.data[i];
}
template<typename T> void test_and_add_carry(const myBitString &s2, T &CarryVec) const
// generic version, but still fast (used automatically if no specialized
// version is defined below)
// compute the (componentwise) AND of this bit string with s2 on the fly;
// increment the counter vector CarryVec at the corresponding set positions.
{
const int s = MIN(size,s2.size);
const int Bits = 8*sizeof(BS_unsigned_int);
for (int i=0; i<s; ++i)
{
register BS_unsigned_int h = data[i] & s2.data[i];
register int j=0;
while (h)
{
if (h&0xff)
{
CarryVec[Bits*i+j ]+=h&1;
CarryVec[Bits*i+j+1]+=(h>>1)&1;
CarryVec[Bits*i+j+2]+=(h>>2)&1;
CarryVec[Bits*i+j+3]+=(h>>3)&1;
CarryVec[Bits*i+j+4]+=(h>>4)&1;
CarryVec[Bits*i+j+5]+=(h>>5)&1;
CarryVec[Bits*i+j+6]+=(h>>6)&1;
CarryVec[Bits*i+j+7]+=(h>>7)&1;
}
h>>=8;
j+=8;
}
}
}
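// Note: CarryVec is indexed up to 8*sizeof(BS_unsigned_int)*MIN(size,s2.size)-1, so the caller
// must provide at least that many counters; the same holds for the specialized assembler
// variants below.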
#if 1 && defined(ASM_X86_64)
#if defined(VERBOSE)
#warning "support for test_and_add_carry(...), (uint16, using SSE2)"
#endif
void test_and_add_carry(const myBitString &s2, unsigned short int CarryVec[]) const
// compute the (componentwise) AND of this bit string with s2 on the fly;
// increment the counter vector CarryVec at the corresponding set positions.
{
//cout << "spezialisierte Variante uint16 (using SSE2)" << endl;
static const unsigned short int PackedMultipl[8] __attribute__ ((aligned (16))) = { 128,64,32,16,8,4,2,1 };
// these multipliers are needed to shift the bits of the packed words to the desired position
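// How it works: pshuflw/pshufd broadcast one 16-bit chunk of the 32-bit AND result into all
// eight word lanes; pmullw with the multiplier 128>>k in lane k moves bit k of the chunk's low
// byte to bit 7 and bit k+8 to bit 15 of lane k; psrlw $7 / psrlw $15 followed by pand with the
// 0x0001 words in %%xmm7 extract these bits as 0/1 words, which paddw adds to the 32 uint16
// counters belonging to this doubleword.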
#ifdef DEBUG
if (reinterpret_cast<unsigned long int>(&PackedMultipl[0]) & 0x0fUL)
{
cerr << "Address of PackedMultipl is " << &PackedMultipl[0]
<< " which is not 16 byte aligned as requested!" << endl;
}
#endif
long int s = MIN(size,s2.size)*DW_per_BS_unsigned_int -1; // doublewords (minus 1) to process
// int s = MIN(last(),s2.last()); if (s>0) s/=32;
// important:
// (i) we preload -4(%[data1],%[i],4) and -4(%[data2],%[i],4);
// a fencepost error (reading -4(%[data1]) resp. -4(%[data2])) is avoided by using %%rax (clamped via cmovs) instead of %[i]
// (ii) CarryVec MUST be 16-byte aligned!! (otherwise the program segfaults!)
asm volatile ( \
"test %[i],%[i] \n\t" \
"js 9f \n\t" \
"movl $0x00010001,%%eax \n\t" \
"mov %[i],%%rdx \n\t" \
"movd %%eax,%%xmm7 \n\t" \
"movdqa %[PM],%%xmm6 \n\t" \
"movl (%[data1],%[i],4),%%eax \n\t" \
"pshufd $0x00,%%xmm7,%%xmm7 \n\t" \
"shl $6,%%rdx \n\t" \
"andl (%[data2],%[i],4),%%eax \n\t" \
"add %[V],%%rdx \n\t" \
"movd %%eax,%%xmm4 \n\t" \
"mov %[i],%%rax \n\t" \
"1: \n\t" \
"dec %%rax \n\t" \
"cmovs %[i],%%rax \n\t" \
"prefetchnta -72(%[data2],%[i],4) \n\t" \
"pshuflw $0x00,%%xmm4,%%xmm1 \n\t" \
"pshuflw $0x55,%%xmm4,%%xmm3 \n\t" \
"movd (%[data1],%%rax,4),%%xmm5 \n\t" \
"pshufd $0x00,%%xmm1,%%xmm1 \n\t" \
"pshufd $0x00,%%xmm3,%%xmm2 \n\t" \
"pmullw %%xmm6,%%xmm1 \n\t" \
"movd (%[data2],%%rax,4),%%xmm4 \n\t" \
"movdqa %%xmm1,%%xmm0 \n\t" \
"psrlw $15,%%xmm1 \n\t" \
"pmullw %%xmm6,%%xmm2 \n\t" \
"psrlw $7,%%xmm0 \n\t" \
"movdqa %%xmm2,%%xmm3 \n\t" \
"psrlw $7,%%xmm2 \n\t" \
"pand %%xmm7,%%xmm0 \n\t" \
"psrlw $15,%%xmm3 \n\t" \
"pand %%xmm7,%%xmm2 \n\t" \
"paddw (%%rdx),%%xmm0 \n\t" \
"paddw 16(%%rdx),%%xmm1 \n\t" \
"pand %%xmm5,%%xmm4 \n\t" \
"paddw 32(%%rdx),%%xmm2 \n\t" \
"paddw 48(%%rdx),%%xmm3 \n\t" \
"movdqa %%xmm0,(%%rdx) \n\t" \
"movdqa %%xmm1,16(%%rdx) \n\t" \
"movdqa %%xmm2,32(%%rdx) \n\t" \
"movdqa %%xmm3,48(%%rdx) \n\t" \
"3: sub $64,%%rdx \n\t" \
"dec %[i] \n\t" \
"jns 1b \n\t" \
"9: " \
: [i] "+r" (s)
: [PM] "m" (PackedMultipl[0]), [data1] "r" (data), [data2] "r" (s2.data), [V] "D" (CarryVec)
: "memory", "cc", "rax", "rdx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
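// Usage sketch (hypothetical caller; the name and size of Carry are illustrative only):
//   static unsigned short int Carry[1<<16] __attribute__ ((aligned (16))); // 16-byte aligned, see (ii) above
//   a.test_and_add_carry(b, Carry);   // Carry[k] += 1 for every position k set in a AND b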
#else
void test_and_add_carry(const myBitString &s2, unsigned short int CarryVec[]) const
// compute the (componentwise) AND of this bit string with s2 on the fly;
// increment the counter vector CarryVec at the corresponding set positions.
{
//cout << "spezialisierte Variante uint16" << endl;
long int s = MIN(size,s2.size)-1;
asm volatile ( \
"test %[i],%[i] \n\t" \
"js 9f \n\t" \
"1: movl (%[data1],%[i],4),%%eax \n\t" \
"mov %[i],%%rdx \n\t" \
"andl (%[data2],%[i],4),%%eax \n\t" \
"shl $5,%%rdx \n\t" \
"2: shrl %%eax \n\t" \
"adcw $0,(%[V],%%rdx,2) \n\t" \
"shrl %%eax \n\t" \
"adcw $0,2(%[V],%%rdx,2) \n\t" \
"shrl %%eax \n\t" \
"adcw $0,4(%[V],%%rdx,2) \n\t" \
"shrl %%eax \n\t" \
"adcw $0,6(%[V],%%rdx,2) \n\t" \
"shrl %%eax \n\t" \
"adcw $0,8(%[V],%%rdx,2) \n\t" \
"shrl %%eax \n\t" \
"adcw $0,10(%[V],%%rdx,2) \n\t" \
"shrl %%eax \n\t" \
"adcw $0,12(%[V],%%rdx,2) \n\t" \
"shrl %%eax \n\t" \
"adcw $0,14(%[V],%%rdx,2) \n\t" \
"add $8,%%rdx \n\t" \
"test %%eax,%%eax \n\t" \
"jnz 2b \n\t" \
"dec %[i] \n\t" \
"jns 1b \n\t" \
"9: " \
: [i] "+r" (s)
: [data1] "r" (data), [data2] "r" (s2.data), [V] "D" (CarryVec)
: "memory", "cc", "eax", "rdx");
}
#endif
#if 0
void test_and_add_carry(const myBitString &s2, unsigned int CarryVec[]) const
// compute the (componentwise) AND of this bit string with s2 on the fly;
// increment the counter vector CarryVec at the corresponding set positions.
{
//cout << "spezialisierte Variante uint32" << endl;
int s = MIN(size,s2.size)-1;
asm volatile ( \
"test %[i],%[i] \n\t" \
"js 9f \n\t" \
"1: movl (%[data1],%[i],4),%%eax \n\t" \
"movl %[i],%%edx \n\t" \
"andl (%[data2],%[i],4),%%eax \n\t" \
"shll $5,%%edx \n\t" \
"2: btl $0,%%eax \n\t" \
"adcl $0,(%[V],%%edx,4) \n\t" \
"btl $1,%%eax \n\t" \
"adcl $0,4(%[V],%%edx,4) \n\t" \
"btl $2,%%eax \n\t" \
"adcl $0,8(%[V],%%edx,4) \n\t" \
"btl $3,%%eax \n\t" \
"adcl $0,12(%[V],%%edx,4) \n\t" \
"btl $4,%%eax \n\t" \
"adcl $0,16(%[V],%%edx,4) \n\t" \
"btl $5,%%eax \n\t" \
"adcl $0,20(%[V],%%edx,4) \n\t" \
"btl $6,%%eax \n\t" \
"adcl $0,24(%[V],%%edx,4) \n\t" \
"btl $7,%%eax \n\t" \
"adcl $0,28(%[V],%%edx,4) \n\t" \
"addl $8,%%edx \n\t" \
"shrl $8,%%eax \n\t" \
"jnz 2b \n\t" \
"decl %[i] \n\t" \
"jns 1b \n\t" \
"9: " \
: [i] "+r" (s)
: [data1] "r" (data), [data2] "r" (s2.data), [V] "D" (CarryVec)
: "memory", "cc", "eax", "edx");
}
#endif
};
#endif