/* this file contains the assembly routines for a high-speed number-theoretic FFT. It can work modulo any number < 2^31-1; approximate timings on a 200MHz Pentium MMX: 256 pts 168us 512 pts 322us 1024 pts 710us 2048 pts 1560us Includes useful utilty routines, and a simple driver for testing purposes. Calling convention: test In other code, before using the NTT you should first fill out the global array of doubles, set the prime to use, and use precompute() to make the roots of unity. main() will show how. I hereby place this code into the public domain. Use it for whatever you want, no strings attached, but optionally please be nice and tell me if you find it useful. Jason Papadopoulos jasonp@glue.umd.edu 10/24/98 */ #include #include #include double stuff[7]; /* contents: 3*2^62 3*2^51 n temp1...temp3 1/n */ long n; void ntt_DIT( long *x, long *w, long power ); long bitreverse(long input, int power); long timesmod( long a, long b ); long powermod( long a, long b ); void precompute( long *x, long primroot, long power ); void array_timesmod( long *a, long *b, long len ); void array_butterfly( long *a, long *b, long len ); void array_first2( long *a, long len ); void ntt_DIT( long *x, long *w, long power ); /*-------------------------------------------------------------------------*/ int main(int argc, char *argv[]) { long i, *a, *w, power=atol(argv[1]), size=1<=0; i--) { output+= (input & 1) << i; input>>=1; } return output; } /*------------------------------------------------------------------------*/ long timesmod( long a, long b) { /*This is a clever 31-bit modular multiply that doesn't need 64-bit math. Note that the number to reduce by (and its double precision reciprocal) must be stored in globally. For details see ACM SIGPLAN notices, vol. 27 no. 1, p.95 */ double dquot = (double)a * (double)b * stuff[6]; unsigned long reducedquot = dquot + .5; unsigned long remainder = a * b - reducedquot * n; return (remainder & 0x80000000? remainder+n: remainder); } /*---------------------------------------------------------------------*/ long powermod( long a, long b ) { /* modular exponentiation a^b, 31 bit ^ 32 bit mod a 31-bit number "n". n must be defined as a global variable, along with its reciprocal. */ long answer = 1, temp = a; while ( b>0 ) { if ( b&1 ) answer = timesmod( temp, answer ); temp = timesmod( temp, temp ); b >>= 1; } return answer; } /*---------------------------------------------------------------------*/ void precompute( long *x, long primroot, long power ) { /* precomputes the roots of unity that the NTT routine will need, into the array x. primroot is the primitive root / generator to use, and power is the maximum power of two size that can be transformed. Note that the array x can also be used as-is for transforming any power of two *smaller* than this upper limit, meaning you only have to do this once for each prime. Which is good, because the computation is ponderously slow. */ long size = 1<<(power-1), inc = size/2, i=0, j; while( inc ) { for( j=0; j {a+b, a-b} mod n It's assumed that all the numbers involved are positive, and the code checks that they stay that way. len must be a multiple of 4. Running time: 35 cycles per 4 butterflies. */ asm(" pushl %%ebp .align 4 0: movl (%0), %%esi movl (%0), %%ebp pushl %3 movl (%1), %%edi addl %%edi, %%ebp subl %%edi, %%esi movl %%esi, %%edi subl %2, %%ebp sarl $31, %%edi movl %%ebp, %3 sarl $31, %3 andl %2, %%edi addl %%edi, %%esi andl %2, %3 addl %3, %%ebp movl %%esi, (%1) movl %%ebp, (%0) movl 4(%1), %%edi movl 4(%0), %%esi movl 4(%0), %%ebp addl %%edi, %%ebp subl %%edi, %%esi movl %%esi, %%edi subl %2, %%ebp sarl $31, %%edi movl %%ebp, %3 sarl $31, %3 andl %2, %%edi addl %%edi, %%esi andl %2, %3 addl %3, %%ebp movl %%esi, 4(%1) movl %%ebp, 4(%0) movl 8(%1), %%edi movl 8(%0), %%esi movl 8(%0), %%ebp addl %%edi, %%ebp subl %%edi, %%esi movl %%esi, %%edi subl %2, %%ebp sarl $31, %%edi movl %%ebp, %3 sarl $31, %3 andl %2, %%edi addl %%edi, %%esi andl %2, %3 addl %3, %%ebp movl %%esi, 8(%1) movl %%ebp, 8(%0) movl 12(%1), %%edi movl 12(%0), %%esi movl 12(%0), %%ebp addl %%edi, %%ebp subl %%edi, %%esi movl %%esi, %%edi subl %2, %%ebp sarl $31, %%edi movl %%ebp, %3 sarl $31, %3 andl %2, %%edi addl %%edi, %%esi andl %2, %3 addl %3, %%ebp movl %%esi, 12(%1) popl %3 movl %%ebp, 12(%0) addl $16, %0 addl $16, %1 subl $4, %3 jnz 0b popl %%ebp ": : "a"(a), "b"(b), "c"(n), "d"(len) : "%esi", "%edi", "%ebp", "memory" ); } /*-------------------------------------------------------------------------*/ void array_first2( long *a, long len ) { /* performs the first two levels of the decimation in time NTT for "len" points in a row. len above must be a multiple of 4. Running time: 48 cycles per 4 array elements. This involves four butterflies but only one modular multiplication (by the contents of stuff[3] ) per group of 4 points. The mod step for each butterfly and the multiply is *very* tedious, and more than doubles the runtime. There are also many more stalls here than elsewhere, and they're unavoidable for want of more registers. */ asm(" pushl %%ebp .align 4 0: movl 8(%0), %%ebp movl 12(%0), %%edi leal (%%ebp,%%edi), %%esi subl %%edi, %%ebp movl %%ebp, %%edi subl %1, %%esi sarl $31, %%edi movl %%esi, %%edx sarl $31, %%edx andl %1, %%edi addl %%edi, %%ebp andl %1, %%edx addl %%edx, %%esi movl %%ebp, 12(%0) movl %%esi, 8(%0) fildl 12(%0) movl (%0), %%ebp movl 4(%0), %%edi leal (%%ebp,%%edi), %%esi subl %%edi, %%ebp fmull _stuff+24 subl %1, %%esi movl %%ebp, %%edi sarl $31, %%edi movl %%esi, %%edx fld %%st fmull _stuff+48 sarl $31, %%edx andl %1, %%edi addl %%edi, %%ebp andl %1, %%edx faddl _stuff addl %%edx, %%esi movl %%ebp, 4(%0) movl 8(%0), %%edi fsubl _stuff leal (%%esi,%%edi), %%ebp subl %%edi, %%esi movl %%esi, %%edi subl %1, %%ebp fmull _stuff+16 sarl $31, %%edi movl %%ebp, %%edx sarl $31, %%edx andl %1, %%edi fsubrp %%st, %%st(1) addl %%edi, %%esi andl %1, %%edx addl %%edx, %%ebp movl %%esi, 8(%0) faddl _stuff+8 movl %%ebp, (%0) movl 4(%0), %%ebp fstpl _stuff+32 # stall x 2 movl _stuff+32, %%esi addl $16, %0 sarl $31, %%esi movl _stuff+32, %%edi andl %1, %%esi addl %%esi, %%edi # stall leal (%%ebp,%%edi), %%esi subl %%edi, %%ebp movl %%ebp, %%edi subl %1, %%esi sarl $31, %%edi movl %%esi, %%edx sarl $31, %%edx andl %1, %%edi addl %%edi, %%ebp andl %1, %%edx addl %%edx, %%esi movl %%ebp, -4(%0) movl %%esi, -12(%0) nop subl $4, %2 jnz 0b popl %%ebp ": : "a"(a), "b"(n), "c"(len) : "%edx", "%esi", "%edi", "%ebp", "memory" ); } /*-------------------------------------------------------------------------*/ void ntt_DIT( long *x, long *w, long power ) { /* performs a 2^power size decimation in time number-theoretic transform. w points to the array of roots of unity, assumed to be calculated with precompute(). */ long blocks, passes, *top, *middle, size = 1<=0; passes--,size*=2 ) { blocks = 1<