/*---------------------------------------------------------------------- blokpi.c: calculate pi v1.0 This code is for a very fast pi calculator that adds up series with a certain special form. It uses block summation (see the blokpi.txt for what this means and how it works) and can be configured at com- pile time for optimal pi-crunching speed across different platforms. I designed it to be compiled using djgpp on the Pentium processor (since that's what I have), and it runs fastest for these, but the code involved is generic enough so that any ANSI C compiler will do. Another plus is that the series added up can be changed with a min- imum of effort, so this program can *also* add up similar series for other constants, also at breakneck speed. This program is placed in the public domain by its author, Jason Papadopoulos. Do whatever the heck you want with it, but if you figure out a way to speed it up, please be kind and send an e-mail to jasonp@glue.umd.edu --------------------------------------------------------------------*/ #include #include #include #include /* compile for Intel FPU */ #define INTEL /* number of doubles in 1 block */ #define BLOCKPI 1000 /* number of fractions in 1 block. If GCCDOS is defined this must be a multiple of 4; otherwise it must be a multiple of 3. To be really general, make it a mul- tiple of 12). NOTE: it must also be a multiple of FRACTIONS_PER_TERM */ #define BLOCKTERM 252 /* special high-performance code is available when compiled using djgpp for DOS. Runs about 18% faster. */ #define GCCDOS /* specifics of the formula used */ #define FRACTIONS_PER_TERM 2 #define CORRECT_DIGITS_PER_TERM 1.4314 double stuff[14] = { 0, 0, /* don't mess with these; filled later */ 13, /* 1st additive constant; req'd */ 7, /* increment for 1st additive constant */ 8, /* numerator of scaling constant */ 15, /* denominator of scaling constant */ 2, /* 1st numerator */ 2, /* increment for 1st numerator */ 11, /* 1st denominator */ 6, /* increment for 1st denominator */ 1, /* 2nd numerator */ 2, /* increment for 2nd numerator */ -21, /* 2nd denominator */ -18}; /* increment for 2nd denominator */ /* ...add more fractions if needed... */ /* Pick the base you want to work in, i.e. how many digits get stored in a double. For Intel registers, BASEDIGITS + (max number of digits in num and denom of fractions) = 18. For anything else the sum is 14 (the IEEE guarantees at least that much). The more terms you want to add up the smaller BASE must be. 10 is a good default for Intel stuff */ #define BASE 1.0e10 #define BASEDIGITS 10 /******************* CODE STARTS HERE ***********************************/ typedef struct { double recip; double num; double den; double rem; } term; #ifdef INTEL #define ROUND 3.0 * 2147483648.0 * 2147483648.0 /* 3*2^62 */ #else #define ROUND 3.0 * 67108864.0 * 67108864.0 /* 3*2^52 */ #endif #ifdef GCCDOS /*-----------------------ASSEMBLY CODED NUMBER CRUNCHING ROUTINE -----*/ #define PIPELINE 4 void crunch( double *pi, long length, term *t ) { /* High-performance djgpp code that multiplies a block of the array pi by a quartet of fractions t[0],t[1],t[2],t[3]. *pi must point to one element beyond the end of the block. It works with four fractions at a time, whereas the generic crunch() works with three. Running time: the inner loop runs in 50 cycles on a Pentium. There's a stall here and there on more modern processors be- cause you have to wait longer for multiplies to finish, although I've tried to minimize those too. "length" is the number of doubles before *pi that the loop will start from, and must be negative. */ asm(" fldl 24(%1) # r0 loads cache lines for fldl 56(%1) # r1 r0 each term fldl 88(%1) # r2 r1 r0 fldl 120(%1) # r3 r2 r1 r0 .align 4 0: fldl _stuff # B r3 r2 r1 r0 B = BASE fmul %%st, %%st(4) # B r3 r2 r1 R0 fldl (%0,%2,8) # x0 B r3 r2 r1 R0 fmull 8(%1) # nx0 B r3 r2 r1 R0 fxch %%st(1) # B nx0 r3 r2 r1 R0 fldl -8(%0,%2,8) # x1 B nx0 r3 r2 r1 R0 fmull 40(%1) # nx1 B nx0 r3 r2 r1 R0 fxch %%st(6) # R0 B nx0 r3 r2 r1 nx1 faddp %%st, %%st(2) # B d0 r3 r2 r1 nx1 fmul %%st, %%st(4) # B d0 r3 r2 R1 nx1 fldl -16(%0,%2,8) # x2 B d0 r3 r2 R1 nx1 fmull 72(%1) # nx2 B d0 r3 r2 R1 nx1 fxch %%st(6) # nx1 B d0 r3 r2 R1 nx2 faddp %%st, %%st(5) # B d0 r3 r2 d1 nx2 fmul %%st, %%st(3) # B d0 r3 R2 d1 nx2 fldl -24(%0,%2,8) # x3 B d0 r3 R2 d1 nx2 fmull 104(%1) # nx3 B d0 r3 R2 d1 nx2 fxch %%st(6) # nx2 B d0 r3 R2 d1 nx3 faddp %%st, %%st(4) # B d0 r3 d2 d1 nx3 fmulp %%st, %%st(2) # d0 R3 d2 d1 nx3 fld %%st # d0 d0 R3 d2 d1 nx3 fmull (%1) # q0 d0 R3 d2 d1 nx3 fxch %%st(5) # nx3 d0 R3 d2 d1 q0 faddp %%st, %%st(2) # d0 d3 d2 d1 q0 fld %%st(3) # d1 d0 d3 d2 d1 q0 fmull 32(%1) # q1 d0 d3 d2 d1 q0 fxch %%st(5) # q0 d0 d3 d2 d1 q1 faddl _stuff+8 # q0+ d0 d3 d2 d1 q1 fld %%st(3) # d2 q0+ d0 d3 d2 d1 q1 fmull 64(%1) # q2 q0+ d0 d3 d2 d1 q1 fxch %%st(6) # q1 q0+ d0 d3 d2 d1 q2 faddl _stuff+8 # q1+ q0+ d0 d3 d2 d1 q2 fld %%st(3) # d3 q1+ q0+ d0 d3 d2 d1 q2 fmull 96(%1) # q3 q1+ q0+ d0 d3 d2 d1 q2 fxch %%st(7) # q2 q1+ q0+ d0 d3 d2 d1 q3 faddl _stuff+8 # q2+ q1+ q0+ d0 d3 d2 d1 q3 fxch %%st(2) # q0+ q1+ q2+ d0 d3 d2 d1 q3 fsubl _stuff+8 # q0- q1+ q2+ d0 d3 d2 d1 q3 fxch %%st(1) # q1+ q0- q2+ d0 d3 d2 d1 q3 fsubl _stuff+8 # q1- q0- q2+ d0 d3 d2 d1 q3 fxch %%st(7) # q3 q0- q2+ d0 d3 d2 d1 q1- faddl _stuff+8 # q3+ q0- q2+ d0 d3 d2 d1 q1- fxch %%st(2) # q2+ q0- q3+ d0 d3 d2 d1 q1- fsubl _stuff+8 # q2- q0- q3+ d0 d3 d2 d1 q1- fxch %%st(1) # q0- q2- q3+ d0 d3 d2 d1 q1- fstl (%0,%2,8) # q0- q2- q3+ d0 d3 d2 d1 q1- fmull 16(%1) # nq0 q2- q3+ d0 d3 d2 d1 q1- fxch %%st(7) # q1- q2- q3+ d0 d3 d2 d1 nq0 fstl -8(%0,%2,8) # q1- q2- q3+ d0 d3 d2 d1 nq0 fmull 48(%1) # nq1 q2- q3+ d0 d3 d2 d1 nq0 fxch %%st(2) # q3+ q2- nq1 d0 d3 d2 d1 nq0 fsubl _stuff+8 # q3- q2- nq1 d0 d3 d2 d1 nq0 fxch %%st(1) # q2- q3- nq1 d0 d3 d2 d1 nq0 fstl -16(%0,%2,8) # q2- q3- nq1 d0 d3 d2 d1 nq0 fmull 80(%1) # nq2 q3- nq1 d0 d3 d2 d1 nq0 fxch %%st(7) # nq0 q3- nq1 d0 d3 d2 d1 nq2 fsubrp %%st, %%st(3) # q3- nq1 R0 d3 d2 d1 nq2 fstl -24(%0,%2,8) # q3- nq1 R0 d3 d2 d1 nq2 fmull 112(%1) # nq3 nq1 R0 d3 d2 d1 nq2 fxch %%st(1) # nq1 nq3 R0 d3 d2 d1 nq2 fsubrp %%st, %%st(5) # nq3 R0 d3 d2 R1 nq2 fxch %%st(5) # nq2 R0 d3 d2 R1 nq3 fsubrp %%st, %%st(3) # R0 d3 R2 R1 nq3 fxch %%st(4) # nq3 d3 R2 R1 R0 remainders end up fsubrp %%st, %%st(1) # R3 R2 R1 R0 in original order incl %2 jnz 0b fstpl 120(%1) # r2 r1 r0 fstpl 88(%1) # r1 r0 fstpl 56(%1) # r0 fstpl 24(%1) ": : "r"(pi), "r"(t), "r"(length) : "memory" ); } #else /*----------------------------- GENERIC NUMBER CRUNCHING ROUTINE -----*/ #define PIPELINE 3 void crunch( double *pi, long length, term *t ) { /* multiply a block of the array pi by a trio of fractions t[0],t[1],t[2]. *pi must point to one element beyond the end of the block. For RISC machines one may get better performance by mixing loads and stores with arithmetic more thoroughly than is done here. "length" is the number of doubles before *pi that the loop will start from (usually -BLOCKPI), and must be negative. */ long j; register double f0, f1, f2, f3, f4, f5; f0 = t[0].rem; f1 = t[1].rem; f2 = t[2].rem; for (j=length; j; j++) { f3 = pi[j]; f3 *= t[0].num; f4 = pi[j-1]; f4 *= t[1].num; f5 = pi[j-2]; f5 *= t[2].num; f0 *= stuff[0]; f1 *= stuff[0]; f2 *= stuff[0]; f0 += f3; f1 += f4; f2 += f5; f3 = f0; f3 *= t[0].recip; f4 = f1; f4 *= t[1].recip; f5 = f2; f5 *= t[2].recip; f3 += stuff[1]; f4 += stuff[1]; f5 += stuff[1]; f3 -= stuff[1]; f4 -= stuff[1]; f5 -= stuff[1]; pi[j] = f3; f3 *= t[0].den; pi[j-1] = f4; f4 *= t[1].den; pi[j-2] = f5; f5 *= t[2].den; f0 -= f3; f1 -= f4; f2 -= f5; } t[0].rem = f0; t[1].rem = f1; t[2].rem = f2; } /*---------------------------------------------------------------*/ #endif void crunch_one( double *pi, long length, term *t ) { /* multiply "length" doubles of the array *pi by the single fraction that *t points to. *pi must be one element beyond the end of the array. Note that this code is very inefficient and should be used sparingly; the pipelined crunch() routines are 2-2.5 times faster. */ long i; register double f0, f1; f0 = (*t).rem; for (i=length; i; i++) { f0 *= stuff[0]; f1 = pi[i]; f1 *= (*t).num; f0 += f1; f1 = f0; f1 *= (*t).recip; f1 += stuff[1]; f1 -= stuff[1]; pi[i] = f1; f1 *= (*t).den; f0 -= f1; } (*t).rem = f0; } /*---------------------------------------------------------------*/ void blockmultiply( double *pi, long length, term *t, long tlength ) { /* Multiply a block of doubles "*pi" by a block of fractions "*t". *pi and *t must point to one element beyond the end of their respective blocks. "length" and "tlength" denote the size of the *pi and *t arrays, respectively, and must be negative. This is the program's major workhorse routine, and multiplies by PIPELINE fractions at a time. */ long i, j; for (i=tlength; i; i+=PIPELINE) { /* set up for pipelined multiplication. crunch_one is used to move some remainders farther forward than others, so that several array elements in pi[] are all in different stages and can all be worked on simultaneuously. */ for (j=PIPELINE-1; j; j--) crunch_one( &pi[length+j], -j, &t[i+PIPELINE-1-j] ); /* switch to the much faster multiply routine. If the block size BLOCKPI is large then the overhead of using crunch_one becomes negligible. */ crunch( pi, length+PIPELINE-1, &t[i] ); /* re-align all the remainders again, using crunch_one. This lets you stop crunching out the present set of fractions and pick up the computation later, in the next block. Other computations on this block can be performed, while the block is still in cache. */ for (j=1; j OutputFile"); printf("\n\n"); exit(-1); } digits=atol(argv[1]); number=digits/CORRECT_DIGITS_PER_TERM + 3; digits = digits/BASEDIGITS + 1; /* figure out how many */ if( digits % BLOCKPI < 6 ) /* words are needed, and */ digits += 6 - digits % BLOCKPI; /* make sure there are */ /* enough for pipelining */ number *= FRACTIONS_PER_TERM; while( (number % BLOCKTERM) % PIPELINE ) /* do the same for the */ number += FRACTIONS_PER_TERM; /* number of terms. */ number /= FRACTIONS_PER_TERM; x = (double *)calloc( digits, sizeof(double) ); /* allocate memory */ y = (term *)calloc( BLOCKTERM, sizeof(term) ); stuff[0] = BASE; /* initialize the array */ stuff[1] = ROUND; /* of auxiliary stuff */ stuff[2] += (number-1) * stuff[3]; for(j=0; j=BASE) { x[i]-=BASE; x[i-1]++; } } printout(x, digits); printf( "\n\nelapsed time: %lf s\n", (double)(stop-start)/(double)CLOCKS_PER_SEC); free(x); free(y); /* whew! */ }