;**************************************************************************************** ;* TEENSYPI.COM -> world's smallest decimal spigot program -> 86 bytes (maybe 90) * ;* (generates & prints 9306 correct decimal digits of Pi) * ;**************************************************************************************** ; based on info primarily found on JASON'S PI PAGE ; -> http://www.isr.umd.edu/~jasonp/pipage.html ; ; the smallest decimal program on current jason site is hpigot.asm by Henrik Pedersen ; this program is very tight so I took 2 fundamentally differtent approaches to beat it ; ; 1. replacing initalizing array to 2000 for first inner loop with code modification ; the standard appraoch in all decimal spigot programs I've seen is to init array with... ; mov ax,2000 3 ; mov cx,terms 3 ; mov di,buffer 3 ; rep stosw 2 ; ; this sequence takes 11 bytes plus the subsequent "bfr[b]" load of 2-5 bytes ; (hpigot.asm uses 5B load) ; ; instead I don't initalize the array but rather change the load inst so that on the ; first pass it loads 2000 & on subsequent passess it loads bfr[b] ; mov dx,2000 3 ; mov [si],dx 2 this dummy store is changed after the first pass to ; mov dx,[si] - by changing only the opcode byte ; ; mov byte ptr inst,08Bh 5 this is the inst changing the store to a load ; ; this new sequence takes only 10 bytes including the laod rather than the ; previous 11+load ; ; 2. I could have just added technique #1 to hpigot but that's no fun. so instead I ; decided to use a different apparoach in the inner loop based on using 32-bit ; instructions. this was an experiment to see if the power of 32-bit instructions ; would offset their larger size (due to 66h prefix & larger constant values) ; ; * the program needs the high part of 2 32b regs initialized to 0. I (& the x86 guys here) ; believe that the high 16b of all GPRs is 0S upon entry to a .COM (the low 16b is definitely ; not initialized to 0s). this is artifact of the original 16b real-DOS behavior. ; ; thus,the 86 byte version of the program doesn't init the high 16b of the 2 regs. ; however, an control variable allows these regs to be initialized by the program. ; this initialization adds 4 bytes to the size yielding a 90 byte version ; - still smaller than smallest I found on web ; ; note that i couldn't imporve over the hspigot.asm print code so that is the same. ; (in fact, my print code takes 1 more byte since i haven't cleared cx when I start it) ; ;--the simple c version of my rough algorithm------------------------------- ; (copied from some Pi site) ; ; UINT a = 10000, b, c = 65142, d, e, f[65142], g, h, i; ; for (;b = c, c -= 14; i = printf("%04d", e + dd/a), e = dd % a ) { ; while (g = --b*2) { ; dd = h * b + a * (i ? f[b] : 2000); ; h = dd / --g; ; f[b] = dd % g; ; } ; } ;**************************************************************************************** ;--memory image (decimal) ; 0 - 255: the PSP ; 256 - 345: the teensypi.com instruction bytes (90) ; 346 - 65487: the data array = 32571 16b words = 9306 digits ; (9306 digits * 14 / 4 digits per word -> 32571) ; 65488 - 65535: 48 bytes of stack space (somewhat arbritary size) ; (program uses only 1 word beyond the inital 0 word on stack, ; this stack size should allow for interrupts) ; designed to be assembled under MASM digits EQU 9306 ; enough to fill up 64k sehment terms EQU (digits*14)/4 ; the number of array elements .586 CODE SEGMENT 'CODE' USE16 ASSUME CS:CODE,DS:CODE,ES:CODE,SS:CODE ORG 100H TEENSYPI PROC NEAR ;--initalize---------------------------------------------------- HIGH_32_INIT_NEEDED EQU 0 ; flag says that we need to init high 16b of some 32b regs to 0 IF HIGH_32_INIT_NEEDED ;lea ebx,[terms] ; 5 ; clears top 16b ebx -> needed for 32-bit mul below ; no manipulation of b later ever encraches into top half db 66h, 8dh, 1eh, 3ah, 7fh ; inst emited as db due to perversity of masm ;lea ebp,[10000k] ; 5 ; clears top 16b ebp -> needed for 32-bit mul below ; no manipulation of b later ever encraches into top half db 66h, 8dh, 2eh, 10h, 27h xor di,di ; inital outer loop remainder (e) 2 ELSE mov bx,terms ; 3 mov bp,10000 ; 3 xor di,di ; inital outer loop remainder (e) 2 ENDIF ;--outer loop -> for (c = terms-1; c > = 0; c -= 14) ------------- loop1: ;--outer loop regs ; c value in bx at start -> kept in stack across inner loop ;--b = c & init inner loop push bx ; c in bx becomes b ->save c 1 xor eax,eax ; clr incoming quo 3 ;--inner loop -> iterate thru bfr[] from c down---------------- loop2: ;--regs ; previous term quo in eax ; b in ebx -> 2b-1 in ecx -> next b-1 (32b for use of single mul) ; (cx needs to be 0 at end of inner loop for print code) ; &buffer[b] in si ; 10000 in ebp (32b for use of reg mul) ; last outer loop remainder (e) in di ; ; eax, edx used as temps ; ecx unused! ;--prev quo * b mul ebx ; prev quo may > 16 bits 3 ; result here fits into eax so edx is cleared -> need that below ;--no longer need b directly -> 2b for addr -> 2b-1 (denom) in bx ; bx reused for 2b-1 in order to save move to unused cx ; (but this costs an extra byte at end using shr instead of dec to get new b-1) add bx,bx ; 2 lea si,buffer[bx] ; 4 dec bx ; 1 ;--2000 or f[b] based on first loop or not mov dx,2000 ; 3 modified_inst: ; to overlay after first pass mov [si],dx ; 89... 2 ;mov dx,[si] ; 8B... ; after first pass, this mov [si],dx is overlaid with mov dx,[si] ; thus, only 1 byte (the mov opcode) needs to be overlaid -> saves space in overlay code ; ; (we can't easily directly overlay the mov dx,2000 sunce it's 3 bytes) ; thus, a total of 3 + 2 + 5 (for the code mode inst below) = 10 bytes needed ; to handle the initalization pass INCLUDING the load bfr[b] itself ; ; this 16b load counts on hi edx being 0 from mul above ;--(bfr[b] / 2000) * 10000 imul edx,ebp ; 10000 pos so same result as mul 4 ; use imul because no mul edx,ebp instruction form ;--(denom * b) + (bfr[b]/b * 10000) add eax,edx ; 3 ;--divides here need 32/16 -> 32 quo, 16 remain ; cause the 32/16 may result in 16b overflow ; our approach is to do 32:32/32 -> 32:16 so quo is in eax ;--quo (h) = d / denom; cdq ; zeroes edx 2 div ebx ; 32b quo now in eax 3 ;--f[b] = d % g; mov [si],dx ; dx -> f[b] (only need 16b) 2 ;--if (--b) goto loop2 shr bx,1 ; (2b-1)>>1 -> b-1 2 jnz loop2 ; 2 ;--end of inner loop-------------------------------------------- ;--last quo in eax ;--fix up the load for after 1st pass mov byte ptr modified_inst,08Bh ; -> mov dx,[si] 5 ; takes advantage of fact that CS = DS for .COM ;--e + quo / 10000 ;--e = quo % 10000 cdq ; 2 div ebp ; 3 add ax,di ; leave in ax for printing 2 mov di,dx ; save e for next pass 2 ;--print it mov cx,4 ; loop for 4 digits 3 mov bl,10 ; bh happens to be 0 from loop 2 push_loop: cwd ; 0 -> dx 1 div bx ; 2 push dx ; 1 loop push_loop ; 2 mov cl,4 ; 2 pop_loop: pop ax ; 1 add al,'0' ; to ascii 2 int 29h ; 2 loop pop_loop ; 2 ;--c-=14 & loop pop bx ; fetch back c from STK #1 1 sub bx,14 ; 3 jns loop1 ; 2 ;--we're done ret ; back to DOS ; 1 TEENSYPI ENDP ;***** DATA AREA **************************************************** ; ; EVEN ; buffer EQU $ ; digit buffer CODE ENDS END TEENSYPI