314 lines
10 KiB
NASM
314 lines
10 KiB
NASM
|
; int _MulUnsArrByUnsArr(src1,src2,dst,m,n,tmp1)
|
||
|
;
|
||
|
; ARGUMENT
|
||
|
; unsigned *src1[m],*src2[n]; where (m,n)<=5
|
||
|
; unsigned *dst[10]; destination is unsigned [10]
|
||
|
; int m,n; m=#ints in src1, n=#ints in src2
|
||
|
; int tmp1; where tmp1 is used as a temp multiplier for each loop
|
||
|
;
|
||
|
; DESCRIPTION
|
||
|
;
|
||
|
; Multiplies multiplier by multiplicand giving dst. Src1 and src2 are each
|
||
|
; at most 80 bits wide, so up to 80-bit x 80-bit is computed to 160-bit. The number of ints in src1
|
||
|
; and src2 are examined to determine which is the multiplier, ie.
|
||
|
; which has the fewest number of ints. If they have the same number
|
||
|
; of ints, then src1 will be the multiplier. As each int
|
||
|
; is multiplied to obtain the partial product, it is added to the dst
|
||
|
; and any carries are added to succeeding column locations in the
|
||
|
; dst array.
|
||
|
;
|
||
|
; SIDE EFFECTS
|
||
|
; Src1 and src2 remain unchanged. Dst IS ZEROED PRIOR TO ADD. NOTE THAT
|
||
|
; NEITHER src1 NOR src2 CAN HAVE THE MSB OF THE HIGH-ORDER INT SET, ie.
|
||
|
; neither number can be negative. If it is, the results are indeterminate.
|
||
|
;
|
||
|
; RETURNS
|
||
|
; None.
|
||
|
;
|
||
|
; AUTHOR
|
||
|
; Andy Anderson 04-Jun-88 1030
|
||
|
; Copyright (C) 1987-90 Greenleaf Software Inc. All Rights Reserved.
|
||
|
;
|
||
|
; MODIFICATIONS
|
||
|
;
|
||
|
;
|
||
|
.SFCOND
|
||
|
|
||
|
include model.h
|
||
|
include prologue.h
|
||
|
include gm.equ
|
||
|
|
||
|
|
||
|
; partial products for intermediate results
|
||
|
|
||
|
ReferVar wGMTemp1,<cWord>
|
||
|
|
||
|
dseg _gm
|
||
|
endds
|
||
|
|
||
|
pseg gmath
|
||
|
;
|
||
|
;
|
||
|
; if large memory model then:
|
||
|
;
|
||
|
; parm1_ = ptr to multiplier
|
||
|
; parm3_ = ptr to multiplicand
|
||
|
; parm5_ = destination segment ptr } pointer to unsigned[10]
|
||
|
; destination offset ptr } destination
|
||
|
; parm7_ = # of 16-bit int's in multiplier
|
||
|
; parm8_ = # of 16-bit int's in multiplicand
|
||
|
; parm9_ = temp for current multiplier
|
||
|
;
|
||
|
; for if small model then
|
||
|
; parm1_ = ptr to multiplier
|
||
|
; parm2_ = ptr to multiplicand
|
||
|
; parm3_ = ptr to unsigned[10] dst
|
||
|
; parm4_ = # of 16-bit int's in MULTIPLIER (a1,...,an)
|
||
|
; parm5_ = # of 16-bit int's in MULTIPLICAND(b1,...,bn)
|
||
|
; parm6_ = temp for current multiplier
|
||
|
;
|
||
|
;
|
||
|
; Then set up and do the first set of mults. Note that the # of
|
||
|
; 'bigloop' iterations correspond to the number of 16-bit digits
|
||
|
; in the multiplier and the 'mlp' loop iterations correspond to the
|
||
|
; # ints in the multiplicand. [these line up if you set tab=5].
|
||
|
; Algorithm(32-bitx32-bit):
|
||
|
;                           b2    b1
;                           a2    a1
;                         ____________
;
;| first                    p1h   p1l
;|iteration           p2h   p2l
;|2nd                 p3h   p3l
;|iter          p4h   p4l
;             ________________________
;               c4    c3    c2    c1
|
||
|
;
|
||
|
;or---> Algorithm(64-bitx32-bit):
|
||
|
; b4 b3 b2 b1
|
||
|
; a2 a1
|
||
|
; __________________
|
||
|
;| first p1h p1l
|
||
|
;|iteration p2h p2l
|
||
|
;| p3h p3l
|
||
|
;| p4h p4l
|
||
|
;----
|
||
|
;| p5h p5l
|
||
|
;| 2nd p6h p6l
|
||
|
;|iter p7h p7l
|
||
|
;| p8h p8l
|
||
|
;-----
|
||
|
;______________________________________
|
||
|
; 0 0 c6 c5 c4 c3 c2 c1
|
||
|
;
|
||
|
;or---> Algorithm(64-bitx64-bit):
|
||
|
; b4 b3 b2 b1
|
||
|
; a4 a3 a2 a1
|
||
|
; __________________
|
||
|
;| first p1h p1l
|
||
|
;|iteration p2h p2l
|
||
|
;| p3h p3l
|
||
|
;| p4h p4l
|
||
|
;----
|
||
|
;| p5h p5l
|
||
|
;| 2nd p6h p6l
|
||
|
;|iter p7h p7l
|
||
|
;| p8h p8l
|
||
|
;-----
|
||
|
;| third p1h p1l
|
||
|
;|iteration p2h p2l
|
||
|
;| p3h p3l
|
||
|
;| p4h p4l
|
||
|
;----
|
||
|
;| p5h p5l
|
||
|
;| 4th p6h p6l
|
||
|
;|iter p7h p7l
|
||
|
;| p8h p8l
|
||
|
;-----
|
||
|
;| fifth p9h p9l
|
||
|
;|iteration p10h p10l
|
||
|
;| p11h p11l
|
||
|
;| p12h p12l
|
||
|
;----
|
||
|
;| 6th p13h p13l
|
||
|
;| p14h p14l
|
||
|
;| p15h p15l
|
||
|
;|p16h p16l
|
||
|
;-----
|
||
|
;_________________________________________________________________
|
||
|
; c8 c7 c6 c5 c4 c3 c2 c1
|
||
|
;
|
||
|
; As each multiply is done, the partial product is added to
|
||
|
; the destination 'dst'
|
||
|
;
|
||
|
;
|
||
|
;
|
||
|
;
; _MulUnsArrByUnsArr(src1, src2, dst, m, n, tmp1)
;
; Schoolbook multiplication of two unsigned arrays of 16-bit words
; (least-significant word first) into the 10-word array dst.
;
; The operand with fewer words is used as the multiplier (src1 on a tie);
; the counts and pointers are exchanged in the argument slots if needed.
; dst is zeroed, then every 16x16 partial product is added into dst at
; word offset (multiplier index + multiplicand index), with any carry
; rippled into the following words.
;
; Argument slots (see the model notes above):
;       large data: parm1_/parm2_ = multiplier   (presumably offset/segment)
;                   parm3_/parm4_ = multiplicand (presumably offset/segment)
;                   parm5_        = far ptr to dst, parm7_ = m, parm8_ = n,
;                   parm9_        = scratch: current multiplier word
;       small data: parm1_ = multiplier, parm2_ = multiplicand,
;                   parm3_ = dst, parm4_ = m, parm5_ = n,
;                   parm6_ = scratch: current multiplier word
;
; Limits: m and n must not exceed 5 (dst holds only 10 words); this is
; NOT checked.  If either count is <= 0 the routine returns with dst
; zeroed and performs no multiplies.
;
; Side effects visible in this code:
;       - AX, BX, CX, DX, SI, DI and the flags are modified here.
;         NOTE(review): SI/DI are not saved in this body -- confirm that
;         the cproc/cproce macros preserve them if the C compiler needs it.
;       - The direction flag is left clear.
;       - The caller's argument slots are overwritten (swap, advancing
;         multiplier pointer, scratch word).
;       - wGMTemp1 is used as the outer-loop counter, so the routine is
;         not reentrant.
;
        cproc   _MulUnsArrByUnsArr,,_mgmn

;
; Make the shorter operand the multiplier.  On exit from this section
; AX = number of words in the multiplier.  The compare is signed.
;
if _LDATA
        push    ds
        push    es
        mov     ax,parm7_               ; ax = m
        cmp     ax,parm8_               ; m versus n
        jle     nochg                   ; m <= n: src1 stays the multiplier
        xchg    ax,parm8_               ; exchange the two counts;
        mov     parm7_,ax               ;   ax = multiplier word count
        mov     bx,parm1_               ; exchange first word of each pointer
        xchg    bx,parm3_
        mov     parm1_,bx
        mov     bx,parm2_               ; exchange second word of each pointer
        xchg    bx,parm4_
        mov     parm2_,bx
else
        mov     ax,parm4_               ; ax = m
        cmp     ax,parm5_               ; m versus n
        jle     nochg                   ; m <= n: src1 stays the multiplier
        xchg    ax,parm5_               ; exchange the two counts;
        mov     parm4_,ax               ;   ax = multiplier word count
        mov     bx,parm1_               ; exchange the two source pointers
        xchg    bx,parm2_
        mov     parm1_,bx
endif

nochg:
ifdef DSNOTHING
        mov     bx,seg wGMTemp1         ; DS is not assumed on entry:
        mov     ds,bx                   ;   point it at wGMTemp1's segment
endif
        mov     wGMTemp1,ax             ; outer-loop count = multiplier words

;
; Zero all 10 words of dst, then leave DI (ES:DI for large data)
; pointing at dst[0].
;
if _LDATA
        les     di,parm5_               ; es:di -> dst
        xor     ax,ax
        mov     cx,10                   ; dst is unsigned[10]
        cld                             ; STOSW must ascend
        rep     stosw                   ; zero-fill dst
        les     di,parm5_               ; back to the start of dst
else
        push    es                      ; save entry es
        mov     ax,ds
        mov     es,ax                   ; STOSW stores through ES
        mov     di,parm3_               ; es:di -> dst
        xor     ax,ax
        mov     cx,10                   ; dst is unsigned[10]
        cld                             ; STOSW must ascend
        rep     stosw                   ; zero-fill dst
        mov     di,parm3_               ; back to the start of dst
        pop     es                      ; restore es
endif

;
; Nothing to multiply if the multiplier (the smaller count) is empty or
; negative.  Without this test LOOP would start with CX = 0 and the
; outer counter would wrap, writing far outside dst.
;
if _LDATA
        cmp     Word Ptr parm7_,0
else
        cmp     Word Ptr parm4_,0
endif
        jg      bigloop
        jmp     done                    ; dst is already zero

;
; Outer loop: once per multiplier word.  DI points at the dst word that
; lines up with the current multiplier word.
;
bigloop:
if _LDATA
        mov     cx,parm8_               ; # words in multiplicand
        lds     si,parm1_               ; ds:si -> current multiplier word
        mov     ax,[si]
        mov     parm9_,ax               ; keep it in the scratch slot
        lds     si,parm3_               ; ds:si -> lsd of multiplicand
else
        mov     cx,parm5_               ; # words in multiplicand
        mov     si,parm1_               ; si -> current multiplier word
        mov     ax,[si]
        mov     parm6_,ax               ; keep it in the scratch slot
        mov     si,parm2_               ; si -> lsd of multiplicand
endif
        xor     bx,bx                   ; byte offset into multiplicand/dst

;
; Inner loop: once per multiplicand word.  DX:AX = partial product,
; added into dst[di+bx] and dst[di+bx+2].
;
mlp:
if _LDATA
        mov     ax,[si][bx]             ; next multiplicand word
        mul     Word Ptr parm9_         ; dx:ax = word * multiplier word
        add     es:[di+bx],ax           ; low half into dst
        adc     es:[di+bx+2],dx         ; high half plus carry
else
        mov     ax,[si+bx]              ; next multiplicand word
        mul     Word Ptr parm6_         ; dx:ax = word * multiplier word
        add     [di+bx],ax              ; low half into dst
        adc     [di+bx+2],dx            ; high half plus carry
endif
        jc      carry                   ; carry out of the high half
nocary:
        add     bx,2                    ; next word
        loop    mlp

;
; One multiplier word finished: count it off.  wGMTemp1 is addressed
; through DS, which the LDS above pointed at the operands, so DS is
; reloaded first.  DS need not be put back afterwards: bigloop reloads
; it with LDS and 'done' pops the caller's DS.  DEC sets ZF itself.
;
if _LDATA
ifndef DSNOTHING
        mov     ax,seg DGROUP
else
        mov     ax,seg wGMTemp1
endif
        mov     ds,ax
endif
        dec     wGMTemp1                ; any multiplier words left?
        je      done                    ; no: finished

        add     Word Ptr parm1_,2       ; advance to next multiplier word
        add     di,2                    ; and shift the dst column by one
        jmp     short bigloop

;
; Here to propagate a carry out of dst[di+bx+2].  Entered with CF = 1.
; Adding 0 with ADC adds just the carry and yields a new carry only if
; the word wrapped.  Per the original notes, at most 4 further words
; can be affected and the last add cannot carry, provided the operands
; respect the size limits above.
;
carry:
if _LDATA
        adc     Word Ptr es:[di+bx+4],0
        jnc     nocary                  ; carry absorbed: back to mult
        adc     Word Ptr es:[di+bx+6],0
        jnc     nocary
        adc     Word Ptr es:[di+bx+8],0
        jnc     nocary
        adc     Word Ptr es:[di+bx+10],0
else
        adc     Word Ptr [di+bx+4],0
        jnc     nocary                  ; carry absorbed: back to mult
        adc     Word Ptr [di+bx+6],0
        jnc     nocary
        adc     Word Ptr [di+bx+8],0
        jnc     nocary
        adc     Word Ptr [di+bx+10],0
endif
        jmp     short nocary            ; no carry is possible from here

done:
if _LDATA
        pop     es                      ; restore caller's segment registers
        pop     ds
endif
        cproce
|
||
|
endps
|
||
|
END
|