Robert Harley (Robert.Harley@inria.fr)
Sun, 21 Feb 1999 17:23:28 +0100 (MET)
/* > sqrt.S
* Purpose: Fast double precision square root in Alpha ASM.
* Copyright: (c) Robert J. Harley, 19-Oct-1998
* Contact: Robert.Harley@inria.fr
* Legalese: This code is subject to the GNU General Public License (v2).
*/
/* Integer registers. */
#define ea $1
#define ta $2
#define wm $3
#define wp $4
#define pt $5
#define ux $6
#define ua $7
#define pc $8
#define v $22
#define v2 $23
#define tmp $24
#define exp $25
#define ra $26
#define pv $27
#define at $28
#define gp $29
#define sp $30
#define zero $31
/* Floating-point registers. */
#define dX $f0
#define d0_5 $f10
#define d1_5 $f11
#define dB $f12
#define dT $f13
#define dZ $f14
#define dTmp $f15
#define dA $f16
#define dZero $f31
/* Delay one integer slot (E0 or E1). */
#define NOP bis $31, $31, $31
/* Delay one floating-point slot (FA or FM). */
#define FNOP cpys $f31, $f31, $f31
/* No delay, for alignment (invalidated at slotting). */
#define UNOP ldq_u $31, 0($30)
.set noreorder
.section .rodata
#define BLA(n) .long n<<16 | 126<<23
.type tab, @object
.size tab, 256
.align 5
tab:
BLA(127)
BLA(125)
BLA(123)
BLA(121)
BLA(119)
BLA(118)
BLA(116)
BLA(114)
BLA(113)
BLA(111)
BLA(109)
BLA(108)
BLA(106)
BLA(105)
BLA(103)
BLA(102)
BLA(100)
BLA(99)
BLA(97)
BLA(96)
BLA(95)
BLA(93)
BLA(92)
BLA(91)
BLA(90)
BLA(88)
BLA(87)
BLA(86)
BLA(85)
BLA(84)
BLA(83)
BLA(82)
BLA(80)
BLA(79)
BLA(78)
BLA(77)
BLA(76)
BLA(75)
BLA(74)
BLA(73)
BLA(72)
BLA(71)
BLA(70)
BLA(70)
BLA(69)
BLA(68)
BLA(67)
BLA(66)
BLA(65)
BLA(64)
BLA(63)
BLA(63)
BLA(62)
BLA(61)
BLA(60)
BLA(59)
BLA(59)
BLA(58)
BLA(57)
BLA(56)
BLA(56)
BLA(55)
BLA(54)
BLA(53)
#undef BLA
.align 4
consts:
.s_floating 1.5
.s_floating 0.5
.t_floating 0.70710678118654757273731092936941422522068023681640625
.text
.globl sqrt
.ent sqrt
.align 5
sqrt:
.frame sp, 24, ra, 0
subq sp, 24, sp
ldgp gp, .-sqrt(pv)
stt dA, 0(sp)
.prologue 1
.align 4
lda pc, consts
lda exp, 1022
ldq ua, 0(sp)
sll exp, 52, exp
lda pt, tab
lds d1_5, 0(pc)
srl ua, 46, ta
lds d0_5, 4(pc)
and ta, 63, ta
sll ua, 12, ux
s4addq ta, pt, pt
srl ux, 12, ux
lds dX, 0(pt)
bis ux, exp, ux
ble ua, pm_zero_or_negative
stq ux, 0(sp)
mult dX, dX, dTmp
srl ua, 52, ea
ldt dZ, 0(sp)
beq ea, positive_denormal
lda tmp, -2047(ea)
addq ea, 1, ta
addt dZ, dZ, dB
mult dZ, dTmp, dZ
beq tmp, pos_nan_or_inf
bic ta, 1, ta
lda ta, -1024(ta)
subt d1_5, dZ, dT
mult dZ, dT, dZ
ldt dTmp, 8(pc)
mult dX, dT, dX
sll ua, 52, wp
mult dZ, dT, dZ
UNOP
subt d1_5, dZ, dT
UNOP
sll ta, 51, ta
UNOP
mult dX, dT, dX
blbs ea, skip
addt dB, dB, dB
sll ua, 53, wp
mult dX, dTmp, dX
UNOP
skip:
mult dB, dX, dT
mult dX, d0_5, dX
mult dT, dT, dZ
subt dB, dZ, dZ
mult dX, dZ, dX
addt dX, dT, dX
stt dX, 0(sp)
FNOP
ldq ux, 0(sp)
subq ux, exp, v
mulq v, v, v2
addq ux, ta, ux
stq ux, 8(sp)
addq ux, 1, ux
subq wp, v, wm
stq ux, 0(sp)
subq ux, 2, ux
stq ux, 16(sp)
addq wp, v, wp
UNOP
subq v2, wm, wm
subq v2, wp, wp
sra wm, 63, wm
cmple zero, wp, wp
addq wm, wp, wm
s8addq wm, sp, wm
ldt dX, 8(wm)
addq sp, 24, sp
ret zero, (ra), 1
.align 4
pm_zero_or_negative:
positive_denormal:
pos_nan_or_inf:
cpys dZero, dZero, dX /* Junk for now. */
addq sp, 24, sp
ret zero, (ra), 1
.end sqrt
This archive was generated by hypermail 2.0b3 on Sun Feb 21 1999 - 09:00:26 PST