sqrt.S


Robert Harley (Robert.Harley@inria.fr)
Sun, 21 Feb 1999 17:23:28 +0100 (MET)


/* > sqrt.S
 * Purpose: Fast double precision square root in Alpha ASM.
 * Copyright: (c) Robert J. Harley, 19-Oct-1998
 * Contact: Robert.Harley@inria.fr
 * Legalese: This code is subject to the GNU General Public License (v2).
 */

/* Integer registers. */

#define ea $1
#define ta $2
#define wm $3
#define wp $4
#define pt $5
#define ux $6
#define ua $7
#define pc $8

#define v $22
#define v2 $23
#define tmp $24
#define exp $25

#define ra $26
#define pv $27
#define at $28
#define gp $29
#define sp $30
#define zero $31

/* Floating-point registers. */

#define dX $f0

#define d0_5 $f10
#define d1_5 $f11
#define dB $f12
#define dT $f13
#define dZ $f14
#define dTmp $f15

#define dA $f16

#define dZero $f31

/* Delay one integer slot (E0 or E1). */
#define NOP bis $31, $31, $31

/* Delay one floating-point slot (FA or FM). */
#define FNOP cpys $f31, $f31, $f31

/* No delay, for alignment (invalidated at slotting). */
#define UNOP ldq_u $31, 0($30)

        .set noreorder

.section .rodata

#define BLA(n) .long n<<16 | 126<<23

        .type tab, @object
        .size tab, 256
        .align 5
tab:

        BLA(127)
        BLA(125)
        BLA(123)
        BLA(121)
        BLA(119)
        BLA(118)
        BLA(116)
        BLA(114)
        BLA(113)
        BLA(111)
        BLA(109)
        BLA(108)
        BLA(106)
        BLA(105)
        BLA(103)
        BLA(102)
        BLA(100)
        BLA(99)
        BLA(97)
        BLA(96)
        BLA(95)
        BLA(93)
        BLA(92)
        BLA(91)
        BLA(90)
        BLA(88)
        BLA(87)
        BLA(86)
        BLA(85)
        BLA(84)
        BLA(83)
        BLA(82)
        BLA(80)
        BLA(79)
        BLA(78)
        BLA(77)
        BLA(76)
        BLA(75)
        BLA(74)
        BLA(73)
        BLA(72)
        BLA(71)
        BLA(70)
        BLA(70)
        BLA(69)
        BLA(68)
        BLA(67)
        BLA(66)
        BLA(65)
        BLA(64)
        BLA(63)
        BLA(63)
        BLA(62)
        BLA(61)
        BLA(60)
        BLA(59)
        BLA(59)
        BLA(58)
        BLA(57)
        BLA(56)
        BLA(56)
        BLA(55)
        BLA(54)
        BLA(53)

#undef BLA

        .align 4
consts:
        .s_floating 1.5
        .s_floating 0.5
        .t_floating 0.70710678118654757273731092936941422522068023681640625

.text
        .globl sqrt
        .ent sqrt
        .align 5
sqrt:
        .frame sp, 24, ra, 0
        subq sp, 24, sp
        ldgp gp, .-sqrt(pv)
        stt dA, 0(sp)
        .prologue 1
        .align 4

        lda pc, consts
        lda exp, 1022
        ldq ua, 0(sp)
        sll exp, 52, exp

        lda pt, tab
        lds d1_5, 0(pc)
        srl ua, 46, ta
        lds d0_5, 4(pc)

        and ta, 63, ta
        sll ua, 12, ux
        s4addq ta, pt, pt
        srl ux, 12, ux

        lds dX, 0(pt)
        bis ux, exp, ux
        ble ua, pm_zero_or_negative
        stq ux, 0(sp)

        mult dX, dX, dTmp
        srl ua, 52, ea
        ldt dZ, 0(sp)
        beq ea, positive_denormal

        lda tmp, -2047(ea)
        addq ea, 1, ta
        addt dZ, dZ, dB
        mult dZ, dTmp, dZ

        beq tmp, pos_nan_or_inf
        bic ta, 1, ta
        lda ta, -1024(ta)
        subt d1_5, dZ, dT

        mult dZ, dT, dZ
        ldt dTmp, 8(pc)
        mult dX, dT, dX
        sll ua, 52, wp

        mult dZ, dT, dZ
        UNOP
        subt d1_5, dZ, dT
        UNOP

        sll ta, 51, ta
        UNOP
        mult dX, dT, dX
        blbs ea, skip

        addt dB, dB, dB
        sll ua, 53, wp
        mult dX, dTmp, dX
        UNOP
skip:
        mult dB, dX, dT
        mult dX, d0_5, dX
        mult dT, dT, dZ
        subt dB, dZ, dZ

        mult dX, dZ, dX
        addt dX, dT, dX
        stt dX, 0(sp)
        FNOP

        ldq ux, 0(sp)
        subq ux, exp, v
        mulq v, v, v2
        addq ux, ta, ux

        stq ux, 8(sp)
        addq ux, 1, ux
        subq wp, v, wm
        stq ux, 0(sp)

        subq ux, 2, ux
        stq ux, 16(sp)
        addq wp, v, wp
        UNOP

        subq v2, wm, wm
        subq v2, wp, wp
        sra wm, 63, wm
        cmple zero, wp, wp

        addq wm, wp, wm
        s8addq wm, sp, wm
        ldt dX, 8(wm)
        addq sp, 24, sp

        ret zero, (ra), 1

        .align 4
pm_zero_or_negative:
positive_denormal:
pos_nan_or_inf:
        cpys dZero, dZero, dX /* Junk for now. */
        addq sp, 24, sp
        ret zero, (ra), 1

        .end sqrt



This archive was generated by hypermail 2.0b3 on Sun Feb 21 1999 - 09:00:26 PST