patch-2.4.0-test5 linux/arch/ia64/lib/idiv.S
Next file: linux/arch/ia64/lib/memcpy.S
Previous file: linux/arch/ia64/lib/copy_user.S
Back to the patch index
Back to the overall index
- Lines: 229
- Date:
Fri Jul 14 16:08:12 2000
- Orig file:
v2.4.0-test4/linux/arch/ia64/lib/idiv.S
- Orig date:
Fri Jun 23 21:55:07 2000
diff -u --recursive --new-file v2.4.0-test4/linux/arch/ia64/lib/idiv.S linux/arch/ia64/lib/idiv.S
@@ -1,162 +1,98 @@
/*
* Integer division routine.
*
- * Copyright (C) 1999 Hewlett-Packard Co
- * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 1999-2000 Hewlett-Packard Co
+ * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
*/
-/* Simple integer division. It uses the straight forward division
- algorithm. This may not be the absolutely fastest way to do it,
- but it's not horrible either. According to ski, the worst case
- scenario of dividing 0xffffffffffffffff by 1 takes 133 cycles.
-
- An alternative would be to use an algorithm similar to the
- floating point division algorithm (Newton-Raphson iteration),
- but that approach is rather tricky (one has to be very careful
- to get the last bit right...).
-
- While this algorithm is straight-forward, it does use a couple
- of neat ia-64 specific tricks:
-
- - it uses the floating point unit to determine the initial
- shift amount (shift = floor(ld(x)) - floor(ld(y)))
-
- - it uses predication to avoid a branch in the case where
- x < y (this is what p8 is used for)
-
- - it uses rotating registers and the br.ctop branch to
- implement a software-pipelined loop that's unrolled
- twice (without any code expansion!)
-
- - the code is relatively well scheduled to avoid unnecessary
- nops while maximizing parallelism
-*/
#include <asm/asmmacro.h>
-#include <asm/break.h>
- .text
- .psr abi64
-#ifdef __BIG_ENDIAN__
- .psr msb
- .msb
-#else
- .psr lsb
- .lsb
-#endif
+/*
+ * Compute a 64-bit unsigned integer quotient.
+ *
+ * Use reciprocal approximation and Newton-Raphson iteration to compute the
+ * quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
+ * to get more than the 64 bits of precision that we need for DImode.
+ *
+ * Must use max precision for the reciprocal computations to get 64 bits of
+ * precision.
+ *
+ * r32 holds the dividend. r33 holds the divisor.
+ */
#ifdef MODULO
# define OP mod
-# define Q r9
-# define R r8
-#else
-# define OP div
-# define Q r8
-# define R r9
-#endif
-
-#ifdef SINGLE
-# define PREC si
#else
-# define PREC di
+# define OP div
#endif
#ifdef UNSIGNED
-# define SGN u
-# define INT_TO_FP(a,b) fma.s0 a=b,f1,f0
-# define FP_TO_INT(a,b) fcvt.fxu.trunc.s0 a=b
+# define SGN u
+# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
+# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
#else
# define SGN
# define INT_TO_FP(a,b) fcvt.xf a=b
-# define FP_TO_INT(a,b) fcvt.fx.trunc.s0 a=b
+# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
#endif
#define PASTE1(a,b) a##b
#define PASTE(a,b) PASTE1(a,b)
-#define NAME PASTE(PASTE(__,SGN),PASTE(OP,PASTE(PREC,3)))
+#define NAME PASTE(PASTE(__,SGN),PASTE(OP,di3))
GLOBAL_ENTRY(NAME)
UNW(.prologue)
- alloc r2=ar.pfs,2,6,0,8
- UNW(.save pr, r18)
- mov r18=pr
-#ifdef SINGLE
-# ifdef UNSIGNED
- zxt4 in0=in0
- zxt4 in1=in1
-# else
- sxt4 in0=in0
- sxt4 in1=in1
-# endif
- ;;
-#endif
+ .regstk 2,0,0,0
+ // Transfer inputs to FP registers.
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+ UNW(.fframe 16)
+ UNW(.save.f 0x20)
+ stf.spill [sp] = f17,-16
-#ifndef UNSIGNED
- cmp.lt p6,p0=in0,r0 // x negative?
- cmp.lt p7,p0=in1,r0 // y negative?
- ;;
-(p6) sub in0=r0,in0 // make x positive
-(p7) sub in1=r0,in1 // ditto for y
+ // Convert the inputs to FP, to avoid FP software-assist faults.
+ INT_TO_FP(f8, f8)
;;
-#endif
-
- setf.sig f8=in0
- UNW(.save ar.lc, r3)
+ UNW(.save.f 0x10)
+ stf.spill [sp] = f16
UNW(.body)
-
- mov r3=ar.lc // save ar.lc
- setf.sig f9=in1
+ INT_TO_FP(f9, f9)
+ ;;
+ frcpa.s1 f17, p6 = f8, f9 // y = frcpa(b)
+ ;;
+ /*
+ * This is the magic algorithm described in Section 8.6.2 of "IA-64
+ * and Elementary Functions" by Peter Markstein; HP Professional Books
+ * (http://www.hp.com/go/retailbooks/)
+ */
+(p6) fmpy.s1 f7 = f8, f17 // q = a*y
+(p6) fnma.s1 f6 = f9, f17, f1 // e = -b*y + 1
+ ;;
+(p6) fma.s1 f16 = f7, f6, f7 // q1 = q*e + q
+(p6) fmpy.s1 f7 = f6, f6 // e1 = e*e
+ ;;
+(p6) fma.s1 f16 = f16, f7, f16 // q2 = q1*e1 + q1
+(p6) fma.s1 f6 = f17, f6, f17 // y1 = y*e + y
+ ;;
+(p6) fma.s1 f6 = f6, f7, f6 // y2 = y1*e1 + y1
+(p6) fnma.s1 f7 = f9, f16, f8 // r = -b*q2 + a
;;
- mov Q=0 // initialize q
- mov R=in0 // stash away x in a static register
- mov r16=1 // r16 = 1
- INT_TO_FP(f8,f8)
- cmp.eq p8,p0=0,in0 // x==0?
- cmp.eq p9,p0=0,in1 // y==0?
- ;;
- INT_TO_FP(f9,f9)
-(p8) br.dpnt.few .L3
-(p9) break __IA64_BREAK_KDB // attempted division by zero (should never happen)
- mov ar.ec=r0 // epilogue count = 0
- ;;
- getf.exp r14=f8 // r14 = exponent of x
- getf.exp r15=f9 // r15 = exponent of y
- mov ar.lc=r0 // loop count = 0
- ;;
- sub r17=r14,r15 // r17 = (exp of x - exp y) = shift amount
- cmp.ge p8,p0=r14,r15
- ;;
-
- .rotr y[2], mask[2] // in0 and in1 may no longer be valid after
- // the first write to a rotating register!
-
-(p8) shl y[1]=in1,r17 // y[1] = y<<shift
-(p8) shl mask[1]=r16,r17 // mask[1] = 1<<shift
-
-(p8) mov ar.lc=r17 // loop count = r17
- ;;
-.L1:
-(p8) cmp.geu.unc p9,p0=R,y[1]// p9 = (x >= y[1])
-(p8) shr.u mask[0]=mask[1],1 // prepare mask[0] and y[0] for next
-(p8) shr.u y[0]=y[1],1 // iteration
- ;;
-(p9) sub R=R,y[1] // if (x >= y[1]), subtract y[1] from x
-(p9) add Q=Q,mask[1] // and set corresponding bit in q (Q)
- br.ctop.dptk.few .L1 // repeated unless ar.lc-- == 0
- ;;
-.L2:
-#ifndef UNSIGNED
-# ifdef MODULO
-(p6) sub R=r0,R // set sign of remainder according to x
-# else
-(p6) sub Q=r0,Q // set sign of quotient
+(p6) fma.s1 f17 = f7, f6, f16 // q3 = r*y2 + q2
+ ;;
+#ifdef MODULO
+ FP_TO_INT(f17, f17) // round quotient to an unsigned integer
+ ;;
+ INT_TO_FP(f17, f17) // renormalize
+ ;;
+ fnma.s1 f17 = f17, f9, f8 // compute remainder
;;
-(p7) sub Q=r0,Q
-# endif
#endif
-.L3:
- mov ar.pfs=r2 // restore ar.pfs
- mov ar.lc=r3 // restore ar.lc
- mov pr=r18,0xffffffffffff0000 // restore p16-p63
- br.ret.sptk.few rp
+ UNW(.restore sp)
+ ldf.fill f16 = [sp], 16
+ FP_TO_INT(f8, f17) // round result to an (unsigned) integer
+ ;;
+ ldf.fill f17 = [sp]
+ getf.sig r8 = f8 // transfer result to result register
+ br.ret.sptk rp
END(NAME)
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)