patch-2.4.0-test7 linux/arch/ia64/lib/memcpy.S

Next file: linux/arch/ia64/mm/init.c
Previous file: linux/arch/ia64/kernel/unwind.c
Back to the patch index
Back to the overall index
Lines: 312
Date: Fri Aug 11 19:09:06 2000
Orig file: v2.4.0-test6/linux/arch/ia64/lib/memcpy.S
Orig date: Thu Jul 27 17:37:59 2000

diff -u --recursive --new-file v2.4.0-test6/linux/arch/ia64/lib/memcpy.S linux/arch/ia64/lib/memcpy.S
@@ -1,3 +1,20 @@
+/*
+ *
+ * Optimized version of the standard memcpy() function
+ *
+ * Inputs:
+ * 	in0:	destination address
+ *	in1:	source address
+ *	in2:	number of bytes to copy
+ * Output:
+ * 	no return value
+ *
+ * Copyright (C) 2000 Hewlett-Packard Co
+ * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <linux/config.h>
+
 #include <asm/asmmacro.h>
 
 GLOBAL_ENTRY(bcopy)
@@ -10,77 +27,254 @@
 	// FALL THROUGH
 GLOBAL_ENTRY(memcpy)
 
-#	define MEM_LAT	4
-
-#	define N	MEM_LAT-1
-#	define Nrot	((MEM_LAT + 7) & ~7)
+#	define MEM_LAT	2		/* latency to L1 cache */
 
 #	define dst	r2
 #	define src	r3
-#	define len	r9
-#	define saved_pfs r10
-#	define saved_lc	r11
-#	define saved_pr	r16
-#	define t0	r17
-#	define cnt	r18
-
+#	define retval	r8
+#	define saved_pfs r9
+#	define saved_lc	r10
+#	define saved_pr	r11
+#	define cnt	r16
+#	define src2	r17
+#	define t0	r18
+#	define t1	r19
+#	define t2	r20
+#	define t3	r21
+#	define t4	r22
+#	define src_end	r23
+
+#	define N	(MEM_LAT + 4)
+#	define Nrot	((N + 7) & ~7)
+
+	/*
+	 * First, check if everything (src, dst, len) is a multiple of eight.  If
+	 * so, we handle everything with no taken branches (other than the loop
+	 * itself) and a small icache footprint.  Otherwise, we jump off to
+	 * the more general copy routine handling arbitrary
+	 * sizes/alignment etc.
+	 */
 	UNW(.prologue)
 	UNW(.save ar.pfs, saved_pfs)
 	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
+#if !(defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC))
 	lfetch [in1]
+#else
+	nop.m 0
+#endif
+	or t0=in0,in1
+	;;
 
-	.rotr val[MEM_LAT]
-	.rotp p[MEM_LAT]
-
+	or t0=t0,in2
 	UNW(.save ar.lc, saved_lc)
 	mov saved_lc=ar.lc
-
-	or t0=in0,in1
 	UNW(.save pr, saved_pr)
 	mov saved_pr=pr
 
-	UNW(.body)
-
-	mov ar.ec=MEM_LAT
+	cmp.eq p6,p0=in2,r0	// zero length?
+	mov retval=in0		// return dst
+(p6)	br.ret.spnt.many rp	// zero length, return immediately
+	;;
 
-	mov r8=in0		// return dst
-	shr cnt=in2,3		// number of 8-byte words to copy
+	mov dst=in0		// copy because of rotation
+	shr.u cnt=in2,3		// number of 8-byte words to copy
 	mov pr.rot=1<<16
 	;;
-	cmp.eq p6,p0=in2,r0	// zero length?
-	or t0=t0,in2
-(p6)	br.ret.spnt.many rp	// yes, return immediately
 
-	mov dst=in0		// copy because of rotation
-	mov src=in1		// copy because of rotation
 	adds cnt=-1,cnt		// br.ctop is repeat/until
+	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
+	UNW(.body)
+	mov ar.ec=N
 	;;
+
 	and t0=0x7,t0
 	mov ar.lc=cnt
 	;;
 	cmp.ne p6,p0=t0,r0
-(p6)	br.cond.spnt.few slow_memcpy
 
+	mov src=in1		// copy because of rotation
+(p7)	br.cond.spnt.few memcpy_short
+(p6)	br.cond.spnt.few memcpy_long
+	;;
+	.rotr val[N]
+	.rotp p[N]
 1:
 (p[0])	ld8 val[0]=[src],8
-(p[N])	st8 [dst]=val[N],8
-	br.ctop.sptk.few 1b
+(p[N-1])st8 [dst]=val[N-1],8
+	br.ctop.dptk.few 1b
 	;;
-.exit:
 	mov ar.lc=saved_lc
-	mov pr=saved_pr,0xffffffffffff0000
+	mov pr=saved_pr,-1
 	mov ar.pfs=saved_pfs
 	br.ret.sptk.many rp
 
-slow_memcpy:
-	adds cnt=-1,in2
+	/*
+	 * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time
+	 * copy loop.  This performs relatively poorly on Itanium, but it doesn't
+	 * get used very often (gcc inlines small copies) and due to atomicity
+	 * issues, we want to avoid read-modify-write of entire words.
+	 */
+	.align 32
+memcpy_short:
+	adds cnt=-1,in2		// br.ctop is repeat/until
+	mov ar.ec=MEM_LAT
 	;;
 	mov ar.lc=cnt
 	;;
+	/*
+	 * It is faster to put a stop bit in the loop here because it makes
+	 * the pipeline shorter (and latency is what matters on short copies).
+	 */
 1:
 (p[0])	ld1 val[0]=[src],1
-(p[N])	st1 [dst]=val[N],1
-	br.ctop.sptk.few 1b
-	br.sptk.few .exit
+	;;
+(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
+	br.ctop.dptk.few 1b
+	;;
+	mov ar.lc=saved_lc
+	mov pr=saved_pr,-1
+	mov ar.pfs=saved_pfs
+	br.ret.sptk.many rp
+
+	/*
+	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
+	 * an overriding concern here, but throughput is.  We first do
+	 * sub-word copying until the destination is aligned, then we check
+	 * if the source is also aligned.  If so, we do a simple load/store-loop
+	 * until there are less than 8 bytes left over and then we do the tail,
+	 * by storing the last few bytes using sub-word copying.  If the source
+	 * is not aligned, we branch off to the non-congruent loop.
+	 *
+	 *   stage:   op:
+	 *         0  ld
+	 *	   :
+	 * MEM_LAT+3  shrp
+	 * MEM_LAT+4  st
+	 *
+	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
+	 * seems to introduce an unavoidable bubble in the pipeline so the overall
+	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
+	 * of 4 byte/cycle.  Still not bad.
+	 */
+#	undef N
+#	undef Nrot
+#	define N	(MEM_LAT + 5)		/* number of stages */
+#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */
+
+#define LOG_LOOP_SIZE	6
+
+memcpy_long:
+	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
+	and t0=-8,src		// t0 = src & ~7
+	and t2=7,src		// t2 = src & 7
+	;;
+	ld8 t0=[t0]		// t0 = 1st source word
+	adds src2=7,src		// src2 = (src + 7)
+	sub t4=r0,dst		// t4 = -dst
+	;;
+	and src2=-8,src2	// src2 = (src + 7) & ~7
+	shl t2=t2,3		// t2 = 8*(src & 7)
+	shl t4=t4,3		// t4 = 8*(dst & 7)
+	;;
+	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
+	sub t3=64,t2		// t3 = 64-8*(src & 7)
+	shr.u t0=t0,t2
+	;;
+	add src_end=src,in2
+	shl t1=t1,t3
+	mov pr=t4,0x38		// (p5,p4,p3)=(dst & 7)
+	;;
+	or t0=t0,t1
+	mov cnt=r0
+	adds src_end=-1,src_end
+	;;
+(p3)	st1 [dst]=t0,1
+(p3)	shr.u t0=t0,8
+(p3)	adds cnt=1,cnt
+	;;
+(p4)	st2 [dst]=t0,2
+(p4)	shr.u t0=t0,16
+(p4)	adds cnt=2,cnt
+	;;
+(p5)	st4 [dst]=t0,4
+(p5)	adds cnt=4,cnt
+	and src_end=-8,src_end	// src_end = last word of source buffer
+	;;
+
+	// At this point, dst is aligned to 8 bytes and there at least 16-7=9 bytes left to copy:
+
+1:{	add src=cnt,src			// make src point to remainder of source buffer
+	sub cnt=in2,cnt			// cnt = number of bytes left to copy
+	mov t4=ip
+  }	;;
+	and src2=-8,src			// align source pointer
+	adds t4=memcpy_loops-1b,t4
+	mov ar.ec=N
+
+	and t0=7,src			// t0 = src & 7
+	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
+	shl cnt=cnt,3			// move bits 0-2 to 3-5
+	;;
+
+	.rotr val[N+1], w[2]
+	.rotp p[N]
+
+	cmp.ne p6,p0=t0,r0		// is src aligned, too?
+	shl t0=t0,LOG_LOOP_SIZE		// t0 = 8*(src & 7)
+	adds t2=-1,t2			// br.ctop is repeat/until
+	;;
+	add t4=t0,t4
+	mov pr=cnt,0x38			// set (p5,p4,p3) to # of bytes last-word bytes to copy
+	mov ar.lc=t2
+	;;
+(p6)	ld8 val[1]=[src2],8		// prime the pump...
+	mov b6=t4
+	br.sptk.few b6
+	;;
+
+memcpy_tail:
+	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
+	// less than 8) and t0 contains the last few bytes of the src buffer:
+(p5)	st4 [dst]=t0,4
+(p5)	shr.u t0=t0,32
+	mov ar.lc=saved_lc
+	;;
+(p4)	st2 [dst]=t0,2
+(p4)	shr.u t0=t0,16
+	mov ar.pfs=saved_pfs
+	;;
+(p3)	st1 [dst]=t0
+	mov pr=saved_pr,-1
+	br.ret.sptk.many rp
+
+///////////////////////////////////////////////////////
+	.align 64
+
+#define COPY(shift,index)									\
+ 1:												\
+  { .mfi											\
+	(p[0])		ld8 val[0]=[src2],8;							\
+			nop.f 0;								\
+	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
+  };												\
+  { .mbb											\
+	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
+			nop.b 0;								\
+			br.ctop.dptk.few 1b;							\
+  };												\
+			;;									\
+			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
+			;;									\
+			shrp t0=val[N-1],val[N-index],shift;					\
+			br memcpy_tail
+memcpy_loops:
+	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
+	COPY(8, 0)
+	COPY(16, 0)
+	COPY(24, 0)
+	COPY(32, 0)
+	COPY(40, 0)
+	COPY(48, 0)
+	COPY(56, 0)
 
 END(memcpy)
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)