patch-2.4.8 linux/arch/ia64/lib/copy_page.S
Next file: linux/arch/ia64/lib/copy_user.S
Previous file: linux/arch/ia64/lib/clear_user.S
Back to the patch index
Back to the overall index
-  Lines: 138
-  Date:
Tue Jul 31 10:30:08 2001
-  Orig file: 
v2.4.7/linux/arch/ia64/lib/copy_page.S
-  Orig date: 
Thu Apr  5 12:51:47 2001
diff -u --recursive --new-file v2.4.7/linux/arch/ia64/lib/copy_page.S linux/arch/ia64/lib/copy_page.S
@@ -2,8 +2,6 @@
  *
  * Optimized version of the standard copy_page() function
  *
- * Based on comments from ddd. Try not to overflow write buffer.
- *
  * Inputs:
  *	in0:	address of target page
  *	in1:	address of source page
@@ -12,11 +10,14 @@
  *
  * Copyright (C) 1999, 2001 Hewlett-Packard Co
  * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 2001 David Mosberger <davidm@hpl.hp.com>
+ *
+ * 4/06/01 davidm	Tuned to make it perform well both for cached and uncached copies.
  */
 #include <asm/asmmacro.h>
 #include <asm/page.h>
 
-#define PIPE_DEPTH	6
+#define PIPE_DEPTH	3
 #define EPI		p[PIPE_DEPTH-1]
 
 #define lcount		r16
@@ -27,62 +28,67 @@
 #define src2		r21
 #define tgt1		r22
 #define tgt2		r23
+#define srcf		r24	/* source-side prefetch pointer (advanced by lfetch) */
+#define tgtf		r25	/* target-side prefetch pointer (advanced by lfetch) */
+
+#define Nrot		((8*PIPE_DEPTH+7)&~7)	/* 8 rotating data streams x PIPE_DEPTH, rounded up to a multiple of 8 */
 
 GLOBAL_ENTRY(copy_page)
 	.prologue
 	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
+	alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot	// frame = 3 in + (Nrot-3) local + 0 out = Nrot regs, all rotating
 
-	.rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH]
+	.rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
+	      t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]	// one rotating stream per ld8/st8 pair in the loop
 	.rotp p[PIPE_DEPTH]
 
 	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc	// save ar.lc ahead of time
+	mov saved_lc=ar.lc	// preserve caller's ar.lc (restored before return)
+	mov ar.ec=PIPE_DEPTH	// epilogue count = number of pipeline stages
+
+	mov lcount=PAGE_SIZE/64-1	// one iteration per 64 bytes; -1 because br.ctop is repeat/until
 	.save pr, saved_pr
-	mov saved_pr=pr		// rotating predicates are preserved
-				// resgisters we must save.
+	mov saved_pr=pr		// preserve caller's predicates (restored before return)
+	mov pr.rot=1<<16	// only p16 (= p[0]) set among the rotating predicates
+
 	.body
 
-	mov src1=in1		// initialize 1st stream source
-	adds src2=8,in1		// initialize 2nd stream source
-	mov lcount=PAGE_SIZE/16-1 // as many 16bytes as there are on a page
-				  // -1 is because br.ctop is repeat/until
-
-	adds tgt2=8,in0		// initialize 2nd stream target
-	mov tgt1=in0		// initialize 1st stream target
-	;;
-	mov pr.rot=1<<16	// pr16=1 & pr[17-63]=0 , 63 not modified
-
-	mov ar.lc=lcount	// set loop counter
-	mov ar.ec=PIPE_DEPTH	// ar.ec must match pipeline depth
-	;;
-
-	// We need to preload the n-1 stages of the pipeline (n=depth).
-	// We do this during the "prolog" of the loop: we execute
-	// n-1 times the "load" bundle. Then both loads & stores are
-	// enabled until we reach the end of the last word of the page
-	// on the load side. Then, we enter the epilog (controlled by ec)
-	// where we just do the stores and no loads n times : drain the pipe
-	// (we exit the loop when ec=1).
-	//
-	// The initialization of the prolog is done via the predicate registers:
-	// the choice of EPI DEPENDS on the depth of the pipeline (n).
-	// When lc > 0 pr63=1 and it is fed back into pr16 and pr16-pr62
-	// are then shifted right at every iteration,
-	// Thus by initializing pr16=1 and the rest to 0 before the loop
-	// we get EPI=1 after n iterations.
-	//
-1:				// engage loop now, let the magic happen...
-(p16)	ld8 t1[0]=[src1],16	// new data on top of pipeline in 1st stream
-(p16)	ld8 t2[0]=[src2],16	// new data on top of pipeline in 2nd stream
-	nop.i 0x0
-(EPI)	st8 [tgt1]=t1[PIPE_DEPTH-1],16	// store top of  1st pipeline
-(EPI)	st8 [tgt2]=t2[PIPE_DEPTH-1],16	// store top of 2nd pipeline
-	br.ctop.dptk.few 1b	// once lc==0, ec-- & p16=0
-				// stores but no loads anymore
+	mov src1=in1		// 1st load pointer: even 8-byte words of the source page
+	adds src2=8,in1		// 2nd load pointer: odd 8-byte words of the source page
+	;;
+	adds tgt2=8,in0		// 2nd store pointer: odd 8-byte words of the target page
+	add srcf=512,in1	// source prefetch pointer starts 512 bytes into the source page
+	mov ar.lc=lcount	// set loop counter
+	mov tgt1=in0		// 1st store pointer: even 8-byte words of the target page
+	add tgtf=512,in0	// target prefetch pointer starts 512 bytes into the target page; in0/in1 are not read again after this group
+	;;
+1:	// software-pipelined loop: each iteration loads and stores 8 words (64 bytes)
+(p[0])	ld8 t1[0]=[src1],16	// stage 0: load a word, advance this pointer by 16
+(EPI)	st8 [tgt1]=t1[PIPE_DEPTH-1],16	// last stage: store the word loaded PIPE_DEPTH-1 iterations earlier
+(p[0])	ld8 t2[0]=[src2],16	// same pattern on the odd-word pointers
+(EPI)	st8 [tgt2]=t2[PIPE_DEPTH-1],16
+	;;
+(p[0])	ld8 t3[0]=[src1],16	// bytes 16..31 of this 64-byte chunk
+(EPI)	st8 [tgt1]=t3[PIPE_DEPTH-1],16
+(p[0])	ld8 t4[0]=[src2],16
+(EPI)	st8 [tgt2]=t4[PIPE_DEPTH-1],16
+	;;
+(p[0])	ld8 t5[0]=[src1],16	// bytes 32..47 of this 64-byte chunk
+(EPI)	st8 [tgt1]=t5[PIPE_DEPTH-1],16
+(p[0])	ld8 t6[0]=[src2],16
+(EPI)	st8 [tgt2]=t6[PIPE_DEPTH-1],16
+	;;
+(p[0])	ld8 t7[0]=[src1],16	// bytes 48..63 of this 64-byte chunk
+(EPI)	st8 [tgt1]=t7[PIPE_DEPTH-1],16
+(p[0])	ld8 t8[0]=[src2],16
+(EPI)	st8 [tgt2]=t8[PIPE_DEPTH-1],16
+
+	lfetch [srcf], 64	// prefetch source 512 bytes ahead of the load pointers, then step 64; NOTE(review): unpredicated, so it also runs in the epilogue and past the page end
+	lfetch [tgtf], 64	// prefetch ahead on the target side, then step 64 (same unpredicated behavior)
+	br.ctop.sptk.few 1b	// rotate registers/predicates; loop while ar.lc counts down, then ar.ec drains the remaining stores
 	;;
 	mov pr=saved_pr,0xffffffffffff0000	// restore predicates
-	mov ar.pfs=saved_pfs	// restore ar.ec
-	mov ar.lc=saved_lc	// restore saved lc
-	br.ret.sptk.few rp	// bye...
+	mov ar.pfs=saved_pfs	// restore caller's ar.pfs
+	mov ar.lc=saved_lc	// restore caller's ar.lc
+	br.ret.sptk.few rp	// return to caller
 END(copy_page)
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)