Here is the code of the (really simple) e300 memcpy.
The code is in a style very close to the routine used by the Linux kernel. Please ignore the numeric labels (these are needed for Linux).
My cooking recipe for copy performance is quite simple.
We align the DST so that the CPU can write out a full cache line per loop iteration. The DCBZ instruction is used to clear the DST cache line (this prevents the CPU from reading the DST before overwriting it).
The PPC has a copy-back cache, so even if we copy just 1 byte from $100 to $200, the CPU will always write out a whole cache line (32 bytes).
If we write the first byte to $200, the CPU will read the 32 bytes at $200 into its cache, overwrite the one byte, and later write the cache line back out to memory.
This behavior is needed if we only write single bytes, but since we are going to write the whole cache line anyway, reading the DST in is unnecessary. The DCBZ instruction is the PPC way of telling the CPU that it does not need to read the DST in because we will overwrite all of it. Per 32-byte line this turns three memory transfers (SRC read, DST read, DST write-back) into two, which is where the 33% saving in memory accesses comes from.
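The idea in C terms (a minimal sketch using GCC inline asm; copy_lines is a made-up name, and dst, src and the length are assumed to be 32-byte aligned):
Code:
#include <stddef.h>
#include <stdint.h>

static void copy_lines(uint32_t *dst, const uint32_t *src, size_t lines)
{
    while (lines--) {
        /* allocate + zero the DST line in the cache: no DST read from RAM */
        __asm__ volatile ("dcbz 0,%0" : : "r"(dst) : "memory");
        for (int i = 0; i < 8; i++)   /* 8 x 4 bytes = one 32-byte line */
            dst[i] = src[i];          /* overwrite every zeroed byte */
        dst += 8;
        src += 8;
    }
}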
To avoid read bubbles we prefetch the SRC using the DCBT instruction.
A little voodoo comes in picking the best prefetch distances for SRC and DST.
The e300 does not seem to like prefetching far in advance; even prefetching just 2 lines ahead will hurt performance. So we align our prefetch pointer to the next SRC cache line, as sketched below.
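In C terms the DCBT target can be sketched like this (dcbt_target is a made-up name; the -4 mirrors the routine keeping its SRC pointer pre-decremented for the update-form loads):
Code:
#include <stdint.h>

/* lands at byte 28 of SRC line 0 when SRC is 32-byte aligned,
   and at byte 28 of SRC line 1 otherwise - i.e. 0-1 lines ahead */
static const char *dcbt_target(const char *src)
{
    uint32_t to_next = (uint32_t)(-(uintptr_t)src) & 31; /* bytes to next line boundary */
    return src - 4 + to_next + 32;
}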
The main loop copies with a stride of 4 registers.
Using fewer or more than 4 registers seems to hurt performance.
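In C the stride looks roughly like this (a sketch, copy_line_4reg is a made-up name; the real loop has to stay in asm so the schedule stays under our control):
Code:
#include <stdint.h>

/* one cache line moved as two groups of 4 words - mirrors r7-r10 */
static void copy_line_4reg(uint32_t *restrict d, const uint32_t *restrict s)
{
    uint32_t a = s[0], b = s[1], c = s[2], e = s[3]; /* load 4 words */
    d[0] = a; d[1] = b; d[2] = c; d[3] = e;          /* store 4 words */
    a = s[4]; b = s[5]; c = s[6]; e = s[7];
    d[4] = a; d[5] = b; d[6] = c; d[7] = e;
}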
I'm sure this routine can be improved further.
Code:
#define L1_CACHE_SHIFT 5
#define MAX_COPY_PREFETCH 4
#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)
/*
* Memcpy optimized for PPC e300
*
* This relatively simple memcpy does the following to optimize performance:
*
* For sizes > 32 bytes:
* DST is aligned to a 32bit boundary - using 8bit copies
* DST is aligned to a cache line boundary (32 bytes) - using aligned 32bit copies
* The main copy loop processes one cache line (32 bytes) per iteration
* The DST cache line is cleared using DCBZ
* Clearing the aligned DST cache line is very important for performance:
* it prevents the CPU from fetching the DST line from memory - this saves 33% of the memory accesses.
* To optimize SRC read performance the SRC is prefetched using DCBT
*
* The trick for getting good performance is to use a well-matched prefetch distance
* for the SRC reads and for the DST clearing.
* Typically you DCBZ the DST 0 or 1 cache lines ahead
* Typically you DCBT the SRC 2 - 4 cache lines ahead
* On the e300, prefetching the SRC too far ahead is slower than not prefetching at all.
*
* We use DCBZ DST[0] and DCBT SRC[0-1] depending on the SRC alignment
*
*/
.align 7
/* parameters r3=DST, r4=SRC, r5=size */
/* returns r3=0 */
.global memcpy_e300
memcpy_e300:
dcbt 0,r4 /* Prefetch SRC cache line 32byte */
neg r0,r3 /* DST alignment */
addi r4,r4,-4
andi. r0,r0,CACHELINE_MASK /* # of bytes away from cache line boundary */
addi r6,r3,-4
cmplw cr1,r5,r0 /* is this more than total to do? */
beq .Lcachelinealigned
blt cr1,.Lcopyrest /* if not much to do */
andi. r8,r0,3 /* get it word-aligned first */
mtctr r8
beq+ .Ldstwordaligned
.Laligntoword:
70: lbz r9,4(r4) /* we copy bytes (8bit) 0-3 */
71: stb r9,4(r6) /* to get the DST 32bit aligned */
addi r4,r4,1
addi r6,r6,1
bdnz .Laligntoword
.Ldstwordaligned:
subf r5,r0,r5
srwi. r0,r0,2
mtctr r0
beq .Lcachelinealigned
.Laligntocacheline:
72: lwzu r9,4(r4) /* do copy 32bit words (0-7) */
73: stwu r9,4(r6) /* to get DST cache line aligned (32byte) */
bdnz .Laligntocacheline
.Lcachelinealigned:
srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
clrlwi r5,r5,32-LG_CACHELINE_BYTES /* remaining bytes (0-31) */
li r11,4 /* DCBZ offset: r6+4 = start of the current DST line (DST[0]) */
beq .Lcopyrest
addi r3,r4,4 /* find out which SRC cache line to prefetch: r3 = real SRC (r4 is pre-biased by -4) */
neg r3,r3
andi. r3,r3,31 /* bytes from SRC to the next cache line boundary */
addi r3,r3,32 /* dcbt r3,r4 then touches SRC line 0 or 1, depending on alignment */
mtctr r0
.align 7
.Lloop: /* the main body of the cacheline loop */
dcbt r3,r4 /* SRC cache line prefetch */
dcbz r11,r6 /* clear DST cache line */
lwz r7, 0x04(r4) /* copy using a 4 register stride for best performance on e300 */
lwz r8, 0x08(r4)
lwz r9, 0x0c(r4)
lwz r10,0x10(r4)
stw r7, 0x04(r6)
stw r8, 0x08(r6)
stw r9, 0x0c(r6)
stw r10,0x10(r6)
lwz r7, 0x14(r4)
lwz r8, 0x18(r4)
lwz r9, 0x1c(r4)
lwzu r10,0x20(r4)
stw r7, 0x14(r6)
stw r8, 0x18(r6)
stw r9, 0x1c(r6)
stwu r10,0x20(r6)
bdnz .Lloop
.Lcopyrest:
srwi. r0,r5,2
mtctr r0
beq .Llastbytes
.Lcopywords:
30: lwzu r0,4(r4) /* we copy remaining words (0-7) */
31: stwu r0,4(r6)
bdnz .Lcopywords
.Llastbytes:
andi. r0,r5,3
mtctr r0
beq+ .Lend
.Lcopybytes:
40: lbz r0,4(r4) /* we copy remaining bytes (0-3) */
41: stb r0,4(r6)
addi r4,r4,1
addi r6,r6,1
bdnz .Lcopybytes
.Lend: li r3,0 /* done: return 0 (Linux); glibc would need the original DST instead */
blr
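For testing you can call it from C like this (a minimal sketch; the prototype is an assumption, and note that as written the routine returns 0, Linux-style, not DST):
Code:
#include <stddef.h>
#include <stdio.h>
#include <string.h>

void *memcpy_e300(void *dst, const void *src, size_t n); /* assumed prototype */

int main(void)
{
    static char src[4096], dst[4096];
    for (size_t i = 0; i < sizeof(src); i++)
        src[i] = (char)i;
    memcpy_e300(dst, src, sizeof src);
    puts(memcmp(dst, src, sizeof dst) == 0 ? "ok" : "MISMATCH");
    return 0;
}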
I'm looking forward to your replies / ideas