2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
16 * Copyright (C) IBM Corporation, 2012
18 * Author: Anton Blanchard <anton@au.ibm.com>
20 #include <asm/ppc_asm.h>
/*
 * memcpy_power7 - POWER7-optimised memcpy (GPR copy path shown here).
 * NOTE(review): this listing is an elided fragment (embedded line
 * numbers jump); per the usual kernel memcpy contract r3 = dest,
 * r4 = src, r5 = len - confirm against the full source.
 */
22 _GLOBAL(memcpy_power7)
40 /* Get the source 8B aligned */
/*
 * Large-copy path: allocate a stack frame and save the non-volatile
 * GPRs r14-r22 that the unrolled copy loop uses as scratch.
 */
68 stdu r1,-STACKFRAMESIZE(r1)
69 std r14,STK_REG(R14)(r1)
70 std r15,STK_REG(R15)(r1)
71 std r16,STK_REG(R16)(r1)
72 std r17,STK_REG(R17)(r1)
73 std r18,STK_REG(R18)(r1)
74 std r19,STK_REG(R19)(r1)
75 std r20,STK_REG(R20)(r1)
76 std r21,STK_REG(R21)(r1)
77 std r22,STK_REG(R22)(r1)
/* NOTE(review): r0 presumably carries the saved LR (mflr elided above) - confirm */
78 std r0,STACKFRAMESIZE+16(r1)
83 /* Now do cacheline (128B) sized loads and stores. */
/* Copy loop done: restore the saved non-volatile GPRs and pop the frame. */
124 ld r14,STK_REG(R14)(r1)
125 ld r15,STK_REG(R15)(r1)
126 ld r16,STK_REG(R16)(r1)
127 ld r17,STK_REG(R17)(r1)
128 ld r18,STK_REG(R18)(r1)
129 ld r19,STK_REG(R19)(r1)
130 ld r20,STK_REG(R20)(r1)
131 ld r21,STK_REG(R21)(r1)
132 ld r22,STK_REG(R22)(r1)
133 addi r1,r1,STACKFRAMESIZE
135 /* Up to 127B to go */
159 /* Up to 63B to go */
172 /* Up to 31B to go */
/* Keep only len mod 16: clear all but the low 4 bits of r5. */
181 9: clrldi r5,r5,(64-4)
183 /* Up to 15B to go */
187 lwz r0,0(r4) /* Less chance of a reject with word ops */
/* Bail-out from the stack-using GPR path: pop the frame before falling back. */
213 .Lunwind_stack_nonvmx_copy:
214 addi r1,r1,STACKFRAMESIZE
217 #ifdef CONFIG_ALTIVEC
/*
 * VMX (Altivec) copy path.  NOTE(review): the reloads below imply
 * r0/r3/r4/r5 were spilled to the caller frame by elided code
 * (likely around an enter_vmx_copy call) - confirm in the full file.
 */
223 stdu r1,-STACKFRAMESIZE(r1)
226 ld r0,STACKFRAMESIZE+16(r1)
227 ld r3,STACKFRAMESIZE+48(r1)
228 ld r4,STACKFRAMESIZE+56(r1)
229 ld r5,STACKFRAMESIZE+64(r1)
233 * We prefetch both the source and destination using enhanced touch
234 * instructions. We use a stream ID of 0 for the load side and
235 * 1 for the store side.
239 ori r9,r9,1 /* stream=1 */
/* r7 = len >> 7, i.e. the copy length in 128B cachelines. */
241 srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */
245 1: lis r0,0x0E00 /* depth=7 */
248 ori r10,r7,1 /* stream=1 */
250 lis r8,0x8000 /* GO=1 */
/* Program the store-side stream, then kick both streams off with GO. */
258 dcbtst r0,r10,0b01010
260 dcbt r0,r8,0b01010 /* GO */
263 beq cr1,.Lunwind_stack_nonvmx_copy
266 * If source and destination are not relatively aligned we use a
267 * slower permute loop.
/* NOTE(review): r6 presumably holds src^dst (computed in elided code);
 * test its low 4 bits, i.e. relative alignment mod 16. */
270 rldicl. r6,r6,0,(64-4)
271 bne .Lvmx_unaligned_copy
273 /* Get the destination 16B aligned */
304 /* Get the destination 128B aligned */
/* Save the non-volatile GPRs used as scratch by the VMX cacheline loop. */
343 std r14,STK_REG(R14)(r1)
344 std r15,STK_REG(R15)(r1)
345 std r16,STK_REG(R16)(r1)
355 * Now do cacheline sized loads and stores. By this stage the
356 * cacheline stores are also cacheline aligned.
/* Cacheline loop done: restore the saved GPRs. */
380 ld r14,STK_REG(R14)(r1)
381 ld r15,STK_REG(R15)(r1)
382 ld r16,STK_REG(R16)(r1)
384 /* Up to 127B to go */
415 /* Up to 15B to go */
/* Keep only len mod 16: clear all but the low 4 bits of r5. */
416 11: clrldi r5,r5,(64-4)
/* Done: pop the frame and leave via exit_vmx_copy. */
440 15: addi r1,r1,STACKFRAMESIZE
442 b .exit_vmx_copy /* tail call optimise */
/*
 * Source and destination are not relatively 16B aligned: build a
 * permute control vector from the source address with lvsl, then
 * vperm adjacent vector pairs to realign each 16B of data on the fly.
 */
444 .Lvmx_unaligned_copy:
445 /* Get the destination 16B aligned */
469 lwz r0,0(r4) /* Less chance of a reject with word ops */
478 /* Get the destination 128B aligned */
488 lvsl vr16,0,r4 /* Setup permute control vector */
494 vperm vr8,vr0,vr1,vr16
502 vperm vr8,vr0,vr1,vr16
504 vperm vr9,vr1,vr0,vr16
512 vperm vr8,vr0,vr3,vr16
514 vperm vr9,vr3,vr2,vr16
516 vperm vr10,vr2,vr1,vr16
518 vperm vr11,vr1,vr0,vr16
/* Save the non-volatile GPRs used as scratch by the unaligned cacheline loop. */
529 std r14,STK_REG(R14)(r1)
530 std r15,STK_REG(R15)(r1)
531 std r16,STK_REG(R16)(r1)
541 * Now do cacheline sized loads and stores. By this stage the
542 * cacheline stores are also cacheline aligned.
/* Realign a full 128B cacheline (eight 16B vectors) per iteration. */
547 vperm vr8,vr0,vr7,vr16
549 vperm vr9,vr7,vr6,vr16
551 vperm vr10,vr6,vr5,vr16
553 vperm vr11,vr5,vr4,vr16
555 vperm vr12,vr4,vr3,vr16
557 vperm vr13,vr3,vr2,vr16
559 vperm vr14,vr2,vr1,vr16
561 vperm vr15,vr1,vr0,vr16
/* Cacheline loop done: restore the saved GPRs. */
574 ld r14,STK_REG(R14)(r1)
575 ld r15,STK_REG(R15)(r1)
576 ld r16,STK_REG(R16)(r1)
578 /* Up to 127B to go */
585 vperm vr8,vr0,vr3,vr16
587 vperm vr9,vr3,vr2,vr16
589 vperm vr10,vr2,vr1,vr16
591 vperm vr11,vr1,vr0,vr16
601 vperm vr8,vr0,vr1,vr16
603 vperm vr9,vr1,vr0,vr16
611 vperm vr8,vr0,vr1,vr16
616 /* Up to 15B to go */
/* Keep only len mod 16: clear all but the low 4 bits of r5. */
617 11: clrldi r5,r5,(64-4)
618 addi r4,r4,-16 /* Unwind the +16 load offset */
621 lwz r0,0(r4) /* Less chance of a reject with word ops */
/* Done: pop the frame and leave via exit_vmx_copy. */
644 15: addi r1,r1,STACKFRAMESIZE
646 b .exit_vmx_copy /* tail call optimise */
647 #endif /* CONFIG_ALTIVEC */