1 /* Optimized memset implementation for PowerPC64/POWER8.
2 Copyright (C) 2014-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
/* Raw 32-bit instruction word for the instruction named in the trailing
   comment, emitted with .long instead of being written as a mnemonic.
   NOTE(review): presumably hand-encoded so the file still assembles with
   toolchains that do not recognize the mtvsrd mnemonic -- confirm.  */
21 #define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */
23 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
36 insrdi r4,r4,16,32 /* Replicate byte to word. */
37 ble cr7,L(write_LT_32)
39 andi. r11,r10,15 /* Check alignment of DST. */
40 insrdi r4,r4,32,0 /* Replicate word to double word. */
47 /* Get DST aligned to 16 bytes. */
68 /* For sizes larger than 255 there are two possible paths:
69 - if the constant is '0', zero full cache lines with dcbz
70 - otherwise, use vector instructions. */
76 bge cr5,L(huge_vector)
79 /* Size between 32 and 255 bytes with constant different than 0, use
80 doubleword store instruction to achieve best throughput. */
88 /* Main aligned write loop, writes 32-bytes at a time. */
107 /* Write remaining 1~31 bytes. */
143 /* Size larger than 255 bytes with constant different than 0, use
144 vector instruction to achieve best throughput. */
146 /* Replicate set byte to quadword in VMX register. */
151 /* Main aligned write loop: 128 bytes at a time. */
174 bdnz L(aligned_128loop)
176 /* Write remaining 1~127 bytes. */
199 /* Writes 4~7 bytes. */
206 /* Return original DST pointer. */
209 /* Special case when value is 0 and we have a long length to deal
210 with. Use dcbz to zero out a full cacheline of 128 bytes at a time.
211 Before using dcbz though, we need to get the destination 128-byte aligned. */
217 beq L(huge_dcbz_aligned)
224 /* Write 1~128 bytes until DST is aligned to 128 bytes. */
252 1: bf 31,L(huge_dcbz_aligned)
256 L(huge_dcbz_aligned):
257 /* Setup dcbz unroll offsets and count numbers. */
270 /* Set 512 bytes to zero in each iteration; the loop unrolling shows
271 a throughput boost for large sizes (2048 bytes or higher). */
290 /* We have 1~511 bytes remaining. */
330 /* Remaining 1~15 bytes. */
354 /* Handle short writes of 0~31 bytes. Best throughput is achieved
355 by just unrolling all operations. */
360 ble cr6,L(write_LE_8)
362 /* At least 9 bytes to go. */
366 beq L(write_LT_32_aligned)
368 /* Force 4-byte alignment for DST. */
376 1: bf 31,L(end_4bytes_alignment)
381 L(end_4bytes_alignment):
385 L(write_LT_32_aligned):
400 /* Writes 4~7 bytes. */
411 /* Writes 2~3 bytes. */
430 /* Handles writes of 0~8 bytes. */
438 END_GEN_TB (memset,TB_TOCLESS)
439 libc_hidden_builtin_def (memset)
441 /* Copied from bzero.S to prevent the linker from inserting a stub
442 between bzero and memset. */
450 weak_alias (__bzero, bzero)