/* glibc: sysdeps/powerpc/powerpc64/power8/memset.S
   blob 459692670995e20acb688b9f5b40135eec35547e
   (captured at commit "Update copyright dates with scripts/update-copyrights.")  */
/* Optimized memset implementation for PowerPC64/POWER8.
   Copyright (C) 2014-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* mtvsrd v1,r4 encoded as a raw opcode — presumably so the file assembles
   with binutils that lack POWER8 mnemonics; confirm against the minimum
   binutils version glibc requires.  */
#define MTVSRD_V1_R4  .long 0x7c240166     /* mtvsrd  v1,r4  */

/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
   Returns 's'.

   Register usage:
     r3  - original DST; never modified, so it is the return value.
     r4  - set byte, replicated up to a full doubleword.
     r5  - remaining length in bytes.
     r10 - working DST pointer.
     r0, r11 - scratch (alignment amounts / tail byte counts).

   Tail handling pattern used throughout: a byte count is moved into a CR
   field with mtocrf, then individual CR bits are tested with bf/bt so that
   each set bit emits exactly one store of the matching power-of-two size.  */

	.machine power8
EALIGN (memset, 5, 0)
	CALL_MCOUNT 3

L(_memset):
	cmpldi	cr7,r5,31	/* n <= 31 takes the fully unrolled path.  */
	neg	r0,r3		/* -DST: low bits give distance to alignment.  */
	mr	r10,r3

	insrdi	r4,r4,8,48
	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
	ble	cr7,L(write_LT_32)

	andi.	r11,r10,15	/* Check alignment of DST.  */
	insrdi	r4,r4,32,0	/* Replicate word to double word.  */

	beq	L(big_aligned)

	mtocrf	0x01,r0		/* CR7 = low nibble of (-DST).  */
	clrldi	r0,r0,60	/* r0 = bytes needed to reach 16-byte alignment.  */

	/* Get DST aligned to 16 bytes.  */
1:	bf	31,2f
	stb	r4,0(r10)
	addi	r10,r10,1

2:	bf	30,4f
	sth	r4,0(r10)
	addi	r10,r10,2

4:	bf	29,8f
	stw	r4,0(r10)
	addi	r10,r10,4

8:	bf	28,16f
	std	r4,0(r10)
	addi	r10,r10,8

16:	subf	r5,r0,r5	/* Account for the alignment bytes written.  */

	.align	4
L(big_aligned):
	/* For sizes larger than 255 two possible paths:
	   - if constant is '0', zero full cache lines with dcbz
	   - otherwise uses vector instructions.  */
	cmpldi	cr5,r5,255
	dcbtst	0,r10		/* Prefetch DST line for store.  */
	cmpldi	cr6,r4,0
	crand	27,26,21	/* cr6.eq (c == 0) AND cr5.gt (n > 255).  */
	bt	27,L(huge_dcbz)
	bge	cr5,L(huge_vector)

	/* Size between 32 and 255 bytes with constant different than 0, use
	   doubleword store instruction to achieve best throughput.  */
	srdi	r8,r5,5		/* r8 = number of 32-byte chunks.  */
	clrldi	r11,r5,59	/* r11 = n % 32.  */
	cmpldi	cr6,r11,0
	cmpdi	r8,0
	beq	L(tail_bytes)
	mtctr	r8

	/* Main aligned write loop, writes 32-bytes at a time.  */
	.align	4
L(big_loop):
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32
	bdz	L(tail_bytes)	/* CTR counts 32-byte chunks; body is 2x unrolled.  */

	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32
	bdnz	L(big_loop)

	b	L(tail_bytes)

	/* Write remaining 1~31 bytes.  */
	.align	4
L(tail_bytes):
	beqlr	cr6		/* No remainder: done.  */

	srdi	r7,r11,4	/* r7 = number of 16-byte chunks (0 or 1).  */
	clrldi	r8,r11,60	/* r8 = remainder % 16.  */
	mtocrf	0x01,r7

	.align	4
	bf	31,8f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
8:	mtocrf	0x1,r8		/* CR7 bits now select 8/4/2/1-byte stores.  */
	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf	29,2f
	stw	r4,0(r10)
	addi	r10,r10,4

	.align	4
2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Size larger than 255 bytes with constant different than 0, use
	   vector instruction to achieve best throughput.  */
L(huge_vector):
	/* Replicate set byte to quadword in VMX register.  */
	MTVSRD_V1_R4		/* v1 (high doubleword) = r4.  */
	xxpermdi 32,v0,v1,0	/* vs32 (= v0): place the set doubleword for splat.  */
	vspltb	 v2,v0,15	/* v2 = set byte splatted into all 16 lanes.  */

	/* Main aligned write loop: 128 bytes at a time.  */
	li	r6,16
	li	r7,32
	li	r8,48
	mtocrf	0x02,r5		/* CR6 bits <- n's 64/32/16-byte count bits.  */
	srdi	r12,r5,7	/* r12 = number of 128-byte chunks.  */
	cmpdi	r12,0
	beq	L(aligned_tail)
	mtctr	r12
	b	L(aligned_128loop)

	.align	4
L(aligned_128loop):
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	bdnz	L(aligned_128loop)

	/* Write remaining 1~127 bytes.  */
L(aligned_tail):
	mtocrf	0x01,r5		/* CR7 bits <- n's 8/4/2/1-byte count bits.  */
	bf	25,32f		/* CR bit 25 <-> 64 remaining bytes.  */
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64

32:	bf	26,16f		/* CR bit 26 <-> 32 remaining bytes.  */
	stvx	v2,0,r10
	stvx	v2,r10,r6
	addi	r10,r10,32

16:	bf	27,8f		/* CR bit 27 <-> 16 remaining bytes.  */
	stvx	v2,0,r10
	addi	r10,r10,16

8:	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	/* Copies 4~7 bytes.  */
4:	bf	29,L(tail2)
	stw	r4,0(r10)
	bf	30,L(tail5)
	sth	r4,4(r10)
	bflr	31
	stb	r4,6(r10)
	/* Return original DST pointer.  */
	blr

	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
	   Before using dcbz though, we need to get the destination 128-byte
	   aligned.  */
	.align	4
L(huge_dcbz):
	andi.	r11,r10,127
	neg	r0,r10
	beq	L(huge_dcbz_aligned)

	clrldi	r0,r0,57	/* r0 = bytes to reach 128-byte alignment.  */
	subf	r5,r0,r5
	srdi	r0,r0,3		/* r0 = that distance in doublewords.  */
	mtocrf	0x01,r0		/* CR7 bits select 64/32/16/8-byte stores.  */

	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
8:	bf	28,4f

	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	29,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	30,1f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
1:	bf	31,L(huge_dcbz_aligned)
	std	r4,0(r10)
	addi	r10,r10,8

L(huge_dcbz_aligned):
	/* Setup dcbz unroll offsets and count numbers.  */
	srdi	r8,r5,9		/* r8 = number of 512-byte chunks.  */
	clrldi	r11,r5,55	/* r11 = n % 512.  */
	cmpldi	cr6,r11,0
	li	r9,128
	cmpdi	r8,0
	beq	L(huge_tail)
	li	r7,256
	li	r6,384
	mtctr	r8

	.align	4
L(huge_loop):
	/* Sets 512 bytes to zero in each iteration, the loop unrolling shows
	   a throughput boost for large sizes (2048 bytes or higher).  */
	dcbz	0,r10
	dcbz	r9,r10
	dcbz	r7,r10
	dcbz	r6,r10
	addi	r10,r10,512
	bdnz	L(huge_loop)

	beqlr	cr6		/* No remainder after the 512-byte loop: done.  */

L(huge_tail):
	srdi	r6,r11,8	/* r6 = number of 256-byte chunks (0 or 1).  */
	srdi	r7,r11,4	/* r7 = remainder in 16-byte units.  */
	clrldi	r8,r11,4	/* Only r8's low nibble feeds mtocrf below.  */
	cmpldi	cr6,r8,0
	mtocrf	0x01,r6

	beq	cr6,L(tail)

	/* We have 1~511 bytes remaining.  */
	.align	4
32:	bf	31,16f		/* r6 bit 0 set: 256 bytes via two dcbz.  */
	dcbz	0,r10
	dcbz	r9,r10
	addi	r10,r10,256

	.align	4
16:	mtocrf	0x01,r7		/* CR7 bits select 128/.../16-byte stores.  */
	bf	28,8f
	dcbz	0,r10
	addi	r10,r10,128

	.align	4
8:	bf	29,4f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	30,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	31,L(tail)
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16
	.align	4

	/* Remaining 1~15 bytes.  */
L(tail):
	mtocrf	0x01,r8		/* CR7 bits select 8/4/2/1-byte stores.  */

	.align	4
8:	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf	29,2f
	stw	r4,0(r10)
	addi	r10,r10,4

	.align	4
2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
	   by just unrolling all operations.  */
	.align	4
L(write_LT_32):
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(write_LE_8)

	/* At least 9 bytes to go.  */
	neg	r8,r4
	andi.	r0,r8,3
	/* NOTE(review): r0 above is derived from r4 (the replicated set
	   byte), not from the DST address, and the comment below is
	   inherited from memcpy (memset has no SRC).  The byte accounting
	   still balances — exactly r0 bytes are stored and subtracted from
	   n — so the result is correct, but DST is not necessarily 4-byte
	   aligned afterwards.  Confirm intent upstream.  */
	cmpldi	cr1,r5,16
	beq	L(write_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,r0
	subf	r5,r0,r5

2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

1:	bf	31,L(end_4bytes_alignment)
	stb	r4,0(r10)
	addi	r10,r10,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(write_LT_32_aligned):
	blt	cr1,8f

	stw	r4,0(r10)
	stw	r4,4(r10)
	stw	r4,8(r10)
	stw	r4,12(r10)
	addi	r10,r10,16

8:	bf	28,L(tail4)
	stw	r4,0(r10)
	stw	r4,4(r10)
	addi	r10,r10,8

	.align	4
	/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	stw	r4,0(r10)
	bf	30,L(tail5)
	sth	r4,4(r10)
	bflr	31
	stb	r4,6(r10)
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	sth	r4,0(r10)
	bflr	31
	stb	r4,2(r10)
	blr

	.align	4
L(tail5):
	bflr	31
	stb	r4,4(r10)
	blr

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(write_LE_8):
	bne	cr6,L(tail4)

	stw	r4,0(r10)
	stw	r4,4(r10)
	blr
END_GEN_TB (memset,TB_TOCLESS)
libc_hidden_builtin_def (memset)
/* Copied from bzero.S to prevent the linker from inserting a stub
   between bzero and memset.  */

/* void [r3] __bzero (void *s [r3], size_t n [r4])
   Implemented as a tail branch into memset's body with c = 0.  */
ENTRY (__bzero)
	CALL_MCOUNT 3
	mr	r5,r4		/* Length moves to memset's n register.  */
	li	r4,0		/* Fill value is zero.  */
	b	L(_memset)	/* Enter past memset's CALL_MCOUNT (done above).  */
END (__bzero)
#ifndef __bzero
weak_alias (__bzero, bzero)
#endif