sysdeps/powerpc/powerpc64/power8/memset.S

/* Optimized memset implementation for PowerPC64/POWER8.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.  */

#ifndef MEMSET
# define MEMSET memset
#endif
        .machine  power8
ENTRY_TOCLESS (MEMSET, 5)
        CALL_MCOUNT 3

L(_memset):
        cmpldi  cr7,r5,31
        neg     r0,r3
        mr      r10,r3

        insrdi  r4,r4,8,48
        insrdi  r4,r4,16,32     /* Replicate byte to word.  */
        ble     cr7,L(write_LT_32)

        andi.   r11,r10,15      /* Check alignment of DST.  */
        insrdi  r4,r4,32,0      /* Replicate word to doubleword.  */
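
        /* After the three insrdi above every byte of r4 equals 'c'.
           In C terms (illustration only):
             w = c & 0xff;  w |= w << 8;  w |= w << 16;  w |= w << 32;  */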
        beq     L(big_aligned)

        mtocrf  0x01,r0
        clrldi  r0,r0,60

        /* Get DST aligned to 16 bytes.  */
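        /* mtocrf 0x01 copied the low four bits of the fixup count into
           CR7; each bf below falls through to its store only when the
           matching 1/2/4/8 bit is set, so exactly r0 bytes are written.  */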
1:      bf      31,2f
        stb     r4,0(r10)
        addi    r10,r10,1

2:      bf      30,4f
        sth     r4,0(r10)
        addi    r10,r10,2

4:      bf      29,8f
        stw     r4,0(r10)
        addi    r10,r10,4

8:      bf      28,16f
        std     r4,0(r10)
        addi    r10,r10,8

16:     subf    r5,r0,r5
        .align  4
L(big_aligned):
        /* For sizes larger than 255 there are two possible paths:
           - if the constant is '0', zero full cache lines with dcbz;
           - otherwise, use vector instructions.  */
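        /* cr5 holds the (n <=> 255) comparison and cr6 holds (c == 0);
           crand folds "c == 0 && n > 255" into CR bit 27 so a single
           branch (bt 27) selects the dcbz path.  */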
        cmpldi  cr5,r5,255
        dcbtst  0,r10
        cmpldi  cr6,r4,0
        crand   27,26,21
        bt      27,L(huge_dcbz)
        bge     cr5,L(huge_vector)

        /* For a size between 32 and 255 bytes with a constant different
           from 0, use doubleword stores to achieve best throughput.  */
        srdi    r8,r5,5
        clrldi  r11,r5,59
        cmpldi  cr6,r11,0
        cmpdi   r8,0
        beq     L(tail_bytes)
        mtctr   r8
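
        /* r8 = n / 32 is the loop count and r11 = n % 32 the tail;
           cr6 records whether the tail is empty for L(tail_bytes).  */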
        /* Main aligned write loop, writes 32 bytes at a time.  */
        .align  4
L(big_loop):
        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        addi    r10,r10,32
        bdz     L(tail_bytes)

        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        addi    r10,r10,32
        bdnz    L(big_loop)

        b       L(tail_bytes)

        /* Write remaining 1~31 bytes.  */
        .align  4
L(tail_bytes):
        beqlr   cr6

        srdi    r7,r11,4
        clrldi  r8,r11,60
        mtocrf  0x01,r7
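
        /* CR7 got the 16-byte bit from r7; r8 keeps the low four bits
           for the 8/4/2/1 byte stores below.  */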
        .align  4
        bf      31,8f
        std     r4,0(r10)
        std     r4,8(r10)
        addi    r10,r10,16

        .align  4
8:      mtocrf  0x01,r8
        bf      28,4f
        std     r4,0(r10)
        addi    r10,r10,8

        .align  4
4:      bf      29,2f
        stw     r4,0(r10)
        addi    r10,r10,4

        .align  4
2:      bf      30,1f
        sth     r4,0(r10)
        addi    r10,r10,2

        .align  4
1:      bflr    31
        stb     r4,0(r10)
        blr

        /* For sizes larger than 255 bytes with a constant different
           from 0, use vector instructions to achieve best throughput.  */
L(huge_vector):
        /* Replicate the set byte to a quadword in a VMX register.  */
        mtvsrd   v1,r4
        xxpermdi 32,v0,v1,0
        vspltb   v2,v0,15
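
        /* mtvsrd moves the already-replicated doubleword in r4 into v1;
           xxpermdi copies it into the second doubleword of vs32 (v0);
           vspltb then broadcasts one of those bytes so that all 16
           lanes of v2 hold 'c'.  */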

        /* Main aligned write loop: 128 bytes at a time.  */
        li      r6,16
        li      r7,32
        li      r8,48
        mtocrf  0x02,r5
        srdi    r12,r5,7
        cmpdi   r12,0
        beq     L(aligned_tail)
        mtctr   r12
        b       L(aligned_128loop)

        .align  4
L(aligned_128loop):
        stvx    v2,0,r10
        stvx    v2,r10,r6
        stvx    v2,r10,r7
        stvx    v2,r10,r8
        addi    r10,r10,64
        stvx    v2,0,r10
        stvx    v2,r10,r6
        stvx    v2,r10,r7
        stvx    v2,r10,r8
        addi    r10,r10,64
        bdnz    L(aligned_128loop)
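
        /* mtocrf 0x02,r5 above copied the 64/32/16 weight bits of the
           length into CR6, so the bf 25/26/27 tests below peel a 64-,
           32- and 16-byte chunk off the remainder without recomputing
           it.  */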
        /* Write remaining 1~127 bytes.  */
L(aligned_tail):
        mtocrf  0x01,r5
        bf      25,32f
        stvx    v2,0,r10
        stvx    v2,r10,r6
        stvx    v2,r10,r7
        stvx    v2,r10,r8
        addi    r10,r10,64

32:     bf      26,16f
        stvx    v2,0,r10
        stvx    v2,r10,r6
        addi    r10,r10,32

16:     bf      27,8f
        stvx    v2,0,r10
        addi    r10,r10,16

8:      bf      28,4f
        std     r4,0(r10)
        addi    r10,r10,8

        /* Write 4~7 bytes.  */
4:      bf      29,L(tail2)
        stw     r4,0(r10)
        bf      30,L(tail5)
        sth     r4,4(r10)
        bflr    31
        stb     r4,6(r10)
        /* Return original DST pointer.  */
        blr

        /* Special case when value is 0 and we have a long length to
           deal with.  Use dcbz to zero out a full cache line of 128
           bytes at a time.  Before using dcbz though, we need to get
           the destination 128-byte aligned.  */
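        /* Illustrative C-level shape of this path (not part of the
           build):

             while ((uintptr_t) dst & 127)   // align to a cache line
               ...                           // 8~64 byte stores below
             for (; blocks != 0; blocks--, dst += 512)
               dcbz dst+0, dst+128, dst+256, dst+384;

           dcbz establishes a zeroed 128-byte line in the cache without
           first reading it from memory, which is why this path wins for
           large zeroing sizes.  */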
        .align  4
L(huge_dcbz):
        andi.   r11,r10,127
        neg     r0,r10
        beq     L(huge_dcbz_aligned)

        clrldi  r0,r0,57
        subf    r5,r0,r5
        srdi    r0,r0,3
        mtocrf  0x01,r0

        /* Write 1~128 bytes until DST is aligned to 128 bytes.  */
8:      bf      28,4f
        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        std     r4,32(r10)
        std     r4,40(r10)
        std     r4,48(r10)
        std     r4,56(r10)
        addi    r10,r10,64

        .align  4
4:      bf      29,2f
        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        addi    r10,r10,32

        .align  4
2:      bf      30,1f
        std     r4,0(r10)
        std     r4,8(r10)
        addi    r10,r10,16

        .align  4
1:      bf      31,L(huge_dcbz_aligned)
        std     r4,0(r10)
        addi    r10,r10,8

L(huge_dcbz_aligned):
        /* Set up the dcbz unroll offsets and loop count.  */
        srdi    r8,r5,9
        clrldi  r11,r5,55
        cmpldi  cr6,r11,0
        li      r9,128
        cmpdi   r8,0
        beq     L(huge_tail)
        li      r7,256
        li      r6,384
        mtctr   r8
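
        /* r8 = n / 512 full blocks; r11 = n % 512 is left for
           L(huge_tail).  r9, r7 and r6 hold the 128-, 256- and 384-byte
           offsets for the four dcbz issued per iteration below.  */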
        .align  4
L(huge_loop):
        /* Set 512 bytes to zero in each iteration; the loop unrolling
           shows a throughput boost for large sizes (2048 bytes or
           higher).  */
        dcbz    0,r10
        dcbz    r9,r10
        dcbz    r7,r10
        dcbz    r6,r10
        addi    r10,r10,512
        bdnz    L(huge_loop)

        beqlr   cr6

L(huge_tail):
        srdi    r6,r11,8
        srdi    r7,r11,4
        clrldi  r8,r11,4
        cmpldi  cr6,r8,0
        mtocrf  0x01,r6

        beq     cr6,L(tail)
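
        /* CR7 now holds the 256-byte bit of the remainder (from r6);
           r7 and r8 feed the 128~16 and 8~1 byte tests below.  */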
        /* We have 1~511 bytes remaining.  */
        .align  4
32:     bf      31,16f
        dcbz    0,r10
        dcbz    r9,r10
        addi    r10,r10,256

        .align  4
16:     mtocrf  0x01,r7
        bf      28,8f
        dcbz    0,r10
        addi    r10,r10,128

        .align  4
8:      bf      29,4f
        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        std     r4,32(r10)
        std     r4,40(r10)
        std     r4,48(r10)
        std     r4,56(r10)
        addi    r10,r10,64

        .align  4
4:      bf      30,2f
        std     r4,0(r10)
        std     r4,8(r10)
        std     r4,16(r10)
        std     r4,24(r10)
        addi    r10,r10,32

        .align  4
2:      bf      31,L(tail)
        std     r4,0(r10)
        std     r4,8(r10)
        addi    r10,r10,16

        /* Remaining 1~15 bytes.  */
        .align  4
L(tail):
        mtocrf  0x01,r8

        .align  4
8:      bf      28,4f
        std     r4,0(r10)
        addi    r10,r10,8

        .align  4
4:      bf      29,2f
        stw     r4,0(r10)
        addi    r10,r10,4

        .align  4
2:      bf      30,1f
        sth     r4,0(r10)
        addi    r10,r10,2

        .align  4
1:      bflr    31
        stb     r4,0(r10)
        blr

        /* Handle short writes of 0~31 bytes.  Best throughput is
           achieved by just unrolling all operations.  */
        .align  4
L(write_LT_32):
        cmpldi  cr6,r5,8
        mtocrf  0x01,r5
        ble     cr6,L(write_LE_8)

        /* At least 9 bytes to go.  */
        neg     r8,r4
        andi.   r0,r8,3
        cmpldi  cr1,r5,16
        beq     L(write_LT_32_aligned)

        /* Force 4-byte alignment for DST.  */
        mtocrf  0x01,r0
        subf    r5,r0,r5

2:      bf      30,1f
        /* Use stb instead of sth because it doesn't generate
           alignment interrupts on cache-inhibited storage.  */
        stb     r4,0(r10)
        stb     r4,1(r10)
        addi    r10,r10,2

1:      bf      31,L(end_4bytes_alignment)
        stb     r4,0(r10)
        addi    r10,r10,1

        .align  4
L(end_4bytes_alignment):
        cmpldi  cr1,r5,16
        mtocrf  0x01,r5

L(write_LT_32_aligned):
        blt     cr1,8f

        stw     r4,0(r10)
        stw     r4,4(r10)
        stw     r4,8(r10)
        stw     r4,12(r10)
        addi    r10,r10,16

8:      bf      28,L(tail4)
        stw     r4,0(r10)
        stw     r4,4(r10)
        addi    r10,r10,8

        .align  4
        /* Write 4~7 bytes.  */
L(tail4):
        bf      29,L(tail2)
        stw     r4,0(r10)
        bf      30,L(tail5)
        sth     r4,4(r10)
        bflr    31
        stb     r4,6(r10)
        blr

        .align  4
        /* Write 2~3 bytes.  */
L(tail2):
        bf      30,1f
        sth     r4,0(r10)
        bflr    31
        stb     r4,2(r10)
        blr

        .align  4
L(tail5):
        bflr    31
        stb     r4,4(r10)
        blr

        .align  4
1:      bflr    31
        stb     r4,0(r10)
        blr

        /* Handle writes of 0~8 bytes.  */
        .align  4
L(write_LE_8):
        bne     cr6,L(LE7_tail4)

        /* If DST is word aligned, use stw, else use stb.  */
        andi.   r0,r10,3
        bne     L(8_unalign)

        stw     r4,0(r10)
        stw     r4,4(r10)
        blr

        /* Unaligned DST and size is 8.  */
        .align  4
L(8_unalign):
        andi.   r0,r10,1
        beq     L(8_hwalign)
        stb     r4,0(r10)
        sth     r4,1(r10)
        sth     r4,3(r10)
        sth     r4,5(r10)
        stb     r4,7(r10)
        blr

        /* Halfword-aligned DST and size is 8.  */
        .align  4
L(8_hwalign):
        sth     r4,0(r10)
        sth     r4,2(r10)
        sth     r4,4(r10)
        sth     r4,6(r10)
        blr

        .align  4
        /* Write 4~7 bytes.  */
L(LE7_tail4):
        /* Use stb instead of sth because it doesn't generate
           alignment interrupts on cache-inhibited storage.  */
        bf      29,L(LE7_tail2)
        stb     r4,0(r10)
        stb     r4,1(r10)
        stb     r4,2(r10)
        stb     r4,3(r10)
        bf      30,L(LE7_tail5)
        stb     r4,4(r10)
        stb     r4,5(r10)
        bflr    31
        stb     r4,6(r10)
        blr

        .align  4
        /* Write 2~3 bytes.  */
L(LE7_tail2):
        bf      30,1f
        stb     r4,0(r10)
        stb     r4,1(r10)
        bflr    31
        stb     r4,2(r10)
        blr

        .align  4
L(LE7_tail5):
        bflr    31
        stb     r4,4(r10)
        blr

        .align  4
1:      bflr    31
        stb     r4,0(r10)
        blr

END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)