sysdeps/powerpc/powerpc64/power9/strcmp.S

   1 /* Optimized strcmp implementation for PowerPC64/POWER9.
   2    Copyright (C) 2016-2018 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18 #ifdef __LITTLE_ENDIAN__
  19 #include <sysdep.h>
  20
  21 #ifndef STRCMP
  22 # define STRCMP strcmp
  23 #endif
  24
  25 /* Implements the function
  26
  27    int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
  28
  29    The implementation uses unaligned doubleword access for the first 16 bytes
  30    (the same technique as the POWER8 patch) and uses vectorised loops after that.  */
  31
  32 /* TODO: Change this to actual instructions when minimum binutils is upgraded
  33    to 2.27. Macros are defined below for these newer instructions in order
  34    to maintain compatibility.  */
  35 # define VCTZLSBB(dst,vec) .long (((dst) << 21) | ((vec) << 11) | 0x10010602)  /* Hand-encoded vctzlsbb: dst (GPR number) goes at bit 21, vec (vector register number) at bit 11; both must expand to plain integers.  */
  36
  37 # define VEXTUBRX(dst,idx,vec) .long (((dst) << 21) \
  38                                 | ((idx) << 16)  \
  39                                 | ((vec) << 11)  \
  40                                 | 0x1000070d)  /* Hand-encoded vextubrx: dst and idx are GPR numbers (bits 21 and 16), vec is a vector register number (bit 11); all must expand to plain integers.  */
  41
  42 # define VCMPNEZB(res,lhs,rhs) .long (((res) << 21) \
  43                                 | ((lhs) << 16)  \
  44                                 | ((rhs) << 11)  \
  45                                 | 0x10000507)  /* Hand-encoded vcmpnezb whose result the callers test through cr6: res, lhs and rhs are vector register numbers at bits 21, 16 and 11; all must expand to plain integers.  */
  46
  47 /* Get 16 bytes for unaligned case.  Expects v0 = all zero bytes and v2 = all 0xff bytes; clobbers v8, v9, r6 and cr6; uses local labels 1 and 2.
  48    reg1: Vector to hold next 16 bytes (first used as scratch for the quadword loaded at reg2).
  49    reg2: Address to read from (not modified).
  50    reg3: Permute control vector (the callers build it with lvsr on the same address).  */
  51 # define GET16BYTES(reg1, reg2, reg3) \
  52         lvx     reg1, 0, reg2; /* Load the quadword at reg2.  */ \
  53         vperm   v8, v2, reg1, reg3; /* v8 = that quadword permuted with 0xff filler bytes from v2.  */ \
  54         vcmpequb.       v8, v0, v8; /* Compare every byte of v8 with zero, recording the outcome in cr6.  */ \
  55         beq     cr6, 1f; /* No byte matched zero: go load the following quadword.  */ \
  56         vspltisb        v9, 0; /* Otherwise use zeros in place of the following quadword.  */ \
  57         b       2f; \
  58         .align 4; \
  59 1: \
  60         addi    r6, reg2, 16; /* r6 = address of the following quadword.  */ \
  61         lvx     v9, 0, r6; \
  62 2: \
  63         vperm   reg1, v9, reg1, reg3; /* Combine v9 and the first quadword into the 16 bytes returned in reg1.  */
  64
  65 /* TODO: change this to .machine power9 when the minimum required binutils
  66    allows it.  */
  67
  68         .machine  power7
  69 ENTRY_TOCLESS (STRCMP, 4)   /* int strcmp (s1 in r3, s2 in r4); the result is returned in r3.  */
  70         li      r0, 0   /* r0 = 0: zero pattern for cmpb and zero index for lvx below.  */
  71
  72         /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
  73            the code:
  74
  75             (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
  76
  77            with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
  78
  79         rldicl  r7, r3, 0, 52   /* r7 = s1 % 4096 (keep the low 12 bits).  */
  80         rldicl  r9, r4, 0, 52   /* r9 = s2 % 4096.  */
  81         cmpldi  cr7, r7, 4096-16
  82         bgt     cr7, L(pagecross_check)
  83         cmpldi  cr5, r9, 4096-16
  84         bgt     cr5, L(pagecross_check)
  85
  86         /* For short strings up to 16 bytes, load both s1 and s2 using
  87            unaligned dwords and compare.  */
  88         ld      r8, 0(r3)   /* r8 = bytes 0..7 of s1.  */
  89         ld      r10, 0(r4)   /* r10 = bytes 0..7 of s2.  */
  90         cmpb    r12, r8, r0   /* r12 = 0xff in each byte position where s1 holds a zero byte.  */
  91         cmpb    r11, r8, r10   /* r11 = 0xff in each byte position where s1 and s2 agree.  */
  92         orc.    r9, r12, r11   /* r9 = r12 | ~r11: non-zero on a terminator or a mismatch.  */
  93         bne     cr0, L(different_nocmpb)
  94
  95         ld      r8, 8(r3)   /* Same check for bytes 8..15.  */
  96         ld      r10, 8(r4)
  97         cmpb    r12, r8, r0
  98         cmpb    r11, r8, r10
  99         orc.    r9, r12, r11
 100         bne     cr0, L(different_nocmpb)
 101
 102         addi    r7, r3, 16   /* r7 becomes the s1 cursor for the vector code.  */
 103         addi    r4, r4, 16   /* r4 stays the s2 cursor.  */
 104
 105 L(align):
 106         /* Now it has checked for first 16 bytes.  */
 107         vspltisb        v0, 0   /* v0 = all zero bytes (used by GET16BYTES).  */
 108         vspltisb        v2, -1   /* v2 = all 0xff bytes (used by GET16BYTES).  */
 109         lvsr    v6, 0, r4   /* Compute permute mask for s2.  */
 110         or      r5, r4, r7
 111         andi.   r5, r5, 0xF   /* Low four bits of (s1 | s2): zero when both are 16-byte aligned.  */
 112         beq     cr0, L(aligned)
 113         andi.   r5, r7, 0xF   /* Only s1 aligned: go straight to the s1-aligned loop.  */
 114         beq     cr0, L(s1_align)
 115         lvsr    v10, 0, r7   /* Compute permute mask for s1.  */
 116
 117         /* Both s1 and s2 are unaligned.  */
 118         GET16BYTES(v4, r7, v10)   /* v4 = next 16 bytes of s1.  */
 119         GET16BYTES(v5, r4, v6)   /* v5 = next 16 bytes of s2.  */
 120         VCMPNEZB(v7, v5, v4)   /* v7 = per-byte compare result, summarised in cr6.  */
 121         beq     cr6, L(match)   /* cr6 eq: nothing flagged in these 16 bytes.  */
 122         b       L(different)
 123
 124         /* Align s1 to qw and adjust s2 address.  */
 125         .align  4
 126 L(match):
 127         clrldi  r6, r7, 60   /* r6 = s1 & 0xf.  */
 128         subfic  r5, r6, 16   /* r5 = 16 - r6: bytes up to the next 16-byte boundary of s1.  */
 129         add     r7, r7, r5   /* Advance both cursors by the same amount.  */
 130         add     r4, r4, r5
 131         andi.   r5, r4, 0xF   /* If s2 is now aligned as well, use the fully aligned loop.  */
 132         beq     cr0, L(aligned)
 133         lvsr    v6, 0, r4   /* Recompute the s2 permute mask for the new s2 address.  */
 134         /* There are 2 loops depending on the input alignment.
 135            Each loop gets 16 bytes from s1 and s2 and compares.
 136            Loop until a mismatch or null occurs.  */
 137 L(s1_align):
 138         lvx     v4, r7, r0   /* v4 = 16 bytes of s1 at r7 (r0 holds 0).  */
 139         GET16BYTES(v5, r4, v6)   /* v5 = 16 bytes of s2 via the unaligned helper.  */
 140         VCMPNEZB(v7, v5, v4)
 141         addi    r7, r7, 16
 142         addi    r4, r4, 16
 143         bne     cr6, L(different)
 144
 145         lvx     v4, r7, r0   /* Second unrolled iteration.  */
 146         GET16BYTES(v5, r4, v6)
 147         VCMPNEZB(v7, v5, v4)
 148         addi    r7, r7, 16
 149         addi    r4, r4, 16
 150         bne     cr6, L(different)
 151
 152         lvx     v4, r7, r0   /* Third unrolled iteration.  */
 153         GET16BYTES(v5, r4, v6)
 154         VCMPNEZB(v7, v5, v4)
 155         addi    r7, r7, 16
 156         addi    r4, r4, 16
 157         bne     cr6, L(different)
 158
 159         lvx     v4, r7, r0   /* Fourth unrolled iteration; loops back while nothing is flagged.  */
 160         GET16BYTES(v5, r4, v6)
 161         VCMPNEZB(v7, v5, v4)
 162         addi    r7, r7, 16
 163         addi    r4, r4, 16
 164         beq     cr6, L(s1_align)
 165         b       L(different)
 166
 167         .align  4
 168 L(aligned):
 169         lvx     v4, 0, r7   /* v4 = 16 bytes of s1, v5 = 16 bytes of s2; both cursors are aligned here.  */
 170         lvx     v5, 0, r4
 171         VCMPNEZB(v7, v5, v4)
 172         addi    r7, r7, 16
 173         addi    r4, r4, 16
 174         bne     cr6, L(different)
 175
 176         lvx     v4, 0, r7   /* Second unrolled iteration.  */
 177         lvx     v5, 0, r4
 178         VCMPNEZB(v7, v5, v4)
 179         addi    r7, r7, 16
 180         addi    r4, r4, 16
 181         bne     cr6, L(different)
 182
 183         lvx     v4, 0, r7   /* Third unrolled iteration.  */
 184         lvx     v5, 0, r4
 185         VCMPNEZB(v7, v5, v4)
 186         addi    r7, r7, 16
 187         addi    r4, r4, 16
 188         bne     cr6, L(different)
 189
 190         lvx     v4, 0, r7   /* Fourth unrolled iteration; falls through to L(different) when a byte is flagged.  */
 191         lvx     v5, 0, r4
 192         VCMPNEZB(v7, v5, v4)
 193         addi    r7, r7, 16
 194         addi    r4, r4, 16
 195         beq     cr6, L(aligned)
 196
 197         /* Calculate and return the difference.  */
 198 L(different):
 199         VCTZLSBB(r6, v7)   /* r6 = position of the first flagged byte in v7, used as the index below.  */
 200         VEXTUBRX(r5, r6, v4)   /* r5 = byte of s1 (v4) at index r6.  */
 201         VEXTUBRX(r4, r6, v5)   /* r4 = byte of s2 (v5) at index r6.  */
 202         subf    r3, r4, r5   /* r3 = r5 - r4, i.e. s1 byte minus s2 byte.  */
 203         extsw   r3, r3   /* Sign-extend the 32-bit int result.  */
 204         blr
 205
 206         .align  4
 207 L(different_nocmpb):
 208         neg     r3, r9   /* r9 & -r9 isolates the lowest set bit of the cmpb mask.  */
 209         and     r9, r9, r3
 210         cntlzd  r9, r9
 211         subfic  r9, r9, 63   /* r9 = 63 - leading zeros = bit index of that lowest set bit.  */
 212         srd     r3, r8, r9   /* Shift the first flagged byte of s1 down to the low byte.  */
 213         srd     r10, r10, r9   /* Same for s2.  */
 214         rldicl  r10, r10, 0, 56   /* Keep only the low 8 bits of each.  */
 215         rldicl  r3, r3, 0, 56
 216         subf    r3, r10, r3   /* r3 = s1 byte minus s2 byte.  */
 217         extsw   r3, r3
 218         blr
 219
 220         .align  4
 221 L(pagecross_check):
 222         subfic  r9, r9, 4096   /* r9 = 4096 - (s2 % 4096).  */
 223         subfic  r7, r7, 4096   /* r7 = 4096 - (s1 % 4096).  */
 224         cmpld   cr7, r7, r9
 225         bge     cr7, L(pagecross)   /* Keep r7 when it is already the larger value.  */
 226         mr      r7, r9   /* Otherwise r7 = r9, so r7 holds the larger of the two.  */
 227
 228         /* If unaligned 16 bytes reads across a 4K page boundary, it uses
 229            a simple byte by byte comparison for r7 bytes (set above to the
 230            larger of the two distances to the next 4K boundary).  */
 231 L(pagecross):
 232         add     r7, r3, r7   /* r7 = s1 + byte count: the s1 cursor once the byte loop completes.  */
 233         subf    r9, r3, r7   /* r9 = r7 - s1 = byte count.  */
 234         mtctr   r9   /* CTR = number of iterations for bdnz.  */
 235
 236         .align  4
 237 L(pagecross_loop):
 238         /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2
 239            and if *s1 is '\0'.  */
 240         lbz     r9, 0(r3)
 241         lbz     r10, 0(r4)
 242         addi    r3, r3, 1
 243         addi    r4, r4, 1
 244         cmplw   cr7, r9, r10
 245         cmpdi   cr5, r9, r0   /* NOTE(review): cmpdi takes an immediate; r0 presumably expands to the literal 0 here -- confirm against sysdep.h.  */
 246         bne     cr7, L(pagecross_ne)
 247         beq     cr5, L(pagecross_nullfound)
 248         bdnz    L(pagecross_loop)
 249         b       L(align)   /* Count exhausted: r3 has reached r7, continue with the vector code.  */
 250
 251         .align  4
 252 L(pagecross_ne):
 253         extsw   r3, r9   /* r3 = s1 byte, r9 = s2 byte.  */
 254         mr      r9, r10
 255 L(pagecross_retdiff):
 256         subf    r9, r9, r3   /* r9 = r3 - r9.  */
 257         extsw   r3, r9
 258         blr
 259
 260         .align  4
 261 L(pagecross_nullfound):
 262         li      r3, 0   /* Bytes matched and *s1 compared equal to zero; r9 is that byte, so the shared tail returns r3 - r9.  */
 263         b       L(pagecross_retdiff)
 264 END (STRCMP)
 265 libc_hidden_builtin_def (strcmp)
 266 #else
 267 #include <sysdeps/powerpc/powerpc64/power8/strcmp.S>
 268 #endif