sysdeps/powerpc/powerpc64/le/power9/strncmp.S

   1 /* Optimized strncmp implementation for PowerPC64/POWER9.
   2    Copyright (C) 2016-2018 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18 #include <sysdep.h>
  19
  20 /* Implements the function
  21
  22    int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
  23
  24    The implementation uses unaligned doubleword access to avoid specialized
  25    code paths depending on data alignment for the first 32 bytes and uses
  26    vectorised loops after that.  */
  27
  28 #ifndef STRNCMP
  29 # define STRNCMP strncmp
  30 #endif
  31
  32 /* TODO: Change this to actual instructions when minimum binutils is upgraded
  33    to 2.27.  Macros are defined below for these newer instructions in order
  34    to maintain compatibility.  */
  35 #define VCTZLSBB(r,v) .long (0x10010602 | ((r)<<(32-11)) | ((v)<<(32-21)))  /* Raw word for vctzlsbb: r (target GPR number) is shifted to bit 21, v (source vector number) to bit 11.  */
  36
  37 #define VEXTUBRX(t,a,b) .long (0x1000070d \
  38                                 | ((t)<<(32-11))  \
  39                                 | ((a)<<(32-16))  \
  40                                 | ((b)<<(32-21)) )  /* Raw word for vextubrx: t = target GPR, a = GPR holding the byte index, b = source vector (register numbers).  */
  41
  42 #define VCMPNEZB(t,a,b) .long (0x10000507 \
  43                                 | ((t)<<(32-11))  \
  44                                 | ((a)<<(32-16))  \
  45                                 | ((b)<<(32-21)) )  /* Raw word for the record form (vcmpnezb.): every use in this file is followed by a branch on cr6.  */
  46
  47 /* Get 16 bytes for unaligned case.
  48    reg1: Vector to hold next 16 bytes.
  49    reg2: Address to read from.
  50    reg3: Permute control vector.  Reads v0, v2, r5, r11; writes v8, v9, r6, cr6.  */
  51 #define GET16BYTES(reg1, reg2, reg3) \
  52         lvx     reg1, 0, reg2; /* Quadword that contains address reg2.  */ \
  53         vperm   v8, v2, reg1, reg3; /* Each byte of v8 comes from v2 or reg1.  */ \
  54         vcmpequb.       v8, v0, v8; /* Compare every byte of v8 with v0.  */ \
  55         beq     cr6, 1f; /* No byte matched v0: the next quadword may be needed.  */ \
  56         vspltisb        v9, 0; /* Otherwise do not load any further, use zeros.  */ \
  57         b       2f; \
  58         .align 4; \
  59 1: \
  60         cmpld   cr6, r5, r11; /* r5 is a 64-bit count, so compare all 64 bits.  */ \
  61         ble     cr6, 2f; /* At most r11 bytes left: skip the load, v9 is unchanged.  */ \
  62         addi    r6, reg2, 16; \
  63         lvx     v9, 0, r6; /* Quadword that follows the first one.  */ \
  64 2: \
  65         vperm   reg1, v9, reg1, reg3; /* Merge both quadwords into reg1.  */
  66
  67 /* TODO: change this to .machine power9 when minimum binutils
  68    is upgraded to 2.27.  */
  69         .machine  power7
  70 ENTRY_TOCLESS (STRNCMP, 4)
  71         /* Check if size is 0.  */
  72         cmpdi   cr0, r5, 0
  73         beq     cr0, L(ret0)
  74         li      r0, 0           /* NOTE(review): r0 is not read again in this file; confirm it is needed.  */
  75
  76         /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
  77            the code:
  78
  79             (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
  80
  81            with PAGE_SIZE being 4096 and ITER_SIZE being 32.  */
  82         rldicl  r8, r3, 0, 52   /* r8 = s1 % 4096; L(pagecross) reuses this value.  */
  83         cmpldi  cr7, r8, 4096-32
  84         bgt     cr7, L(pagecross)
  85         rldicl  r9, r4, 0, 52   /* r9 = s2 % 4096.  */
  86         cmpldi  cr7, r9, 4096-32
  87         bgt     cr7, L(pagecross)
  88
  89         /* For short strings up to 32 bytes, load both s1 and s2 using
  90            unaligned dwords and compare.  */
  91
  92         ld      r7, 0(r3)       /* r7 = bytes 0-7 of s1.  */
  93         ld      r9, 0(r4)       /* r9 = bytes 0-7 of s2.  */
  94         li      r8, 0
  95         cmpb    r8, r7, r8      /* r8 = 0xFF in every byte where s1 is null.  */
  96         cmpb    r6, r7, r9      /* r6 = 0xFF in every byte where s1 == s2.  */
  97         orc.    r8, r8, r6      /* r8 = null bytes | mismatching bytes.  */
  98         bne     cr0, L(different1)
  99
 100         /* If the strings compared are equal, but size is less or equal
 101            to 8, return 0.  */
 102         cmpldi  cr7, r5, 8
 103         li      r9, 0
 104         ble     cr7, L(ret1)
 105         addi    r5, r5, -8
 106
 107         ld      r7, 8(r3)       /* Bytes 8-15 of s1 and s2.  */
 108         ld      r9, 8(r4)
 109         cmpb    r8, r7, r8      /* r8 is still 0 here: the bne above was not taken.  */
 110         cmpb    r6, r7, r9
 111         orc.    r8, r8, r6
 112         bne     cr0, L(different1)
 113         cmpldi  cr7, r5, 8
 114         mr      r9, r8          /* r8 is 0 here, so the value returned via L(ret1) is 0.  */
 115         ble     cr7, L(ret1)
 116         /* Update pointers and size.  */
 117         addi    r5, r5, -8
 118         addi    r3, r3, 16
 119         addi    r4, r4, 16
 120
 121         ld      r7, 0(r3)       /* Bytes 16-23 of the original s1 and s2.  */
 122         ld      r9, 0(r4)
 123         li      r8, 0
 124         cmpb    r8, r7, r8
 125         cmpb    r6, r7, r9
 126         orc.    r8, r8, r6
 127         bne     cr0, L(different1)
 128         cmpldi  cr7, r5, 8
 129         li      r9, 0
 130         ble     cr7, L(ret1)
 131         addi    r5, r5, -8
 132
 133         ld      r7, 8(r3)       /* Bytes 24-31 of the original s1 and s2.  */
 134         ld      r9, 8(r4)
 135         cmpb    r8, r7, r8      /* r8 is 0 here, as in the second doubleword above.  */
 136         cmpb    r6, r7, r9
 137         orc.    r8, r8, r6
 138         bne     cr0, L(different1)
 139         cmpldi  cr7, r5, 8
 140         mr      r9, r8
 141         ble     cr7, L(ret1)
 142
 143         /* Update pointers and size.  */
 144         addi    r5, r5, -8
 145         addi    r3, r3, 16
 146         addi    r4, r4, 16
 147 L(align):   /* Also entered from L(pagecross_loop1) once its byte loop is done.  */
 148         /* Now it has checked for first 32 bytes, align source1 to quadword
 149            and adjust source2 address.  */
 150         vspltisb        v0, 0    /* v0 = all bytes 0; GET16BYTES compares against it.  */
 151         vspltisb        v2, -1   /* v2 = all bytes 0xFF; GET16BYTES permutes it in.  */
 152         or      r6, r4, r3
 153         andi.   r6, r6, 0xF
 154         beq     cr0, L(aligned)  /* Low 4 bits of both pointers are 0.  */
 155         lvsr    v6, 0, r4   /* Compute mask (permute control for s2).  */
 156         clrldi  r6, r4, 60       /* r6 = s2 & 15.  */
 157         subfic  r11, r6, 16      /* r11 = 16 - (s2 & 15); GET16BYTES reads it.  */
 158         andi.   r6, r3, 0xF
 159         beq     cr0, L(s1_align) /* s1 is already 16-byte aligned.  */
 160         /* Both s1 and s2 are unaligned.  */
 161         GET16BYTES(v5, r4, v6)   /* v5 = 16 bytes of s2.  */
 162         lvsr    v10, 0, r3   /* Compute mask (permute control for s1).  */
 163         clrldi  r6, r3, 60
 164         subfic  r11, r6, 16      /* r11 = 16 - (s1 & 15); L(match) advances by it.  */
 165         GET16BYTES(v4, r3, v10)  /* v4 = 16 bytes of s1.  */
 166         VCMPNEZB(v7, v5, v4)     /* v7 drives VCTZLSBB in L(different); cr6 is tested below.  */
 167         beq     cr6, L(match)
 168         b       L(different)
 169
 170         /* Align s1 to qw and adjust s2 address.  */
 171         .align  4
 172 L(match):
 173         cmpldi  cr7, r5, 16
 174         ble     cr7, L(ret0)     /* The 16 bytes just compared covered all that was left.  */
 175         subf    r5, r11, r5      /* r5 -= r11, where r11 = 16 - (s1 & 15).  */
 176         add     r3, r3, r11      /* s1 is now 16-byte aligned.  */
 177         add     r4, r4, r11
 178         andi.   r11, r4, 0xF
 179         beq     cr0, L(aligned)  /* s2 ended up aligned as well.  */
 180         lvsr    v6, 0, r4        /* New permute control for the advanced s2.  */
 181         clrldi  r6, r4, 60
 182         subfic  r11, r6, 16      /* r11 = 16 - (s2 & 15); falls through to L(s1_align).  */
 183         /* There are 2 loops depending on the input alignment.  Each loop
 184            gets 16 bytes from s1 and s2, checks for null and compares them.
 185            Loops until a mismatch or null occurs or the size is exhausted.  */
 186 L(s1_align):   /* s1 is 16-byte aligned, s2 is not; the body is repeated 4 times.  */
 187         lvx     v4, 0, r3        /* v4 = 16 bytes of s1.  */
 188         GET16BYTES(v5, r4, v6)   /* v5 = 16 bytes of s2, using mask v6 and r11.  */
 189         VCMPNEZB(v7, v5, v4)
 190         bne     cr6, L(different)
 191         cmpldi  cr7, r5, 16
 192         ble     cr7, L(ret0)     /* No more than 16 bytes were left: equal.  */
 193         addi    r5, r5, -16
 194         addi    r3, r3, 16
 195         addi    r4, r4, 16
 196
 197         lvx     v4, 0, r3
 198         GET16BYTES(v5, r4, v6)
 199         VCMPNEZB(v7, v5, v4)
 200         bne     cr6, L(different)
 201         cmpldi  cr7, r5, 16
 202         ble     cr7, L(ret0)
 203         addi    r5, r5, -16
 204         addi    r3, r3, 16
 205         addi    r4, r4, 16
 206
 207         lvx     v4, 0, r3
 208         GET16BYTES(v5, r4, v6)
 209         VCMPNEZB(v7, v5, v4)
 210         bne     cr6, L(different)
 211         cmpldi  cr7, r5, 16
 212         ble     cr7, L(ret0)
 213         addi    r5, r5, -16
 214         addi    r3, r3, 16
 215         addi    r4, r4, 16
 216
 217         lvx     v4, 0, r3
 218         GET16BYTES(v5, r4, v6)
 219         VCMPNEZB(v7, v5, v4)
 220         bne     cr6, L(different)
 221         cmpldi  cr7, r5, 16
 222         ble     cr7, L(ret0)
 223         addi    r5, r5, -16
 224         addi    r3, r3, 16
 225         addi    r4, r4, 16
 226         b       L(s1_align)
 227         .align  4
 228 L(aligned):   /* Entered when the low 4 bits of both pointers are 0; the body is repeated 4 times.  */
 229         lvx     v4, 0, r3        /* v4 = 16 bytes of s1.  */
 230         lvx     v5, 0, r4        /* v5 = 16 bytes of s2.  */
 231         VCMPNEZB(v7, v5, v4)
 232         bne     cr6, L(different)
 233         cmpldi  cr7, r5, 16
 234         ble     cr7, L(ret0)     /* No more than 16 bytes were left: equal.  */
 235         addi    r5, r5, -16
 236         addi    r3, r3, 16
 237         addi    r4, r4, 16
 238
 239         lvx     v4, 0, r3
 240         lvx     v5, 0, r4
 241         VCMPNEZB(v7, v5, v4)
 242         bne     cr6, L(different)
 243         cmpldi  cr7, r5, 16
 244         ble     cr7, L(ret0)
 245         addi    r5, r5, -16
 246         addi    r3, r3, 16
 247         addi    r4, r4, 16
 248
 249         lvx     v4, 0, r3
 250         lvx     v5, 0, r4
 251         VCMPNEZB(v7, v5, v4)
 252         bne     cr6, L(different)
 253         cmpldi  cr7, r5, 16
 254         ble     cr7, L(ret0)
 255         addi    r5, r5, -16
 256         addi    r3, r3, 16
 257         addi    r4, r4, 16
 258
 259         lvx     v4, 0, r3
 260         lvx     v5, 0, r4
 261         VCMPNEZB(v7, v5, v4)
 262         bne     cr6, L(different)
 263         cmpldi  cr7, r5, 16
 264         ble     cr7, L(ret0)
 265         addi    r5, r5, -16
 266         addi    r3, r3, 16
 267         addi    r4, r4, 16
 268         b       L(aligned)
 269         /* Calculate and return the difference.  v7 is the VCMPNEZB result for v4 (s1) and v5 (s2).  */
 270 L(different):
 271         VCTZLSBB(r6, v7)         /* r6 = index of the first flagged byte.  */
 272         cmpld   cr7, r5, r6      /* r5 is a 64-bit count, so compare all 64 bits.  */
 273         ble     cr7, L(ret0)     /* The flagged byte is past the size limit: equal.  */
 274         VEXTUBRX(r5, r6, v4)     /* r5 = byte r6 of s1.  */
 275         VEXTUBRX(r4, r6, v5)     /* r4 = byte r6 of s2.  */
 276         subf    r3, r4, r5       /* r3 = r5 - r4.  */
 277         extsw   r3, r3
 278         blr
 279
 280         .align 4
 281 L(ret0):   /* Return 0.  */
 282         li      r9, 0
 283 L(ret1):   /* Return the value the branching code left in r9.  */
 284         mr      r3, r9
 285         blr
 286
 287         /* A mismatch or null byte was found in the doublewords r7 (s1) and
 288            r9 (s2); r8 has 0xFF in every such byte.  Pick the first one, but
 289            never look past the last byte allowed by n (r5 = bytes left):
 290           idx = (__builtin_ffsl (z1) - 1) / 8;
 291           idx = idx > n - 1 ? n - 1 : idx;
 292           r1 = (r1 >> (idx * 8)) & 0xFFUL;
 293           r2 = (r2 >> (idx * 8)) & 0xFFUL;
 294           return r1 - r2;  */
 295
 296         .align 4
 297 L(different1):
 298         neg     r11, r8
 299         addi    r5, r5, -1      /* r5 = n - 1; n is at least 1 here.  */
 300         and     r8, r11, r8     /* Isolate the lowest set bit of r8.  */
 301         cntlzd  r8, r8
 302         subfic  r8, r8, 63      /* r8 = index of that bit, a multiple of 8.  */
 303         /* Clamp on byte indexes: scaling n by 8 would drop its top bits.  */
 304         srdi    r8, r8, 3       /* r8 = byte index, 0..7.  */
 305         cmpld   cr7, r8, r5
 306         ble     cr7, L(different2)
 307         mr      r8, r5          /* Taken only when n - 1 < 7.  */
 308 L(different2):
 309         sldi    r8, r8, 3       /* Byte index back to a shift count.  */
 310         srd     r7, r7, r8
 311         srd     r9, r9, r8
 312         rldicl  r3, r7, 0, 56   /* r3 = selected byte of s1.  */
 313         rldicl  r9, r9, 0, 56   /* r9 = selected byte of s2.  */
 314         subf    r9, r9, r3      /* r9 = r3 - r9.  */
 315         extsw   r9, r9
 316         mr      r3, r9
 317         blr
 318
 319         /* If unaligned 16 bytes reads across a 4K page boundary, it uses
 320            a simple byte-by-byte comparison until the page alignment for s1
 321            is reached.  */
 322         .align 4
 323 L(pagecross):
 324         lbz     r7, 0(r3)        /* r7 = first byte of s1.  */
 325         lbz     r9, 0(r4)        /* r9 = first byte of s2.  */
 326         subfic  r8, r8,4095      /* r8 = 4095 - (s1 % 4096); r8 was set at function entry.  */
 327         cmplw   cr7, r9, r7
 328         bne     cr7, L(byte_ne_3)
 329         cmpdi   cr7, r9, 0
 330         beq     cr7, L(byte_ne_0)
 331         addi    r5, r5, -1       /* One byte has been compared.  */
 332         subf    r7, r8, r5       /* r7 = r5 - r8: count left once the byte loop is done.  */
 333         subf    r9, r7, r5       /* r9 = r5 - r7 = r8.  */
 334         addi    r9, r9, 1
 335         mtctr   r9               /* CTR = r8 + 1.  */
 336         b       L(pagecross_loop1)
 337
 338         .align 4
 339 L(pagecross_loop0):
 340         beq     cr7, L(ret0)     /* cr7 comes from the r5 test in L(pagecross_loop1).  */
 341         lbz     r9, 0(r3)        /* r9 = byte of s1.  */
 342         lbz     r8, 0(r4)        /* r8 = byte of s2.  */
 343         addi    r5, r5, -1
 344         cmplw   cr7, r9, r8
 345         cmpdi   cr5, r9, 0
 346         bne     cr7, L(byte_ne_2)
 347         beq     cr5, L(byte_ne_0)
 348 L(pagecross_loop1):
 349         cmpdi   cr7, r5, 0       /* Tested at the top of L(pagecross_loop0).  */
 350         addi    r3, r3, 1
 351         addi    r4, r4, 1
 352         bdnz    L(pagecross_loop0)
 353         cmpdi   cr7, r7, 0       /* r7 was computed before the loop and is not changed in it.  */
 354         li      r9, 0
 355         bne+    cr7, L(align)    /* Bytes remain: continue with the vector code.  */
 356         b       L(ret1)          /* r9 is 0: return 0.  */
 357
 358         .align 4
 359 L(byte_ne_0):   /* Entered with r9 == 0 (both bytes null); the result is 0 - r9.  */
 360         li      r7, 0
 361 L(byte_ne_1):   /* Return r7 - r9 through L(ret1).  */
 362         subf    r9, r9, r7
 363         extsw   r9, r9
 364         b       L(ret1)
 365
 366         .align 4
 367 L(byte_ne_2):   /* From L(pagecross_loop0): r9 = byte of s1, r8 = byte of s2.  */
 368         extsw   r7, r9
 369         mr      r9, r8
 370         b       L(byte_ne_1)
 371 L(byte_ne_3):   /* From L(pagecross): r7 = byte of s1, r9 = byte of s2.  */
 372         extsw   r7, r7
 373         b       L(byte_ne_1)
 374 END(STRNCMP)
 375 libc_hidden_builtin_def(strncmp)