/* Subroutines used to expand string and block move, clear,
   compare and other operations for PowerPC.
   Copyright (C) 1991-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp = (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size clear, just call memset.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (bytes >= 16 && TARGET_ALTIVEC
	  && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* clear 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* clear 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* clear 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
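/* For example, assuming TARGET_ALTIVEC and a 16-byte-aligned, 20-byte
   block (align of 128 bits), the loop above should emit two stores: a
   16-byte V4SImode store at offset 0, then a 4-byte SImode store at
   offset 16, since the remaining 4 bytes still satisfy align >= 32.  */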
/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  switch (GET_MODE (reg))
    {
    case E_DImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqidi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhidi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (SImode);
		emit_insn (gen_bswapsi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendsidi2 (reg, src));
	    break;
	  }
	case E_DImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapdi2 (reg, mem));
	  else
	    emit_insn (gen_movdi (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;
    case E_SImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqisi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhisi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapsi2 (reg, mem));
	  else
	    emit_insn (gen_movsi (reg, mem));
	  break;
	case E_DImode:
	  /* DImode is larger than the destination reg so is not expected.  */
	  gcc_unreachable ();
	  break;
	default:
	  gcc_unreachable ();
	}
      break;
    default:
      gcc_unreachable ();
      break;
    }
}
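/* Note that on little-endian targets the bswap insns emitted above are
   normally matched by the byte-reversed load patterns (lhbrx, lwbrx,
   ldbrx where available), so the loaded value always ends up in
   big-endian byte order.  That is what makes a plain unsigned compare
   of the two registers equivalent to memcmp's bytewise ordering.  */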
/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
   the largest allowable mode.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
			   unsigned HOST_WIDE_INT bytes,
			   unsigned HOST_WIDE_INT align, bool word_mode_ok)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
	   && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case where we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
	   && offset >= UNITS_PER_WORD - bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* Final fallback is to do one byte.  */
  return QImode;
}
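/* For example, select_block_compare_mode (1, 3, 1, true) returns SImode:
   with 3 bytes left at offset 1, the "bytes < 4 && offset >= 1" case
   fires, and the caller can back the 4-byte load up one byte so it
   overlaps the previous read instead of shifting off unwanted bytes.  */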
/* Compute the alignment of pointer+OFFSET where the original alignment
   of pointer was BASE_ALIGN.  */
static unsigned HOST_WIDE_INT
compute_current_alignment (unsigned HOST_WIDE_INT base_align,
			   unsigned HOST_WIDE_INT offset)
{
  if (offset == 0)
    return base_align;
  return MIN (base_align, offset & -offset);
}
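/* For example, compute_current_alignment (8, 12) returns 4:
   12 & -12 == 4, since offset & -offset isolates the lowest set bit
   of the offset, i.e. the largest power of two that divides it.  */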
/* Prepare address and then do a load.

   MODE is the mode to use for the load.
   DEST is the destination register for the data.
   ADDR is the address to be loaded.
   ORIG_ADDR is the original address expression.  */
static void
do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
			       rtx orig_addr)
{
  rtx mem = gen_rtx_MEM (mode, addr);
  MEM_COPY_ATTRIBUTES (mem, orig_addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));
  do_load_for_compare (dest, mem, mode);
  return;
}
/* Do a branch for an if/else decision.

   CMPMODE is the mode to use for the comparison.
   COMPARISON is the rtx code for the compare needed.
   A is the first thing to be compared.
   B is the second thing to be compared.
   CR is the condition code reg input, or NULL_RTX.
   TRUE_LABEL is the label to branch to if the condition is true.

   If CR is null_rtx, then a new register of CMPMODE is generated.
   If A and B are both null_rtx, then CR must not be null, and the
   compare is not generated so you can use this with a dot form insn.  */

static void
do_ifelse (machine_mode cmpmode, rtx_code comparison,
	   rtx a, rtx b, rtx cr, rtx true_label)
{
  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
	      || (a != NULL_RTX && b != NULL_RTX));

  if (cr != NULL_RTX)
    gcc_assert (GET_MODE (cr) == cmpmode);
  else
    cr = gen_reg_rtx (cmpmode);

  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);

  if (a != NULL_RTX)
    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));

  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  JUMP_LABEL (j) = true_label;
  LABEL_NUSES (true_label) += 1;
}
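/* A typical use, as seen below, is
   do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
	      NULL_RTX, library_call_label);
   which emits both the compare and the conditional branch.  Passing
   NULL_RTX for A and B with a non-null CR instead reuses a condition
   already set, e.g. by a preceding record-form (dot) instruction.  */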
/* Emit an isel of the proper mode for DEST.

   DEST is the isel destination register.
   CMP is the comparison rtx for the isel condition.
   SRC_T is the isel source if the condition is true.
   SRC_F is the isel source if the condition is false.
   CR is the condition code reg for the isel.  */
static void
do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
  else
    emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
}
/* Emit a subtract of the proper mode for DEST.

   DEST is the destination register for the subtract.
   SRC1 is the first subtract input.
   SRC2 is the second subtract input.

   Computes DEST = SRC1 - SRC2.  */
static void
do_sub3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_subdi3 (dest, src1, src2));
  else
    emit_insn (gen_subsi3 (dest, src1, src2));
}

/* Emit an add of the proper mode for DEST.

   DEST is the destination register for the add.
   SRC1 is the first add input.
   SRC2 is the second add input.

   Computes DEST = SRC1 + SRC2.  */
static void
do_add3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_adddi3 (dest, src1, src2));
  else
    emit_insn (gen_addsi3 (dest, src1, src2));
}
/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem,
		      rtx dcond, rtx src1_addr, rtx src2_addr,
		      rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);

  if (word_mode == DImode)
    {
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
			      gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
			      gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
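/* For example, with word_mode DImode and cmp_rem holding 3,
   shift_amount becomes (8 - 3) * 8 = 40 bits, so the logical right
   shifts discard the five low-order bytes that lie past the remainder,
   leaving just the three interesting bytes (already in big-endian
   order courtesy of do_load_for_compare) to subtract or compare.  */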
/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
			 HOST_WIDE_INT bytes_rem, rtx diff,
			 rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
			 rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
	{
	  rtx reg_lms = gen_reg_rtx (word_mode);
	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
	  do_sub3 (adj_reg, cmp_rem, reg_lms);
	}

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      addr1 = src1_addr;
      addr2 = src2_addr;
    }

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
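/* For example, with an 8-byte load_mode and a constant bytes_rem of 5,
   addr_adj is 3, so both loads are backed up 3 bytes and end exactly
   at the end of the block; the 3 re-read bytes are already known
   equal, so they cannot affect the comparison result.  */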
/* Expand a block compare operation using loop code, and return true
   if successful.  Return false if we should let the compiler generate
   normal code, probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_compare_loop (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* Allow non-const length.  */
  int bytes_is_const = CONST_INT_P (bytes_rtx);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
  HOST_WIDE_INT minalign = MIN (align1, align2);

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move? */
  HOST_WIDE_INT bytes = 0;
  if (bytes_is_const)
    bytes = INTVAL (bytes_rtx);

  if (bytes_is_const && bytes == 0)
    return true;

  /* Limit the amount we compare, if known statically.  */
  HOST_WIDE_INT max_bytes;
  switch (rs6000_tune)
    {
    case PROCESSOR_POWER7:
      if (!bytes_is_const)
	if (minalign < 8)
	  max_bytes = 0;
	else
	  max_bytes = 128;
      else
	if (minalign < 8)
	  max_bytes = 32;
	else
	  max_bytes = 128;
      break;
    case PROCESSOR_POWER8:
      if (!bytes_is_const)
	max_bytes = 0;
      else
	if (minalign < 8)
	  max_bytes = 128;
	else
	  max_bytes = 64;
      break;
    case PROCESSOR_POWER9:
      if (bytes_is_const)
	max_bytes = 191;
      else
	max_bytes = 0;
      break;
    default:
      max_bytes = 128;
    }

  /* Allow the option to override the default.  */
  if (rs6000_block_compare_inline_loop_limit >= 0)
    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;

  if (max_bytes == 0)
    return false;

  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
  HOST_WIDE_INT niter;
  rtx iter = gen_reg_rtx (word_mode);
  rtx iv1 = gen_reg_rtx (word_mode);
  rtx iv2 = gen_reg_rtx (word_mode);
  rtx d1_1 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv1.  */
  rtx d1_2 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv2.  */
  rtx d2_1 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv1.  */
  rtx d2_2 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv2.  */

  /* Strip unneeded subreg from length if there is one.  */
  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
    bytes_rtx = SUBREG_REG (bytes_rtx);
  /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
     maybe have to deal with the case where bytes_rtx is SImode and
     word_mode is DImode.  */
  if (!bytes_is_const)
    {
      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
	/* Do not expect length longer than word_mode.  */
	return false;
      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
	{
	  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
	  bytes_rtx = force_reg (word_mode,
				 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
						bytes_rtx));
	}
      else
	/* Make sure it's in a register before we get started.  */
	bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
    }

  machine_mode load_mode = word_mode;
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);

  /* Number of bytes per iteration of the unrolled loop.  */
  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
  /* max iters and bytes compared in the loop.  */
  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
  int l2lb = floor_log2 (loop_bytes);

  if (bytes_is_const && (max_bytes < load_mode_size
			 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
    return false;

  bool no_remainder_code = false;
  rtx final_label = gen_label_rtx ();
  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
  rtx diff_label = gen_label_rtx ();
  rtx library_call_label = NULL;
  rtx cleanup_label = gen_label_rtx ();

  rtx cr;

  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));

  /* Difference found is stored here before jump to diff_label.  */
  rtx diff = gen_reg_rtx (word_mode);
  rtx j;

  /* Example of generated code for 35 bytes aligned 1 byte.

	     mtctr 8
	     li 6,0
	     li 5,8
     .L13:
	     ldbrx 7,3,6
	     ldbrx 9,10,6
	     ldbrx 0,3,5
	     ldbrx 4,10,5
	     addi 6,6,16
	     addi 5,5,16
	     subfc. 9,9,7
	     bne 0,.L10
	     subfc. 9,4,0
	     bdnzt 2,.L13
	     bne 0,.L10
	     add 3,3,6
	     add 10,10,6
	     addi 9,3,-5
	     ldbrx 7,0,9
	     addi 9,10,-5
	     ldbrx 9,0,9
	     subfc 9,9,7
	     .p2align 4,,15
     .L10:
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10

     Compiled with -fno-reorder-blocks for clarity.  */
  /* Structure of what we're going to do:
     Two separate lengths: what we will compare before bailing to library
	call (max_bytes), and the total length to be checked.
     if length <= 16, branch to linear cleanup code starting with
	remainder length check (length not known at compile time)
     set up 2 iv's and load count reg, compute remainder length
     unrollx2 compare loop
     if loop exit due to a difference, branch to difference handling code
     if remainder length < 8, branch to final cleanup compare
     load and compare 8B
     final cleanup comparison (depends on alignment and length)
	load 8B, shift off bytes past length, compare
	load 8B ending at last byte and compare
	load/compare 1 byte at a time (short block abutting 4k boundary)
     difference handling, 64->32 conversion
     final result
     branch around memcmp call
     memcmp library call
  */

  /* If bytes is not const, compare length and branch directly
     to the cleanup code that can handle 0-16 bytes if length
     is >= 16.  Stash away bytes-max_bytes for the library call.  */
  if (bytes_is_const)
    {
      /* These need to be set for some of the places we may jump to.  */
      if (bytes > max_bytes)
	{
	  no_remainder_code = true;
	  niter = max_loop_iter;
	  library_call_label = gen_label_rtx ();
	}
      else
	{
	  niter = bytes / loop_bytes;
	}
      emit_move_insn (iter, GEN_INT (niter));
      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
    }
  else
    {
      library_call_label = gen_label_rtx ();

      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
      emit_move_insn (cmp_rem, bytes_rtx);

      /* Check for > max_bytes bytes.  We want to bail out as quickly as
	 possible if we have to go over to memcmp.  */
      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
		 NULL_RTX, library_call_label);

      /* Check for < loop_bytes bytes.  */
      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
		 NULL_RTX, cleanup_label);

      /* Loop compare bytes and iterations if bytes>max_bytes.  */
      rtx mb_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
      rtx mi_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));

      /* Compute number of loop iterations if bytes <= max_bytes.  */
      if (word_mode == DImode)
	emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
      else
	emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));

      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
      if (word_mode == DImode)
	{
	  emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
	}
      else
	{
	  emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
	}

      /* Check for bytes <= max_bytes.  */
      if (TARGET_ISEL)
	{
	  /* P9 has fast isel so we use one compare and two isel.  */
	  cr = gen_reg_rtx (CCmode);
	  rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
					     GEN_INT (max_bytes));
	  emit_move_insn (cr, compare_rtx);
	  rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
	  do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
	  do_isel (iter, cmp_rtx, iter, mi_reg, cr);
	}
      else
	{
	  rtx lab_after = gen_label_rtx ();
	  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
		     NULL_RTX, lab_after);
	  emit_move_insn (loop_cmp, mb_reg);
	  emit_move_insn (iter, mi_reg);
	  emit_label (lab_after);
	}

      /* Now compute remainder bytes which isn't used until after the loop.  */
      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
    }
  rtx dcond = NULL_RTX;  /* Used for when we jump to diff_label.  */
  /* For p9 we need to have just one of these as multiple places define
     it and it gets used by the setb at the end.  */
  if (TARGET_P9_MISC)
    dcond = gen_reg_rtx (CCUNSmode);

  if (!bytes_is_const || bytes >= loop_bytes)
    {
      /* It should not be possible to come here if remaining bytes is
	 < 16 in the runtime case either.  Compute number of loop
	 iterations.  We compare 2*word_mode per iteration so 16B for
	 64-bit code and 8B for 32-bit.  Set up two induction
	 variables and load count register.  */

      /* HACK ALERT: create hard reg for CTR here.  If we just use a
	 pseudo, cse will get rid of it and then the allocator will
	 see it used in the lshr above and won't give us ctr.  */
      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
      emit_move_insn (ctr, iter);
      emit_move_insn (diff, GEN_INT (0));
      emit_move_insn (iv1, GEN_INT (0));
      emit_move_insn (iv2, GEN_INT (load_mode_size));

      /* inner loop to compare 2*word_mode */
      rtx loop_top_label = gen_label_rtx ();
      emit_label (loop_top_label);

      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);

      do_load_for_compare_from_addr (load_mode, d1_1,
				     src1_ix1, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_1,
				     src2_ix1, orig_src2);
      do_add3 (iv1, iv1, GEN_INT (loop_bytes));

      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);

      do_load_for_compare_from_addr (load_mode, d1_2,
				     src1_ix2, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_2,
				     src2_ix2, orig_src2);
      do_add3 (iv2, iv2, GEN_INT (loop_bytes));

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	}

      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		 dcond, diff_label);

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	}

      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
      if (TARGET_64BIT)
	j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      else
	j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      JUMP_LABEL (j) = loop_top_label;
      LABEL_NUSES (loop_top_label) += 1;
    }
  HOST_WIDE_INT bytes_remaining = 0;
  if (bytes_is_const)
    bytes_remaining = (bytes % loop_bytes);

  /* If diff is nonzero, branch to difference handling
     code.  If we exit here with a nonzero diff, it is
     because the second word differed.  */
  if (TARGET_P9_MISC)
    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, diff_label);
  else
    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, diff_label);

  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
    {
      /* If the length is known at compile time, then we will always
	 have a remainder to go to the library call with.  */
      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
      JUMP_LABEL (j) = library_call_label;
      LABEL_NUSES (library_call_label) += 1;
      emit_barrier ();
    }

  if (bytes_is_const && bytes_remaining == 0)
    {
      /* No remainder and if we are here then diff is 0 so just return 0.  */
      if (TARGET_64BIT)
	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
      else
	emit_move_insn (target, diff);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }
  else if (!no_remainder_code)
    {
      /* Update addresses to point to the next word to examine.  */
      do_add3 (src1_addr, src1_addr, iv1);
      do_add3 (src2_addr, src2_addr, iv1);
    }

  emit_label (cleanup_label);

  if (!bytes_is_const)
    {
      /* If we're dealing with runtime length, we have to check if
	 it's zero after the loop.  When length is known at compile
	 time the no-remainder condition is dealt with above.  By
	 doing this after cleanup_label, we also deal with the
	 case where length is 0 at the start and we bypass the
	 loop with a branch to cleanup_label.  */
      emit_move_insn (target, const0_rtx);
      do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
		 NULL_RTX, final_label);
    }

  rtx final_cleanup = gen_label_rtx ();
  rtx cmp_rem_before = gen_reg_rtx (word_mode);
  /* Compare one more word_mode chunk if needed.  */
  if (!bytes_is_const || bytes_remaining >= load_mode_size)
    {
      /* If remainder length < word length, branch to final
	 cleanup compare.  */
      if (!bytes_is_const)
	do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
		   NULL_RTX, final_cleanup);

      /* load and compare 8B */
      do_load_for_compare_from_addr (load_mode, d1_1,
				     src1_addr, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_1,
				     src2_addr, orig_src2);

      /* Compare the word, see if we need to do the last partial.  */
      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	}

      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		 dcond, diff_label);

      do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
      do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
      emit_move_insn (cmp_rem_before, cmp_rem);
      do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
      if (bytes_is_const)
	bytes_remaining -= load_mode_size;
      else
	/* See if remaining length is now zero.  We previously set
	   target to 0 so we can just jump to the end.  */
	do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
		   NULL_RTX, final_label);
    }
  /* Cases:
     bytes_is_const
       We can always shift back to do an overlapping compare
       of the last chunk because we know length >= 8.

     !bytes_is_const
       align>=load_mode_size
	 Read word_mode and mask
       align<load_mode_size
	 avoid stepping past end

    Three strategies:
    * decrement address and do overlapping compare
    * read word_mode and mask
    * carefully avoid crossing 4k boundary
   */

  if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
      && align1 >= load_mode_size && align2 >= load_mode_size)
    {
      /* Alignment is larger than word_mode so we do not need to be
	 concerned with extra page crossings.  But, we do not know
	 that the length is larger than load_mode_size so we might
	 end up comparing against data before the block if we try
	 an overlapping compare.  Also we use this on P7 for fixed length
	 remainder because P7 doesn't like overlapping unaligned.
	 Strategy: load 8B, shift off bytes past length, and compare.  */
      emit_label (final_cleanup);
      do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
			    src1_addr, src2_addr, orig_src1, orig_src2);
    }
  else if (bytes_remaining && bytes_is_const)
    {
      /* We do not do loop expand if length < 32 so we know at the
	 end we can do an overlapping compare.
	 Strategy: shift address back and do word_mode load that
	 ends at the end of the block.  */
      emit_label (final_cleanup);
      do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
			       cmp_rem, dcond, src1_addr, src2_addr,
			       orig_src1, orig_src2);
    }
  else if (!bytes_is_const)
    {
      rtx handle4k_label = gen_label_rtx ();
      rtx nonconst_overlap = gen_label_rtx ();
      emit_label (nonconst_overlap);

      /* Here we have to handle the case where we have runtime
	 length which may be too short for overlap compare, and
	 alignment is not at least load_mode_size so we have to
	 tread carefully to avoid stepping across 4k boundaries.  */

      /* If the length after the loop was larger than word_mode
	 size, we can just do an overlapping compare and we're
	 done.  We fall through to this code from the word_mode
	 compare that precedes this.  */
      do_overlap_load_compare (load_mode, false, 0, diff,
			       cmp_rem, dcond, src1_addr, src2_addr,
			       orig_src1, orig_src2);

      rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
      JUMP_LABEL (j) = diff_label;
      LABEL_NUSES (diff_label) += 1;
      emit_barrier ();

      /* If we couldn't do the overlap compare we have to be more
	 careful of the 4k boundary.  Test to see if either
	 address is less than word_mode_size away from a 4k
	 boundary.  If not, then we can do a load/shift/compare
	 and we are done.  We come to this code if length was less
	 than word_mode_size.  */

      emit_label (final_cleanup);

      /* We can still avoid the slow case if the length was larger
	 than one loop iteration, in which case go do the overlap
	 load compare path.  */
      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
		 NULL_RTX, nonconst_overlap);

      rtx rem4k = gen_reg_rtx (word_mode);
      rtx dist1 = gen_reg_rtx (word_mode);
      rtx dist2 = gen_reg_rtx (word_mode);
      do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
      if (word_mode == SImode)
	emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
      else
	emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
      do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, handle4k_label);
      if (word_mode == SImode)
	emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
      else
	emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
      do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, handle4k_label);

      /* We don't have a 4k boundary to deal with, so do
	 a load/shift/compare and jump to diff.  */

      do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
			    src1_addr, src2_addr, orig_src1, orig_src2);

      j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
      JUMP_LABEL (j) = diff_label;
      LABEL_NUSES (diff_label) += 1;
      emit_barrier ();

      /* Finally in the unlikely case we are inching up to a
	 4k boundary we use a compact lbzx/compare loop to do
	 it a byte at a time.  */

      emit_label (handle4k_label);

      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
      emit_move_insn (ctr, cmp_rem);
      rtx ixreg = gen_reg_rtx (Pmode);
      emit_move_insn (ixreg, const0_rtx);

      rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
      rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
      rtx d1 = gen_reg_rtx (word_mode);
      rtx d2 = gen_reg_rtx (word_mode);

      rtx fc_loop = gen_label_rtx ();
      emit_label (fc_loop);

      do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
      do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);

      do_add3 (ixreg, ixreg, const1_rtx);

      rtx cond = gen_reg_rtx (CCmode);
      rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
      rs6000_emit_dot_insn (diff, subexpr, 2, cond);

      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
      if (TARGET_64BIT)
	j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
					   eqrtx, cond));
      else
	j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
					   eqrtx, cond));
      JUMP_LABEL (j) = fc_loop;
      LABEL_NUSES (fc_loop) += 1;

      if (TARGET_64BIT)
	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
      else
	emit_move_insn (target, diff);

      /* Since we are comparing bytes, the difference can be used
	 as the final result and we are done here.  */
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }
  emit_label (diff_label);
  /* difference handling, 64->32 conversion */

  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works:
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */

  if (TARGET_P9_MISC)
    emit_insn (gen_setb_unsigned (target, dcond));
  else
    {
      if (TARGET_64BIT)
	{
	  rtx tmp_reg_ca = gen_reg_rtx (DImode);
	  emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntddi2 (diff, diff));
	  emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
	  emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	}
      else
	{
	  rtx tmp_reg_ca = gen_reg_rtx (SImode);
	  emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntdsi2 (diff, diff));
	  emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
	}
    }

  if (library_call_label != NULL)
    {
      /* Branch around memcmp call.  */
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();

      /* Make memcmp library call.  cmp_rem is the number of bytes not
	 compared by the inline code, which is the amount memcmp is
	 expected to compare.  If we don't find a difference in the
	 loop compare, do the library call directly instead of doing a
	 small compare just to get to an arbitrary boundary before
	 calling it anyway.
	 Also, update addresses to point to the next word to examine.  */
      emit_label (library_call_label);

      rtx len_rtx = gen_reg_rtx (word_mode);
      if (bytes_is_const)
	{
	  emit_move_insn (len_rtx, cmp_rem);
	  do_add3 (src1_addr, src1_addr, iv1);
	  do_add3 (src2_addr, src2_addr, iv1);
	}
      else
	emit_move_insn (len_rtx, bytes_rtx);

      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
			       target, LCT_NORMAL, GET_MODE (target),
			       src1_addr, Pmode,
			       src2_addr, Pmode,
			       len_rtx, GET_MODE (len_rtx));
    }

  /* emit final_label */
  emit_label (final_label);
  return true;
}
/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  /* Allow this param to shut off all expansion.  */
  if (rs6000_block_compare_inline_limit == 0)
    return false;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
     However slow_unaligned_access returns true on P7 even though the
     performance of this code is good there.  */
  if (!isP7
      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
	  || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
    return false;

  /* Unaligned l*brx traps on P7 so don't do this.  However this should
     not affect much because LE isn't really supported on P7 anyway.  */
  if (isP7 && !BYTES_BIG_ENDIAN)
    return false;

  /* If this is not a fixed size compare, try generating loop code and
     if that fails just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return expand_compare_loop (operands);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move? */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);
  else
    cond = gen_reg_rtx (CCmode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  /* Strategy phase.  How many ops will this take and should we expand it?  */

  unsigned HOST_WIDE_INT offset = 0;
  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);

  /* We don't want to generate too much code.  The loop code can take
     over for lengths greater than 31 bytes.  */
  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
  if (!IN_RANGE (bytes, 1, max_bytes))
    return expand_compare_loop (operands);

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
	  || (base_align == 2 && bytes > 32)))
    return false;

  bool generate_6432_conversion = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;

  /* Example of generated code for 18 bytes aligned 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
	     ldbrx 10,31,8
	     ldbrx 9,7,8
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,8
	     addi 5,11,8
	     ldbrx 10,0,9
	     ldbrx 9,0,5
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,16
	     lhbrx 10,0,9
	     addi 9,11,16
	     lhbrx 9,0,9
	     subf 9,9,10
	     b .L6488
	     .p2align 4,,15
     .L6487: #convert_label
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10
     .L6488: #final_label
	     extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI
     conversion if the difference is found there, then a final block of
     HImode that skips the DI->SI conversion.  */
  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	load_mode = select_block_compare_mode (offset, bytes, align,
					       word_mode_ok);
      else
	load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes;
	  cmp_bytes = bytes;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
	{
	  /* Shift unneeded bytes off.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
	{
	  /* Target is larger than load size so we don't need to
	     reduce result size.  */

	  /* We previously did a block that needed 64->32 conversion but
	     the current block does not, so a label is needed to jump
	     to the end.  */
	  if (generate_6432_conversion && !final_label)
	    final_label = gen_label_rtx ();

	  if (remain > 0)
	    {
	      /* This is not the last block, branch to the end if the result
		 of this subtract is not zero.  */
	      if (!final_label)
		final_label = gen_label_rtx ();
	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
	      rtx cr = gen_reg_rtx (CCmode);
	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
	      emit_insn (gen_movsi (target,
				    gen_lowpart (SImode, tmp_reg_src2)));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 fin_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = final_label;
	      LABEL_NUSES (final_label) += 1;
	    }
	  else
	    {
	      if (word_mode == DImode)
		{
		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
					 tmp_reg_src2));
		  emit_insn (gen_movsi (target,
					gen_lowpart (SImode, tmp_reg_src2)));
		}
	      else
		emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));

	      if (final_label)
		{
		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
		  JUMP_LABEL (j) = final_label;
		  LABEL_NUSES (final_label) += 1;
		  emit_barrier ();
		}
	    }
	}
      else
	{
	  /* Do we need a 64->32 conversion block?  We need the 64->32
	     conversion even if target size == load_mode size because
	     the subtract generates one extra bit.  */
	  generate_6432_conversion = true;

	  if (remain > 0)
	    {
	      if (!convert_label)
		convert_label = gen_label_rtx ();

	      /* Compare to zero and branch to convert_label if not zero.  */
	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
	      if (TARGET_P9_MISC)
		{
		  /* Generate a compare, and convert with a setb later.  */
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		/* Generate a subfc. and use the longer
		   sequence for conversion.  */
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
		else
		  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 cvt_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = convert_label;
	      LABEL_NUSES (convert_label) += 1;
	    }
	  else
	    {
	      /* Just do the subtract/compare.  Since this is the last block
		 the convert code will be generated immediately following.  */
	      if (TARGET_P9_MISC)
		{
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
		else
		  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
	    }
	}

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (generate_6432_conversion)
    {
      if (convert_label)
	emit_label (convert_label);

      /* We need to produce DI result from sub, then convert to target SI
	 while maintaining <0 / ==0 / >0 properties.  This sequence works:
	 subfc L,A,B
	 subfe H,H,H
	 popcntd L,L
	 rldimi L,H,6,0

	 This is an alternate one Segher cooked up if somebody
	 wants to expand this for something that doesn't have popcntd:
	 subfc L,a,b
	 subfe H,x,x
	 addic t,L,-1
	 subfe v,t,L
	 or z,v,H

	 And finally, p9 can just do this:
	 cmpld A,B
	 setb r */

      if (TARGET_P9_MISC)
	{
	  emit_insn (gen_setb_unsigned (target, cond));
	}
      else
	{
	  if (TARGET_64BIT)
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (DImode);
	      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
	      emit_insn (gen_movsi (target,
				    gen_lowpart (SImode, tmp_reg_src2)));
	    }
	  else
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (SImode);
	      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
	    }
	}
    }

  if (final_label)
    emit_label (final_label);

  gcc_assert (bytes == 0);
  return true;
}
/* Generate alignment check and branch code to set up for
   strncmp when we don't have DI alignment.
   STRNCMP_LABEL is the label to branch to if there is a page crossing.
   SRC is the string pointer to be examined.
   BYTES is the max number of bytes to compare.  */
static void
expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
{
  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
  rtx src_check = copy_addr_to_reg (XEXP (src, 0));
  if (GET_MODE (src_check) == SImode)
    emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff)));
  else
    emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff)));
  rtx cond = gen_reg_rtx (CCmode);
  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check,
					 GEN_INT (4096 - bytes)));

  rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
				     lab_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  JUMP_LABEL (j) = strncmp_label;
  LABEL_NUSES (strncmp_label) += 1;
}
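/* For example, with BYTES of 16 this branches to STRNCMP_LABEL when
   (src & 0xfff) >= 4096 - 16, i.e. when the string starts within the
   last 16 bytes of a 4K page, because a full 16-byte inline read from
   there could cross into an unmapped page.  */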
/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  if (no_length)
    {
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle this case.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */
  if (no_length)
    /* Use this as a stand-in to determine the mode to use.  */
    bytes = rs6000_string_compare_inline_limit * word_mode_size;
  else
    bytes = UINTVAL (bytes_rtx);

  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
  compare_length = rs6000_string_compare_inline_limit * load_mode_size;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
	compare_length = bytes;
      else
	equality_compare_rest = true;
    }

  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;
  if (base_align < 8)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
	 rldicl r8,r3,0,52
	 cmpldi cr7,r8,4096-16
	 bgt cr7,L(pagecross) */

      /* Make sure that the length we use for the alignment test and
	 the subsequent code generation are in agreement so we do not
	 go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < 8)
	{
	  align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
	  base_align = align_test;
	}
      else
	{
	  align_test = ROUND_UP (align_test, 8);
	  base_align = 8;
	}

      if (align1 < 8)
	expand_strncmp_align_check (strncmp_label, src1, align_test);
      if (align2 < 8)
	expand_strncmp_align_check (strncmp_label, src2, align_test);

      /* Now generate the following sequence:
	 - branch to begin_compare
	 - strncmp_label
	 - call to strncmp
	 - branch to final_label
	 - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}

      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
	}
      else
	{
	  /* -m32 -mpowerpc64 results in word_mode being DImode even
	     though otherwise it is 32-bit.  The length arg to strncmp
	     is a size_t which will be the same size as pointers.  */
	  rtx len_rtx;
	  if (TARGET_64BIT)
	    len_rtx = gen_reg_rtx (DImode);
	  else
	    len_rtx = gen_reg_rtx (SImode);

	  emit_move_insn (len_rtx, bytes_rtx);

	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
				   len_rtx, GET_MODE (len_rtx));
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }
1849 rtx cleanup_label = NULL;
1850 rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
1851 rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
1853 /* Generate sequence of ld/ldbrx, cmpb to compare out
1854 to the length specified. */
1855 unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
1856 while (bytes_to_compare > 0)
1858 /* Compare sequence:
1859 check each 8B with: ld/ld cmpd bne
1860 If equal, use rldicr/cmpb to check for zero byte.
1861 cleanup code at end:
1862 cmpb get byte that differs
1863 cmpb look for zero byte
1864 orc combine
1865 cntlzd get bit of first zero/diff byte
1866 subfic convert for rldcl use
1867 rldcl rldcl extract diff/zero byte
1868 subf subtract for final result
1870 The last compare can branch around the cleanup code if the
1871 result is zero because the strings are exactly equal. */
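      /* Rough C model of the per-chunk test emitted below, with A and
	 B one word_mode chunk of each string (illustrative only; the
	 emitted code uses cmpb against zero rather than the classic
	 bit trick shown here):

	   if (a != b)
	     goto cleanup;	// compute the signed result
	   if (((a - 0x0101010101010101ULL) & ~a
		& 0x8080808080808080ULL) != 0)
	     goto final;	// zero byte: strings equal, result 0
      */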
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	load_mode = select_block_compare_mode (offset, bytes_to_compare, align,
					       word_mode_ok);
      else
	load_mode = select_block_compare_mode (0, bytes_to_compare, align,
					       word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
	  cmp_bytes = bytes_to_compare;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes_to_compare = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes_to_compare;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      /* We must always left-align the data we read, and
	 clear any bytes to the right that are beyond the string.
	 Otherwise the cmpb sequence won't produce the correct
	 results.  The beginning of the compare will be done
	 with word_mode so will not have any extra shifts or
	 clear rights.  */
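      /* For instance (illustrative values), with word_mode == DImode
	 and a 4-byte chunk loaded into the low half of the register,
	 the two steps below amount to:

	   v = (v << 32) | (v >> 32);	// rotate left by 8*(8-4) bits
	   v &= 0xffffffff00000000ULL;	// clear bytes past the chunk

	 which the backend can fold into a single rldicr.  */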
      if (load_mode_size < word_mode_size)
	{
	  /* Rotate left first.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size));
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      if (cmp_bytes < word_mode_size)
	{
	  /* Now clear right.  This plus the rotate can be
	     turned into a rldicr instruction.  */
	  HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
	  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
	      emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
	    }
	  else
	    {
	      emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
	      emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
	    }
	}

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not end of comparison:
	    A != B: branch to cleanup code to compute result.
	    A == B: check for 0 byte, next block if not found.
	 2: End of the inline comparison:
	    A != B: branch to cleanup code to compute result.
	    A == B: check for 0 byte, call strcmp/strncmp.
	 3: Compared requested N bytes:
	    A == B: branch to result 0.
	    A != B: cleanup code to compute result.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
	{
	  /* Branch to cleanup code, otherwise fall through to do
	     more compares.  */
	  if (!cleanup_label)
	    cleanup_label = gen_label_rtx ();
	  dst_label = cleanup_label;
	}
      else
	/* Branch to end and produce result of 0.  */
	dst_label = final_move_label;

      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
      rtx cond = gen_reg_rtx (CCmode);

      /* Always produce the 0 result, it is needed if
	 cmpb finds a 0 byte in this chunk.  */
      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
      rs6000_emit_dot_insn (result_reg, tmp, 1, cond);

      rtx cmp_rtx;
      if (remain == 0 && !equality_compare_rest)
	cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
      else
	cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
					 lab_ref, pc_rtx);
      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
      JUMP_LABEL (j) = dst_label;
      LABEL_NUSES (dst_label) += 1;

      if (remain > 0 || equality_compare_rest)
	{
	  /* Generate a cmpb to test for a 0 byte and branch
	     to final result if found.  */
	  rtx cmpb_zero = gen_reg_rtx (word_mode);
	  rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	  rtx condz = gen_reg_rtx (CCmode);
	  rtx zero_reg = gen_reg_rtx (word_mode);
	  if (word_mode == SImode)
	    {
	      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
	      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
	      if (cmp_bytes < word_mode_size)
		{
		  /* Don't want to look at zero bytes past end.  */
		  HOST_WIDE_INT mb =
		    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
		  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
		  emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask));
		}
	    }
	  else
	    {
	      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
	      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
	      if (cmp_bytes < word_mode_size)
		{
		  /* Don't want to look at zero bytes past end.  */
		  HOST_WIDE_INT mb =
		    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
		  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
		  emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask));
		}
	    }

	  emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg));
	  rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx);
	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx,
					     lab_ref_fin, pc_rtx);
	  rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	  JUMP_LABEL (j2) = final_move_label;
	  LABEL_NUSES (final_move_label) += 1;
	}

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
	}
      else
	{
	  rtx len_rtx;
	  if (TARGET_64BIT)
	    len_rtx = gen_reg_rtx (DImode);
	  else
	    len_rtx = gen_reg_rtx (SImode);

	  emit_move_insn (len_rtx, GEN_INT (bytes - compare_length));
	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
				   len_rtx, GET_MODE (len_rtx));
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  if (cleanup_label)
    emit_label (cleanup_label);

  /* Generate the final sequence that identifies the differing
     byte and generates the final result, taking into account
     zero bytes:

     cmpb cmpb_result1, src1, src2
     cmpb cmpb_result2, src1, zero
     orc cmpb_result1, cmpb_result1, cmpb_result2
     cntlzd get bit of first zero/diff byte
     addi convert for rldcl use
     rldcl rldcl extract diff/zero byte
     subf subtract for final result  */
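  /* Illustrative C equivalent of the DImode branch below (big-endian
     byte numbering; cmpb(x,y) yields 0xff in each byte where x and y
     match; clz64/rotl64 are stand-ins for cntlzd and rldcl):

       uint64_t diff = ~cmpb (a, b);	// 0xff where bytes differ
       uint64_t zero = cmpb (a, 0);	// 0xff where A has a zero byte
       int n = clz64 (diff | zero);	// 8 * index of first such byte
       int rot = n + 8;			// rotate that byte to the bottom
       result = (int) ((rotl64 (a, rot) & 0xff)
		       - (rotl64 (b, rot) & 0xff));
  */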
  rtx cmpb_diff = gen_reg_rtx (word_mode);
  rtx cmpb_zero = gen_reg_rtx (word_mode);
  rtx rot_amt = gen_reg_rtx (word_mode);
  rtx zero_reg = gen_reg_rtx (word_mode);

  rtx rot1_1 = gen_reg_rtx (word_mode);
  rtx rot1_2 = gen_reg_rtx (word_mode);
  rtx rot2_1 = gen_reg_rtx (word_mode);
  rtx rot2_2 = gen_reg_rtx (word_mode);

  if (word_mode == SImode)
    {
      emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
      emit_insn (gen_one_cmplsi2 (cmpb_diff, cmpb_diff));
      emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzsi2 (rot_amt, cmpb_diff));
      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2));
    }
  else
    {
      emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
      emit_insn (gen_one_cmpldi2 (cmpb_diff, cmpb_diff));
      emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzdi2 (rot_amt, cmpb_diff));
      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2));
    }

  emit_label (final_move_label);
  emit_insn (gen_movsi (target, gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}
/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment */
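/* For example (illustrative only): a constant 13-byte copy with 4-byte
   alignment on a 32-bit target is expanded by the loop below as three
   SImode moves followed by one QImode move (4+4+4+1), with the loads
   batched into up to MAX_MOVE_REG registers before the corresponding
   stores are emitted.  */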
#define MAX_MOVE_REG 4

int
expand_block_move (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx orig_src = operands[1];
  rtx bytes_rtx = operands[2];
  rtx align_rtx = operands[3];
  int constp = (GET_CODE (bytes_rtx) == CONST_INT);
  int align;
  int bytes;
  int offset;
  int move_bytes;
  rtx stores[MAX_MOVE_REG];
  int num_reg = 0;

  /* If this is not a fixed size move, just call memcpy.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move?  */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      union {
	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
	rtx (*mov) (rtx, rtx);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;

      /* Altivec first, since it will be faster than a string move
	 when it applies, and usually not significantly larger.  */
      if (TARGET_ALTIVEC && bytes >= 16
	  && (TARGET_EFFICIENT_UNALIGNED_VSX || align >= 128))
	{
	  move_bytes = 16;
	  mode = V4SImode;
	  gen_func.mov = gen_movv4si;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  move_bytes = 8;
	  mode = DImode;
	  gen_func.mov = gen_movdi;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per load and/or store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	      addr = XEXP (orig_src, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_src = replace_equiv_address (orig_src, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  move_bytes = 4;
	  mode = SImode;
	  gen_func.mov = gen_movsi;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  move_bytes = 2;
	  mode = HImode;
	  gen_func.mov = gen_movhi;
	}
      else /* move 1 byte at a time */
	{
	  move_bytes = 1;
	  mode = QImode;
	  gen_func.mov = gen_movqi;
	}

      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

      if (mode != BLKmode)
	{
	  rtx tmp_reg = gen_reg_rtx (mode);

	  emit_insn ((*gen_func.mov) (tmp_reg, src));
	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
	}

      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
	{
	  int i;
	  for (i = 0; i < num_reg; i++)
	    emit_insn (stores[i]);
	  num_reg = 0;
	}

      if (mode == BLKmode)
	{
	  /* Move the address into scratch registers.  The movmemsi
	     patterns require zero offset.  */
	  if (!REG_P (XEXP (src, 0)))
	    {
	      rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
	      src = replace_equiv_address (src, src_reg);
	    }
	  set_mem_size (src, move_bytes);

	  if (!REG_P (XEXP (dest, 0)))
	    {
	      rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
	      dest = replace_equiv_address (dest, dest_reg);
	    }
	  set_mem_size (dest, move_bytes);

	  emit_insn ((*gen_func.movmemsi) (dest, src,
					   GEN_INT (move_bytes & 31),
					   align_rtx));
	}
    }

  return 1;
}