1 /* Subroutines used to expand string and block move, clear,
2 compare and other operations for PowerPC.
3 Copyright (C) 1991-2018 Free Software Foundation, Inc.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
25 #include "coretypes.h"
32 #include "print-tree.h"
39 /* Expand a block clear operation, and return 1 if successful. Return 0
40 if we should let the compiler generate normal code.
42 operands[0] is the destination
43 operands[1] is the length
44 operands[3] is the alignment */
47 expand_block_clear (rtx operands
[])
49 rtx orig_dest
= operands
[0];
50 rtx bytes_rtx
= operands
[1];
51 rtx align_rtx
= operands
[3];
52 bool constp
= (GET_CODE (bytes_rtx
) == CONST_INT
);
59 /* If this is not a fixed size move, just call memcpy */
63 /* This must be a fixed size alignment */
64 gcc_assert (GET_CODE (align_rtx
) == CONST_INT
);
65 align
= INTVAL (align_rtx
) * BITS_PER_UNIT
;
67 /* Anything to clear? */
68 bytes
= INTVAL (bytes_rtx
);
72 /* Use the builtin memset after a point, to avoid huge code bloat.
73 When optimize_size, avoid any significant code bloat; calling
74 memset is about 4 instructions, so allow for one instruction to
75 load zero and three to do clearing. */
76 if (TARGET_ALTIVEC
&& (align
>= 128 || TARGET_EFFICIENT_UNALIGNED_VSX
))
78 else if (TARGET_POWERPC64
&& (align
>= 64 || !STRICT_ALIGNMENT
))
83 if (optimize_size
&& bytes
> 3 * clear_step
)
85 if (! optimize_size
&& bytes
> 8 * clear_step
)
88 for (offset
= 0; bytes
> 0; offset
+= clear_bytes
, bytes
-= clear_bytes
)
90 machine_mode mode
= BLKmode
;
93 if (bytes
>= 16 && TARGET_ALTIVEC
&& (align
>= 128 || TARGET_EFFICIENT_UNALIGNED_VSX
))
98 else if (bytes
>= 8 && TARGET_POWERPC64
99 && (align
>= 64 || !STRICT_ALIGNMENT
))
103 if (offset
== 0 && align
< 64)
107 /* If the address form is reg+offset with offset not a
108 multiple of four, reload into reg indirect form here
109 rather than waiting for reload. This way we get one
110 reload, not one per store. */
111 addr
= XEXP (orig_dest
, 0);
112 if ((GET_CODE (addr
) == PLUS
|| GET_CODE (addr
) == LO_SUM
)
113 && GET_CODE (XEXP (addr
, 1)) == CONST_INT
114 && (INTVAL (XEXP (addr
, 1)) & 3) != 0)
116 addr
= copy_addr_to_reg (addr
);
117 orig_dest
= replace_equiv_address (orig_dest
, addr
);
121 else if (bytes
>= 4 && (align
>= 32 || !STRICT_ALIGNMENT
))
126 else if (bytes
>= 2 && (align
>= 16 || !STRICT_ALIGNMENT
))
131 else /* move 1 byte at a time */
137 dest
= adjust_address (orig_dest
, mode
, offset
);
139 emit_move_insn (dest
, CONST0_RTX (mode
));
145 /* Figure out the correct instructions to generate to load data for
146 block compare. MODE is used for the read from memory, and
147 data is zero extended if REG is wider than MODE. If LE code
148 is being generated, bswap loads are used.
150 REG is the destination register to move the data into.
151 MEM is the memory block being read.
152 MODE is the mode of memory to use for the read. */
154 do_load_for_compare (rtx reg
, rtx mem
, machine_mode mode
)
156 switch (GET_MODE (reg
))
162 emit_insn (gen_zero_extendqidi2 (reg
, mem
));
167 if (!BYTES_BIG_ENDIAN
)
169 src
= gen_reg_rtx (HImode
);
170 emit_insn (gen_bswaphi2 (src
, mem
));
172 emit_insn (gen_zero_extendhidi2 (reg
, src
));
178 if (!BYTES_BIG_ENDIAN
)
180 src
= gen_reg_rtx (SImode
);
181 emit_insn (gen_bswapsi2 (src
, mem
));
183 emit_insn (gen_zero_extendsidi2 (reg
, src
));
187 if (!BYTES_BIG_ENDIAN
)
188 emit_insn (gen_bswapdi2 (reg
, mem
));
190 emit_insn (gen_movdi (reg
, mem
));
201 emit_insn (gen_zero_extendqisi2 (reg
, mem
));
206 if (!BYTES_BIG_ENDIAN
)
208 src
= gen_reg_rtx (HImode
);
209 emit_insn (gen_bswaphi2 (src
, mem
));
211 emit_insn (gen_zero_extendhisi2 (reg
, src
));
215 if (!BYTES_BIG_ENDIAN
)
216 emit_insn (gen_bswapsi2 (reg
, mem
));
218 emit_insn (gen_movsi (reg
, mem
));
221 /* DImode is larger than the destination reg so is not expected. */
234 /* Select the mode to be used for reading the next chunk of bytes
237 OFFSET is the current read offset from the beginning of the block.
238 BYTES is the number of bytes remaining to be read.
239 ALIGN is the minimum alignment of the memory blocks being compared in bytes.
240 WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
241 the largest allowable mode. */
243 select_block_compare_mode (unsigned HOST_WIDE_INT offset
,
244 unsigned HOST_WIDE_INT bytes
,
245 unsigned HOST_WIDE_INT align
, bool word_mode_ok
)
247 /* First see if we can do a whole load unit
248 as that will be more efficient than a larger load + shift. */
250 /* If big, use biggest chunk.
251 If exactly chunk size, use that size.
252 If remainder can be done in one piece with shifting, do that.
253 Do largest chunk possible without violating alignment rules. */
255 /* The most we can read without potential page crossing. */
256 unsigned HOST_WIDE_INT maxread
= ROUND_UP (bytes
, align
);
258 if (word_mode_ok
&& bytes
>= UNITS_PER_WORD
)
260 else if (bytes
== GET_MODE_SIZE (SImode
))
262 else if (bytes
== GET_MODE_SIZE (HImode
))
264 else if (bytes
== GET_MODE_SIZE (QImode
))
266 else if (bytes
< GET_MODE_SIZE (SImode
)
267 && offset
>= GET_MODE_SIZE (SImode
) - bytes
)
268 /* This matches the case where we have SImode and 3 bytes
269 and offset >= 1 and permits us to move back one and overlap
270 with the previous read, thus avoiding having to shift
271 unwanted bytes off of the input. */
273 else if (word_mode_ok
&& bytes
< UNITS_PER_WORD
274 && offset
>= UNITS_PER_WORD
-bytes
)
275 /* Similarly, if we can use DImode it will get matched here and
276 can do an overlapping read that ends at the end of the block. */
278 else if (word_mode_ok
&& maxread
>= UNITS_PER_WORD
)
279 /* It is safe to do all remaining in one load of largest size,
280 possibly with a shift to get rid of unwanted bytes. */
282 else if (maxread
>= GET_MODE_SIZE (SImode
))
283 /* It is safe to do all remaining in one SImode load,
284 possibly with a shift to get rid of unwanted bytes. */
286 else if (bytes
> GET_MODE_SIZE (SImode
))
288 else if (bytes
> GET_MODE_SIZE (HImode
))
291 /* final fallback is do one byte */
295 /* Compute the alignment of pointer+OFFSET where the original alignment
296 of pointer was BASE_ALIGN. */
297 static unsigned HOST_WIDE_INT
298 compute_current_alignment (unsigned HOST_WIDE_INT base_align
,
299 unsigned HOST_WIDE_INT offset
)
303 return MIN (base_align
, offset
& -offset
);
306 /* Prepare address and then do a load.
308 MODE is the mode to use for the load.
309 DEST is the destination register for the data.
310 ADDR is the address to be loaded.
311 ORIG_ADDR is the original address expression. */
313 do_load_for_compare_from_addr (machine_mode mode
, rtx dest
, rtx addr
,
316 rtx mem
= gen_rtx_MEM (mode
, addr
);
317 MEM_COPY_ATTRIBUTES (mem
, orig_addr
);
318 set_mem_size (mem
, GET_MODE_SIZE (mode
));
319 do_load_for_compare (dest
, mem
, mode
);
323 /* Do a branch for an if/else decision.
325 CMPMODE is the mode to use for the comparison.
326 COMPARISON is the rtx code for the compare needed.
327 A is the first thing to be compared.
328 B is the second thing to be compared.
329 CR is the condition code reg input, or NULL_RTX.
330 TRUE_LABEL is the label to branch to if the condition is true.
332 The return value is the CR used for the comparison.
333 If CR is null_rtx, then a new register of CMPMODE is generated.
334 If A and B are both null_rtx, then CR must not be null, and the
335 compare is not generated so you can use this with a dot form insn. */
338 do_ifelse (machine_mode cmpmode
, rtx_code comparison
,
339 rtx a
, rtx b
, rtx cr
, rtx true_label
)
341 gcc_assert ((a
== NULL_RTX
&& b
== NULL_RTX
&& cr
!= NULL_RTX
)
342 || (a
!= NULL_RTX
&& b
!= NULL_RTX
));
345 gcc_assert (GET_MODE (cr
) == cmpmode
);
347 cr
= gen_reg_rtx (cmpmode
);
349 rtx label_ref
= gen_rtx_LABEL_REF (VOIDmode
, true_label
);
352 emit_move_insn (cr
, gen_rtx_COMPARE (cmpmode
, a
, b
));
354 rtx cmp_rtx
= gen_rtx_fmt_ee (comparison
, VOIDmode
, cr
, const0_rtx
);
356 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, cmp_rtx
, label_ref
, pc_rtx
);
357 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
358 JUMP_LABEL (j
) = true_label
;
359 LABEL_NUSES (true_label
) += 1;
362 /* Emit an isel of the proper mode for DEST.
364 DEST is the isel destination register.
365 SRC1 is the isel source if CR is true.
366 SRC2 is the isel source if CR is false.
367 CR is the condition for the isel. */
369 do_isel (rtx dest
, rtx cmp
, rtx src_t
, rtx src_f
, rtx cr
)
371 if (GET_MODE (dest
) == DImode
)
372 emit_insn (gen_isel_signed_di (dest
, cmp
, src_t
, src_f
, cr
));
374 emit_insn (gen_isel_signed_si (dest
, cmp
, src_t
, src_f
, cr
));
377 /* Emit a subtract of the proper mode for DEST.
379 DEST is the destination register for the subtract.
380 SRC1 is the first subtract input.
381 SRC2 is the second subtract input.
383 Computes DEST = SRC1-SRC2. */
385 do_sub3 (rtx dest
, rtx src1
, rtx src2
)
387 if (GET_MODE (dest
) == DImode
)
388 emit_insn (gen_subdi3 (dest
, src1
, src2
));
390 emit_insn (gen_subsi3 (dest
, src1
, src2
));
393 /* Emit an add of the proper mode for DEST.
395 DEST is the destination register for the add.
396 SRC1 is the first add input.
397 SRC2 is the second add input.
399 Computes DEST = SRC1+SRC2. */
401 do_add3 (rtx dest
, rtx src1
, rtx src2
)
403 if (GET_MODE (dest
) == DImode
)
404 emit_insn (gen_adddi3 (dest
, src1
, src2
));
406 emit_insn (gen_addsi3 (dest
, src1
, src2
));
409 /* Generate rtl for a load, shift, and compare of less than a full word.
411 LOAD_MODE is the machine mode for the loads.
412 DIFF is the reg for the difference.
413 CMP_REM is the reg containing the remaining bytes to compare.
414 DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
415 SRC1_ADDR is the first source address.
416 SRC2_ADDR is the second source address.
417 ORIG_SRC1 is the original first source block's address rtx.
418 ORIG_SRC2 is the original second source block's address rtx. */
420 do_load_mask_compare (const machine_mode load_mode
, rtx diff
, rtx cmp_rem
, rtx dcond
,
421 rtx src1_addr
, rtx src2_addr
, rtx orig_src1
, rtx orig_src2
)
423 HOST_WIDE_INT load_mode_size
= GET_MODE_SIZE (load_mode
);
424 rtx shift_amount
= gen_reg_rtx (word_mode
);
425 rtx d1
= gen_reg_rtx (word_mode
);
426 rtx d2
= gen_reg_rtx (word_mode
);
428 do_load_for_compare_from_addr (load_mode
, d1
, src1_addr
, orig_src1
);
429 do_load_for_compare_from_addr (load_mode
, d2
, src2_addr
, orig_src2
);
430 do_sub3 (shift_amount
, GEN_INT (load_mode_size
), cmp_rem
);
432 if (word_mode
== DImode
)
434 emit_insn (gen_ashldi3 (shift_amount
, shift_amount
,
435 GEN_INT (LOG2_BITS_PER_UNIT
)));
436 emit_insn (gen_lshrdi3 (d1
, d1
,
437 gen_lowpart (SImode
, shift_amount
)));
438 emit_insn (gen_lshrdi3 (d2
, d2
,
439 gen_lowpart (SImode
, shift_amount
)));
443 emit_insn (gen_ashlsi3 (shift_amount
, shift_amount
,
444 GEN_INT (LOG2_BITS_PER_UNIT
)));
445 emit_insn (gen_lshrsi3 (d1
, d1
, shift_amount
));
446 emit_insn (gen_lshrsi3 (d2
, d2
, shift_amount
));
451 /* Generate a compare, and convert with a setb later. */
452 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, d1
, d2
);
453 emit_insn (gen_rtx_SET (dcond
, cmp
));
457 if (word_mode
== DImode
)
458 emit_insn (gen_subfdi3_carry (diff
, d2
, d1
));
460 emit_insn (gen_subfsi3_carry (diff
, d2
, d1
));
464 /* Generate rtl for an overlapping load and compare of less than a
465 full load_mode. This assumes that the previous word is part of the
466 block being compared so it's ok to back up part of a word so we can
467 compare the last unaligned full word that ends at the end of the block.
469 LOAD_MODE is the machine mode for the loads.
470 ISCONST tells whether the remaining length is a constant or in a register.
471 BYTES_REM is the remaining length if ISCONST is true.
472 DIFF is the reg for the difference.
473 CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
474 DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
475 SRC1_ADDR is the first source address.
476 SRC2_ADDR is the second source address.
477 ORIG_SRC1 is the original first source block's address rtx.
478 ORIG_SRC2 is the original second source block's address rtx. */
480 do_overlap_load_compare (machine_mode load_mode
, bool isConst
,
481 HOST_WIDE_INT bytes_rem
, rtx diff
,
482 rtx cmp_rem
, rtx dcond
, rtx src1_addr
, rtx src2_addr
,
483 rtx orig_src1
, rtx orig_src2
)
485 HOST_WIDE_INT load_mode_size
= GET_MODE_SIZE (load_mode
);
486 HOST_WIDE_INT addr_adj
= load_mode_size
- bytes_rem
;
487 rtx d1
= gen_reg_rtx (word_mode
);
488 rtx d2
= gen_reg_rtx (word_mode
);
491 if (!isConst
|| addr_adj
)
493 rtx adj_reg
= gen_reg_rtx (word_mode
);
495 emit_move_insn (adj_reg
, GEN_INT (-addr_adj
));
498 rtx reg_lms
= gen_reg_rtx (word_mode
);
499 emit_move_insn (reg_lms
, GEN_INT (load_mode_size
));
500 do_sub3 (adj_reg
, cmp_rem
, reg_lms
);
503 addr1
= gen_rtx_PLUS (word_mode
, src1_addr
, adj_reg
);
504 addr2
= gen_rtx_PLUS (word_mode
, src2_addr
, adj_reg
);
512 do_load_for_compare_from_addr (load_mode
, d1
, addr1
, orig_src1
);
513 do_load_for_compare_from_addr (load_mode
, d2
, addr2
, orig_src2
);
517 /* Generate a compare, and convert with a setb later. */
518 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, d1
, d2
);
519 emit_insn (gen_rtx_SET (dcond
, cmp
));
523 if (word_mode
== DImode
)
524 emit_insn (gen_subfdi3_carry (diff
, d2
, d1
));
526 emit_insn (gen_subfsi3_carry (diff
, d2
, d1
));
530 /* Expand a block compare operation using loop code, and return true
531 if successful. Return false if we should let the compiler generate
532 normal code, probably a memcmp call.
534 OPERANDS[0] is the target (result).
535 OPERANDS[1] is the first source.
536 OPERANDS[2] is the second source.
537 OPERANDS[3] is the length.
538 OPERANDS[4] is the alignment. */
540 expand_compare_loop (rtx operands
[])
542 rtx target
= operands
[0];
543 rtx orig_src1
= operands
[1];
544 rtx orig_src2
= operands
[2];
545 rtx bytes_rtx
= operands
[3];
546 rtx align_rtx
= operands
[4];
548 /* This case is complicated to handle because the subtract
549 with carry instructions do not generate the 64-bit
550 carry and so we must emit code to calculate it ourselves.
551 We choose not to implement this yet. */
552 if (TARGET_32BIT
&& TARGET_POWERPC64
)
555 /* Allow non-const length. */
556 int bytes_is_const
= CONST_INT_P (bytes_rtx
);
558 /* This must be a fixed size alignment. */
559 if (!CONST_INT_P (align_rtx
))
562 HOST_WIDE_INT align1
= MEM_ALIGN (orig_src1
) / BITS_PER_UNIT
;
563 HOST_WIDE_INT align2
= MEM_ALIGN (orig_src2
) / BITS_PER_UNIT
;
564 HOST_WIDE_INT minalign
= MIN (align1
, align2
);
566 bool isP7
= (rs6000_tune
== PROCESSOR_POWER7
);
568 gcc_assert (GET_MODE (target
) == SImode
);
570 /* Anything to move? */
571 HOST_WIDE_INT bytes
= 0;
573 bytes
= INTVAL (bytes_rtx
);
575 if (bytes_is_const
&& bytes
== 0)
578 /* Limit the amount we compare, if known statically. */
579 HOST_WIDE_INT max_bytes
;
582 case PROCESSOR_POWER7
:
594 case PROCESSOR_POWER8
:
603 case PROCESSOR_POWER9
:
613 /* Allow the option to override the default. */
614 if (rs6000_block_compare_inline_loop_limit
>= 0)
615 max_bytes
= (unsigned HOST_WIDE_INT
) rs6000_block_compare_inline_loop_limit
;
620 rtx cmp_rem
= gen_reg_rtx (word_mode
); /* Remainder for library call. */
621 rtx loop_cmp
= gen_reg_rtx (word_mode
); /* Actual amount compared by loop. */
623 rtx iter
= gen_reg_rtx (word_mode
);
624 rtx iv1
= gen_reg_rtx (word_mode
);
625 rtx iv2
= gen_reg_rtx (word_mode
);
626 rtx d1_1
= gen_reg_rtx (word_mode
); /* Addr expression src1+iv1 */
627 rtx d1_2
= gen_reg_rtx (word_mode
); /* Addr expression src1+iv2 */
628 rtx d2_1
= gen_reg_rtx (word_mode
); /* Addr expression src2+iv1 */
629 rtx d2_2
= gen_reg_rtx (word_mode
); /* Addr expression src2+iv2 */
631 /* Strip unneeded subreg from length if there is one. */
632 if (SUBREG_P (bytes_rtx
) && subreg_lowpart_p (bytes_rtx
))
633 bytes_rtx
= SUBREG_REG (bytes_rtx
);
634 /* Extend bytes_rtx to word_mode if needed. But, we expect only to
635 maybe have to deal with the case where bytes_rtx is SImode and
636 word_mode is DImode. */
639 if (GET_MODE_SIZE (GET_MODE (bytes_rtx
)) > GET_MODE_SIZE (word_mode
))
640 /* Do not expect length longer than word_mode. */
642 else if (GET_MODE_SIZE (GET_MODE (bytes_rtx
)) < GET_MODE_SIZE (word_mode
))
644 bytes_rtx
= force_reg (GET_MODE (bytes_rtx
), bytes_rtx
);
645 bytes_rtx
= force_reg (word_mode
,
646 gen_rtx_fmt_e (ZERO_EXTEND
, word_mode
,
650 /* Make sure it's in a register before we get started. */
651 bytes_rtx
= force_reg (GET_MODE (bytes_rtx
), bytes_rtx
);
654 machine_mode load_mode
= word_mode
;
655 HOST_WIDE_INT load_mode_size
= GET_MODE_SIZE (load_mode
);
657 /* Number of bytes per iteration of the unrolled loop. */
658 HOST_WIDE_INT loop_bytes
= 2 * load_mode_size
;
659 /* max iters and bytes compared in the loop. */
660 HOST_WIDE_INT max_loop_iter
= max_bytes
/ loop_bytes
;
661 HOST_WIDE_INT max_loop_bytes
= max_loop_iter
* loop_bytes
;
662 int l2lb
= floor_log2 (loop_bytes
);
664 if (bytes_is_const
&& (max_bytes
< load_mode_size
665 || !IN_RANGE (bytes
, load_mode_size
, max_bytes
)))
668 bool no_remainder_code
= false;
669 rtx final_label
= gen_label_rtx ();
670 rtx final_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_label
);
671 rtx diff_label
= gen_label_rtx ();
672 rtx library_call_label
= NULL
;
673 rtx cleanup_label
= gen_label_rtx ();
677 rtx src1_addr
= copy_addr_to_reg (XEXP (orig_src1
, 0));
678 rtx src2_addr
= copy_addr_to_reg (XEXP (orig_src2
, 0));
680 /* Difference found is stored here before jump to diff_label. */
681 rtx diff
= gen_reg_rtx (word_mode
);
684 /* Example of generated code for 35 bytes aligned 1 byte.
714 Compiled with -fno-reorder-blocks for clarity. */
716 /* Structure of what we're going to do:
717 Two separate lengths: what we will compare before bailing to library
718 call (max_bytes), and the total length to be checked.
719 if length <= 16, branch to linear cleanup code starting with
720 remainder length check (length not known at compile time)
721 set up 2 iv's and load count reg, compute remainder length
722 unrollx2 compare loop
723 if loop exit due to a difference, branch to difference handling code
724 if remainder length < 8, branch to final cleanup compare
726 final cleanup comparison (depends on alignment and length)
727 load 8B, shift off bytes past length, compare
728 load 8B ending at last byte and compare
729 load/compare 1 byte at a time (short block abutting 4k boundary)
730 difference handling, 64->32 conversion
732 branch around memcmp call
736 /* If bytes is not const, compare length and branch directly
737 to the cleanup code that can handle 0-16 bytes if length
738 is >= 16. Stash away bytes-max_bytes for the library call. */
741 /* These need to be set for some of the places we may jump to. */
742 if (bytes
> max_bytes
)
744 no_remainder_code
= true;
745 niter
= max_loop_iter
;
746 library_call_label
= gen_label_rtx ();
750 niter
= bytes
/ loop_bytes
;
752 emit_move_insn (iter
, GEN_INT (niter
));
753 emit_move_insn (loop_cmp
, GEN_INT (niter
* loop_bytes
));
754 emit_move_insn (cmp_rem
, GEN_INT (bytes
- niter
* loop_bytes
));
758 library_call_label
= gen_label_rtx ();
760 /* If we go to the cleanup code, it expects length to be in cmp_rem. */
761 emit_move_insn (cmp_rem
, bytes_rtx
);
763 /* Check for > max_bytes bytes. We want to bail out as quickly as
764 possible if we have to go over to memcmp. */
765 do_ifelse (CCmode
, GT
, bytes_rtx
, GEN_INT (max_bytes
),
766 NULL_RTX
, library_call_label
);
768 /* Check for < loop_bytes bytes. */
769 do_ifelse (CCmode
, LT
, bytes_rtx
, GEN_INT (loop_bytes
),
770 NULL_RTX
, cleanup_label
);
772 /* Loop compare bytes and iterations if bytes>max_bytes. */
773 rtx mb_reg
= gen_reg_rtx (word_mode
);
774 emit_move_insn (mb_reg
, GEN_INT (max_loop_bytes
));
775 rtx mi_reg
= gen_reg_rtx (word_mode
);
776 emit_move_insn (mi_reg
, GEN_INT (max_loop_iter
));
778 /* Compute number of loop iterations if bytes <= max_bytes. */
779 if (word_mode
== DImode
)
780 emit_insn (gen_lshrdi3 (iter
, bytes_rtx
, GEN_INT (l2lb
)));
782 emit_insn (gen_lshrsi3 (iter
, bytes_rtx
, GEN_INT (l2lb
)));
784 /* Compute bytes to compare in loop if bytes <= max_bytes. */
785 rtx mask
= GEN_INT (HOST_WIDE_INT_M1U
<< l2lb
);
786 if (word_mode
== DImode
)
788 emit_insn (gen_anddi3 (loop_cmp
, bytes_rtx
, mask
));
792 emit_insn (gen_andsi3 (loop_cmp
, bytes_rtx
, mask
));
795 /* Check for bytes <= max_bytes. */
798 /* P9 has fast isel so we use one compare and two isel. */
799 cr
= gen_reg_rtx (CCmode
);
800 rtx compare_rtx
= gen_rtx_COMPARE (CCmode
, bytes_rtx
,
801 GEN_INT (max_bytes
));
802 emit_move_insn (cr
, compare_rtx
);
803 rtx cmp_rtx
= gen_rtx_LE (VOIDmode
, cr
, const0_rtx
);
804 do_isel (loop_cmp
, cmp_rtx
, loop_cmp
, mb_reg
, cr
);
805 do_isel (iter
, cmp_rtx
, iter
, mi_reg
, cr
);
809 rtx lab_after
= gen_label_rtx ();
810 do_ifelse (CCmode
, LE
, bytes_rtx
, GEN_INT (max_bytes
),
811 NULL_RTX
, lab_after
);
812 emit_move_insn (loop_cmp
, mb_reg
);
813 emit_move_insn (iter
, mi_reg
);
814 emit_label (lab_after
);
817 /* Now compute remainder bytes which isn't used until after the loop. */
818 do_sub3 (cmp_rem
, bytes_rtx
, loop_cmp
);
821 rtx dcond
= NULL_RTX
; /* Used for when we jump to diff_label. */
822 /* For p9 we need to have just one of these as multiple places define
823 it and it gets used by the setb at the end. */
825 dcond
= gen_reg_rtx (CCUNSmode
);
827 if (!bytes_is_const
|| bytes
>= loop_bytes
)
829 /* It should not be possible to come here if remaining bytes is
830 < 16 in the runtime case either. Compute number of loop
831 iterations. We compare 2*word_mode per iteration so 16B for
832 64-bit code and 8B for 32-bit. Set up two induction
833 variables and load count register. */
835 /* HACK ALERT: create hard reg for CTR here. If we just use a
836 pseudo, cse will get rid of it and then the allocator will
837 see it used in the lshr above and won't give us ctr. */
838 rtx ctr
= gen_rtx_REG (Pmode
, CTR_REGNO
);
839 emit_move_insn (ctr
, iter
);
840 emit_move_insn (diff
, GEN_INT (0));
841 emit_move_insn (iv1
, GEN_INT (0));
842 emit_move_insn (iv2
, GEN_INT (load_mode_size
));
844 /* inner loop to compare 2*word_mode */
845 rtx loop_top_label
= gen_label_rtx ();
846 emit_label (loop_top_label
);
848 rtx src1_ix1
= gen_rtx_PLUS (word_mode
, src1_addr
, iv1
);
849 rtx src2_ix1
= gen_rtx_PLUS (word_mode
, src2_addr
, iv1
);
851 do_load_for_compare_from_addr (load_mode
, d1_1
,
852 src1_ix1
, orig_src1
);
853 do_load_for_compare_from_addr (load_mode
, d2_1
,
854 src2_ix1
, orig_src2
);
855 do_add3 (iv1
, iv1
, GEN_INT (loop_bytes
));
857 rtx src1_ix2
= gen_rtx_PLUS (word_mode
, src1_addr
, iv2
);
858 rtx src2_ix2
= gen_rtx_PLUS (word_mode
, src2_addr
, iv2
);
860 do_load_for_compare_from_addr (load_mode
, d1_2
,
861 src1_ix2
, orig_src1
);
862 do_load_for_compare_from_addr (load_mode
, d2_2
,
863 src2_ix2
, orig_src2
);
864 do_add3 (iv2
, iv2
, GEN_INT (loop_bytes
));
868 /* Generate a compare, and convert with a setb later. */
869 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, d1_1
, d2_1
);
870 emit_insn (gen_rtx_SET (dcond
, cmp
));
874 dcond
= gen_reg_rtx (CCmode
);
875 if (word_mode
== DImode
)
876 emit_insn (gen_subfdi3_carry_dot2 (diff
, d2_1
, d1_1
, dcond
));
878 emit_insn (gen_subfsi3_carry_dot2 (diff
, d2_1
, d1_1
, dcond
));
881 do_ifelse (GET_MODE (dcond
), NE
, NULL_RTX
, NULL_RTX
,
886 /* Generate a compare, and convert with a setb later. */
887 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, d1_2
, d2_2
);
888 emit_insn (gen_rtx_SET (dcond
, cmp
));
892 dcond
= gen_reg_rtx (CCmode
);
893 if (word_mode
== DImode
)
894 emit_insn (gen_subfdi3_carry_dot2 (diff
, d2_2
, d1_2
, dcond
));
896 emit_insn (gen_subfsi3_carry_dot2 (diff
, d2_2
, d1_2
, dcond
));
899 rtx eqrtx
= gen_rtx_EQ (VOIDmode
, d1_2
, d2_2
);
901 j
= emit_jump_insn (gen_bdnztf_di (loop_top_label
, ctr
, ctr
,
904 j
= emit_jump_insn (gen_bdnztf_si (loop_top_label
, ctr
, ctr
,
906 JUMP_LABEL (j
) = loop_top_label
;
907 LABEL_NUSES (loop_top_label
) += 1;
910 HOST_WIDE_INT bytes_remaining
= 0;
912 bytes_remaining
= (bytes
% loop_bytes
);
914 /* If diff is nonzero, branch to difference handling
915 code. If we exit here with a nonzero diff, it is
916 because the second word differed. */
918 do_ifelse (CCUNSmode
, NE
, NULL_RTX
, NULL_RTX
, dcond
, diff_label
);
920 do_ifelse (CCmode
, NE
, diff
, const0_rtx
, NULL_RTX
, diff_label
);
922 if (library_call_label
!= NULL
&& bytes_is_const
&& bytes
> max_bytes
)
924 /* If the length is known at compile time, then we will always
925 have a remainder to go to the library call with. */
926 rtx library_call_ref
= gen_rtx_LABEL_REF (VOIDmode
, library_call_label
);
927 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, library_call_ref
));
928 JUMP_LABEL (j
) = library_call_label
;
929 LABEL_NUSES (library_call_label
) += 1;
933 if (bytes_is_const
&& bytes_remaining
== 0)
935 /* No remainder and if we are here then diff is 0 so just return 0 */
937 emit_insn (gen_movsi (target
, gen_lowpart (SImode
, diff
)));
939 emit_move_insn (target
, diff
);
940 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, final_ref
));
941 JUMP_LABEL (j
) = final_label
;
942 LABEL_NUSES (final_label
) += 1;
945 else if (!no_remainder_code
)
947 /* Update addresses to point to the next word to examine. */
948 do_add3 (src1_addr
, src1_addr
, iv1
);
949 do_add3 (src2_addr
, src2_addr
, iv1
);
951 emit_label (cleanup_label
);
955 /* If we're dealing with runtime length, we have to check if
956 it's zero after the loop. When length is known at compile
957 time the no-remainder condition is dealt with above. By
958 doing this after cleanup_label, we also deal with the
959 case where length is 0 at the start and we bypass the
960 loop with a branch to cleanup_label. */
961 emit_move_insn (target
, const0_rtx
);
962 do_ifelse (CCmode
, EQ
, cmp_rem
, const0_rtx
,
963 NULL_RTX
, final_label
);
966 rtx final_cleanup
= gen_label_rtx ();
967 rtx cmp_rem_before
= gen_reg_rtx (word_mode
);
968 /* Compare one more word_mode chunk if needed. */
969 if (!bytes_is_const
|| bytes_remaining
>= load_mode_size
)
971 /* If remainder length < word length, branch to final
974 do_ifelse (CCmode
, LT
, cmp_rem
, GEN_INT (load_mode_size
),
975 NULL_RTX
, final_cleanup
);
977 /* load and compare 8B */
978 do_load_for_compare_from_addr (load_mode
, d1_1
,
979 src1_addr
, orig_src1
);
980 do_load_for_compare_from_addr (load_mode
, d2_1
,
981 src2_addr
, orig_src2
);
983 /* Compare the word, see if we need to do the last partial. */
986 /* Generate a compare, and convert with a setb later. */
987 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, d1_1
, d2_1
);
988 emit_insn (gen_rtx_SET (dcond
, cmp
));
992 dcond
= gen_reg_rtx (CCmode
);
993 if (word_mode
== DImode
)
994 emit_insn (gen_subfdi3_carry_dot2 (diff
, d2_1
, d1_1
, dcond
));
996 emit_insn (gen_subfsi3_carry_dot2 (diff
, d2_1
, d1_1
, dcond
));
999 do_ifelse (GET_MODE (dcond
), NE
, NULL_RTX
, NULL_RTX
,
1002 do_add3 (src1_addr
, src1_addr
, GEN_INT (load_mode_size
));
1003 do_add3 (src2_addr
, src2_addr
, GEN_INT (load_mode_size
));
1004 emit_move_insn (cmp_rem_before
, cmp_rem
);
1005 do_add3 (cmp_rem
, cmp_rem
, GEN_INT (-load_mode_size
));
1007 bytes_remaining
-= load_mode_size
;
1009 /* See if remaining length is now zero. We previously set
1010 target to 0 so we can just jump to the end. */
1011 do_ifelse (CCmode
, EQ
, cmp_rem
, const0_rtx
,
1012 NULL_RTX
, final_label
);
1018 We can always shift back to do an overlapping compare
1019 of the last chunk because we know length >= 8.
1022 align>=load_mode_size
1023 Read word_mode and mask
1024 align<load_mode_size
1025 avoid stepping past end
1028 * decrement address and do overlapping compare
1029 * read word_mode and mask
1030 * carefully avoid crossing 4k boundary
1033 if ((!bytes_is_const
|| (bytes_is_const
&& bytes_remaining
&& isP7
))
1034 && align1
>= load_mode_size
&& align2
>= load_mode_size
)
1036 /* Alignment is larger than word_mode so we do not need to be
1037 concerned with extra page crossings. But, we do not know
1038 that the length is larger than load_mode_size so we might
1039 end up comparing against data before the block if we try
1040 an overlapping compare. Also we use this on P7 for fixed length
1041 remainder because P7 doesn't like overlapping unaligned.
1042 Strategy: load 8B, shift off bytes past length, and compare. */
1043 emit_label (final_cleanup
);
1044 do_load_mask_compare (load_mode
, diff
, cmp_rem
, dcond
,
1045 src1_addr
, src2_addr
, orig_src1
, orig_src2
);
1047 else if (bytes_remaining
&& bytes_is_const
)
1049 /* We do not do loop expand if length < 32 so we know at the
1050 end we can do an overlapping compare.
1051 Strategy: shift address back and do word_mode load that
1052 ends at the end of the block. */
1053 emit_label (final_cleanup
);
1054 do_overlap_load_compare (load_mode
, true, bytes_remaining
, diff
,
1055 cmp_rem
, dcond
, src1_addr
, src2_addr
,
1056 orig_src1
, orig_src2
);
1058 else if (!bytes_is_const
)
1060 rtx handle4k_label
= gen_label_rtx ();
1061 rtx nonconst_overlap
= gen_label_rtx ();
1062 emit_label (nonconst_overlap
);
1064 /* Here we have to handle the case where we have runtime
1065 length which may be too short for overlap compare, and
1066 alignment is not at least load_mode_size so we have to
1067 tread carefully to avoid stepping across 4k boundaries. */
1069 /* If the length after the loop was larger than word_mode
1070 size, we can just do an overlapping compare and we're
1071 done. We fall through to this code from the word_mode
1072 compare that precedes this. */
1073 do_overlap_load_compare (load_mode
, false, 0, diff
,
1074 cmp_rem
, dcond
, src1_addr
, src2_addr
,
1075 orig_src1
, orig_src2
);
1077 rtx diff_ref
= gen_rtx_LABEL_REF (VOIDmode
, diff_label
);
1078 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, diff_ref
));
1079 JUMP_LABEL (j
) = diff_label
;
1080 LABEL_NUSES (diff_label
) += 1;
1083 /* If we couldn't do the overlap compare we have to be more
1084 careful of the 4k boundary. Test to see if either
1085 address is less than word_mode_size away from a 4k
1086 boundary. If not, then we can do a load/shift/compare
1087 and we are done. We come to this code if length was less
1088 than word_mode_size. */
1090 emit_label (final_cleanup
);
1092 /* We can still avoid the slow case if the length was larger
1093 than one loop iteration, in which case go do the overlap
1094 load compare path. */
1095 do_ifelse (CCmode
, GT
, bytes_rtx
, GEN_INT (loop_bytes
),
1096 NULL_RTX
, nonconst_overlap
);
1098 rtx rem4k
= gen_reg_rtx (word_mode
);
1099 rtx dist1
= gen_reg_rtx (word_mode
);
1100 rtx dist2
= gen_reg_rtx (word_mode
);
1101 do_sub3 (rem4k
, GEN_INT (4096), cmp_rem
);
1102 if (word_mode
== SImode
)
1103 emit_insn (gen_andsi3 (dist1
, src1_addr
, GEN_INT (0xfff)));
1105 emit_insn (gen_anddi3 (dist1
, src1_addr
, GEN_INT (0xfff)));
1106 do_ifelse (CCmode
, LE
, dist1
, rem4k
, NULL_RTX
, handle4k_label
);
1107 if (word_mode
== SImode
)
1108 emit_insn (gen_andsi3 (dist2
, src2_addr
, GEN_INT (0xfff)));
1110 emit_insn (gen_anddi3 (dist2
, src2_addr
, GEN_INT (0xfff)));
1111 do_ifelse (CCmode
, LE
, dist2
, rem4k
, NULL_RTX
, handle4k_label
);
1113 /* We don't have a 4k boundary to deal with, so do
1114 a load/shift/compare and jump to diff. */
1116 do_load_mask_compare (load_mode
, diff
, cmp_rem
, dcond
,
1117 src1_addr
, src2_addr
, orig_src1
, orig_src2
);
1119 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, diff_ref
));
1120 JUMP_LABEL (j
) = diff_label
;
1121 LABEL_NUSES (diff_label
) += 1;
1124 /* Finally in the unlikely case we are inching up to a
1125 4k boundary we use a compact lbzx/compare loop to do
1126 it a byte at a time. */
1128 emit_label (handle4k_label
);
1130 rtx ctr
= gen_rtx_REG (Pmode
, CTR_REGNO
);
1131 emit_move_insn (ctr
, cmp_rem
);
1132 rtx ixreg
= gen_reg_rtx (Pmode
);
1133 emit_move_insn (ixreg
, const0_rtx
);
1135 rtx src1_ix
= gen_rtx_PLUS (word_mode
, src1_addr
, ixreg
);
1136 rtx src2_ix
= gen_rtx_PLUS (word_mode
, src2_addr
, ixreg
);
1137 rtx d1
= gen_reg_rtx (word_mode
);
1138 rtx d2
= gen_reg_rtx (word_mode
);
1140 rtx fc_loop
= gen_label_rtx ();
1141 emit_label (fc_loop
);
1143 do_load_for_compare_from_addr (QImode
, d1
, src1_ix
, orig_src1
);
1144 do_load_for_compare_from_addr (QImode
, d2
, src2_ix
, orig_src2
);
1146 do_add3 (ixreg
, ixreg
, const1_rtx
);
1148 rtx cond
= gen_reg_rtx (CCmode
);
1149 rtx subexpr
= gen_rtx_MINUS (word_mode
, d1
, d2
);
1150 rs6000_emit_dot_insn (diff
, subexpr
, 2, cond
);
1152 rtx eqrtx
= gen_rtx_EQ (VOIDmode
, d1
, d2
);
1154 j
= emit_jump_insn (gen_bdnztf_di (fc_loop
, ctr
, ctr
,
1157 j
= emit_jump_insn (gen_bdnztf_si (fc_loop
, ctr
, ctr
,
1159 JUMP_LABEL (j
) = fc_loop
;
1160 LABEL_NUSES (fc_loop
) += 1;
1163 emit_insn (gen_movsi (target
, gen_lowpart (SImode
, diff
)));
1165 emit_move_insn (target
, diff
);
1167 /* Since we are comparing bytes, the difference can be used
1168 as the final result and we are done here. */
1169 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, final_ref
));
1170 JUMP_LABEL (j
) = final_label
;
1171 LABEL_NUSES (final_label
) += 1;
1176 emit_label (diff_label
);
1177 /* difference handling, 64->32 conversion */
1179 /* We need to produce DI result from sub, then convert to target SI
1180 while maintaining <0 / ==0 / >0 properties. This sequence works:
1186 This is an alternate one Segher cooked up if somebody
1187 wants to expand this for something that doesn't have popcntd:
1194 And finally, p9 can just do this:
1199 emit_insn (gen_setb_unsigned (target
, dcond
));
1204 rtx tmp_reg_ca
= gen_reg_rtx (DImode
);
1205 emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca
));
1206 emit_insn (gen_popcntddi2 (diff
, diff
));
1207 emit_insn (gen_iordi3 (diff
, diff
, tmp_reg_ca
));
1208 emit_insn (gen_movsi (target
, gen_lowpart (SImode
, diff
)));
1212 rtx tmp_reg_ca
= gen_reg_rtx (SImode
);
1213 emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca
));
1214 emit_insn (gen_popcntdsi2 (diff
, diff
));
1215 emit_insn (gen_iorsi3 (target
, diff
, tmp_reg_ca
));
1219 if (library_call_label
!= NULL
)
1221 /* Branch around memcmp call. */
1222 j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, final_ref
));
1223 JUMP_LABEL (j
) = final_label
;
1224 LABEL_NUSES (final_label
) += 1;
1227 /* Make memcmp library call. cmp_rem is the remaining bytes that
1228 were compared and cmp_rem is the expected amount to be compared
1229 by memcmp. If we don't find a difference in the loop compare, do
1230 the library call directly instead of doing a small compare just
1231 to get to an arbitrary boundary before calling it anyway.
1232 Also, update addresses to point to the next word to examine. */
1233 emit_label (library_call_label
);
1235 rtx len_rtx
= gen_reg_rtx (word_mode
);
1238 emit_move_insn (len_rtx
, cmp_rem
);
1239 do_add3 (src1_addr
, src1_addr
, iv1
);
1240 do_add3 (src2_addr
, src2_addr
, iv1
);
1243 emit_move_insn (len_rtx
, bytes_rtx
);
1245 tree fun
= builtin_decl_explicit (BUILT_IN_MEMCMP
);
1246 emit_library_call_value (XEXP (DECL_RTL (fun
), 0),
1247 target
, LCT_NORMAL
, GET_MODE (target
),
1250 len_rtx
, GET_MODE (len_rtx
));
1253 /* emit final_label */
1254 emit_label (final_label
);
1258 /* Expand a block compare operation, and return true if successful.
1259 Return false if we should let the compiler generate normal code,
1260 probably a memcmp call.
1262 OPERANDS[0] is the target (result).
1263 OPERANDS[1] is the first source.
1264 OPERANDS[2] is the second source.
1265 OPERANDS[3] is the length.
1266 OPERANDS[4] is the alignment. */
1268 expand_block_compare (rtx operands
[])
1270 rtx target
= operands
[0];
1271 rtx orig_src1
= operands
[1];
1272 rtx orig_src2
= operands
[2];
1273 rtx bytes_rtx
= operands
[3];
1274 rtx align_rtx
= operands
[4];
1275 HOST_WIDE_INT cmp_bytes
= 0;
1276 rtx src1
= orig_src1
;
1277 rtx src2
= orig_src2
;
1279 /* This case is complicated to handle because the subtract
1280 with carry instructions do not generate the 64-bit
1281 carry and so we must emit code to calculate it ourselves.
1282 We choose not to implement this yet. */
1283 if (TARGET_32BIT
&& TARGET_POWERPC64
)
1286 bool isP7
= (rs6000_tune
== PROCESSOR_POWER7
);
1288 /* Allow this param to shut off all expansion. */
1289 if (rs6000_block_compare_inline_limit
== 0)
1292 /* targetm.slow_unaligned_access -- don't do unaligned stuff.
1293 However slow_unaligned_access returns true on P7 even though the
1294 performance of this code is good there. */
1296 && (targetm
.slow_unaligned_access (word_mode
, MEM_ALIGN (orig_src1
))
1297 || targetm
.slow_unaligned_access (word_mode
, MEM_ALIGN (orig_src2
))))
1300 /* Unaligned l*brx traps on P7 so don't do this. However this should
1301 not affect much because LE isn't really supported on P7 anyway. */
1302 if (isP7
&& !BYTES_BIG_ENDIAN
)
1305 /* If this is not a fixed size compare, try generating loop code and
1306 if that fails just call memcmp. */
1307 if (!CONST_INT_P (bytes_rtx
))
1308 return expand_compare_loop (operands
);
1310 /* This must be a fixed size alignment. */
1311 if (!CONST_INT_P (align_rtx
))
1314 unsigned int base_align
= UINTVAL (align_rtx
) / BITS_PER_UNIT
;
1316 gcc_assert (GET_MODE (target
) == SImode
);
1318 /* Anything to move? */
1319 unsigned HOST_WIDE_INT bytes
= UINTVAL (bytes_rtx
);
1323 rtx tmp_reg_src1
= gen_reg_rtx (word_mode
);
1324 rtx tmp_reg_src2
= gen_reg_rtx (word_mode
);
1325 /* P7/P8 code uses cond for subfc. but P9 uses
1326 it for cmpld which needs CCUNSmode. */
1329 cond
= gen_reg_rtx (CCUNSmode
);
1331 cond
= gen_reg_rtx (CCmode
);
1333 /* If we have an LE target without ldbrx and word_mode is DImode,
1334 then we must avoid using word_mode. */
1335 int word_mode_ok
= !(!BYTES_BIG_ENDIAN
&& !TARGET_LDBRX
1336 && word_mode
== DImode
);
1338 /* Strategy phase. How many ops will this take and should we expand it? */
1340 unsigned HOST_WIDE_INT offset
= 0;
1341 machine_mode load_mode
=
1342 select_block_compare_mode (offset
, bytes
, base_align
, word_mode_ok
);
1343 unsigned int load_mode_size
= GET_MODE_SIZE (load_mode
);
1345 /* We don't want to generate too much code. The loop code can take
1346 over for lengths greater than 31 bytes. */
1347 unsigned HOST_WIDE_INT max_bytes
= rs6000_block_compare_inline_limit
;
1348 if (!IN_RANGE (bytes
, 1, max_bytes
))
1349 return expand_compare_loop (operands
);
1351 /* The code generated for p7 and older is not faster than glibc
1352 memcmp if alignment is small and length is not short, so bail
1353 out to avoid those conditions. */
1354 if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
1355 && ((base_align
== 1 && bytes
> 16)
1356 || (base_align
== 2 && bytes
> 32)))
1359 bool generate_6432_conversion
= false;
1360 rtx convert_label
= NULL
;
1361 rtx final_label
= NULL
;
1363 /* Example of generated code for 18 bytes aligned 1 byte.
1364 Compiled with -fno-reorder-blocks for clarity.
1382 .L6487: #convert_label
1386 .L6488: #final_label
1389 We start off with DImode for two blocks that jump to the DI->SI conversion
1390 if the difference is found there, then a final block of HImode that skips
1391 the DI->SI conversion. */
1395 unsigned int align
= compute_current_alignment (base_align
, offset
);
1396 if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
)
1397 load_mode
= select_block_compare_mode (offset
, bytes
, align
,
1400 load_mode
= select_block_compare_mode (0, bytes
, align
, word_mode_ok
);
1401 load_mode_size
= GET_MODE_SIZE (load_mode
);
1402 if (bytes
>= load_mode_size
)
1403 cmp_bytes
= load_mode_size
;
1404 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
)
1406 /* Move this load back so it doesn't go past the end.
1407 P8/P9 can do this efficiently. */
1408 unsigned int extra_bytes
= load_mode_size
- bytes
;
1410 if (extra_bytes
< offset
)
1412 offset
-= extra_bytes
;
1413 cmp_bytes
= load_mode_size
;
1418 /* P7 and earlier can't do the overlapping load trick fast,
1419 so this forces a non-overlapping load and a shift to get
1420 rid of the extra bytes. */
1423 src1
= adjust_address (orig_src1
, load_mode
, offset
);
1424 src2
= adjust_address (orig_src2
, load_mode
, offset
);
1426 if (!REG_P (XEXP (src1
, 0)))
1428 rtx src1_reg
= copy_addr_to_reg (XEXP (src1
, 0));
1429 src1
= replace_equiv_address (src1
, src1_reg
);
1431 set_mem_size (src1
, load_mode_size
);
1433 if (!REG_P (XEXP (src2
, 0)))
1435 rtx src2_reg
= copy_addr_to_reg (XEXP (src2
, 0));
1436 src2
= replace_equiv_address (src2
, src2_reg
);
1438 set_mem_size (src2
, load_mode_size
);
1440 do_load_for_compare (tmp_reg_src1
, src1
, load_mode
);
1441 do_load_for_compare (tmp_reg_src2
, src2
, load_mode
);
1443 if (cmp_bytes
< load_mode_size
)
1445 /* Shift unneeded bytes off. */
1446 rtx sh
= GEN_INT (BITS_PER_UNIT
* (load_mode_size
- cmp_bytes
));
1447 if (word_mode
== DImode
)
1449 emit_insn (gen_lshrdi3 (tmp_reg_src1
, tmp_reg_src1
, sh
));
1450 emit_insn (gen_lshrdi3 (tmp_reg_src2
, tmp_reg_src2
, sh
));
1454 emit_insn (gen_lshrsi3 (tmp_reg_src1
, tmp_reg_src1
, sh
));
1455 emit_insn (gen_lshrsi3 (tmp_reg_src2
, tmp_reg_src2
, sh
));
1459 int remain
= bytes
- cmp_bytes
;
1460 if (GET_MODE_SIZE (GET_MODE (target
)) > GET_MODE_SIZE (load_mode
))
1462 /* Target is larger than load size so we don't need to
1463 reduce result size. */
1465 /* We previously did a block that need 64->32 conversion but
1466 the current block does not, so a label is needed to jump
1468 if (generate_6432_conversion
&& !final_label
)
1469 final_label
= gen_label_rtx ();
1473 /* This is not the last block, branch to the end if the result
1474 of this subtract is not zero. */
1476 final_label
= gen_label_rtx ();
1477 rtx fin_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_label
);
1478 rtx tmp
= gen_rtx_MINUS (word_mode
, tmp_reg_src1
, tmp_reg_src2
);
1479 rtx cr
= gen_reg_rtx (CCmode
);
1480 rs6000_emit_dot_insn (tmp_reg_src2
, tmp
, 2, cr
);
1481 emit_insn (gen_movsi (target
,
1482 gen_lowpart (SImode
, tmp_reg_src2
)));
1483 rtx ne_rtx
= gen_rtx_NE (VOIDmode
, cr
, const0_rtx
);
1484 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, ne_rtx
,
1486 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
1487 JUMP_LABEL (j
) = final_label
;
1488 LABEL_NUSES (final_label
) += 1;
1492 if (word_mode
== DImode
)
1494 emit_insn (gen_subdi3 (tmp_reg_src2
, tmp_reg_src1
,
1496 emit_insn (gen_movsi (target
,
1497 gen_lowpart (SImode
, tmp_reg_src2
)));
1500 emit_insn (gen_subsi3 (target
, tmp_reg_src1
, tmp_reg_src2
));
1504 rtx fin_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_label
);
1505 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, fin_ref
));
1506 JUMP_LABEL (j
) = final_label
;
1507 LABEL_NUSES (final_label
) += 1;
1514 /* Do we need a 64->32 conversion block? We need the 64->32
1515 conversion even if target size == load_mode size because
1516 the subtract generates one extra bit. */
1517 generate_6432_conversion
= true;
1522 convert_label
= gen_label_rtx ();
1524 /* Compare to zero and branch to convert_label if not zero. */
1525 rtx cvt_ref
= gen_rtx_LABEL_REF (VOIDmode
, convert_label
);
1528 /* Generate a compare, and convert with a setb later. */
1529 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, tmp_reg_src1
,
1531 emit_insn (gen_rtx_SET (cond
, cmp
));
1534 /* Generate a subfc. and use the longer
1535 sequence for conversion. */
1537 emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2
, tmp_reg_src2
,
1538 tmp_reg_src1
, cond
));
1540 emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2
, tmp_reg_src2
,
1541 tmp_reg_src1
, cond
));
1542 rtx ne_rtx
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
1543 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, ne_rtx
,
1545 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
1546 JUMP_LABEL (j
) = convert_label
;
1547 LABEL_NUSES (convert_label
) += 1;
1551 /* Just do the subtract/compare. Since this is the last block
1552 the convert code will be generated immediately following. */
1555 rtx cmp
= gen_rtx_COMPARE (CCUNSmode
, tmp_reg_src1
,
1557 emit_insn (gen_rtx_SET (cond
, cmp
));
1561 emit_insn (gen_subfdi3_carry (tmp_reg_src2
, tmp_reg_src2
,
1564 emit_insn (gen_subfsi3_carry (tmp_reg_src2
, tmp_reg_src2
,
1569 offset
+= cmp_bytes
;
1573 if (generate_6432_conversion
)
1576 emit_label (convert_label
);
1578 /* We need to produce DI result from sub, then convert to target SI
1579 while maintaining <0 / ==0 / >0 properties. This sequence works:
1585 This is an alternate one Segher cooked up if somebody
1586 wants to expand this for something that doesn't have popcntd:
1593 And finally, p9 can just do this:
1599 emit_insn (gen_setb_unsigned (target
, cond
));
1605 rtx tmp_reg_ca
= gen_reg_rtx (DImode
);
1606 emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca
));
1607 emit_insn (gen_popcntddi2 (tmp_reg_src2
, tmp_reg_src2
));
1608 emit_insn (gen_iordi3 (tmp_reg_src2
, tmp_reg_src2
, tmp_reg_ca
));
1609 emit_insn (gen_movsi (target
, gen_lowpart (SImode
, tmp_reg_src2
)));
1613 rtx tmp_reg_ca
= gen_reg_rtx (SImode
);
1614 emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca
));
1615 emit_insn (gen_popcntdsi2 (tmp_reg_src2
, tmp_reg_src2
));
1616 emit_insn (gen_iorsi3 (target
, tmp_reg_src2
, tmp_reg_ca
));
1622 emit_label (final_label
);
1624 gcc_assert (bytes
== 0);
1628 /* Generate alignment check and branch code to set up for
1629 strncmp when we don't have DI alignment.
1630 STRNCMP_LABEL is the label to branch if there is a page crossing.
1631 SRC is the string pointer to be examined.
1632 BYTES is the max number of bytes to compare. */
1634 expand_strncmp_align_check (rtx strncmp_label
, rtx src
, HOST_WIDE_INT bytes
)
1636 rtx lab_ref
= gen_rtx_LABEL_REF (VOIDmode
, strncmp_label
);
1637 rtx src_check
= copy_addr_to_reg (XEXP (src
, 0));
1638 if (GET_MODE (src_check
) == SImode
)
1639 emit_insn (gen_andsi3 (src_check
, src_check
, GEN_INT (0xfff)));
1641 emit_insn (gen_anddi3 (src_check
, src_check
, GEN_INT (0xfff)));
1642 rtx cond
= gen_reg_rtx (CCmode
);
1643 emit_move_insn (cond
, gen_rtx_COMPARE (CCmode
, src_check
,
1644 GEN_INT (4096 - bytes
)));
1646 rtx cmp_rtx
= gen_rtx_GE (VOIDmode
, cond
, const0_rtx
);
1648 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, cmp_rtx
,
1650 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
1651 JUMP_LABEL (j
) = strncmp_label
;
1652 LABEL_NUSES (strncmp_label
) += 1;
1655 /* Expand a string compare operation with length, and return
1656 true if successful. Return false if we should let the
1657 compiler generate normal code, probably a strncmp call.
1659 OPERANDS[0] is the target (result).
1660 OPERANDS[1] is the first source.
1661 OPERANDS[2] is the second source.
1662 If NO_LENGTH is zero, then:
1663 OPERANDS[3] is the length.
1664 OPERANDS[4] is the alignment in bytes.
1665 If NO_LENGTH is nonzero, then:
1666 OPERANDS[3] is the alignment in bytes. */
1668 expand_strn_compare (rtx operands
[], int no_length
)
1670 rtx target
= operands
[0];
1671 rtx orig_src1
= operands
[1];
1672 rtx orig_src2
= operands
[2];
1673 rtx bytes_rtx
, align_rtx
;
1677 align_rtx
= operands
[3];
1681 bytes_rtx
= operands
[3];
1682 align_rtx
= operands
[4];
1684 unsigned HOST_WIDE_INT cmp_bytes
= 0;
1685 rtx src1
= orig_src1
;
1686 rtx src2
= orig_src2
;
1688 /* If we have a length, it must be constant. This simplifies things
1689 a bit as we don't have to generate code to check if we've exceeded
1690 the length. Later this could be expanded to handle this case. */
1691 if (!no_length
&& !CONST_INT_P (bytes_rtx
))
1694 /* This must be a fixed size alignment. */
1695 if (!CONST_INT_P (align_rtx
))
1698 unsigned int base_align
= UINTVAL (align_rtx
);
1699 int align1
= MEM_ALIGN (orig_src1
) / BITS_PER_UNIT
;
1700 int align2
= MEM_ALIGN (orig_src2
) / BITS_PER_UNIT
;
1702 /* targetm.slow_unaligned_access -- don't do unaligned stuff. */
1703 if (targetm
.slow_unaligned_access (word_mode
, align1
)
1704 || targetm
.slow_unaligned_access (word_mode
, align2
))
1707 gcc_assert (GET_MODE (target
) == SImode
);
1709 /* If we have an LE target without ldbrx and word_mode is DImode,
1710 then we must avoid using word_mode. */
1711 int word_mode_ok
= !(!BYTES_BIG_ENDIAN
&& !TARGET_LDBRX
1712 && word_mode
== DImode
);
1714 unsigned int word_mode_size
= GET_MODE_SIZE (word_mode
);
1716 unsigned HOST_WIDE_INT offset
= 0;
1717 unsigned HOST_WIDE_INT bytes
; /* N from the strncmp args if available. */
1718 unsigned HOST_WIDE_INT compare_length
; /* How much to compare inline. */
1720 /* Use this as a standin to determine the mode to use. */
1721 bytes
= rs6000_string_compare_inline_limit
* word_mode_size
;
1723 bytes
= UINTVAL (bytes_rtx
);
1725 machine_mode load_mode
=
1726 select_block_compare_mode (offset
, bytes
, base_align
, word_mode_ok
);
1727 unsigned int load_mode_size
= GET_MODE_SIZE (load_mode
);
1728 compare_length
= rs6000_string_compare_inline_limit
* load_mode_size
;
1730 /* If we have equality at the end of the last compare and we have not
1731 found the end of the string, we need to call strcmp/strncmp to
1732 compare the remainder. */
1733 bool equality_compare_rest
= false;
1737 bytes
= compare_length
;
1738 equality_compare_rest
= true;
1742 if (bytes
<= compare_length
)
1743 compare_length
= bytes
;
1745 equality_compare_rest
= true;
1748 rtx result_reg
= gen_reg_rtx (word_mode
);
1749 rtx final_move_label
= gen_label_rtx ();
1750 rtx final_label
= gen_label_rtx ();
1751 rtx begin_compare_label
= NULL
;
1755 /* Generate code that checks distance to 4k boundary for this case. */
1756 begin_compare_label
= gen_label_rtx ();
1757 rtx strncmp_label
= gen_label_rtx ();
1760 /* Strncmp for power8 in glibc does this:
1762 cmpldi cr7,r8,4096-16
1763 bgt cr7,L(pagecross) */
1765 /* Make sure that the length we use for the alignment test and
1766 the subsequent code generation are in agreement so we do not
1767 go past the length we tested for a 4k boundary crossing. */
1768 unsigned HOST_WIDE_INT align_test
= compare_length
;
1771 align_test
= HOST_WIDE_INT_1U
<< ceil_log2 (align_test
);
1772 base_align
= align_test
;
1776 align_test
= ROUND_UP (align_test
, 8);
1781 expand_strncmp_align_check (strncmp_label
, src1
, align_test
);
1783 expand_strncmp_align_check (strncmp_label
, src2
, align_test
);
1785 /* Now generate the following sequence:
1786 - branch to begin_compare
1789 - branch to final_label
1790 - begin_compare_label */
1792 rtx cmp_ref
= gen_rtx_LABEL_REF (VOIDmode
, begin_compare_label
);
1793 jmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, cmp_ref
));
1794 JUMP_LABEL (jmp
) = begin_compare_label
;
1795 LABEL_NUSES (begin_compare_label
) += 1;
1798 emit_label (strncmp_label
);
1800 if (!REG_P (XEXP (src1
, 0)))
1802 rtx src1_reg
= copy_addr_to_reg (XEXP (src1
, 0));
1803 src1
= replace_equiv_address (src1
, src1_reg
);
1806 if (!REG_P (XEXP (src2
, 0)))
1808 rtx src2_reg
= copy_addr_to_reg (XEXP (src2
, 0));
1809 src2
= replace_equiv_address (src2
, src2_reg
);
1814 tree fun
= builtin_decl_explicit (BUILT_IN_STRCMP
);
1815 emit_library_call_value (XEXP (DECL_RTL (fun
), 0),
1816 target
, LCT_NORMAL
, GET_MODE (target
),
1817 force_reg (Pmode
, XEXP (src1
, 0)), Pmode
,
1818 force_reg (Pmode
, XEXP (src2
, 0)), Pmode
);
1822 /* -m32 -mpowerpc64 results in word_mode being DImode even
1823 though otherwise it is 32-bit. The length arg to strncmp
1824 is a size_t which will be the same size as pointers. */
1827 len_rtx
= gen_reg_rtx (DImode
);
1829 len_rtx
= gen_reg_rtx (SImode
);
1831 emit_move_insn (len_rtx
, bytes_rtx
);
1833 tree fun
= builtin_decl_explicit (BUILT_IN_STRNCMP
);
1834 emit_library_call_value (XEXP (DECL_RTL (fun
), 0),
1835 target
, LCT_NORMAL
, GET_MODE (target
),
1836 force_reg (Pmode
, XEXP (src1
, 0)), Pmode
,
1837 force_reg (Pmode
, XEXP (src2
, 0)), Pmode
,
1838 len_rtx
, GET_MODE (len_rtx
));
1841 rtx fin_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_label
);
1842 jmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, fin_ref
));
1843 JUMP_LABEL (jmp
) = final_label
;
1844 LABEL_NUSES (final_label
) += 1;
1846 emit_label (begin_compare_label
);
1849 rtx cleanup_label
= NULL
;
1850 rtx tmp_reg_src1
= gen_reg_rtx (word_mode
);
1851 rtx tmp_reg_src2
= gen_reg_rtx (word_mode
);
1853 /* Generate sequence of ld/ldbrx, cmpb to compare out
1854 to the length specified. */
1855 unsigned HOST_WIDE_INT bytes_to_compare
= compare_length
;
1856 while (bytes_to_compare
> 0)
1858 /* Compare sequence:
1859 check each 8B with: ld/ld cmpd bne
1860 If equal, use rldicr/cmpb to check for zero byte.
1861 cleanup code at end:
1862 cmpb get byte that differs
1863 cmpb look for zero byte
1865 cntlzd get bit of first zero/diff byte
1866 subfic convert for rldcl use
1867 rldcl rldcl extract diff/zero byte
1868 subf subtract for final result
1870 The last compare can branch around the cleanup code if the
1871 result is zero because the strings are exactly equal. */
1872 unsigned int align
= compute_current_alignment (base_align
, offset
);
1873 if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
)
1874 load_mode
= select_block_compare_mode (offset
, bytes_to_compare
, align
,
1877 load_mode
= select_block_compare_mode (0, bytes_to_compare
, align
,
1879 load_mode_size
= GET_MODE_SIZE (load_mode
);
1880 if (bytes_to_compare
>= load_mode_size
)
1881 cmp_bytes
= load_mode_size
;
1882 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
)
1884 /* Move this load back so it doesn't go past the end.
1885 P8/P9 can do this efficiently. */
1886 unsigned int extra_bytes
= load_mode_size
- bytes_to_compare
;
1887 cmp_bytes
= bytes_to_compare
;
1888 if (extra_bytes
< offset
)
1890 offset
-= extra_bytes
;
1891 cmp_bytes
= load_mode_size
;
1892 bytes_to_compare
= cmp_bytes
;
1896 /* P7 and earlier can't do the overlapping load trick fast,
1897 so this forces a non-overlapping load and a shift to get
1898 rid of the extra bytes. */
1899 cmp_bytes
= bytes_to_compare
;
1901 src1
= adjust_address (orig_src1
, load_mode
, offset
);
1902 src2
= adjust_address (orig_src2
, load_mode
, offset
);
1904 if (!REG_P (XEXP (src1
, 0)))
1906 rtx src1_reg
= copy_addr_to_reg (XEXP (src1
, 0));
1907 src1
= replace_equiv_address (src1
, src1_reg
);
1909 set_mem_size (src1
, load_mode_size
);
1911 if (!REG_P (XEXP (src2
, 0)))
1913 rtx src2_reg
= copy_addr_to_reg (XEXP (src2
, 0));
1914 src2
= replace_equiv_address (src2
, src2_reg
);
1916 set_mem_size (src2
, load_mode_size
);
1918 do_load_for_compare (tmp_reg_src1
, src1
, load_mode
);
1919 do_load_for_compare (tmp_reg_src2
, src2
, load_mode
);
1921 /* We must always left-align the data we read, and
1922 clear any bytes to the right that are beyond the string.
1923 Otherwise the cmpb sequence won't produce the correct
1924 results. The beginning of the compare will be done
1925 with word_mode so will not have any extra shifts or
1928 if (load_mode_size
< word_mode_size
)
1930 /* Rotate left first. */
1931 rtx sh
= GEN_INT (BITS_PER_UNIT
* (word_mode_size
- load_mode_size
));
1932 if (word_mode
== DImode
)
1934 emit_insn (gen_rotldi3 (tmp_reg_src1
, tmp_reg_src1
, sh
));
1935 emit_insn (gen_rotldi3 (tmp_reg_src2
, tmp_reg_src2
, sh
));
1939 emit_insn (gen_rotlsi3 (tmp_reg_src1
, tmp_reg_src1
, sh
));
1940 emit_insn (gen_rotlsi3 (tmp_reg_src2
, tmp_reg_src2
, sh
));
1944 if (cmp_bytes
< word_mode_size
)
1946 /* Now clear right. This plus the rotate can be
1947 turned into a rldicr instruction. */
1948 HOST_WIDE_INT mb
= BITS_PER_UNIT
* (word_mode_size
- cmp_bytes
);
1949 rtx mask
= GEN_INT (HOST_WIDE_INT_M1U
<< mb
);
1950 if (word_mode
== DImode
)
1952 emit_insn (gen_anddi3_mask (tmp_reg_src1
, tmp_reg_src1
, mask
));
1953 emit_insn (gen_anddi3_mask (tmp_reg_src2
, tmp_reg_src2
, mask
));
1957 emit_insn (gen_andsi3_mask (tmp_reg_src1
, tmp_reg_src1
, mask
));
1958 emit_insn (gen_andsi3_mask (tmp_reg_src2
, tmp_reg_src2
, mask
));
1962 /* Cases to handle. A and B are chunks of the two strings.
1963 1: Not end of comparison:
1964 A != B: branch to cleanup code to compute result.
1965 A == B: check for 0 byte, next block if not found.
1966 2: End of the inline comparison:
1967 A != B: branch to cleanup code to compute result.
1968 A == B: check for 0 byte, call strcmp/strncmp
1969 3: compared requested N bytes:
1970 A == B: branch to result 0.
1971 A != B: cleanup code to compute result. */
1973 unsigned HOST_WIDE_INT remain
= bytes_to_compare
- cmp_bytes
;
1976 if (remain
> 0 || equality_compare_rest
)
1978 /* Branch to cleanup code, otherwise fall through to do
1981 cleanup_label
= gen_label_rtx ();
1982 dst_label
= cleanup_label
;
1985 /* Branch to end and produce result of 0. */
1986 dst_label
= final_move_label
;
1988 rtx lab_ref
= gen_rtx_LABEL_REF (VOIDmode
, dst_label
);
1989 rtx cond
= gen_reg_rtx (CCmode
);
1991 /* Always produce the 0 result, it is needed if
1992 cmpb finds a 0 byte in this chunk. */
1993 rtx tmp
= gen_rtx_MINUS (word_mode
, tmp_reg_src1
, tmp_reg_src2
);
1994 rs6000_emit_dot_insn (result_reg
, tmp
, 1, cond
);
1997 if (remain
== 0 && !equality_compare_rest
)
1998 cmp_rtx
= gen_rtx_EQ (VOIDmode
, cond
, const0_rtx
);
2000 cmp_rtx
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
2002 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, cmp_rtx
,
2004 rtx j
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
2005 JUMP_LABEL (j
) = dst_label
;
2006 LABEL_NUSES (dst_label
) += 1;
2008 if (remain
> 0 || equality_compare_rest
)
2010 /* Generate a cmpb to test for a 0 byte and branch
2011 to final result if found. */
2012 rtx cmpb_zero
= gen_reg_rtx (word_mode
);
2013 rtx lab_ref_fin
= gen_rtx_LABEL_REF (VOIDmode
, final_move_label
);
2014 rtx condz
= gen_reg_rtx (CCmode
);
2015 rtx zero_reg
= gen_reg_rtx (word_mode
);
2016 if (word_mode
== SImode
)
2018 emit_insn (gen_movsi (zero_reg
, GEN_INT (0)));
2019 emit_insn (gen_cmpbsi3 (cmpb_zero
, tmp_reg_src1
, zero_reg
));
2020 if (cmp_bytes
< word_mode_size
)
2022 /* Don't want to look at zero bytes past end. */
2024 BITS_PER_UNIT
* (word_mode_size
- cmp_bytes
);
2025 rtx mask
= GEN_INT (HOST_WIDE_INT_M1U
<< mb
);
2026 emit_insn (gen_andsi3_mask (cmpb_zero
, cmpb_zero
, mask
));
2031 emit_insn (gen_movdi (zero_reg
, GEN_INT (0)));
2032 emit_insn (gen_cmpbdi3 (cmpb_zero
, tmp_reg_src1
, zero_reg
));
2033 if (cmp_bytes
< word_mode_size
)
2035 /* Don't want to look at zero bytes past end. */
2037 BITS_PER_UNIT
* (word_mode_size
- cmp_bytes
);
2038 rtx mask
= GEN_INT (HOST_WIDE_INT_M1U
<< mb
);
2039 emit_insn (gen_anddi3_mask (cmpb_zero
, cmpb_zero
, mask
));
2043 emit_move_insn (condz
, gen_rtx_COMPARE (CCmode
, cmpb_zero
, zero_reg
));
2044 rtx cmpnz_rtx
= gen_rtx_NE (VOIDmode
, condz
, const0_rtx
);
2045 rtx ifelse
= gen_rtx_IF_THEN_ELSE (VOIDmode
, cmpnz_rtx
,
2046 lab_ref_fin
, pc_rtx
);
2047 rtx j2
= emit_jump_insn (gen_rtx_SET (pc_rtx
, ifelse
));
2048 JUMP_LABEL (j2
) = final_move_label
;
2049 LABEL_NUSES (final_move_label
) += 1;
2053 offset
+= cmp_bytes
;
2054 bytes_to_compare
-= cmp_bytes
;
2057 if (equality_compare_rest
)
2059 /* Update pointers past what has been compared already. */
2060 src1
= adjust_address (orig_src1
, load_mode
, offset
);
2061 src2
= adjust_address (orig_src2
, load_mode
, offset
);
2063 if (!REG_P (XEXP (src1
, 0)))
2065 rtx src1_reg
= copy_addr_to_reg (XEXP (src1
, 0));
2066 src1
= replace_equiv_address (src1
, src1_reg
);
2068 set_mem_size (src1
, load_mode_size
);
2070 if (!REG_P (XEXP (src2
, 0)))
2072 rtx src2_reg
= copy_addr_to_reg (XEXP (src2
, 0));
2073 src2
= replace_equiv_address (src2
, src2_reg
);
2075 set_mem_size (src2
, load_mode_size
);
2077 /* Construct call to strcmp/strncmp to compare the rest of the string. */
2080 tree fun
= builtin_decl_explicit (BUILT_IN_STRCMP
);
2081 emit_library_call_value (XEXP (DECL_RTL (fun
), 0),
2082 target
, LCT_NORMAL
, GET_MODE (target
),
2083 force_reg (Pmode
, XEXP (src1
, 0)), Pmode
,
2084 force_reg (Pmode
, XEXP (src2
, 0)), Pmode
);
2090 len_rtx
= gen_reg_rtx (DImode
);
2092 len_rtx
= gen_reg_rtx (SImode
);
2094 emit_move_insn (len_rtx
, GEN_INT (bytes
- compare_length
));
2095 tree fun
= builtin_decl_explicit (BUILT_IN_STRNCMP
);
2096 emit_library_call_value (XEXP (DECL_RTL (fun
), 0),
2097 target
, LCT_NORMAL
, GET_MODE (target
),
2098 force_reg (Pmode
, XEXP (src1
, 0)), Pmode
,
2099 force_reg (Pmode
, XEXP (src2
, 0)), Pmode
,
2100 len_rtx
, GET_MODE (len_rtx
));
2103 rtx fin_ref
= gen_rtx_LABEL_REF (VOIDmode
, final_label
);
2104 rtx jmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, fin_ref
));
2105 JUMP_LABEL (jmp
) = final_label
;
2106 LABEL_NUSES (final_label
) += 1;
2111 emit_label (cleanup_label
);
2113 /* Generate the final sequence that identifies the differing
2114 byte and generates the final result, taking into account
2117 cmpb cmpb_result1, src1, src2
2118 cmpb cmpb_result2, src1, zero
2119 orc cmpb_result1, cmp_result1, cmpb_result2
2120 cntlzd get bit of first zero/diff byte
2121 addi convert for rldcl use
2122 rldcl rldcl extract diff/zero byte
2123 subf subtract for final result
2126 rtx cmpb_diff
= gen_reg_rtx (word_mode
);
2127 rtx cmpb_zero
= gen_reg_rtx (word_mode
);
2128 rtx rot_amt
= gen_reg_rtx (word_mode
);
2129 rtx zero_reg
= gen_reg_rtx (word_mode
);
2131 rtx rot1_1
= gen_reg_rtx (word_mode
);
2132 rtx rot1_2
= gen_reg_rtx (word_mode
);
2133 rtx rot2_1
= gen_reg_rtx (word_mode
);
2134 rtx rot2_2
= gen_reg_rtx (word_mode
);
2136 if (word_mode
== SImode
)
2138 emit_insn (gen_cmpbsi3 (cmpb_diff
, tmp_reg_src1
, tmp_reg_src2
));
2139 emit_insn (gen_movsi (zero_reg
, GEN_INT (0)));
2140 emit_insn (gen_cmpbsi3 (cmpb_zero
, tmp_reg_src1
, zero_reg
));
2141 emit_insn (gen_one_cmplsi2 (cmpb_diff
,cmpb_diff
));
2142 emit_insn (gen_iorsi3 (cmpb_diff
, cmpb_diff
, cmpb_zero
));
2143 emit_insn (gen_clzsi2 (rot_amt
, cmpb_diff
));
2144 emit_insn (gen_addsi3 (rot_amt
, rot_amt
, GEN_INT (8)));
2145 emit_insn (gen_rotlsi3 (rot1_1
, tmp_reg_src1
,
2146 gen_lowpart (SImode
, rot_amt
)));
2147 emit_insn (gen_andsi3_mask (rot1_2
, rot1_1
, GEN_INT (0xff)));
2148 emit_insn (gen_rotlsi3 (rot2_1
, tmp_reg_src2
,
2149 gen_lowpart (SImode
, rot_amt
)));
2150 emit_insn (gen_andsi3_mask (rot2_2
, rot2_1
, GEN_INT (0xff)));
2151 emit_insn (gen_subsi3 (result_reg
, rot1_2
, rot2_2
));
2155 emit_insn (gen_cmpbdi3 (cmpb_diff
, tmp_reg_src1
, tmp_reg_src2
));
2156 emit_insn (gen_movdi (zero_reg
, GEN_INT (0)));
2157 emit_insn (gen_cmpbdi3 (cmpb_zero
, tmp_reg_src1
, zero_reg
));
2158 emit_insn (gen_one_cmpldi2 (cmpb_diff
,cmpb_diff
));
2159 emit_insn (gen_iordi3 (cmpb_diff
, cmpb_diff
, cmpb_zero
));
2160 emit_insn (gen_clzdi2 (rot_amt
, cmpb_diff
));
2161 emit_insn (gen_adddi3 (rot_amt
, rot_amt
, GEN_INT (8)));
2162 emit_insn (gen_rotldi3 (rot1_1
, tmp_reg_src1
,
2163 gen_lowpart (SImode
, rot_amt
)));
2164 emit_insn (gen_anddi3_mask (rot1_2
, rot1_1
, GEN_INT (0xff)));
2165 emit_insn (gen_rotldi3 (rot2_1
, tmp_reg_src2
,
2166 gen_lowpart (SImode
, rot_amt
)));
2167 emit_insn (gen_anddi3_mask (rot2_2
, rot2_1
, GEN_INT (0xff)));
2168 emit_insn (gen_subdi3 (result_reg
, rot1_2
, rot2_2
));
2171 emit_label (final_move_label
);
2172 emit_insn (gen_movsi (target
,
2173 gen_lowpart (SImode
, result_reg
)));
2174 emit_label (final_label
);
2178 /* Expand a block move operation, and return 1 if successful. Return 0
2179 if we should let the compiler generate normal code.
2181 operands[0] is the destination
2182 operands[1] is the source
2183 operands[2] is the length
2184 operands[3] is the alignment */
2186 #define MAX_MOVE_REG 4
2189 expand_block_move (rtx operands
[])
2191 rtx orig_dest
= operands
[0];
2192 rtx orig_src
= operands
[1];
2193 rtx bytes_rtx
= operands
[2];
2194 rtx align_rtx
= operands
[3];
2195 int constp
= (GET_CODE (bytes_rtx
) == CONST_INT
);
2200 rtx stores
[MAX_MOVE_REG
];
2203 /* If this is not a fixed size move, just call memcpy */
2207 /* This must be a fixed size alignment */
2208 gcc_assert (GET_CODE (align_rtx
) == CONST_INT
);
2209 align
= INTVAL (align_rtx
) * BITS_PER_UNIT
;
2211 /* Anything to move? */
2212 bytes
= INTVAL (bytes_rtx
);
2216 if (bytes
> rs6000_block_move_inline_limit
)
2219 for (offset
= 0; bytes
> 0; offset
+= move_bytes
, bytes
-= move_bytes
)
2222 rtx (*movmemsi
) (rtx
, rtx
, rtx
, rtx
);
2223 rtx (*mov
) (rtx
, rtx
);
2225 machine_mode mode
= BLKmode
;
2228 /* Altivec first, since it will be faster than a string move
2229 when it applies, and usually not significantly larger. */
2230 if (TARGET_ALTIVEC
&& bytes
>= 16 && (TARGET_EFFICIENT_UNALIGNED_VSX
|| align
>= 128))
2234 gen_func
.mov
= gen_movv4si
;
2236 else if (bytes
>= 8 && TARGET_POWERPC64
2237 && (align
>= 64 || !STRICT_ALIGNMENT
))
2241 gen_func
.mov
= gen_movdi
;
2242 if (offset
== 0 && align
< 64)
2246 /* If the address form is reg+offset with offset not a
2247 multiple of four, reload into reg indirect form here
2248 rather than waiting for reload. This way we get one
2249 reload, not one per load and/or store. */
2250 addr
= XEXP (orig_dest
, 0);
2251 if ((GET_CODE (addr
) == PLUS
|| GET_CODE (addr
) == LO_SUM
)
2252 && GET_CODE (XEXP (addr
, 1)) == CONST_INT
2253 && (INTVAL (XEXP (addr
, 1)) & 3) != 0)
2255 addr
= copy_addr_to_reg (addr
);
2256 orig_dest
= replace_equiv_address (orig_dest
, addr
);
2258 addr
= XEXP (orig_src
, 0);
2259 if ((GET_CODE (addr
) == PLUS
|| GET_CODE (addr
) == LO_SUM
)
2260 && GET_CODE (XEXP (addr
, 1)) == CONST_INT
2261 && (INTVAL (XEXP (addr
, 1)) & 3) != 0)
2263 addr
= copy_addr_to_reg (addr
);
2264 orig_src
= replace_equiv_address (orig_src
, addr
);
2268 else if (bytes
>= 4 && (align
>= 32 || !STRICT_ALIGNMENT
))
2269 { /* move 4 bytes */
2272 gen_func
.mov
= gen_movsi
;
2274 else if (bytes
>= 2 && (align
>= 16 || !STRICT_ALIGNMENT
))
2275 { /* move 2 bytes */
2278 gen_func
.mov
= gen_movhi
;
2280 else /* move 1 byte at a time */
2284 gen_func
.mov
= gen_movqi
;
2287 src
= adjust_address (orig_src
, mode
, offset
);
2288 dest
= adjust_address (orig_dest
, mode
, offset
);
2290 if (mode
!= BLKmode
)
2292 rtx tmp_reg
= gen_reg_rtx (mode
);
2294 emit_insn ((*gen_func
.mov
) (tmp_reg
, src
));
2295 stores
[num_reg
++] = (*gen_func
.mov
) (dest
, tmp_reg
);
2298 if (mode
== BLKmode
|| num_reg
>= MAX_MOVE_REG
|| bytes
== move_bytes
)
2301 for (i
= 0; i
< num_reg
; i
++)
2302 emit_insn (stores
[i
]);
2306 if (mode
== BLKmode
)
2308 /* Move the address into scratch registers. The movmemsi
2309 patterns require zero offset. */
2310 if (!REG_P (XEXP (src
, 0)))
2312 rtx src_reg
= copy_addr_to_reg (XEXP (src
, 0));
2313 src
= replace_equiv_address (src
, src_reg
);
2315 set_mem_size (src
, move_bytes
);
2317 if (!REG_P (XEXP (dest
, 0)))
2319 rtx dest_reg
= copy_addr_to_reg (XEXP (dest
, 0));
2320 dest
= replace_equiv_address (dest
, dest_reg
);
2322 set_mem_size (dest
, move_bytes
);
2324 emit_insn ((*gen_func
.movmemsi
) (dest
, src
,
2325 GEN_INT (move_bytes
& 31),