1 /* GIMPLE store merging and byte swapping passes.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 /* The purpose of the store merging pass is to combine multiple memory
22 stores of constant values, values loaded from memory or bitwise operations
23 on those to consecutive memory locations into fewer wider stores.
24   For example, if we have a sequence performing four byte stores to
25 consecutive memory locations:
26 [p ] := imm1;
27 [p + 1B] := imm2;
28 [p + 2B] := imm3;
29 [p + 3B] := imm4;
30 we can transform this into a single 4-byte store if the target supports it:
31 [p] := imm1:imm2:imm3:imm4 //concatenated immediates according to endianness.
33 Or:
34 [p ] := [q ];
35 [p + 1B] := [q + 1B];
36 [p + 2B] := [q + 2B];
37 [p + 3B] := [q + 3B];
38   if there is no overlap, this can be transformed into a single 4-byte
39   load followed by a single 4-byte store.
41 Or:
42 [p ] := [q ] ^ imm1;
43 [p + 1B] := [q + 1B] ^ imm2;
44 [p + 2B] := [q + 2B] ^ imm3;
45 [p + 3B] := [q + 3B] ^ imm4;
46   if there is no overlap, this can be transformed into a single 4-byte
47   load, XORed with imm1:imm2:imm3:imm4 and stored using a single 4-byte store.
49 The algorithm is applied to each basic block in three phases:
51 1) Scan through the basic block recording assignments to
52 destinations that can be expressed as a store to memory of a certain size
53 at a certain bit offset from expressions we can handle. For bit-fields
54 we also note the surrounding bit region, bits that could be stored in
55 a read-modify-write operation when storing the bit-field. Record store
56 chains to different bases in a hash_map (m_stores) and make sure to
57   terminate such chains when appropriate (for example when the stored
58 values get used subsequently).
59 These stores can be a result of structure element initializers, array stores
60 etc. A store_immediate_info object is recorded for every such store.
61 Record as many such assignments to a single base as possible until a
62 statement that interferes with the store sequence is encountered.
63 Each store has up to 2 operands, which can be an immediate constant
64 or a memory load, from which the value to be stored can be computed.
65 At most one of the operands can be a constant. The operands are recorded
66 in store_operand_info struct.
68 2) Analyze the chain of stores recorded in phase 1) (i.e. the vector of
69 store_immediate_info objects) and coalesce contiguous stores into
70   merged_store_group objects. For bit-field stores, we don't need to
71   require the stores to be contiguous; only their surrounding bit regions
72 have to be contiguous. If the expression being stored is different
73 between adjacent stores, such as one store storing a constant and
74   the following one storing a value loaded from memory, or if the loaded memory
75 objects are not adjacent, a new merged_store_group is created as well.
77 For example, given the stores:
78 [p ] := 0;
79 [p + 1B] := 1;
80 [p + 3B] := 0;
81 [p + 4B] := 1;
82 [p + 5B] := 0;
83 [p + 6B] := 0;
84 This phase would produce two merged_store_group objects, one recording the
85 two bytes stored in the memory region [p : p + 1] and another
86 recording the four bytes stored in the memory region [p + 3 : p + 6].
88 3) The merged_store_group objects produced in phase 2) are processed
89 to generate the sequence of wider stores that set the contiguous memory
90   regions to the sequence of bytes that corresponds to them. This may emit
91 multiple stores per store group to handle contiguous stores that are not
92 of a size that is a power of 2. For example it can try to emit a 40-bit
93 store as a 32-bit store followed by an 8-bit store.
94 We try to emit as wide stores as we can while respecting STRICT_ALIGNMENT or
95 TARGET_SLOW_UNALIGNED_ACCESS rules.
97 Note on endianness and example:
98 Consider 2 contiguous 16-bit stores followed by 2 contiguous 8-bit stores:
99 [p ] := 0x1234;
100 [p + 2B] := 0x5678;
101 [p + 4B] := 0xab;
102 [p + 5B] := 0xcd;
104 The memory layout for little-endian (LE) and big-endian (BE) must be:
105 p |LE|BE|
106 ---------
107 0 |34|12|
108 1 |12|34|
109 2 |78|56|
110 3 |56|78|
111 4 |ab|ab|
112 5 |cd|cd|
114 To merge these into a single 48-bit merged value 'val' in phase 2)
115 on little-endian we insert stores to higher (consecutive) bitpositions
116 into the most significant bits of the merged value.
117 The final merged value would be: 0xcdab56781234
119 For big-endian we insert stores to higher bitpositions into the least
120 significant bits of the merged value.
121 The final merged value would be: 0x12345678abcd
123 Then, in phase 3), we want to emit this 48-bit value as a 32-bit store
124 followed by a 16-bit store. Again, we must consider endianness when
125 breaking down the 48-bit value 'val' computed above.
126 For little endian we emit:
127 [p] (32-bit) := 0x56781234; // val & 0x0000ffffffff;
128 [p + 4B] (16-bit) := 0xcdab; // (val & 0xffff00000000) >> 32;
130 Whereas for big-endian we emit:
131 [p] (32-bit) := 0x12345678; // (val & 0xffffffff0000) >> 16;
132 [p + 4B] (16-bit) := 0xabcd; // val & 0x00000000ffff; */
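/* Illustrative sketch (not part of the pass): the kind of source code the
   pass targets and the result it aims for.  The function names and the
   merged form are hypothetical and assume a little-endian target that
   allows unaligned 4-byte stores.  */
#if 0
void
example_before (unsigned char *p)
{
  p[0] = 0x01;	/* [p     ] := imm1 */
  p[1] = 0x02;	/* [p + 1B] := imm2 */
  p[2] = 0x03;	/* [p + 2B] := imm3 */
  p[3] = 0x04;	/* [p + 3B] := imm4 */
}

/* What the pass effectively emits: a single 4-byte store of the immediates
   concatenated according to endianness (0x04030201 on little-endian).  */
void
example_after (unsigned char *p)
{
  *(unsigned int *) p = 0x04030201;
}
#endif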
134 #include "config.h"
135 #include "system.h"
136 #include "coretypes.h"
137 #include "backend.h"
138 #include "tree.h"
139 #include "gimple.h"
140 #include "builtins.h"
141 #include "fold-const.h"
142 #include "tree-pass.h"
143 #include "ssa.h"
144 #include "gimple-pretty-print.h"
145 #include "alias.h"
146 #include "fold-const.h"
147 #include "params.h"
148 #include "print-tree.h"
149 #include "tree-hash-traits.h"
150 #include "gimple-iterator.h"
151 #include "gimplify.h"
152 #include "stor-layout.h"
153 #include "timevar.h"
154 #include "tree-cfg.h"
155 #include "tree-eh.h"
156 #include "target.h"
157 #include "gimplify-me.h"
158 #include "rtl.h"
159 #include "expr.h" /* For get_bit_range. */
160 #include "optabs-tree.h"
161 #include "selftest.h"
163 /* The maximum size (in bits) of the stores this pass should generate. */
164 #define MAX_STORE_BITSIZE (BITS_PER_WORD)
165 #define MAX_STORE_BYTES (MAX_STORE_BITSIZE / BITS_PER_UNIT)
167 /* Limit to bound the number of aliasing checks for loads with the same
168 vuse as the corresponding store. */
169 #define MAX_STORE_ALIAS_CHECKS 64
171 namespace {
173 struct bswap_stat
175 /* Number of hand-written 16-bit nop / bswaps found. */
176 int found_16bit;
178 /* Number of hand-written 32-bit nop / bswaps found. */
179 int found_32bit;
181 /* Number of hand-written 64-bit nop / bswaps found. */
182 int found_64bit;
183 } nop_stats, bswap_stats;
185 /* A symbolic number structure is used to detect byte permutation and selection
186 patterns of a source. To achieve that, its field N contains an artificial
187   number consisting of BITS_PER_MARKER sized markers tracking where each
188   byte comes from in the source:
190 0 - target byte has the value 0
191   FF - target byte has an unknown value (e.g. due to sign extension)
192 1..size - marker value is the byte index in the source (0 for lsb).
194 To detect permutations on memory sources (arrays and structures), a symbolic
195 number is also associated:
196 - a base address BASE_ADDR and an OFFSET giving the address of the source;
197 - a range which gives the difference between the highest and lowest accessed
198 memory location to make such a symbolic number;
199 - the address SRC of the source element of lowest address as a convenience
200 to easily get BASE_ADDR + offset + lowest bytepos;
201   - the number of expressions N_OPS bitwise ORed together, to represent the
202   approximate cost of the computation.
204 Note 1: the range is different from size as size reflects the size of the
205 type of the current expression. For instance, for an array char a[],
206 (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
207 (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
208 time a range of 1.
210 Note 2: for non-memory sources, range holds the same value as size.
212 Note 3: SRC points to the SSA_NAME in case of non-memory source. */
214 struct symbolic_number {
215 uint64_t n;
216 tree type;
217 tree base_addr;
218 tree offset;
219 poly_int64_pod bytepos;
220 tree src;
221 tree alias_set;
222 tree vuse;
223 unsigned HOST_WIDE_INT range;
224 int n_ops;
227 #define BITS_PER_MARKER 8
228 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
229 #define MARKER_BYTE_UNKNOWN MARKER_MASK
230 #define HEAD_MARKER(n, size) \
231 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
233 /* The number which the find_bswap_or_nop_1 result should match in
234 order to have a nop. The number is masked according to the size of
235 the symbolic number before using it. */
236 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
237 (uint64_t)0x08070605 << 32 | 0x04030201)
239 /* The number which the find_bswap_or_nop_1 result should match in
240 order to have a byte swap. The number is masked according to the
241 size of the symbolic number before using it. */
242 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
243 (uint64_t)0x01020304 << 32 | 0x05060708)
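/* Illustrative sketch (a hypothetical function, assuming a 32-bit unsigned
   int and BITS_PER_UNIT == 8): a hand-written byte swap that this pass
   recognizes.  find_bswap_or_nop_1 starts from the markers 0x04030201
   (CMPNOP masked to 4 bytes) and the shifts, masks and ORs below permute
   them into 0x01020304, which is CMPXCHG adjusted for a 4-byte value, so
   the expression is reported as a bswap.  */
#if 0
static unsigned int
example_bswap32 (unsigned int x)
{
  return (x >> 24)
	 | ((x >> 8) & 0x0000ff00)
	 | ((x << 8) & 0x00ff0000)
	 | (x << 24);
}
#endif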
245 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
246 number N. Return false if the requested operation is not permitted
247 on a symbolic number. */
249 inline bool
250 do_shift_rotate (enum tree_code code,
251 struct symbolic_number *n,
252 int count)
254 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
255 unsigned head_marker;
257 if (count % BITS_PER_UNIT != 0)
258 return false;
259 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
261 /* Zero out the extra bits of N in order to avoid them being shifted
262 into the significant bits. */
263 if (size < 64 / BITS_PER_MARKER)
264 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
266 switch (code)
268 case LSHIFT_EXPR:
269 n->n <<= count;
270 break;
271 case RSHIFT_EXPR:
272 head_marker = HEAD_MARKER (n->n, size);
273 n->n >>= count;
274 /* Arithmetic shift of signed type: result is dependent on the value. */
275 if (!TYPE_UNSIGNED (n->type) && head_marker)
276 for (i = 0; i < count / BITS_PER_MARKER; i++)
277 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
278 << ((size - 1 - i) * BITS_PER_MARKER);
279 break;
280 case LROTATE_EXPR:
281 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
282 break;
283 case RROTATE_EXPR:
284 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
285 break;
286 default:
287 return false;
289 /* Zero unused bits for size. */
290 if (size < 64 / BITS_PER_MARKER)
291 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
292 return true;
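/* A worked example of the marker arithmetic above, with assumed values for
   a 4-byte type and BITS_PER_UNIT == 8: starting from n->n == 0x04030201,
   an RSHIFT_EXPR by 8 bits yields 0x00040302 for an unsigned type (the low
   marker is discarded and a zero marker shifts in), but 0xff040302 for a
   signed type whose head marker is set, because the arithmetic shift makes
   the vacated byte MARKER_BYTE_UNKNOWN.  */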
295 /* Perform sanity checking for the symbolic number N and the gimple
296 statement STMT. */
298 inline bool
299 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
301 tree lhs_type;
303 lhs_type = gimple_expr_type (stmt);
305 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
306 return false;
308 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
309 return false;
311 return true;
314 /* Initialize the symbolic number N for the bswap pass from the base element
315 SRC manipulated by the bitwise OR expression. */
317 bool
318 init_symbolic_number (struct symbolic_number *n, tree src)
320 int size;
322 if (! INTEGRAL_TYPE_P (TREE_TYPE (src)))
323 return false;
325 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
326 n->src = src;
328 /* Set up the symbolic number N by setting each byte to a value between 1 and
329   the byte size of rhs1. The highest order byte is set to that size and the
330 lowest order byte to 1. */
331 n->type = TREE_TYPE (src);
332 size = TYPE_PRECISION (n->type);
333 if (size % BITS_PER_UNIT != 0)
334 return false;
335 size /= BITS_PER_UNIT;
336 if (size > 64 / BITS_PER_MARKER)
337 return false;
338 n->range = size;
339 n->n = CMPNOP;
340 n->n_ops = 1;
342 if (size < 64 / BITS_PER_MARKER)
343 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
345 return true;
348   /* Check if STMT might be a byte swap or a nop from a memory source and return
349 the answer. If so, REF is that memory source and the base of the memory area
350 accessed and the offset of the access from that base are recorded in N. */
352 bool
353 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
355 /* Leaf node is an array or component ref. Memorize its base and
356 offset from base to compare to other such leaf node. */
357 poly_int64 bitsize, bitpos, bytepos;
358 machine_mode mode;
359 int unsignedp, reversep, volatilep;
360 tree offset, base_addr;
362 /* Not prepared to handle PDP endian. */
363 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
364 return false;
366 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
367 return false;
369 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
370 &unsignedp, &reversep, &volatilep);
372 if (TREE_CODE (base_addr) == TARGET_MEM_REF)
373 /* Do not rewrite TARGET_MEM_REF. */
374 return false;
375 else if (TREE_CODE (base_addr) == MEM_REF)
377 offset_int bit_offset = 0;
378 tree off = TREE_OPERAND (base_addr, 1);
380 if (!integer_zerop (off))
382 offset_int boff, coff = mem_ref_offset (base_addr);
383 boff = coff << LOG2_BITS_PER_UNIT;
384 bit_offset += boff;
387 base_addr = TREE_OPERAND (base_addr, 0);
389 /* Avoid returning a negative bitpos as this may wreak havoc later. */
390 if (wi::neg_p (bit_offset))
392 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
393 offset_int tem = wi::bit_and_not (bit_offset, mask);
394 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
395   Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
396 bit_offset -= tem;
397 tem >>= LOG2_BITS_PER_UNIT;
398 if (offset)
399 offset = size_binop (PLUS_EXPR, offset,
400 wide_int_to_tree (sizetype, tem));
401 else
402 offset = wide_int_to_tree (sizetype, tem);
405 bitpos += bit_offset.to_shwi ();
407 else
408 base_addr = build_fold_addr_expr (base_addr);
410 if (!multiple_p (bitpos, BITS_PER_UNIT, &bytepos))
411 return false;
412 if (!multiple_p (bitsize, BITS_PER_UNIT))
413 return false;
414 if (reversep)
415 return false;
417 if (!init_symbolic_number (n, ref))
418 return false;
419 n->base_addr = base_addr;
420 n->offset = offset;
421 n->bytepos = bytepos;
422 n->alias_set = reference_alias_ptr_type (ref);
423 n->vuse = gimple_vuse (stmt);
424 return true;
427 /* Compute the symbolic number N representing the result of a bitwise OR on 2
428   symbolic numbers N1 and N2 whose source statements are respectively
429 SOURCE_STMT1 and SOURCE_STMT2. */
431 gimple *
432 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
433 gimple *source_stmt2, struct symbolic_number *n2,
434 struct symbolic_number *n)
436 int i, size;
437 uint64_t mask;
438 gimple *source_stmt;
439 struct symbolic_number *n_start;
441 tree rhs1 = gimple_assign_rhs1 (source_stmt1);
442 if (TREE_CODE (rhs1) == BIT_FIELD_REF
443 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
444 rhs1 = TREE_OPERAND (rhs1, 0);
445 tree rhs2 = gimple_assign_rhs1 (source_stmt2);
446 if (TREE_CODE (rhs2) == BIT_FIELD_REF
447 && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME)
448 rhs2 = TREE_OPERAND (rhs2, 0);
450   /* Sources are different, cancel bswap if they are not memory locations with
451 the same base (array, structure, ...). */
452 if (rhs1 != rhs2)
454 uint64_t inc;
455 HOST_WIDE_INT start1, start2, start_sub, end_sub, end1, end2, end;
456 struct symbolic_number *toinc_n_ptr, *n_end;
457 basic_block bb1, bb2;
459 if (!n1->base_addr || !n2->base_addr
460 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
461 return NULL;
463 if (!n1->offset != !n2->offset
464 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
465 return NULL;
467 start1 = 0;
468 if (!(n2->bytepos - n1->bytepos).is_constant (&start2))
469 return NULL;
471 if (start1 < start2)
473 n_start = n1;
474 start_sub = start2 - start1;
476 else
478 n_start = n2;
479 start_sub = start1 - start2;
482 bb1 = gimple_bb (source_stmt1);
483 bb2 = gimple_bb (source_stmt2);
484 if (dominated_by_p (CDI_DOMINATORS, bb1, bb2))
485 source_stmt = source_stmt1;
486 else
487 source_stmt = source_stmt2;
489 /* Find the highest address at which a load is performed and
490 compute related info. */
491 end1 = start1 + (n1->range - 1);
492 end2 = start2 + (n2->range - 1);
493 if (end1 < end2)
495 end = end2;
496 end_sub = end2 - end1;
498 else
500 end = end1;
501 end_sub = end1 - end2;
503 n_end = (end2 > end1) ? n2 : n1;
505 /* Find symbolic number whose lsb is the most significant. */
506 if (BYTES_BIG_ENDIAN)
507 toinc_n_ptr = (n_end == n1) ? n2 : n1;
508 else
509 toinc_n_ptr = (n_start == n1) ? n2 : n1;
511 n->range = end - MIN (start1, start2) + 1;
513 /* Check that the range of memory covered can be represented by
514 a symbolic number. */
515 if (n->range > 64 / BITS_PER_MARKER)
516 return NULL;
518 /* Reinterpret byte marks in symbolic number holding the value of
519 bigger weight according to target endianness. */
520 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
521 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
522 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
524 unsigned marker
525 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
526 if (marker && marker != MARKER_BYTE_UNKNOWN)
527 toinc_n_ptr->n += inc;
530 else
532 n->range = n1->range;
533 n_start = n1;
534 source_stmt = source_stmt1;
537 if (!n1->alias_set
538 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
539 n->alias_set = n1->alias_set;
540 else
541 n->alias_set = ptr_type_node;
542 n->vuse = n_start->vuse;
543 n->base_addr = n_start->base_addr;
544 n->offset = n_start->offset;
545 n->src = n_start->src;
546 n->bytepos = n_start->bytepos;
547 n->type = n_start->type;
548 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
550 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
552 uint64_t masked1, masked2;
554 masked1 = n1->n & mask;
555 masked2 = n2->n & mask;
556 if (masked1 && masked2 && masked1 != masked2)
557 return NULL;
559 n->n = n1->n | n2->n;
560 n->n_ops = n1->n_ops + n2->n_ops;
562 return source_stmt;
565 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
566 the operation given by the rhs of STMT on the result. If the operation
567 could successfully be executed the function returns a gimple stmt whose
568 rhs's first tree is the expression of the source operand and NULL
569 otherwise. */
571 gimple *
572 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
574 enum tree_code code;
575 tree rhs1, rhs2 = NULL;
576 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
577 enum gimple_rhs_class rhs_class;
579 if (!limit || !is_gimple_assign (stmt))
580 return NULL;
582 rhs1 = gimple_assign_rhs1 (stmt);
584 if (find_bswap_or_nop_load (stmt, rhs1, n))
585 return stmt;
587 /* Handle BIT_FIELD_REF. */
588 if (TREE_CODE (rhs1) == BIT_FIELD_REF
589 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
591 unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1));
592 unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2));
593 if (bitpos % BITS_PER_UNIT == 0
594 && bitsize % BITS_PER_UNIT == 0
595 && init_symbolic_number (n, TREE_OPERAND (rhs1, 0)))
597 /* Handle big-endian bit numbering in BIT_FIELD_REF. */
598 if (BYTES_BIG_ENDIAN)
599 bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize;
601 /* Shift. */
602 if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos))
603 return NULL;
605 /* Mask. */
606 uint64_t mask = 0;
607 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
608 for (unsigned i = 0; i < bitsize / BITS_PER_UNIT;
609 i++, tmp <<= BITS_PER_UNIT)
610 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
611 n->n &= mask;
613 /* Convert. */
614 n->type = TREE_TYPE (rhs1);
615 if (!n->base_addr)
616 n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
618 return verify_symbolic_number_p (n, stmt) ? stmt : NULL;
621 return NULL;
624 if (TREE_CODE (rhs1) != SSA_NAME)
625 return NULL;
627 code = gimple_assign_rhs_code (stmt);
628 rhs_class = gimple_assign_rhs_class (stmt);
629 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
631 if (rhs_class == GIMPLE_BINARY_RHS)
632 rhs2 = gimple_assign_rhs2 (stmt);
634 /* Handle unary rhs and binary rhs with integer constants as second
635 operand. */
637 if (rhs_class == GIMPLE_UNARY_RHS
638 || (rhs_class == GIMPLE_BINARY_RHS
639 && TREE_CODE (rhs2) == INTEGER_CST))
641 if (code != BIT_AND_EXPR
642 && code != LSHIFT_EXPR
643 && code != RSHIFT_EXPR
644 && code != LROTATE_EXPR
645 && code != RROTATE_EXPR
646 && !CONVERT_EXPR_CODE_P (code))
647 return NULL;
649 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
651 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
652 we have to initialize the symbolic number. */
653 if (!source_stmt1)
655 if (gimple_assign_load_p (stmt)
656 || !init_symbolic_number (n, rhs1))
657 return NULL;
658 source_stmt1 = stmt;
661 switch (code)
663 case BIT_AND_EXPR:
665 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
666 uint64_t val = int_cst_value (rhs2), mask = 0;
667 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
669 /* Only constants masking full bytes are allowed. */
670 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
671 if ((val & tmp) != 0 && (val & tmp) != tmp)
672 return NULL;
673 else if (val & tmp)
674 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
676 n->n &= mask;
678 break;
679 case LSHIFT_EXPR:
680 case RSHIFT_EXPR:
681 case LROTATE_EXPR:
682 case RROTATE_EXPR:
683 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
684 return NULL;
685 break;
686 CASE_CONVERT:
688 int i, type_size, old_type_size;
689 tree type;
691 type = gimple_expr_type (stmt);
692 type_size = TYPE_PRECISION (type);
693 if (type_size % BITS_PER_UNIT != 0)
694 return NULL;
695 type_size /= BITS_PER_UNIT;
696 if (type_size > 64 / BITS_PER_MARKER)
697 return NULL;
699 /* Sign extension: result is dependent on the value. */
700 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
701 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
702 && HEAD_MARKER (n->n, old_type_size))
703 for (i = 0; i < type_size - old_type_size; i++)
704 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
705 << ((type_size - 1 - i) * BITS_PER_MARKER);
707 if (type_size < 64 / BITS_PER_MARKER)
709 /* If STMT casts to a smaller type mask out the bits not
710 belonging to the target type. */
711 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
713 n->type = type;
714 if (!n->base_addr)
715 n->range = type_size;
717 break;
718 default:
719 return NULL;
721 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
724 /* Handle binary rhs. */
726 if (rhs_class == GIMPLE_BINARY_RHS)
728 struct symbolic_number n1, n2;
729 gimple *source_stmt, *source_stmt2;
731 if (code != BIT_IOR_EXPR)
732 return NULL;
734 if (TREE_CODE (rhs2) != SSA_NAME)
735 return NULL;
737 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
739 switch (code)
741 case BIT_IOR_EXPR:
742 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
744 if (!source_stmt1)
745 return NULL;
747 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
749 if (!source_stmt2)
750 return NULL;
752 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
753 return NULL;
755 if (n1.vuse != n2.vuse)
756 return NULL;
758 source_stmt
759 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
761 if (!source_stmt)
762 return NULL;
764 if (!verify_symbolic_number_p (n, stmt))
765 return NULL;
767 break;
768 default:
769 return NULL;
771 return source_stmt;
773 return NULL;
776 /* Helper for find_bswap_or_nop and try_coalesce_bswap to compute
777 *CMPXCHG, *CMPNOP and adjust *N. */
779 void
780 find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
781 uint64_t *cmpnop)
783 unsigned rsize;
784 uint64_t tmpn, mask;
786 /* The number which the find_bswap_or_nop_1 result should match in order
787 to have a full byte swap. The number is shifted to the right
788 according to the size of the symbolic number before using it. */
789 *cmpxchg = CMPXCHG;
790 *cmpnop = CMPNOP;
792 /* Find real size of result (highest non-zero byte). */
793 if (n->base_addr)
794 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
795 else
796 rsize = n->range;
798 /* Zero out the bits corresponding to untouched bytes in original gimple
799 expression. */
800 if (n->range < (int) sizeof (int64_t))
802 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
803 *cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
804 *cmpnop &= mask;
807 /* Zero out the bits corresponding to unused bytes in the result of the
808 gimple expression. */
809 if (rsize < n->range)
811 if (BYTES_BIG_ENDIAN)
813 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
814 *cmpxchg &= mask;
815 *cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
817 else
819 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
820 *cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
821 *cmpnop &= mask;
823 n->range = rsize;
826 n->range *= BITS_PER_UNIT;
829 /* Check if STMT completes a bswap implementation or a read in a given
830 endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
831 accordingly. It also sets N to represent the kind of operations
832 performed: size of the resulting expression and whether it works on
833   a memory source, and if so its alias-set and vuse. Finally, the
834 function returns a stmt whose rhs's first tree is the source
835 expression. */
837 gimple *
838 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
840 /* The last parameter determines the depth search limit. It usually
841 correlates directly to the number n of bytes to be touched. We
842 increase that number by log2(n) + 1 here in order to also
843 cover signed -> unsigned conversions of the src operand as can be seen
844 in libgcc, and for initial shift/and operation of the src operand. */
845 int limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
846 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
847 gimple *ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
849 if (!ins_stmt)
850 return NULL;
852 uint64_t cmpxchg, cmpnop;
853 find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop);
855   /* A complete byte swap should make the symbolic number start with
856 the largest digit in the highest order byte. Unchanged symbolic
857 number indicates a read with same endianness as target architecture. */
858 if (n->n == cmpnop)
859 *bswap = false;
860 else if (n->n == cmpxchg)
861 *bswap = true;
862 else
863 return NULL;
865 /* Useless bit manipulation performed by code. */
866 if (!n->base_addr && n->n == cmpnop && n->n_ops == 1)
867 return NULL;
869 return ins_stmt;
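/* Illustrative sketch (a hypothetical helper) of a memory-sourced pattern
   find_bswap_or_nop recognizes.  On a little-endian target the symbolic
   number matches CMPNOP and *BSWAP is set to false, so the expression
   becomes a plain 32-bit load; on a big-endian target it matches CMPXCHG
   and *BSWAP is set to true, giving a load followed by a bswap.  */
#if 0
static unsigned int
example_read_le32 (const unsigned char *p)
{
  return (unsigned int) p[0]
	 | ((unsigned int) p[1] << 8)
	 | ((unsigned int) p[2] << 16)
	 | ((unsigned int) p[3] << 24);
}
#endif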
872 const pass_data pass_data_optimize_bswap =
874 GIMPLE_PASS, /* type */
875 "bswap", /* name */
876 OPTGROUP_NONE, /* optinfo_flags */
877 TV_NONE, /* tv_id */
878 PROP_ssa, /* properties_required */
879 0, /* properties_provided */
880 0, /* properties_destroyed */
881 0, /* todo_flags_start */
882 0, /* todo_flags_finish */
885 class pass_optimize_bswap : public gimple_opt_pass
887 public:
888 pass_optimize_bswap (gcc::context *ctxt)
889 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
892 /* opt_pass methods: */
893 virtual bool gate (function *)
895 return flag_expensive_optimizations && optimize && BITS_PER_UNIT == 8;
898 virtual unsigned int execute (function *);
900 }; // class pass_optimize_bswap
902 /* Perform the bswap optimization: replace the expression computed in the rhs
903 of gsi_stmt (GSI) (or if NULL add instead of replace) by an equivalent
904 bswap, load or load + bswap expression.
905   Which of these alternatives replaces the rhs is given by N->base_addr (non
906   null if a load is needed) and BSWAP. The type, VUSE and alias-set of the
907   load to perform are also given in N while the builtin bswap invocation is given
908   in FNDECL. Finally, if a load is involved, INS_STMT refers to one of the
909 load statements involved to construct the rhs in gsi_stmt (GSI) and
910 N->range gives the size of the rhs expression for maintaining some
911 statistics.
913   Note that if the replacement involves a load and if gsi_stmt (GSI) is
914   non-NULL, that stmt is moved just after INS_STMT to do the load with the
915   same VUSE, which can lead to gsi_stmt (GSI) changing basic block. */
917 tree
918 bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
919 tree bswap_type, tree load_type, struct symbolic_number *n,
920 bool bswap)
922 tree src, tmp, tgt = NULL_TREE;
923 gimple *bswap_stmt;
925 gimple *cur_stmt = gsi_stmt (gsi);
926 src = n->src;
927 if (cur_stmt)
928 tgt = gimple_assign_lhs (cur_stmt);
930 /* Need to load the value from memory first. */
931 if (n->base_addr)
933 gimple_stmt_iterator gsi_ins = gsi;
934 if (ins_stmt)
935 gsi_ins = gsi_for_stmt (ins_stmt);
936 tree addr_expr, addr_tmp, val_expr, val_tmp;
937 tree load_offset_ptr, aligned_load_type;
938 gimple *load_stmt;
939 unsigned align = get_object_alignment (src);
940 poly_int64 load_offset = 0;
942 if (cur_stmt)
944 basic_block ins_bb = gimple_bb (ins_stmt);
945 basic_block cur_bb = gimple_bb (cur_stmt);
946 if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
947 return NULL_TREE;
949   /* Move cur_stmt just before one of the loads of the original
950 to ensure it has the same VUSE. See PR61517 for what could
951 go wrong. */
952 if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
953 reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
954 gsi_move_before (&gsi, &gsi_ins);
955 gsi = gsi_for_stmt (cur_stmt);
957 else
958 gsi = gsi_ins;
960 /* Compute address to load from and cast according to the size
961 of the load. */
962 addr_expr = build_fold_addr_expr (src);
963 if (is_gimple_mem_ref_addr (addr_expr))
964 addr_tmp = unshare_expr (addr_expr);
965 else
967 addr_tmp = unshare_expr (n->base_addr);
968 if (!is_gimple_mem_ref_addr (addr_tmp))
969 addr_tmp = force_gimple_operand_gsi_1 (&gsi, addr_tmp,
970 is_gimple_mem_ref_addr,
971 NULL_TREE, true,
972 GSI_SAME_STMT);
973 load_offset = n->bytepos;
974 if (n->offset)
976 tree off
977 = force_gimple_operand_gsi (&gsi, unshare_expr (n->offset),
978 true, NULL_TREE, true,
979 GSI_SAME_STMT);
980 gimple *stmt
981 = gimple_build_assign (make_ssa_name (TREE_TYPE (addr_tmp)),
982 POINTER_PLUS_EXPR, addr_tmp, off);
983 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
984 addr_tmp = gimple_assign_lhs (stmt);
988 /* Perform the load. */
989 aligned_load_type = load_type;
990 if (align < TYPE_ALIGN (load_type))
991 aligned_load_type = build_aligned_type (load_type, align);
992 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
993 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
994 load_offset_ptr);
996 if (!bswap)
998 if (n->range == 16)
999 nop_stats.found_16bit++;
1000 else if (n->range == 32)
1001 nop_stats.found_32bit++;
1002 else
1004 gcc_assert (n->range == 64);
1005 nop_stats.found_64bit++;
1008 /* Convert the result of load if necessary. */
1009 if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), load_type))
1011 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
1012 "load_dst");
1013 load_stmt = gimple_build_assign (val_tmp, val_expr);
1014 gimple_set_vuse (load_stmt, n->vuse);
1015 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
1016 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
1017 update_stmt (cur_stmt);
1019 else if (cur_stmt)
1021 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
1022 gimple_set_vuse (cur_stmt, n->vuse);
1023 update_stmt (cur_stmt);
1025 else
1027 tgt = make_ssa_name (load_type);
1028 cur_stmt = gimple_build_assign (tgt, MEM_REF, val_expr);
1029 gimple_set_vuse (cur_stmt, n->vuse);
1030 gsi_insert_before (&gsi, cur_stmt, GSI_SAME_STMT);
1033 if (dump_file)
1035 fprintf (dump_file,
1036 "%d bit load in target endianness found at: ",
1037 (int) n->range);
1038 print_gimple_stmt (dump_file, cur_stmt, 0);
1040 return tgt;
1042 else
1044 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
1045 load_stmt = gimple_build_assign (val_tmp, val_expr);
1046 gimple_set_vuse (load_stmt, n->vuse);
1047 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
1049 src = val_tmp;
1051 else if (!bswap)
1053 gimple *g = NULL;
1054 if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src)))
1056 if (!is_gimple_val (src))
1057 return NULL_TREE;
1058 g = gimple_build_assign (tgt, NOP_EXPR, src);
1060 else if (cur_stmt)
1061 g = gimple_build_assign (tgt, src);
1062 else
1063 tgt = src;
1064 if (n->range == 16)
1065 nop_stats.found_16bit++;
1066 else if (n->range == 32)
1067 nop_stats.found_32bit++;
1068 else
1070 gcc_assert (n->range == 64);
1071 nop_stats.found_64bit++;
1073 if (dump_file)
1075 fprintf (dump_file,
1076 "%d bit reshuffle in target endianness found at: ",
1077 (int) n->range);
1078 if (cur_stmt)
1079 print_gimple_stmt (dump_file, cur_stmt, 0);
1080 else
1082 print_generic_expr (dump_file, tgt, 0);
1083 fprintf (dump_file, "\n");
1086 if (cur_stmt)
1087 gsi_replace (&gsi, g, true);
1088 return tgt;
1090 else if (TREE_CODE (src) == BIT_FIELD_REF)
1091 src = TREE_OPERAND (src, 0);
1093 if (n->range == 16)
1094 bswap_stats.found_16bit++;
1095 else if (n->range == 32)
1096 bswap_stats.found_32bit++;
1097 else
1099 gcc_assert (n->range == 64);
1100 bswap_stats.found_64bit++;
1103 tmp = src;
1105 /* Convert the src expression if necessary. */
1106 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
1108 gimple *convert_stmt;
1110 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
1111 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
1112 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1115   /* Canonical form for a 16-bit bswap is a rotate expression. Only 16-bit values
1116   are considered, as rotation of 2N-bit values by N bits is generally not
1117 equivalent to a bswap. Consider for instance 0x01020304 r>> 16 which
1118 gives 0x03040102 while a bswap for that value is 0x04030201. */
1119 if (bswap && n->range == 16)
1121 tree count = build_int_cst (NULL, BITS_PER_UNIT);
1122 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
1123 bswap_stmt = gimple_build_assign (NULL, src);
1125 else
1126 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
1128 if (tgt == NULL_TREE)
1129 tgt = make_ssa_name (bswap_type);
1130 tmp = tgt;
1132 /* Convert the result if necessary. */
1133 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
1135 gimple *convert_stmt;
1137 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
1138 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
1139 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1142 gimple_set_lhs (bswap_stmt, tmp);
1144 if (dump_file)
1146 fprintf (dump_file, "%d bit bswap implementation found at: ",
1147 (int) n->range);
1148 if (cur_stmt)
1149 print_gimple_stmt (dump_file, cur_stmt, 0);
1150 else
1152 print_generic_expr (dump_file, tgt, 0);
1153 fprintf (dump_file, "\n");
1157 if (cur_stmt)
1159 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
1160 gsi_remove (&gsi, true);
1162 else
1163 gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT);
1164 return tgt;
1167   /* Find manual byte swap implementations as well as loads in a given
1168   endianness. Byte swaps are turned into a bswap builtin invocation
1169   while endian loads are converted to a bswap builtin invocation or a
1170 simple load according to the target endianness. */
1172 unsigned int
1173 pass_optimize_bswap::execute (function *fun)
1175 basic_block bb;
1176 bool bswap32_p, bswap64_p;
1177 bool changed = false;
1178 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1180 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1181 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1182 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1183 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1184 || (bswap32_p && word_mode == SImode)));
1186 /* Determine the argument type of the builtins. The code later on
1187 assumes that the return and argument type are the same. */
1188 if (bswap32_p)
1190 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1191 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1194 if (bswap64_p)
1196 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1197 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1200 memset (&nop_stats, 0, sizeof (nop_stats));
1201 memset (&bswap_stats, 0, sizeof (bswap_stats));
1202 calculate_dominance_info (CDI_DOMINATORS);
1204 FOR_EACH_BB_FN (bb, fun)
1206 gimple_stmt_iterator gsi;
1208 /* We do a reverse scan for bswap patterns to make sure we get the
1209 widest match. As bswap pattern matching doesn't handle previously
1210 inserted smaller bswap replacements as sub-patterns, the wider
1211 variant wouldn't be detected. */
1212 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
1214 gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
1215 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
1216 enum tree_code code;
1217 struct symbolic_number n;
1218 bool bswap;
1220 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
1221 might be moved to a different basic block by bswap_replace and gsi
1222   must not point to it if that's the case. Moving the gsi_prev
1223   there makes sure that gsi points to the statement previous to
1224 cur_stmt while still making sure that all statements are
1225 considered in this basic block. */
1226 gsi_prev (&gsi);
1228 if (!is_gimple_assign (cur_stmt))
1229 continue;
1231 code = gimple_assign_rhs_code (cur_stmt);
1232 switch (code)
1234 case LROTATE_EXPR:
1235 case RROTATE_EXPR:
1236 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
1237 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
1238 % BITS_PER_UNIT)
1239 continue;
1240 /* Fall through. */
1241 case BIT_IOR_EXPR:
1242 break;
1243 default:
1244 continue;
1247 ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
1249 if (!ins_stmt)
1250 continue;
1252 switch (n.range)
1254 case 16:
1255 /* Already in canonical form, nothing to do. */
1256 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
1257 continue;
1258 load_type = bswap_type = uint16_type_node;
1259 break;
1260 case 32:
1261 load_type = uint32_type_node;
1262 if (bswap32_p)
1264 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1265 bswap_type = bswap32_type;
1267 break;
1268 case 64:
1269 load_type = uint64_type_node;
1270 if (bswap64_p)
1272 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1273 bswap_type = bswap64_type;
1275 break;
1276 default:
1277 continue;
1280 if (bswap && !fndecl && n.range != 16)
1281 continue;
1283 if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
1284 bswap_type, load_type, &n, bswap))
1285 changed = true;
1289 statistics_counter_event (fun, "16-bit nop implementations found",
1290 nop_stats.found_16bit);
1291 statistics_counter_event (fun, "32-bit nop implementations found",
1292 nop_stats.found_32bit);
1293 statistics_counter_event (fun, "64-bit nop implementations found",
1294 nop_stats.found_64bit);
1295 statistics_counter_event (fun, "16-bit bswap implementations found",
1296 bswap_stats.found_16bit);
1297 statistics_counter_event (fun, "32-bit bswap implementations found",
1298 bswap_stats.found_32bit);
1299 statistics_counter_event (fun, "64-bit bswap implementations found",
1300 bswap_stats.found_64bit);
1302 return (changed ? TODO_update_ssa : 0);
1305 } // anon namespace
1307 gimple_opt_pass *
1308 make_pass_optimize_bswap (gcc::context *ctxt)
1310 return new pass_optimize_bswap (ctxt);
1313 namespace {
1315 /* Struct recording one operand for the store, which is either a constant,
1316 then VAL represents the constant and all the other fields are zero,
1317 or a memory load, then VAL represents the reference, BASE_ADDR is non-NULL
1318 and the other fields also reflect the memory load. */
1320 struct store_operand_info
1322 tree val;
1323 tree base_addr;
1324 poly_uint64 bitsize;
1325 poly_uint64 bitpos;
1326 poly_uint64 bitregion_start;
1327 poly_uint64 bitregion_end;
1328 gimple *stmt;
1329 bool bit_not_p;
1330 store_operand_info ();
1333 store_operand_info::store_operand_info ()
1334 : val (NULL_TREE), base_addr (NULL_TREE), bitsize (0), bitpos (0),
1335 bitregion_start (0), bitregion_end (0), stmt (NULL), bit_not_p (false)
1339 /* Struct recording the information about a single store of an immediate
1340 to memory. These are created in the first phase and coalesced into
1341 merged_store_group objects in the second phase. */
1343 struct store_immediate_info
1345 unsigned HOST_WIDE_INT bitsize;
1346 unsigned HOST_WIDE_INT bitpos;
1347 unsigned HOST_WIDE_INT bitregion_start;
1348 /* This is one past the last bit of the bit region. */
1349 unsigned HOST_WIDE_INT bitregion_end;
1350 gimple *stmt;
1351 unsigned int order;
1352 /* INTEGER_CST for constant stores, MEM_REF for memory copy or
1353 BIT_*_EXPR for logical bitwise operation.
1354 LROTATE_EXPR if it can be only bswap optimized and
1355 ops are not really meaningful.
1356 NOP_EXPR if bswap optimization detected identity, ops
1357 are not meaningful. */
1358 enum tree_code rhs_code;
1359 /* Two fields for bswap optimization purposes. */
1360 struct symbolic_number n;
1361 gimple *ins_stmt;
1362 /* True if BIT_{AND,IOR,XOR}_EXPR result is inverted before storing. */
1363 bool bit_not_p;
1364 /* True if ops have been swapped and thus ops[1] represents
1365 rhs1 of BIT_{AND,IOR,XOR}_EXPR and ops[0] represents rhs2. */
1366 bool ops_swapped_p;
1367 /* Operands. For BIT_*_EXPR rhs_code both operands are used, otherwise
1368 just the first one. */
1369 store_operand_info ops[2];
1370 store_immediate_info (unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
1371 unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
1372 gimple *, unsigned int, enum tree_code,
1373 struct symbolic_number &, gimple *, bool,
1374 const store_operand_info &,
1375 const store_operand_info &);
1378 store_immediate_info::store_immediate_info (unsigned HOST_WIDE_INT bs,
1379 unsigned HOST_WIDE_INT bp,
1380 unsigned HOST_WIDE_INT brs,
1381 unsigned HOST_WIDE_INT bre,
1382 gimple *st,
1383 unsigned int ord,
1384 enum tree_code rhscode,
1385 struct symbolic_number &nr,
1386 gimple *ins_stmtp,
1387 bool bitnotp,
1388 const store_operand_info &op0r,
1389 const store_operand_info &op1r)
1390 : bitsize (bs), bitpos (bp), bitregion_start (brs), bitregion_end (bre),
1391 stmt (st), order (ord), rhs_code (rhscode), n (nr),
1392 ins_stmt (ins_stmtp), bit_not_p (bitnotp), ops_swapped_p (false)
1393 #if __cplusplus >= 201103L
1394 , ops { op0r, op1r }
1397 #else
1399 ops[0] = op0r;
1400 ops[1] = op1r;
1402 #endif
1404 /* Struct representing a group of stores to contiguous memory locations.
1405 These are produced by the second phase (coalescing) and consumed in the
1406 third phase that outputs the widened stores. */
1408 struct merged_store_group
1410 unsigned HOST_WIDE_INT start;
1411 unsigned HOST_WIDE_INT width;
1412 unsigned HOST_WIDE_INT bitregion_start;
1413 unsigned HOST_WIDE_INT bitregion_end;
1414 /* The size of the allocated memory for val and mask. */
1415 unsigned HOST_WIDE_INT buf_size;
1416 unsigned HOST_WIDE_INT align_base;
1417 poly_uint64 load_align_base[2];
1419 unsigned int align;
1420 unsigned int load_align[2];
1421 unsigned int first_order;
1422 unsigned int last_order;
1424 auto_vec<store_immediate_info *> stores;
1425 /* We record the first and last original statements in the sequence because
1426 we'll need their vuse/vdef and replacement position. It's easier to keep
1427 track of them separately as 'stores' is reordered by apply_stores. */
1428 gimple *last_stmt;
1429 gimple *first_stmt;
1430 unsigned char *val;
1431 unsigned char *mask;
1433 merged_store_group (store_immediate_info *);
1434 ~merged_store_group ();
1435 void merge_into (store_immediate_info *);
1436 void merge_overlapping (store_immediate_info *);
1437 bool apply_stores ();
1438 private:
1439 void do_merge (store_immediate_info *);
1442 /* Debug helper. Dump LEN elements of byte array PTR to FD in hex. */
1444 static void
1445 dump_char_array (FILE *fd, unsigned char *ptr, unsigned int len)
1447 if (!fd)
1448 return;
1450 for (unsigned int i = 0; i < len; i++)
1451 fprintf (fd, "%x ", ptr[i]);
1452 fprintf (fd, "\n");
1455 /* Shift left the bytes in PTR of SZ elements by AMNT bits, carrying over the
1456 bits between adjacent elements. AMNT should be within
1457 [0, BITS_PER_UNIT).
1458 Example, AMNT = 2:
1459 00011111|11100000 << 2 = 01111111|10000000
1460 PTR[1] | PTR[0] PTR[1] | PTR[0]. */
1462 static void
1463 shift_bytes_in_array (unsigned char *ptr, unsigned int sz, unsigned int amnt)
1465 if (amnt == 0)
1466 return;
1468 unsigned char carry_over = 0U;
1469 unsigned char carry_mask = (~0U) << (unsigned char) (BITS_PER_UNIT - amnt);
1470 unsigned char clear_mask = (~0U) << amnt;
1472 for (unsigned int i = 0; i < sz; i++)
1474 unsigned prev_carry_over = carry_over;
1475 carry_over = (ptr[i] & carry_mask) >> (BITS_PER_UNIT - amnt);
1477 ptr[i] <<= amnt;
1478 if (i != 0)
1480 ptr[i] &= clear_mask;
1481 ptr[i] |= prev_carry_over;
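/* Hypothetical self-check of the example in the comment above, assuming
   BITS_PER_UNIT == 8; not part of the pass.  */
#if 0
static void
example_shift_bytes_in_array (void)
{
  unsigned char buf[2] = { 0xe0, 0x1f };	/* 11100000 | 00011111 */
  shift_bytes_in_array (buf, 2, 2);
  gcc_assert (buf[0] == 0x80 && buf[1] == 0x7f);	/* 10000000 | 01111111 */
}
#endif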
1486 /* Like shift_bytes_in_array but for big-endian.
1487 Shift right the bytes in PTR of SZ elements by AMNT bits, carrying over the
1488 bits between adjacent elements. AMNT should be within
1489 [0, BITS_PER_UNIT).
1490 Example, AMNT = 2:
1491 00011111|11100000 >> 2 = 00000111|11111000
1492 PTR[0] | PTR[1] PTR[0] | PTR[1]. */
1494 static void
1495 shift_bytes_in_array_right (unsigned char *ptr, unsigned int sz,
1496 unsigned int amnt)
1498 if (amnt == 0)
1499 return;
1501 unsigned char carry_over = 0U;
1502 unsigned char carry_mask = ~(~0U << amnt);
1504 for (unsigned int i = 0; i < sz; i++)
1506 unsigned prev_carry_over = carry_over;
1507 carry_over = ptr[i] & carry_mask;
1509 carry_over <<= (unsigned char) BITS_PER_UNIT - amnt;
1510 ptr[i] >>= amnt;
1511 ptr[i] |= prev_carry_over;
1515 /* Clear out LEN bits starting from bit START in the byte array
1516 PTR. This clears the bits to the *right* from START.
1517 START must be within [0, BITS_PER_UNIT) and counts starting from
1518 the least significant bit. */
1520 static void
1521 clear_bit_region_be (unsigned char *ptr, unsigned int start,
1522 unsigned int len)
1524 if (len == 0)
1525 return;
1526 /* Clear len bits to the right of start. */
1527 else if (len <= start + 1)
1529 unsigned char mask = (~(~0U << len));
1530 mask = mask << (start + 1U - len);
1531 ptr[0] &= ~mask;
1533 else if (start != BITS_PER_UNIT - 1)
1535 clear_bit_region_be (ptr, start, (start % BITS_PER_UNIT) + 1);
1536 clear_bit_region_be (ptr + 1, BITS_PER_UNIT - 1,
1537 len - (start % BITS_PER_UNIT) - 1);
1539 else if (start == BITS_PER_UNIT - 1
1540 && len > BITS_PER_UNIT)
1542 unsigned int nbytes = len / BITS_PER_UNIT;
1543 memset (ptr, 0, nbytes);
1544 if (len % BITS_PER_UNIT != 0)
1545 clear_bit_region_be (ptr + nbytes, BITS_PER_UNIT - 1,
1546 len % BITS_PER_UNIT);
1548 else
1549 gcc_unreachable ();
1552 /* In the byte array PTR clear the bit region starting at bit
1553   START that is LEN bits wide.
1554 For regions spanning multiple bytes do this recursively until we reach
1555 zero LEN or a region contained within a single byte. */
1557 static void
1558 clear_bit_region (unsigned char *ptr, unsigned int start,
1559 unsigned int len)
1561 /* Degenerate base case. */
1562 if (len == 0)
1563 return;
1564 else if (start >= BITS_PER_UNIT)
1565 clear_bit_region (ptr + 1, start - BITS_PER_UNIT, len);
1566 /* Second base case. */
1567 else if ((start + len) <= BITS_PER_UNIT)
1569 unsigned char mask = (~0U) << (unsigned char) (BITS_PER_UNIT - len);
1570 mask >>= BITS_PER_UNIT - (start + len);
1572 ptr[0] &= ~mask;
1574 return;
1576 /* Clear most significant bits in a byte and proceed with the next byte. */
1577 else if (start != 0)
1579 clear_bit_region (ptr, start, BITS_PER_UNIT - start);
1580 clear_bit_region (ptr + 1, 0, len - (BITS_PER_UNIT - start));
1582 /* Whole bytes need to be cleared. */
1583 else if (start == 0 && len > BITS_PER_UNIT)
1585 unsigned int nbytes = len / BITS_PER_UNIT;
1586 /* We could recurse on each byte but we clear whole bytes, so a simple
1587 memset will do. */
1588 memset (ptr, '\0', nbytes);
1589 /* Clear the remaining sub-byte region if there is one. */
1590 if (len % BITS_PER_UNIT != 0)
1591 clear_bit_region (ptr + nbytes, 0, len % BITS_PER_UNIT);
1593 else
1594 gcc_unreachable ();
1597 /* Write BITLEN bits of EXPR to the byte array PTR at
1598 bit position BITPOS. PTR should contain TOTAL_BYTES elements.
1599 Return true if the operation succeeded. */
1601 static bool
1602 encode_tree_to_bitpos (tree expr, unsigned char *ptr, int bitlen, int bitpos,
1603 unsigned int total_bytes)
1605 unsigned int first_byte = bitpos / BITS_PER_UNIT;
1606 tree tmp_int = expr;
1607 bool sub_byte_op_p = ((bitlen % BITS_PER_UNIT)
1608 || (bitpos % BITS_PER_UNIT)
1609 || !int_mode_for_size (bitlen, 0).exists ());
1611 if (!sub_byte_op_p)
1612 return native_encode_expr (tmp_int, ptr + first_byte, total_bytes) != 0;
1614 /* LITTLE-ENDIAN
1615 We are writing a non byte-sized quantity or at a position that is not
1616 at a byte boundary.
1617 |--------|--------|--------| ptr + first_byte
1619 xxx xxxxxxxx xxx< bp>
1620 |______EXPR____|
1622 First native_encode_expr EXPR into a temporary buffer and shift each
1623 byte in the buffer by 'bp' (carrying the bits over as necessary).
1624 |00000000|00xxxxxx|xxxxxxxx| << bp = |000xxxxx|xxxxxxxx|xxx00000|
1625 <------bitlen---->< bp>
1626 Then we clear the destination bits:
1627 |---00000|00000000|000-----| ptr + first_byte
1628 <-------bitlen--->< bp>
1630 Finally we ORR the bytes of the shifted EXPR into the cleared region:
1631 |---xxxxx||xxxxxxxx||xxx-----| ptr + first_byte.
1633 BIG-ENDIAN
1634 We are writing a non byte-sized quantity or at a position that is not
1635 at a byte boundary.
1636 ptr + first_byte |--------|--------|--------|
1638 <bp >xxx xxxxxxxx xxx
1639 |_____EXPR_____|
1641 First native_encode_expr EXPR into a temporary buffer and shift each
1642   byte in the buffer to the right (carrying the bits over as necessary).
1643 We shift by as much as needed to align the most significant bit of EXPR
1644 with bitpos:
1645 |00xxxxxx|xxxxxxxx| >> 3 = |00000xxx|xxxxxxxx|xxxxx000|
1646 <---bitlen----> <bp ><-----bitlen----->
1647 Then we clear the destination bits:
1648 ptr + first_byte |-----000||00000000||00000---|
1649 <bp ><-------bitlen----->
1651 Finally we ORR the bytes of the shifted EXPR into the cleared region:
1652 ptr + first_byte |---xxxxx||xxxxxxxx||xxx-----|.
1653 The awkwardness comes from the fact that bitpos is counted from the
1654 most significant bit of a byte. */
1656 /* We must be dealing with fixed-size data at this point, since the
1657 total size is also fixed. */
1658 fixed_size_mode mode = as_a <fixed_size_mode> (TYPE_MODE (TREE_TYPE (expr)));
1659 /* Allocate an extra byte so that we have space to shift into. */
1660 unsigned int byte_size = GET_MODE_SIZE (mode) + 1;
1661 unsigned char *tmpbuf = XALLOCAVEC (unsigned char, byte_size);
1662 memset (tmpbuf, '\0', byte_size);
1663 /* The store detection code should only have allowed constants that are
1664 accepted by native_encode_expr. */
1665 if (native_encode_expr (expr, tmpbuf, byte_size - 1) == 0)
1666 gcc_unreachable ();
1668 /* The native_encode_expr machinery uses TYPE_MODE to determine how many
1669 bytes to write. This means it can write more than
1670 ROUND_UP (bitlen, BITS_PER_UNIT) / BITS_PER_UNIT bytes (for example
1671 write 8 bytes for a bitlen of 40). Skip the bytes that are not within
1672 bitlen and zero out the bits that are not relevant as well (that may
1673 contain a sign bit due to sign-extension). */
1674 unsigned int padding
1675 = byte_size - ROUND_UP (bitlen, BITS_PER_UNIT) / BITS_PER_UNIT - 1;
1676 /* On big-endian the padding is at the 'front' so just skip the initial
1677 bytes. */
1678 if (BYTES_BIG_ENDIAN)
1679 tmpbuf += padding;
1681 byte_size -= padding;
1683 if (bitlen % BITS_PER_UNIT != 0)
1685 if (BYTES_BIG_ENDIAN)
1686 clear_bit_region_be (tmpbuf, BITS_PER_UNIT - 1,
1687 BITS_PER_UNIT - (bitlen % BITS_PER_UNIT));
1688 else
1689 clear_bit_region (tmpbuf, bitlen,
1690 byte_size * BITS_PER_UNIT - bitlen);
1692 /* Left shifting relies on the last byte being clear if bitlen is
1693 a multiple of BITS_PER_UNIT, which might not be clear if
1694 there are padding bytes. */
1695 else if (!BYTES_BIG_ENDIAN)
1696 tmpbuf[byte_size - 1] = '\0';
1698 /* Clear the bit region in PTR where the bits from TMPBUF will be
1699 inserted into. */
1700 if (BYTES_BIG_ENDIAN)
1701 clear_bit_region_be (ptr + first_byte,
1702 BITS_PER_UNIT - 1 - (bitpos % BITS_PER_UNIT), bitlen);
1703 else
1704 clear_bit_region (ptr + first_byte, bitpos % BITS_PER_UNIT, bitlen);
1706 int shift_amnt;
1707 int bitlen_mod = bitlen % BITS_PER_UNIT;
1708 int bitpos_mod = bitpos % BITS_PER_UNIT;
1710 bool skip_byte = false;
1711 if (BYTES_BIG_ENDIAN)
1713 /* BITPOS and BITLEN are exactly aligned and no shifting
1714 is necessary. */
1715 if (bitpos_mod + bitlen_mod == BITS_PER_UNIT
1716 || (bitpos_mod == 0 && bitlen_mod == 0))
1717 shift_amnt = 0;
1718 /* |. . . . . . . .|
1719 <bp > <blen >.
1720 We always shift right for BYTES_BIG_ENDIAN so shift the beginning
1721 of the value until it aligns with 'bp' in the next byte over. */
1722 else if (bitpos_mod + bitlen_mod < BITS_PER_UNIT)
1724 shift_amnt = bitlen_mod + bitpos_mod;
1725 skip_byte = bitlen_mod != 0;
1727 /* |. . . . . . . .|
1728 <----bp--->
1729 <---blen---->.
1730 Shift the value right within the same byte so it aligns with 'bp'. */
1731 else
1732 shift_amnt = bitlen_mod + bitpos_mod - BITS_PER_UNIT;
1734 else
1735 shift_amnt = bitpos % BITS_PER_UNIT;
1737 /* Create the shifted version of EXPR. */
1738 if (!BYTES_BIG_ENDIAN)
1740 shift_bytes_in_array (tmpbuf, byte_size, shift_amnt);
1741 if (shift_amnt == 0)
1742 byte_size--;
1744 else
1746 gcc_assert (BYTES_BIG_ENDIAN);
1747 shift_bytes_in_array_right (tmpbuf, byte_size, shift_amnt);
1748 /* If shifting right forced us to move into the next byte skip the now
1749 empty byte. */
1750 if (skip_byte)
1752 tmpbuf++;
1753 byte_size--;
1757 /* Insert the bits from TMPBUF. */
1758 for (unsigned int i = 0; i < byte_size; i++)
1759 ptr[first_byte + i] |= tmpbuf[i];
1761 return true;
1764 /* Sorting function for store_immediate_info objects.
1765 Sorts them by bitposition. */
1767 static int
1768 sort_by_bitpos (const void *x, const void *y)
1770 store_immediate_info *const *tmp = (store_immediate_info * const *) x;
1771 store_immediate_info *const *tmp2 = (store_immediate_info * const *) y;
1773 if ((*tmp)->bitpos < (*tmp2)->bitpos)
1774 return -1;
1775 else if ((*tmp)->bitpos > (*tmp2)->bitpos)
1776 return 1;
1777 else
1778 /* If they are the same let's use the order which is guaranteed to
1779 be different. */
1780 return (*tmp)->order - (*tmp2)->order;
1783 /* Sorting function for store_immediate_info objects.
1784 Sorts them by the order field. */
1786 static int
1787 sort_by_order (const void *x, const void *y)
1789 store_immediate_info *const *tmp = (store_immediate_info * const *) x;
1790 store_immediate_info *const *tmp2 = (store_immediate_info * const *) y;
1792 if ((*tmp)->order < (*tmp2)->order)
1793 return -1;
1794 else if ((*tmp)->order > (*tmp2)->order)
1795 return 1;
1797 gcc_unreachable ();
1800 /* Initialize a merged_store_group object from a store_immediate_info
1801 object. */
1803 merged_store_group::merged_store_group (store_immediate_info *info)
1805 start = info->bitpos;
1806 width = info->bitsize;
1807 bitregion_start = info->bitregion_start;
1808 bitregion_end = info->bitregion_end;
1809 /* VAL has memory allocated for it in apply_stores once the group
1810 width has been finalized. */
1811 val = NULL;
1812 mask = NULL;
1813 unsigned HOST_WIDE_INT align_bitpos = 0;
1814 get_object_alignment_1 (gimple_assign_lhs (info->stmt),
1815 &align, &align_bitpos);
1816 align_base = start - align_bitpos;
1817 for (int i = 0; i < 2; ++i)
1819 store_operand_info &op = info->ops[i];
1820 if (op.base_addr == NULL_TREE)
1822 load_align[i] = 0;
1823 load_align_base[i] = 0;
1825 else
1827 get_object_alignment_1 (op.val, &load_align[i], &align_bitpos);
1828 load_align_base[i] = op.bitpos - align_bitpos;
1831 stores.create (1);
1832 stores.safe_push (info);
1833 last_stmt = info->stmt;
1834 last_order = info->order;
1835 first_stmt = last_stmt;
1836 first_order = last_order;
1837 buf_size = 0;
1840 merged_store_group::~merged_store_group ()
1842 if (val)
1843 XDELETEVEC (val);
1846 /* Helper method for merge_into and merge_overlapping to do
1847 the common part. */
1848 void
1849 merged_store_group::do_merge (store_immediate_info *info)
1851 bitregion_start = MIN (bitregion_start, info->bitregion_start);
1852 bitregion_end = MAX (bitregion_end, info->bitregion_end);
1854 unsigned int this_align;
1855 unsigned HOST_WIDE_INT align_bitpos = 0;
1856 get_object_alignment_1 (gimple_assign_lhs (info->stmt),
1857 &this_align, &align_bitpos);
1858 if (this_align > align)
1860 align = this_align;
1861 align_base = info->bitpos - align_bitpos;
1863 for (int i = 0; i < 2; ++i)
1865 store_operand_info &op = info->ops[i];
1866 if (!op.base_addr)
1867 continue;
1869 get_object_alignment_1 (op.val, &this_align, &align_bitpos);
1870 if (this_align > load_align[i])
1872 load_align[i] = this_align;
1873 load_align_base[i] = op.bitpos - align_bitpos;
1877 gimple *stmt = info->stmt;
1878 stores.safe_push (info);
1879 if (info->order > last_order)
1881 last_order = info->order;
1882 last_stmt = stmt;
1884 else if (info->order < first_order)
1886 first_order = info->order;
1887 first_stmt = stmt;
1891 /* Merge a store recorded by INFO into this merged store.
1892 The store is not overlapping with the existing recorded
1893 stores. */
1895 void
1896 merged_store_group::merge_into (store_immediate_info *info)
1898 unsigned HOST_WIDE_INT wid = info->bitsize;
1899 /* Make sure we're inserting in the position we think we're inserting. */
1900 gcc_assert (info->bitpos >= start + width
1901 && info->bitregion_start <= bitregion_end);
1903 width += wid;
1904 do_merge (info);
1907 /* Merge a store described by INFO into this merged store.
1908 INFO overlaps in some way with the current store (i.e. it is not contiguous,
1909 which is the case handled by merged_store_group::merge_into). */
1911 void
1912 merged_store_group::merge_overlapping (store_immediate_info *info)
1914 /* If the store extends the size of the group, extend the width. */
1915 if (info->bitpos + info->bitsize > start + width)
1916 width += info->bitpos + info->bitsize - (start + width);
1918 do_merge (info);
1921 /* Go through all the recorded stores in this group in program order and
1922 apply their values to the VAL byte array to create the final merged
1923 value. Return true if the operation succeeded. */
1925 bool
1926 merged_store_group::apply_stores ()
1928 /* Make sure we have more than one store in the group, otherwise we cannot
1929 merge anything. */
1930 if (bitregion_start % BITS_PER_UNIT != 0
1931 || bitregion_end % BITS_PER_UNIT != 0
1932 || stores.length () == 1)
1933 return false;
1935 stores.qsort (sort_by_order);
1936 store_immediate_info *info;
1937 unsigned int i;
1938 /* Create a buffer of a size that is 2 times the number of bytes we're
1939 storing. That way native_encode_expr can write power-of-2-sized
1940 chunks without overrunning. */
1941 buf_size = 2 * ((bitregion_end - bitregion_start) / BITS_PER_UNIT);
1942 val = XNEWVEC (unsigned char, 2 * buf_size);
1943 mask = val + buf_size;
1944 memset (val, 0, buf_size);
1945 memset (mask, ~0U, buf_size);
1947 FOR_EACH_VEC_ELT (stores, i, info)
1949 unsigned int pos_in_buffer = info->bitpos - bitregion_start;
1950 tree cst = NULL_TREE;
1951 if (info->ops[0].val && info->ops[0].base_addr == NULL_TREE)
1952 cst = info->ops[0].val;
1953 else if (info->ops[1].val && info->ops[1].base_addr == NULL_TREE)
1954 cst = info->ops[1].val;
1955 bool ret = true;
1956 if (cst)
1957 ret = encode_tree_to_bitpos (cst, val, info->bitsize,
1958 pos_in_buffer, buf_size);
1959 if (cst && dump_file && (dump_flags & TDF_DETAILS))
1961 if (ret)
1963 fprintf (dump_file, "After writing ");
1964 print_generic_expr (dump_file, cst, 0);
1965 fprintf (dump_file, " of size " HOST_WIDE_INT_PRINT_DEC
1966 " at position %d the merged region contains:\n",
1967 info->bitsize, pos_in_buffer);
1968 dump_char_array (dump_file, val, buf_size);
1970 else
1971 fprintf (dump_file, "Failed to merge stores\n");
1973 if (!ret)
1974 return false;
1975 unsigned char *m = mask + (pos_in_buffer / BITS_PER_UNIT);
1976 if (BYTES_BIG_ENDIAN)
1977 clear_bit_region_be (m, (BITS_PER_UNIT - 1
1978 - (pos_in_buffer % BITS_PER_UNIT)),
1979 info->bitsize);
1980 else
1981 clear_bit_region (m, pos_in_buffer % BITS_PER_UNIT, info->bitsize);
1983 stores.qsort (sort_by_bitpos);
1984 return true;
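/* Standalone sketch (whole bytes only, hypothetical helper, not the routine
   above) of the VAL/MASK convention: VAL accumulates the encoded bytes of
   every store in the group, while MASK keeps 0xff exactly in the bytes no
   store has written, i.e. the bytes that must be preserved from memory when
   the merged store is emitted.  */

#include <string.h>

static void
apply_whole_byte_store (unsigned char *val, unsigned char *mask,
			unsigned int pos, const unsigned char *bytes,
			unsigned int len)
{
  memcpy (val + pos, bytes, len);	/* Record the stored bytes.  */
  memset (mask + pos, 0, len);		/* Mark them as known.  */
}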
1987 /* Structure describing the store chain. */
1989 struct imm_store_chain_info
1991 /* Doubly-linked list that imposes an order on chain processing.
1992 PNXP (prev's next pointer) points to the head of a list, or to
1993 the next field in the previous chain in the list.
1994 See pass_store_merging::m_stores_head for more rationale. */
1995 imm_store_chain_info *next, **pnxp;
1996 tree base_addr;
1997 auto_vec<store_immediate_info *> m_store_info;
1998 auto_vec<merged_store_group *> m_merged_store_groups;
2000 imm_store_chain_info (imm_store_chain_info *&inspt, tree b_a)
2001 : next (inspt), pnxp (&inspt), base_addr (b_a)
2003 inspt = this;
2004 if (next)
2006 gcc_checking_assert (pnxp == next->pnxp);
2007 next->pnxp = &next;
2010 ~imm_store_chain_info ()
2012 *pnxp = next;
2013 if (next)
2015 gcc_checking_assert (&next == next->pnxp);
2016 next->pnxp = pnxp;
2019 bool terminate_and_process_chain ();
2020 bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int);
2021 bool coalesce_immediate_stores ();
2022 bool output_merged_store (merged_store_group *);
2023 bool output_merged_stores ();
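/* Standalone sketch (plain C, hypothetical NODE type) of the PNXP idiom used
   by the constructor and destructor above: every node stores the address of
   the pointer that points at it, so a node can unlink itself in constant
   time without knowing the list head.  */

struct node
{
  struct node *next, **pnxp;
};

static void
link_node (struct node **head, struct node *n)
{
  n->next = *head;
  n->pnxp = head;
  *head = n;
  if (n->next)
    n->next->pnxp = &n->next;
}

static void
unlink_node (struct node *n)
{
  *n->pnxp = n->next;
  if (n->next)
    n->next->pnxp = n->pnxp;
}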
2026 const pass_data pass_data_tree_store_merging = {
2027 GIMPLE_PASS, /* type */
2028 "store-merging", /* name */
2029 OPTGROUP_NONE, /* optinfo_flags */
2030 TV_GIMPLE_STORE_MERGING, /* tv_id */
2031 PROP_ssa, /* properties_required */
2032 0, /* properties_provided */
2033 0, /* properties_destroyed */
2034 0, /* todo_flags_start */
2035 TODO_update_ssa, /* todo_flags_finish */
2038 class pass_store_merging : public gimple_opt_pass
2040 public:
2041 pass_store_merging (gcc::context *ctxt)
2042 : gimple_opt_pass (pass_data_tree_store_merging, ctxt), m_stores_head ()
2046 /* Pass not supported for PDP-endianness, nor for insane hosts
2047 or target character sizes where native_{encode,interpret}_expr
2048 doesn't work properly. */
2049 virtual bool
2050 gate (function *)
2052 return flag_store_merging
2053 && WORDS_BIG_ENDIAN == BYTES_BIG_ENDIAN
2054 && CHAR_BIT == 8
2055 && BITS_PER_UNIT == 8;
2058 virtual unsigned int execute (function *);
2060 private:
2061 hash_map<tree_operand_hash, struct imm_store_chain_info *> m_stores;
2063 /* Form a doubly-linked stack of the elements of m_stores, so that
2064 we can iterate over them in a predictable way. Using this order
2065 avoids extraneous differences in the compiler output just because
2066 of tree pointer variations (e.g. different chains end up in
2067 different positions of m_stores, so they are handled in different
2068 orders, so they allocate or release SSA names in different
2069 orders, and when they get reused, subsequent passes end up
2070 getting different SSA names, which may ultimately change
2071 decisions when going out of SSA). */
2072 imm_store_chain_info *m_stores_head;
2074 void process_store (gimple *);
2075 bool terminate_and_process_all_chains ();
2076 bool terminate_all_aliasing_chains (imm_store_chain_info **, gimple *);
2077 bool terminate_and_release_chain (imm_store_chain_info *);
2078 }; // class pass_store_merging
2080 /* Terminate and process all recorded chains. Return true if any changes
2081 were made. */
2083 bool
2084 pass_store_merging::terminate_and_process_all_chains ()
2086 bool ret = false;
2087 while (m_stores_head)
2088 ret |= terminate_and_release_chain (m_stores_head);
2089 gcc_assert (m_stores.elements () == 0);
2090 gcc_assert (m_stores_head == NULL);
2092 return ret;
2095 /* Terminate all chains that are affected by the statement STMT.
2096 CHAIN_INFO is the chain we should exclude from the checks if
2097 non-NULL. */
2099 bool
2100 pass_store_merging::terminate_all_aliasing_chains (imm_store_chain_info
2101 **chain_info,
2102 gimple *stmt)
2104 bool ret = false;
2106 /* If the statement doesn't touch memory it can't alias. */
2107 if (!gimple_vuse (stmt))
2108 return false;
2110 tree store_lhs = gimple_store_p (stmt) ? gimple_get_lhs (stmt) : NULL_TREE;
2111 for (imm_store_chain_info *next = m_stores_head, *cur = next; cur; cur = next)
2113 next = cur->next;
2115 /* We already checked all the stores in chain_info and terminated the
2116 chain if necessary. Skip it here. */
2117 if (chain_info && *chain_info == cur)
2118 continue;
2120 store_immediate_info *info;
2121 unsigned int i;
2122 FOR_EACH_VEC_ELT (cur->m_store_info, i, info)
2124 tree lhs = gimple_assign_lhs (info->stmt);
2125 if (ref_maybe_used_by_stmt_p (stmt, lhs)
2126 || stmt_may_clobber_ref_p (stmt, lhs)
2127 || (store_lhs && refs_output_dependent_p (store_lhs, lhs)))
2129 if (dump_file && (dump_flags & TDF_DETAILS))
2131 fprintf (dump_file, "stmt causes chain termination:\n");
2132 print_gimple_stmt (dump_file, stmt, 0);
2134 terminate_and_release_chain (cur);
2135 ret = true;
2136 break;
2141 return ret;
2144 /* Helper function. Terminate the recorded chain storing to base object
2145 BASE. Return true if the merging and output was successful. The m_stores
2146 entry is removed after the processing in any case. */
2148 bool
2149 pass_store_merging::terminate_and_release_chain (imm_store_chain_info *chain_info)
2151 bool ret = chain_info->terminate_and_process_chain ();
2152 m_stores.remove (chain_info->base_addr);
2153 delete chain_info;
2154 return ret;
2157 /* Return true if stmts in between FIRST (inclusive) and LAST (exclusive)
2158 may clobber REF. FIRST and LAST must be in the same basic block and
2159 have non-NULL vdef. We want to be able to sink a load of REF across
2160 stores between FIRST and LAST, up to right before LAST. */
2162 bool
2163 stmts_may_clobber_ref_p (gimple *first, gimple *last, tree ref)
2165 ao_ref r;
2166 ao_ref_init (&r, ref);
2167 unsigned int count = 0;
2168 tree vop = gimple_vdef (last);
2169 gimple *stmt;
2171 gcc_checking_assert (gimple_bb (first) == gimple_bb (last));
2174 stmt = SSA_NAME_DEF_STMT (vop);
2175 if (stmt_may_clobber_ref_p_1 (stmt, &r))
2176 return true;
2177 if (gimple_store_p (stmt)
2178 && refs_anti_dependent_p (ref, gimple_get_lhs (stmt)))
2179 return true;
2180 /* Avoid quadratic compile time by bounding the number of checks
2181 we perform. */
2182 if (++count > MAX_STORE_ALIAS_CHECKS)
2183 return true;
2184 vop = gimple_vuse (stmt);
2186 while (stmt != first);
2187 return false;
2190 /* Return true if INFO->ops[IDX] is mergeable with the
2191 corresponding loads already in MERGED_STORE group.
2192 BASE_ADDR is the base address of the whole store group. */
2194 bool
2195 compatible_load_p (merged_store_group *merged_store,
2196 store_immediate_info *info,
2197 tree base_addr, int idx)
2199 store_immediate_info *infof = merged_store->stores[0];
2200 if (!info->ops[idx].base_addr
2201 || maybe_ne (info->ops[idx].bitpos - infof->ops[idx].bitpos,
2202 info->bitpos - infof->bitpos)
2203 || !operand_equal_p (info->ops[idx].base_addr,
2204 infof->ops[idx].base_addr, 0))
2205 return false;
2207 store_immediate_info *infol = merged_store->stores.last ();
2208 tree load_vuse = gimple_vuse (info->ops[idx].stmt);
2209 /* In this case all vuses should be the same, e.g.
2210 _1 = s.a; _2 = s.b; _3 = _1 | 1; t.a = _3; _4 = _2 | 2; t.b = _4;
2212 _1 = s.a; _2 = s.b; t.a = _1; t.b = _2;
2213 and we can emit the coalesced load next to any of those loads. */
2214 if (gimple_vuse (infof->ops[idx].stmt) == load_vuse
2215 && gimple_vuse (infol->ops[idx].stmt) == load_vuse)
2216 return true;
2218 /* Otherwise, at least for now require that the load has the same
2219 vuse as the store. See following examples. */
2220 if (gimple_vuse (info->stmt) != load_vuse)
2221 return false;
2223 if (gimple_vuse (infof->stmt) != gimple_vuse (infof->ops[idx].stmt)
2224 || (infof != infol
2225 && gimple_vuse (infol->stmt) != gimple_vuse (infol->ops[idx].stmt)))
2226 return false;
2228 /* If the load is from the same location as the store, the construction
2229 of the immediate chain info already guarantees that there are no
2230 intervening stores, so no further checks are needed. Example:
2231 _1 = s.a; _2 = _1 & -7; s.a = _2; _3 = s.b; _4 = _3 & -7; s.b = _4; */
2232 if (known_eq (info->ops[idx].bitpos, info->bitpos)
2233 && operand_equal_p (info->ops[idx].base_addr, base_addr, 0))
2234 return true;
2236 /* Otherwise, we need to punt if any of the loads can be clobbered by any
2237 of the stores in the group, or any other stores in between those.
2238 Previous calls to compatible_load_p ensured that for all the
2239 merged_store->stores IDX loads, no stmts starting with
2240 merged_store->first_stmt and ending right before merged_store->last_stmt
2241 clobbers those loads. */
2242 gimple *first = merged_store->first_stmt;
2243 gimple *last = merged_store->last_stmt;
2244 unsigned int i;
2245 store_immediate_info *infoc;
2246 /* The stores are sorted by increasing store bitpos, so if the info->stmt
2247 store comes before the first load seen so far, we'll be changing
2248 merged_store->first_stmt. In that case we need to give up if
2249 any of the earlier processed loads are clobbered by the stmts in the new
2250 range. */
2251 if (info->order < merged_store->first_order)
2253 FOR_EACH_VEC_ELT (merged_store->stores, i, infoc)
2254 if (stmts_may_clobber_ref_p (info->stmt, first, infoc->ops[idx].val))
2255 return false;
2256 first = info->stmt;
2258 /* Similarly, we could change merged_store->last_stmt, so ensure
2259 in that case no stmts in the new range clobber any of the earlier
2260 processed loads. */
2261 else if (info->order > merged_store->last_order)
2263 FOR_EACH_VEC_ELT (merged_store->stores, i, infoc)
2264 if (stmts_may_clobber_ref_p (last, info->stmt, infoc->ops[idx].val))
2265 return false;
2266 last = info->stmt;
2268 /* And finally, we'd be adding a new load to the set, ensure it isn't
2269 clobbered in the new range. */
2270 if (stmts_may_clobber_ref_p (first, last, info->ops[idx].val))
2271 return false;
2273 /* Otherwise, we are looking for:
2274 _1 = s.a; _2 = _1 ^ 15; t.a = _2; _3 = s.b; _4 = _3 ^ 15; t.b = _4;
2276 _1 = s.a; t.a = _1; _2 = s.b; t.b = _2; */
2277 return true;
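/* Illustrative input (hypothetical type, not part of this file) for the
   checks above: both byte copies load from and store to matching offsets of
   the same bases, so the group is expected to become one 2-byte load, one
   XOR with a combined constant and one 2-byte store.  */

struct pair { unsigned char a, b; };

void
copy_xor (struct pair *t, const struct pair *s)
{
  t->a = s->a ^ 1;
  t->b = s->b ^ 2;
}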
2280 /* Add all refs loaded to compute VAL to REFS vector. */
2282 void
2283 gather_bswap_load_refs (vec<tree> *refs, tree val)
2285 if (TREE_CODE (val) != SSA_NAME)
2286 return;
2288 gimple *stmt = SSA_NAME_DEF_STMT (val);
2289 if (!is_gimple_assign (stmt))
2290 return;
2292 if (gimple_assign_load_p (stmt))
2294 refs->safe_push (gimple_assign_rhs1 (stmt));
2295 return;
2298 switch (gimple_assign_rhs_class (stmt))
2300 case GIMPLE_BINARY_RHS:
2301 gather_bswap_load_refs (refs, gimple_assign_rhs2 (stmt));
2302 /* FALLTHRU */
2303 case GIMPLE_UNARY_RHS:
2304 gather_bswap_load_refs (refs, gimple_assign_rhs1 (stmt));
2305 break;
2306 default:
2307 gcc_unreachable ();
2311 /* Return true if m_store_info[first] and at least one following store
2312 form a group that stores a try_size bitsize value which is byte swapped
2313 from a memory load or some value, or is the identity of some value.
2314 This uses the bswap pass APIs. */
2316 bool
2317 imm_store_chain_info::try_coalesce_bswap (merged_store_group *merged_store,
2318 unsigned int first,
2319 unsigned int try_size)
2321 unsigned int len = m_store_info.length (), last = first;
2322 unsigned HOST_WIDE_INT width = m_store_info[first]->bitsize;
2323 if (width >= try_size)
2324 return false;
2325 for (unsigned int i = first + 1; i < len; ++i)
2327 if (m_store_info[i]->bitpos != m_store_info[first]->bitpos + width
2328 || m_store_info[i]->ins_stmt == NULL)
2329 return false;
2330 width += m_store_info[i]->bitsize;
2331 if (width >= try_size)
2333 last = i;
2334 break;
2337 if (width != try_size)
2338 return false;
2340 bool allow_unaligned
2341 = !STRICT_ALIGNMENT && PARAM_VALUE (PARAM_STORE_MERGING_ALLOW_UNALIGNED);
2342 /* Punt if the combined store would not be aligned and we need alignment. */
2343 if (!allow_unaligned)
2345 unsigned int align = merged_store->align;
2346 unsigned HOST_WIDE_INT align_base = merged_store->align_base;
2347 for (unsigned int i = first + 1; i <= last; ++i)
2349 unsigned int this_align;
2350 unsigned HOST_WIDE_INT align_bitpos = 0;
2351 get_object_alignment_1 (gimple_assign_lhs (m_store_info[i]->stmt),
2352 &this_align, &align_bitpos);
2353 if (this_align > align)
2355 align = this_align;
2356 align_base = m_store_info[i]->bitpos - align_bitpos;
2359 unsigned HOST_WIDE_INT align_bitpos
2360 = (m_store_info[first]->bitpos - align_base) & (align - 1);
2361 if (align_bitpos)
2362 align = least_bit_hwi (align_bitpos);
2363 if (align < try_size)
2364 return false;
2367 tree type;
2368 switch (try_size)
2370 case 16: type = uint16_type_node; break;
2371 case 32: type = uint32_type_node; break;
2372 case 64: type = uint64_type_node; break;
2373 default: gcc_unreachable ();
2375 struct symbolic_number n;
2376 gimple *ins_stmt = NULL;
2377 int vuse_store = -1;
2378 unsigned int first_order = merged_store->first_order;
2379 unsigned int last_order = merged_store->last_order;
2380 gimple *first_stmt = merged_store->first_stmt;
2381 gimple *last_stmt = merged_store->last_stmt;
2382 store_immediate_info *infof = m_store_info[first];
2384 for (unsigned int i = first; i <= last; ++i)
2386 store_immediate_info *info = m_store_info[i];
2387 struct symbolic_number this_n = info->n;
2388 this_n.type = type;
2389 if (!this_n.base_addr)
2390 this_n.range = try_size / BITS_PER_UNIT;
2391 else
2392 /* Update vuse in case it has been changed by output_merged_stores. */
2393 this_n.vuse = gimple_vuse (info->ins_stmt);
2394 unsigned int bitpos = info->bitpos - infof->bitpos;
2395 if (!do_shift_rotate (LSHIFT_EXPR, &this_n,
2396 BYTES_BIG_ENDIAN
2397 ? try_size - info->bitsize - bitpos
2398 : bitpos))
2399 return false;
2400 if (this_n.base_addr && vuse_store)
2402 unsigned int j;
2403 for (j = first; j <= last; ++j)
2404 if (this_n.vuse == gimple_vuse (m_store_info[j]->stmt))
2405 break;
2406 if (j > last)
2408 if (vuse_store == 1)
2409 return false;
2410 vuse_store = 0;
2413 if (i == first)
2415 n = this_n;
2416 ins_stmt = info->ins_stmt;
2418 else
2420 if (n.base_addr)
2422 if (n.vuse != this_n.vuse)
2424 if (vuse_store == 0)
2425 return false;
2426 vuse_store = 1;
2428 if (info->order > last_order)
2430 last_order = info->order;
2431 last_stmt = info->stmt;
2433 else if (info->order < first_order)
2435 first_order = info->order;
2436 first_stmt = info->stmt;
2440 ins_stmt = perform_symbolic_merge (ins_stmt, &n, info->ins_stmt,
2441 &this_n, &n);
2442 if (ins_stmt == NULL)
2443 return false;
2447 uint64_t cmpxchg, cmpnop;
2448 find_bswap_or_nop_finalize (&n, &cmpxchg, &cmpnop);
2450 /* A complete byte swap should make the symbolic number start with
2451 the largest digit in the highest order byte. An unchanged symbolic
2452 number indicates a read with the same endianness as the target architecture. */
2453 if (n.n != cmpnop && n.n != cmpxchg)
2454 return false;
2456 if (n.base_addr == NULL_TREE && !is_gimple_val (n.src))
2457 return false;
2459 /* Don't handle memory copy this way if normal non-bswap processing
2460 would handle it too. */
2461 if (n.n == cmpnop && (unsigned) n.n_ops == last - first + 1)
2463 unsigned int i;
2464 for (i = first; i <= last; ++i)
2465 if (m_store_info[i]->rhs_code != MEM_REF)
2466 break;
2467 if (i == last + 1)
2468 return false;
2471 if (n.n == cmpxchg)
2472 switch (try_size)
2474 case 16:
2475 /* Will emit LROTATE_EXPR. */
2476 break;
2477 case 32:
2478 if (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2479 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing)
2480 break;
2481 return false;
2482 case 64:
2483 if (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2484 && optab_handler (bswap_optab, DImode) != CODE_FOR_nothing)
2485 break;
2486 return false;
2487 default:
2488 gcc_unreachable ();
2491 if (!allow_unaligned && n.base_addr)
2493 unsigned int align = get_object_alignment (n.src);
2494 if (align < try_size)
2495 return false;
2498 /* If each load has the vuse of the corresponding store, we need to verify
2499 that the loads can be sunk right before the last store. */
2500 if (vuse_store == 1)
2502 auto_vec<tree, 64> refs;
2503 for (unsigned int i = first; i <= last; ++i)
2504 gather_bswap_load_refs (&refs,
2505 gimple_assign_rhs1 (m_store_info[i]->stmt));
2507 unsigned int i;
2508 tree ref;
2509 FOR_EACH_VEC_ELT (refs, i, ref)
2510 if (stmts_may_clobber_ref_p (first_stmt, last_stmt, ref))
2511 return false;
2512 n.vuse = NULL_TREE;
2515 infof->n = n;
2516 infof->ins_stmt = ins_stmt;
2517 for (unsigned int i = first; i <= last; ++i)
2519 m_store_info[i]->rhs_code = n.n == cmpxchg ? LROTATE_EXPR : NOP_EXPR;
2520 m_store_info[i]->ops[0].base_addr = NULL_TREE;
2521 m_store_info[i]->ops[1].base_addr = NULL_TREE;
2522 if (i != first)
2523 merged_store->merge_into (m_store_info[i]);
2526 return true;
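/* Illustrative input (not part of this file), a compilable version of the
   p[0] = data >> 24; ... pattern mentioned below: with optimization enabled
   the four byte stores are expected to be recognized through the bswap
   framework and emitted as a single 4-byte store, combined with a byte swap
   on little-endian targets.  */

#include <stdint.h>

void
store_be32 (unsigned char *p, uint32_t data)
{
  p[0] = data >> 24;
  p[1] = data >> 16;
  p[2] = data >> 8;
  p[3] = data;
}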
2529 /* Go through the candidate stores recorded in m_store_info and merge them
2530 into merged_store_group objects recorded into m_merged_store_groups
2531 representing the widened stores. Return true if coalescing was successful
2532 and the number of widened stores is fewer than the original number
2533 of stores. */
2535 bool
2536 imm_store_chain_info::coalesce_immediate_stores ()
2538 /* Anything less can't be processed. */
2539 if (m_store_info.length () < 2)
2540 return false;
2542 if (dump_file && (dump_flags & TDF_DETAILS))
2543 fprintf (dump_file, "Attempting to coalesce %u stores in chain.\n",
2544 m_store_info.length ());
2546 store_immediate_info *info;
2547 unsigned int i, ignore = 0;
2549 /* Order the stores by the bitposition they write to. */
2550 m_store_info.qsort (sort_by_bitpos);
2552 info = m_store_info[0];
2553 merged_store_group *merged_store = new merged_store_group (info);
2555 FOR_EACH_VEC_ELT (m_store_info, i, info)
2557 if (dump_file && (dump_flags & TDF_DETAILS))
2559 fprintf (dump_file, "Store %u:\nbitsize:" HOST_WIDE_INT_PRINT_DEC
2560 " bitpos:" HOST_WIDE_INT_PRINT_DEC " val:\n",
2561 i, info->bitsize, info->bitpos);
2562 print_generic_expr (dump_file, gimple_assign_rhs1 (info->stmt));
2563 fprintf (dump_file, "\n------------\n");
2566 if (i <= ignore)
2567 continue;
2569 /* First try to handle group of stores like:
2570 p[0] = data >> 24;
2571 p[1] = data >> 16;
2572 p[2] = data >> 8;
2573 p[3] = data;
2574 using the bswap framework. */
2575 if (info->bitpos == merged_store->start + merged_store->width
2576 && merged_store->stores.length () == 1
2577 && merged_store->stores[0]->ins_stmt != NULL
2578 && info->ins_stmt != NULL)
2580 unsigned int try_size;
2581 for (try_size = 64; try_size >= 16; try_size >>= 1)
2582 if (try_coalesce_bswap (merged_store, i - 1, try_size))
2583 break;
2585 if (try_size >= 16)
2587 ignore = i + merged_store->stores.length () - 1;
2588 m_merged_store_groups.safe_push (merged_store);
2589 if (ignore < m_store_info.length ())
2590 merged_store = new merged_store_group (m_store_info[ignore]);
2591 else
2592 merged_store = NULL;
2593 continue;
2597 /* |---store 1---|
2598 |---store 2---|
2599 Overlapping stores. */
2600 if (IN_RANGE (info->bitpos, merged_store->start,
2601 merged_store->start + merged_store->width - 1))
2603 /* Only allow overlapping stores of constants. */
2604 if (info->rhs_code == INTEGER_CST
2605 && merged_store->stores[0]->rhs_code == INTEGER_CST)
2607 merged_store->merge_overlapping (info);
2608 continue;
2611 /* |---store 1---||---store 2---|
2612 This store is consecutive to the previous one.
2613 Merge it into the current store group. There can be gaps in between
2614 the stores, but there can't be gaps in between bitregions. */
2615 else if (info->rhs_code != LROTATE_EXPR
2616 && info->bitregion_start <= merged_store->bitregion_end
2617 && info->rhs_code == merged_store->stores[0]->rhs_code)
2619 store_immediate_info *infof = merged_store->stores[0];
2621 /* All the rhs_code ops that take 2 operands are commutative;
2622 swap the operands if that could make the operands compatible. */
2623 if (infof->ops[0].base_addr
2624 && infof->ops[1].base_addr
2625 && info->ops[0].base_addr
2626 && info->ops[1].base_addr
2627 && known_eq (info->ops[1].bitpos - infof->ops[0].bitpos,
2628 info->bitpos - infof->bitpos)
2629 && operand_equal_p (info->ops[1].base_addr,
2630 infof->ops[0].base_addr, 0))
2632 std::swap (info->ops[0], info->ops[1]);
2633 info->ops_swapped_p = true;
2635 if ((infof->ops[0].base_addr
2636 ? compatible_load_p (merged_store, info, base_addr, 0)
2637 : !info->ops[0].base_addr)
2638 && (infof->ops[1].base_addr
2639 ? compatible_load_p (merged_store, info, base_addr, 1)
2640 : !info->ops[1].base_addr))
2642 merged_store->merge_into (info);
2643 continue;
2647 /* |---store 1---| <gap> |---store 2---|.
2648 Gap between stores or the rhs not compatible. Start a new group. */
2650 /* Try to apply all the stores recorded for the group to determine
2651 the bitpattern they write and discard it if that fails.
2652 This will also reject single-store groups. */
2653 if (!merged_store->apply_stores ())
2654 delete merged_store;
2655 else
2656 m_merged_store_groups.safe_push (merged_store);
2658 merged_store = new merged_store_group (info);
2661 /* Record or discard the last store group. */
2662 if (merged_store)
2664 if (!merged_store->apply_stores ())
2665 delete merged_store;
2666 else
2667 m_merged_store_groups.safe_push (merged_store);
2670 gcc_assert (m_merged_store_groups.length () <= m_store_info.length ());
2671 bool success
2672 = !m_merged_store_groups.is_empty ()
2673 && m_merged_store_groups.length () < m_store_info.length ();
2675 if (success && dump_file)
2676 fprintf (dump_file, "Coalescing successful!\n"
2677 "Merged into %u stores\n",
2678 m_merged_store_groups.length ());
2680 return success;
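/* Illustrative input (hypothetical union, not part of this file) for the
   overlapping case above: the two stores overlap in byte 1 but both store
   constants, so merge_overlapping is expected to combine them, with the
   later store in program order winning for the overlapping bits.  */

union halfword
{
  unsigned short hw;
  unsigned char b[2];
};

void
set_overlapping (union halfword *p)
{
  p->hw = 0;
  p->b[1] = 1;
}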
2683 /* Return the type to use for the merged stores or loads described by STMTS.
2684 This is needed to get the alias sets right. If IS_LOAD, look for rhs,
2685 otherwise lhs. Additionally set *CLIQUEP and *BASEP to MR_DEPENDENCE_*
2686 of the MEM_REFs if any. */
2688 static tree
2689 get_alias_type_for_stmts (vec<gimple *> &stmts, bool is_load,
2690 unsigned short *cliquep, unsigned short *basep)
2692 gimple *stmt;
2693 unsigned int i;
2694 tree type = NULL_TREE;
2695 tree ret = NULL_TREE;
2696 *cliquep = 0;
2697 *basep = 0;
2699 FOR_EACH_VEC_ELT (stmts, i, stmt)
2701 tree ref = is_load ? gimple_assign_rhs1 (stmt)
2702 : gimple_assign_lhs (stmt);
2703 tree type1 = reference_alias_ptr_type (ref);
2704 tree base = get_base_address (ref);
2706 if (i == 0)
2708 if (TREE_CODE (base) == MEM_REF)
2710 *cliquep = MR_DEPENDENCE_CLIQUE (base);
2711 *basep = MR_DEPENDENCE_BASE (base);
2713 ret = type = type1;
2714 continue;
2716 if (!alias_ptr_types_compatible_p (type, type1))
2717 ret = ptr_type_node;
2718 if (TREE_CODE (base) != MEM_REF
2719 || *cliquep != MR_DEPENDENCE_CLIQUE (base)
2720 || *basep != MR_DEPENDENCE_BASE (base))
2722 *cliquep = 0;
2723 *basep = 0;
2726 return ret;
2729 /* Return the location_t information we can find among the statements
2730 in STMTS. */
2732 static location_t
2733 get_location_for_stmts (vec<gimple *> &stmts)
2735 gimple *stmt;
2736 unsigned int i;
2738 FOR_EACH_VEC_ELT (stmts, i, stmt)
2739 if (gimple_has_location (stmt))
2740 return gimple_location (stmt);
2742 return UNKNOWN_LOCATION;
2745 /* Used to describe a store resulting from splitting a wide store into smaller
2746 regularly-sized stores in split_group. */
2748 struct split_store
2750 unsigned HOST_WIDE_INT bytepos;
2751 unsigned HOST_WIDE_INT size;
2752 unsigned HOST_WIDE_INT align;
2753 auto_vec<store_immediate_info *> orig_stores;
2754 /* True if there is a single orig stmt covering the whole split store. */
2755 bool orig;
2756 split_store (unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
2757 unsigned HOST_WIDE_INT);
2760 /* Simple constructor. */
2762 split_store::split_store (unsigned HOST_WIDE_INT bp,
2763 unsigned HOST_WIDE_INT sz,
2764 unsigned HOST_WIDE_INT al)
2765 : bytepos (bp), size (sz), align (al), orig (false)
2767 orig_stores.create (0);
2770 /* Record all stores in GROUP that write to the region starting at BITPOS
2771 and of size BITSIZE. Record infos for such statements in STORES if
2772 non-NULL. The stores in GROUP must be sorted by bitposition. Return INFO
2773 if there is exactly one original store in the range. */
2775 static store_immediate_info *
2776 find_constituent_stores (struct merged_store_group *group,
2777 vec<store_immediate_info *> *stores,
2778 unsigned int *first,
2779 unsigned HOST_WIDE_INT bitpos,
2780 unsigned HOST_WIDE_INT bitsize)
2782 store_immediate_info *info, *ret = NULL;
2783 unsigned int i;
2784 bool second = false;
2785 bool update_first = true;
2786 unsigned HOST_WIDE_INT end = bitpos + bitsize;
2787 for (i = *first; group->stores.iterate (i, &info); ++i)
2789 unsigned HOST_WIDE_INT stmt_start = info->bitpos;
2790 unsigned HOST_WIDE_INT stmt_end = stmt_start + info->bitsize;
2791 if (stmt_end <= bitpos)
2793 /* BITPOS passed to this function never decreases from within the
2794 same split_group call, so optimize and don't scan info records
2795 which are known to end before or at BITPOS next time.
2796 Only do it if all stores before this one also pass this. */
2797 if (update_first)
2798 *first = i + 1;
2799 continue;
2801 else
2802 update_first = false;
2804 /* The stores in GROUP are ordered by bitposition so if we're past
2805 the region for this group return early. */
2806 if (stmt_start >= end)
2807 return ret;
2809 if (stores)
2811 stores->safe_push (info);
2812 if (ret)
2814 ret = NULL;
2815 second = true;
2818 else if (ret)
2819 return NULL;
2820 if (!second)
2821 ret = info;
2823 return ret;
2826 /* Return how many SSA_NAMEs used to compute the value to store in the INFO
2827 store have multiple uses. If any SSA_NAME has multiple uses, also
2828 count statements needed to compute it. */
2830 static unsigned
2831 count_multiple_uses (store_immediate_info *info)
2833 gimple *stmt = info->stmt;
2834 unsigned ret = 0;
2835 switch (info->rhs_code)
2837 case INTEGER_CST:
2838 return 0;
2839 case BIT_AND_EXPR:
2840 case BIT_IOR_EXPR:
2841 case BIT_XOR_EXPR:
2842 if (info->bit_not_p)
2844 if (!has_single_use (gimple_assign_rhs1 (stmt)))
2845 ret = 1; /* Fall through below to return
2846 the BIT_NOT_EXPR stmt and then
2847 BIT_{AND,IOR,XOR}_EXPR and anything it
2848 uses. */
2849 else
2850 /* stmt is after this the BIT_NOT_EXPR. */
2851 stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
2853 if (!has_single_use (gimple_assign_rhs1 (stmt)))
2855 ret += 1 + info->ops[0].bit_not_p;
2856 if (info->ops[1].base_addr)
2857 ret += 1 + info->ops[1].bit_not_p;
2858 return ret + 1;
2860 stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
2861 /* stmt is now the BIT_*_EXPR. */
2862 if (!has_single_use (gimple_assign_rhs1 (stmt)))
2863 ret += 1 + info->ops[info->ops_swapped_p].bit_not_p;
2864 else if (info->ops[info->ops_swapped_p].bit_not_p)
2866 gimple *stmt2 = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
2867 if (!has_single_use (gimple_assign_rhs1 (stmt2)))
2868 ++ret;
2870 if (info->ops[1].base_addr == NULL_TREE)
2872 gcc_checking_assert (!info->ops_swapped_p);
2873 return ret;
2875 if (!has_single_use (gimple_assign_rhs2 (stmt)))
2876 ret += 1 + info->ops[1 - info->ops_swapped_p].bit_not_p;
2877 else if (info->ops[1 - info->ops_swapped_p].bit_not_p)
2879 gimple *stmt2 = SSA_NAME_DEF_STMT (gimple_assign_rhs2 (stmt));
2880 if (!has_single_use (gimple_assign_rhs1 (stmt2)))
2881 ++ret;
2883 return ret;
2884 case MEM_REF:
2885 if (!has_single_use (gimple_assign_rhs1 (stmt)))
2886 return 1 + info->ops[0].bit_not_p;
2887 else if (info->ops[0].bit_not_p)
2889 stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
2890 if (!has_single_use (gimple_assign_rhs1 (stmt)))
2891 return 1;
2893 return 0;
2894 default:
2895 gcc_unreachable ();
2899 /* Split a merged store described by GROUP by populating the SPLIT_STORES
2900 vector (if non-NULL) with split_store structs describing the byte offset
2901 (from the base), the bit size and alignment of each store as well as the
2902 original statements involved in each such split group.
2903 This is to separate the splitting strategy from the statement
2904 building/emission/linking done in output_merged_store.
2905 Return the number of new stores.
2906 If ALLOW_UNALIGNED_STORE is false, then all stores must be aligned.
2907 If ALLOW_UNALIGNED_LOAD is false, then all loads must be aligned.
2908 If SPLIT_STORES is NULL, it is just a dry run to count the number of
2909 new stores. */
2911 static unsigned int
2912 split_group (merged_store_group *group, bool allow_unaligned_store,
2913 bool allow_unaligned_load,
2914 vec<struct split_store *> *split_stores,
2915 unsigned *total_orig,
2916 unsigned *total_new)
2918 unsigned HOST_WIDE_INT pos = group->bitregion_start;
2919 unsigned HOST_WIDE_INT size = group->bitregion_end - pos;
2920 unsigned HOST_WIDE_INT bytepos = pos / BITS_PER_UNIT;
2921 unsigned HOST_WIDE_INT group_align = group->align;
2922 unsigned HOST_WIDE_INT align_base = group->align_base;
2923 unsigned HOST_WIDE_INT group_load_align = group_align;
2924 bool any_orig = false;
2926 gcc_assert ((size % BITS_PER_UNIT == 0) && (pos % BITS_PER_UNIT == 0));
2928 if (group->stores[0]->rhs_code == LROTATE_EXPR
2929 || group->stores[0]->rhs_code == NOP_EXPR)
2931 /* For the bswap framework using sets of stores, all the checking
2932 has been done earlier in try_coalesce_bswap and the group needs to be
2933 emitted as a single store. */
2934 if (total_orig)
2936 /* Avoid the old/new stmt count heuristics. It should
2937 always be beneficial. */
2938 total_new[0] = 1;
2939 total_orig[0] = 2;
2942 if (split_stores)
2944 unsigned HOST_WIDE_INT align_bitpos
2945 = (group->start - align_base) & (group_align - 1);
2946 unsigned HOST_WIDE_INT align = group_align;
2947 if (align_bitpos)
2948 align = least_bit_hwi (align_bitpos);
2949 bytepos = group->start / BITS_PER_UNIT;
2950 struct split_store *store
2951 = new split_store (bytepos, group->width, align);
2952 unsigned int first = 0;
2953 find_constituent_stores (group, &store->orig_stores,
2954 &first, group->start, group->width);
2955 split_stores->safe_push (store);
2958 return 1;
2961 unsigned int ret = 0, first = 0;
2962 unsigned HOST_WIDE_INT try_pos = bytepos;
2964 if (total_orig)
2966 unsigned int i;
2967 store_immediate_info *info = group->stores[0];
2969 total_new[0] = 0;
2970 total_orig[0] = 1; /* The orig store. */
2971 info = group->stores[0];
2972 if (info->ops[0].base_addr)
2973 total_orig[0]++;
2974 if (info->ops[1].base_addr)
2975 total_orig[0]++;
2976 switch (info->rhs_code)
2978 case BIT_AND_EXPR:
2979 case BIT_IOR_EXPR:
2980 case BIT_XOR_EXPR:
2981 total_orig[0]++; /* The orig BIT_*_EXPR stmt. */
2982 break;
2983 default:
2984 break;
2986 total_orig[0] *= group->stores.length ();
2988 FOR_EACH_VEC_ELT (group->stores, i, info)
2990 total_new[0] += count_multiple_uses (info);
2991 total_orig[0] += (info->bit_not_p
2992 + info->ops[0].bit_not_p
2993 + info->ops[1].bit_not_p);
2997 if (!allow_unaligned_load)
2998 for (int i = 0; i < 2; ++i)
2999 if (group->load_align[i])
3000 group_load_align = MIN (group_load_align, group->load_align[i]);
3002 while (size > 0)
3004 if ((allow_unaligned_store || group_align <= BITS_PER_UNIT)
3005 && group->mask[try_pos - bytepos] == (unsigned char) ~0U)
3007 /* Skip padding bytes. */
3008 ++try_pos;
3009 size -= BITS_PER_UNIT;
3010 continue;
3013 unsigned HOST_WIDE_INT try_bitpos = try_pos * BITS_PER_UNIT;
3014 unsigned int try_size = MAX_STORE_BITSIZE, nonmasked;
3015 unsigned HOST_WIDE_INT align_bitpos
3016 = (try_bitpos - align_base) & (group_align - 1);
3017 unsigned HOST_WIDE_INT align = group_align;
3018 if (align_bitpos)
3019 align = least_bit_hwi (align_bitpos);
3020 if (!allow_unaligned_store)
3021 try_size = MIN (try_size, align);
3022 if (!allow_unaligned_load)
3024 /* If we can't do or don't want to do unaligned stores
3025 as well as loads, we need to take the loads into account
3026 as well. */
3027 unsigned HOST_WIDE_INT load_align = group_load_align;
3028 align_bitpos = (try_bitpos - align_base) & (load_align - 1);
3029 if (align_bitpos)
3030 load_align = least_bit_hwi (align_bitpos);
3031 for (int i = 0; i < 2; ++i)
3032 if (group->load_align[i])
3034 align_bitpos
3035 = known_alignment (try_bitpos
3036 - group->stores[0]->bitpos
3037 + group->stores[0]->ops[i].bitpos
3038 - group->load_align_base[i]);
3039 if (align_bitpos & (group_load_align - 1))
3041 unsigned HOST_WIDE_INT a = least_bit_hwi (align_bitpos);
3042 load_align = MIN (load_align, a);
3045 try_size = MIN (try_size, load_align);
3047 store_immediate_info *info
3048 = find_constituent_stores (group, NULL, &first, try_bitpos, try_size);
3049 if (info)
3051 /* If there is just one original statement for the range, see if
3052 we can just reuse the original store, which could be even larger
3053 than try_size. */
3054 unsigned HOST_WIDE_INT stmt_end
3055 = ROUND_UP (info->bitpos + info->bitsize, BITS_PER_UNIT);
3056 info = find_constituent_stores (group, NULL, &first, try_bitpos,
3057 stmt_end - try_bitpos);
3058 if (info && info->bitpos >= try_bitpos)
3060 try_size = stmt_end - try_bitpos;
3061 goto found;
3065 /* Approximate store bitsize for the case when there are no padding
3066 bits. */
3067 while (try_size > size)
3068 try_size /= 2;
3069 /* Now look for whole padding bytes at the end of that bitsize. */
3070 for (nonmasked = try_size / BITS_PER_UNIT; nonmasked > 0; --nonmasked)
3071 if (group->mask[try_pos - bytepos + nonmasked - 1]
3072 != (unsigned char) ~0U)
3073 break;
3074 if (nonmasked == 0)
3076 /* If entire try_size range is padding, skip it. */
3077 try_pos += try_size / BITS_PER_UNIT;
3078 size -= try_size;
3079 continue;
3081 /* Otherwise try to decrease try_size if the second half, the last 3
3082 quarters, etc. are padding. */
3083 nonmasked *= BITS_PER_UNIT;
3084 while (nonmasked <= try_size / 2)
3085 try_size /= 2;
3086 if (!allow_unaligned_store && group_align > BITS_PER_UNIT)
3088 /* Now look for whole padding bytes at the start of that bitsize. */
3089 unsigned int try_bytesize = try_size / BITS_PER_UNIT, masked;
3090 for (masked = 0; masked < try_bytesize; ++masked)
3091 if (group->mask[try_pos - bytepos + masked] != (unsigned char) ~0U)
3092 break;
3093 masked *= BITS_PER_UNIT;
3094 gcc_assert (masked < try_size);
3095 if (masked >= try_size / 2)
3097 while (masked >= try_size / 2)
3099 try_size /= 2;
3100 try_pos += try_size / BITS_PER_UNIT;
3101 size -= try_size;
3102 masked -= try_size;
3104 /* Need to recompute the alignment, so just retry at the new
3105 position. */
3106 continue;
3110 found:
3111 ++ret;
3113 if (split_stores)
3115 struct split_store *store
3116 = new split_store (try_pos, try_size, align);
3117 info = find_constituent_stores (group, &store->orig_stores,
3118 &first, try_bitpos, try_size);
3119 if (info
3120 && info->bitpos >= try_bitpos
3121 && info->bitpos + info->bitsize <= try_bitpos + try_size)
3123 store->orig = true;
3124 any_orig = true;
3126 split_stores->safe_push (store);
3129 try_pos += try_size / BITS_PER_UNIT;
3130 size -= try_size;
3133 if (total_orig)
3135 unsigned int i;
3136 struct split_store *store;
3137 /* If we are reusing some original stores and any of the
3138 original SSA_NAMEs had multiple uses, we need to subtract
3139 those now before we add the new ones. */
3140 if (total_new[0] && any_orig)
3142 FOR_EACH_VEC_ELT (*split_stores, i, store)
3143 if (store->orig)
3144 total_new[0] -= count_multiple_uses (store->orig_stores[0]);
3146 total_new[0] += ret; /* The new store. */
3147 store_immediate_info *info = group->stores[0];
3148 if (info->ops[0].base_addr)
3149 total_new[0] += ret;
3150 if (info->ops[1].base_addr)
3151 total_new[0] += ret;
3152 switch (info->rhs_code)
3154 case BIT_AND_EXPR:
3155 case BIT_IOR_EXPR:
3156 case BIT_XOR_EXPR:
3157 total_new[0] += ret; /* The new BIT_*_EXPR stmt. */
3158 break;
3159 default:
3160 break;
3162 FOR_EACH_VEC_ELT (*split_stores, i, store)
3164 unsigned int j;
3165 bool bit_not_p[3] = { false, false, false };
3166 /* If all orig_stores have certain bit_not_p set, then
3167 we'd use a BIT_NOT_EXPR stmt and need to account for it.
3168 If some orig_stores have certain bit_not_p set, then
3169 we'd use a BIT_XOR_EXPR with a mask and need to account for
3170 it. */
3171 FOR_EACH_VEC_ELT (store->orig_stores, j, info)
3173 if (info->ops[0].bit_not_p)
3174 bit_not_p[0] = true;
3175 if (info->ops[1].bit_not_p)
3176 bit_not_p[1] = true;
3177 if (info->bit_not_p)
3178 bit_not_p[2] = true;
3180 total_new[0] += bit_not_p[0] + bit_not_p[1] + bit_not_p[2];
3185 return ret;
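/* Standalone sketch (hypothetical helper, not the loop above) of the core
   size choice made at each position: start from the widest power-of-two
   store, cap it by the byte alignment known at the position when unaligned
   stores are not allowed, and halve it until it fits in the remaining
   region.  All sizes are assumed to be powers of two.  */

static unsigned int
pick_store_bytes (unsigned int pos_align, unsigned int bytes_left,
		  unsigned int max_bytes, int allow_unaligned)
{
  unsigned int n = max_bytes;	/* E.g. MAX_STORE_BITSIZE / BITS_PER_UNIT.  */
  if (!allow_unaligned && n > pos_align)
    n = pos_align;
  while (n > bytes_left)
    n /= 2;
  return n;
}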
3188 /* Return the operation through which the operand IDX (if < 2) or
3189 result (IDX == 2) should be inverted. If NOP_EXPR, no inversion
3190 is done; if BIT_NOT_EXPR, all bits are inverted; if BIT_XOR_EXPR,
3191 the bits should be xored with MASK. */
3193 static enum tree_code
3194 invert_op (split_store *split_store, int idx, tree int_type, tree &mask)
3196 unsigned int i;
3197 store_immediate_info *info;
3198 unsigned int cnt = 0;
3199 FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
3201 bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : info->bit_not_p;
3202 if (bit_not_p)
3203 ++cnt;
3205 mask = NULL_TREE;
3206 if (cnt == 0)
3207 return NOP_EXPR;
3208 if (cnt == split_store->orig_stores.length ())
3209 return BIT_NOT_EXPR;
3211 unsigned HOST_WIDE_INT try_bitpos = split_store->bytepos * BITS_PER_UNIT;
3212 unsigned buf_size = split_store->size / BITS_PER_UNIT;
3213 unsigned char *buf
3214 = XALLOCAVEC (unsigned char, buf_size);
3215 memset (buf, ~0U, buf_size);
3216 FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
3218 bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : info->bit_not_p;
3219 if (!bit_not_p)
3220 continue;
3221 /* Clear regions with bit_not_p and invert afterwards, rather than
3222 clear regions with !bit_not_p, so that gaps in between stores aren't
3223 set in the mask. */
3224 unsigned HOST_WIDE_INT bitsize = info->bitsize;
3225 unsigned int pos_in_buffer = 0;
3226 if (info->bitpos < try_bitpos)
3228 gcc_assert (info->bitpos + bitsize > try_bitpos);
3229 bitsize -= (try_bitpos - info->bitpos);
3231 else
3232 pos_in_buffer = info->bitpos - try_bitpos;
3233 if (pos_in_buffer + bitsize > split_store->size)
3234 bitsize = split_store->size - pos_in_buffer;
3235 unsigned char *p = buf + (pos_in_buffer / BITS_PER_UNIT);
3236 if (BYTES_BIG_ENDIAN)
3237 clear_bit_region_be (p, (BITS_PER_UNIT - 1
3238 - (pos_in_buffer % BITS_PER_UNIT)), bitsize);
3239 else
3240 clear_bit_region (p, pos_in_buffer % BITS_PER_UNIT, bitsize);
3242 for (unsigned int i = 0; i < buf_size; ++i)
3243 buf[i] = ~buf[i];
3244 mask = native_interpret_expr (int_type, buf, buf_size);
3245 return BIT_XOR_EXPR;
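/* Illustrative input (hypothetical type, not part of this file) for the
   BIT_XOR_EXPR case above: only one of the two copied fields is
   complemented, so after merging the loads and stores the inversion is
   expected to be applied as a single XOR with a constant mask covering just
   the bits of field B.  */

struct pair2 { unsigned char a, b; };

void
copy_mixed (struct pair2 *t, const struct pair2 *s)
{
  t->a = s->a;
  t->b = (unsigned char) ~s->b;
}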
3248 /* Given a merged store group GROUP output the widened version of it.
3249 The store chain is against the base object BASE.
3250 Try store sizes of at most MAX_STORE_BITSIZE bits wide and don't output
3251 unaligned stores for STRICT_ALIGNMENT targets or if it's too expensive.
3252 Make sure that the number of statements output is less than the number of
3253 original statements. If a better sequence is possible emit it and
3254 return true. */
3256 bool
3257 imm_store_chain_info::output_merged_store (merged_store_group *group)
3259 unsigned HOST_WIDE_INT start_byte_pos
3260 = group->bitregion_start / BITS_PER_UNIT;
3262 unsigned int orig_num_stmts = group->stores.length ();
3263 if (orig_num_stmts < 2)
3264 return false;
3266 auto_vec<struct split_store *, 32> split_stores;
3267 split_stores.create (0);
3268 bool allow_unaligned_store
3269 = !STRICT_ALIGNMENT && PARAM_VALUE (PARAM_STORE_MERGING_ALLOW_UNALIGNED);
3270 bool allow_unaligned_load = allow_unaligned_store;
3271 if (allow_unaligned_store)
3273 /* If unaligned stores are allowed, see how many stores we'd emit
3274 for unaligned and how many stores we'd emit for aligned stores.
3275 Only use unaligned stores if it allows fewer stores than aligned. */
3276 unsigned aligned_cnt
3277 = split_group (group, false, allow_unaligned_load, NULL, NULL, NULL);
3278 unsigned unaligned_cnt
3279 = split_group (group, true, allow_unaligned_load, NULL, NULL, NULL);
3280 if (aligned_cnt <= unaligned_cnt)
3281 allow_unaligned_store = false;
3283 unsigned total_orig, total_new;
3284 split_group (group, allow_unaligned_store, allow_unaligned_load,
3285 &split_stores, &total_orig, &total_new);
3287 if (split_stores.length () >= orig_num_stmts)
3289 /* We didn't manage to reduce the number of statements. Bail out. */
3290 if (dump_file && (dump_flags & TDF_DETAILS))
3291 fprintf (dump_file, "Exceeded original number of stmts (%u)."
3292 " Not profitable to emit new sequence.\n",
3293 orig_num_stmts);
3294 return false;
3296 if (total_orig <= total_new)
3298 /* If number of estimated new statements is above estimated original
3299 statements, bail out too. */
3300 if (dump_file && (dump_flags & TDF_DETAILS))
3301 fprintf (dump_file, "Estimated number of original stmts (%u)"
3302 " not larger than estimated number of new"
3303 " stmts (%u).\n",
3304 total_orig, total_new);
3305 return false;
3308 gimple_stmt_iterator last_gsi = gsi_for_stmt (group->last_stmt);
3309 gimple_seq seq = NULL;
3310 tree last_vdef, new_vuse;
3311 last_vdef = gimple_vdef (group->last_stmt);
3312 new_vuse = gimple_vuse (group->last_stmt);
3313 tree bswap_res = NULL_TREE;
3315 if (group->stores[0]->rhs_code == LROTATE_EXPR
3316 || group->stores[0]->rhs_code == NOP_EXPR)
3318 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
3319 gimple *ins_stmt = group->stores[0]->ins_stmt;
3320 struct symbolic_number *n = &group->stores[0]->n;
3321 bool bswap = group->stores[0]->rhs_code == LROTATE_EXPR;
3323 switch (n->range)
3325 case 16:
3326 load_type = bswap_type = uint16_type_node;
3327 break;
3328 case 32:
3329 load_type = uint32_type_node;
3330 if (bswap)
3332 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
3333 bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
3335 break;
3336 case 64:
3337 load_type = uint64_type_node;
3338 if (bswap)
3340 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
3341 bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
3343 break;
3344 default:
3345 gcc_unreachable ();
3348 /* If each load has the vuse of the corresponding store,
3349 we've already checked the aliasing in try_coalesce_bswap and
3350 we want to sink the needed load into seq. So we need to use new_vuse
3351 on the load. */
3352 if (n->base_addr)
3354 if (n->vuse == NULL)
3356 n->vuse = new_vuse;
3357 ins_stmt = NULL;
3359 else
3360 /* Update vuse in case it has been changed by output_merged_stores. */
3361 n->vuse = gimple_vuse (ins_stmt);
3363 bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl,
3364 bswap_type, load_type, n, bswap);
3365 gcc_assert (bswap_res);
3368 gimple *stmt = NULL;
3369 split_store *split_store;
3370 unsigned int i;
3371 auto_vec<gimple *, 32> orig_stmts;
3372 gimple_seq this_seq;
3373 tree addr = force_gimple_operand_1 (unshare_expr (base_addr), &this_seq,
3374 is_gimple_mem_ref_addr, NULL_TREE);
3375 gimple_seq_add_seq_without_update (&seq, this_seq);
3377 tree load_addr[2] = { NULL_TREE, NULL_TREE };
3378 gimple_seq load_seq[2] = { NULL, NULL };
3379 gimple_stmt_iterator load_gsi[2] = { gsi_none (), gsi_none () };
3380 for (int j = 0; j < 2; ++j)
3382 store_operand_info &op = group->stores[0]->ops[j];
3383 if (op.base_addr == NULL_TREE)
3384 continue;
3386 store_immediate_info *infol = group->stores.last ();
3387 if (gimple_vuse (op.stmt) == gimple_vuse (infol->ops[j].stmt))
3389 /* We can't pick the location randomly; while we've verified
3390 all the loads have the same vuse, they can still be in different
3391 basic blocks and we need to pick the one from the last bb:
3392 int x = q[0];
3393 if (x == N) return;
3394 int y = q[1];
3395 p[0] = x;
3396 p[1] = y;
3397 otherwise if we put the wider load at the q[0] load, we might
3398 segfault if q[1] is not mapped. */
3399 basic_block bb = gimple_bb (op.stmt);
3400 gimple *ostmt = op.stmt;
3401 store_immediate_info *info;
3402 FOR_EACH_VEC_ELT (group->stores, i, info)
3404 gimple *tstmt = info->ops[j].stmt;
3405 basic_block tbb = gimple_bb (tstmt);
3406 if (dominated_by_p (CDI_DOMINATORS, tbb, bb))
3408 ostmt = tstmt;
3409 bb = tbb;
3412 load_gsi[j] = gsi_for_stmt (ostmt);
3413 load_addr[j]
3414 = force_gimple_operand_1 (unshare_expr (op.base_addr),
3415 &load_seq[j], is_gimple_mem_ref_addr,
3416 NULL_TREE);
3418 else if (operand_equal_p (base_addr, op.base_addr, 0))
3419 load_addr[j] = addr;
3420 else
3422 load_addr[j]
3423 = force_gimple_operand_1 (unshare_expr (op.base_addr),
3424 &this_seq, is_gimple_mem_ref_addr,
3425 NULL_TREE);
3426 gimple_seq_add_seq_without_update (&seq, this_seq);
3430 FOR_EACH_VEC_ELT (split_stores, i, split_store)
3432 unsigned HOST_WIDE_INT try_size = split_store->size;
3433 unsigned HOST_WIDE_INT try_pos = split_store->bytepos;
3434 unsigned HOST_WIDE_INT align = split_store->align;
3435 tree dest, src;
3436 location_t loc;
3437 if (split_store->orig)
3439 /* If there is just a single constituent store which covers
3440 the whole area, just reuse the lhs and rhs. */
3441 gimple *orig_stmt = split_store->orig_stores[0]->stmt;
3442 dest = gimple_assign_lhs (orig_stmt);
3443 src = gimple_assign_rhs1 (orig_stmt);
3444 loc = gimple_location (orig_stmt);
3446 else
3448 store_immediate_info *info;
3449 unsigned short clique, base;
3450 unsigned int k;
3451 FOR_EACH_VEC_ELT (split_store->orig_stores, k, info)
3452 orig_stmts.safe_push (info->stmt);
3453 tree offset_type
3454 = get_alias_type_for_stmts (orig_stmts, false, &clique, &base);
3455 loc = get_location_for_stmts (orig_stmts);
3456 orig_stmts.truncate (0);
3458 tree int_type = build_nonstandard_integer_type (try_size, UNSIGNED);
3459 int_type = build_aligned_type (int_type, align);
3460 dest = fold_build2 (MEM_REF, int_type, addr,
3461 build_int_cst (offset_type, try_pos));
3462 if (TREE_CODE (dest) == MEM_REF)
3464 MR_DEPENDENCE_CLIQUE (dest) = clique;
3465 MR_DEPENDENCE_BASE (dest) = base;
3468 tree mask = integer_zero_node;
3469 if (!bswap_res)
3470 mask = native_interpret_expr (int_type,
3471 group->mask + try_pos
3472 - start_byte_pos,
3473 group->buf_size);
3475 tree ops[2];
3476 for (int j = 0;
3477 j < 1 + (split_store->orig_stores[0]->ops[1].val != NULL_TREE);
3478 ++j)
3480 store_operand_info &op = split_store->orig_stores[0]->ops[j];
3481 if (bswap_res)
3482 ops[j] = bswap_res;
3483 else if (op.base_addr)
3485 FOR_EACH_VEC_ELT (split_store->orig_stores, k, info)
3486 orig_stmts.safe_push (info->ops[j].stmt);
3488 offset_type = get_alias_type_for_stmts (orig_stmts, true,
3489 &clique, &base);
3490 location_t load_loc = get_location_for_stmts (orig_stmts);
3491 orig_stmts.truncate (0);
3493 unsigned HOST_WIDE_INT load_align = group->load_align[j];
3494 unsigned HOST_WIDE_INT align_bitpos
3495 = known_alignment (try_pos * BITS_PER_UNIT
3496 - split_store->orig_stores[0]->bitpos
3497 + op.bitpos);
3498 if (align_bitpos & (load_align - 1))
3499 load_align = least_bit_hwi (align_bitpos);
3501 tree load_int_type
3502 = build_nonstandard_integer_type (try_size, UNSIGNED);
3503 load_int_type
3504 = build_aligned_type (load_int_type, load_align);
3506 poly_uint64 load_pos
3507 = exact_div (try_pos * BITS_PER_UNIT
3508 - split_store->orig_stores[0]->bitpos
3509 + op.bitpos,
3510 BITS_PER_UNIT);
3511 ops[j] = fold_build2 (MEM_REF, load_int_type, load_addr[j],
3512 build_int_cst (offset_type, load_pos));
3513 if (TREE_CODE (ops[j]) == MEM_REF)
3515 MR_DEPENDENCE_CLIQUE (ops[j]) = clique;
3516 MR_DEPENDENCE_BASE (ops[j]) = base;
3518 if (!integer_zerop (mask))
3519 /* The load might load some bits (that will be masked off
3520 later on) uninitialized; avoid -W*uninitialized
3521 warnings in that case. */
3522 TREE_NO_WARNING (ops[j]) = 1;
3524 stmt = gimple_build_assign (make_ssa_name (int_type),
3525 ops[j]);
3526 gimple_set_location (stmt, load_loc);
3527 if (gsi_bb (load_gsi[j]))
3529 gimple_set_vuse (stmt, gimple_vuse (op.stmt));
3530 gimple_seq_add_stmt_without_update (&load_seq[j], stmt);
3532 else
3534 gimple_set_vuse (stmt, new_vuse);
3535 gimple_seq_add_stmt_without_update (&seq, stmt);
3537 ops[j] = gimple_assign_lhs (stmt);
3538 tree xor_mask;
3539 enum tree_code inv_op
3540 = invert_op (split_store, j, int_type, xor_mask);
3541 if (inv_op != NOP_EXPR)
3543 stmt = gimple_build_assign (make_ssa_name (int_type),
3544 inv_op, ops[j], xor_mask);
3545 gimple_set_location (stmt, load_loc);
3546 ops[j] = gimple_assign_lhs (stmt);
3548 if (gsi_bb (load_gsi[j]))
3549 gimple_seq_add_stmt_without_update (&load_seq[j],
3550 stmt);
3551 else
3552 gimple_seq_add_stmt_without_update (&seq, stmt);
3555 else
3556 ops[j] = native_interpret_expr (int_type,
3557 group->val + try_pos
3558 - start_byte_pos,
3559 group->buf_size);
3562 switch (split_store->orig_stores[0]->rhs_code)
3564 case BIT_AND_EXPR:
3565 case BIT_IOR_EXPR:
3566 case BIT_XOR_EXPR:
3567 FOR_EACH_VEC_ELT (split_store->orig_stores, k, info)
3569 tree rhs1 = gimple_assign_rhs1 (info->stmt);
3570 orig_stmts.safe_push (SSA_NAME_DEF_STMT (rhs1));
3572 location_t bit_loc;
3573 bit_loc = get_location_for_stmts (orig_stmts);
3574 orig_stmts.truncate (0);
3576 stmt
3577 = gimple_build_assign (make_ssa_name (int_type),
3578 split_store->orig_stores[0]->rhs_code,
3579 ops[0], ops[1]);
3580 gimple_set_location (stmt, bit_loc);
3581 /* If there is just one load and there is a separate
3582 load_seq[0], emit the bitwise op right after it. */
3583 if (load_addr[1] == NULL_TREE && gsi_bb (load_gsi[0]))
3584 gimple_seq_add_stmt_without_update (&load_seq[0], stmt);
3585 /* Otherwise, if at least one load is in seq, we need to
3586 emit the bitwise op right before the store. If there
3587 are two loads and they are emitted somewhere else, it would
3588 be better to emit the bitwise op as early as possible;
3589 we don't track where that would be possible right now
3590 though. */
3591 else
3592 gimple_seq_add_stmt_without_update (&seq, stmt);
3593 src = gimple_assign_lhs (stmt);
3594 tree xor_mask;
3595 enum tree_code inv_op;
3596 inv_op = invert_op (split_store, 2, int_type, xor_mask);
3597 if (inv_op != NOP_EXPR)
3599 stmt = gimple_build_assign (make_ssa_name (int_type),
3600 inv_op, src, xor_mask);
3601 gimple_set_location (stmt, bit_loc);
3602 if (load_addr[1] == NULL_TREE && gsi_bb (load_gsi[0]))
3603 gimple_seq_add_stmt_without_update (&load_seq[0], stmt);
3604 else
3605 gimple_seq_add_stmt_without_update (&seq, stmt);
3606 src = gimple_assign_lhs (stmt);
3608 break;
3609 case LROTATE_EXPR:
3610 case NOP_EXPR:
3611 src = ops[0];
3612 if (!is_gimple_val (src))
3614 stmt = gimple_build_assign (make_ssa_name (TREE_TYPE (src)),
3615 src);
3616 gimple_seq_add_stmt_without_update (&seq, stmt);
3617 src = gimple_assign_lhs (stmt);
3619 if (!useless_type_conversion_p (int_type, TREE_TYPE (src)))
3621 stmt = gimple_build_assign (make_ssa_name (int_type),
3622 NOP_EXPR, src);
3623 gimple_seq_add_stmt_without_update (&seq, stmt);
3624 src = gimple_assign_lhs (stmt);
3626 break;
3627 default:
3628 src = ops[0];
3629 break;
3632 if (!integer_zerop (mask))
3634 tree tem = make_ssa_name (int_type);
3635 tree load_src = unshare_expr (dest);
3636 /* The load might load some or all bits uninitialized;
3637 avoid -W*uninitialized warnings in that case.
3638 As an optimization, if all the bits were provably
3639 uninitialized (no stores at all yet, or the previous
3640 store was a CLOBBER), it would be nice to optimize away the load
3641 and replace it e.g. with 0. */
3642 TREE_NO_WARNING (load_src) = 1;
3643 stmt = gimple_build_assign (tem, load_src);
3644 gimple_set_location (stmt, loc);
3645 gimple_set_vuse (stmt, new_vuse);
3646 gimple_seq_add_stmt_without_update (&seq, stmt);
3648 /* FIXME: If there is a single chunk of zero bits in mask,
3649 perhaps use BIT_INSERT_EXPR instead? */
3650 stmt = gimple_build_assign (make_ssa_name (int_type),
3651 BIT_AND_EXPR, tem, mask);
3652 gimple_set_location (stmt, loc);
3653 gimple_seq_add_stmt_without_update (&seq, stmt);
3654 tem = gimple_assign_lhs (stmt);
3656 if (TREE_CODE (src) == INTEGER_CST)
3657 src = wide_int_to_tree (int_type,
3658 wi::bit_and_not (wi::to_wide (src),
3659 wi::to_wide (mask)));
3660 else
3662 tree nmask
3663 = wide_int_to_tree (int_type,
3664 wi::bit_not (wi::to_wide (mask)));
3665 stmt = gimple_build_assign (make_ssa_name (int_type),
3666 BIT_AND_EXPR, src, nmask);
3667 gimple_set_location (stmt, loc);
3668 gimple_seq_add_stmt_without_update (&seq, stmt);
3669 src = gimple_assign_lhs (stmt);
3671 stmt = gimple_build_assign (make_ssa_name (int_type),
3672 BIT_IOR_EXPR, tem, src);
3673 gimple_set_location (stmt, loc);
3674 gimple_seq_add_stmt_without_update (&seq, stmt);
3675 src = gimple_assign_lhs (stmt);
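/* Illustrative sketch (values and SSA names are hypothetical, not taken
   from any particular testcase): with a 32-bit int_type and mask
   0xff000000, i.e. the top byte of the destination is not covered by
   the merged stores, the sequence built above is roughly
     tem_1 = MEM[(int_type *)dest];
     tem_2 = tem_1 & 0xff000000;
     src_3 = src & 0x00ffffff;
     src_4 = tem_2 | src_3;
   so the unwritten (masked) bits are preserved from memory and only the
   merged value ends up in the remaining bits.  */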
3679 stmt = gimple_build_assign (dest, src);
3680 gimple_set_location (stmt, loc);
3681 gimple_set_vuse (stmt, new_vuse);
3682 gimple_seq_add_stmt_without_update (&seq, stmt);
3684 tree new_vdef;
3685 if (i < split_stores.length () - 1)
3686 new_vdef = make_ssa_name (gimple_vop (cfun), stmt);
3687 else
3688 new_vdef = last_vdef;
3690 gimple_set_vdef (stmt, new_vdef);
3691 SSA_NAME_DEF_STMT (new_vdef) = stmt;
3692 new_vuse = new_vdef;
3695 FOR_EACH_VEC_ELT (split_stores, i, split_store)
3696 delete split_store;
3698 gcc_assert (seq);
3699 if (dump_file)
3701 fprintf (dump_file,
3702 "New sequence of %u stmts to replace old one of %u stmts\n",
3703 split_stores.length (), orig_num_stmts);
3704 if (dump_flags & TDF_DETAILS)
3705 print_gimple_seq (dump_file, seq, 0, TDF_VOPS | TDF_MEMSYMS);
3707 gsi_insert_seq_after (&last_gsi, seq, GSI_SAME_STMT);
3708 for (int j = 0; j < 2; ++j)
3709 if (load_seq[j])
3710 gsi_insert_seq_after (&load_gsi[j], load_seq[j], GSI_SAME_STMT);
3712 return true;
3715 /* Process the merged_store_group objects created in the coalescing phase.
3716 The stores are all against the base object BASE.
3717 Try to output the widened stores and delete the original statements if
3718 successful. Return true iff any changes were made. */
3720 bool
3721 imm_store_chain_info::output_merged_stores ()
3723 unsigned int i;
3724 merged_store_group *merged_store;
3725 bool ret = false;
3726 FOR_EACH_VEC_ELT (m_merged_store_groups, i, merged_store)
3728 if (output_merged_store (merged_store))
3730 unsigned int j;
3731 store_immediate_info *store;
3732 FOR_EACH_VEC_ELT (merged_store->stores, j, store)
3734 gimple *stmt = store->stmt;
3735 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
3736 gsi_remove (&gsi, true);
3737 if (stmt != merged_store->last_stmt)
3739 unlink_stmt_vdef (stmt);
3740 release_defs (stmt);
3743 ret = true;
3746 if (ret && dump_file)
3747 fprintf (dump_file, "Merging successful!\n");
3749 return ret;
3752 /* Coalesce the store_immediate_info objects recorded against the base object
3753 BASE in the first phase and output them.
3754 Delete the allocated structures.
3755 Return true if any changes were made. */
3757 bool
3758 imm_store_chain_info::terminate_and_process_chain ()
3760 /* Process store chain. */
3761 bool ret = false;
3762 if (m_store_info.length () > 1)
3764 ret = coalesce_immediate_stores ();
3765 if (ret)
3766 ret = output_merged_stores ();
3769 /* Delete all the entries we allocated ourselves. */
3770 store_immediate_info *info;
3771 unsigned int i;
3772 FOR_EACH_VEC_ELT (m_store_info, i, info)
3773 delete info;
3775 merged_store_group *merged_info;
3776 FOR_EACH_VEC_ELT (m_merged_store_groups, i, merged_info)
3777 delete merged_info;
3779 return ret;
3782 /* Return true iff LHS is a destination potentially interesting for
3783 store merging. In practice these are the codes that get_inner_reference
3784 can process. */
3786 static bool
3787 lhs_valid_for_store_merging_p (tree lhs)
3789 tree_code code = TREE_CODE (lhs);
3791 if (code == ARRAY_REF || code == ARRAY_RANGE_REF || code == MEM_REF
3792 || code == COMPONENT_REF || code == BIT_FIELD_REF)
3793 return true;
3795 return false;
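/* For example (purely illustrative), destinations such as
     s.f = ...;              <- COMPONENT_REF
     a[i] = ...;             <- ARRAY_REF
     MEM[(int *)p_1] = ...;  <- MEM_REF
   satisfy this predicate, whereas an assignment to a bare VAR_DECL or
   to an SSA name is not treated as a candidate store here.  */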
3798 /* Return true if the tree RHS is a constant we want to consider
3799 during store merging. In practice accept all codes that
3800 native_encode_expr accepts. */
3802 static bool
3803 rhs_valid_for_store_merging_p (tree rhs)
3805 return native_encode_expr (rhs, NULL,
3806 GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (rhs)))) != 0;
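/* Illustrative note: constants such as an INTEGER_CST or a REAL_CST of
   a scalar mode can be byte-serialized by native_encode_expr for the
   full mode size of their type and are therefore accepted, while e.g.
   a CONSTRUCTOR, which native_encode_expr cannot encode, is rejected.  */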
3809 /* If MEM is a memory reference usable for store merging (either as
3810 store destination or for loads), return the non-NULL base_addr
3811 and set *PBITSIZE, *PBITPOS, *PBITREGION_START and *PBITREGION_END.
3812 Otherwise return NULL; *PBITSIZE is still set even in that
3813 case. */
3815 static tree
3816 mem_valid_for_store_merging (tree mem, poly_uint64 *pbitsize,
3817 poly_uint64 *pbitpos,
3818 poly_uint64 *pbitregion_start,
3819 poly_uint64 *pbitregion_end)
3821 poly_int64 bitsize, bitpos;
3822 poly_uint64 bitregion_start = 0, bitregion_end = 0;
3823 machine_mode mode;
3824 int unsignedp = 0, reversep = 0, volatilep = 0;
3825 tree offset;
3826 tree base_addr = get_inner_reference (mem, &bitsize, &bitpos, &offset, &mode,
3827 &unsignedp, &reversep, &volatilep);
3828 *pbitsize = bitsize;
3829 if (known_eq (bitsize, 0))
3830 return NULL_TREE;
3832 if (TREE_CODE (mem) == COMPONENT_REF
3833 && DECL_BIT_FIELD_TYPE (TREE_OPERAND (mem, 1)))
3835 get_bit_range (&bitregion_start, &bitregion_end, mem, &bitpos, &offset);
3836 if (maybe_ne (bitregion_end, 0U))
3837 bitregion_end += 1;
3840 if (reversep)
3841 return NULL_TREE;
3843 /* We do not want to rewrite TARGET_MEM_REFs. */
3844 if (TREE_CODE (base_addr) == TARGET_MEM_REF)
3845 return NULL_TREE;
3846 /* In some cases get_inner_reference may return a
3847 MEM_REF [ptr + byteoffset]. For the purposes of this pass
3848 canonicalize the base_addr to MEM_REF [ptr] and take
3849 byteoffset into account in the bitpos. This occurs in
3850 PR 23684 and this way we can catch more chains. */
3851 else if (TREE_CODE (base_addr) == MEM_REF)
3853 poly_offset_int byte_off = mem_ref_offset (base_addr);
3854 poly_offset_int bit_off = byte_off << LOG2_BITS_PER_UNIT;
3855 bit_off += bitpos;
3856 if (known_ge (bit_off, 0) && bit_off.to_shwi (&bitpos))
3858 if (maybe_ne (bitregion_end, 0U))
3860 bit_off = byte_off << LOG2_BITS_PER_UNIT;
3861 bit_off += bitregion_start;
3862 if (bit_off.to_uhwi (&bitregion_start))
3864 bit_off = byte_off << LOG2_BITS_PER_UNIT;
3865 bit_off += bitregion_end;
3866 if (!bit_off.to_uhwi (&bitregion_end))
3867 bitregion_end = 0;
3869 else
3870 bitregion_end = 0;
3873 else
3874 return NULL_TREE;
3875 base_addr = TREE_OPERAND (base_addr, 0);
3877 /* get_inner_reference returns the base object, get at its
3878 address now. */
3879 else
3881 if (maybe_lt (bitpos, 0))
3882 return NULL_TREE;
3883 base_addr = build_fold_addr_expr (base_addr);
3886 if (known_eq (bitregion_end, 0U))
3888 bitregion_start = round_down_to_byte_boundary (bitpos);
3889 bitregion_end = round_up_to_byte_boundary (bitpos + bitsize);
3892 if (offset != NULL_TREE)
3894 /* If the access has a variable offset then a base decl has to be
3895 address-taken to be able to emit pointer-based stores to it.
3896 ??? We might be able to get away with re-using the original
3897 base up to the first variable part and then wrapping that inside
3898 a BIT_FIELD_REF. */
3899 tree base = get_base_address (base_addr);
3900 if (! base
3901 || (DECL_P (base) && ! TREE_ADDRESSABLE (base)))
3902 return NULL_TREE;
3904 base_addr = build2 (POINTER_PLUS_EXPR, TREE_TYPE (base_addr),
3905 base_addr, offset);
3908 *pbitsize = bitsize;
3909 *pbitpos = bitpos;
3910 *pbitregion_start = bitregion_start;
3911 *pbitregion_end = bitregion_end;
3912 return base_addr;
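/* Worked example (hypothetical SSA names): for a store destination of
   the form MEM[(int *)p_1 + 4B] with a 32-bit access,
   get_inner_reference returns the MEM_REF itself with bit position 0;
   the canonicalization above folds the 4-byte offset into the bit
   position, so this function returns p_1 as the base with *PBITPOS 32,
   *PBITSIZE 32 and the byte-aligned bit region [32, 64).  */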
3915 /* Return true if STMT is a load that can be used for store merging.
3916 In that case fill in *OP. BITSIZE, BITPOS, BITREGION_START and
3917 BITREGION_END are properties of the corresponding store. */
3919 static bool
3920 handled_load (gimple *stmt, store_operand_info *op,
3921 poly_uint64 bitsize, poly_uint64 bitpos,
3922 poly_uint64 bitregion_start, poly_uint64 bitregion_end)
3924 if (!is_gimple_assign (stmt))
3925 return false;
3926 if (gimple_assign_rhs_code (stmt) == BIT_NOT_EXPR)
3928 tree rhs1 = gimple_assign_rhs1 (stmt);
3929 if (TREE_CODE (rhs1) == SSA_NAME
3930 && handled_load (SSA_NAME_DEF_STMT (rhs1), op, bitsize, bitpos,
3931 bitregion_start, bitregion_end))
3933 /* Don't allow _1 = load; _2 = ~_1; _3 = ~_2; which should have
3934 been optimized earlier, but if allowed here, would confuse the
3935 multiple uses counting. */
3936 if (op->bit_not_p)
3937 return false;
3938 op->bit_not_p = !op->bit_not_p;
3939 return true;
3941 return false;
3943 if (gimple_vuse (stmt)
3944 && gimple_assign_load_p (stmt)
3945 && !stmt_can_throw_internal (stmt)
3946 && !gimple_has_volatile_ops (stmt))
3948 tree mem = gimple_assign_rhs1 (stmt);
3949 op->base_addr
3950 = mem_valid_for_store_merging (mem, &op->bitsize, &op->bitpos,
3951 &op->bitregion_start,
3952 &op->bitregion_end);
3953 if (op->base_addr != NULL_TREE
3954 && known_eq (op->bitsize, bitsize)
3955 && multiple_p (op->bitpos - bitpos, BITS_PER_UNIT)
3956 && known_ge (op->bitpos - op->bitregion_start,
3957 bitpos - bitregion_start)
3958 && known_ge (op->bitregion_end - op->bitpos,
3959 bitregion_end - bitpos))
3961 op->stmt = stmt;
3962 op->val = mem;
3963 op->bit_not_p = false;
3964 return true;
3967 return false;
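/* Sketch of a qualifying load (hypothetical SSA names):
     _1 = MEM[(char *)q_2 + 1B];
     MEM[(char *)p_3 + 1B] = _1;
   Here the definition of _1 is a non-throwing, non-volatile gimple load
   of the same bit size as the store, at a byte-aligned distance from
   it, and its bit region is contained in the store's bit region, so
   *OP is filled in with the load's base address and offsets and true
   is returned.  */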
3970 /* Record the store STMT for store merging optimization if it can be
3971 optimized. */
3973 void
3974 pass_store_merging::process_store (gimple *stmt)
3976 tree lhs = gimple_assign_lhs (stmt);
3977 tree rhs = gimple_assign_rhs1 (stmt);
3978 poly_uint64 bitsize, bitpos;
3979 poly_uint64 bitregion_start, bitregion_end;
3980 tree base_addr
3981 = mem_valid_for_store_merging (lhs, &bitsize, &bitpos,
3982 &bitregion_start, &bitregion_end);
3983 if (known_eq (bitsize, 0U))
3984 return;
3986 bool invalid = (base_addr == NULL_TREE
3987 || (maybe_gt (bitsize,
3988 (unsigned int) MAX_BITSIZE_MODE_ANY_INT)
3989 && (TREE_CODE (rhs) != INTEGER_CST)));
3990 enum tree_code rhs_code = ERROR_MARK;
3991 bool bit_not_p = false;
3992 struct symbolic_number n;
3993 gimple *ins_stmt = NULL;
3994 store_operand_info ops[2];
3995 if (invalid)
3997 else if (rhs_valid_for_store_merging_p (rhs))
3999 rhs_code = INTEGER_CST;
4000 ops[0].val = rhs;
4002 else if (TREE_CODE (rhs) != SSA_NAME)
4003 invalid = true;
4004 else
4006 gimple *def_stmt = SSA_NAME_DEF_STMT (rhs), *def_stmt1, *def_stmt2;
4007 if (!is_gimple_assign (def_stmt))
4008 invalid = true;
4009 else if (handled_load (def_stmt, &ops[0], bitsize, bitpos,
4010 bitregion_start, bitregion_end))
4011 rhs_code = MEM_REF;
4012 else if (gimple_assign_rhs_code (def_stmt) == BIT_NOT_EXPR)
4014 tree rhs1 = gimple_assign_rhs1 (def_stmt);
4015 if (TREE_CODE (rhs1) == SSA_NAME
4016 && is_gimple_assign (SSA_NAME_DEF_STMT (rhs1)))
4018 bit_not_p = true;
4019 def_stmt = SSA_NAME_DEF_STMT (rhs1);
4022 if (rhs_code == ERROR_MARK && !invalid)
4023 switch ((rhs_code = gimple_assign_rhs_code (def_stmt)))
4025 case BIT_AND_EXPR:
4026 case BIT_IOR_EXPR:
4027 case BIT_XOR_EXPR:
4028 tree rhs1, rhs2;
4029 rhs1 = gimple_assign_rhs1 (def_stmt);
4030 rhs2 = gimple_assign_rhs2 (def_stmt);
4031 invalid = true;
4032 if (TREE_CODE (rhs1) != SSA_NAME)
4033 break;
4034 def_stmt1 = SSA_NAME_DEF_STMT (rhs1);
4035 if (!is_gimple_assign (def_stmt1)
4036 || !handled_load (def_stmt1, &ops[0], bitsize, bitpos,
4037 bitregion_start, bitregion_end))
4038 break;
4039 if (rhs_valid_for_store_merging_p (rhs2))
4040 ops[1].val = rhs2;
4041 else if (TREE_CODE (rhs2) != SSA_NAME)
4042 break;
4043 else
4045 def_stmt2 = SSA_NAME_DEF_STMT (rhs2);
4046 if (!is_gimple_assign (def_stmt2))
4047 break;
4048 else if (!handled_load (def_stmt2, &ops[1], bitsize, bitpos,
4049 bitregion_start, bitregion_end))
4050 break;
4052 invalid = false;
4053 break;
4054 default:
4055 invalid = true;
4056 break;
4058 unsigned HOST_WIDE_INT const_bitsize;
4059 if (bitsize.is_constant (&const_bitsize)
4060 && multiple_p (const_bitsize, BITS_PER_UNIT)
4061 && multiple_p (bitpos, BITS_PER_UNIT)
4062 && const_bitsize <= 64
4063 && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN)
4065 ins_stmt = find_bswap_or_nop_1 (def_stmt, &n, 12);
4066 if (ins_stmt)
4068 uint64_t nn = n.n;
4069 for (unsigned HOST_WIDE_INT i = 0;
4070 i < const_bitsize;
4071 i += BITS_PER_UNIT, nn >>= BITS_PER_MARKER)
4072 if ((nn & MARKER_MASK) == 0
4073 || (nn & MARKER_MASK) == MARKER_BYTE_UNKNOWN)
4075 ins_stmt = NULL;
4076 break;
4078 if (ins_stmt)
4080 if (invalid)
4082 rhs_code = LROTATE_EXPR;
4083 ops[0].base_addr = NULL_TREE;
4084 ops[1].base_addr = NULL_TREE;
4086 invalid = false;
4092 unsigned HOST_WIDE_INT const_bitsize, const_bitpos;
4093 unsigned HOST_WIDE_INT const_bitregion_start, const_bitregion_end;
4094 if (invalid
4095 || !bitsize.is_constant (&const_bitsize)
4096 || !bitpos.is_constant (&const_bitpos)
4097 || !bitregion_start.is_constant (&const_bitregion_start)
4098 || !bitregion_end.is_constant (&const_bitregion_end))
4100 terminate_all_aliasing_chains (NULL, stmt);
4101 return;
4104 if (!ins_stmt)
4105 memset (&n, 0, sizeof (n));
4107 struct imm_store_chain_info **chain_info = NULL;
4108 if (base_addr)
4109 chain_info = m_stores.get (base_addr);
4111 store_immediate_info *info;
4112 if (chain_info)
4114 unsigned int ord = (*chain_info)->m_store_info.length ();
4115 info = new store_immediate_info (const_bitsize, const_bitpos,
4116 const_bitregion_start,
4117 const_bitregion_end,
4118 stmt, ord, rhs_code, n, ins_stmt,
4119 bit_not_p, ops[0], ops[1]);
4120 if (dump_file && (dump_flags & TDF_DETAILS))
4122 fprintf (dump_file, "Recording immediate store from stmt:\n");
4123 print_gimple_stmt (dump_file, stmt, 0);
4125 (*chain_info)->m_store_info.safe_push (info);
4126 terminate_all_aliasing_chains (chain_info, stmt);
4127 /* If we reach the limit of stores to merge in a chain, terminate and
4128 process the chain now. */
4129 if ((*chain_info)->m_store_info.length ()
4130 == (unsigned int) PARAM_VALUE (PARAM_MAX_STORES_TO_MERGE))
4132 if (dump_file && (dump_flags & TDF_DETAILS))
4133 fprintf (dump_file,
4134 "Reached maximum number of statements to merge:\n");
4135 terminate_and_release_chain (*chain_info);
4137 return;
4140 /* Does this store alias any existing chain? */
4141 terminate_all_aliasing_chains (NULL, stmt);
4142 /* Start a new chain. */
4143 struct imm_store_chain_info *new_chain
4144 = new imm_store_chain_info (m_stores_head, base_addr);
4145 info = new store_immediate_info (const_bitsize, const_bitpos,
4146 const_bitregion_start,
4147 const_bitregion_end,
4148 stmt, 0, rhs_code, n, ins_stmt,
4149 bit_not_p, ops[0], ops[1]);
4150 new_chain->m_store_info.safe_push (info);
4151 m_stores.put (base_addr, new_chain);
4152 if (dump_file && (dump_flags & TDF_DETAILS))
4154 fprintf (dump_file, "Starting new chain with statement:\n");
4155 print_gimple_stmt (dump_file, stmt, 0);
4156 fprintf (dump_file, "The base object is:\n");
4157 print_generic_expr (dump_file, base_addr);
4158 fprintf (dump_file, "\n");
4162 /* Entry point for the pass. Go over each basic block recording chains of
4163 immediate stores. Upon encountering a terminating statement (as defined
4164 by stmt_terminates_chain_p) process the recorded stores and emit the widened
4165 variants. */
4167 unsigned int
4168 pass_store_merging::execute (function *fun)
4170 basic_block bb;
4171 hash_set<gimple *> orig_stmts;
4173 calculate_dominance_info (CDI_DOMINATORS);
4175 FOR_EACH_BB_FN (bb, fun)
4177 gimple_stmt_iterator gsi;
4178 unsigned HOST_WIDE_INT num_statements = 0;
4179 /* Count the non-debug statements in the block; blocks with fewer
4180 than two such statements cannot contain anything to merge and
4181 are skipped below. */
4182 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
4184 if (is_gimple_debug (gsi_stmt (gsi)))
4185 continue;
4187 if (++num_statements >= 2)
4188 break;
4191 if (num_statements < 2)
4192 continue;
4194 if (dump_file && (dump_flags & TDF_DETAILS))
4195 fprintf (dump_file, "Processing basic block <%d>:\n", bb->index);
4197 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
4199 gimple *stmt = gsi_stmt (gsi);
4201 if (is_gimple_debug (stmt))
4202 continue;
4204 if (gimple_has_volatile_ops (stmt))
4206 /* Terminate all chains. */
4207 if (dump_file && (dump_flags & TDF_DETAILS))
4208 fprintf (dump_file, "Volatile access terminates "
4209 "all chains\n");
4210 terminate_and_process_all_chains ();
4211 continue;
4214 if (gimple_assign_single_p (stmt) && gimple_vdef (stmt)
4215 && !stmt_can_throw_internal (stmt)
4216 && lhs_valid_for_store_merging_p (gimple_assign_lhs (stmt)))
4217 process_store (stmt);
4218 else
4219 terminate_all_aliasing_chains (NULL, stmt);
4221 terminate_and_process_all_chains ();
4223 return 0;
4226 } // anon namespace
4228 /* Construct and return a store merging pass object. */
4230 gimple_opt_pass *
4231 make_pass_store_merging (gcc::context *ctxt)
4233 return new pass_store_merging (ctxt);
4236 #if CHECKING_P
4238 namespace selftest {
4240 /* Selftests for store merging helpers. */
4242 /* Assert that all elements of the byte arrays X and Y, both of length N,
4243 are equal. */
4245 static void
4246 verify_array_eq (unsigned char *x, unsigned char *y, unsigned int n)
4248 for (unsigned int i = 0; i < n; i++)
4250 if (x[i] != y[i])
4252 fprintf (stderr, "Arrays do not match. X:\n");
4253 dump_char_array (stderr, x, n);
4254 fprintf (stderr, "Y:\n");
4255 dump_char_array (stderr, y, n);
4257 ASSERT_EQ (x[i], y[i]);
4261 /* Test that shift_bytes_in_array carries bits across between
4262 bytes correctly. */
4264 static void
4265 verify_shift_bytes_in_array (void)
4267 /* byte 1 | byte 0
4268 00011111 | 11100000. */
4269 unsigned char orig[2] = { 0xe0, 0x1f };
4270 unsigned char in[2];
4271 memcpy (in, orig, sizeof orig);
4273 unsigned char expected[2] = { 0x80, 0x7f };
4274 shift_bytes_in_array (in, sizeof (in), 2);
4275 verify_array_eq (in, expected, sizeof (in));
4277 memcpy (in, orig, sizeof orig);
4278 memcpy (expected, orig, sizeof orig);
4279 /* Check that shifting by zero doesn't change anything. */
4280 shift_bytes_in_array (in, sizeof (in), 0);
4281 verify_array_eq (in, expected, sizeof (in));
4285 /* Test that shift_bytes_in_array_right carries bits across between
4286 bytes correctly. */
4288 static void
4289 verify_shift_bytes_in_array_right (void)
4291 /* byte 0 | byte 1
4292 00011111 | 11100000. */
4293 unsigned char orig[2] = { 0x1f, 0xe0};
4294 unsigned char in[2];
4295 memcpy (in, orig, sizeof orig);
4296 unsigned char expected[2] = { 0x07, 0xf8};
4297 shift_bytes_in_array_right (in, sizeof (in), 2);
4298 verify_array_eq (in, expected, sizeof (in));
4300 memcpy (in, orig, sizeof orig);
4301 memcpy (expected, orig, sizeof orig);
4302 /* Check that shifting by zero doesn't change anything. */
4303 shift_bytes_in_array_right (in, sizeof (in), 0);
4304 verify_array_eq (in, expected, sizeof (in));
4307 /* Test that clear_bit_region clears exactly the bits asked for and
4308 nothing more. */
4310 static void
4311 verify_clear_bit_region (void)
4313 /* Start with all bits set and test clearing various patterns in them. */
4314 unsigned char orig[3] = { 0xff, 0xff, 0xff};
4315 unsigned char in[3];
4316 unsigned char expected[3];
4317 memcpy (in, orig, sizeof in);
4319 /* Check zeroing out all the bits. */
4320 clear_bit_region (in, 0, 3 * BITS_PER_UNIT);
4321 expected[0] = expected[1] = expected[2] = 0;
4322 verify_array_eq (in, expected, sizeof in);
4324 memcpy (in, orig, sizeof in);
4325 /* Leave the first and last bits intact. */
4326 clear_bit_region (in, 1, 3 * BITS_PER_UNIT - 2);
4327 expected[0] = 0x1;
4328 expected[1] = 0;
4329 expected[2] = 0x80;
4330 verify_array_eq (in, expected, sizeof in);
4333 /* Test that clear_bit_region_be clears exactly the bits asked for and
4334 nothing more. */
4336 static void
4337 verify_clear_bit_region_be (void)
4339 /* Start with all bits set and test clearing various patterns in them. */
4340 unsigned char orig[3] = { 0xff, 0xff, 0xff};
4341 unsigned char in[3];
4342 unsigned char expected[3];
4343 memcpy (in, orig, sizeof in);
4345 /* Check zeroing out all the bits. */
4346 clear_bit_region_be (in, BITS_PER_UNIT - 1, 3 * BITS_PER_UNIT);
4347 expected[0] = expected[1] = expected[2] = 0;
4348 verify_array_eq (in, expected, sizeof in);
4350 memcpy (in, orig, sizeof in);
4351 /* Leave the first and last bits intact. */
4352 clear_bit_region_be (in, BITS_PER_UNIT - 2, 3 * BITS_PER_UNIT - 2);
4353 expected[0] = 0x80;
4354 expected[1] = 0;
4355 expected[2] = 0x1;
4356 verify_array_eq (in, expected, sizeof in);
4360 /* Run all of the selftests within this file. */
4362 void
4363 store_merging_c_tests (void)
4365 verify_shift_bytes_in_array ();
4366 verify_shift_bytes_in_array_right ();
4367 verify_clear_bit_region ();
4368 verify_clear_bit_region_be ();
4371 } // namespace selftest
4372 #endif /* CHECKING_P. */