gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2014 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
  45    hy the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
  63    and walk it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "tm.h"
  91 #include "flags.h"
  92 #include "tree.h"
  93 #include "predict.h"
  94 #include "vec.h"
  95 #include "hashtab.h"
  96 #include "hash-set.h"
  97 #include "machmode.h"
  98 #include "hard-reg-set.h"
  99 #include "input.h"
 100 #include "function.h"
 101 #include "dominance.h"
 102 #include "cfg.h"
 103 #include "basic-block.h"
 104 #include "tree-ssa-alias.h"
 105 #include "internal-fn.h"
 106 #include "gimple-fold.h"
 107 #include "gimple-expr.h"
 108 #include "is-a.h"
 109 #include "gimple.h"
 110 #include "gimple-iterator.h"
 111 #include "gimplify.h"
 112 #include "gimplify-me.h"
 113 #include "stor-layout.h"
 114 #include "gimple-ssa.h"
 115 #include "tree-cfg.h"
 116 #include "tree-phinodes.h"
 117 #include "ssa-iterators.h"
 118 #include "stringpool.h"
 119 #include "tree-ssanames.h"
 120 #include "expr.h"
 121 #include "tree-dfa.h"
 122 #include "tree-ssa.h"
 123 #include "tree-pass.h"
 124 #include "alloc-pool.h"
 125 #include "target.h"
 126 #include "gimple-pretty-print.h"
 127 #include "builtins.h"
 128
 129 /* FIXME: RTL headers have to be included here for optabs.  */
 130 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 131 #include "expr.h"               /* Because optabs.h wants sepops.  */
 132 #include "insn-codes.h"
 133 #include "optabs.h"
 134
 135 /* This structure represents one basic block that either computes a
 136    division, or is a common dominator for basic block that compute a
 137    division.  */
 138 struct occurrence {
 139   /* The basic block represented by this structure.  */
 140   basic_block bb;
 141
 142   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 143      inserted in BB.  */
 144   tree recip_def;
 145
 146   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 147      was inserted in BB.  */
 148   gimple recip_def_stmt;
 149
 150   /* Pointer to a list of "struct occurrence"s for blocks dominated
 151      by BB.  */
 152   struct occurrence *children;
 153
 154   /* Pointer to the next "struct occurrence"s in the list of blocks
 155      sharing a common dominator.  */
 156   struct occurrence *next;
 157
 158   /* The number of divisions that are in BB before compute_merit.  The
 159      number of divisions that are in BB or post-dominate it after
 160      compute_merit.  */
 161   int num_divisions;
 162
 163   /* True if the basic block has a division, false if it is a common
 164      dominator for basic blocks that do.  If it is false and trapping
 165      math is active, BB is not a candidate for inserting a reciprocal.  */
 166   bool bb_has_division;
 167 };
 168
 169 static struct
 170 {
 171   /* Number of 1.0/X ops inserted.  */
 172   int rdivs_inserted;
 173
 174   /* Number of 1.0/FUNC ops inserted.  */
 175   int rfuncs_inserted;
 176 } reciprocal_stats;
 177
 178 static struct
 179 {
 180   /* Number of cexpi calls inserted.  */
 181   int inserted;
 182 } sincos_stats;
 183
 184 static struct
 185 {
 186   /* Number of hand-written 16-bit nop / bswaps found.  */
 187   int found_16bit;
 188
 189   /* Number of hand-written 32-bit nop / bswaps found.  */
 190   int found_32bit;
 191
 192   /* Number of hand-written 64-bit nop / bswaps found.  */
 193   int found_64bit;
 194 } nop_stats, bswap_stats;
 195
 196 static struct
 197 {
 198   /* Number of widening multiplication ops inserted.  */
 199   int widen_mults_inserted;
 200
 201   /* Number of integer multiply-and-accumulate ops inserted.  */
 202   int maccs_inserted;
 203
 204   /* Number of fp fused multiply-add ops inserted.  */
 205   int fmas_inserted;
 206 } widen_mul_stats;
 207
 208 /* The instance of "struct occurrence" representing the highest
 209    interesting block in the dominator tree.  */
 210 static struct occurrence *occ_head;
 211
 212 /* Allocation pool for getting instances of "struct occurrence".  */
 213 static alloc_pool occ_pool;
 214
 215
 216
 217 /* Allocate and return a new struct occurrence for basic block BB, and
 218    whose children list is headed by CHILDREN.  */
 219 static struct occurrence *
 220 occ_new (basic_block bb, struct occurrence *children)
 221 {
 222   struct occurrence *occ;
 223
 224   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 225   memset (occ, 0, sizeof (struct occurrence));
 226
 227   occ->bb = bb;
 228   occ->children = children;
 229   return occ;
 230 }
 231
 232
 233 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 234    list of "struct occurrence"s, one per basic block, having IDOM as
 235    their common dominator.
 236
 237    We try to insert NEW_OCC as deep as possible in the tree, and we also
 238    insert any other block that is a common dominator for BB and one
 239    block already in the tree.  */
 240
 241 static void
 242 insert_bb (struct occurrence *new_occ, basic_block idom,
 243            struct occurrence **p_head)
 244 {
 245   struct occurrence *occ, **p_occ;
 246
 247   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 248     {
 249       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 250       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 251       if (dom == bb)
 252         {
 253           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 254              from its list.  */
 255           *p_occ = occ->next;
 256           occ->next = new_occ->children;
 257           new_occ->children = occ;
 258
 259           /* Try the next block (it may as well be dominated by BB).  */
 260         }
 261
 262       else if (dom == occ_bb)
 263         {
 264           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 265           insert_bb (new_occ, dom, &occ->children);
 266           return;
 267         }
 268
 269       else if (dom != idom)
 270         {
 271           gcc_assert (!dom->aux);
 272
 273           /* There is a dominator between IDOM and BB, add it and make
 274              two children out of NEW_OCC and OCC.  First, remove OCC from
 275              its list.  */
 276           *p_occ = occ->next;
 277           new_occ->next = occ;
 278           occ->next = NULL;
 279
 280           /* None of the previous blocks has DOM as a dominator: if we tail
 281              recursed, we would reexamine them uselessly. Just switch BB with
 282              DOM, and go on looking for blocks dominated by DOM.  */
 283           new_occ = occ_new (dom, new_occ);
 284         }
 285
 286       else
 287         {
 288           /* Nothing special, go on with the next element.  */
 289           p_occ = &occ->next;
 290         }
 291     }
 292
 293   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 294   new_occ->next = *p_head;
 295   *p_head = new_occ;
 296 }
 297
 298 /* Register that we found a division in BB.  */
 299
 300 static inline void
 301 register_division_in (basic_block bb)
 302 {
 303   struct occurrence *occ;
 304
 305   occ = (struct occurrence *) bb->aux;
 306   if (!occ)
 307     {
 308       occ = occ_new (bb, NULL);
 309       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
 310     }
 311
 312   occ->bb_has_division = true;
 313   occ->num_divisions++;
 314 }
 315
 316
 317 /* Compute the number of divisions that postdominate each block in OCC and
 318    its children.  */
 319
 320 static void
 321 compute_merit (struct occurrence *occ)
 322 {
 323   struct occurrence *occ_child;
 324   basic_block dom = occ->bb;
 325
 326   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 327     {
 328       basic_block bb;
 329       if (occ_child->children)
 330         compute_merit (occ_child);
 331
 332       if (flag_exceptions)
 333         bb = single_noncomplex_succ (dom);
 334       else
 335         bb = dom;
 336
 337       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 338         occ->num_divisions += occ_child->num_divisions;
 339     }
 340 }
 341
 342
 343 /* Return whether USE_STMT is a floating-point division by DEF.  */
 344 static inline bool
 345 is_division_by (gimple use_stmt, tree def)
 346 {
 347   return is_gimple_assign (use_stmt)
 348          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 349          && gimple_assign_rhs2 (use_stmt) == def
 350          /* Do not recognize x / x as valid division, as we are getting
 351             confused later by replacing all immediate uses x in such
 352             a stmt.  */
 353          && gimple_assign_rhs1 (use_stmt) != def;
 354 }
 355
 356 /* Walk the subset of the dominator tree rooted at OCC, setting the
 357    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 358    the given basic block.  The field may be left NULL, of course,
 359    if it is not possible or profitable to do the optimization.
 360
 361    DEF_BSI is an iterator pointing at the statement defining DEF.
 362    If RECIP_DEF is set, a dominator already has a computation that can
 363    be used.  */
 364
 365 static void
 366 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 367                     tree def, tree recip_def, int threshold)
 368 {
 369   tree type;
 370   gimple new_stmt;
 371   gimple_stmt_iterator gsi;
 372   struct occurrence *occ_child;
 373
 374   if (!recip_def
 375       && (occ->bb_has_division || !flag_trapping_math)
 376       && occ->num_divisions >= threshold)
 377     {
 378       /* Make a variable with the replacement and substitute it.  */
 379       type = TREE_TYPE (def);
 380       recip_def = create_tmp_reg (type, "reciptmp");
 381       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 382                                                build_one_cst (type), def);
 383
 384       if (occ->bb_has_division)
 385         {
 386           /* Case 1: insert before an existing division.  */
 387           gsi = gsi_after_labels (occ->bb);
 388           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 389             gsi_next (&gsi);
 390
 391           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 392         }
 393       else if (def_gsi && occ->bb == def_gsi->bb)
 394         {
 395           /* Case 2: insert right after the definition.  Note that this will
 396              never happen if the definition statement can throw, because in
 397              that case the sole successor of the statement's basic block will
 398              dominate all the uses as well.  */
 399           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 400         }
 401       else
 402         {
 403           /* Case 3: insert in a basic block not containing defs/uses.  */
 404           gsi = gsi_after_labels (occ->bb);
 405           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 406         }
 407
 408       reciprocal_stats.rdivs_inserted++;
 409
 410       occ->recip_def_stmt = new_stmt;
 411     }
 412
 413   occ->recip_def = recip_def;
 414   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 415     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 416 }
 417
 418
 419 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 420    possible.  */
 421
 422 static inline void
 423 replace_reciprocal (use_operand_p use_p)
 424 {
 425   gimple use_stmt = USE_STMT (use_p);
 426   basic_block bb = gimple_bb (use_stmt);
 427   struct occurrence *occ = (struct occurrence *) bb->aux;
 428
 429   if (optimize_bb_for_speed_p (bb)
 430       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 431     {
 432       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 433       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 434       SET_USE (use_p, occ->recip_def);
 435       fold_stmt_inplace (&gsi);
 436       update_stmt (use_stmt);
 437     }
 438 }
 439
 440
 441 /* Free OCC and return one more "struct occurrence" to be freed.  */
 442
 443 static struct occurrence *
 444 free_bb (struct occurrence *occ)
 445 {
 446   struct occurrence *child, *next;
 447
 448   /* First get the two pointers hanging off OCC.  */
 449   next = occ->next;
 450   child = occ->children;
 451   occ->bb->aux = NULL;
 452   pool_free (occ_pool, occ);
 453
 454   /* Now ensure that we don't recurse unless it is necessary.  */
 455   if (!child)
 456     return next;
 457   else
 458     {
 459       while (next)
 460         next = free_bb (next);
 461
 462       return child;
 463     }
 464 }
 465
 466
 467 /* Look for floating-point divisions among DEF's uses, and try to
 468    replace them by multiplications with the reciprocal.  Add
 469    as many statements computing the reciprocal as needed.
 470
 471    DEF must be a GIMPLE register of a floating-point type.  */
 472
 473 static void
 474 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 475 {
 476   use_operand_p use_p;
 477   imm_use_iterator use_iter;
 478   struct occurrence *occ;
 479   int count = 0, threshold;
 480
 481   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 482
 483   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 484     {
 485       gimple use_stmt = USE_STMT (use_p);
 486       if (is_division_by (use_stmt, def))
 487         {
 488           register_division_in (gimple_bb (use_stmt));
 489           count++;
 490         }
 491     }
 492
 493   /* Do the expensive part only if we can hope to optimize something.  */
 494   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 495   if (count >= threshold)
 496     {
 497       gimple use_stmt;
 498       for (occ = occ_head; occ; occ = occ->next)
 499         {
 500           compute_merit (occ);
 501           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 502         }
 503
 504       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 505         {
 506           if (is_division_by (use_stmt, def))
 507             {
 508               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 509                 replace_reciprocal (use_p);
 510             }
 511         }
 512     }
 513
 514   for (occ = occ_head; occ; )
 515     occ = free_bb (occ);
 516
 517   occ_head = NULL;
 518 }
 519
 520 /* Go through all the floating-point SSA_NAMEs, and call
 521    execute_cse_reciprocals_1 on each of them.  */
 522 namespace {
 523
 524 const pass_data pass_data_cse_reciprocals =
 525 {
 526   GIMPLE_PASS, /* type */
 527   "recip", /* name */
 528   OPTGROUP_NONE, /* optinfo_flags */
 529   TV_NONE, /* tv_id */
 530   PROP_ssa, /* properties_required */
 531   0, /* properties_provided */
 532   0, /* properties_destroyed */
 533   0, /* todo_flags_start */
 534   TODO_update_ssa, /* todo_flags_finish */
 535 };
 536
 537 class pass_cse_reciprocals : public gimple_opt_pass
 538 {
 539 public:
 540   pass_cse_reciprocals (gcc::context *ctxt)
 541     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 542   {}
 543
 544   /* opt_pass methods: */
 545   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
 546   virtual unsigned int execute (function *);
 547
 548 }; // class pass_cse_reciprocals
 549
 550 unsigned int
 551 pass_cse_reciprocals::execute (function *fun)
 552 {
 553   basic_block bb;
 554   tree arg;
 555
 556   occ_pool = create_alloc_pool ("dominators for recip",
 557                                 sizeof (struct occurrence),
 558                                 n_basic_blocks_for_fn (fun) / 3 + 1);
 559
 560   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 561   calculate_dominance_info (CDI_DOMINATORS);
 562   calculate_dominance_info (CDI_POST_DOMINATORS);
 563
 564 #ifdef ENABLE_CHECKING
 565   FOR_EACH_BB_FN (bb, fun)
 566     gcc_assert (!bb->aux);
 567 #endif
 568
 569   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
 570     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 571         && is_gimple_reg (arg))
 572       {
 573         tree name = ssa_default_def (fun, arg);
 574         if (name)
 575           execute_cse_reciprocals_1 (NULL, name);
 576       }
 577
 578   FOR_EACH_BB_FN (bb, fun)
 579     {
 580       gimple_stmt_iterator gsi;
 581       gimple phi;
 582       tree def;
 583
 584       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 585         {
 586           phi = gsi_stmt (gsi);
 587           def = PHI_RESULT (phi);
 588           if (! virtual_operand_p (def)
 589               && FLOAT_TYPE_P (TREE_TYPE (def)))
 590             execute_cse_reciprocals_1 (NULL, def);
 591         }
 592
 593       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 594         {
 595           gimple stmt = gsi_stmt (gsi);
 596
 597           if (gimple_has_lhs (stmt)
 598               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 599               && FLOAT_TYPE_P (TREE_TYPE (def))
 600               && TREE_CODE (def) == SSA_NAME)
 601             execute_cse_reciprocals_1 (&gsi, def);
 602         }
 603
 604       if (optimize_bb_for_size_p (bb))
 605         continue;
 606
 607       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 608       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 609         {
 610           gimple stmt = gsi_stmt (gsi);
 611           tree fndecl;
 612
 613           if (is_gimple_assign (stmt)
 614               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 615             {
 616               tree arg1 = gimple_assign_rhs2 (stmt);
 617               gimple stmt1;
 618
 619               if (TREE_CODE (arg1) != SSA_NAME)
 620                 continue;
 621
 622               stmt1 = SSA_NAME_DEF_STMT (arg1);
 623
 624               if (is_gimple_call (stmt1)
 625                   && gimple_call_lhs (stmt1)
 626                   && (fndecl = gimple_call_fndecl (stmt1))
 627                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 628                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 629                 {
 630                   enum built_in_function code;
 631                   bool md_code, fail;
 632                   imm_use_iterator ui;
 633                   use_operand_p use_p;
 634
 635                   code = DECL_FUNCTION_CODE (fndecl);
 636                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 637
 638                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 639                   if (!fndecl)
 640                     continue;
 641
 642                   /* Check that all uses of the SSA name are divisions,
 643                      otherwise replacing the defining statement will do
 644                      the wrong thing.  */
 645                   fail = false;
 646                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 647                     {
 648                       gimple stmt2 = USE_STMT (use_p);
 649                       if (is_gimple_debug (stmt2))
 650                         continue;
 651                       if (!is_gimple_assign (stmt2)
 652                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 653                           || gimple_assign_rhs1 (stmt2) == arg1
 654                           || gimple_assign_rhs2 (stmt2) != arg1)
 655                         {
 656                           fail = true;
 657                           break;
 658                         }
 659                     }
 660                   if (fail)
 661                     continue;
 662
 663                   gimple_replace_ssa_lhs (stmt1, arg1);
 664                   gimple_call_set_fndecl (stmt1, fndecl);
 665                   update_stmt (stmt1);
 666                   reciprocal_stats.rfuncs_inserted++;
 667
 668                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 669                     {
 670                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 671                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 672                       fold_stmt_inplace (&gsi);
 673                       update_stmt (stmt);
 674                     }
 675                 }
 676             }
 677         }
 678     }
 679
 680   statistics_counter_event (fun, "reciprocal divs inserted",
 681                             reciprocal_stats.rdivs_inserted);
 682   statistics_counter_event (fun, "reciprocal functions inserted",
 683                             reciprocal_stats.rfuncs_inserted);
 684
 685   free_dominance_info (CDI_DOMINATORS);
 686   free_dominance_info (CDI_POST_DOMINATORS);
 687   free_alloc_pool (occ_pool);
 688   return 0;
 689 }
 690
 691 } // anon namespace
 692
 693 gimple_opt_pass *
 694 make_pass_cse_reciprocals (gcc::context *ctxt)
 695 {
 696   return new pass_cse_reciprocals (ctxt);
 697 }
 698
 699 /* Records an occurrence at statement USE_STMT in the vector of trees
 700    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 701    is not yet initialized.  Returns true if the occurrence was pushed on
 702    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 703    statements in the vector.  */
 704
 705 static bool
 706 maybe_record_sincos (vec<gimple> *stmts,
 707                      basic_block *top_bb, gimple use_stmt)
 708 {
 709   basic_block use_bb = gimple_bb (use_stmt);
 710   if (*top_bb
 711       && (*top_bb == use_bb
 712           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 713     stmts->safe_push (use_stmt);
 714   else if (!*top_bb
 715            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 716     {
 717       stmts->safe_push (use_stmt);
 718       *top_bb = use_bb;
 719     }
 720   else
 721     return false;
 722
 723   return true;
 724 }
 725
 726 /* Look for sin, cos and cexpi calls with the same argument NAME and
 727    create a single call to cexpi CSEing the result in this case.
 728    We first walk over all immediate uses of the argument collecting
 729    statements that we can CSE in a vector and in a second pass replace
 730    the statement rhs with a REALPART or IMAGPART expression on the
 731    result of the cexpi call we insert before the use statement that
 732    dominates all other candidates.  */
 733
 734 static bool
 735 execute_cse_sincos_1 (tree name)
 736 {
 737   gimple_stmt_iterator gsi;
 738   imm_use_iterator use_iter;
 739   tree fndecl, res, type;
 740   gimple def_stmt, use_stmt, stmt;
 741   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 742   vec<gimple> stmts = vNULL;
 743   basic_block top_bb = NULL;
 744   int i;
 745   bool cfg_changed = false;
 746
 747   type = TREE_TYPE (name);
 748   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 749     {
 750       if (gimple_code (use_stmt) != GIMPLE_CALL
 751           || !gimple_call_lhs (use_stmt)
 752           || !(fndecl = gimple_call_fndecl (use_stmt))
 753           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 754         continue;
 755
 756       switch (DECL_FUNCTION_CODE (fndecl))
 757         {
 758         CASE_FLT_FN (BUILT_IN_COS):
 759           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 760           break;
 761
 762         CASE_FLT_FN (BUILT_IN_SIN):
 763           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 764           break;
 765
 766         CASE_FLT_FN (BUILT_IN_CEXPI):
 767           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 768           break;
 769
 770         default:;
 771         }
 772     }
 773
 774   if (seen_cos + seen_sin + seen_cexpi <= 1)
 775     {
 776       stmts.release ();
 777       return false;
 778     }
 779
 780   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 781      the name def statement.  */
 782   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 783   if (!fndecl)
 784     return false;
 785   stmt = gimple_build_call (fndecl, 1, name);
 786   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 787   gimple_call_set_lhs (stmt, res);
 788
 789   def_stmt = SSA_NAME_DEF_STMT (name);
 790   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 791       && gimple_code (def_stmt) != GIMPLE_PHI
 792       && gimple_bb (def_stmt) == top_bb)
 793     {
 794       gsi = gsi_for_stmt (def_stmt);
 795       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 796     }
 797   else
 798     {
 799       gsi = gsi_after_labels (top_bb);
 800       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 801     }
 802   sincos_stats.inserted++;
 803
 804   /* And adjust the recorded old call sites.  */
 805   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 806     {
 807       tree rhs = NULL;
 808       fndecl = gimple_call_fndecl (use_stmt);
 809
 810       switch (DECL_FUNCTION_CODE (fndecl))
 811         {
 812         CASE_FLT_FN (BUILT_IN_COS):
 813           rhs = fold_build1 (REALPART_EXPR, type, res);
 814           break;
 815
 816         CASE_FLT_FN (BUILT_IN_SIN):
 817           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 818           break;
 819
 820         CASE_FLT_FN (BUILT_IN_CEXPI):
 821           rhs = res;
 822           break;
 823
 824         default:;
 825           gcc_unreachable ();
 826         }
 827
 828         /* Replace call with a copy.  */
 829         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 830
 831         gsi = gsi_for_stmt (use_stmt);
 832         gsi_replace (&gsi, stmt, true);
 833         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 834           cfg_changed = true;
 835     }
 836
 837   stmts.release ();
 838
 839   return cfg_changed;
 840 }
 841
 842 /* To evaluate powi(x,n), the floating point value x raised to the
 843    constant integer exponent n, we use a hybrid algorithm that
 844    combines the "window method" with look-up tables.  For an
 845    introduction to exponentiation algorithms and "addition chains",
 846    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 847    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 848    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 849    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 850
 851 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 852    multiplications to inline before calling the system library's pow
 853    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 854    so this default never requires calling pow, powf or powl.  */
 855
 856 #ifndef POWI_MAX_MULTS
 857 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 858 #endif
 859
 860 /* The size of the "optimal power tree" lookup table.  All
 861    exponents less than this value are simply looked up in the
 862    powi_table below.  This threshold is also used to size the
 863    cache of pseudo registers that hold intermediate results.  */
 864 #define POWI_TABLE_SIZE 256
 865
 866 /* The size, in bits of the window, used in the "window method"
 867    exponentiation algorithm.  This is equivalent to a radix of
 868    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 869 #define POWI_WINDOW_SIZE 3
 870
 871 /* The following table is an efficient representation of an
 872    "optimal power tree".  For each value, i, the corresponding
 873    value, j, in the table states than an optimal evaluation
 874    sequence for calculating pow(x,i) can be found by evaluating
 875    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 876    100 integers is given in Knuth's "Seminumerical algorithms".  */
 877
 878 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 879   {
 880       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 881       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 882       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 883      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 884      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 885      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 886      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 887      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 888      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 889      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 890      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 891      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 892      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 893      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 894      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 895      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 896      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 897      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 898      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 899      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 900      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 901      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 902      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 903      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 904      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 905     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 906     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 907     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 908     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 909     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 910     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 911     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 912   };
 913
 914
 915 /* Return the number of multiplications required to calculate
 916    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 917    subroutine of powi_cost.  CACHE is an array indicating
 918    which exponents have already been calculated.  */
 919
 920 static int
 921 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 922 {
 923   /* If we've already calculated this exponent, then this evaluation
 924      doesn't require any additional multiplications.  */
 925   if (cache[n])
 926     return 0;
 927
 928   cache[n] = true;
 929   return powi_lookup_cost (n - powi_table[n], cache)
 930          + powi_lookup_cost (powi_table[n], cache) + 1;
 931 }
 932
 933 /* Return the number of multiplications required to calculate
 934    powi(x,n) for an arbitrary x, given the exponent N.  This
 935    function needs to be kept in sync with powi_as_mults below.  */
 936
 937 static int
 938 powi_cost (HOST_WIDE_INT n)
 939 {
 940   bool cache[POWI_TABLE_SIZE];
 941   unsigned HOST_WIDE_INT digit;
 942   unsigned HOST_WIDE_INT val;
 943   int result;
 944
 945   if (n == 0)
 946     return 0;
 947
 948   /* Ignore the reciprocal when calculating the cost.  */
 949   val = (n < 0) ? -n : n;
 950
 951   /* Initialize the exponent cache.  */
 952   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 953   cache[1] = true;
 954
 955   result = 0;
 956
 957   while (val >= POWI_TABLE_SIZE)
 958     {
 959       if (val & 1)
 960         {
 961           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 962           result += powi_lookup_cost (digit, cache)
 963                     + POWI_WINDOW_SIZE + 1;
 964           val >>= POWI_WINDOW_SIZE;
 965         }
 966       else
 967         {
 968           val >>= 1;
 969           result++;
 970         }
 971     }
 972
 973   return result + powi_lookup_cost (val, cache);
 974 }
 975
 976 /* Recursive subroutine of powi_as_mults.  This function takes the
 977    array, CACHE, of already calculated exponents and an exponent N and
 978    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 979
 980 static tree
 981 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 982                  HOST_WIDE_INT n, tree *cache)
 983 {
 984   tree op0, op1, ssa_target;
 985   unsigned HOST_WIDE_INT digit;
 986   gimple mult_stmt;
 987
 988   if (n < POWI_TABLE_SIZE && cache[n])
 989     return cache[n];
 990
 991   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 992
 993   if (n < POWI_TABLE_SIZE)
 994     {
 995       cache[n] = ssa_target;
 996       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 997       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
 998     }
 999   else if (n & 1)
1000     {
1001       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1002       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1003       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1004     }
1005   else
1006     {
1007       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1008       op1 = op0;
1009     }
1010
1011   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
1012   gimple_set_location (mult_stmt, loc);
1013   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1014
1015   return ssa_target;
1016 }
1017
1018 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1019    This function needs to be kept in sync with powi_cost above.  */
1020
1021 static tree
1022 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1023                tree arg0, HOST_WIDE_INT n)
1024 {
1025   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1026   gimple div_stmt;
1027   tree target;
1028
1029   if (n == 0)
1030     return build_real (type, dconst1);
1031
1032   memset (cache, 0,  sizeof (cache));
1033   cache[1] = arg0;
1034
1035   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1036   if (n >= 0)
1037     return result;
1038
1039   /* If the original exponent was negative, reciprocate the result.  */
1040   target = make_temp_ssa_name (type, NULL, "powmult");
1041   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1042                                            build_real (type, dconst1),
1043                                            result);
1044   gimple_set_location (div_stmt, loc);
1045   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1046
1047   return target;
1048 }
1049
1050 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1051    location info LOC.  If the arguments are appropriate, create an
1052    equivalent sequence of statements prior to GSI using an optimal
1053    number of multiplications, and return an expession holding the
1054    result.  */
1055
1056 static tree
1057 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1058                             tree arg0, HOST_WIDE_INT n)
1059 {
1060   /* Avoid largest negative number.  */
1061   if (n != -n
1062       && ((n >= -1 && n <= 2)
1063           || (optimize_function_for_speed_p (cfun)
1064               && powi_cost (n) <= POWI_MAX_MULTS)))
1065     return powi_as_mults (gsi, loc, arg0, n);
1066
1067   return NULL_TREE;
1068 }
1069
1070 /* Build a gimple call statement that calls FN with argument ARG.
1071    Set the lhs of the call statement to a fresh SSA name.  Insert the
1072    statement prior to GSI's current position, and return the fresh
1073    SSA name.  */
1074
1075 static tree
1076 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1077                        tree fn, tree arg)
1078 {
1079   gimple call_stmt;
1080   tree ssa_target;
1081
1082   call_stmt = gimple_build_call (fn, 1, arg);
1083   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1084   gimple_set_lhs (call_stmt, ssa_target);
1085   gimple_set_location (call_stmt, loc);
1086   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1087
1088   return ssa_target;
1089 }
1090
1091 /* Build a gimple binary operation with the given CODE and arguments
1092    ARG0, ARG1, assigning the result to a new SSA name for variable
1093    TARGET.  Insert the statement prior to GSI's current position, and
1094    return the fresh SSA name.*/
1095
1096 static tree
1097 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1098                         const char *name, enum tree_code code,
1099                         tree arg0, tree arg1)
1100 {
1101   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1102   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1103   gimple_set_location (stmt, loc);
1104   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1105   return result;
1106 }
1107
1108 /* Build a gimple reference operation with the given CODE and argument
1109    ARG, assigning the result to a new SSA name of TYPE with NAME.
1110    Insert the statement prior to GSI's current position, and return
1111    the fresh SSA name.  */
1112
1113 static inline tree
1114 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1115                       const char *name, enum tree_code code, tree arg0)
1116 {
1117   tree result = make_temp_ssa_name (type, NULL, name);
1118   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1119   gimple_set_location (stmt, loc);
1120   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1121   return result;
1122 }
1123
1124 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1125    prior to GSI's current position, and return the fresh SSA name.  */
1126
1127 static tree
1128 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1129                        tree type, tree val)
1130 {
1131   tree result = make_ssa_name (type, NULL);
1132   gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1133   gimple_set_location (stmt, loc);
1134   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1135   return result;
1136 }
1137
1138 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1139    with location info LOC.  If possible, create an equivalent and
1140    less expensive sequence of statements prior to GSI, and return an
1141    expession holding the result.  */
1142
1143 static tree
1144 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1145                            tree arg0, tree arg1)
1146 {
1147   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1148   REAL_VALUE_TYPE c2, dconst3;
1149   HOST_WIDE_INT n;
1150   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1151   machine_mode mode;
1152   bool hw_sqrt_exists, c_is_int, c2_is_int;
1153
1154   /* If the exponent isn't a constant, there's nothing of interest
1155      to be done.  */
1156   if (TREE_CODE (arg1) != REAL_CST)
1157     return NULL_TREE;
1158
1159   /* If the exponent is equivalent to an integer, expand to an optimal
1160      multiplication sequence when profitable.  */
1161   c = TREE_REAL_CST (arg1);
1162   n = real_to_integer (&c);
1163   real_from_integer (&cint, VOIDmode, n, SIGNED);
1164   c_is_int = real_identical (&c, &cint);
1165
1166   if (c_is_int
1167       && ((n >= -1 && n <= 2)
1168           || (flag_unsafe_math_optimizations
1169               && optimize_bb_for_speed_p (gsi_bb (*gsi))
1170               && powi_cost (n) <= POWI_MAX_MULTS)))
1171     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1172
1173   /* Attempt various optimizations using sqrt and cbrt.  */
1174   type = TREE_TYPE (arg0);
1175   mode = TYPE_MODE (type);
1176   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1177
1178   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1179      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1180      sqrt(-0) = -0.  */
1181   if (sqrtfn
1182       && REAL_VALUES_EQUAL (c, dconsthalf)
1183       && !HONOR_SIGNED_ZEROS (mode))
1184     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1185
1186   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1187      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1188      so do this optimization even if -Os.  Don't do this optimization
1189      if we don't have a hardware sqrt insn.  */
1190   dconst1_4 = dconst1;
1191   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1192   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1193
1194   if (flag_unsafe_math_optimizations
1195       && sqrtfn
1196       && REAL_VALUES_EQUAL (c, dconst1_4)
1197       && hw_sqrt_exists)
1198     {
1199       /* sqrt(x)  */
1200       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1201
1202       /* sqrt(sqrt(x))  */
1203       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1204     }
1205
1206   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1207      optimizing for space.  Don't do this optimization if we don't have
1208      a hardware sqrt insn.  */
1209   real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
1210   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1211
1212   if (flag_unsafe_math_optimizations
1213       && sqrtfn
1214       && optimize_function_for_speed_p (cfun)
1215       && REAL_VALUES_EQUAL (c, dconst3_4)
1216       && hw_sqrt_exists)
1217     {
1218       /* sqrt(x)  */
1219       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1220
1221       /* sqrt(sqrt(x))  */
1222       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1223
1224       /* sqrt(x) * sqrt(sqrt(x))  */
1225       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1226                                      sqrt_arg0, sqrt_sqrt);
1227     }
1228
1229   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1230      optimizations since 1./3. is not exactly representable.  If x
1231      is negative and finite, the correct value of pow(x,1./3.) is
1232      a NaN with the "invalid" exception raised, because the value
1233      of 1./3. actually has an even denominator.  The correct value
1234      of cbrt(x) is a negative real value.  */
1235   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1236   dconst1_3 = real_value_truncate (mode, dconst_third ());
1237
1238   if (flag_unsafe_math_optimizations
1239       && cbrtfn
1240       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1241       && REAL_VALUES_EQUAL (c, dconst1_3))
1242     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1243
1244   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1245      if we don't have a hardware sqrt insn.  */
1246   dconst1_6 = dconst1_3;
1247   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1248
1249   if (flag_unsafe_math_optimizations
1250       && sqrtfn
1251       && cbrtfn
1252       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1253       && optimize_function_for_speed_p (cfun)
1254       && hw_sqrt_exists
1255       && REAL_VALUES_EQUAL (c, dconst1_6))
1256     {
1257       /* sqrt(x)  */
1258       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1259
1260       /* cbrt(sqrt(x))  */
1261       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1262     }
1263
1264   /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1265      and c not an integer, into
1266
1267        sqrt(x) * powi(x, n/2),                n > 0;
1268        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1269
1270      Do not calculate the powi factor when n/2 = 0.  */
1271   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1272   n = real_to_integer (&c2);
1273   real_from_integer (&cint, VOIDmode, n, SIGNED);
1274   c2_is_int = real_identical (&c2, &cint);
1275
1276   if (flag_unsafe_math_optimizations
1277       && sqrtfn
1278       && c2_is_int
1279       && !c_is_int
1280       && optimize_function_for_speed_p (cfun))
1281     {
1282       tree powi_x_ndiv2 = NULL_TREE;
1283
1284       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1285          possible or profitable, give up.  Skip the degenerate case when
1286          n is 1 or -1, where the result is always 1.  */
1287       if (absu_hwi (n) != 1)
1288         {
1289           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1290                                                      abs_hwi (n / 2));
1291           if (!powi_x_ndiv2)
1292             return NULL_TREE;
1293         }
1294
1295       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1296          result of the optimal multiply sequence just calculated.  */
1297       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1298
1299       if (absu_hwi (n) == 1)
1300         result = sqrt_arg0;
1301       else
1302         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1303                                          sqrt_arg0, powi_x_ndiv2);
1304
1305       /* If n is negative, reciprocate the result.  */
1306       if (n < 0)
1307         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1308                                          build_real (type, dconst1), result);
1309       return result;
1310     }
1311
1312   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1313
1314      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1315      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1316
1317      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1318      different from pow(x, 1./3.) due to rounding and behavior with
1319      negative x, we need to constrain this transformation to unsafe
1320      math and positive x or finite math.  */
1321   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1322   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1323   real_round (&c2, mode, &c2);
1324   n = real_to_integer (&c2);
1325   real_from_integer (&cint, VOIDmode, n, SIGNED);
1326   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1327   real_convert (&c2, mode, &c2);
1328
1329   if (flag_unsafe_math_optimizations
1330       && cbrtfn
1331       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1332       && real_identical (&c2, &c)
1333       && !c2_is_int
1334       && optimize_function_for_speed_p (cfun)
1335       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1336     {
1337       tree powi_x_ndiv3 = NULL_TREE;
1338
1339       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1340          possible or profitable, give up.  Skip the degenerate case when
1341          abs(n) < 3, where the result is always 1.  */
1342       if (absu_hwi (n) >= 3)
1343         {
1344           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1345                                                      abs_hwi (n / 3));
1346           if (!powi_x_ndiv3)
1347             return NULL_TREE;
1348         }
1349
1350       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1351          as that creates an unnecessary variable.  Instead, just produce
1352          either cbrt(x) or cbrt(x) * cbrt(x).  */
1353       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1354
1355       if (absu_hwi (n) % 3 == 1)
1356         powi_cbrt_x = cbrt_x;
1357       else
1358         powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1359                                               cbrt_x, cbrt_x);
1360
1361       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1362       if (absu_hwi (n) < 3)
1363         result = powi_cbrt_x;
1364       else
1365         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1366                                          powi_x_ndiv3, powi_cbrt_x);
1367
1368       /* If n is negative, reciprocate the result.  */
1369       if (n < 0)
1370         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1371                                          build_real (type, dconst1), result);
1372
1373       return result;
1374     }
1375
1376   /* No optimizations succeeded.  */
1377   return NULL_TREE;
1378 }
1379
1380 /* ARG is the argument to a cabs builtin call in GSI with location info
1381    LOC.  Create a sequence of statements prior to GSI that calculates
1382    sqrt(R*R + I*I), where R and I are the real and imaginary components
1383    of ARG, respectively.  Return an expression holding the result.  */
1384
1385 static tree
1386 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1387 {
1388   tree real_part, imag_part, addend1, addend2, sum, result;
1389   tree type = TREE_TYPE (TREE_TYPE (arg));
1390   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1391   machine_mode mode = TYPE_MODE (type);
1392
1393   if (!flag_unsafe_math_optimizations
1394       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1395       || !sqrtfn
1396       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1397     return NULL_TREE;
1398
1399   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1400                                     REALPART_EXPR, arg);
1401   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1402                                     real_part, real_part);
1403   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1404                                     IMAGPART_EXPR, arg);
1405   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1406                                     imag_part, imag_part);
1407   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1408   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1409
1410   return result;
1411 }
1412
1413 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1414    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1415    an optimal number of multiplies, when n is a constant.  */
1416
1417 namespace {
1418
1419 const pass_data pass_data_cse_sincos =
1420 {
1421   GIMPLE_PASS, /* type */
1422   "sincos", /* name */
1423   OPTGROUP_NONE, /* optinfo_flags */
1424   TV_NONE, /* tv_id */
1425   PROP_ssa, /* properties_required */
1426   0, /* properties_provided */
1427   0, /* properties_destroyed */
1428   0, /* todo_flags_start */
1429   TODO_update_ssa, /* todo_flags_finish */
1430 };
1431
1432 class pass_cse_sincos : public gimple_opt_pass
1433 {
1434 public:
1435   pass_cse_sincos (gcc::context *ctxt)
1436     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1437   {}
1438
1439   /* opt_pass methods: */
1440   virtual bool gate (function *)
1441     {
1442       /* We no longer require either sincos or cexp, since powi expansion
1443          piggybacks on this pass.  */
1444       return optimize;
1445     }
1446
1447   virtual unsigned int execute (function *);
1448
1449 }; // class pass_cse_sincos
1450
1451 unsigned int
1452 pass_cse_sincos::execute (function *fun)
1453 {
1454   basic_block bb;
1455   bool cfg_changed = false;
1456
1457   calculate_dominance_info (CDI_DOMINATORS);
1458   memset (&sincos_stats, 0, sizeof (sincos_stats));
1459
1460   FOR_EACH_BB_FN (bb, fun)
1461     {
1462       gimple_stmt_iterator gsi;
1463       bool cleanup_eh = false;
1464
1465       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1466         {
1467           gimple stmt = gsi_stmt (gsi);
1468           tree fndecl;
1469
1470           /* Only the last stmt in a bb could throw, no need to call
1471              gimple_purge_dead_eh_edges if we change something in the middle
1472              of a basic block.  */
1473           cleanup_eh = false;
1474
1475           if (is_gimple_call (stmt)
1476               && gimple_call_lhs (stmt)
1477               && (fndecl = gimple_call_fndecl (stmt))
1478               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1479             {
1480               tree arg, arg0, arg1, result;
1481               HOST_WIDE_INT n;
1482               location_t loc;
1483
1484               switch (DECL_FUNCTION_CODE (fndecl))
1485                 {
1486                 CASE_FLT_FN (BUILT_IN_COS):
1487                 CASE_FLT_FN (BUILT_IN_SIN):
1488                 CASE_FLT_FN (BUILT_IN_CEXPI):
1489                   /* Make sure we have either sincos or cexp.  */
1490                   if (!targetm.libc_has_function (function_c99_math_complex)
1491                       && !targetm.libc_has_function (function_sincos))
1492                     break;
1493
1494                   arg = gimple_call_arg (stmt, 0);
1495                   if (TREE_CODE (arg) == SSA_NAME)
1496                     cfg_changed |= execute_cse_sincos_1 (arg);
1497                   break;
1498
1499                 CASE_FLT_FN (BUILT_IN_POW):
1500                   arg0 = gimple_call_arg (stmt, 0);
1501                   arg1 = gimple_call_arg (stmt, 1);
1502
1503                   loc = gimple_location (stmt);
1504                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1505
1506                   if (result)
1507                     {
1508                       tree lhs = gimple_get_lhs (stmt);
1509                       gimple new_stmt = gimple_build_assign (lhs, result);
1510                       gimple_set_location (new_stmt, loc);
1511                       unlink_stmt_vdef (stmt);
1512                       gsi_replace (&gsi, new_stmt, true);
1513                       cleanup_eh = true;
1514                       if (gimple_vdef (stmt))
1515                         release_ssa_name (gimple_vdef (stmt));
1516                     }
1517                   break;
1518
1519                 CASE_FLT_FN (BUILT_IN_POWI):
1520                   arg0 = gimple_call_arg (stmt, 0);
1521                   arg1 = gimple_call_arg (stmt, 1);
1522                   loc = gimple_location (stmt);
1523
1524                   if (real_minus_onep (arg0))
1525                     {
1526                       tree t0, t1, cond, one, minus_one;
1527                       gimple stmt;
1528
1529                       t0 = TREE_TYPE (arg0);
1530                       t1 = TREE_TYPE (arg1);
1531                       one = build_real (t0, dconst1);
1532                       minus_one = build_real (t0, dconstm1);
1533
1534                       cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1535                       stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, cond,
1536                                                            arg1,
1537                                                            build_int_cst (t1,
1538                                                                           1));
1539                       gimple_set_location (stmt, loc);
1540                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1541
1542                       result = make_temp_ssa_name (t0, NULL, "powi");
1543                       stmt = gimple_build_assign_with_ops (COND_EXPR, result,
1544                                                            cond,
1545                                                            minus_one, one);
1546                       gimple_set_location (stmt, loc);
1547                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1548                     }
1549                   else
1550                     {
1551                       if (!tree_fits_shwi_p (arg1))
1552                         break;
1553
1554                       n = tree_to_shwi (arg1);
1555                       result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1556                     }
1557
1558                   if (result)
1559                     {
1560                       tree lhs = gimple_get_lhs (stmt);
1561                       gimple new_stmt = gimple_build_assign (lhs, result);
1562                       gimple_set_location (new_stmt, loc);
1563                       unlink_stmt_vdef (stmt);
1564                       gsi_replace (&gsi, new_stmt, true);
1565                       cleanup_eh = true;
1566                       if (gimple_vdef (stmt))
1567                         release_ssa_name (gimple_vdef (stmt));
1568                     }
1569                   break;
1570
1571                 CASE_FLT_FN (BUILT_IN_CABS):
1572                   arg0 = gimple_call_arg (stmt, 0);
1573                   loc = gimple_location (stmt);
1574                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1575
1576                   if (result)
1577                     {
1578                       tree lhs = gimple_get_lhs (stmt);
1579                       gimple new_stmt = gimple_build_assign (lhs, result);
1580                       gimple_set_location (new_stmt, loc);
1581                       unlink_stmt_vdef (stmt);
1582                       gsi_replace (&gsi, new_stmt, true);
1583                       cleanup_eh = true;
1584                       if (gimple_vdef (stmt))
1585                         release_ssa_name (gimple_vdef (stmt));
1586                     }
1587                   break;
1588
1589                 default:;
1590                 }
1591             }
1592         }
1593       if (cleanup_eh)
1594         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1595     }
1596
1597   statistics_counter_event (fun, "sincos statements inserted",
1598                             sincos_stats.inserted);
1599
1600   free_dominance_info (CDI_DOMINATORS);
1601   return cfg_changed ? TODO_cleanup_cfg : 0;
1602 }
1603
1604 } // anon namespace
1605
1606 gimple_opt_pass *
1607 make_pass_cse_sincos (gcc::context *ctxt)
1608 {
1609   return new pass_cse_sincos (ctxt);
1610 }
1611
1612 /* A symbolic number is used to detect byte permutation and selection
1613    patterns.  Therefore the field N contains an artificial number
1614    consisting of octet sized markers:
1615
1616    0    - target byte has the value 0
1617    FF   - target byte has an unknown value (eg. due to sign extension)
1618    1..size - marker value is the target byte index minus one.
1619
1620    To detect permutations on memory sources (arrays and structures), a symbolic
1621    number is also associated a base address (the array or structure the load is
1622    made from), an offset from the base address and a range which gives the
1623    difference between the highest and lowest accessed memory location to make
1624    such a symbolic number. The range is thus different from size which reflects
1625    the size of the type of current expression. Note that for non memory source,
1626    range holds the same value as size.
1627
1628    For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1629    a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1630    still have a size of 2 but this time a range of 1.  */
1631
1632 struct symbolic_number {
1633   uint64_t n;
1634   tree type;
1635   tree base_addr;
1636   tree offset;
1637   HOST_WIDE_INT bytepos;
1638   tree alias_set;
1639   tree vuse;
1640   unsigned HOST_WIDE_INT range;
1641 };
1642
1643 #define BITS_PER_MARKER 8
1644 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1645 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1646 #define HEAD_MARKER(n, size) \
1647   ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1648
1649 /* The number which the find_bswap_or_nop_1 result should match in
1650    order to have a nop.  The number is masked according to the size of
1651    the symbolic number before using it.  */
1652 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1653   (uint64_t)0x08070605 << 32 | 0x04030201)
1654
1655 /* The number which the find_bswap_or_nop_1 result should match in
1656    order to have a byte swap.  The number is masked according to the
1657    size of the symbolic number before using it.  */
1658 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1659   (uint64_t)0x01020304 << 32 | 0x05060708)
1660
1661 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1662    number N.  Return false if the requested operation is not permitted
1663    on a symbolic number.  */
1664
1665 static inline bool
1666 do_shift_rotate (enum tree_code code,
1667                  struct symbolic_number *n,
1668                  int count)
1669 {
1670   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1671   unsigned head_marker;
1672
1673   if (count % BITS_PER_UNIT != 0)
1674     return false;
1675   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1676
1677   /* Zero out the extra bits of N in order to avoid them being shifted
1678      into the significant bits.  */
1679   if (size < 64 / BITS_PER_MARKER)
1680     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1681
1682   switch (code)
1683     {
1684     case LSHIFT_EXPR:
1685       n->n <<= count;
1686       break;
1687     case RSHIFT_EXPR:
1688       head_marker = HEAD_MARKER (n->n, size);
1689       n->n >>= count;
1690       /* Arithmetic shift of signed type: result is dependent on the value.  */
1691       if (!TYPE_UNSIGNED (n->type) && head_marker)
1692         for (i = 0; i < count / BITS_PER_MARKER; i++)
1693           n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1694                   << ((size - 1 - i) * BITS_PER_MARKER);
1695       break;
1696     case LROTATE_EXPR:
1697       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1698       break;
1699     case RROTATE_EXPR:
1700       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1701       break;
1702     default:
1703       return false;
1704     }
1705   /* Zero unused bits for size.  */
1706   if (size < 64 / BITS_PER_MARKER)
1707     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1708   return true;
1709 }
1710
1711 /* Perform sanity checking for the symbolic number N and the gimple
1712    statement STMT.  */
1713
1714 static inline bool
1715 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1716 {
1717   tree lhs_type;
1718
1719   lhs_type = gimple_expr_type (stmt);
1720
1721   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1722     return false;
1723
1724   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1725     return false;
1726
1727   return true;
1728 }
1729
1730 /* Initialize the symbolic number N for the bswap pass from the base element
1731    SRC manipulated by the bitwise OR expression.  */
1732
1733 static bool
1734 init_symbolic_number (struct symbolic_number *n, tree src)
1735 {
1736   int size;
1737
1738   n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1739
1740   /* Set up the symbolic number N by setting each byte to a value between 1 and
1741      the byte size of rhs1.  The highest order byte is set to n->size and the
1742      lowest order byte to 1.  */
1743   n->type = TREE_TYPE (src);
1744   size = TYPE_PRECISION (n->type);
1745   if (size % BITS_PER_UNIT != 0)
1746     return false;
1747   size /= BITS_PER_UNIT;
1748   if (size > 64 / BITS_PER_MARKER)
1749     return false;
1750   n->range = size;
1751   n->n = CMPNOP;
1752
1753   if (size < 64 / BITS_PER_MARKER)
1754     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1755
1756   return true;
1757 }
1758
1759 /* Check if STMT might be a byte swap or a nop from a memory source and returns
1760    the answer. If so, REF is that memory source and the base of the memory area
1761    accessed and the offset of the access from that base are recorded in N.  */
1762
1763 bool
1764 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1765 {
1766   /* Leaf node is an array or component ref. Memorize its base and
1767      offset from base to compare to other such leaf node.  */
1768   HOST_WIDE_INT bitsize, bitpos;
1769   machine_mode mode;
1770   int unsignedp, volatilep;
1771   tree offset, base_addr;
1772
1773   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1774     return false;
1775
1776   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
1777                                    &unsignedp, &volatilep, false);
1778
1779   if (TREE_CODE (base_addr) == MEM_REF)
1780     {
1781       offset_int bit_offset = 0;
1782       tree off = TREE_OPERAND (base_addr, 1);
1783
1784       if (!integer_zerop (off))
1785         {
1786           offset_int boff, coff = mem_ref_offset (base_addr);
1787           boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
1788           bit_offset += boff;
1789         }
1790
1791       base_addr = TREE_OPERAND (base_addr, 0);
1792
1793       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
1794       if (wi::neg_p (bit_offset))
1795         {
1796           offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1797           offset_int tem = bit_offset.and_not (mask);
1798           /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1799              Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
1800           bit_offset -= tem;
1801           tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1802           if (offset)
1803             offset = size_binop (PLUS_EXPR, offset,
1804                                     wide_int_to_tree (sizetype, tem));
1805           else
1806             offset = wide_int_to_tree (sizetype, tem);
1807         }
1808
1809       bitpos += bit_offset.to_shwi ();
1810     }
1811
1812   if (bitpos % BITS_PER_UNIT)
1813     return false;
1814   if (bitsize % BITS_PER_UNIT)
1815     return false;
1816
1817   if (!init_symbolic_number (n, ref))
1818     return false;
1819   n->base_addr = base_addr;
1820   n->offset = offset;
1821   n->bytepos = bitpos / BITS_PER_UNIT;
1822   n->alias_set = reference_alias_ptr_type (ref);
1823   n->vuse = gimple_vuse (stmt);
1824   return true;
1825 }
1826
1827 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1828    the operation given by the rhs of STMT on the result.  If the operation
1829    could successfully be executed the function returns a gimple stmt whose
1830    rhs's first tree is the expression of the source operand and NULL
1831    otherwise.  */
1832
1833 static gimple
1834 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1835 {
1836   enum tree_code code;
1837   tree rhs1, rhs2 = NULL;
1838   gimple rhs1_stmt, rhs2_stmt, source_stmt1;
1839   enum gimple_rhs_class rhs_class;
1840
1841   if (!limit || !is_gimple_assign (stmt))
1842     return NULL;
1843
1844   rhs1 = gimple_assign_rhs1 (stmt);
1845
1846   if (find_bswap_or_nop_load (stmt, rhs1, n))
1847     return stmt;
1848
1849   if (TREE_CODE (rhs1) != SSA_NAME)
1850     return NULL;
1851
1852   code = gimple_assign_rhs_code (stmt);
1853   rhs_class = gimple_assign_rhs_class (stmt);
1854   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1855
1856   if (rhs_class == GIMPLE_BINARY_RHS)
1857     rhs2 = gimple_assign_rhs2 (stmt);
1858
1859   /* Handle unary rhs and binary rhs with integer constants as second
1860      operand.  */
1861
1862   if (rhs_class == GIMPLE_UNARY_RHS
1863       || (rhs_class == GIMPLE_BINARY_RHS
1864           && TREE_CODE (rhs2) == INTEGER_CST))
1865     {
1866       if (code != BIT_AND_EXPR
1867           && code != LSHIFT_EXPR
1868           && code != RSHIFT_EXPR
1869           && code != LROTATE_EXPR
1870           && code != RROTATE_EXPR
1871           && !CONVERT_EXPR_CODE_P (code))
1872         return NULL;
1873
1874       source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
1875
1876       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
1877          we have to initialize the symbolic number.  */
1878       if (!source_stmt1)
1879         {
1880           if (gimple_assign_load_p (stmt)
1881               || !init_symbolic_number (n, rhs1))
1882             return NULL;
1883           source_stmt1 = stmt;
1884         }
1885
1886       switch (code)
1887         {
1888         case BIT_AND_EXPR:
1889           {
1890             int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1891             uint64_t val = int_cst_value (rhs2), mask = 0;
1892             uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
1893
1894             /* Only constants masking full bytes are allowed.  */
1895             for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
1896               if ((val & tmp) != 0 && (val & tmp) != tmp)
1897                 return NULL;
1898               else if (val & tmp)
1899                 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
1900
1901             n->n &= mask;
1902           }
1903           break;
1904         case LSHIFT_EXPR:
1905         case RSHIFT_EXPR:
1906         case LROTATE_EXPR:
1907         case RROTATE_EXPR:
1908           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1909             return NULL;
1910           break;
1911         CASE_CONVERT:
1912           {
1913             int i, type_size, old_type_size;
1914             tree type;
1915
1916             type = gimple_expr_type (stmt);
1917             type_size = TYPE_PRECISION (type);
1918             if (type_size % BITS_PER_UNIT != 0)
1919               return NULL;
1920             type_size /= BITS_PER_UNIT;
1921             if (type_size > 64 / BITS_PER_MARKER)
1922               return NULL;
1923
1924             /* Sign extension: result is dependent on the value.  */
1925             old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1926             if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
1927                 && HEAD_MARKER (n->n, old_type_size))
1928               for (i = 0; i < type_size - old_type_size; i++)
1929                 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1930                         << ((type_size - 1 - i) * BITS_PER_MARKER);
1931
1932             if (type_size < 64 / BITS_PER_MARKER)
1933               {
1934                 /* If STMT casts to a smaller type mask out the bits not
1935                    belonging to the target type.  */
1936                 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
1937               }
1938             n->type = type;
1939             if (!n->base_addr)
1940               n->range = type_size;
1941           }
1942           break;
1943         default:
1944           return NULL;
1945         };
1946       return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
1947     }
1948
1949   /* Handle binary rhs.  */
1950
1951   if (rhs_class == GIMPLE_BINARY_RHS)
1952     {
1953       int i, size;
1954       struct symbolic_number n1, n2;
1955       uint64_t mask;
1956       gimple source_stmt2;
1957
1958       if (code != BIT_IOR_EXPR)
1959         return NULL;
1960
1961       if (TREE_CODE (rhs2) != SSA_NAME)
1962         return NULL;
1963
1964       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1965
1966       switch (code)
1967         {
1968         case BIT_IOR_EXPR:
1969           source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
1970
1971           if (!source_stmt1)
1972             return NULL;
1973
1974           source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
1975
1976           if (!source_stmt2)
1977             return NULL;
1978
1979           if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
1980             return NULL;
1981
1982           if (!n1.vuse != !n2.vuse ||
1983           (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
1984             return NULL;
1985
1986           if (gimple_assign_rhs1 (source_stmt1)
1987               != gimple_assign_rhs1 (source_stmt2))
1988             {
1989               int64_t inc;
1990               HOST_WIDE_INT off_sub;
1991               struct symbolic_number *n_ptr;
1992
1993               if (!n1.base_addr || !n2.base_addr
1994                   || !operand_equal_p (n1.base_addr, n2.base_addr, 0))
1995                 return NULL;
1996               if (!n1.offset != !n2.offset ||
1997                   (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0)))
1998                 return NULL;
1999
2000               /* We swap n1 with n2 to have n1 < n2.  */
2001               if (n2.bytepos < n1.bytepos)
2002                 {
2003                   struct symbolic_number tmpn;
2004
2005                   tmpn = n2;
2006                   n2 = n1;
2007                   n1 = tmpn;
2008                   source_stmt1 = source_stmt2;
2009                 }
2010
2011               off_sub = n2.bytepos - n1.bytepos;
2012
2013               /* Check that the range of memory covered can be represented by
2014                  a symbolic number.  */
2015               if (off_sub + n2.range > 64 / BITS_PER_MARKER)
2016                 return NULL;
2017               n->range = n2.range + off_sub;
2018
2019               /* Reinterpret byte marks in symbolic number holding the value of
2020                  bigger weight according to target endianness.  */
2021               inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
2022               size = TYPE_PRECISION (n1.type) / BITS_PER_UNIT;
2023               if (BYTES_BIG_ENDIAN)
2024                 n_ptr = &n1;
2025               else
2026                 n_ptr = &n2;
2027               for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2028                 {
2029                   unsigned marker =
2030                     (n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2031                   if (marker && marker != MARKER_BYTE_UNKNOWN)
2032                     n_ptr->n += inc;
2033                 }
2034             }
2035           else
2036             n->range = n1.range;
2037
2038           if (!n1.alias_set
2039               || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set))
2040             n->alias_set = n1.alias_set;
2041           else
2042             n->alias_set = ptr_type_node;
2043           n->vuse = n1.vuse;
2044           n->base_addr = n1.base_addr;
2045           n->offset = n1.offset;
2046           n->bytepos = n1.bytepos;
2047           n->type = n1.type;
2048           size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2049           for (i = 0, mask = MARKER_MASK; i < size;
2050                i++, mask <<= BITS_PER_MARKER)
2051             {
2052               uint64_t masked1, masked2;
2053
2054               masked1 = n1.n & mask;
2055               masked2 = n2.n & mask;
2056               if (masked1 && masked2 && masked1 != masked2)
2057                 return NULL;
2058             }
2059           n->n = n1.n | n2.n;
2060
2061           if (!verify_symbolic_number_p (n, stmt))
2062             return NULL;
2063
2064           break;
2065         default:
2066           return NULL;
2067         }
2068       return source_stmt1;
2069     }
2070   return NULL;
2071 }
2072
2073 /* Check if STMT completes a bswap implementation or a read in a given
2074    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2075    accordingly.  It also sets N to represent the kind of operations
2076    performed: size of the resulting expression and whether it works on
2077    a memory source, and if so alias-set and vuse.  At last, the
2078    function returns a stmt whose rhs's first tree is the source
2079    expression.  */
2080
2081 static gimple
2082 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2083 {
2084 /* The number which the find_bswap_or_nop_1 result should match in order
2085    to have a full byte swap.  The number is shifted to the right
2086    according to the size of the symbolic number before using it.  */
2087   uint64_t cmpxchg = CMPXCHG;
2088   uint64_t cmpnop = CMPNOP;
2089
2090   gimple source_stmt;
2091   int limit;
2092
2093   /* The last parameter determines the depth search limit.  It usually
2094      correlates directly to the number n of bytes to be touched.  We
2095      increase that number by log2(n) + 1 here in order to also
2096      cover signed -> unsigned conversions of the src operand as can be seen
2097      in libgcc, and for initial shift/and operation of the src operand.  */
2098   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2099   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2100   source_stmt =  find_bswap_or_nop_1 (stmt, n, limit);
2101
2102   if (!source_stmt)
2103     return NULL;
2104
2105   /* Find real size of result (highest non zero byte).  */
2106   if (n->base_addr)
2107     {
2108       int rsize;
2109       uint64_t tmpn;
2110
2111       for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2112       n->range = rsize;
2113     }
2114
2115   /* Zero out the extra bits of N and CMP*.  */
2116   if (n->range < (int) sizeof (int64_t))
2117     {
2118       uint64_t mask;
2119
2120       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2121       cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2122       cmpnop &= mask;
2123     }
2124
2125   /* A complete byte swap should make the symbolic number to start with
2126      the largest digit in the highest order byte. Unchanged symbolic
2127      number indicates a read with same endianness as target architecture.  */
2128   if (n->n == cmpnop)
2129     *bswap = false;
2130   else if (n->n == cmpxchg)
2131     *bswap = true;
2132   else
2133     return NULL;
2134
2135   /* Useless bit manipulation performed by code.  */
2136   if (!n->base_addr && n->n == cmpnop)
2137     return NULL;
2138
2139   n->range *= BITS_PER_UNIT;
2140   return source_stmt;
2141 }
2142
2143 namespace {
2144
2145 const pass_data pass_data_optimize_bswap =
2146 {
2147   GIMPLE_PASS, /* type */
2148   "bswap", /* name */
2149   OPTGROUP_NONE, /* optinfo_flags */
2150   TV_NONE, /* tv_id */
2151   PROP_ssa, /* properties_required */
2152   0, /* properties_provided */
2153   0, /* properties_destroyed */
2154   0, /* todo_flags_start */
2155   0, /* todo_flags_finish */
2156 };
2157
2158 class pass_optimize_bswap : public gimple_opt_pass
2159 {
2160 public:
2161   pass_optimize_bswap (gcc::context *ctxt)
2162     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2163   {}
2164
2165   /* opt_pass methods: */
2166   virtual bool gate (function *)
2167     {
2168       return flag_expensive_optimizations && optimize;
2169     }
2170
2171   virtual unsigned int execute (function *);
2172
2173 }; // class pass_optimize_bswap
2174
2175 /* Perform the bswap optimization: replace the statement CUR_STMT at
2176    GSI with a load of type, VUSE and set-alias as described by N if a
2177    memory source is involved (N->base_addr is non null), followed by
2178    the builtin bswap invocation in FNDECL if BSWAP is true.  SRC_STMT
2179    gives where should the replacement be made.  It also gives the
2180    source on which CUR_STMT is operating via its rhs's first tree nad
2181    N->range gives the size of the expression involved for maintaining
2182    some statistics.  */
2183
2184 static bool
2185 bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt,
2186                tree fndecl, tree bswap_type, tree load_type,
2187                struct symbolic_number *n, bool bswap)
2188 {
2189   tree src, tmp, tgt;
2190   gimple bswap_stmt;
2191
2192   src = gimple_assign_rhs1 (src_stmt);
2193   tgt = gimple_assign_lhs (cur_stmt);
2194
2195   /* Need to load the value from memory first.  */
2196   if (n->base_addr)
2197     {
2198       gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2199       tree addr_expr, addr_tmp, val_expr, val_tmp;
2200       tree load_offset_ptr, aligned_load_type;
2201       gimple addr_stmt, load_stmt;
2202       unsigned align;
2203
2204       align = get_object_alignment (src);
2205       if (bswap
2206           && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2207           && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2208         return false;
2209
2210       gsi_move_before (&gsi, &gsi_ins);
2211       gsi = gsi_for_stmt (cur_stmt);
2212
2213       /*  Compute address to load from and cast according to the size
2214           of the load.  */
2215       addr_expr = build_fold_addr_expr (unshare_expr (src));
2216       if (is_gimple_min_invariant (addr_expr))
2217         addr_tmp = addr_expr;
2218       else
2219         {
2220           addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2221                                          "load_src");
2222           addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2223           gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2224         }
2225
2226       /* Perform the load.  */
2227       aligned_load_type = load_type;
2228       if (align < TYPE_ALIGN (load_type))
2229         aligned_load_type = build_aligned_type (load_type, align);
2230       load_offset_ptr = build_int_cst (n->alias_set, 0);
2231       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2232                               load_offset_ptr);
2233
2234       if (!bswap)
2235         {
2236           if (n->range == 16)
2237             nop_stats.found_16bit++;
2238           else if (n->range == 32)
2239             nop_stats.found_32bit++;
2240           else
2241             {
2242               gcc_assert (n->range == 64);
2243               nop_stats.found_64bit++;
2244             }
2245
2246           /* Convert the result of load if necessary.  */
2247           if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2248             {
2249               val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2250                                             "load_dst");
2251               load_stmt = gimple_build_assign (val_tmp, val_expr);
2252               gimple_set_vuse (load_stmt, n->vuse);
2253               gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2254               gimple_assign_set_rhs_with_ops_1 (&gsi, NOP_EXPR, val_tmp,
2255                                                 NULL_TREE, NULL_TREE);
2256             }
2257           else
2258             {
2259               gimple_assign_set_rhs_with_ops_1 (&gsi, MEM_REF, val_expr,
2260                                                 NULL_TREE, NULL_TREE);
2261               gimple_set_vuse (cur_stmt, n->vuse);
2262             }
2263           update_stmt (cur_stmt);
2264
2265           if (dump_file)
2266             {
2267               fprintf (dump_file,
2268                        "%d bit load in target endianness found at: ",
2269                        (int)n->range);
2270               print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2271             }
2272           return true;
2273         }
2274       else
2275         {
2276           val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2277           load_stmt = gimple_build_assign (val_tmp, val_expr);
2278           gimple_set_vuse (load_stmt, n->vuse);
2279           gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2280         }
2281       src = val_tmp;
2282     }
2283
2284   if (n->range == 16)
2285     bswap_stats.found_16bit++;
2286   else if (n->range == 32)
2287     bswap_stats.found_32bit++;
2288   else
2289     {
2290       gcc_assert (n->range == 64);
2291       bswap_stats.found_64bit++;
2292     }
2293
2294   tmp = src;
2295
2296   /* Canonical form for 16 bit bswap is a rotate expression.  */
2297   if (bswap && n->range == 16)
2298     {
2299       tree count = build_int_cst (NULL, BITS_PER_UNIT);
2300       bswap_type = TREE_TYPE (src);
2301       src = fold_build2 (LROTATE_EXPR, bswap_type, src, count);
2302       bswap_stmt = gimple_build_assign (NULL, src);
2303     }
2304   else
2305     {
2306       /* Convert the src expression if necessary.  */
2307       if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2308         {
2309           gimple convert_stmt;
2310           tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2311           convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tmp, src,
2312                                                        NULL);
2313           gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2314         }
2315
2316       bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2317     }
2318
2319   tmp = tgt;
2320
2321   /* Convert the result if necessary.  */
2322   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2323     {
2324       gimple convert_stmt;
2325       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2326       convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tgt, tmp, NULL);
2327       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2328     }
2329
2330   gimple_set_lhs (bswap_stmt, tmp);
2331
2332   if (dump_file)
2333     {
2334       fprintf (dump_file, "%d bit bswap implementation found at: ",
2335                (int)n->range);
2336       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2337     }
2338
2339   gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2340   gsi_remove (&gsi, true);
2341   return true;
2342 }
2343
2344 /* Find manual byte swap implementations as well as load in a given
2345    endianness. Byte swaps are turned into a bswap builtin invokation
2346    while endian loads are converted to bswap builtin invokation or
2347    simple load according to the target endianness.  */
2348
2349 unsigned int
2350 pass_optimize_bswap::execute (function *fun)
2351 {
2352   basic_block bb;
2353   bool bswap16_p, bswap32_p, bswap64_p;
2354   bool changed = false;
2355   tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2356
2357   if (BITS_PER_UNIT != 8)
2358     return 0;
2359
2360   bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
2361                && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
2362   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2363                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2364   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2365                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2366                    || (bswap32_p && word_mode == SImode)));
2367
2368   /* Determine the argument type of the builtins.  The code later on
2369      assumes that the return and argument type are the same.  */
2370   if (bswap16_p)
2371     {
2372       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
2373       bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2374     }
2375
2376   if (bswap32_p)
2377     {
2378       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2379       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2380     }
2381
2382   if (bswap64_p)
2383     {
2384       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2385       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2386     }
2387
2388   memset (&nop_stats, 0, sizeof (nop_stats));
2389   memset (&bswap_stats, 0, sizeof (bswap_stats));
2390
2391   FOR_EACH_BB_FN (bb, fun)
2392     {
2393       gimple_stmt_iterator gsi;
2394
2395       /* We do a reverse scan for bswap patterns to make sure we get the
2396          widest match. As bswap pattern matching doesn't handle
2397          previously inserted smaller bswap replacements as sub-
2398          patterns, the wider variant wouldn't be detected.  */
2399       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
2400         {
2401           gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2402           tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2403           enum tree_code code;
2404           struct symbolic_number n;
2405           bool bswap;
2406
2407           if (!is_gimple_assign (cur_stmt))
2408             continue;
2409
2410           code = gimple_assign_rhs_code (cur_stmt);
2411           switch (code)
2412             {
2413             case LROTATE_EXPR:
2414             case RROTATE_EXPR:
2415               if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2416                   || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2417                      % BITS_PER_UNIT)
2418                 continue;
2419               /* Fall through.  */
2420             case BIT_IOR_EXPR:
2421               break;
2422             default:
2423               continue;
2424             }
2425
2426           src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2427
2428           if (!src_stmt)
2429             continue;
2430
2431           switch (n.range)
2432             {
2433             case 16:
2434               load_type = uint16_type_node;
2435               if (bswap16_p)
2436                 {
2437                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
2438                   bswap_type = bswap16_type;
2439                 }
2440               break;
2441             case 32:
2442               load_type = uint32_type_node;
2443               if (bswap32_p)
2444                 {
2445                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2446                   bswap_type = bswap32_type;
2447                 }
2448               break;
2449             case 64:
2450               load_type = uint64_type_node;
2451               if (bswap64_p)
2452                 {
2453                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2454                   bswap_type = bswap64_type;
2455                 }
2456               break;
2457             default:
2458               continue;
2459             }
2460
2461           if (bswap && !fndecl)
2462             continue;
2463
2464           if (bswap_replace (cur_stmt, gsi, src_stmt, fndecl, bswap_type,
2465                              load_type, &n, bswap))
2466             changed = true;
2467         }
2468     }
2469
2470   statistics_counter_event (fun, "16-bit nop implementations found",
2471                             nop_stats.found_16bit);
2472   statistics_counter_event (fun, "32-bit nop implementations found",
2473                             nop_stats.found_32bit);
2474   statistics_counter_event (fun, "64-bit nop implementations found",
2475                             nop_stats.found_64bit);
2476   statistics_counter_event (fun, "16-bit bswap implementations found",
2477                             bswap_stats.found_16bit);
2478   statistics_counter_event (fun, "32-bit bswap implementations found",
2479                             bswap_stats.found_32bit);
2480   statistics_counter_event (fun, "64-bit bswap implementations found",
2481                             bswap_stats.found_64bit);
2482
2483   return (changed ? TODO_update_ssa : 0);
2484 }
2485
2486 } // anon namespace
2487
2488 gimple_opt_pass *
2489 make_pass_optimize_bswap (gcc::context *ctxt)
2490 {
2491   return new pass_optimize_bswap (ctxt);
2492 }
2493
2494 /* Return true if stmt is a type conversion operation that can be stripped
2495    when used in a widening multiply operation.  */
2496 static bool
2497 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2498 {
2499   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2500
2501   if (TREE_CODE (result_type) == INTEGER_TYPE)
2502     {
2503       tree op_type;
2504       tree inner_op_type;
2505
2506       if (!CONVERT_EXPR_CODE_P (rhs_code))
2507         return false;
2508
2509       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2510
2511       /* If the type of OP has the same precision as the result, then
2512          we can strip this conversion.  The multiply operation will be
2513          selected to create the correct extension as a by-product.  */
2514       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2515         return true;
2516
2517       /* We can also strip a conversion if it preserves the signed-ness of
2518          the operation and doesn't narrow the range.  */
2519       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2520
2521       /* If the inner-most type is unsigned, then we can strip any
2522          intermediate widening operation.  If it's signed, then the
2523          intermediate widening operation must also be signed.  */
2524       if ((TYPE_UNSIGNED (inner_op_type)
2525            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2526           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2527         return true;
2528
2529       return false;
2530     }
2531
2532   return rhs_code == FIXED_CONVERT_EXPR;
2533 }
2534
2535 /* Return true if RHS is a suitable operand for a widening multiplication,
2536    assuming a target type of TYPE.
2537    There are two cases:
2538
2539      - RHS makes some value at least twice as wide.  Store that value
2540        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2541
2542      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2543        but leave *TYPE_OUT untouched.  */
2544
2545 static bool
2546 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2547                         tree *new_rhs_out)
2548 {
2549   gimple stmt;
2550   tree type1, rhs1;
2551
2552   if (TREE_CODE (rhs) == SSA_NAME)
2553     {
2554       stmt = SSA_NAME_DEF_STMT (rhs);
2555       if (is_gimple_assign (stmt))
2556         {
2557           if (! widening_mult_conversion_strippable_p (type, stmt))
2558             rhs1 = rhs;
2559           else
2560             {
2561               rhs1 = gimple_assign_rhs1 (stmt);
2562
2563               if (TREE_CODE (rhs1) == INTEGER_CST)
2564                 {
2565                   *new_rhs_out = rhs1;
2566                   *type_out = NULL;
2567                   return true;
2568                 }
2569             }
2570         }
2571       else
2572         rhs1 = rhs;
2573
2574       type1 = TREE_TYPE (rhs1);
2575
2576       if (TREE_CODE (type1) != TREE_CODE (type)
2577           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2578         return false;
2579
2580       *new_rhs_out = rhs1;
2581       *type_out = type1;
2582       return true;
2583     }
2584
2585   if (TREE_CODE (rhs) == INTEGER_CST)
2586     {
2587       *new_rhs_out = rhs;
2588       *type_out = NULL;
2589       return true;
2590     }
2591
2592   return false;
2593 }
2594
2595 /* Return true if STMT performs a widening multiplication, assuming the
2596    output type is TYPE.  If so, store the unwidened types of the operands
2597    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2598    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2599    and *TYPE2_OUT would give the operands of the multiplication.  */
2600
2601 static bool
2602 is_widening_mult_p (gimple stmt,
2603                     tree *type1_out, tree *rhs1_out,
2604                     tree *type2_out, tree *rhs2_out)
2605 {
2606   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2607
2608   if (TREE_CODE (type) != INTEGER_TYPE
2609       && TREE_CODE (type) != FIXED_POINT_TYPE)
2610     return false;
2611
2612   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2613                                rhs1_out))
2614     return false;
2615
2616   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2617                                rhs2_out))
2618     return false;
2619
2620   if (*type1_out == NULL)
2621     {
2622       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2623         return false;
2624       *type1_out = *type2_out;
2625     }
2626
2627   if (*type2_out == NULL)
2628     {
2629       if (!int_fits_type_p (*rhs2_out, *type1_out))
2630         return false;
2631       *type2_out = *type1_out;
2632     }
2633
2634   /* Ensure that the larger of the two operands comes first. */
2635   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2636     {
2637       tree tmp;
2638       tmp = *type1_out;
2639       *type1_out = *type2_out;
2640       *type2_out = tmp;
2641       tmp = *rhs1_out;
2642       *rhs1_out = *rhs2_out;
2643       *rhs2_out = tmp;
2644     }
2645
2646   return true;
2647 }
2648
2649 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2650    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2651    value is true iff we converted the statement.  */
2652
2653 static bool
2654 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2655 {
2656   tree lhs, rhs1, rhs2, type, type1, type2;
2657   enum insn_code handler;
2658   machine_mode to_mode, from_mode, actual_mode;
2659   optab op;
2660   int actual_precision;
2661   location_t loc = gimple_location (stmt);
2662   bool from_unsigned1, from_unsigned2;
2663
2664   lhs = gimple_assign_lhs (stmt);
2665   type = TREE_TYPE (lhs);
2666   if (TREE_CODE (type) != INTEGER_TYPE)
2667     return false;
2668
2669   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2670     return false;
2671
2672   to_mode = TYPE_MODE (type);
2673   from_mode = TYPE_MODE (type1);
2674   from_unsigned1 = TYPE_UNSIGNED (type1);
2675   from_unsigned2 = TYPE_UNSIGNED (type2);
2676
2677   if (from_unsigned1 && from_unsigned2)
2678     op = umul_widen_optab;
2679   else if (!from_unsigned1 && !from_unsigned2)
2680     op = smul_widen_optab;
2681   else
2682     op = usmul_widen_optab;
2683
2684   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2685                                                   0, &actual_mode);
2686
2687   if (handler == CODE_FOR_nothing)
2688     {
2689       if (op != smul_widen_optab)
2690         {
2691           /* We can use a signed multiply with unsigned types as long as
2692              there is a wider mode to use, or it is the smaller of the two
2693              types that is unsigned.  Note that type1 >= type2, always.  */
2694           if ((TYPE_UNSIGNED (type1)
2695                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2696               || (TYPE_UNSIGNED (type2)
2697                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2698             {
2699               from_mode = GET_MODE_WIDER_MODE (from_mode);
2700               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2701                 return false;
2702             }
2703
2704           op = smul_widen_optab;
2705           handler = find_widening_optab_handler_and_mode (op, to_mode,
2706                                                           from_mode, 0,
2707                                                           &actual_mode);
2708
2709           if (handler == CODE_FOR_nothing)
2710             return false;
2711
2712           from_unsigned1 = from_unsigned2 = false;
2713         }
2714       else
2715         return false;
2716     }
2717
2718   /* Ensure that the inputs to the handler are in the correct precison
2719      for the opcode.  This will be the full mode size.  */
2720   actual_precision = GET_MODE_PRECISION (actual_mode);
2721   if (2 * actual_precision > TYPE_PRECISION (type))
2722     return false;
2723   if (actual_precision != TYPE_PRECISION (type1)
2724       || from_unsigned1 != TYPE_UNSIGNED (type1))
2725     rhs1 = build_and_insert_cast (gsi, loc,
2726                                   build_nonstandard_integer_type
2727                                     (actual_precision, from_unsigned1), rhs1);
2728   if (actual_precision != TYPE_PRECISION (type2)
2729       || from_unsigned2 != TYPE_UNSIGNED (type2))
2730     rhs2 = build_and_insert_cast (gsi, loc,
2731                                   build_nonstandard_integer_type
2732                                     (actual_precision, from_unsigned2), rhs2);
2733
2734   /* Handle constants.  */
2735   if (TREE_CODE (rhs1) == INTEGER_CST)
2736     rhs1 = fold_convert (type1, rhs1);
2737   if (TREE_CODE (rhs2) == INTEGER_CST)
2738     rhs2 = fold_convert (type2, rhs2);
2739
2740   gimple_assign_set_rhs1 (stmt, rhs1);
2741   gimple_assign_set_rhs2 (stmt, rhs2);
2742   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2743   update_stmt (stmt);
2744   widen_mul_stats.widen_mults_inserted++;
2745   return true;
2746 }
2747
2748 /* Process a single gimple statement STMT, which is found at the
2749    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2750    rhs (given by CODE), and try to convert it into a
2751    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2752    is true iff we converted the statement.  */
2753
2754 static bool
2755 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2756                             enum tree_code code)
2757 {
2758   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2759   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2760   tree type, type1, type2, optype;
2761   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2762   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2763   optab this_optab;
2764   enum tree_code wmult_code;
2765   enum insn_code handler;
2766   machine_mode to_mode, from_mode, actual_mode;
2767   location_t loc = gimple_location (stmt);
2768   int actual_precision;
2769   bool from_unsigned1, from_unsigned2;
2770
2771   lhs = gimple_assign_lhs (stmt);
2772   type = TREE_TYPE (lhs);
2773   if (TREE_CODE (type) != INTEGER_TYPE
2774       && TREE_CODE (type) != FIXED_POINT_TYPE)
2775     return false;
2776
2777   if (code == MINUS_EXPR)
2778     wmult_code = WIDEN_MULT_MINUS_EXPR;
2779   else
2780     wmult_code = WIDEN_MULT_PLUS_EXPR;
2781
2782   rhs1 = gimple_assign_rhs1 (stmt);
2783   rhs2 = gimple_assign_rhs2 (stmt);
2784
2785   if (TREE_CODE (rhs1) == SSA_NAME)
2786     {
2787       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2788       if (is_gimple_assign (rhs1_stmt))
2789         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2790     }
2791
2792   if (TREE_CODE (rhs2) == SSA_NAME)
2793     {
2794       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2795       if (is_gimple_assign (rhs2_stmt))
2796         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2797     }
2798
2799   /* Allow for one conversion statement between the multiply
2800      and addition/subtraction statement.  If there are more than
2801      one conversions then we assume they would invalidate this
2802      transformation.  If that's not the case then they should have
2803      been folded before now.  */
2804   if (CONVERT_EXPR_CODE_P (rhs1_code))
2805     {
2806       conv1_stmt = rhs1_stmt;
2807       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2808       if (TREE_CODE (rhs1) == SSA_NAME)
2809         {
2810           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2811           if (is_gimple_assign (rhs1_stmt))
2812             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2813         }
2814       else
2815         return false;
2816     }
2817   if (CONVERT_EXPR_CODE_P (rhs2_code))
2818     {
2819       conv2_stmt = rhs2_stmt;
2820       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2821       if (TREE_CODE (rhs2) == SSA_NAME)
2822         {
2823           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2824           if (is_gimple_assign (rhs2_stmt))
2825             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2826         }
2827       else
2828         return false;
2829     }
2830
2831   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2832      is_widening_mult_p, but we still need the rhs returns.
2833
2834      It might also appear that it would be sufficient to use the existing
2835      operands of the widening multiply, but that would limit the choice of
2836      multiply-and-accumulate instructions.
2837
2838      If the widened-multiplication result has more than one uses, it is
2839      probably wiser not to do the conversion.  */
2840   if (code == PLUS_EXPR
2841       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2842     {
2843       if (!has_single_use (rhs1)
2844           || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2845                                   &type2, &mult_rhs2))
2846         return false;
2847       add_rhs = rhs2;
2848       conv_stmt = conv1_stmt;
2849     }
2850   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2851     {
2852       if (!has_single_use (rhs2)
2853           || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2854                                   &type2, &mult_rhs2))
2855         return false;
2856       add_rhs = rhs1;
2857       conv_stmt = conv2_stmt;
2858     }
2859   else
2860     return false;
2861
2862   to_mode = TYPE_MODE (type);
2863   from_mode = TYPE_MODE (type1);
2864   from_unsigned1 = TYPE_UNSIGNED (type1);
2865   from_unsigned2 = TYPE_UNSIGNED (type2);
2866   optype = type1;
2867
2868   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2869   if (from_unsigned1 != from_unsigned2)
2870     {
2871       if (!INTEGRAL_TYPE_P (type))
2872         return false;
2873       /* We can use a signed multiply with unsigned types as long as
2874          there is a wider mode to use, or it is the smaller of the two
2875          types that is unsigned.  Note that type1 >= type2, always.  */
2876       if ((from_unsigned1
2877            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2878           || (from_unsigned2
2879               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2880         {
2881           from_mode = GET_MODE_WIDER_MODE (from_mode);
2882           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2883             return false;
2884         }
2885
2886       from_unsigned1 = from_unsigned2 = false;
2887       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2888                                                false);
2889     }
2890
2891   /* If there was a conversion between the multiply and addition
2892      then we need to make sure it fits a multiply-and-accumulate.
2893      The should be a single mode change which does not change the
2894      value.  */
2895   if (conv_stmt)
2896     {
2897       /* We use the original, unmodified data types for this.  */
2898       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2899       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2900       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2901       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2902
2903       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2904         {
2905           /* Conversion is a truncate.  */
2906           if (TYPE_PRECISION (to_type) < data_size)
2907             return false;
2908         }
2909       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2910         {
2911           /* Conversion is an extend.  Check it's the right sort.  */
2912           if (TYPE_UNSIGNED (from_type) != is_unsigned
2913               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2914             return false;
2915         }
2916       /* else convert is a no-op for our purposes.  */
2917     }
2918
2919   /* Verify that the machine can perform a widening multiply
2920      accumulate in this mode/signedness combination, otherwise
2921      this transformation is likely to pessimize code.  */
2922   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2923   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2924                                                   from_mode, 0, &actual_mode);
2925
2926   if (handler == CODE_FOR_nothing)
2927     return false;
2928
2929   /* Ensure that the inputs to the handler are in the correct precison
2930      for the opcode.  This will be the full mode size.  */
2931   actual_precision = GET_MODE_PRECISION (actual_mode);
2932   if (actual_precision != TYPE_PRECISION (type1)
2933       || from_unsigned1 != TYPE_UNSIGNED (type1))
2934     mult_rhs1 = build_and_insert_cast (gsi, loc,
2935                                        build_nonstandard_integer_type
2936                                          (actual_precision, from_unsigned1),
2937                                        mult_rhs1);
2938   if (actual_precision != TYPE_PRECISION (type2)
2939       || from_unsigned2 != TYPE_UNSIGNED (type2))
2940     mult_rhs2 = build_and_insert_cast (gsi, loc,
2941                                        build_nonstandard_integer_type
2942                                          (actual_precision, from_unsigned2),
2943                                        mult_rhs2);
2944
2945   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2946     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2947
2948   /* Handle constants.  */
2949   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2950     mult_rhs1 = fold_convert (type1, mult_rhs1);
2951   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2952     mult_rhs2 = fold_convert (type2, mult_rhs2);
2953
2954   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2955                                     add_rhs);
2956   update_stmt (gsi_stmt (*gsi));
2957   widen_mul_stats.maccs_inserted++;
2958   return true;
2959 }
2960
2961 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2962    with uses in additions and subtractions to form fused multiply-add
2963    operations.  Returns true if successful and MUL_STMT should be removed.  */
2964
2965 static bool
2966 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2967 {
2968   tree mul_result = gimple_get_lhs (mul_stmt);
2969   tree type = TREE_TYPE (mul_result);
2970   gimple use_stmt, neguse_stmt, fma_stmt;
2971   use_operand_p use_p;
2972   imm_use_iterator imm_iter;
2973
2974   if (FLOAT_TYPE_P (type)
2975       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2976     return false;
2977
2978   /* We don't want to do bitfield reduction ops.  */
2979   if (INTEGRAL_TYPE_P (type)
2980       && (TYPE_PRECISION (type)
2981           != GET_MODE_PRECISION (TYPE_MODE (type))))
2982     return false;
2983
2984   /* If the target doesn't support it, don't generate it.  We assume that
2985      if fma isn't available then fms, fnma or fnms are not either.  */
2986   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2987     return false;
2988
2989   /* If the multiplication has zero uses, it is kept around probably because
2990      of -fnon-call-exceptions.  Don't optimize it away in that case,
2991      it is DCE job.  */
2992   if (has_zero_uses (mul_result))
2993     return false;
2994
2995   /* Make sure that the multiplication statement becomes dead after
2996      the transformation, thus that all uses are transformed to FMAs.
2997      This means we assume that an FMA operation has the same cost
2998      as an addition.  */
2999   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3000     {
3001       enum tree_code use_code;
3002       tree result = mul_result;
3003       bool negate_p = false;
3004
3005       use_stmt = USE_STMT (use_p);
3006
3007       if (is_gimple_debug (use_stmt))
3008         continue;
3009
3010       /* For now restrict this operations to single basic blocks.  In theory
3011          we would want to support sinking the multiplication in
3012          m = a*b;
3013          if ()
3014            ma = m + c;
3015          else
3016            d = m;
3017          to form a fma in the then block and sink the multiplication to the
3018          else block.  */
3019       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3020         return false;
3021
3022       if (!is_gimple_assign (use_stmt))
3023         return false;
3024
3025       use_code = gimple_assign_rhs_code (use_stmt);
3026
3027       /* A negate on the multiplication leads to FNMA.  */
3028       if (use_code == NEGATE_EXPR)
3029         {
3030           ssa_op_iter iter;
3031           use_operand_p usep;
3032
3033           result = gimple_assign_lhs (use_stmt);
3034
3035           /* Make sure the negate statement becomes dead with this
3036              single transformation.  */
3037           if (!single_imm_use (gimple_assign_lhs (use_stmt),
3038                                &use_p, &neguse_stmt))
3039             return false;
3040
3041           /* Make sure the multiplication isn't also used on that stmt.  */
3042           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3043             if (USE_FROM_PTR (usep) == mul_result)
3044               return false;
3045
3046           /* Re-validate.  */
3047           use_stmt = neguse_stmt;
3048           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3049             return false;
3050           if (!is_gimple_assign (use_stmt))
3051             return false;
3052
3053           use_code = gimple_assign_rhs_code (use_stmt);
3054           negate_p = true;
3055         }
3056
3057       switch (use_code)
3058         {
3059         case MINUS_EXPR:
3060           if (gimple_assign_rhs2 (use_stmt) == result)
3061             negate_p = !negate_p;
3062           break;
3063         case PLUS_EXPR:
3064           break;
3065         default:
3066           /* FMA can only be formed from PLUS and MINUS.  */
3067           return false;
3068         }
3069
3070       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3071          by a MULT_EXPR that we'll visit later, we might be able to
3072          get a more profitable match with fnma.
3073          OTOH, if we don't, a negate / fma pair has likely lower latency
3074          that a mult / subtract pair.  */
3075       if (use_code == MINUS_EXPR && !negate_p
3076           && gimple_assign_rhs1 (use_stmt) == result
3077           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3078           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3079         {
3080           tree rhs2 = gimple_assign_rhs2 (use_stmt);
3081
3082           if (TREE_CODE (rhs2) == SSA_NAME)
3083             {
3084               gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3085               if (has_single_use (rhs2)
3086                   && is_gimple_assign (stmt2)
3087                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3088               return false;
3089             }
3090         }
3091
3092       /* We can't handle a * b + a * b.  */
3093       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3094         return false;
3095
3096       /* While it is possible to validate whether or not the exact form
3097          that we've recognized is available in the backend, the assumption
3098          is that the transformation is never a loss.  For instance, suppose
3099          the target only has the plain FMA pattern available.  Consider
3100          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3101          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3102          still have 3 operations, but in the FMA form the two NEGs are
3103          independent and could be run in parallel.  */
3104     }
3105
3106   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3107     {
3108       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3109       enum tree_code use_code;
3110       tree addop, mulop1 = op1, result = mul_result;
3111       bool negate_p = false;
3112
3113       if (is_gimple_debug (use_stmt))
3114         continue;
3115
3116       use_code = gimple_assign_rhs_code (use_stmt);
3117       if (use_code == NEGATE_EXPR)
3118         {
3119           result = gimple_assign_lhs (use_stmt);
3120           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3121           gsi_remove (&gsi, true);
3122           release_defs (use_stmt);
3123
3124           use_stmt = neguse_stmt;
3125           gsi = gsi_for_stmt (use_stmt);
3126           use_code = gimple_assign_rhs_code (use_stmt);
3127           negate_p = true;
3128         }
3129
3130       if (gimple_assign_rhs1 (use_stmt) == result)
3131         {
3132           addop = gimple_assign_rhs2 (use_stmt);
3133           /* a * b - c -> a * b + (-c)  */
3134           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3135             addop = force_gimple_operand_gsi (&gsi,
3136                                               build1 (NEGATE_EXPR,
3137                                                       type, addop),
3138                                               true, NULL_TREE, true,
3139                                               GSI_SAME_STMT);
3140         }
3141       else
3142         {
3143           addop = gimple_assign_rhs1 (use_stmt);
3144           /* a - b * c -> (-b) * c + a */
3145           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3146             negate_p = !negate_p;
3147         }
3148
3149       if (negate_p)
3150         mulop1 = force_gimple_operand_gsi (&gsi,
3151                                            build1 (NEGATE_EXPR,
3152                                                    type, mulop1),
3153                                            true, NULL_TREE, true,
3154                                            GSI_SAME_STMT);
3155
3156       fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
3157                                                gimple_assign_lhs (use_stmt),
3158                                                mulop1, op2,
3159                                                addop);
3160       gsi_replace (&gsi, fma_stmt, true);
3161       widen_mul_stats.fmas_inserted++;
3162     }
3163
3164   return true;
3165 }
3166
3167 /* Find integer multiplications where the operands are extended from
3168    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3169    where appropriate.  */
3170
3171 namespace {
3172
3173 const pass_data pass_data_optimize_widening_mul =
3174 {
3175   GIMPLE_PASS, /* type */
3176   "widening_mul", /* name */
3177   OPTGROUP_NONE, /* optinfo_flags */
3178   TV_NONE, /* tv_id */
3179   PROP_ssa, /* properties_required */
3180   0, /* properties_provided */
3181   0, /* properties_destroyed */
3182   0, /* todo_flags_start */
3183   TODO_update_ssa, /* todo_flags_finish */
3184 };
3185
3186 class pass_optimize_widening_mul : public gimple_opt_pass
3187 {
3188 public:
3189   pass_optimize_widening_mul (gcc::context *ctxt)
3190     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3191   {}
3192
3193   /* opt_pass methods: */
3194   virtual bool gate (function *)
3195     {
3196       return flag_expensive_optimizations && optimize;
3197     }
3198
3199   virtual unsigned int execute (function *);
3200
3201 }; // class pass_optimize_widening_mul
3202
3203 unsigned int
3204 pass_optimize_widening_mul::execute (function *fun)
3205 {
3206   basic_block bb;
3207   bool cfg_changed = false;
3208
3209   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3210
3211   FOR_EACH_BB_FN (bb, fun)
3212     {
3213       gimple_stmt_iterator gsi;
3214
3215       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3216         {
3217           gimple stmt = gsi_stmt (gsi);
3218           enum tree_code code;
3219
3220           if (is_gimple_assign (stmt))
3221             {
3222               code = gimple_assign_rhs_code (stmt);
3223               switch (code)
3224                 {
3225                 case MULT_EXPR:
3226                   if (!convert_mult_to_widen (stmt, &gsi)
3227                       && convert_mult_to_fma (stmt,
3228                                               gimple_assign_rhs1 (stmt),
3229                                               gimple_assign_rhs2 (stmt)))
3230                     {
3231                       gsi_remove (&gsi, true);
3232                       release_defs (stmt);
3233                       continue;
3234                     }
3235                   break;
3236
3237                 case PLUS_EXPR:
3238                 case MINUS_EXPR:
3239                   convert_plusminus_to_widen (&gsi, stmt, code);
3240                   break;
3241
3242                 default:;
3243                 }
3244             }
3245           else if (is_gimple_call (stmt)
3246                    && gimple_call_lhs (stmt))
3247             {
3248               tree fndecl = gimple_call_fndecl (stmt);
3249               if (fndecl
3250                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3251                 {
3252                   switch (DECL_FUNCTION_CODE (fndecl))
3253                     {
3254                       case BUILT_IN_POWF:
3255                       case BUILT_IN_POW:
3256                       case BUILT_IN_POWL:
3257                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3258                             && REAL_VALUES_EQUAL
3259                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3260                                   dconst2)
3261                             && convert_mult_to_fma (stmt,
3262                                                     gimple_call_arg (stmt, 0),
3263                                                     gimple_call_arg (stmt, 0)))
3264                           {
3265                             unlink_stmt_vdef (stmt);
3266                             if (gsi_remove (&gsi, true)
3267                                 && gimple_purge_dead_eh_edges (bb))
3268                               cfg_changed = true;
3269                             release_defs (stmt);
3270                             continue;
3271                           }
3272                           break;
3273
3274                       default:;
3275                     }
3276                 }
3277             }
3278           gsi_next (&gsi);
3279         }
3280     }
3281
3282   statistics_counter_event (fun, "widening multiplications inserted",
3283                             widen_mul_stats.widen_mults_inserted);
3284   statistics_counter_event (fun, "widening maccs inserted",
3285                             widen_mul_stats.maccs_inserted);
3286   statistics_counter_event (fun, "fused multiply-adds inserted",
3287                             widen_mul_stats.fmas_inserted);
3288
3289   return cfg_changed ? TODO_cleanup_cfg : 0;
3290 }
3291
3292 } // anon namespace
3293
3294 gimple_opt_pass *
3295 make_pass_optimize_widening_mul (gcc::context *ctxt)
3296 {
3297   return new pass_optimize_widening_mul (ctxt);
3298 }