/* Global, SSA-based optimizations using mathematical identities.
   Copyright (C) 2005-2014 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
/* Currently, the only mini-pass in this file tries to CSE reciprocal
   operations.  These are common in sequences such as this one:

	modulus = sqrt(x*x + y*y + z*z);
	x = x / modulus;
	y = y / modulus;
	z = z / modulus;

   that can be optimized to

	modulus = sqrt(x*x + y*y + z*z);
	rmodulus = 1.0 / modulus;
	x = x * rmodulus;
	y = y * rmodulus;
	z = z * rmodulus;

   We do this for loop invariant divisors, and with this pass whenever
   we notice that a division has the same divisor multiple times.

   Of course, like in PRE, we don't insert a division if a dominator
   already has one.  However, this cannot be done as an extension of
   PRE for several reasons.

   First of all, with some experiments it was found out that the
   transformation is not always useful if there are only two divisions
   by the same divisor.  This is probably because modern processors
   can pipeline the divisions; on older, in-order processors it should
   still be effective to optimize two divisions by the same number.
   We make this a param, and it shall be called N in the remainder of
   this comment.

   Second, if trapping math is active, we have less freedom on where
   to insert divisions: we can only do so in basic blocks that already
   contain one.  (If divisions don't trap, we can instead insert
   divisions elsewhere, which will be in blocks that are common dominators
   of those that have the division).

   We really don't want to compute the reciprocal unless a division will
   be found.  To do this, we won't insert the division in a basic block
   that has less than N divisions *post-dominating* it.

   The algorithm constructs a subset of the dominator tree, holding the
   blocks containing the divisions and the common dominators to them,
   and walks it twice.  The first walk is in post-order, and it annotates
   each block with the number of divisions that post-dominate it: this
   gives information on where divisions can be inserted profitably.
   The second walk is in pre-order, and it inserts divisions as explained
   above, and replaces divisions by multiplications.

   In the best case, the cost of the pass is O(n_statements).  In the
   worst case, the cost is due to creating the dominator tree subset,
   with a cost of O(n_basic_blocks ^ 2); however this can only happen
   for n_statements / n_basic_blocks statements.  So, the amortized cost
   of creating the dominator tree subset is O(n_basic_blocks) and the
   worst-case cost of the pass is O(n_statements * n_basic_blocks).

   More practically, the cost will be small because there are few
   divisions, and they tend to be in the same basic block, so insert_bb
   is called very few times.

   If we did this using domwalk.c, an efficient implementation would have
   to work on all the variables in a single pass, because we could not
   work on just a subset of the dominator tree, as we do now, and the
   cost would also be something like O(n_statements * n_basic_blocks).
   The data structures would be more complex in order to work on all the
   variables in a single pass.  */
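
/* Worked example of the insertion rule above (editorial illustration;
   the block names are hypothetical).  Suppose N = 3 and the divisor d
   is used in divisions in blocks B2, B3 and B4, all of which
   post-dominate their common dominator B1.  The post-order walk
   annotates B1 with 3 >= N divisions, so the pre-order walk inserts
   t = 1.0 / d in B1 (or, if trapping math is active, in the first
   block that already contains a division by d), and the three
   divisions become multiplications by t.  */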
#include "coretypes.h"
#include "hard-reg-set.h"
#include "function.h"
#include "dominance.h"
#include "basic-block.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-fold.h"
#include "gimple-expr.h"
#include "gimple-iterator.h"
#include "gimplify.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "gimple-ssa.h"
#include "tree-cfg.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-dfa.h"
#include "tree-ssa.h"
#include "tree-pass.h"
#include "alloc-pool.h"
#include "gimple-pretty-print.h"
#include "builtins.h"

/* FIXME: RTL headers have to be included here for optabs.  */
#include "rtl.h"		/* Because optabs.h wants enum rtx_code.  */
#include "expr.h"		/* Because optabs.h wants sepops.  */
#include "insn-codes.h"
/* This structure represents one basic block that either computes a
   division, or is a common dominator for basic blocks that compute a
   division.  */
struct occurrence {
  /* The basic block represented by this structure.  */
  basic_block bb;

  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
     inserted in BB.  */
  tree recip_def;

  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
     was inserted in BB.  */
  gimple recip_def_stmt;

  /* Pointer to a list of "struct occurrence"s for blocks dominated
     by BB.  */
  struct occurrence *children;

  /* Pointer to the next "struct occurrence"s in the list of blocks
     sharing a common dominator.  */
  struct occurrence *next;

  /* The number of divisions that are in BB before compute_merit.  The
     number of divisions that are in BB or post-dominate it after
     compute_merit.  */
  int num_divisions;

  /* True if the basic block has a division, false if it is a common
     dominator for basic blocks that do.  If it is false and trapping
     math is active, BB is not a candidate for inserting a reciprocal.  */
  bool bb_has_division;
};
static struct
{
  /* Number of 1.0/X ops inserted.  */
  int rdivs_inserted;

  /* Number of 1.0/FUNC ops inserted.  */
  int rfuncs_inserted;
} reciprocal_stats;

static struct
{
  /* Number of cexpi calls inserted.  */
  int inserted;
} sincos_stats;

static struct
{
  /* Number of hand-written 16-bit nop / bswaps found.  */
  int found_16bit;

  /* Number of hand-written 32-bit nop / bswaps found.  */
  int found_32bit;

  /* Number of hand-written 64-bit nop / bswaps found.  */
  int found_64bit;
} nop_stats, bswap_stats;

static struct
{
  /* Number of widening multiplication ops inserted.  */
  int widen_mults_inserted;

  /* Number of integer multiply-and-accumulate ops inserted.  */
  int maccs_inserted;

  /* Number of fp fused multiply-add ops inserted.  */
  int fmas_inserted;
} widen_mul_stats;

/* The instance of "struct occurrence" representing the highest
   interesting block in the dominator tree.  */
static struct occurrence *occ_head;

/* Allocation pool for getting instances of "struct occurrence".  */
static alloc_pool occ_pool;
/* Allocate and return a new struct occurrence for basic block BB, and
   whose children list is headed by CHILDREN.  */

static struct occurrence *
occ_new (basic_block bb, struct occurrence *children)
{
  struct occurrence *occ;

  bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
  memset (occ, 0, sizeof (struct occurrence));

  occ->bb = bb;
  occ->children = children;
  return occ;
}
233 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
234 list of "struct occurrence"s, one per basic block, having IDOM as
235 their common dominator.
237 We try to insert NEW_OCC as deep as possible in the tree, and we also
238 insert any other block that is a common dominator for BB and one
239 block already in the tree. */
242 insert_bb (struct occurrence
*new_occ
, basic_block idom
,
243 struct occurrence
**p_head
)
245 struct occurrence
*occ
, **p_occ
;
247 for (p_occ
= p_head
; (occ
= *p_occ
) != NULL
; )
249 basic_block bb
= new_occ
->bb
, occ_bb
= occ
->bb
;
250 basic_block dom
= nearest_common_dominator (CDI_DOMINATORS
, occ_bb
, bb
);
/* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
   from its list.  */
256 occ
->next
= new_occ
->children
;
257 new_occ
->children
= occ
;
259 /* Try the next block (it may as well be dominated by BB). */
262 else if (dom
== occ_bb
)
264 /* OCC_BB dominates BB. Tail recurse to look deeper. */
265 insert_bb (new_occ
, dom
, &occ
->children
);
269 else if (dom
!= idom
)
271 gcc_assert (!dom
->aux
);
/* There is a dominator between IDOM and BB, add it and make
   two children out of NEW_OCC and OCC.  First, remove OCC from
   its list.  */
280 /* None of the previous blocks has DOM as a dominator: if we tail
281 recursed, we would reexamine them uselessly. Just switch BB with
282 DOM, and go on looking for blocks dominated by DOM. */
283 new_occ
= occ_new (dom
, new_occ
);
288 /* Nothing special, go on with the next element. */
293 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
294 new_occ
->next
= *p_head
;
/* Register that we found a division in BB.  */

static inline void
register_division_in (basic_block bb)
{
  struct occurrence *occ;

  occ = (struct occurrence *) bb->aux;
  if (!occ)
    {
      occ = occ_new (bb, NULL);
      insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
    }

  occ->bb_has_division = true;
  occ->num_divisions++;
}
/* Compute the number of divisions that postdominate each block in OCC and
   its children.  */
321 compute_merit (struct occurrence
*occ
)
323 struct occurrence
*occ_child
;
324 basic_block dom
= occ
->bb
;
326 for (occ_child
= occ
->children
; occ_child
; occ_child
= occ_child
->next
)
329 if (occ_child
->children
)
330 compute_merit (occ_child
);
333 bb
= single_noncomplex_succ (dom
);
337 if (dominated_by_p (CDI_POST_DOMINATORS
, bb
, occ_child
->bb
))
338 occ
->num_divisions
+= occ_child
->num_divisions
;
/* Return whether USE_STMT is a floating-point division by DEF.  */

static inline bool
is_division_by (gimple use_stmt, tree def)
{
  return is_gimple_assign (use_stmt)
	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
	 && gimple_assign_rhs2 (use_stmt) == def
	 /* Do not recognize x / x as valid division, as we are getting
	    confused later by replacing all immediate uses x in such
	    a stmt.  */
	 && gimple_assign_rhs1 (use_stmt) != def;
}
/* Walk the subset of the dominator tree rooted at OCC, setting the
   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
   the given basic block.  The field may be left NULL, of course,
   if it is not possible or profitable to do the optimization.

   DEF_GSI is an iterator pointing at the statement defining DEF.
   If RECIP_DEF is set, a dominator already has a computation that can
   be used.  */
366 insert_reciprocals (gimple_stmt_iterator
*def_gsi
, struct occurrence
*occ
,
367 tree def
, tree recip_def
, int threshold
)
371 gimple_stmt_iterator gsi
;
372 struct occurrence
*occ_child
;
375 && (occ
->bb_has_division
|| !flag_trapping_math
)
376 && occ
->num_divisions
>= threshold
)
378 /* Make a variable with the replacement and substitute it. */
379 type
= TREE_TYPE (def
);
380 recip_def
= create_tmp_reg (type
, "reciptmp");
381 new_stmt
= gimple_build_assign_with_ops (RDIV_EXPR
, recip_def
,
382 build_one_cst (type
), def
);
384 if (occ
->bb_has_division
)
386 /* Case 1: insert before an existing division. */
387 gsi
= gsi_after_labels (occ
->bb
);
388 while (!gsi_end_p (gsi
) && !is_division_by (gsi_stmt (gsi
), def
))
391 gsi_insert_before (&gsi
, new_stmt
, GSI_SAME_STMT
);
393 else if (def_gsi
&& occ
->bb
== def_gsi
->bb
)
395 /* Case 2: insert right after the definition. Note that this will
396 never happen if the definition statement can throw, because in
397 that case the sole successor of the statement's basic block will
398 dominate all the uses as well. */
399 gsi_insert_after (def_gsi
, new_stmt
, GSI_NEW_STMT
);
403 /* Case 3: insert in a basic block not containing defs/uses. */
404 gsi
= gsi_after_labels (occ
->bb
);
405 gsi_insert_before (&gsi
, new_stmt
, GSI_SAME_STMT
);
408 reciprocal_stats
.rdivs_inserted
++;
410 occ
->recip_def_stmt
= new_stmt
;
413 occ
->recip_def
= recip_def
;
414 for (occ_child
= occ
->children
; occ_child
; occ_child
= occ_child
->next
)
415 insert_reciprocals (def_gsi
, occ_child
, def
, recip_def
, threshold
);
/* Replace the division at USE_P with a multiplication by the reciprocal, if
   possible.  */
423 replace_reciprocal (use_operand_p use_p
)
425 gimple use_stmt
= USE_STMT (use_p
);
426 basic_block bb
= gimple_bb (use_stmt
);
427 struct occurrence
*occ
= (struct occurrence
*) bb
->aux
;
429 if (optimize_bb_for_speed_p (bb
)
430 && occ
->recip_def
&& use_stmt
!= occ
->recip_def_stmt
)
432 gimple_stmt_iterator gsi
= gsi_for_stmt (use_stmt
);
433 gimple_assign_set_rhs_code (use_stmt
, MULT_EXPR
);
434 SET_USE (use_p
, occ
->recip_def
);
435 fold_stmt_inplace (&gsi
);
436 update_stmt (use_stmt
);
441 /* Free OCC and return one more "struct occurrence" to be freed. */
443 static struct occurrence
*
444 free_bb (struct occurrence
*occ
)
446 struct occurrence
*child
, *next
;
448 /* First get the two pointers hanging off OCC. */
450 child
= occ
->children
;
452 pool_free (occ_pool
, occ
);
454 /* Now ensure that we don't recurse unless it is necessary. */
460 next
= free_bb (next
);
467 /* Look for floating-point divisions among DEF's uses, and try to
468 replace them by multiplications with the reciprocal. Add
469 as many statements computing the reciprocal as needed.
471 DEF must be a GIMPLE register of a floating-point type. */
474 execute_cse_reciprocals_1 (gimple_stmt_iterator
*def_gsi
, tree def
)
477 imm_use_iterator use_iter
;
478 struct occurrence
*occ
;
479 int count
= 0, threshold
;
481 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def
)) && is_gimple_reg (def
));
483 FOR_EACH_IMM_USE_FAST (use_p
, use_iter
, def
)
485 gimple use_stmt
= USE_STMT (use_p
);
486 if (is_division_by (use_stmt
, def
))
488 register_division_in (gimple_bb (use_stmt
));
493 /* Do the expensive part only if we can hope to optimize something. */
494 threshold
= targetm
.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def
)));
495 if (count
>= threshold
)
498 for (occ
= occ_head
; occ
; occ
= occ
->next
)
501 insert_reciprocals (def_gsi
, occ
, def
, NULL
, threshold
);
504 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, def
)
506 if (is_division_by (use_stmt
, def
))
508 FOR_EACH_IMM_USE_ON_STMT (use_p
, use_iter
)
509 replace_reciprocal (use_p
);
514 for (occ
= occ_head
; occ
; )
520 /* Go through all the floating-point SSA_NAMEs, and call
521 execute_cse_reciprocals_1 on each of them. */
524 const pass_data pass_data_cse_reciprocals
=
526 GIMPLE_PASS
, /* type */
528 OPTGROUP_NONE
, /* optinfo_flags */
530 PROP_ssa
, /* properties_required */
531 0, /* properties_provided */
532 0, /* properties_destroyed */
533 0, /* todo_flags_start */
534 TODO_update_ssa
, /* todo_flags_finish */
537 class pass_cse_reciprocals
: public gimple_opt_pass
540 pass_cse_reciprocals (gcc::context
*ctxt
)
541 : gimple_opt_pass (pass_data_cse_reciprocals
, ctxt
)
544 /* opt_pass methods: */
545 virtual bool gate (function
*) { return optimize
&& flag_reciprocal_math
; }
546 virtual unsigned int execute (function
*);
548 }; // class pass_cse_reciprocals
551 pass_cse_reciprocals::execute (function
*fun
)
556 occ_pool
= create_alloc_pool ("dominators for recip",
557 sizeof (struct occurrence
),
558 n_basic_blocks_for_fn (fun
) / 3 + 1);
560 memset (&reciprocal_stats
, 0, sizeof (reciprocal_stats
));
561 calculate_dominance_info (CDI_DOMINATORS
);
562 calculate_dominance_info (CDI_POST_DOMINATORS
);
564 #ifdef ENABLE_CHECKING
565 FOR_EACH_BB_FN (bb
, fun
)
566 gcc_assert (!bb
->aux
);
569 for (arg
= DECL_ARGUMENTS (fun
->decl
); arg
; arg
= DECL_CHAIN (arg
))
570 if (FLOAT_TYPE_P (TREE_TYPE (arg
))
571 && is_gimple_reg (arg
))
573 tree name
= ssa_default_def (fun
, arg
);
575 execute_cse_reciprocals_1 (NULL
, name
);
578 FOR_EACH_BB_FN (bb
, fun
)
582 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
585 gphi
*phi
= gsi
.phi ();
586 def
= PHI_RESULT (phi
);
587 if (! virtual_operand_p (def
)
588 && FLOAT_TYPE_P (TREE_TYPE (def
)))
589 execute_cse_reciprocals_1 (NULL
, def
);
592 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
595 gimple stmt
= gsi_stmt (gsi
);
597 if (gimple_has_lhs (stmt
)
598 && (def
= SINGLE_SSA_TREE_OPERAND (stmt
, SSA_OP_DEF
)) != NULL
599 && FLOAT_TYPE_P (TREE_TYPE (def
))
600 && TREE_CODE (def
) == SSA_NAME
)
601 execute_cse_reciprocals_1 (&gsi
, def
);
604 if (optimize_bb_for_size_p (bb
))
607 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
608 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
611 gimple stmt
= gsi_stmt (gsi
);
614 if (is_gimple_assign (stmt
)
615 && gimple_assign_rhs_code (stmt
) == RDIV_EXPR
)
617 tree arg1
= gimple_assign_rhs2 (stmt
);
620 if (TREE_CODE (arg1
) != SSA_NAME
)
623 stmt1
= SSA_NAME_DEF_STMT (arg1
);
625 if (is_gimple_call (stmt1
)
626 && gimple_call_lhs (stmt1
)
627 && (fndecl
= gimple_call_fndecl (stmt1
))
628 && (DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_NORMAL
629 || DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_MD
))
631 enum built_in_function code
;
636 code
= DECL_FUNCTION_CODE (fndecl
);
637 md_code
= DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_MD
;
639 fndecl
= targetm
.builtin_reciprocal (code
, md_code
, false);
/* Check that all uses of the SSA name are divisions,
   otherwise replacing the defining statement will do
   the wrong thing.  */
647 FOR_EACH_IMM_USE_FAST (use_p
, ui
, arg1
)
649 gimple stmt2
= USE_STMT (use_p
);
650 if (is_gimple_debug (stmt2
))
652 if (!is_gimple_assign (stmt2
)
653 || gimple_assign_rhs_code (stmt2
) != RDIV_EXPR
654 || gimple_assign_rhs1 (stmt2
) == arg1
655 || gimple_assign_rhs2 (stmt2
) != arg1
)
664 gimple_replace_ssa_lhs (stmt1
, arg1
);
665 gimple_call_set_fndecl (stmt1
, fndecl
);
667 reciprocal_stats
.rfuncs_inserted
++;
669 FOR_EACH_IMM_USE_STMT (stmt
, ui
, arg1
)
671 gimple_stmt_iterator gsi
= gsi_for_stmt (stmt
);
672 gimple_assign_set_rhs_code (stmt
, MULT_EXPR
);
673 fold_stmt_inplace (&gsi
);
681 statistics_counter_event (fun
, "reciprocal divs inserted",
682 reciprocal_stats
.rdivs_inserted
);
683 statistics_counter_event (fun
, "reciprocal functions inserted",
684 reciprocal_stats
.rfuncs_inserted
);
686 free_dominance_info (CDI_DOMINATORS
);
687 free_dominance_info (CDI_POST_DOMINATORS
);
688 free_alloc_pool (occ_pool
);
695 make_pass_cse_reciprocals (gcc::context
*ctxt
)
697 return new pass_cse_reciprocals (ctxt
);
700 /* Records an occurrence at statement USE_STMT in the vector of trees
701 STMTS if it is dominated by *TOP_BB or dominates it or this basic block
702 is not yet initialized. Returns true if the occurrence was pushed on
703 the vector. Adjusts *TOP_BB to be the basic block dominating all
704 statements in the vector. */
707 maybe_record_sincos (vec
<gimple
> *stmts
,
708 basic_block
*top_bb
, gimple use_stmt
)
710 basic_block use_bb
= gimple_bb (use_stmt
);
712 && (*top_bb
== use_bb
713 || dominated_by_p (CDI_DOMINATORS
, use_bb
, *top_bb
)))
714 stmts
->safe_push (use_stmt
);
716 || dominated_by_p (CDI_DOMINATORS
, *top_bb
, use_bb
))
718 stmts
->safe_push (use_stmt
);
727 /* Look for sin, cos and cexpi calls with the same argument NAME and
728 create a single call to cexpi CSEing the result in this case.
729 We first walk over all immediate uses of the argument collecting
730 statements that we can CSE in a vector and in a second pass replace
731 the statement rhs with a REALPART or IMAGPART expression on the
732 result of the cexpi call we insert before the use statement that
733 dominates all other candidates. */
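
/* Editorial illustration (not part of the original sources): for uses
   like

     s = sinf (a);
     c = cosf (a);

   the pass emits a single cexpi call dominating both uses and rewrites
   the old calls into component extractions, roughly:

     sincostmp = __builtin_cexpif (a);
     s = IMAGPART_EXPR <sincostmp>;
     c = REALPART_EXPR <sincostmp>;  */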
736 execute_cse_sincos_1 (tree name
)
738 gimple_stmt_iterator gsi
;
739 imm_use_iterator use_iter
;
740 tree fndecl
, res
, type
;
741 gimple def_stmt
, use_stmt
, stmt
;
742 int seen_cos
= 0, seen_sin
= 0, seen_cexpi
= 0;
743 vec
<gimple
> stmts
= vNULL
;
744 basic_block top_bb
= NULL
;
746 bool cfg_changed
= false;
748 type
= TREE_TYPE (name
);
749 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, name
)
751 if (gimple_code (use_stmt
) != GIMPLE_CALL
752 || !gimple_call_lhs (use_stmt
)
753 || !(fndecl
= gimple_call_fndecl (use_stmt
))
754 || DECL_BUILT_IN_CLASS (fndecl
) != BUILT_IN_NORMAL
)
757 switch (DECL_FUNCTION_CODE (fndecl
))
759 CASE_FLT_FN (BUILT_IN_COS
):
760 seen_cos
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
763 CASE_FLT_FN (BUILT_IN_SIN
):
764 seen_sin
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
767 CASE_FLT_FN (BUILT_IN_CEXPI
):
768 seen_cexpi
|= maybe_record_sincos (&stmts
, &top_bb
, use_stmt
) ? 1 : 0;
775 if (seen_cos
+ seen_sin
+ seen_cexpi
<= 1)
781 /* Simply insert cexpi at the beginning of top_bb but not earlier than
782 the name def statement. */
783 fndecl
= mathfn_built_in (type
, BUILT_IN_CEXPI
);
786 stmt
= gimple_build_call (fndecl
, 1, name
);
787 res
= make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl
)), stmt
, "sincostmp");
788 gimple_call_set_lhs (stmt
, res
);
790 def_stmt
= SSA_NAME_DEF_STMT (name
);
791 if (!SSA_NAME_IS_DEFAULT_DEF (name
)
792 && gimple_code (def_stmt
) != GIMPLE_PHI
793 && gimple_bb (def_stmt
) == top_bb
)
795 gsi
= gsi_for_stmt (def_stmt
);
796 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
800 gsi
= gsi_after_labels (top_bb
);
801 gsi_insert_before (&gsi
, stmt
, GSI_SAME_STMT
);
803 sincos_stats
.inserted
++;
805 /* And adjust the recorded old call sites. */
806 for (i
= 0; stmts
.iterate (i
, &use_stmt
); ++i
)
809 fndecl
= gimple_call_fndecl (use_stmt
);
811 switch (DECL_FUNCTION_CODE (fndecl
))
813 CASE_FLT_FN (BUILT_IN_COS
):
814 rhs
= fold_build1 (REALPART_EXPR
, type
, res
);
817 CASE_FLT_FN (BUILT_IN_SIN
):
818 rhs
= fold_build1 (IMAGPART_EXPR
, type
, res
);
821 CASE_FLT_FN (BUILT_IN_CEXPI
):
829 /* Replace call with a copy. */
830 stmt
= gimple_build_assign (gimple_call_lhs (use_stmt
), rhs
);
832 gsi
= gsi_for_stmt (use_stmt
);
833 gsi_replace (&gsi
, stmt
, true);
834 if (gimple_purge_dead_eh_edges (gimple_bb (stmt
)))
843 /* To evaluate powi(x,n), the floating point value x raised to the
844 constant integer exponent n, we use a hybrid algorithm that
845 combines the "window method" with look-up tables. For an
846 introduction to exponentiation algorithms and "addition chains",
847 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
848 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
849 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
850 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
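
/* Editorial illustration (not part of GCC): the plain square-and-multiply
   scheme that the window method below improves upon.  It performs one
   squaring per bit of N plus one extra multiply per set bit, which is
   where worst-case bounds of the form 2*bits(n) - 2 (see POWI_MAX_MULTS
   below) come from.  The function name is hypothetical.  */

static inline double
powi_binary_sketch (double x, unsigned long n)
{
  double result = 1.0;
  for (; n; n >>= 1)
    {
      if (n & 1)
	result *= x;	/* Multiply in the current power of x.  */
      if (n > 1)
	x *= x;		/* Square for the next bit.  */
    }
  return result;
}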
852 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
853 multiplications to inline before calling the system library's pow
854 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
855 so this default never requires calling pow, powf or powl. */
857 #ifndef POWI_MAX_MULTS
858 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
861 /* The size of the "optimal power tree" lookup table. All
862 exponents less than this value are simply looked up in the
863 powi_table below. This threshold is also used to size the
864 cache of pseudo registers that hold intermediate results. */
865 #define POWI_TABLE_SIZE 256
867 /* The size, in bits of the window, used in the "window method"
868 exponentiation algorithm. This is equivalent to a radix of
869 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
870 #define POWI_WINDOW_SIZE 3
872 /* The following table is an efficient representation of an
873 "optimal power tree". For each value, i, the corresponding
value, j, in the table states that an optimal evaluation
875 sequence for calculating pow(x,i) can be found by evaluating
876 pow(x,j)*pow(x,i-j). An optimal power tree for the first
877 100 integers is given in Knuth's "Seminumerical algorithms". */
879 static const unsigned char powi_table
[POWI_TABLE_SIZE
] =
881 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
882 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
883 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
884 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
885 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
886 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
887 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
888 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
889 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
890 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
891 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
892 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
893 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
894 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
895 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
896 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
897 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
898 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
899 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
900 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
901 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
902 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
903 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
904 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
905 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
906 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
907 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
908 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
909 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
910 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
911 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
912 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
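
/* Worked example (editorial): to expand powi (x, 21) the table is
   consulted recursively.  powi_table[21] = 12, so x**21 = x**12 * x**9;
   powi_table[12] = 6 and powi_table[9] = 6 share the x**6 subterm;
   powi_table[6] = 3 and powi_table[3] = 2 bottom out at x**2 = x * x.
   With the intermediate results cached this needs only six
   multiplications:

     x2 = x * x;    x3 = x2 * x;    x6 = x3 * x3;
     x9 = x6 * x3;  x12 = x6 * x6;  x21 = x12 * x9;  */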
916 /* Return the number of multiplications required to calculate
917 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
918 subroutine of powi_cost. CACHE is an array indicating
919 which exponents have already been calculated. */
922 powi_lookup_cost (unsigned HOST_WIDE_INT n
, bool *cache
)
924 /* If we've already calculated this exponent, then this evaluation
925 doesn't require any additional multiplications. */
930 return powi_lookup_cost (n
- powi_table
[n
], cache
)
931 + powi_lookup_cost (powi_table
[n
], cache
) + 1;
934 /* Return the number of multiplications required to calculate
935 powi(x,n) for an arbitrary x, given the exponent N. This
936 function needs to be kept in sync with powi_as_mults below. */
939 powi_cost (HOST_WIDE_INT n
)
941 bool cache
[POWI_TABLE_SIZE
];
942 unsigned HOST_WIDE_INT digit
;
943 unsigned HOST_WIDE_INT val
;
949 /* Ignore the reciprocal when calculating the cost. */
950 val
= (n
< 0) ? -n
: n
;
952 /* Initialize the exponent cache. */
953 memset (cache
, 0, POWI_TABLE_SIZE
* sizeof (bool));
958 while (val
>= POWI_TABLE_SIZE
)
962 digit
= val
& ((1 << POWI_WINDOW_SIZE
) - 1);
963 result
+= powi_lookup_cost (digit
, cache
)
964 + POWI_WINDOW_SIZE
+ 1;
965 val
>>= POWI_WINDOW_SIZE
;
974 return result
+ powi_lookup_cost (val
, cache
);
977 /* Recursive subroutine of powi_as_mults. This function takes the
978 array, CACHE, of already calculated exponents and an exponent N and
979 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
982 powi_as_mults_1 (gimple_stmt_iterator
*gsi
, location_t loc
, tree type
,
983 HOST_WIDE_INT n
, tree
*cache
)
985 tree op0
, op1
, ssa_target
;
986 unsigned HOST_WIDE_INT digit
;
989 if (n
< POWI_TABLE_SIZE
&& cache
[n
])
992 ssa_target
= make_temp_ssa_name (type
, NULL
, "powmult");
994 if (n
< POWI_TABLE_SIZE
)
996 cache
[n
] = ssa_target
;
997 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
- powi_table
[n
], cache
);
998 op1
= powi_as_mults_1 (gsi
, loc
, type
, powi_table
[n
], cache
);
1002 digit
= n
& ((1 << POWI_WINDOW_SIZE
) - 1);
1003 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
- digit
, cache
);
1004 op1
= powi_as_mults_1 (gsi
, loc
, type
, digit
, cache
);
1008 op0
= powi_as_mults_1 (gsi
, loc
, type
, n
>> 1, cache
);
1012 mult_stmt
= gimple_build_assign_with_ops (MULT_EXPR
, ssa_target
, op0
, op1
);
1013 gimple_set_location (mult_stmt
, loc
);
1014 gsi_insert_before (gsi
, mult_stmt
, GSI_SAME_STMT
);
1019 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1020 This function needs to be kept in sync with powi_cost above. */
1023 powi_as_mults (gimple_stmt_iterator
*gsi
, location_t loc
,
1024 tree arg0
, HOST_WIDE_INT n
)
1026 tree cache
[POWI_TABLE_SIZE
], result
, type
= TREE_TYPE (arg0
);
1031 return build_real (type
, dconst1
);
1033 memset (cache
, 0, sizeof (cache
));
1036 result
= powi_as_mults_1 (gsi
, loc
, type
, (n
< 0) ? -n
: n
, cache
);
1040 /* If the original exponent was negative, reciprocate the result. */
1041 target
= make_temp_ssa_name (type
, NULL
, "powmult");
1042 div_stmt
= gimple_build_assign_with_ops (RDIV_EXPR
, target
,
1043 build_real (type
, dconst1
),
1045 gimple_set_location (div_stmt
, loc
);
1046 gsi_insert_before (gsi
, div_stmt
, GSI_SAME_STMT
);
1051 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1052 location info LOC. If the arguments are appropriate, create an
1053 equivalent sequence of statements prior to GSI using an optimal
number of multiplications, and return an expression holding the
result.  */
1058 gimple_expand_builtin_powi (gimple_stmt_iterator
*gsi
, location_t loc
,
1059 tree arg0
, HOST_WIDE_INT n
)
1061 /* Avoid largest negative number. */
1063 && ((n
>= -1 && n
<= 2)
1064 || (optimize_function_for_speed_p (cfun
)
1065 && powi_cost (n
) <= POWI_MAX_MULTS
)))
1066 return powi_as_mults (gsi
, loc
, arg0
, n
);
1071 /* Build a gimple call statement that calls FN with argument ARG.
1072 Set the lhs of the call statement to a fresh SSA name. Insert the
statement prior to GSI's current position, and return the fresh
SSA name.  */
1077 build_and_insert_call (gimple_stmt_iterator
*gsi
, location_t loc
,
1083 call_stmt
= gimple_build_call (fn
, 1, arg
);
1084 ssa_target
= make_temp_ssa_name (TREE_TYPE (arg
), NULL
, "powroot");
1085 gimple_set_lhs (call_stmt
, ssa_target
);
1086 gimple_set_location (call_stmt
, loc
);
1087 gsi_insert_before (gsi
, call_stmt
, GSI_SAME_STMT
);
1092 /* Build a gimple binary operation with the given CODE and arguments
1093 ARG0, ARG1, assigning the result to a new SSA name for variable
1094 TARGET. Insert the statement prior to GSI's current position, and
1095 return the fresh SSA name.*/
1098 build_and_insert_binop (gimple_stmt_iterator
*gsi
, location_t loc
,
1099 const char *name
, enum tree_code code
,
1100 tree arg0
, tree arg1
)
1102 tree result
= make_temp_ssa_name (TREE_TYPE (arg0
), NULL
, name
);
1103 gassign
*stmt
= gimple_build_assign_with_ops (code
, result
, arg0
, arg1
);
1104 gimple_set_location (stmt
, loc
);
1105 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
1109 /* Build a gimple reference operation with the given CODE and argument
1110 ARG, assigning the result to a new SSA name of TYPE with NAME.
1111 Insert the statement prior to GSI's current position, and return
1112 the fresh SSA name. */
1115 build_and_insert_ref (gimple_stmt_iterator
*gsi
, location_t loc
, tree type
,
1116 const char *name
, enum tree_code code
, tree arg0
)
1118 tree result
= make_temp_ssa_name (type
, NULL
, name
);
1119 gimple stmt
= gimple_build_assign (result
, build1 (code
, type
, arg0
));
1120 gimple_set_location (stmt
, loc
);
1121 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
1125 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1126 prior to GSI's current position, and return the fresh SSA name. */
1129 build_and_insert_cast (gimple_stmt_iterator
*gsi
, location_t loc
,
1130 tree type
, tree val
)
1132 tree result
= make_ssa_name (type
, NULL
);
1134 gimple_build_assign_with_ops (NOP_EXPR
, result
, val
, NULL_TREE
);
1135 gimple_set_location (stmt
, loc
);
1136 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
1140 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1141 with location info LOC. If possible, create an equivalent and
1142 less expensive sequence of statements prior to GSI, and return an
expression holding the result.  */
1146 gimple_expand_builtin_pow (gimple_stmt_iterator
*gsi
, location_t loc
,
1147 tree arg0
, tree arg1
)
1149 REAL_VALUE_TYPE c
, cint
, dconst1_4
, dconst3_4
, dconst1_3
, dconst1_6
;
1150 REAL_VALUE_TYPE c2
, dconst3
;
1152 tree type
, sqrtfn
, cbrtfn
, sqrt_arg0
, sqrt_sqrt
, result
, cbrt_x
, powi_cbrt_x
;
1154 bool hw_sqrt_exists
, c_is_int
, c2_is_int
;
/* If the exponent isn't a constant, there's nothing of interest
   to be done.  */
1158 if (TREE_CODE (arg1
) != REAL_CST
)
1161 /* If the exponent is equivalent to an integer, expand to an optimal
1162 multiplication sequence when profitable. */
1163 c
= TREE_REAL_CST (arg1
);
1164 n
= real_to_integer (&c
);
1165 real_from_integer (&cint
, VOIDmode
, n
, SIGNED
);
1166 c_is_int
= real_identical (&c
, &cint
);
1169 && ((n
>= -1 && n
<= 2)
1170 || (flag_unsafe_math_optimizations
1171 && optimize_bb_for_speed_p (gsi_bb (*gsi
))
1172 && powi_cost (n
) <= POWI_MAX_MULTS
)))
1173 return gimple_expand_builtin_powi (gsi
, loc
, arg0
, n
);
1175 /* Attempt various optimizations using sqrt and cbrt. */
1176 type
= TREE_TYPE (arg0
);
1177 mode
= TYPE_MODE (type
);
1178 sqrtfn
= mathfn_built_in (type
, BUILT_IN_SQRT
);
1180 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
sqrt(-0) = -0.  */
1184 && REAL_VALUES_EQUAL (c
, dconsthalf
)
1185 && !HONOR_SIGNED_ZEROS (mode
))
1186 return build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1188 /* Optimize pow(x,0.25) = sqrt(sqrt(x)). Assume on most machines that
1189 a builtin sqrt instruction is smaller than a call to pow with 0.25,
1190 so do this optimization even if -Os. Don't do this optimization
1191 if we don't have a hardware sqrt insn. */
1192 dconst1_4
= dconst1
;
1193 SET_REAL_EXP (&dconst1_4
, REAL_EXP (&dconst1_4
) - 2);
1194 hw_sqrt_exists
= optab_handler (sqrt_optab
, mode
) != CODE_FOR_nothing
;
1196 if (flag_unsafe_math_optimizations
1198 && REAL_VALUES_EQUAL (c
, dconst1_4
)
1202 sqrt_arg0
= build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1205 return build_and_insert_call (gsi
, loc
, sqrtfn
, sqrt_arg0
);
1208 /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1209 optimizing for space. Don't do this optimization if we don't have
1210 a hardware sqrt insn. */
1211 real_from_integer (&dconst3_4
, VOIDmode
, 3, SIGNED
);
1212 SET_REAL_EXP (&dconst3_4
, REAL_EXP (&dconst3_4
) - 2);
1214 if (flag_unsafe_math_optimizations
1216 && optimize_function_for_speed_p (cfun
)
1217 && REAL_VALUES_EQUAL (c
, dconst3_4
)
1221 sqrt_arg0
= build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1224 sqrt_sqrt
= build_and_insert_call (gsi
, loc
, sqrtfn
, sqrt_arg0
);
1226 /* sqrt(x) * sqrt(sqrt(x)) */
1227 return build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1228 sqrt_arg0
, sqrt_sqrt
);
1231 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1232 optimizations since 1./3. is not exactly representable. If x
1233 is negative and finite, the correct value of pow(x,1./3.) is
1234 a NaN with the "invalid" exception raised, because the value
1235 of 1./3. actually has an even denominator. The correct value
1236 of cbrt(x) is a negative real value. */
1237 cbrtfn
= mathfn_built_in (type
, BUILT_IN_CBRT
);
1238 dconst1_3
= real_value_truncate (mode
, dconst_third ());
1240 if (flag_unsafe_math_optimizations
1242 && (gimple_val_nonnegative_real_p (arg0
) || !HONOR_NANS (mode
))
1243 && REAL_VALUES_EQUAL (c
, dconst1_3
))
1244 return build_and_insert_call (gsi
, loc
, cbrtfn
, arg0
);
1246 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1247 if we don't have a hardware sqrt insn. */
1248 dconst1_6
= dconst1_3
;
1249 SET_REAL_EXP (&dconst1_6
, REAL_EXP (&dconst1_6
) - 1);
1251 if (flag_unsafe_math_optimizations
1254 && (gimple_val_nonnegative_real_p (arg0
) || !HONOR_NANS (mode
))
1255 && optimize_function_for_speed_p (cfun
)
1257 && REAL_VALUES_EQUAL (c
, dconst1_6
))
1260 sqrt_arg0
= build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1263 return build_and_insert_call (gsi
, loc
, cbrtfn
, sqrt_arg0
);
1266 /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1267 and c not an integer, into
1269 sqrt(x) * powi(x, n/2), n > 0;
1270 1.0 / (sqrt(x) * powi(x, abs(n/2))), n < 0.
Do not calculate the powi factor when n/2 = 0.  */
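/* Worked example (editorial): pow (x, 2.5) has n = 5 and n/2 = 2, so it
   becomes sqrt (x) * x * x; pow (x, -2.5) becomes
   1.0 / (sqrt (x) * x * x).  */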
1273 real_arithmetic (&c2
, MULT_EXPR
, &c
, &dconst2
);
1274 n
= real_to_integer (&c2
);
1275 real_from_integer (&cint
, VOIDmode
, n
, SIGNED
);
1276 c2_is_int
= real_identical (&c2
, &cint
);
1278 if (flag_unsafe_math_optimizations
1282 && optimize_function_for_speed_p (cfun
))
1284 tree powi_x_ndiv2
= NULL_TREE
;
1286 /* Attempt to fold powi(arg0, abs(n/2)) into multiplies. If not
1287 possible or profitable, give up. Skip the degenerate case when
1288 n is 1 or -1, where the result is always 1. */
1289 if (absu_hwi (n
) != 1)
1291 powi_x_ndiv2
= gimple_expand_builtin_powi (gsi
, loc
, arg0
,
1297 /* Calculate sqrt(x). When n is not 1 or -1, multiply it by the
1298 result of the optimal multiply sequence just calculated. */
1299 sqrt_arg0
= build_and_insert_call (gsi
, loc
, sqrtfn
, arg0
);
1301 if (absu_hwi (n
) == 1)
1304 result
= build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1305 sqrt_arg0
, powi_x_ndiv2
);
1307 /* If n is negative, reciprocate the result. */
1309 result
= build_and_insert_binop (gsi
, loc
, "powroot", RDIV_EXPR
,
1310 build_real (type
, dconst1
), result
);
1314 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1316 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1317 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1319 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1320 different from pow(x, 1./3.) due to rounding and behavior with
1321 negative x, we need to constrain this transformation to unsafe
math and positive x or finite math.  */
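/* Worked example (editorial): pow (x, 5./3.) has n = 5, n/3 = 1 and
   n%3 = 2, so it becomes x * (cbrt (x) * cbrt (x)); for the negative
   exponent the result is additionally reciprocated.  */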
1323 real_from_integer (&dconst3
, VOIDmode
, 3, SIGNED
);
1324 real_arithmetic (&c2
, MULT_EXPR
, &c
, &dconst3
);
1325 real_round (&c2
, mode
, &c2
);
1326 n
= real_to_integer (&c2
);
1327 real_from_integer (&cint
, VOIDmode
, n
, SIGNED
);
1328 real_arithmetic (&c2
, RDIV_EXPR
, &cint
, &dconst3
);
1329 real_convert (&c2
, mode
, &c2
);
1331 if (flag_unsafe_math_optimizations
1333 && (gimple_val_nonnegative_real_p (arg0
) || !HONOR_NANS (mode
))
1334 && real_identical (&c2
, &c
)
1336 && optimize_function_for_speed_p (cfun
)
1337 && powi_cost (n
/ 3) <= POWI_MAX_MULTS
)
1339 tree powi_x_ndiv3
= NULL_TREE
;
1341 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1342 possible or profitable, give up. Skip the degenerate case when
1343 abs(n) < 3, where the result is always 1. */
1344 if (absu_hwi (n
) >= 3)
1346 powi_x_ndiv3
= gimple_expand_builtin_powi (gsi
, loc
, arg0
,
1352 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1353 as that creates an unnecessary variable. Instead, just produce
1354 either cbrt(x) or cbrt(x) * cbrt(x). */
1355 cbrt_x
= build_and_insert_call (gsi
, loc
, cbrtfn
, arg0
);
1357 if (absu_hwi (n
) % 3 == 1)
1358 powi_cbrt_x
= cbrt_x
;
1360 powi_cbrt_x
= build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1363 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1364 if (absu_hwi (n
) < 3)
1365 result
= powi_cbrt_x
;
1367 result
= build_and_insert_binop (gsi
, loc
, "powroot", MULT_EXPR
,
1368 powi_x_ndiv3
, powi_cbrt_x
);
1370 /* If n is negative, reciprocate the result. */
1372 result
= build_and_insert_binop (gsi
, loc
, "powroot", RDIV_EXPR
,
1373 build_real (type
, dconst1
), result
);
1378 /* No optimizations succeeded. */
1382 /* ARG is the argument to a cabs builtin call in GSI with location info
1383 LOC. Create a sequence of statements prior to GSI that calculates
1384 sqrt(R*R + I*I), where R and I are the real and imaginary components
1385 of ARG, respectively. Return an expression holding the result. */
1388 gimple_expand_builtin_cabs (gimple_stmt_iterator
*gsi
, location_t loc
, tree arg
)
1390 tree real_part
, imag_part
, addend1
, addend2
, sum
, result
;
1391 tree type
= TREE_TYPE (TREE_TYPE (arg
));
1392 tree sqrtfn
= mathfn_built_in (type
, BUILT_IN_SQRT
);
1393 machine_mode mode
= TYPE_MODE (type
);
1395 if (!flag_unsafe_math_optimizations
1396 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi
)))
1398 || optab_handler (sqrt_optab
, mode
) == CODE_FOR_nothing
)
1401 real_part
= build_and_insert_ref (gsi
, loc
, type
, "cabs",
1402 REALPART_EXPR
, arg
);
1403 addend1
= build_and_insert_binop (gsi
, loc
, "cabs", MULT_EXPR
,
1404 real_part
, real_part
);
1405 imag_part
= build_and_insert_ref (gsi
, loc
, type
, "cabs",
1406 IMAGPART_EXPR
, arg
);
1407 addend2
= build_and_insert_binop (gsi
, loc
, "cabs", MULT_EXPR
,
1408 imag_part
, imag_part
);
1409 sum
= build_and_insert_binop (gsi
, loc
, "cabs", PLUS_EXPR
, addend1
, addend2
);
1410 result
= build_and_insert_call (gsi
, loc
, sqrtfn
, sum
);
1415 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1416 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1417 an optimal number of multiplies, when n is a constant. */
1421 const pass_data pass_data_cse_sincos
=
1423 GIMPLE_PASS
, /* type */
1424 "sincos", /* name */
1425 OPTGROUP_NONE
, /* optinfo_flags */
1426 TV_NONE
, /* tv_id */
1427 PROP_ssa
, /* properties_required */
1428 0, /* properties_provided */
1429 0, /* properties_destroyed */
1430 0, /* todo_flags_start */
1431 TODO_update_ssa
, /* todo_flags_finish */
1434 class pass_cse_sincos
: public gimple_opt_pass
1437 pass_cse_sincos (gcc::context
*ctxt
)
1438 : gimple_opt_pass (pass_data_cse_sincos
, ctxt
)
1441 /* opt_pass methods: */
1442 virtual bool gate (function
*)
1444 /* We no longer require either sincos or cexp, since powi expansion
1445 piggybacks on this pass. */
1449 virtual unsigned int execute (function
*);
1451 }; // class pass_cse_sincos
1454 pass_cse_sincos::execute (function
*fun
)
1457 bool cfg_changed
= false;
1459 calculate_dominance_info (CDI_DOMINATORS
);
1460 memset (&sincos_stats
, 0, sizeof (sincos_stats
));
1462 FOR_EACH_BB_FN (bb
, fun
)
1464 gimple_stmt_iterator gsi
;
1465 bool cleanup_eh
= false;
1467 for (gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
1469 gimple stmt
= gsi_stmt (gsi
);
1472 /* Only the last stmt in a bb could throw, no need to call
1473 gimple_purge_dead_eh_edges if we change something in the middle
1474 of a basic block. */
1477 if (is_gimple_call (stmt
)
1478 && gimple_call_lhs (stmt
)
1479 && (fndecl
= gimple_call_fndecl (stmt
))
1480 && DECL_BUILT_IN_CLASS (fndecl
) == BUILT_IN_NORMAL
)
1482 tree arg
, arg0
, arg1
, result
;
1486 switch (DECL_FUNCTION_CODE (fndecl
))
1488 CASE_FLT_FN (BUILT_IN_COS
):
1489 CASE_FLT_FN (BUILT_IN_SIN
):
1490 CASE_FLT_FN (BUILT_IN_CEXPI
):
1491 /* Make sure we have either sincos or cexp. */
1492 if (!targetm
.libc_has_function (function_c99_math_complex
)
1493 && !targetm
.libc_has_function (function_sincos
))
1496 arg
= gimple_call_arg (stmt
, 0);
1497 if (TREE_CODE (arg
) == SSA_NAME
)
1498 cfg_changed
|= execute_cse_sincos_1 (arg
);
1501 CASE_FLT_FN (BUILT_IN_POW
):
1502 arg0
= gimple_call_arg (stmt
, 0);
1503 arg1
= gimple_call_arg (stmt
, 1);
1505 loc
= gimple_location (stmt
);
1506 result
= gimple_expand_builtin_pow (&gsi
, loc
, arg0
, arg1
);
1510 tree lhs
= gimple_get_lhs (stmt
);
1512 gimple_build_assign (lhs
, result
);
1513 gimple_set_location (new_stmt
, loc
);
1514 unlink_stmt_vdef (stmt
);
1515 gsi_replace (&gsi
, new_stmt
, true);
1517 if (gimple_vdef (stmt
))
1518 release_ssa_name (gimple_vdef (stmt
));
1522 CASE_FLT_FN (BUILT_IN_POWI
):
1523 arg0
= gimple_call_arg (stmt
, 0);
1524 arg1
= gimple_call_arg (stmt
, 1);
1525 loc
= gimple_location (stmt
);
1527 if (real_minus_onep (arg0
))
1529 tree t0
, t1
, cond
, one
, minus_one
;
1532 t0
= TREE_TYPE (arg0
);
1533 t1
= TREE_TYPE (arg1
);
1534 one
= build_real (t0
, dconst1
);
1535 minus_one
= build_real (t0
, dconstm1
);
1537 cond
= make_temp_ssa_name (t1
, NULL
, "powi_cond");
1538 stmt
= gimple_build_assign_with_ops (BIT_AND_EXPR
, cond
,
1542 gimple_set_location (stmt
, loc
);
1543 gsi_insert_before (&gsi
, stmt
, GSI_SAME_STMT
);
1545 result
= make_temp_ssa_name (t0
, NULL
, "powi");
1546 stmt
= gimple_build_assign_with_ops (COND_EXPR
, result
,
1549 gimple_set_location (stmt
, loc
);
1550 gsi_insert_before (&gsi
, stmt
, GSI_SAME_STMT
);
1554 if (!tree_fits_shwi_p (arg1
))
1557 n
= tree_to_shwi (arg1
);
1558 result
= gimple_expand_builtin_powi (&gsi
, loc
, arg0
, n
);
1563 tree lhs
= gimple_get_lhs (stmt
);
1564 gassign
*new_stmt
= gimple_build_assign (lhs
, result
);
1565 gimple_set_location (new_stmt
, loc
);
1566 unlink_stmt_vdef (stmt
);
1567 gsi_replace (&gsi
, new_stmt
, true);
1569 if (gimple_vdef (stmt
))
1570 release_ssa_name (gimple_vdef (stmt
));
1574 CASE_FLT_FN (BUILT_IN_CABS
):
1575 arg0
= gimple_call_arg (stmt
, 0);
1576 loc
= gimple_location (stmt
);
1577 result
= gimple_expand_builtin_cabs (&gsi
, loc
, arg0
);
1581 tree lhs
= gimple_get_lhs (stmt
);
1582 gassign
*new_stmt
= gimple_build_assign (lhs
, result
);
1583 gimple_set_location (new_stmt
, loc
);
1584 unlink_stmt_vdef (stmt
);
1585 gsi_replace (&gsi
, new_stmt
, true);
1587 if (gimple_vdef (stmt
))
1588 release_ssa_name (gimple_vdef (stmt
));
1597 cfg_changed
|= gimple_purge_dead_eh_edges (bb
);
1600 statistics_counter_event (fun
, "sincos statements inserted",
1601 sincos_stats
.inserted
);
1603 free_dominance_info (CDI_DOMINATORS
);
1604 return cfg_changed
? TODO_cleanup_cfg
: 0;
1610 make_pass_cse_sincos (gcc::context
*ctxt
)
1612 return new pass_cse_sincos (ctxt
);
/* A symbolic number is used to detect byte permutation and selection
   patterns.  Therefore the field N contains an artificial number
   consisting of octet sized markers:

     0       - target byte has the value 0
     FF      - target byte has an unknown value (e.g. due to sign extension)
     1..size - the marker value minus one is the index (LSB = 0) of the
               source byte that provides the value of this target byte.

   To detect permutations on memory sources (arrays and structures), a
   symbolic number is also associated with a base address (the array or
   structure the load is made from), an offset from the base address and a
   range which gives the difference between the highest and lowest accessed
   memory location to make such a symbolic number.  The range is thus
   different from size, which reflects the size of the type of the current
   expression.  Note that for a non-memory source, range holds the same
   value as size.

   For instance, for an array char a[], (short) a[0] | (short) a[3] would
   have a size of 2 but a range of 4, while (short) a[0] | ((short) a[0] << 1)
   would still have a size of 2 but this time a range of 1.  */
1635 struct symbolic_number
{
1640 HOST_WIDE_INT bytepos
;
1643 unsigned HOST_WIDE_INT range
;
1646 #define BITS_PER_MARKER 8
1647 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1648 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1649 #define HEAD_MARKER(n, size) \
1650 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1652 /* The number which the find_bswap_or_nop_1 result should match in
1653 order to have a nop. The number is masked according to the size of
1654 the symbolic number before using it. */
1655 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1656 (uint64_t)0x08070605 << 32 | 0x04030201)
1658 /* The number which the find_bswap_or_nop_1 result should match in
1659 order to have a byte swap. The number is masked according to the
1660 size of the symbolic number before using it. */
1661 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1662 (uint64_t)0x01020304 << 32 | 0x05060708)
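
/* Worked example (editorial): for a 32-bit value the masked constants
   are CMPNOP = 0x04030201 and CMPXCHG = 0x01020304.  The hand-written
   byte swap

     (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24)

   yields the symbolic number 0x01020304 (each target byte sourced from
   the opposite end of the value), matching CMPXCHG, whereas an
   expression that merely reassembles the bytes in their original order
   yields 0x04030201, matching CMPNOP.  */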
1664 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1665 number N. Return false if the requested operation is not permitted
1666 on a symbolic number. */
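/* Worked example (editorial): for a 4-byte symbolic number initialized
   to 0x04030201, an LROTATE_EXPR by 8 bits becomes a rotation by one
   marker position and yields 0x03020104, while a plain left shift by
   8 bits yields 0x03020100, the vacated low byte becoming the
   "value 0" marker.  */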
1669 do_shift_rotate (enum tree_code code
,
1670 struct symbolic_number
*n
,
1673 int i
, size
= TYPE_PRECISION (n
->type
) / BITS_PER_UNIT
;
1674 unsigned head_marker
;
1676 if (count
% BITS_PER_UNIT
!= 0)
1678 count
= (count
/ BITS_PER_UNIT
) * BITS_PER_MARKER
;
1680 /* Zero out the extra bits of N in order to avoid them being shifted
1681 into the significant bits. */
1682 if (size
< 64 / BITS_PER_MARKER
)
1683 n
->n
&= ((uint64_t) 1 << (size
* BITS_PER_MARKER
)) - 1;
1691 head_marker
= HEAD_MARKER (n
->n
, size
);
1693 /* Arithmetic shift of signed type: result is dependent on the value. */
1694 if (!TYPE_UNSIGNED (n
->type
) && head_marker
)
1695 for (i
= 0; i
< count
/ BITS_PER_MARKER
; i
++)
1696 n
->n
|= (uint64_t) MARKER_BYTE_UNKNOWN
1697 << ((size
- 1 - i
) * BITS_PER_MARKER
);
1700 n
->n
= (n
->n
<< count
) | (n
->n
>> ((size
* BITS_PER_MARKER
) - count
));
1703 n
->n
= (n
->n
>> count
) | (n
->n
<< ((size
* BITS_PER_MARKER
) - count
));
1708 /* Zero unused bits for size. */
1709 if (size
< 64 / BITS_PER_MARKER
)
1710 n
->n
&= ((uint64_t) 1 << (size
* BITS_PER_MARKER
)) - 1;
/* Perform sanity checking for the symbolic number N and the gimple
   statement STMT.  */
1718 verify_symbolic_number_p (struct symbolic_number
*n
, gimple stmt
)
1722 lhs_type
= gimple_expr_type (stmt
);
1724 if (TREE_CODE (lhs_type
) != INTEGER_TYPE
)
1727 if (TYPE_PRECISION (lhs_type
) != TYPE_PRECISION (n
->type
))
1733 /* Initialize the symbolic number N for the bswap pass from the base element
1734 SRC manipulated by the bitwise OR expression. */
1737 init_symbolic_number (struct symbolic_number
*n
, tree src
)
1741 n
->base_addr
= n
->offset
= n
->alias_set
= n
->vuse
= NULL_TREE
;
1743 /* Set up the symbolic number N by setting each byte to a value between 1 and
1744 the byte size of rhs1. The highest order byte is set to n->size and the
1745 lowest order byte to 1. */
1746 n
->type
= TREE_TYPE (src
);
1747 size
= TYPE_PRECISION (n
->type
);
1748 if (size
% BITS_PER_UNIT
!= 0)
1750 size
/= BITS_PER_UNIT
;
1751 if (size
> 64 / BITS_PER_MARKER
)
1756 if (size
< 64 / BITS_PER_MARKER
)
1757 n
->n
&= ((uint64_t) 1 << (size
* BITS_PER_MARKER
)) - 1;
/* Check if STMT might be a byte swap or a nop from a memory source and
   return the answer.  If so, REF is that memory source and the base of
   the memory area accessed and the offset of the access from that base
   are recorded in N.  */
1767 find_bswap_or_nop_load (gimple stmt
, tree ref
, struct symbolic_number
*n
)
1769 /* Leaf node is an array or component ref. Memorize its base and
1770 offset from base to compare to other such leaf node. */
1771 HOST_WIDE_INT bitsize
, bitpos
;
1773 int unsignedp
, volatilep
;
1774 tree offset
, base_addr
;
1776 if (!gimple_assign_load_p (stmt
) || gimple_has_volatile_ops (stmt
))
1779 base_addr
= get_inner_reference (ref
, &bitsize
, &bitpos
, &offset
, &mode
,
1780 &unsignedp
, &volatilep
, false);
1782 if (TREE_CODE (base_addr
) == MEM_REF
)
1784 offset_int bit_offset
= 0;
1785 tree off
= TREE_OPERAND (base_addr
, 1);
1787 if (!integer_zerop (off
))
1789 offset_int boff
, coff
= mem_ref_offset (base_addr
);
1790 boff
= wi::lshift (coff
, LOG2_BITS_PER_UNIT
);
1794 base_addr
= TREE_OPERAND (base_addr
, 0);
1796 /* Avoid returning a negative bitpos as this may wreak havoc later. */
1797 if (wi::neg_p (bit_offset
))
1799 offset_int mask
= wi::mask
<offset_int
> (LOG2_BITS_PER_UNIT
, false);
1800 offset_int tem
= bit_offset
.and_not (mask
);
1801 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
Subtract it from BIT_OFFSET and add it (scaled) to OFFSET.  */
1804 tem
= wi::arshift (tem
, LOG2_BITS_PER_UNIT
);
1806 offset
= size_binop (PLUS_EXPR
, offset
,
1807 wide_int_to_tree (sizetype
, tem
));
1809 offset
= wide_int_to_tree (sizetype
, tem
);
1812 bitpos
+= bit_offset
.to_shwi ();
1815 if (bitpos
% BITS_PER_UNIT
)
1817 if (bitsize
% BITS_PER_UNIT
)
1820 if (!init_symbolic_number (n
, ref
))
1822 n
->base_addr
= base_addr
;
1824 n
->bytepos
= bitpos
/ BITS_PER_UNIT
;
1825 n
->alias_set
= reference_alias_ptr_type (ref
);
1826 n
->vuse
= gimple_vuse (stmt
);
1830 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1831 the operation given by the rhs of STMT on the result. If the operation
1832 could successfully be executed the function returns a gimple stmt whose
rhs's first tree is the expression of the source operand, and NULL
otherwise.  */
1837 find_bswap_or_nop_1 (gimple stmt
, struct symbolic_number
*n
, int limit
)
1839 enum tree_code code
;
1840 tree rhs1
, rhs2
= NULL
;
1841 gimple rhs1_stmt
, rhs2_stmt
, source_stmt1
;
1842 enum gimple_rhs_class rhs_class
;
1844 if (!limit
|| !is_gimple_assign (stmt
))
1847 rhs1
= gimple_assign_rhs1 (stmt
);
1849 if (find_bswap_or_nop_load (stmt
, rhs1
, n
))
1852 if (TREE_CODE (rhs1
) != SSA_NAME
)
1855 code
= gimple_assign_rhs_code (stmt
);
1856 rhs_class
= gimple_assign_rhs_class (stmt
);
1857 rhs1_stmt
= SSA_NAME_DEF_STMT (rhs1
);
1859 if (rhs_class
== GIMPLE_BINARY_RHS
)
1860 rhs2
= gimple_assign_rhs2 (stmt
);
/* Handle unary rhs and binary rhs with integer constants as second
   operand.  */
1865 if (rhs_class
== GIMPLE_UNARY_RHS
1866 || (rhs_class
== GIMPLE_BINARY_RHS
1867 && TREE_CODE (rhs2
) == INTEGER_CST
))
1869 if (code
!= BIT_AND_EXPR
1870 && code
!= LSHIFT_EXPR
1871 && code
!= RSHIFT_EXPR
1872 && code
!= LROTATE_EXPR
1873 && code
!= RROTATE_EXPR
1874 && !CONVERT_EXPR_CODE_P (code
))
1877 source_stmt1
= find_bswap_or_nop_1 (rhs1_stmt
, n
, limit
- 1);
1879 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
1880 we have to initialize the symbolic number. */
1883 if (gimple_assign_load_p (stmt
)
1884 || !init_symbolic_number (n
, rhs1
))
1886 source_stmt1
= stmt
;
1893 int i
, size
= TYPE_PRECISION (n
->type
) / BITS_PER_UNIT
;
1894 uint64_t val
= int_cst_value (rhs2
), mask
= 0;
1895 uint64_t tmp
= (1 << BITS_PER_UNIT
) - 1;
1897 /* Only constants masking full bytes are allowed. */
1898 for (i
= 0; i
< size
; i
++, tmp
<<= BITS_PER_UNIT
)
1899 if ((val
& tmp
) != 0 && (val
& tmp
) != tmp
)
1902 mask
|= (uint64_t) MARKER_MASK
<< (i
* BITS_PER_MARKER
);
1911 if (!do_shift_rotate (code
, n
, (int)TREE_INT_CST_LOW (rhs2
)))
1916 int i
, type_size
, old_type_size
;
1919 type
= gimple_expr_type (stmt
);
1920 type_size
= TYPE_PRECISION (type
);
1921 if (type_size
% BITS_PER_UNIT
!= 0)
1923 type_size
/= BITS_PER_UNIT
;
1924 if (type_size
> 64 / BITS_PER_MARKER
)
1927 /* Sign extension: result is dependent on the value. */
1928 old_type_size
= TYPE_PRECISION (n
->type
) / BITS_PER_UNIT
;
1929 if (!TYPE_UNSIGNED (n
->type
) && type_size
> old_type_size
1930 && HEAD_MARKER (n
->n
, old_type_size
))
1931 for (i
= 0; i
< type_size
- old_type_size
; i
++)
1932 n
->n
|= (uint64_t) MARKER_BYTE_UNKNOWN
1933 << ((type_size
- 1 - i
) * BITS_PER_MARKER
);
1935 if (type_size
< 64 / BITS_PER_MARKER
)
1937 /* If STMT casts to a smaller type mask out the bits not
1938 belonging to the target type. */
1939 n
->n
&= ((uint64_t) 1 << (type_size
* BITS_PER_MARKER
)) - 1;
1943 n
->range
= type_size
;
1949 return verify_symbolic_number_p (n
, stmt
) ? source_stmt1
: NULL
;
1952 /* Handle binary rhs. */
1954 if (rhs_class
== GIMPLE_BINARY_RHS
)
1957 struct symbolic_number n1
, n2
;
1959 gimple source_stmt2
;
1961 if (code
!= BIT_IOR_EXPR
)
1964 if (TREE_CODE (rhs2
) != SSA_NAME
)
1967 rhs2_stmt
= SSA_NAME_DEF_STMT (rhs2
);
1972 source_stmt1
= find_bswap_or_nop_1 (rhs1_stmt
, &n1
, limit
- 1);
1977 source_stmt2
= find_bswap_or_nop_1 (rhs2_stmt
, &n2
, limit
- 1);
1982 if (TYPE_PRECISION (n1
.type
) != TYPE_PRECISION (n2
.type
))
1985 if (!n1
.vuse
!= !n2
.vuse
||
1986 (n1
.vuse
&& !operand_equal_p (n1
.vuse
, n2
.vuse
, 0)))
1989 if (gimple_assign_rhs1 (source_stmt1
)
1990 != gimple_assign_rhs1 (source_stmt2
))
1993 HOST_WIDE_INT off_sub
;
1994 struct symbolic_number
*n_ptr
;
1996 if (!n1
.base_addr
|| !n2
.base_addr
1997 || !operand_equal_p (n1
.base_addr
, n2
.base_addr
, 0))
1999 if (!n1
.offset
!= !n2
.offset
||
2000 (n1
.offset
&& !operand_equal_p (n1
.offset
, n2
.offset
, 0)))
2003 /* We swap n1 with n2 to have n1 < n2. */
2004 if (n2
.bytepos
< n1
.bytepos
)
2006 struct symbolic_number tmpn
;
2011 source_stmt1
= source_stmt2
;
2014 off_sub
= n2
.bytepos
- n1
.bytepos
;
2016 /* Check that the range of memory covered can be represented by
2017 a symbolic number. */
2018 if (off_sub
+ n2
.range
> 64 / BITS_PER_MARKER
)
2020 n
->range
= n2
.range
+ off_sub
;
2022 /* Reinterpret byte marks in symbolic number holding the value of
2023 bigger weight according to target endianness. */
2024 inc
= BYTES_BIG_ENDIAN
? off_sub
+ n2
.range
- n1
.range
: off_sub
;
2025 size
= TYPE_PRECISION (n1
.type
) / BITS_PER_UNIT
;
2026 if (BYTES_BIG_ENDIAN
)
2030 for (i
= 0; i
< size
; i
++, inc
<<= BITS_PER_MARKER
)
2033 (n_ptr
->n
>> (i
* BITS_PER_MARKER
)) & MARKER_MASK
;
2034 if (marker
&& marker
!= MARKER_BYTE_UNKNOWN
)
2039 n
->range
= n1
.range
;
2042 || alias_ptr_types_compatible_p (n1
.alias_set
, n2
.alias_set
))
2043 n
->alias_set
= n1
.alias_set
;
2045 n
->alias_set
= ptr_type_node
;
2047 n
->base_addr
= n1
.base_addr
;
2048 n
->offset
= n1
.offset
;
2049 n
->bytepos
= n1
.bytepos
;
2051 size
= TYPE_PRECISION (n
->type
) / BITS_PER_UNIT
;
2052 for (i
= 0, mask
= MARKER_MASK
; i
< size
;
2053 i
++, mask
<<= BITS_PER_MARKER
)
2055 uint64_t masked1
, masked2
;
2057 masked1
= n1
.n
& mask
;
2058 masked2
= n2
.n
& mask
;
2059 if (masked1
&& masked2
&& masked1
!= masked2
)
2064 if (!verify_symbolic_number_p (n
, stmt
))
2071 return source_stmt1
;
/* Check if STMT completes a bswap implementation or a read in a given
   endianness consisting of ORs, SHIFTs and ANDs, and sets *BSWAP
   accordingly.  It also sets N to represent the kind of operations
   performed: size of the resulting expression and whether it works on
   a memory source, and if so alias-set and vuse.  Finally, the
   function returns a stmt whose rhs's first tree is the source
   expression.  */

find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)

  /* The number which the find_bswap_or_nop_1 result should match in order
     to have a full byte swap.  The number is shifted to the right
     according to the size of the symbolic number before using it.  */
  uint64_t cmpxchg = CMPXCHG;
  uint64_t cmpnop = CMPNOP;

  /* The last parameter determines the depth search limit.  It usually
     correlates directly to the number n of bytes to be touched.  We
     increase that number by log2(n) + 1 here in order to also
     cover signed -> unsigned conversions of the src operand as can be seen
     in libgcc, and for initial shift/and operation of the src operand.  */
  limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
  limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
  source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
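/* Editor's note (worked example of the limit computation above, added for
   illustration): for an 8-byte result, TYPE_SIZE_UNIT is 8, so the search
   depth becomes limit = 8 + 1 + ceil_log2 (8) = 8 + 1 + 3 = 12 recursion
   steps, which leaves room for an extra conversion and an initial shift/and
   on top of the eight byte-sized pieces.  */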
  /* Find real size of result (highest non-zero byte).  */

      for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);

  /* Zero out the extra bits of N and CMP*.  */
  if (n->range < (int) sizeof (int64_t))

      mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
      cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;

  /* A complete byte swap should make the symbolic number start with
     the largest digit in the highest order byte.  An unchanged symbolic
     number indicates a read with the same endianness as the target
     architecture.  */
  if (n->n == cmpnop)
    *bswap = false;
  else if (n->n == cmpxchg)
    *bswap = true;
  else
    return NULL;

  /* Useless bit manipulation performed by code.  */
  if (!n->base_addr && n->n == cmpnop)
    return NULL;

  n->range *= BITS_PER_UNIT;
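/* Editor's illustrative sketch (not from the original sources): a typical
   source pattern that find_bswap_or_nop recognizes as a full byte swap,
   which bswap_replace below then turns into a __builtin_bswap32 call (or
   into a rotate for the 16-bit case):

     #include <stdint.h>

     uint32_t
     swap32 (uint32_t x)
     {
       return (x >> 24)
              | ((x >> 8) & 0x0000ff00)
              | ((x << 8) & 0x00ff0000)
              | (x << 24);
     }
*/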
const pass_data pass_data_optimize_bswap =
{
  GIMPLE_PASS, /* type */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
class pass_optimize_bswap : public gimple_opt_pass
{
public:
  pass_optimize_bswap (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
  {
    return flag_expensive_optimizations && optimize;
  }

  virtual unsigned int execute (function *);

}; // class pass_optimize_bswap
/* Perform the bswap optimization: replace the expression computed in the rhs
   of CUR_STMT by an equivalent bswap, load or load + bswap expression.
   Which of these alternatives replaces the rhs is given by N->base_addr
   (non null if a load is needed) and BSWAP.  The type, VUSE and alias-set of
   the load to perform are also given in N, while the builtin bswap to invoke
   is given in FNDECL.  Finally, if a load is involved, SRC_STMT refers to one
   of the load statements involved to construct the rhs in CUR_STMT and
   N->range gives the size of the rhs expression for maintaining some
   statistics.

   Note that if the replacement involves a load, CUR_STMT is moved just after
   SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
   being moved to a different basic block.  */
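/* Editor's note (illustrative, simplified gimple-like pseudo-dump; the
   temporary name load_dst matches the one created below, everything else
   is schematic): depending on N->base_addr and BSWAP the replacement takes
   roughly one of these shapes

     lhs = __builtin_bswap32 (src);                <- no load, bswap
     lhs = MEM <uint32_t> [addr];                  <- load only
     load_dst = MEM <uint32_t> [addr];             <- load + bswap
     lhs = __builtin_bswap32 (load_dst);

   with the 32-bit case standing in for the 16- and 64-bit variants.  */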
bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
               tree load_type, struct symbolic_number *n, bool bswap)

  gimple_stmt_iterator gsi;

  gsi = gsi_for_stmt (cur_stmt);
  src = gimple_assign_rhs1 (src_stmt);
  tgt = gimple_assign_lhs (cur_stmt);

  /* Need to load the value from memory first.  */

      gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
      tree addr_expr, addr_tmp, val_expr, val_tmp;
      tree load_offset_ptr, aligned_load_type;
      gimple addr_stmt, load_stmt;

      align = get_object_alignment (src);
          && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
          && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))

      /* Move cur_stmt just before one of the loads of the original
         expression to ensure it has the same VUSE.  See PR61517 for what
         could go wrong.  */
      gsi_move_before (&gsi, &gsi_ins);
      gsi = gsi_for_stmt (cur_stmt);

      /* Compute address to load from and cast according to the size
         of the load.  */
      addr_expr = build_fold_addr_expr (unshare_expr (src));
      if (is_gimple_min_invariant (addr_expr))
        addr_tmp = addr_expr;
      else
          addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
          addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
          gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);

      /* Perform the load.  */
      aligned_load_type = load_type;
      if (align < TYPE_ALIGN (load_type))
        aligned_load_type = build_aligned_type (load_type, align);
      load_offset_ptr = build_int_cst (n->alias_set, 0);
      val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,

          if (n->range == 16)
            nop_stats.found_16bit++;
          else if (n->range == 32)
            nop_stats.found_32bit++;
          else
            {
              gcc_assert (n->range == 64);
              nop_stats.found_64bit++;
            }

          /* Convert the result of load if necessary.  */
          if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))

              val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
              load_stmt = gimple_build_assign (val_tmp, val_expr);
              gimple_set_vuse (load_stmt, n->vuse);
              gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
              gimple_assign_set_rhs_with_ops_1 (&gsi, NOP_EXPR, val_tmp,
                                                NULL_TREE, NULL_TREE);

              gimple_assign_set_rhs_with_ops_1 (&gsi, MEM_REF, val_expr,
                                                NULL_TREE, NULL_TREE);
              gimple_set_vuse (cur_stmt, n->vuse);

          update_stmt (cur_stmt);

                       "%d bit load in target endianness found at: ",
              print_gimple_stmt (dump_file, cur_stmt, 0, 0);

      val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
      load_stmt = gimple_build_assign (val_tmp, val_expr);
      gimple_set_vuse (load_stmt, n->vuse);
      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);

  if (n->range == 16)
    bswap_stats.found_16bit++;
  else if (n->range == 32)
    bswap_stats.found_32bit++;
  else
    {
      gcc_assert (n->range == 64);
      bswap_stats.found_64bit++;
    }

  /* Canonical form for a 16-bit bswap is a rotate expression.  Only 16-bit
     values are considered, as a rotation of a 2N-bit value by N bits is
     generally not equivalent to a bswap.  Consider for instance
     0x01020304 >> 16 which gives 0x03040102 while a bswap for that value
     is 0x04030201.  */
  if (bswap && n->range == 16)
    {
      tree count = build_int_cst (NULL, BITS_PER_UNIT);
      bswap_type = TREE_TYPE (src);
      src = fold_build2 (LROTATE_EXPR, bswap_type, src, count);
      bswap_stmt = gimple_build_assign (NULL, src);
    }
  else

      /* Convert the src expression if necessary.  */
      if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))

          gimple convert_stmt;
          tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
          convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tmp, src,
          gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);

      bswap_stmt = gimple_build_call (fndecl, 1, tmp);

  /* Convert the result if necessary.  */
  if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))

      gimple convert_stmt;
      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
      convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tgt, tmp, NULL);
      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);

  gimple_set_lhs (bswap_stmt, tmp);

      fprintf (dump_file, "%d bit bswap implementation found at: ",
      print_gimple_stmt (dump_file, cur_stmt, 0, 0);

  gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
  gsi_remove (&gsi, true);
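/* Editor's illustrative sketch (added commentary): for the 16-bit case the
   canonical replacement above is a rotate rather than a builtin call, e.g.

     #include <stdint.h>

     uint16_t
     swap16 (uint16_t x)
     {
       return (uint16_t) ((x >> 8) | (x << 8));
     }

   is rewritten to the equivalent of a left rotate of x by BITS_PER_UNIT
   (8) bits, which expresses the same permutation of the two bytes.  */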
/* Find manual byte swap implementations as well as loads in a given
   endianness.  Byte swaps are turned into a bswap builtin invocation,
   while endian loads are converted to a bswap builtin invocation or a
   simple load according to the target endianness.  */
pass_optimize_bswap::execute (function *fun)

  bool bswap16_p, bswap32_p, bswap64_p;
  bool changed = false;
  tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;

  if (BITS_PER_UNIT != 8)
    return 0;

  bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
               && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
  bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
               && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
  bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
               && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
                   || (bswap32_p && word_mode == SImode)));

  /* Determine the argument type of the builtins.  The code later on
     assumes that the return and argument type are the same.  */

      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
      bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));

      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
      bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));

      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
      bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));

  memset (&nop_stats, 0, sizeof (nop_stats));
  memset (&bswap_stats, 0, sizeof (bswap_stats));

  FOR_EACH_BB_FN (bb, fun)

      gimple_stmt_iterator gsi;

      /* We do a reverse scan for bswap patterns to make sure we get the
         widest match.  As bswap pattern matching doesn't handle previously
         inserted smaller bswap replacements as sub-patterns, the wider
         variant wouldn't be detected.  */
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)

          gimple src_stmt, cur_stmt = gsi_stmt (gsi);
          tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
          enum tree_code code;
          struct symbolic_number n;

          /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
             might be moved to a different basic block by bswap_replace and gsi
             must not point to it if that's the case.  Doing the gsi_prev
             there makes sure that gsi points to the statement previous to
             cur_stmt while still making sure that all statements are
             considered in this basic block.  */

          if (!is_gimple_assign (cur_stmt))
            continue;

          code = gimple_assign_rhs_code (cur_stmt);

              if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
                  || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))

          src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);

              /* Already in canonical form, nothing to do.  */
              if (code == LROTATE_EXPR || code == RROTATE_EXPR)
                continue;
              load_type = uint16_type_node;

                  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
                  bswap_type = bswap16_type;

              load_type = uint32_type_node;

                  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
                  bswap_type = bswap32_type;

              load_type = uint64_type_node;

                  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
                  bswap_type = bswap64_type;

          if (bswap && !fndecl)
            continue;

          if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,

  statistics_counter_event (fun, "16-bit nop implementations found",
                            nop_stats.found_16bit);
  statistics_counter_event (fun, "32-bit nop implementations found",
                            nop_stats.found_32bit);
  statistics_counter_event (fun, "64-bit nop implementations found",
                            nop_stats.found_64bit);
  statistics_counter_event (fun, "16-bit bswap implementations found",
                            bswap_stats.found_16bit);
  statistics_counter_event (fun, "32-bit bswap implementations found",
                            bswap_stats.found_32bit);
  statistics_counter_event (fun, "64-bit bswap implementations found",
                            bswap_stats.found_64bit);

  return (changed ? TODO_update_ssa : 0);
make_pass_optimize_bswap (gcc::context *ctxt)
{
  return new pass_optimize_bswap (ctxt);
}
/* Return true if stmt is a type conversion operation that can be stripped
   when used in a widening multiply operation.  */

widening_mult_conversion_strippable_p (tree result_type, gimple stmt)

  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);

  if (TREE_CODE (result_type) == INTEGER_TYPE)

      if (!CONVERT_EXPR_CODE_P (rhs_code))
        return false;

      op_type = TREE_TYPE (gimple_assign_lhs (stmt));

      /* If the type of OP has the same precision as the result, then
         we can strip this conversion.  The multiply operation will be
         selected to create the correct extension as a by-product.  */
      if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
        return true;

      /* We can also strip a conversion if it preserves the signed-ness of
         the operation and doesn't narrow the range.  */
      inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      /* If the inner-most type is unsigned, then we can strip any
         intermediate widening operation.  If it's signed, then the
         intermediate widening operation must also be signed.  */
      if ((TYPE_UNSIGNED (inner_op_type)
           || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
          && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
        return true;

  return rhs_code == FIXED_CONVERT_EXPR;
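/* Editor's illustrative sketch (not from the original sources; assumes a
   typical target where short is 16 bits and long is at least 32 bits): in

     long
     widening_product (short a, short b)
     {
       return (long) a * (long) b;
     }

   the two (long) conversions have the same precision as the result, so by
   the first test above they can be stripped and the multiplication can be
   treated as a widening multiply of the two short operands.  */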
/* Return true if RHS is a suitable operand for a widening multiplication,
   assuming a target type of TYPE.
   There are two cases:

     - RHS makes some value at least twice as wide.  Store that value
       in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.

     - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
       but leave *TYPE_OUT untouched.  */

is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,

  if (TREE_CODE (rhs) == SSA_NAME)

      stmt = SSA_NAME_DEF_STMT (rhs);
      if (is_gimple_assign (stmt))

          if (! widening_mult_conversion_strippable_p (type, stmt))

          rhs1 = gimple_assign_rhs1 (stmt);

          if (TREE_CODE (rhs1) == INTEGER_CST)

              *new_rhs_out = rhs1;

      type1 = TREE_TYPE (rhs1);

      if (TREE_CODE (type1) != TREE_CODE (type)
          || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
        return false;

      *new_rhs_out = rhs1;

  if (TREE_CODE (rhs) == INTEGER_CST)
/* Return true if STMT performs a widening multiplication, assuming the
   output type is TYPE.  If so, store the unwidened types of the operands
   in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
   *RHS2_OUT such that converting those operands to types *TYPE1_OUT
   and *TYPE2_OUT would give the operands of the multiplication.  */

is_widening_mult_p (gimple stmt,
                    tree *type1_out, tree *rhs1_out,
                    tree *type2_out, tree *rhs2_out)

  tree type = TREE_TYPE (gimple_assign_lhs (stmt));

  if (TREE_CODE (type) != INTEGER_TYPE
      && TREE_CODE (type) != FIXED_POINT_TYPE)
    return false;

  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,

  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,

  if (*type1_out == NULL)
    {
      if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
        return false;
      *type1_out = *type2_out;
    }

  if (*type2_out == NULL)
    {
      if (!int_fits_type_p (*rhs2_out, *type1_out))
        return false;
      *type2_out = *type1_out;
    }

  /* Ensure that the larger of the two operands comes first.  */
  if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))

      *type1_out = *type2_out;

      *rhs1_out = *rhs2_out;
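/* Editor's illustrative sketch (added commentary): for

     #include <stdint.h>

     uint64_t
     scale (uint32_t x)
     {
       return (uint64_t) x * 1000;
     }

   the first operand reports unwidened type uint32_t while the constant
   1000 leaves its type slot NULL, so *type2_out is copied from *type1_out
   after the int_fits_type_p check; the final swap then guarantees the
   wider of the two unwidened types always comes first.  */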
/* Process a single gimple statement STMT, which has a MULT_EXPR as
   its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
   value is true iff we converted the statement.  */

convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)

  tree lhs, rhs1, rhs2, type, type1, type2;
  enum insn_code handler;
  machine_mode to_mode, from_mode, actual_mode;
  int actual_precision;
  location_t loc = gimple_location (stmt);
  bool from_unsigned1, from_unsigned2;

  lhs = gimple_assign_lhs (stmt);
  type = TREE_TYPE (lhs);
  if (TREE_CODE (type) != INTEGER_TYPE)
    return false;

  if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
    return false;

  to_mode = TYPE_MODE (type);
  from_mode = TYPE_MODE (type1);
  from_unsigned1 = TYPE_UNSIGNED (type1);
  from_unsigned2 = TYPE_UNSIGNED (type2);

  if (from_unsigned1 && from_unsigned2)
    op = umul_widen_optab;
  else if (!from_unsigned1 && !from_unsigned2)
    op = smul_widen_optab;
  else
    op = usmul_widen_optab;

  handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,

  if (handler == CODE_FOR_nothing)
    {
      if (op != smul_widen_optab)
        {
          /* We can use a signed multiply with unsigned types as long as
             there is a wider mode to use, or it is the smaller of the two
             types that is unsigned.  Note that type1 >= type2, always.  */
          if ((TYPE_UNSIGNED (type1)
               && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
              || (TYPE_UNSIGNED (type2)
                  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
            {
              from_mode = GET_MODE_WIDER_MODE (from_mode);
              if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
                return false;
            }

          op = smul_widen_optab;
          handler = find_widening_optab_handler_and_mode (op, to_mode,

          if (handler == CODE_FOR_nothing)
            return false;

          from_unsigned1 = from_unsigned2 = false;
        }
      else
        return false;
    }

  /* Ensure that the inputs to the handler are in the correct precision
     for the opcode.  This will be the full mode size.  */
  actual_precision = GET_MODE_PRECISION (actual_mode);
  if (2 * actual_precision > TYPE_PRECISION (type))
    return false;
  if (actual_precision != TYPE_PRECISION (type1)
      || from_unsigned1 != TYPE_UNSIGNED (type1))
    rhs1 = build_and_insert_cast (gsi, loc,
                                  build_nonstandard_integer_type
                                  (actual_precision, from_unsigned1), rhs1);
  if (actual_precision != TYPE_PRECISION (type2)
      || from_unsigned2 != TYPE_UNSIGNED (type2))
    rhs2 = build_and_insert_cast (gsi, loc,
                                  build_nonstandard_integer_type
                                  (actual_precision, from_unsigned2), rhs2);

  /* Handle constants.  */
  if (TREE_CODE (rhs1) == INTEGER_CST)
    rhs1 = fold_convert (type1, rhs1);
  if (TREE_CODE (rhs2) == INTEGER_CST)
    rhs2 = fold_convert (type2, rhs2);

  gimple_assign_set_rhs1 (stmt, rhs1);
  gimple_assign_set_rhs2 (stmt, rhs2);
  gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);

  widen_mul_stats.widen_mults_inserted++;
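/* Editor's note (illustrative, simplified gimple-like pseudo-dump): on a
   target with a 32x32->64 widening multiply pattern, the transformation
   above rewrites

     _1 = (long long int) a_2(D);
     _3 = (long long int) b_4(D);
     c_5 = _1 * _3;

   into

     c_5 = a_2(D) w* b_4(D);

   i.e. the MULT_EXPR becomes a WIDEN_MULT_EXPR on the narrow operands and
   the feeding conversions are left for dead-code elimination.  */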
/* Process a single gimple statement STMT, which is found at the
   iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
   rhs (given by CODE), and try to convert it into a
   WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
   is true iff we converted the statement.  */

convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
                            enum tree_code code)

  gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
  gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
  tree type, type1, type2, optype;
  tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
  enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
  enum tree_code wmult_code;
  enum insn_code handler;
  machine_mode to_mode, from_mode, actual_mode;
  location_t loc = gimple_location (stmt);
  int actual_precision;
  bool from_unsigned1, from_unsigned2;

  lhs = gimple_assign_lhs (stmt);
  type = TREE_TYPE (lhs);
  if (TREE_CODE (type) != INTEGER_TYPE
      && TREE_CODE (type) != FIXED_POINT_TYPE)
    return false;

  if (code == MINUS_EXPR)
    wmult_code = WIDEN_MULT_MINUS_EXPR;
  else
    wmult_code = WIDEN_MULT_PLUS_EXPR;

  rhs1 = gimple_assign_rhs1 (stmt);
  rhs2 = gimple_assign_rhs2 (stmt);

  if (TREE_CODE (rhs1) == SSA_NAME)
    {
      rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
      if (is_gimple_assign (rhs1_stmt))
        rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
    }

  if (TREE_CODE (rhs2) == SSA_NAME)
    {
      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
      if (is_gimple_assign (rhs2_stmt))
        rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
    }

  /* Allow for one conversion statement between the multiply
     and addition/subtraction statement.  If there is more than
     one conversion then we assume they would invalidate this
     transformation.  If that's not the case then they should have
     been folded before now.  */
  if (CONVERT_EXPR_CODE_P (rhs1_code))

      conv1_stmt = rhs1_stmt;
      rhs1 = gimple_assign_rhs1 (rhs1_stmt);
      if (TREE_CODE (rhs1) == SSA_NAME)
        {
          rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
          if (is_gimple_assign (rhs1_stmt))
            rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
        }

  if (CONVERT_EXPR_CODE_P (rhs2_code))

      conv2_stmt = rhs2_stmt;
      rhs2 = gimple_assign_rhs1 (rhs2_stmt);
      if (TREE_CODE (rhs2) == SSA_NAME)
        {
          rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
          if (is_gimple_assign (rhs2_stmt))
            rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
        }

  /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
     is_widening_mult_p, but we still need the rhs returns.

     It might also appear that it would be sufficient to use the existing
     operands of the widening multiply, but that would limit the choice of
     multiply-and-accumulate instructions.

     If the widened-multiplication result has more than one use, it is
     probably wiser not to do the conversion.  */
  if (code == PLUS_EXPR
      && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))

      if (!has_single_use (rhs1)
          || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;

      conv_stmt = conv1_stmt;

  else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)

      if (!has_single_use (rhs2)
          || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;

      conv_stmt = conv2_stmt;
  to_mode = TYPE_MODE (type);
  from_mode = TYPE_MODE (type1);
  from_unsigned1 = TYPE_UNSIGNED (type1);
  from_unsigned2 = TYPE_UNSIGNED (type2);

  /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
  if (from_unsigned1 != from_unsigned2)

      if (!INTEGRAL_TYPE_P (type))
        return false;

      /* We can use a signed multiply with unsigned types as long as
         there is a wider mode to use, or it is the smaller of the two
         types that is unsigned.  Note that type1 >= type2, always.  */
           && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
              && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))

          from_mode = GET_MODE_WIDER_MODE (from_mode);
          if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
            return false;

      from_unsigned1 = from_unsigned2 = false;
      optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),

  /* If there was a conversion between the multiply and addition
     then we need to make sure it fits a multiply-and-accumulate.
     There should be a single mode change which does not change the
     sign.  */

      /* We use the original, unmodified data types for this.  */
      tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
      tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
      int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
      bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);

      if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
        {
          /* Conversion is a truncate.  */
          if (TYPE_PRECISION (to_type) < data_size)
            return false;
        }
      else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
        {
          /* Conversion is an extend.  Check it's the right sort.  */
          if (TYPE_UNSIGNED (from_type) != is_unsigned
              && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
            return false;
        }
      /* else convert is a no-op for our purposes.  */

  /* Verify that the machine can perform a widening multiply
     accumulate in this mode/signedness combination, otherwise
     this transformation is likely to pessimize code.  */
  this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
  handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
                                                  from_mode, 0, &actual_mode);

  if (handler == CODE_FOR_nothing)
    return false;

  /* Ensure that the inputs to the handler are in the correct precision
     for the opcode.  This will be the full mode size.  */
  actual_precision = GET_MODE_PRECISION (actual_mode);
  if (actual_precision != TYPE_PRECISION (type1)
      || from_unsigned1 != TYPE_UNSIGNED (type1))
    mult_rhs1 = build_and_insert_cast (gsi, loc,
                                       build_nonstandard_integer_type
                                       (actual_precision, from_unsigned1),
  if (actual_precision != TYPE_PRECISION (type2)
      || from_unsigned2 != TYPE_UNSIGNED (type2))
    mult_rhs2 = build_and_insert_cast (gsi, loc,
                                       build_nonstandard_integer_type
                                       (actual_precision, from_unsigned2),

  if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
    add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);

  /* Handle constants.  */
  if (TREE_CODE (mult_rhs1) == INTEGER_CST)
    mult_rhs1 = fold_convert (type1, mult_rhs1);
  if (TREE_CODE (mult_rhs2) == INTEGER_CST)
    mult_rhs2 = fold_convert (type2, mult_rhs2);

  gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
  update_stmt (gsi_stmt (*gsi));
  widen_mul_stats.maccs_inserted++;
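/* Editor's illustrative sketch (not from the original sources): the source
   shape that typically reaches the WIDEN_MULT_PLUS_EXPR/WIDEN_MULT_MINUS_EXPR
   rewrite above is a multiply-accumulate such as

     long long
     dot (const int *a, const int *b, int n)
     {
       long long acc = 0;
       for (int i = 0; i < n; i++)
         acc += (long long) a[i] * b[i];
       return acc;
     }

   provided the optab query above finds a matching widening
   multiply-accumulate instruction for the target.  */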
/* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
   with uses in additions and subtractions to form fused multiply-add
   operations.  Returns true if successful and MUL_STMT should be removed.  */

convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)

  tree mul_result = gimple_get_lhs (mul_stmt);
  tree type = TREE_TYPE (mul_result);
  gimple use_stmt, neguse_stmt;
  use_operand_p use_p;
  imm_use_iterator imm_iter;

  if (FLOAT_TYPE_P (type)
      && flag_fp_contract_mode == FP_CONTRACT_OFF)
    return false;

  /* We don't want to do bitfield reduction ops.  */
  if (INTEGRAL_TYPE_P (type)
      && (TYPE_PRECISION (type)
          != GET_MODE_PRECISION (TYPE_MODE (type))))
    return false;

  /* If the target doesn't support it, don't generate it.  We assume that
     if fma isn't available then fms, fnma or fnms are not either.  */
  if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
    return false;

  /* If the multiplication has zero uses, it is kept around probably because
     of -fnon-call-exceptions.  Don't optimize it away in that case,
     that is DCE's job.  */
  if (has_zero_uses (mul_result))
    return false;

  /* Make sure that the multiplication statement becomes dead after
     the transformation, thus that all uses are transformed to FMAs.
     This means we assume that an FMA operation has the same cost
     as an addition.  */
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)

      enum tree_code use_code;
      tree result = mul_result;
      bool negate_p = false;

      use_stmt = USE_STMT (use_p);

      if (is_gimple_debug (use_stmt))
        continue;

      /* For now restrict this operation to single basic blocks.  In theory
         we would want to support sinking the multiplication into a
         conditional use, to form a fma in the then block and sink the
         multiplication to the else block.  */
      if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
        return false;

      if (!is_gimple_assign (use_stmt))
        return false;

      use_code = gimple_assign_rhs_code (use_stmt);

      /* A negate on the multiplication leads to FNMA.  */
      if (use_code == NEGATE_EXPR)

          result = gimple_assign_lhs (use_stmt);

          /* Make sure the negate statement becomes dead with this
             single transformation.  */
          if (!single_imm_use (gimple_assign_lhs (use_stmt),
                               &use_p, &neguse_stmt))
            return false;

          /* Make sure the multiplication isn't also used on that stmt.  */
          FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
            if (USE_FROM_PTR (usep) == mul_result)
              return false;

          use_stmt = neguse_stmt;
          if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
            return false;
          if (!is_gimple_assign (use_stmt))
            return false;

          use_code = gimple_assign_rhs_code (use_stmt);

          if (gimple_assign_rhs2 (use_stmt) == result)
            negate_p = !negate_p;

          /* FMA can only be formed from PLUS and MINUS.  */
      /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
         by a MULT_EXPR that we'll visit later, we might be able to
         get a more profitable match with fnma.
         OTOH, if we don't, a negate / fma pair has likely lower latency
         than a mult / subtract pair.  */
      if (use_code == MINUS_EXPR && !negate_p
          && gimple_assign_rhs1 (use_stmt) == result
          && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
          && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
        {
          tree rhs2 = gimple_assign_rhs2 (use_stmt);

          if (TREE_CODE (rhs2) == SSA_NAME)
            {
              gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
              if (has_single_use (rhs2)
                  && is_gimple_assign (stmt2)
                  && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
                return false;
            }
        }

      /* We can't handle a * b + a * b.  */
      if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
        return false;

      /* While it is possible to validate whether or not the exact form
         that we've recognized is available in the backend, the assumption
         is that the transformation is never a loss.  For instance, suppose
         the target only has the plain FMA pattern available.  Consider
         a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
         is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
         still have 3 operations, but in the FMA form the two NEGs are
         independent and could be run in parallel.  */

  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)

      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      enum tree_code use_code;
      tree addop, mulop1 = op1, result = mul_result;
      bool negate_p = false;

      if (is_gimple_debug (use_stmt))
        continue;

      use_code = gimple_assign_rhs_code (use_stmt);
      if (use_code == NEGATE_EXPR)

          result = gimple_assign_lhs (use_stmt);
          single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
          gsi_remove (&gsi, true);
          release_defs (use_stmt);

          use_stmt = neguse_stmt;
          gsi = gsi_for_stmt (use_stmt);
          use_code = gimple_assign_rhs_code (use_stmt);

      if (gimple_assign_rhs1 (use_stmt) == result)

          addop = gimple_assign_rhs2 (use_stmt);
          /* a * b - c -> a * b + (-c)  */
          if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
            addop = force_gimple_operand_gsi (&gsi,
                                              build1 (NEGATE_EXPR,
                                              true, NULL_TREE, true,
      else

          addop = gimple_assign_rhs1 (use_stmt);
          /* a - b * c -> (-b) * c + a  */
          if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
            negate_p = !negate_p;

        mulop1 = force_gimple_operand_gsi (&gsi,
                                           build1 (NEGATE_EXPR,
                                           true, NULL_TREE, true,

      fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
                                               gimple_assign_lhs (use_stmt),
      gsi_replace (&gsi, fma_stmt, true);
      widen_mul_stats.fmas_inserted++;
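/* Editor's illustrative sketch (added commentary; assumes FP contraction is
   enabled and the target has an fma pattern): each use of the product is
   rewritten in place, e.g. with double a, b, c

     a * b + c      ->  FMA_EXPR <a, b, c>
     a * b - c      ->  FMA_EXPR <a, b, -c>
     c - a * b      ->  FMA_EXPR <-a, b, c>

   and the multiplication itself, now dead, is removed by the caller.  */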
/* Find integer multiplications where the operands are extended from
   smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
   where appropriate.  */

const pass_data pass_data_optimize_widening_mul =
{
  GIMPLE_PASS, /* type */
  "widening_mul", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_optimize_widening_mul : public gimple_opt_pass
{
public:
  pass_optimize_widening_mul (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
  {
    return flag_expensive_optimizations && optimize;
  }

  virtual unsigned int execute (function *);

}; // class pass_optimize_widening_mul
pass_optimize_widening_mul::execute (function *fun)

  bool cfg_changed = false;

  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));

  FOR_EACH_BB_FN (bb, fun)

      gimple_stmt_iterator gsi;

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)

          gimple stmt = gsi_stmt (gsi);
          enum tree_code code;

          if (is_gimple_assign (stmt))

              code = gimple_assign_rhs_code (stmt);

                  if (!convert_mult_to_widen (stmt, &gsi)
                      && convert_mult_to_fma (stmt,
                                              gimple_assign_rhs1 (stmt),
                                              gimple_assign_rhs2 (stmt)))

                      gsi_remove (&gsi, true);
                      release_defs (stmt);

                  convert_plusminus_to_widen (&gsi, stmt, code);

          else if (is_gimple_call (stmt)
                   && gimple_call_lhs (stmt))

              tree fndecl = gimple_call_fndecl (stmt);
                  && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)

                  switch (DECL_FUNCTION_CODE (fndecl))

                      if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
                          && REAL_VALUES_EQUAL
                             (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
                          && convert_mult_to_fma (stmt,
                                                  gimple_call_arg (stmt, 0),
                                                  gimple_call_arg (stmt, 0)))

                          unlink_stmt_vdef (stmt);
                          if (gsi_remove (&gsi, true)
                              && gimple_purge_dead_eh_edges (bb))
                            cfg_changed = true;
                          release_defs (stmt);

  statistics_counter_event (fun, "widening multiplications inserted",
                            widen_mul_stats.widen_mults_inserted);
  statistics_counter_event (fun, "widening maccs inserted",
                            widen_mul_stats.maccs_inserted);
  statistics_counter_event (fun, "fused multiply-adds inserted",
                            widen_mul_stats.fmas_inserted);

  return cfg_changed ? TODO_cleanup_cfg : 0;

make_pass_optimize_widening_mul (gcc::context *ctxt)
{
  return new pass_optimize_widening_mul (ctxt);
}