gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2015 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
  45    hy the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
  63    and walk it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "tm.h"
  91 #include "flags.h"
  92 #include "hash-set.h"
  93 #include "machmode.h"
  94 #include "vec.h"
  95 #include "double-int.h"
  96 #include "input.h"
  97 #include "alias.h"
  98 #include "symtab.h"
  99 #include "wide-int.h"
 100 #include "inchash.h"
 101 #include "tree.h"
 102 #include "fold-const.h"
 103 #include "predict.h"
 104 #include "hard-reg-set.h"
 105 #include "input.h"
 106 #include "function.h"
 107 #include "dominance.h"
 108 #include "cfg.h"
 109 #include "basic-block.h"
 110 #include "tree-ssa-alias.h"
 111 #include "internal-fn.h"
 112 #include "gimple-fold.h"
 113 #include "gimple-expr.h"
 114 #include "is-a.h"
 115 #include "gimple.h"
 116 #include "gimple-iterator.h"
 117 #include "gimplify.h"
 118 #include "gimplify-me.h"
 119 #include "stor-layout.h"
 120 #include "gimple-ssa.h"
 121 #include "tree-cfg.h"
 122 #include "tree-phinodes.h"
 123 #include "ssa-iterators.h"
 124 #include "stringpool.h"
 125 #include "tree-ssanames.h"
 126 #include "expr.h"
 127 #include "tree-dfa.h"
 128 #include "tree-ssa.h"
 129 #include "tree-pass.h"
 130 #include "alloc-pool.h"
 131 #include "target.h"
 132 #include "gimple-pretty-print.h"
 133 #include "builtins.h"
 134
 135 /* FIXME: RTL headers have to be included here for optabs.  */
 136 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 137 #include "expr.h"               /* Because optabs.h wants sepops.  */
 138 #include "insn-codes.h"
 139 #include "optabs.h"
 140
 141 /* This structure represents one basic block that either computes a
 142    division, or is a common dominator for basic block that compute a
 143    division.  */
 144 struct occurrence {
 145   /* The basic block represented by this structure.  */
 146   basic_block bb;
 147
 148   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 149      inserted in BB.  */
 150   tree recip_def;
 151
 152   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 153      was inserted in BB.  */
 154   gimple recip_def_stmt;
 155
 156   /* Pointer to a list of "struct occurrence"s for blocks dominated
 157      by BB.  */
 158   struct occurrence *children;
 159
 160   /* Pointer to the next "struct occurrence"s in the list of blocks
 161      sharing a common dominator.  */
 162   struct occurrence *next;
 163
 164   /* The number of divisions that are in BB before compute_merit.  The
 165      number of divisions that are in BB or post-dominate it after
 166      compute_merit.  */
 167   int num_divisions;
 168
 169   /* True if the basic block has a division, false if it is a common
 170      dominator for basic blocks that do.  If it is false and trapping
 171      math is active, BB is not a candidate for inserting a reciprocal.  */
 172   bool bb_has_division;
 173 };
 174
 175 static struct
 176 {
 177   /* Number of 1.0/X ops inserted.  */
 178   int rdivs_inserted;
 179
 180   /* Number of 1.0/FUNC ops inserted.  */
 181   int rfuncs_inserted;
 182 } reciprocal_stats;
 183
 184 static struct
 185 {
 186   /* Number of cexpi calls inserted.  */
 187   int inserted;
 188 } sincos_stats;
 189
 190 static struct
 191 {
 192   /* Number of hand-written 16-bit nop / bswaps found.  */
 193   int found_16bit;
 194
 195   /* Number of hand-written 32-bit nop / bswaps found.  */
 196   int found_32bit;
 197
 198   /* Number of hand-written 64-bit nop / bswaps found.  */
 199   int found_64bit;
 200 } nop_stats, bswap_stats;
 201
 202 static struct
 203 {
 204   /* Number of widening multiplication ops inserted.  */
 205   int widen_mults_inserted;
 206
 207   /* Number of integer multiply-and-accumulate ops inserted.  */
 208   int maccs_inserted;
 209
 210   /* Number of fp fused multiply-add ops inserted.  */
 211   int fmas_inserted;
 212 } widen_mul_stats;
 213
 214 /* The instance of "struct occurrence" representing the highest
 215    interesting block in the dominator tree.  */
 216 static struct occurrence *occ_head;
 217
 218 /* Allocation pool for getting instances of "struct occurrence".  */
 219 static alloc_pool occ_pool;
 220
 221
 222
 223 /* Allocate and return a new struct occurrence for basic block BB, and
 224    whose children list is headed by CHILDREN.  */
 225 static struct occurrence *
 226 occ_new (basic_block bb, struct occurrence *children)
 227 {
 228   struct occurrence *occ;
 229
 230   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 231   memset (occ, 0, sizeof (struct occurrence));
 232
 233   occ->bb = bb;
 234   occ->children = children;
 235   return occ;
 236 }
 237
 238
 239 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 240    list of "struct occurrence"s, one per basic block, having IDOM as
 241    their common dominator.
 242
 243    We try to insert NEW_OCC as deep as possible in the tree, and we also
 244    insert any other block that is a common dominator for BB and one
 245    block already in the tree.  */
 246
 247 static void
 248 insert_bb (struct occurrence *new_occ, basic_block idom,
 249            struct occurrence **p_head)
 250 {
 251   struct occurrence *occ, **p_occ;
 252
 253   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 254     {
 255       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 256       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 257       if (dom == bb)
 258         {
 259           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 260              from its list.  */
 261           *p_occ = occ->next;
 262           occ->next = new_occ->children;
 263           new_occ->children = occ;
 264
 265           /* Try the next block (it may as well be dominated by BB).  */
 266         }
 267
 268       else if (dom == occ_bb)
 269         {
 270           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 271           insert_bb (new_occ, dom, &occ->children);
 272           return;
 273         }
 274
 275       else if (dom != idom)
 276         {
 277           gcc_assert (!dom->aux);
 278
 279           /* There is a dominator between IDOM and BB, add it and make
 280              two children out of NEW_OCC and OCC.  First, remove OCC from
 281              its list.  */
 282           *p_occ = occ->next;
 283           new_occ->next = occ;
 284           occ->next = NULL;
 285
 286           /* None of the previous blocks has DOM as a dominator: if we tail
 287              recursed, we would reexamine them uselessly. Just switch BB with
 288              DOM, and go on looking for blocks dominated by DOM.  */
 289           new_occ = occ_new (dom, new_occ);
 290         }
 291
 292       else
 293         {
 294           /* Nothing special, go on with the next element.  */
 295           p_occ = &occ->next;
 296         }
 297     }
 298
 299   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 300   new_occ->next = *p_head;
 301   *p_head = new_occ;
 302 }
 303
 304 /* Register that we found a division in BB.  */
 305
 306 static inline void
 307 register_division_in (basic_block bb)
 308 {
 309   struct occurrence *occ;
 310
 311   occ = (struct occurrence *) bb->aux;
 312   if (!occ)
 313     {
 314       occ = occ_new (bb, NULL);
 315       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
 316     }
 317
 318   occ->bb_has_division = true;
 319   occ->num_divisions++;
 320 }
 321
 322
 323 /* Compute the number of divisions that postdominate each block in OCC and
 324    its children.  */
 325
 326 static void
 327 compute_merit (struct occurrence *occ)
 328 {
 329   struct occurrence *occ_child;
 330   basic_block dom = occ->bb;
 331
 332   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 333     {
 334       basic_block bb;
 335       if (occ_child->children)
 336         compute_merit (occ_child);
 337
 338       if (flag_exceptions)
 339         bb = single_noncomplex_succ (dom);
 340       else
 341         bb = dom;
 342
 343       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 344         occ->num_divisions += occ_child->num_divisions;
 345     }
 346 }
 347
 348
 349 /* Return whether USE_STMT is a floating-point division by DEF.  */
 350 static inline bool
 351 is_division_by (gimple use_stmt, tree def)
 352 {
 353   return is_gimple_assign (use_stmt)
 354          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 355          && gimple_assign_rhs2 (use_stmt) == def
 356          /* Do not recognize x / x as valid division, as we are getting
 357             confused later by replacing all immediate uses x in such
 358             a stmt.  */
 359          && gimple_assign_rhs1 (use_stmt) != def;
 360 }
 361
 362 /* Walk the subset of the dominator tree rooted at OCC, setting the
 363    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 364    the given basic block.  The field may be left NULL, of course,
 365    if it is not possible or profitable to do the optimization.
 366
 367    DEF_BSI is an iterator pointing at the statement defining DEF.
 368    If RECIP_DEF is set, a dominator already has a computation that can
 369    be used.  */
 370
 371 static void
 372 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 373                     tree def, tree recip_def, int threshold)
 374 {
 375   tree type;
 376   gassign *new_stmt;
 377   gimple_stmt_iterator gsi;
 378   struct occurrence *occ_child;
 379
 380   if (!recip_def
 381       && (occ->bb_has_division || !flag_trapping_math)
 382       && occ->num_divisions >= threshold)
 383     {
 384       /* Make a variable with the replacement and substitute it.  */
 385       type = TREE_TYPE (def);
 386       recip_def = create_tmp_reg (type, "reciptmp");
 387       new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
 388                                       build_one_cst (type), def);
 389
 390       if (occ->bb_has_division)
 391         {
 392           /* Case 1: insert before an existing division.  */
 393           gsi = gsi_after_labels (occ->bb);
 394           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 395             gsi_next (&gsi);
 396
 397           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 398         }
 399       else if (def_gsi && occ->bb == def_gsi->bb)
 400         {
 401           /* Case 2: insert right after the definition.  Note that this will
 402              never happen if the definition statement can throw, because in
 403              that case the sole successor of the statement's basic block will
 404              dominate all the uses as well.  */
 405           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 406         }
 407       else
 408         {
 409           /* Case 3: insert in a basic block not containing defs/uses.  */
 410           gsi = gsi_after_labels (occ->bb);
 411           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 412         }
 413
 414       reciprocal_stats.rdivs_inserted++;
 415
 416       occ->recip_def_stmt = new_stmt;
 417     }
 418
 419   occ->recip_def = recip_def;
 420   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 421     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 422 }
 423
 424
 425 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 426    possible.  */
 427
 428 static inline void
 429 replace_reciprocal (use_operand_p use_p)
 430 {
 431   gimple use_stmt = USE_STMT (use_p);
 432   basic_block bb = gimple_bb (use_stmt);
 433   struct occurrence *occ = (struct occurrence *) bb->aux;
 434
 435   if (optimize_bb_for_speed_p (bb)
 436       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 437     {
 438       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 439       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 440       SET_USE (use_p, occ->recip_def);
 441       fold_stmt_inplace (&gsi);
 442       update_stmt (use_stmt);
 443     }
 444 }
 445
 446
 447 /* Free OCC and return one more "struct occurrence" to be freed.  */
 448
 449 static struct occurrence *
 450 free_bb (struct occurrence *occ)
 451 {
 452   struct occurrence *child, *next;
 453
 454   /* First get the two pointers hanging off OCC.  */
 455   next = occ->next;
 456   child = occ->children;
 457   occ->bb->aux = NULL;
 458   pool_free (occ_pool, occ);
 459
 460   /* Now ensure that we don't recurse unless it is necessary.  */
 461   if (!child)
 462     return next;
 463   else
 464     {
 465       while (next)
 466         next = free_bb (next);
 467
 468       return child;
 469     }
 470 }
 471
 472
 473 /* Look for floating-point divisions among DEF's uses, and try to
 474    replace them by multiplications with the reciprocal.  Add
 475    as many statements computing the reciprocal as needed.
 476
 477    DEF must be a GIMPLE register of a floating-point type.  */
 478
 479 static void
 480 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 481 {
 482   use_operand_p use_p;
 483   imm_use_iterator use_iter;
 484   struct occurrence *occ;
 485   int count = 0, threshold;
 486
 487   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 488
 489   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 490     {
 491       gimple use_stmt = USE_STMT (use_p);
 492       if (is_division_by (use_stmt, def))
 493         {
 494           register_division_in (gimple_bb (use_stmt));
 495           count++;
 496         }
 497     }
 498
 499   /* Do the expensive part only if we can hope to optimize something.  */
 500   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 501   if (count >= threshold)
 502     {
 503       gimple use_stmt;
 504       for (occ = occ_head; occ; occ = occ->next)
 505         {
 506           compute_merit (occ);
 507           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 508         }
 509
 510       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 511         {
 512           if (is_division_by (use_stmt, def))
 513             {
 514               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 515                 replace_reciprocal (use_p);
 516             }
 517         }
 518     }
 519
 520   for (occ = occ_head; occ; )
 521     occ = free_bb (occ);
 522
 523   occ_head = NULL;
 524 }
 525
 526 /* Go through all the floating-point SSA_NAMEs, and call
 527    execute_cse_reciprocals_1 on each of them.  */
 528 namespace {
 529
 530 const pass_data pass_data_cse_reciprocals =
 531 {
 532   GIMPLE_PASS, /* type */
 533   "recip", /* name */
 534   OPTGROUP_NONE, /* optinfo_flags */
 535   TV_NONE, /* tv_id */
 536   PROP_ssa, /* properties_required */
 537   0, /* properties_provided */
 538   0, /* properties_destroyed */
 539   0, /* todo_flags_start */
 540   TODO_update_ssa, /* todo_flags_finish */
 541 };
 542
 543 class pass_cse_reciprocals : public gimple_opt_pass
 544 {
 545 public:
 546   pass_cse_reciprocals (gcc::context *ctxt)
 547     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 548   {}
 549
 550   /* opt_pass methods: */
 551   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
 552   virtual unsigned int execute (function *);
 553
 554 }; // class pass_cse_reciprocals
 555
 556 unsigned int
 557 pass_cse_reciprocals::execute (function *fun)
 558 {
 559   basic_block bb;
 560   tree arg;
 561
 562   occ_pool = create_alloc_pool ("dominators for recip",
 563                                 sizeof (struct occurrence),
 564                                 n_basic_blocks_for_fn (fun) / 3 + 1);
 565
 566   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 567   calculate_dominance_info (CDI_DOMINATORS);
 568   calculate_dominance_info (CDI_POST_DOMINATORS);
 569
 570 #ifdef ENABLE_CHECKING
 571   FOR_EACH_BB_FN (bb, fun)
 572     gcc_assert (!bb->aux);
 573 #endif
 574
 575   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
 576     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 577         && is_gimple_reg (arg))
 578       {
 579         tree name = ssa_default_def (fun, arg);
 580         if (name)
 581           execute_cse_reciprocals_1 (NULL, name);
 582       }
 583
 584   FOR_EACH_BB_FN (bb, fun)
 585     {
 586       tree def;
 587
 588       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 589            gsi_next (&gsi))
 590         {
 591           gphi *phi = gsi.phi ();
 592           def = PHI_RESULT (phi);
 593           if (! virtual_operand_p (def)
 594               && FLOAT_TYPE_P (TREE_TYPE (def)))
 595             execute_cse_reciprocals_1 (NULL, def);
 596         }
 597
 598       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 599            gsi_next (&gsi))
 600         {
 601           gimple stmt = gsi_stmt (gsi);
 602
 603           if (gimple_has_lhs (stmt)
 604               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 605               && FLOAT_TYPE_P (TREE_TYPE (def))
 606               && TREE_CODE (def) == SSA_NAME)
 607             execute_cse_reciprocals_1 (&gsi, def);
 608         }
 609
 610       if (optimize_bb_for_size_p (bb))
 611         continue;
 612
 613       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 614       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 615            gsi_next (&gsi))
 616         {
 617           gimple stmt = gsi_stmt (gsi);
 618           tree fndecl;
 619
 620           if (is_gimple_assign (stmt)
 621               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 622             {
 623               tree arg1 = gimple_assign_rhs2 (stmt);
 624               gimple stmt1;
 625
 626               if (TREE_CODE (arg1) != SSA_NAME)
 627                 continue;
 628
 629               stmt1 = SSA_NAME_DEF_STMT (arg1);
 630
 631               if (is_gimple_call (stmt1)
 632                   && gimple_call_lhs (stmt1)
 633                   && (fndecl = gimple_call_fndecl (stmt1))
 634                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 635                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 636                 {
 637                   enum built_in_function code;
 638                   bool md_code, fail;
 639                   imm_use_iterator ui;
 640                   use_operand_p use_p;
 641
 642                   code = DECL_FUNCTION_CODE (fndecl);
 643                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 644
 645                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 646                   if (!fndecl)
 647                     continue;
 648
 649                   /* Check that all uses of the SSA name are divisions,
 650                      otherwise replacing the defining statement will do
 651                      the wrong thing.  */
 652                   fail = false;
 653                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 654                     {
 655                       gimple stmt2 = USE_STMT (use_p);
 656                       if (is_gimple_debug (stmt2))
 657                         continue;
 658                       if (!is_gimple_assign (stmt2)
 659                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 660                           || gimple_assign_rhs1 (stmt2) == arg1
 661                           || gimple_assign_rhs2 (stmt2) != arg1)
 662                         {
 663                           fail = true;
 664                           break;
 665                         }
 666                     }
 667                   if (fail)
 668                     continue;
 669
 670                   gimple_replace_ssa_lhs (stmt1, arg1);
 671                   gimple_call_set_fndecl (stmt1, fndecl);
 672                   update_stmt (stmt1);
 673                   reciprocal_stats.rfuncs_inserted++;
 674
 675                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 676                     {
 677                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 678                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 679                       fold_stmt_inplace (&gsi);
 680                       update_stmt (stmt);
 681                     }
 682                 }
 683             }
 684         }
 685     }
 686
 687   statistics_counter_event (fun, "reciprocal divs inserted",
 688                             reciprocal_stats.rdivs_inserted);
 689   statistics_counter_event (fun, "reciprocal functions inserted",
 690                             reciprocal_stats.rfuncs_inserted);
 691
 692   free_dominance_info (CDI_DOMINATORS);
 693   free_dominance_info (CDI_POST_DOMINATORS);
 694   free_alloc_pool (occ_pool);
 695   return 0;
 696 }
 697
 698 } // anon namespace
 699
 700 gimple_opt_pass *
 701 make_pass_cse_reciprocals (gcc::context *ctxt)
 702 {
 703   return new pass_cse_reciprocals (ctxt);
 704 }
 705
 706 /* Records an occurrence at statement USE_STMT in the vector of trees
 707    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 708    is not yet initialized.  Returns true if the occurrence was pushed on
 709    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 710    statements in the vector.  */
 711
 712 static bool
 713 maybe_record_sincos (vec<gimple> *stmts,
 714                      basic_block *top_bb, gimple use_stmt)
 715 {
 716   basic_block use_bb = gimple_bb (use_stmt);
 717   if (*top_bb
 718       && (*top_bb == use_bb
 719           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 720     stmts->safe_push (use_stmt);
 721   else if (!*top_bb
 722            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 723     {
 724       stmts->safe_push (use_stmt);
 725       *top_bb = use_bb;
 726     }
 727   else
 728     return false;
 729
 730   return true;
 731 }
 732
 733 /* Look for sin, cos and cexpi calls with the same argument NAME and
 734    create a single call to cexpi CSEing the result in this case.
 735    We first walk over all immediate uses of the argument collecting
 736    statements that we can CSE in a vector and in a second pass replace
 737    the statement rhs with a REALPART or IMAGPART expression on the
 738    result of the cexpi call we insert before the use statement that
 739    dominates all other candidates.  */
 740
 741 static bool
 742 execute_cse_sincos_1 (tree name)
 743 {
 744   gimple_stmt_iterator gsi;
 745   imm_use_iterator use_iter;
 746   tree fndecl, res, type;
 747   gimple def_stmt, use_stmt, stmt;
 748   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 749   auto_vec<gimple> stmts;
 750   basic_block top_bb = NULL;
 751   int i;
 752   bool cfg_changed = false;
 753
 754   type = TREE_TYPE (name);
 755   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 756     {
 757       if (gimple_code (use_stmt) != GIMPLE_CALL
 758           || !gimple_call_lhs (use_stmt)
 759           || !(fndecl = gimple_call_fndecl (use_stmt))
 760           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 761         continue;
 762
 763       switch (DECL_FUNCTION_CODE (fndecl))
 764         {
 765         CASE_FLT_FN (BUILT_IN_COS):
 766           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 767           break;
 768
 769         CASE_FLT_FN (BUILT_IN_SIN):
 770           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 771           break;
 772
 773         CASE_FLT_FN (BUILT_IN_CEXPI):
 774           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 775           break;
 776
 777         default:;
 778         }
 779     }
 780
 781   if (seen_cos + seen_sin + seen_cexpi <= 1)
 782     return false;
 783
 784   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 785      the name def statement.  */
 786   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 787   if (!fndecl)
 788     return false;
 789   stmt = gimple_build_call (fndecl, 1, name);
 790   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 791   gimple_call_set_lhs (stmt, res);
 792
 793   def_stmt = SSA_NAME_DEF_STMT (name);
 794   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 795       && gimple_code (def_stmt) != GIMPLE_PHI
 796       && gimple_bb (def_stmt) == top_bb)
 797     {
 798       gsi = gsi_for_stmt (def_stmt);
 799       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 800     }
 801   else
 802     {
 803       gsi = gsi_after_labels (top_bb);
 804       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 805     }
 806   sincos_stats.inserted++;
 807
 808   /* And adjust the recorded old call sites.  */
 809   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 810     {
 811       tree rhs = NULL;
 812       fndecl = gimple_call_fndecl (use_stmt);
 813
 814       switch (DECL_FUNCTION_CODE (fndecl))
 815         {
 816         CASE_FLT_FN (BUILT_IN_COS):
 817           rhs = fold_build1 (REALPART_EXPR, type, res);
 818           break;
 819
 820         CASE_FLT_FN (BUILT_IN_SIN):
 821           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 822           break;
 823
 824         CASE_FLT_FN (BUILT_IN_CEXPI):
 825           rhs = res;
 826           break;
 827
 828         default:;
 829           gcc_unreachable ();
 830         }
 831
 832         /* Replace call with a copy.  */
 833         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 834
 835         gsi = gsi_for_stmt (use_stmt);
 836         gsi_replace (&gsi, stmt, true);
 837         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 838           cfg_changed = true;
 839     }
 840
 841   return cfg_changed;
 842 }
 843
 844 /* To evaluate powi(x,n), the floating point value x raised to the
 845    constant integer exponent n, we use a hybrid algorithm that
 846    combines the "window method" with look-up tables.  For an
 847    introduction to exponentiation algorithms and "addition chains",
 848    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 849    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 850    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 851    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 852
 853 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 854    multiplications to inline before calling the system library's pow
 855    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 856    so this default never requires calling pow, powf or powl.  */
 857
 858 #ifndef POWI_MAX_MULTS
 859 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 860 #endif
 861
 862 /* The size of the "optimal power tree" lookup table.  All
 863    exponents less than this value are simply looked up in the
 864    powi_table below.  This threshold is also used to size the
 865    cache of pseudo registers that hold intermediate results.  */
 866 #define POWI_TABLE_SIZE 256
 867
 868 /* The size, in bits of the window, used in the "window method"
 869    exponentiation algorithm.  This is equivalent to a radix of
 870    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 871 #define POWI_WINDOW_SIZE 3
 872
 873 /* The following table is an efficient representation of an
 874    "optimal power tree".  For each value, i, the corresponding
 875    value, j, in the table states than an optimal evaluation
 876    sequence for calculating pow(x,i) can be found by evaluating
 877    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 878    100 integers is given in Knuth's "Seminumerical algorithms".  */
 879
 880 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 881   {
 882       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 883       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 884       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 885      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 886      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 887      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 888      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 889      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 890      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 891      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 892      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 893      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 894      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 895      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 896      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 897      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 898      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 899      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 900      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 901      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 902      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 903      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 904      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 905      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 906      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 907     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 908     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 909     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 910     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 911     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 912     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 913     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 914   };
 915
 916
 917 /* Return the number of multiplications required to calculate
 918    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 919    subroutine of powi_cost.  CACHE is an array indicating
 920    which exponents have already been calculated.  */
 921
 922 static int
 923 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 924 {
 925   /* If we've already calculated this exponent, then this evaluation
 926      doesn't require any additional multiplications.  */
 927   if (cache[n])
 928     return 0;
 929
 930   cache[n] = true;
 931   return powi_lookup_cost (n - powi_table[n], cache)
 932          + powi_lookup_cost (powi_table[n], cache) + 1;
 933 }
 934
 935 /* Return the number of multiplications required to calculate
 936    powi(x,n) for an arbitrary x, given the exponent N.  This
 937    function needs to be kept in sync with powi_as_mults below.  */
 938
 939 static int
 940 powi_cost (HOST_WIDE_INT n)
 941 {
 942   bool cache[POWI_TABLE_SIZE];
 943   unsigned HOST_WIDE_INT digit;
 944   unsigned HOST_WIDE_INT val;
 945   int result;
 946
 947   if (n == 0)
 948     return 0;
 949
 950   /* Ignore the reciprocal when calculating the cost.  */
 951   val = (n < 0) ? -n : n;
 952
 953   /* Initialize the exponent cache.  */
 954   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 955   cache[1] = true;
 956
 957   result = 0;
 958
 959   while (val >= POWI_TABLE_SIZE)
 960     {
 961       if (val & 1)
 962         {
 963           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 964           result += powi_lookup_cost (digit, cache)
 965                     + POWI_WINDOW_SIZE + 1;
 966           val >>= POWI_WINDOW_SIZE;
 967         }
 968       else
 969         {
 970           val >>= 1;
 971           result++;
 972         }
 973     }
 974
 975   return result + powi_lookup_cost (val, cache);
 976 }
 977
 978 /* Recursive subroutine of powi_as_mults.  This function takes the
 979    array, CACHE, of already calculated exponents and an exponent N and
 980    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 981
 982 static tree
 983 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 984                  HOST_WIDE_INT n, tree *cache)
 985 {
 986   tree op0, op1, ssa_target;
 987   unsigned HOST_WIDE_INT digit;
 988   gassign *mult_stmt;
 989
 990   if (n < POWI_TABLE_SIZE && cache[n])
 991     return cache[n];
 992
 993   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 994
 995   if (n < POWI_TABLE_SIZE)
 996     {
 997       cache[n] = ssa_target;
 998       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 999       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1000     }
1001   else if (n & 1)
1002     {
1003       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1004       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1005       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1006     }
1007   else
1008     {
1009       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1010       op1 = op0;
1011     }
1012
1013   mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1014   gimple_set_location (mult_stmt, loc);
1015   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1016
1017   return ssa_target;
1018 }
1019
1020 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1021    This function needs to be kept in sync with powi_cost above.  */
1022
1023 static tree
1024 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1025                tree arg0, HOST_WIDE_INT n)
1026 {
1027   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1028   gassign *div_stmt;
1029   tree target;
1030
1031   if (n == 0)
1032     return build_real (type, dconst1);
1033
1034   memset (cache, 0,  sizeof (cache));
1035   cache[1] = arg0;
1036
1037   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1038   if (n >= 0)
1039     return result;
1040
1041   /* If the original exponent was negative, reciprocate the result.  */
1042   target = make_temp_ssa_name (type, NULL, "powmult");
1043   div_stmt = gimple_build_assign (target, RDIV_EXPR,
1044                                   build_real (type, dconst1), result);
1045   gimple_set_location (div_stmt, loc);
1046   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1047
1048   return target;
1049 }
1050
1051 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1052    location info LOC.  If the arguments are appropriate, create an
1053    equivalent sequence of statements prior to GSI using an optimal
1054    number of multiplications, and return an expession holding the
1055    result.  */
1056
1057 static tree
1058 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1059                             tree arg0, HOST_WIDE_INT n)
1060 {
1061   /* Avoid largest negative number.  */
1062   if (n != -n
1063       && ((n >= -1 && n <= 2)
1064           || (optimize_function_for_speed_p (cfun)
1065               && powi_cost (n) <= POWI_MAX_MULTS)))
1066     return powi_as_mults (gsi, loc, arg0, n);
1067
1068   return NULL_TREE;
1069 }
1070
1071 /* Build a gimple call statement that calls FN with argument ARG.
1072    Set the lhs of the call statement to a fresh SSA name.  Insert the
1073    statement prior to GSI's current position, and return the fresh
1074    SSA name.  */
1075
1076 static tree
1077 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1078                        tree fn, tree arg)
1079 {
1080   gcall *call_stmt;
1081   tree ssa_target;
1082
1083   call_stmt = gimple_build_call (fn, 1, arg);
1084   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1085   gimple_set_lhs (call_stmt, ssa_target);
1086   gimple_set_location (call_stmt, loc);
1087   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1088
1089   return ssa_target;
1090 }
1091
1092 /* Build a gimple binary operation with the given CODE and arguments
1093    ARG0, ARG1, assigning the result to a new SSA name for variable
1094    TARGET.  Insert the statement prior to GSI's current position, and
1095    return the fresh SSA name.*/
1096
1097 static tree
1098 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1099                         const char *name, enum tree_code code,
1100                         tree arg0, tree arg1)
1101 {
1102   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1103   gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1104   gimple_set_location (stmt, loc);
1105   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1106   return result;
1107 }
1108
1109 /* Build a gimple reference operation with the given CODE and argument
1110    ARG, assigning the result to a new SSA name of TYPE with NAME.
1111    Insert the statement prior to GSI's current position, and return
1112    the fresh SSA name.  */
1113
1114 static inline tree
1115 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1116                       const char *name, enum tree_code code, tree arg0)
1117 {
1118   tree result = make_temp_ssa_name (type, NULL, name);
1119   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1120   gimple_set_location (stmt, loc);
1121   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1122   return result;
1123 }
1124
1125 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1126    prior to GSI's current position, and return the fresh SSA name.  */
1127
1128 static tree
1129 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1130                        tree type, tree val)
1131 {
1132   tree result = make_ssa_name (type);
1133   gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1134   gimple_set_location (stmt, loc);
1135   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1136   return result;
1137 }
1138
1139 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1140    with location info LOC.  If possible, create an equivalent and
1141    less expensive sequence of statements prior to GSI, and return an
1142    expession holding the result.  */
1143
1144 static tree
1145 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1146                            tree arg0, tree arg1)
1147 {
1148   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1149   REAL_VALUE_TYPE c2, dconst3;
1150   HOST_WIDE_INT n;
1151   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1152   machine_mode mode;
1153   bool hw_sqrt_exists, c_is_int, c2_is_int;
1154
1155   /* If the exponent isn't a constant, there's nothing of interest
1156      to be done.  */
1157   if (TREE_CODE (arg1) != REAL_CST)
1158     return NULL_TREE;
1159
1160   /* If the exponent is equivalent to an integer, expand to an optimal
1161      multiplication sequence when profitable.  */
1162   c = TREE_REAL_CST (arg1);
1163   n = real_to_integer (&c);
1164   real_from_integer (&cint, VOIDmode, n, SIGNED);
1165   c_is_int = real_identical (&c, &cint);
1166
1167   if (c_is_int
1168       && ((n >= -1 && n <= 2)
1169           || (flag_unsafe_math_optimizations
1170               && optimize_bb_for_speed_p (gsi_bb (*gsi))
1171               && powi_cost (n) <= POWI_MAX_MULTS)))
1172     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1173
1174   /* Attempt various optimizations using sqrt and cbrt.  */
1175   type = TREE_TYPE (arg0);
1176   mode = TYPE_MODE (type);
1177   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1178
1179   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1180      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1181      sqrt(-0) = -0.  */
1182   if (sqrtfn
1183       && REAL_VALUES_EQUAL (c, dconsthalf)
1184       && !HONOR_SIGNED_ZEROS (mode))
1185     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1186
1187   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1188      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1189      so do this optimization even if -Os.  Don't do this optimization
1190      if we don't have a hardware sqrt insn.  */
1191   dconst1_4 = dconst1;
1192   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1193   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1194
1195   if (flag_unsafe_math_optimizations
1196       && sqrtfn
1197       && REAL_VALUES_EQUAL (c, dconst1_4)
1198       && hw_sqrt_exists)
1199     {
1200       /* sqrt(x)  */
1201       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1202
1203       /* sqrt(sqrt(x))  */
1204       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1205     }
1206
1207   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1208      optimizing for space.  Don't do this optimization if we don't have
1209      a hardware sqrt insn.  */
1210   real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
1211   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1212
1213   if (flag_unsafe_math_optimizations
1214       && sqrtfn
1215       && optimize_function_for_speed_p (cfun)
1216       && REAL_VALUES_EQUAL (c, dconst3_4)
1217       && hw_sqrt_exists)
1218     {
1219       /* sqrt(x)  */
1220       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1221
1222       /* sqrt(sqrt(x))  */
1223       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1224
1225       /* sqrt(x) * sqrt(sqrt(x))  */
1226       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1227                                      sqrt_arg0, sqrt_sqrt);
1228     }
1229
1230   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1231      optimizations since 1./3. is not exactly representable.  If x
1232      is negative and finite, the correct value of pow(x,1./3.) is
1233      a NaN with the "invalid" exception raised, because the value
1234      of 1./3. actually has an even denominator.  The correct value
1235      of cbrt(x) is a negative real value.  */
1236   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1237   dconst1_3 = real_value_truncate (mode, dconst_third ());
1238
1239   if (flag_unsafe_math_optimizations
1240       && cbrtfn
1241       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1242       && REAL_VALUES_EQUAL (c, dconst1_3))
1243     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1244
1245   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1246      if we don't have a hardware sqrt insn.  */
1247   dconst1_6 = dconst1_3;
1248   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1249
1250   if (flag_unsafe_math_optimizations
1251       && sqrtfn
1252       && cbrtfn
1253       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1254       && optimize_function_for_speed_p (cfun)
1255       && hw_sqrt_exists
1256       && REAL_VALUES_EQUAL (c, dconst1_6))
1257     {
1258       /* sqrt(x)  */
1259       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1260
1261       /* cbrt(sqrt(x))  */
1262       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1263     }
1264
1265   /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1266      and c not an integer, into
1267
1268        sqrt(x) * powi(x, n/2),                n > 0;
1269        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1270
1271      Do not calculate the powi factor when n/2 = 0.  */
1272   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1273   n = real_to_integer (&c2);
1274   real_from_integer (&cint, VOIDmode, n, SIGNED);
1275   c2_is_int = real_identical (&c2, &cint);
1276
1277   if (flag_unsafe_math_optimizations
1278       && sqrtfn
1279       && c2_is_int
1280       && !c_is_int
1281       && optimize_function_for_speed_p (cfun))
1282     {
1283       tree powi_x_ndiv2 = NULL_TREE;
1284
1285       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1286          possible or profitable, give up.  Skip the degenerate case when
1287          n is 1 or -1, where the result is always 1.  */
1288       if (absu_hwi (n) != 1)
1289         {
1290           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1291                                                      abs_hwi (n / 2));
1292           if (!powi_x_ndiv2)
1293             return NULL_TREE;
1294         }
1295
1296       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1297          result of the optimal multiply sequence just calculated.  */
1298       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1299
1300       if (absu_hwi (n) == 1)
1301         result = sqrt_arg0;
1302       else
1303         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1304                                          sqrt_arg0, powi_x_ndiv2);
1305
1306       /* If n is negative, reciprocate the result.  */
1307       if (n < 0)
1308         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1309                                          build_real (type, dconst1), result);
1310       return result;
1311     }
1312
1313   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1314
1315      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1316      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1317
1318      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1319      different from pow(x, 1./3.) due to rounding and behavior with
1320      negative x, we need to constrain this transformation to unsafe
1321      math and positive x or finite math.  */
1322   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1323   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1324   real_round (&c2, mode, &c2);
1325   n = real_to_integer (&c2);
1326   real_from_integer (&cint, VOIDmode, n, SIGNED);
1327   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1328   real_convert (&c2, mode, &c2);
1329
1330   if (flag_unsafe_math_optimizations
1331       && cbrtfn
1332       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1333       && real_identical (&c2, &c)
1334       && !c2_is_int
1335       && optimize_function_for_speed_p (cfun)
1336       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1337     {
1338       tree powi_x_ndiv3 = NULL_TREE;
1339
1340       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1341          possible or profitable, give up.  Skip the degenerate case when
1342          abs(n) < 3, where the result is always 1.  */
1343       if (absu_hwi (n) >= 3)
1344         {
1345           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1346                                                      abs_hwi (n / 3));
1347           if (!powi_x_ndiv3)
1348             return NULL_TREE;
1349         }
1350
1351       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1352          as that creates an unnecessary variable.  Instead, just produce
1353          either cbrt(x) or cbrt(x) * cbrt(x).  */
1354       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1355
1356       if (absu_hwi (n) % 3 == 1)
1357         powi_cbrt_x = cbrt_x;
1358       else
1359         powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1360                                               cbrt_x, cbrt_x);
1361
1362       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1363       if (absu_hwi (n) < 3)
1364         result = powi_cbrt_x;
1365       else
1366         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1367                                          powi_x_ndiv3, powi_cbrt_x);
1368
1369       /* If n is negative, reciprocate the result.  */
1370       if (n < 0)
1371         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1372                                          build_real (type, dconst1), result);
1373
1374       return result;
1375     }
1376
1377   /* No optimizations succeeded.  */
1378   return NULL_TREE;
1379 }
1380
1381 /* ARG is the argument to a cabs builtin call in GSI with location info
1382    LOC.  Create a sequence of statements prior to GSI that calculates
1383    sqrt(R*R + I*I), where R and I are the real and imaginary components
1384    of ARG, respectively.  Return an expression holding the result.  */
1385
1386 static tree
1387 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1388 {
1389   tree real_part, imag_part, addend1, addend2, sum, result;
1390   tree type = TREE_TYPE (TREE_TYPE (arg));
1391   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1392   machine_mode mode = TYPE_MODE (type);
1393
1394   if (!flag_unsafe_math_optimizations
1395       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1396       || !sqrtfn
1397       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1398     return NULL_TREE;
1399
1400   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1401                                     REALPART_EXPR, arg);
1402   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1403                                     real_part, real_part);
1404   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1405                                     IMAGPART_EXPR, arg);
1406   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1407                                     imag_part, imag_part);
1408   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1409   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1410
1411   return result;
1412 }
1413
1414 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1415    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1416    an optimal number of multiplies, when n is a constant.  */
1417
1418 namespace {
1419
1420 const pass_data pass_data_cse_sincos =
1421 {
1422   GIMPLE_PASS, /* type */
1423   "sincos", /* name */
1424   OPTGROUP_NONE, /* optinfo_flags */
1425   TV_NONE, /* tv_id */
1426   PROP_ssa, /* properties_required */
1427   0, /* properties_provided */
1428   0, /* properties_destroyed */
1429   0, /* todo_flags_start */
1430   TODO_update_ssa, /* todo_flags_finish */
1431 };
1432
1433 class pass_cse_sincos : public gimple_opt_pass
1434 {
1435 public:
1436   pass_cse_sincos (gcc::context *ctxt)
1437     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1438   {}
1439
1440   /* opt_pass methods: */
1441   virtual bool gate (function *)
1442     {
1443       /* We no longer require either sincos or cexp, since powi expansion
1444          piggybacks on this pass.  */
1445       return optimize;
1446     }
1447
1448   virtual unsigned int execute (function *);
1449
1450 }; // class pass_cse_sincos
1451
1452 unsigned int
1453 pass_cse_sincos::execute (function *fun)
1454 {
1455   basic_block bb;
1456   bool cfg_changed = false;
1457
1458   calculate_dominance_info (CDI_DOMINATORS);
1459   memset (&sincos_stats, 0, sizeof (sincos_stats));
1460
1461   FOR_EACH_BB_FN (bb, fun)
1462     {
1463       gimple_stmt_iterator gsi;
1464       bool cleanup_eh = false;
1465
1466       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1467         {
1468           gimple stmt = gsi_stmt (gsi);
1469           tree fndecl;
1470
1471           /* Only the last stmt in a bb could throw, no need to call
1472              gimple_purge_dead_eh_edges if we change something in the middle
1473              of a basic block.  */
1474           cleanup_eh = false;
1475
1476           if (is_gimple_call (stmt)
1477               && gimple_call_lhs (stmt)
1478               && (fndecl = gimple_call_fndecl (stmt))
1479               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1480             {
1481               tree arg, arg0, arg1, result;
1482               HOST_WIDE_INT n;
1483               location_t loc;
1484
1485               switch (DECL_FUNCTION_CODE (fndecl))
1486                 {
1487                 CASE_FLT_FN (BUILT_IN_COS):
1488                 CASE_FLT_FN (BUILT_IN_SIN):
1489                 CASE_FLT_FN (BUILT_IN_CEXPI):
1490                   /* Make sure we have either sincos or cexp.  */
1491                   if (!targetm.libc_has_function (function_c99_math_complex)
1492                       && !targetm.libc_has_function (function_sincos))
1493                     break;
1494
1495                   arg = gimple_call_arg (stmt, 0);
1496                   if (TREE_CODE (arg) == SSA_NAME)
1497                     cfg_changed |= execute_cse_sincos_1 (arg);
1498                   break;
1499
1500                 CASE_FLT_FN (BUILT_IN_POW):
1501                   arg0 = gimple_call_arg (stmt, 0);
1502                   arg1 = gimple_call_arg (stmt, 1);
1503
1504                   loc = gimple_location (stmt);
1505                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1506
1507                   if (result)
1508                     {
1509                       tree lhs = gimple_get_lhs (stmt);
1510                       gassign *new_stmt = gimple_build_assign (lhs, result);
1511                       gimple_set_location (new_stmt, loc);
1512                       unlink_stmt_vdef (stmt);
1513                       gsi_replace (&gsi, new_stmt, true);
1514                       cleanup_eh = true;
1515                       if (gimple_vdef (stmt))
1516                         release_ssa_name (gimple_vdef (stmt));
1517                     }
1518                   break;
1519
1520                 CASE_FLT_FN (BUILT_IN_POWI):
1521                   arg0 = gimple_call_arg (stmt, 0);
1522                   arg1 = gimple_call_arg (stmt, 1);
1523                   loc = gimple_location (stmt);
1524
1525                   if (real_minus_onep (arg0))
1526                     {
1527                       tree t0, t1, cond, one, minus_one;
1528                       gassign *stmt;
1529
1530                       t0 = TREE_TYPE (arg0);
1531                       t1 = TREE_TYPE (arg1);
1532                       one = build_real (t0, dconst1);
1533                       minus_one = build_real (t0, dconstm1);
1534
1535                       cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1536                       stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1537                                                   arg1, build_int_cst (t1, 1));
1538                       gimple_set_location (stmt, loc);
1539                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1540
1541                       result = make_temp_ssa_name (t0, NULL, "powi");
1542                       stmt = gimple_build_assign (result, COND_EXPR, cond,
1543                                                   minus_one, one);
1544                       gimple_set_location (stmt, loc);
1545                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1546                     }
1547                   else
1548                     {
1549                       if (!tree_fits_shwi_p (arg1))
1550                         break;
1551
1552                       n = tree_to_shwi (arg1);
1553                       result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1554                     }
1555
1556                   if (result)
1557                     {
1558                       tree lhs = gimple_get_lhs (stmt);
1559                       gassign *new_stmt = gimple_build_assign (lhs, result);
1560                       gimple_set_location (new_stmt, loc);
1561                       unlink_stmt_vdef (stmt);
1562                       gsi_replace (&gsi, new_stmt, true);
1563                       cleanup_eh = true;
1564                       if (gimple_vdef (stmt))
1565                         release_ssa_name (gimple_vdef (stmt));
1566                     }
1567                   break;
1568
1569                 CASE_FLT_FN (BUILT_IN_CABS):
1570                   arg0 = gimple_call_arg (stmt, 0);
1571                   loc = gimple_location (stmt);
1572                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1573
1574                   if (result)
1575                     {
1576                       tree lhs = gimple_get_lhs (stmt);
1577                       gassign *new_stmt = gimple_build_assign (lhs, result);
1578                       gimple_set_location (new_stmt, loc);
1579                       unlink_stmt_vdef (stmt);
1580                       gsi_replace (&gsi, new_stmt, true);
1581                       cleanup_eh = true;
1582                       if (gimple_vdef (stmt))
1583                         release_ssa_name (gimple_vdef (stmt));
1584                     }
1585                   break;
1586
1587                 default:;
1588                 }
1589             }
1590         }
1591       if (cleanup_eh)
1592         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1593     }
1594
1595   statistics_counter_event (fun, "sincos statements inserted",
1596                             sincos_stats.inserted);
1597
1598   free_dominance_info (CDI_DOMINATORS);
1599   return cfg_changed ? TODO_cleanup_cfg : 0;
1600 }
1601
1602 } // anon namespace
1603
1604 gimple_opt_pass *
1605 make_pass_cse_sincos (gcc::context *ctxt)
1606 {
1607   return new pass_cse_sincos (ctxt);
1608 }
1609
1610 /* A symbolic number is used to detect byte permutation and selection
1611    patterns.  Therefore the field N contains an artificial number
1612    consisting of octet sized markers:
1613
1614    0    - target byte has the value 0
1615    FF   - target byte has an unknown value (eg. due to sign extension)
1616    1..size - marker value is the target byte index minus one.
1617
1618    To detect permutations on memory sources (arrays and structures), a symbolic
1619    number is also associated a base address (the array or structure the load is
1620    made from), an offset from the base address and a range which gives the
1621    difference between the highest and lowest accessed memory location to make
1622    such a symbolic number. The range is thus different from size which reflects
1623    the size of the type of current expression. Note that for non memory source,
1624    range holds the same value as size.
1625
1626    For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1627    a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1628    still have a size of 2 but this time a range of 1.  */
1629
1630 struct symbolic_number {
1631   uint64_t n;
1632   tree type;
1633   tree base_addr;
1634   tree offset;
1635   HOST_WIDE_INT bytepos;
1636   tree alias_set;
1637   tree vuse;
1638   unsigned HOST_WIDE_INT range;
1639 };
1640
1641 #define BITS_PER_MARKER 8
1642 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1643 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1644 #define HEAD_MARKER(n, size) \
1645   ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1646
1647 /* The number which the find_bswap_or_nop_1 result should match in
1648    order to have a nop.  The number is masked according to the size of
1649    the symbolic number before using it.  */
1650 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1651   (uint64_t)0x08070605 << 32 | 0x04030201)
1652
1653 /* The number which the find_bswap_or_nop_1 result should match in
1654    order to have a byte swap.  The number is masked according to the
1655    size of the symbolic number before using it.  */
1656 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1657   (uint64_t)0x01020304 << 32 | 0x05060708)
1658
1659 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1660    number N.  Return false if the requested operation is not permitted
1661    on a symbolic number.  */
1662
1663 static inline bool
1664 do_shift_rotate (enum tree_code code,
1665                  struct symbolic_number *n,
1666                  int count)
1667 {
1668   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1669   unsigned head_marker;
1670
1671   if (count % BITS_PER_UNIT != 0)
1672     return false;
1673   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1674
1675   /* Zero out the extra bits of N in order to avoid them being shifted
1676      into the significant bits.  */
1677   if (size < 64 / BITS_PER_MARKER)
1678     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1679
1680   switch (code)
1681     {
1682     case LSHIFT_EXPR:
1683       n->n <<= count;
1684       break;
1685     case RSHIFT_EXPR:
1686       head_marker = HEAD_MARKER (n->n, size);
1687       n->n >>= count;
1688       /* Arithmetic shift of signed type: result is dependent on the value.  */
1689       if (!TYPE_UNSIGNED (n->type) && head_marker)
1690         for (i = 0; i < count / BITS_PER_MARKER; i++)
1691           n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1692                   << ((size - 1 - i) * BITS_PER_MARKER);
1693       break;
1694     case LROTATE_EXPR:
1695       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1696       break;
1697     case RROTATE_EXPR:
1698       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1699       break;
1700     default:
1701       return false;
1702     }
1703   /* Zero unused bits for size.  */
1704   if (size < 64 / BITS_PER_MARKER)
1705     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1706   return true;
1707 }
1708
1709 /* Perform sanity checking for the symbolic number N and the gimple
1710    statement STMT.  */
1711
1712 static inline bool
1713 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1714 {
1715   tree lhs_type;
1716
1717   lhs_type = gimple_expr_type (stmt);
1718
1719   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1720     return false;
1721
1722   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1723     return false;
1724
1725   return true;
1726 }
1727
1728 /* Initialize the symbolic number N for the bswap pass from the base element
1729    SRC manipulated by the bitwise OR expression.  */
1730
1731 static bool
1732 init_symbolic_number (struct symbolic_number *n, tree src)
1733 {
1734   int size;
1735
1736   n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1737
1738   /* Set up the symbolic number N by setting each byte to a value between 1 and
1739      the byte size of rhs1.  The highest order byte is set to n->size and the
1740      lowest order byte to 1.  */
1741   n->type = TREE_TYPE (src);
1742   size = TYPE_PRECISION (n->type);
1743   if (size % BITS_PER_UNIT != 0)
1744     return false;
1745   size /= BITS_PER_UNIT;
1746   if (size > 64 / BITS_PER_MARKER)
1747     return false;
1748   n->range = size;
1749   n->n = CMPNOP;
1750
1751   if (size < 64 / BITS_PER_MARKER)
1752     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1753
1754   return true;
1755 }
1756
1757 /* Check if STMT might be a byte swap or a nop from a memory source and returns
1758    the answer. If so, REF is that memory source and the base of the memory area
1759    accessed and the offset of the access from that base are recorded in N.  */
1760
1761 bool
1762 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1763 {
1764   /* Leaf node is an array or component ref. Memorize its base and
1765      offset from base to compare to other such leaf node.  */
1766   HOST_WIDE_INT bitsize, bitpos;
1767   machine_mode mode;
1768   int unsignedp, volatilep;
1769   tree offset, base_addr;
1770
1771   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1772     return false;
1773
1774   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
1775                                    &unsignedp, &volatilep, false);
1776
1777   if (TREE_CODE (base_addr) == MEM_REF)
1778     {
1779       offset_int bit_offset = 0;
1780       tree off = TREE_OPERAND (base_addr, 1);
1781
1782       if (!integer_zerop (off))
1783         {
1784           offset_int boff, coff = mem_ref_offset (base_addr);
1785           boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
1786           bit_offset += boff;
1787         }
1788
1789       base_addr = TREE_OPERAND (base_addr, 0);
1790
1791       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
1792       if (wi::neg_p (bit_offset))
1793         {
1794           offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1795           offset_int tem = bit_offset.and_not (mask);
1796           /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1797              Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
1798           bit_offset -= tem;
1799           tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1800           if (offset)
1801             offset = size_binop (PLUS_EXPR, offset,
1802                                     wide_int_to_tree (sizetype, tem));
1803           else
1804             offset = wide_int_to_tree (sizetype, tem);
1805         }
1806
1807       bitpos += bit_offset.to_shwi ();
1808     }
1809
1810   if (bitpos % BITS_PER_UNIT)
1811     return false;
1812   if (bitsize % BITS_PER_UNIT)
1813     return false;
1814
1815   if (!init_symbolic_number (n, ref))
1816     return false;
1817   n->base_addr = base_addr;
1818   n->offset = offset;
1819   n->bytepos = bitpos / BITS_PER_UNIT;
1820   n->alias_set = reference_alias_ptr_type (ref);
1821   n->vuse = gimple_vuse (stmt);
1822   return true;
1823 }
1824
1825 /* Compute the symbolic number N representing the result of a bitwise OR on 2
1826    symbolic number N1 and N2 whose source statements are respectively
1827    SOURCE_STMT1 and SOURCE_STMT2.  */
1828
1829 static gimple
1830 perform_symbolic_merge (gimple source_stmt1, struct symbolic_number *n1,
1831                         gimple source_stmt2, struct symbolic_number *n2,
1832                         struct symbolic_number *n)
1833 {
1834   int i, size;
1835   uint64_t mask;
1836   gimple source_stmt;
1837   struct symbolic_number *n_start;
1838
1839   /* Sources are different, cancel bswap if they are not memory location with
1840      the same base (array, structure, ...).  */
1841   if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
1842     {
1843       int64_t inc;
1844       HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
1845       struct symbolic_number *toinc_n_ptr, *n_end;
1846
1847       if (!n1->base_addr || !n2->base_addr
1848           || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
1849         return NULL;
1850
1851       if (!n1->offset != !n2->offset ||
1852           (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
1853         return NULL;
1854
1855       if (n1->bytepos < n2->bytepos)
1856         {
1857           n_start = n1;
1858           start_sub = n2->bytepos - n1->bytepos;
1859           source_stmt = source_stmt1;
1860         }
1861       else
1862         {
1863           n_start = n2;
1864           start_sub = n1->bytepos - n2->bytepos;
1865           source_stmt = source_stmt2;
1866         }
1867
1868       /* Find the highest address at which a load is performed and
1869          compute related info.  */
1870       end1 = n1->bytepos + (n1->range - 1);
1871       end2 = n2->bytepos + (n2->range - 1);
1872       if (end1 < end2)
1873         {
1874           end = end2;
1875           end_sub = end2 - end1;
1876         }
1877       else
1878         {
1879           end = end1;
1880           end_sub = end1 - end2;
1881         }
1882       n_end = (end2 > end1) ? n2 : n1;
1883
1884       /* Find symbolic number whose lsb is the most significant.  */
1885       if (BYTES_BIG_ENDIAN)
1886         toinc_n_ptr = (n_end == n1) ? n2 : n1;
1887       else
1888         toinc_n_ptr = (n_start == n1) ? n2 : n1;
1889
1890       n->range = end - n_start->bytepos + 1;
1891
1892       /* Check that the range of memory covered can be represented by
1893          a symbolic number.  */
1894       if (n->range > 64 / BITS_PER_MARKER)
1895         return NULL;
1896
1897       /* Reinterpret byte marks in symbolic number holding the value of
1898          bigger weight according to target endianness.  */
1899       inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
1900       size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
1901       for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
1902         {
1903           unsigned marker =
1904             (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
1905           if (marker && marker != MARKER_BYTE_UNKNOWN)
1906             toinc_n_ptr->n += inc;
1907         }
1908     }
1909   else
1910     {
1911       n->range = n1->range;
1912       n_start = n1;
1913       source_stmt = source_stmt1;
1914     }
1915
1916   if (!n1->alias_set
1917       || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
1918     n->alias_set = n1->alias_set;
1919   else
1920     n->alias_set = ptr_type_node;
1921   n->vuse = n_start->vuse;
1922   n->base_addr = n_start->base_addr;
1923   n->offset = n_start->offset;
1924   n->bytepos = n_start->bytepos;
1925   n->type = n_start->type;
1926   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1927
1928   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
1929     {
1930       uint64_t masked1, masked2;
1931
1932       masked1 = n1->n & mask;
1933       masked2 = n2->n & mask;
1934       if (masked1 && masked2 && masked1 != masked2)
1935         return NULL;
1936     }
1937   n->n = n1->n | n2->n;
1938
1939   return source_stmt;
1940 }
1941
1942 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1943    the operation given by the rhs of STMT on the result.  If the operation
1944    could successfully be executed the function returns a gimple stmt whose
1945    rhs's first tree is the expression of the source operand and NULL
1946    otherwise.  */
1947
1948 static gimple
1949 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1950 {
1951   enum tree_code code;
1952   tree rhs1, rhs2 = NULL;
1953   gimple rhs1_stmt, rhs2_stmt, source_stmt1;
1954   enum gimple_rhs_class rhs_class;
1955
1956   if (!limit || !is_gimple_assign (stmt))
1957     return NULL;
1958
1959   rhs1 = gimple_assign_rhs1 (stmt);
1960
1961   if (find_bswap_or_nop_load (stmt, rhs1, n))
1962     return stmt;
1963
1964   if (TREE_CODE (rhs1) != SSA_NAME)
1965     return NULL;
1966
1967   code = gimple_assign_rhs_code (stmt);
1968   rhs_class = gimple_assign_rhs_class (stmt);
1969   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1970
1971   if (rhs_class == GIMPLE_BINARY_RHS)
1972     rhs2 = gimple_assign_rhs2 (stmt);
1973
1974   /* Handle unary rhs and binary rhs with integer constants as second
1975      operand.  */
1976
1977   if (rhs_class == GIMPLE_UNARY_RHS
1978       || (rhs_class == GIMPLE_BINARY_RHS
1979           && TREE_CODE (rhs2) == INTEGER_CST))
1980     {
1981       if (code != BIT_AND_EXPR
1982           && code != LSHIFT_EXPR
1983           && code != RSHIFT_EXPR
1984           && code != LROTATE_EXPR
1985           && code != RROTATE_EXPR
1986           && !CONVERT_EXPR_CODE_P (code))
1987         return NULL;
1988
1989       source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
1990
1991       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
1992          we have to initialize the symbolic number.  */
1993       if (!source_stmt1)
1994         {
1995           if (gimple_assign_load_p (stmt)
1996               || !init_symbolic_number (n, rhs1))
1997             return NULL;
1998           source_stmt1 = stmt;
1999         }
2000
2001       switch (code)
2002         {
2003         case BIT_AND_EXPR:
2004           {
2005             int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2006             uint64_t val = int_cst_value (rhs2), mask = 0;
2007             uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2008
2009             /* Only constants masking full bytes are allowed.  */
2010             for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2011               if ((val & tmp) != 0 && (val & tmp) != tmp)
2012                 return NULL;
2013               else if (val & tmp)
2014                 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2015
2016             n->n &= mask;
2017           }
2018           break;
2019         case LSHIFT_EXPR:
2020         case RSHIFT_EXPR:
2021         case LROTATE_EXPR:
2022         case RROTATE_EXPR:
2023           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
2024             return NULL;
2025           break;
2026         CASE_CONVERT:
2027           {
2028             int i, type_size, old_type_size;
2029             tree type;
2030
2031             type = gimple_expr_type (stmt);
2032             type_size = TYPE_PRECISION (type);
2033             if (type_size % BITS_PER_UNIT != 0)
2034               return NULL;
2035             type_size /= BITS_PER_UNIT;
2036             if (type_size > 64 / BITS_PER_MARKER)
2037               return NULL;
2038
2039             /* Sign extension: result is dependent on the value.  */
2040             old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2041             if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2042                 && HEAD_MARKER (n->n, old_type_size))
2043               for (i = 0; i < type_size - old_type_size; i++)
2044                 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2045                         << ((type_size - 1 - i) * BITS_PER_MARKER);
2046
2047             if (type_size < 64 / BITS_PER_MARKER)
2048               {
2049                 /* If STMT casts to a smaller type mask out the bits not
2050                    belonging to the target type.  */
2051                 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2052               }
2053             n->type = type;
2054             if (!n->base_addr)
2055               n->range = type_size;
2056           }
2057           break;
2058         default:
2059           return NULL;
2060         };
2061       return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2062     }
2063
2064   /* Handle binary rhs.  */
2065
2066   if (rhs_class == GIMPLE_BINARY_RHS)
2067     {
2068       struct symbolic_number n1, n2;
2069       gimple source_stmt, source_stmt2;
2070
2071       if (code != BIT_IOR_EXPR)
2072         return NULL;
2073
2074       if (TREE_CODE (rhs2) != SSA_NAME)
2075         return NULL;
2076
2077       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2078
2079       switch (code)
2080         {
2081         case BIT_IOR_EXPR:
2082           source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2083
2084           if (!source_stmt1)
2085             return NULL;
2086
2087           source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2088
2089           if (!source_stmt2)
2090             return NULL;
2091
2092           if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2093             return NULL;
2094
2095           if (!n1.vuse != !n2.vuse ||
2096           (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2097             return NULL;
2098
2099           source_stmt =
2100             perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2101
2102           if (!source_stmt)
2103             return NULL;
2104
2105           if (!verify_symbolic_number_p (n, stmt))
2106             return NULL;
2107
2108           break;
2109         default:
2110           return NULL;
2111         }
2112       return source_stmt;
2113     }
2114   return NULL;
2115 }
2116
2117 /* Check if STMT completes a bswap implementation or a read in a given
2118    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2119    accordingly.  It also sets N to represent the kind of operations
2120    performed: size of the resulting expression and whether it works on
2121    a memory source, and if so alias-set and vuse.  At last, the
2122    function returns a stmt whose rhs's first tree is the source
2123    expression.  */
2124
2125 static gimple
2126 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2127 {
2128 /* The number which the find_bswap_or_nop_1 result should match in order
2129    to have a full byte swap.  The number is shifted to the right
2130    according to the size of the symbolic number before using it.  */
2131   uint64_t cmpxchg = CMPXCHG;
2132   uint64_t cmpnop = CMPNOP;
2133
2134   gimple source_stmt;
2135   int limit;
2136
2137   /* The last parameter determines the depth search limit.  It usually
2138      correlates directly to the number n of bytes to be touched.  We
2139      increase that number by log2(n) + 1 here in order to also
2140      cover signed -> unsigned conversions of the src operand as can be seen
2141      in libgcc, and for initial shift/and operation of the src operand.  */
2142   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2143   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2144   source_stmt =  find_bswap_or_nop_1 (stmt, n, limit);
2145
2146   if (!source_stmt)
2147     return NULL;
2148
2149   /* Find real size of result (highest non zero byte).  */
2150   if (n->base_addr)
2151     {
2152       int rsize;
2153       uint64_t tmpn;
2154
2155       for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2156       n->range = rsize;
2157     }
2158
2159   /* Zero out the extra bits of N and CMP*.  */
2160   if (n->range < (int) sizeof (int64_t))
2161     {
2162       uint64_t mask;
2163
2164       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2165       cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2166       cmpnop &= mask;
2167     }
2168
2169   /* A complete byte swap should make the symbolic number to start with
2170      the largest digit in the highest order byte. Unchanged symbolic
2171      number indicates a read with same endianness as target architecture.  */
2172   if (n->n == cmpnop)
2173     *bswap = false;
2174   else if (n->n == cmpxchg)
2175     *bswap = true;
2176   else
2177     return NULL;
2178
2179   /* Useless bit manipulation performed by code.  */
2180   if (!n->base_addr && n->n == cmpnop)
2181     return NULL;
2182
2183   n->range *= BITS_PER_UNIT;
2184   return source_stmt;
2185 }
2186
2187 namespace {
2188
2189 const pass_data pass_data_optimize_bswap =
2190 {
2191   GIMPLE_PASS, /* type */
2192   "bswap", /* name */
2193   OPTGROUP_NONE, /* optinfo_flags */
2194   TV_NONE, /* tv_id */
2195   PROP_ssa, /* properties_required */
2196   0, /* properties_provided */
2197   0, /* properties_destroyed */
2198   0, /* todo_flags_start */
2199   0, /* todo_flags_finish */
2200 };
2201
2202 class pass_optimize_bswap : public gimple_opt_pass
2203 {
2204 public:
2205   pass_optimize_bswap (gcc::context *ctxt)
2206     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2207   {}
2208
2209   /* opt_pass methods: */
2210   virtual bool gate (function *)
2211     {
2212       return flag_expensive_optimizations && optimize;
2213     }
2214
2215   virtual unsigned int execute (function *);
2216
2217 }; // class pass_optimize_bswap
2218
2219 /* Perform the bswap optimization: replace the expression computed in the rhs
2220    of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2221    Which of these alternatives replace the rhs is given by N->base_addr (non
2222    null if a load is needed) and BSWAP.  The type, VUSE and set-alias of the
2223    load to perform are also given in N while the builtin bswap invoke is given
2224    in FNDEL.  Finally, if a load is involved, SRC_STMT refers to one of the
2225    load statements involved to construct the rhs in CUR_STMT and N->range gives
2226    the size of the rhs expression for maintaining some statistics.
2227
2228    Note that if the replacement involve a load, CUR_STMT is moved just after
2229    SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT
2230    changing of basic block.  */
2231
2232 static bool
2233 bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2234                tree load_type, struct symbolic_number *n, bool bswap)
2235 {
2236   gimple_stmt_iterator gsi;
2237   tree src, tmp, tgt;
2238   gimple bswap_stmt;
2239
2240   gsi = gsi_for_stmt (cur_stmt);
2241   src = gimple_assign_rhs1 (src_stmt);
2242   tgt = gimple_assign_lhs (cur_stmt);
2243
2244   /* Need to load the value from memory first.  */
2245   if (n->base_addr)
2246     {
2247       gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2248       tree addr_expr, addr_tmp, val_expr, val_tmp;
2249       tree load_offset_ptr, aligned_load_type;
2250       gimple addr_stmt, load_stmt;
2251       unsigned align;
2252
2253       align = get_object_alignment (src);
2254       if (bswap
2255           && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2256           && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2257         return false;
2258
2259       /* Move cur_stmt just before  one of the load of the original
2260          to ensure it has the same VUSE.  See PR61517 for what could
2261          go wrong.  */
2262       gsi_move_before (&gsi, &gsi_ins);
2263       gsi = gsi_for_stmt (cur_stmt);
2264
2265       /*  Compute address to load from and cast according to the size
2266           of the load.  */
2267       addr_expr = build_fold_addr_expr (unshare_expr (src));
2268       if (is_gimple_min_invariant (addr_expr))
2269         addr_tmp = addr_expr;
2270       else
2271         {
2272           addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2273                                          "load_src");
2274           addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2275           gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2276         }
2277
2278       /* Perform the load.  */
2279       aligned_load_type = load_type;
2280       if (align < TYPE_ALIGN (load_type))
2281         aligned_load_type = build_aligned_type (load_type, align);
2282       load_offset_ptr = build_int_cst (n->alias_set, 0);
2283       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2284                               load_offset_ptr);
2285
2286       if (!bswap)
2287         {
2288           if (n->range == 16)
2289             nop_stats.found_16bit++;
2290           else if (n->range == 32)
2291             nop_stats.found_32bit++;
2292           else
2293             {
2294               gcc_assert (n->range == 64);
2295               nop_stats.found_64bit++;
2296             }
2297
2298           /* Convert the result of load if necessary.  */
2299           if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2300             {
2301               val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2302                                             "load_dst");
2303               load_stmt = gimple_build_assign (val_tmp, val_expr);
2304               gimple_set_vuse (load_stmt, n->vuse);
2305               gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2306               gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2307             }
2308           else
2309             {
2310               gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2311               gimple_set_vuse (cur_stmt, n->vuse);
2312             }
2313           update_stmt (cur_stmt);
2314
2315           if (dump_file)
2316             {
2317               fprintf (dump_file,
2318                        "%d bit load in target endianness found at: ",
2319                        (int)n->range);
2320               print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2321             }
2322           return true;
2323         }
2324       else
2325         {
2326           val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2327           load_stmt = gimple_build_assign (val_tmp, val_expr);
2328           gimple_set_vuse (load_stmt, n->vuse);
2329           gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2330         }
2331       src = val_tmp;
2332     }
2333
2334   if (n->range == 16)
2335     bswap_stats.found_16bit++;
2336   else if (n->range == 32)
2337     bswap_stats.found_32bit++;
2338   else
2339     {
2340       gcc_assert (n->range == 64);
2341       bswap_stats.found_64bit++;
2342     }
2343
2344   tmp = src;
2345
2346   /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit values
2347      are considered as rotation of 2N bit values by N bits is generally not
2348      equivalent to a bswap.  Consider for instance 0x01020304 >> 16 which gives
2349      0x03040102 while a bswap for that value is 0x04030201.  */
2350   if (bswap && n->range == 16)
2351     {
2352       tree count = build_int_cst (NULL, BITS_PER_UNIT);
2353       bswap_type = TREE_TYPE (src);
2354       src = fold_build2 (LROTATE_EXPR, bswap_type, src, count);
2355       bswap_stmt = gimple_build_assign (NULL, src);
2356     }
2357   else
2358     {
2359       /* Convert the src expression if necessary.  */
2360       if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2361         {
2362           gimple convert_stmt;
2363           tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2364           convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2365           gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2366         }
2367
2368       bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2369     }
2370
2371   tmp = tgt;
2372
2373   /* Convert the result if necessary.  */
2374   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2375     {
2376       gimple convert_stmt;
2377       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2378       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2379       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2380     }
2381
2382   gimple_set_lhs (bswap_stmt, tmp);
2383
2384   if (dump_file)
2385     {
2386       fprintf (dump_file, "%d bit bswap implementation found at: ",
2387                (int)n->range);
2388       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2389     }
2390
2391   gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2392   gsi_remove (&gsi, true);
2393   return true;
2394 }
2395
2396 /* Find manual byte swap implementations as well as load in a given
2397    endianness. Byte swaps are turned into a bswap builtin invokation
2398    while endian loads are converted to bswap builtin invokation or
2399    simple load according to the target endianness.  */
2400
2401 unsigned int
2402 pass_optimize_bswap::execute (function *fun)
2403 {
2404   basic_block bb;
2405   bool bswap32_p, bswap64_p;
2406   bool changed = false;
2407   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2408
2409   if (BITS_PER_UNIT != 8)
2410     return 0;
2411
2412   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2413                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2414   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2415                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2416                    || (bswap32_p && word_mode == SImode)));
2417
2418   /* Determine the argument type of the builtins.  The code later on
2419      assumes that the return and argument type are the same.  */
2420   if (bswap32_p)
2421     {
2422       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2423       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2424     }
2425
2426   if (bswap64_p)
2427     {
2428       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2429       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2430     }
2431
2432   memset (&nop_stats, 0, sizeof (nop_stats));
2433   memset (&bswap_stats, 0, sizeof (bswap_stats));
2434
2435   FOR_EACH_BB_FN (bb, fun)
2436     {
2437       gimple_stmt_iterator gsi;
2438
2439       /* We do a reverse scan for bswap patterns to make sure we get the
2440          widest match. As bswap pattern matching doesn't handle previously
2441          inserted smaller bswap replacements as sub-patterns, the wider
2442          variant wouldn't be detected.  */
2443       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2444         {
2445           gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2446           tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2447           enum tree_code code;
2448           struct symbolic_number n;
2449           bool bswap;
2450
2451           /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2452              might be moved to a different basic block by bswap_replace and gsi
2453              must not points to it if that's the case.  Moving the gsi_prev
2454              there make sure that gsi points to the statement previous to
2455              cur_stmt while still making sure that all statements are
2456              considered in this basic block.  */
2457           gsi_prev (&gsi);
2458
2459           if (!is_gimple_assign (cur_stmt))
2460             continue;
2461
2462           code = gimple_assign_rhs_code (cur_stmt);
2463           switch (code)
2464             {
2465             case LROTATE_EXPR:
2466             case RROTATE_EXPR:
2467               if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2468                   || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2469                      % BITS_PER_UNIT)
2470                 continue;
2471               /* Fall through.  */
2472             case BIT_IOR_EXPR:
2473               break;
2474             default:
2475               continue;
2476             }
2477
2478           src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2479
2480           if (!src_stmt)
2481             continue;
2482
2483           switch (n.range)
2484             {
2485             case 16:
2486               /* Already in canonical form, nothing to do.  */
2487               if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2488                 continue;
2489               load_type = uint16_type_node;
2490               break;
2491             case 32:
2492               load_type = uint32_type_node;
2493               if (bswap32_p)
2494                 {
2495                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2496                   bswap_type = bswap32_type;
2497                 }
2498               break;
2499             case 64:
2500               load_type = uint64_type_node;
2501               if (bswap64_p)
2502                 {
2503                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2504                   bswap_type = bswap64_type;
2505                 }
2506               break;
2507             default:
2508               continue;
2509             }
2510
2511           if (bswap && !fndecl && n.range != 16)
2512             continue;
2513
2514           if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2515                              &n, bswap))
2516             changed = true;
2517         }
2518     }
2519
2520   statistics_counter_event (fun, "16-bit nop implementations found",
2521                             nop_stats.found_16bit);
2522   statistics_counter_event (fun, "32-bit nop implementations found",
2523                             nop_stats.found_32bit);
2524   statistics_counter_event (fun, "64-bit nop implementations found",
2525                             nop_stats.found_64bit);
2526   statistics_counter_event (fun, "16-bit bswap implementations found",
2527                             bswap_stats.found_16bit);
2528   statistics_counter_event (fun, "32-bit bswap implementations found",
2529                             bswap_stats.found_32bit);
2530   statistics_counter_event (fun, "64-bit bswap implementations found",
2531                             bswap_stats.found_64bit);
2532
2533   return (changed ? TODO_update_ssa : 0);
2534 }
2535
2536 } // anon namespace
2537
2538 gimple_opt_pass *
2539 make_pass_optimize_bswap (gcc::context *ctxt)
2540 {
2541   return new pass_optimize_bswap (ctxt);
2542 }
2543
2544 /* Return true if stmt is a type conversion operation that can be stripped
2545    when used in a widening multiply operation.  */
2546 static bool
2547 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2548 {
2549   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2550
2551   if (TREE_CODE (result_type) == INTEGER_TYPE)
2552     {
2553       tree op_type;
2554       tree inner_op_type;
2555
2556       if (!CONVERT_EXPR_CODE_P (rhs_code))
2557         return false;
2558
2559       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2560
2561       /* If the type of OP has the same precision as the result, then
2562          we can strip this conversion.  The multiply operation will be
2563          selected to create the correct extension as a by-product.  */
2564       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2565         return true;
2566
2567       /* We can also strip a conversion if it preserves the signed-ness of
2568          the operation and doesn't narrow the range.  */
2569       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2570
2571       /* If the inner-most type is unsigned, then we can strip any
2572          intermediate widening operation.  If it's signed, then the
2573          intermediate widening operation must also be signed.  */
2574       if ((TYPE_UNSIGNED (inner_op_type)
2575            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2576           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2577         return true;
2578
2579       return false;
2580     }
2581
2582   return rhs_code == FIXED_CONVERT_EXPR;
2583 }
2584
2585 /* Return true if RHS is a suitable operand for a widening multiplication,
2586    assuming a target type of TYPE.
2587    There are two cases:
2588
2589      - RHS makes some value at least twice as wide.  Store that value
2590        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2591
2592      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2593        but leave *TYPE_OUT untouched.  */
2594
2595 static bool
2596 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2597                         tree *new_rhs_out)
2598 {
2599   gimple stmt;
2600   tree type1, rhs1;
2601
2602   if (TREE_CODE (rhs) == SSA_NAME)
2603     {
2604       stmt = SSA_NAME_DEF_STMT (rhs);
2605       if (is_gimple_assign (stmt))
2606         {
2607           if (! widening_mult_conversion_strippable_p (type, stmt))
2608             rhs1 = rhs;
2609           else
2610             {
2611               rhs1 = gimple_assign_rhs1 (stmt);
2612
2613               if (TREE_CODE (rhs1) == INTEGER_CST)
2614                 {
2615                   *new_rhs_out = rhs1;
2616                   *type_out = NULL;
2617                   return true;
2618                 }
2619             }
2620         }
2621       else
2622         rhs1 = rhs;
2623
2624       type1 = TREE_TYPE (rhs1);
2625
2626       if (TREE_CODE (type1) != TREE_CODE (type)
2627           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2628         return false;
2629
2630       *new_rhs_out = rhs1;
2631       *type_out = type1;
2632       return true;
2633     }
2634
2635   if (TREE_CODE (rhs) == INTEGER_CST)
2636     {
2637       *new_rhs_out = rhs;
2638       *type_out = NULL;
2639       return true;
2640     }
2641
2642   return false;
2643 }
2644
2645 /* Return true if STMT performs a widening multiplication, assuming the
2646    output type is TYPE.  If so, store the unwidened types of the operands
2647    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2648    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2649    and *TYPE2_OUT would give the operands of the multiplication.  */
2650
2651 static bool
2652 is_widening_mult_p (gimple stmt,
2653                     tree *type1_out, tree *rhs1_out,
2654                     tree *type2_out, tree *rhs2_out)
2655 {
2656   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2657
2658   if (TREE_CODE (type) != INTEGER_TYPE
2659       && TREE_CODE (type) != FIXED_POINT_TYPE)
2660     return false;
2661
2662   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2663                                rhs1_out))
2664     return false;
2665
2666   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2667                                rhs2_out))
2668     return false;
2669
2670   if (*type1_out == NULL)
2671     {
2672       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2673         return false;
2674       *type1_out = *type2_out;
2675     }
2676
2677   if (*type2_out == NULL)
2678     {
2679       if (!int_fits_type_p (*rhs2_out, *type1_out))
2680         return false;
2681       *type2_out = *type1_out;
2682     }
2683
2684   /* Ensure that the larger of the two operands comes first. */
2685   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2686     {
2687       tree tmp;
2688       tmp = *type1_out;
2689       *type1_out = *type2_out;
2690       *type2_out = tmp;
2691       tmp = *rhs1_out;
2692       *rhs1_out = *rhs2_out;
2693       *rhs2_out = tmp;
2694     }
2695
2696   return true;
2697 }
2698
2699 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2700    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2701    value is true iff we converted the statement.  */
2702
2703 static bool
2704 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2705 {
2706   tree lhs, rhs1, rhs2, type, type1, type2;
2707   enum insn_code handler;
2708   machine_mode to_mode, from_mode, actual_mode;
2709   optab op;
2710   int actual_precision;
2711   location_t loc = gimple_location (stmt);
2712   bool from_unsigned1, from_unsigned2;
2713
2714   lhs = gimple_assign_lhs (stmt);
2715   type = TREE_TYPE (lhs);
2716   if (TREE_CODE (type) != INTEGER_TYPE)
2717     return false;
2718
2719   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2720     return false;
2721
2722   to_mode = TYPE_MODE (type);
2723   from_mode = TYPE_MODE (type1);
2724   from_unsigned1 = TYPE_UNSIGNED (type1);
2725   from_unsigned2 = TYPE_UNSIGNED (type2);
2726
2727   if (from_unsigned1 && from_unsigned2)
2728     op = umul_widen_optab;
2729   else if (!from_unsigned1 && !from_unsigned2)
2730     op = smul_widen_optab;
2731   else
2732     op = usmul_widen_optab;
2733
2734   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2735                                                   0, &actual_mode);
2736
2737   if (handler == CODE_FOR_nothing)
2738     {
2739       if (op != smul_widen_optab)
2740         {
2741           /* We can use a signed multiply with unsigned types as long as
2742              there is a wider mode to use, or it is the smaller of the two
2743              types that is unsigned.  Note that type1 >= type2, always.  */
2744           if ((TYPE_UNSIGNED (type1)
2745                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2746               || (TYPE_UNSIGNED (type2)
2747                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2748             {
2749               from_mode = GET_MODE_WIDER_MODE (from_mode);
2750               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2751                 return false;
2752             }
2753
2754           op = smul_widen_optab;
2755           handler = find_widening_optab_handler_and_mode (op, to_mode,
2756                                                           from_mode, 0,
2757                                                           &actual_mode);
2758
2759           if (handler == CODE_FOR_nothing)
2760             return false;
2761
2762           from_unsigned1 = from_unsigned2 = false;
2763         }
2764       else
2765         return false;
2766     }
2767
2768   /* Ensure that the inputs to the handler are in the correct precison
2769      for the opcode.  This will be the full mode size.  */
2770   actual_precision = GET_MODE_PRECISION (actual_mode);
2771   if (2 * actual_precision > TYPE_PRECISION (type))
2772     return false;
2773   if (actual_precision != TYPE_PRECISION (type1)
2774       || from_unsigned1 != TYPE_UNSIGNED (type1))
2775     rhs1 = build_and_insert_cast (gsi, loc,
2776                                   build_nonstandard_integer_type
2777                                     (actual_precision, from_unsigned1), rhs1);
2778   if (actual_precision != TYPE_PRECISION (type2)
2779       || from_unsigned2 != TYPE_UNSIGNED (type2))
2780     rhs2 = build_and_insert_cast (gsi, loc,
2781                                   build_nonstandard_integer_type
2782                                     (actual_precision, from_unsigned2), rhs2);
2783
2784   /* Handle constants.  */
2785   if (TREE_CODE (rhs1) == INTEGER_CST)
2786     rhs1 = fold_convert (type1, rhs1);
2787   if (TREE_CODE (rhs2) == INTEGER_CST)
2788     rhs2 = fold_convert (type2, rhs2);
2789
2790   gimple_assign_set_rhs1 (stmt, rhs1);
2791   gimple_assign_set_rhs2 (stmt, rhs2);
2792   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2793   update_stmt (stmt);
2794   widen_mul_stats.widen_mults_inserted++;
2795   return true;
2796 }
2797
2798 /* Process a single gimple statement STMT, which is found at the
2799    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2800    rhs (given by CODE), and try to convert it into a
2801    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2802    is true iff we converted the statement.  */
2803
2804 static bool
2805 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2806                             enum tree_code code)
2807 {
2808   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2809   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2810   tree type, type1, type2, optype;
2811   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2812   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2813   optab this_optab;
2814   enum tree_code wmult_code;
2815   enum insn_code handler;
2816   machine_mode to_mode, from_mode, actual_mode;
2817   location_t loc = gimple_location (stmt);
2818   int actual_precision;
2819   bool from_unsigned1, from_unsigned2;
2820
2821   lhs = gimple_assign_lhs (stmt);
2822   type = TREE_TYPE (lhs);
2823   if (TREE_CODE (type) != INTEGER_TYPE
2824       && TREE_CODE (type) != FIXED_POINT_TYPE)
2825     return false;
2826
2827   if (code == MINUS_EXPR)
2828     wmult_code = WIDEN_MULT_MINUS_EXPR;
2829   else
2830     wmult_code = WIDEN_MULT_PLUS_EXPR;
2831
2832   rhs1 = gimple_assign_rhs1 (stmt);
2833   rhs2 = gimple_assign_rhs2 (stmt);
2834
2835   if (TREE_CODE (rhs1) == SSA_NAME)
2836     {
2837       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2838       if (is_gimple_assign (rhs1_stmt))
2839         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2840     }
2841
2842   if (TREE_CODE (rhs2) == SSA_NAME)
2843     {
2844       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2845       if (is_gimple_assign (rhs2_stmt))
2846         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2847     }
2848
2849   /* Allow for one conversion statement between the multiply
2850      and addition/subtraction statement.  If there are more than
2851      one conversions then we assume they would invalidate this
2852      transformation.  If that's not the case then they should have
2853      been folded before now.  */
2854   if (CONVERT_EXPR_CODE_P (rhs1_code))
2855     {
2856       conv1_stmt = rhs1_stmt;
2857       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2858       if (TREE_CODE (rhs1) == SSA_NAME)
2859         {
2860           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2861           if (is_gimple_assign (rhs1_stmt))
2862             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2863         }
2864       else
2865         return false;
2866     }
2867   if (CONVERT_EXPR_CODE_P (rhs2_code))
2868     {
2869       conv2_stmt = rhs2_stmt;
2870       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2871       if (TREE_CODE (rhs2) == SSA_NAME)
2872         {
2873           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2874           if (is_gimple_assign (rhs2_stmt))
2875             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2876         }
2877       else
2878         return false;
2879     }
2880
2881   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2882      is_widening_mult_p, but we still need the rhs returns.
2883
2884      It might also appear that it would be sufficient to use the existing
2885      operands of the widening multiply, but that would limit the choice of
2886      multiply-and-accumulate instructions.
2887
2888      If the widened-multiplication result has more than one uses, it is
2889      probably wiser not to do the conversion.  */
2890   if (code == PLUS_EXPR
2891       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2892     {
2893       if (!has_single_use (rhs1)
2894           || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2895                                   &type2, &mult_rhs2))
2896         return false;
2897       add_rhs = rhs2;
2898       conv_stmt = conv1_stmt;
2899     }
2900   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2901     {
2902       if (!has_single_use (rhs2)
2903           || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2904                                   &type2, &mult_rhs2))
2905         return false;
2906       add_rhs = rhs1;
2907       conv_stmt = conv2_stmt;
2908     }
2909   else
2910     return false;
2911
2912   to_mode = TYPE_MODE (type);
2913   from_mode = TYPE_MODE (type1);
2914   from_unsigned1 = TYPE_UNSIGNED (type1);
2915   from_unsigned2 = TYPE_UNSIGNED (type2);
2916   optype = type1;
2917
2918   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2919   if (from_unsigned1 != from_unsigned2)
2920     {
2921       if (!INTEGRAL_TYPE_P (type))
2922         return false;
2923       /* We can use a signed multiply with unsigned types as long as
2924          there is a wider mode to use, or it is the smaller of the two
2925          types that is unsigned.  Note that type1 >= type2, always.  */
2926       if ((from_unsigned1
2927            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2928           || (from_unsigned2
2929               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2930         {
2931           from_mode = GET_MODE_WIDER_MODE (from_mode);
2932           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2933             return false;
2934         }
2935
2936       from_unsigned1 = from_unsigned2 = false;
2937       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2938                                                false);
2939     }
2940
2941   /* If there was a conversion between the multiply and addition
2942      then we need to make sure it fits a multiply-and-accumulate.
2943      The should be a single mode change which does not change the
2944      value.  */
2945   if (conv_stmt)
2946     {
2947       /* We use the original, unmodified data types for this.  */
2948       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2949       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2950       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2951       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2952
2953       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2954         {
2955           /* Conversion is a truncate.  */
2956           if (TYPE_PRECISION (to_type) < data_size)
2957             return false;
2958         }
2959       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2960         {
2961           /* Conversion is an extend.  Check it's the right sort.  */
2962           if (TYPE_UNSIGNED (from_type) != is_unsigned
2963               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2964             return false;
2965         }
2966       /* else convert is a no-op for our purposes.  */
2967     }
2968
2969   /* Verify that the machine can perform a widening multiply
2970      accumulate in this mode/signedness combination, otherwise
2971      this transformation is likely to pessimize code.  */
2972   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2973   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2974                                                   from_mode, 0, &actual_mode);
2975
2976   if (handler == CODE_FOR_nothing)
2977     return false;
2978
2979   /* Ensure that the inputs to the handler are in the correct precison
2980      for the opcode.  This will be the full mode size.  */
2981   actual_precision = GET_MODE_PRECISION (actual_mode);
2982   if (actual_precision != TYPE_PRECISION (type1)
2983       || from_unsigned1 != TYPE_UNSIGNED (type1))
2984     mult_rhs1 = build_and_insert_cast (gsi, loc,
2985                                        build_nonstandard_integer_type
2986                                          (actual_precision, from_unsigned1),
2987                                        mult_rhs1);
2988   if (actual_precision != TYPE_PRECISION (type2)
2989       || from_unsigned2 != TYPE_UNSIGNED (type2))
2990     mult_rhs2 = build_and_insert_cast (gsi, loc,
2991                                        build_nonstandard_integer_type
2992                                          (actual_precision, from_unsigned2),
2993                                        mult_rhs2);
2994
2995   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2996     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2997
2998   /* Handle constants.  */
2999   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3000     mult_rhs1 = fold_convert (type1, mult_rhs1);
3001   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3002     mult_rhs2 = fold_convert (type2, mult_rhs2);
3003
3004   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3005                                   add_rhs);
3006   update_stmt (gsi_stmt (*gsi));
3007   widen_mul_stats.maccs_inserted++;
3008   return true;
3009 }
3010
3011 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3012    with uses in additions and subtractions to form fused multiply-add
3013    operations.  Returns true if successful and MUL_STMT should be removed.  */
3014
3015 static bool
3016 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
3017 {
3018   tree mul_result = gimple_get_lhs (mul_stmt);
3019   tree type = TREE_TYPE (mul_result);
3020   gimple use_stmt, neguse_stmt;
3021   gassign *fma_stmt;
3022   use_operand_p use_p;
3023   imm_use_iterator imm_iter;
3024
3025   if (FLOAT_TYPE_P (type)
3026       && flag_fp_contract_mode == FP_CONTRACT_OFF)
3027     return false;
3028
3029   /* We don't want to do bitfield reduction ops.  */
3030   if (INTEGRAL_TYPE_P (type)
3031       && (TYPE_PRECISION (type)
3032           != GET_MODE_PRECISION (TYPE_MODE (type))))
3033     return false;
3034
3035   /* If the target doesn't support it, don't generate it.  We assume that
3036      if fma isn't available then fms, fnma or fnms are not either.  */
3037   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3038     return false;
3039
3040   /* If the multiplication has zero uses, it is kept around probably because
3041      of -fnon-call-exceptions.  Don't optimize it away in that case,
3042      it is DCE job.  */
3043   if (has_zero_uses (mul_result))
3044     return false;
3045
3046   /* Make sure that the multiplication statement becomes dead after
3047      the transformation, thus that all uses are transformed to FMAs.
3048      This means we assume that an FMA operation has the same cost
3049      as an addition.  */
3050   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3051     {
3052       enum tree_code use_code;
3053       tree result = mul_result;
3054       bool negate_p = false;
3055
3056       use_stmt = USE_STMT (use_p);
3057
3058       if (is_gimple_debug (use_stmt))
3059         continue;
3060
3061       /* For now restrict this operations to single basic blocks.  In theory
3062          we would want to support sinking the multiplication in
3063          m = a*b;
3064          if ()
3065            ma = m + c;
3066          else
3067            d = m;
3068          to form a fma in the then block and sink the multiplication to the
3069          else block.  */
3070       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3071         return false;
3072
3073       if (!is_gimple_assign (use_stmt))
3074         return false;
3075
3076       use_code = gimple_assign_rhs_code (use_stmt);
3077
3078       /* A negate on the multiplication leads to FNMA.  */
3079       if (use_code == NEGATE_EXPR)
3080         {
3081           ssa_op_iter iter;
3082           use_operand_p usep;
3083
3084           result = gimple_assign_lhs (use_stmt);
3085
3086           /* Make sure the negate statement becomes dead with this
3087              single transformation.  */
3088           if (!single_imm_use (gimple_assign_lhs (use_stmt),
3089                                &use_p, &neguse_stmt))
3090             return false;
3091
3092           /* Make sure the multiplication isn't also used on that stmt.  */
3093           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3094             if (USE_FROM_PTR (usep) == mul_result)
3095               return false;
3096
3097           /* Re-validate.  */
3098           use_stmt = neguse_stmt;
3099           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3100             return false;
3101           if (!is_gimple_assign (use_stmt))
3102             return false;
3103
3104           use_code = gimple_assign_rhs_code (use_stmt);
3105           negate_p = true;
3106         }
3107
3108       switch (use_code)
3109         {
3110         case MINUS_EXPR:
3111           if (gimple_assign_rhs2 (use_stmt) == result)
3112             negate_p = !negate_p;
3113           break;
3114         case PLUS_EXPR:
3115           break;
3116         default:
3117           /* FMA can only be formed from PLUS and MINUS.  */
3118           return false;
3119         }
3120
3121       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3122          by a MULT_EXPR that we'll visit later, we might be able to
3123          get a more profitable match with fnma.
3124          OTOH, if we don't, a negate / fma pair has likely lower latency
3125          that a mult / subtract pair.  */
3126       if (use_code == MINUS_EXPR && !negate_p
3127           && gimple_assign_rhs1 (use_stmt) == result
3128           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3129           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3130         {
3131           tree rhs2 = gimple_assign_rhs2 (use_stmt);
3132
3133           if (TREE_CODE (rhs2) == SSA_NAME)
3134             {
3135               gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3136               if (has_single_use (rhs2)
3137                   && is_gimple_assign (stmt2)
3138                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3139               return false;
3140             }
3141         }
3142
3143       /* We can't handle a * b + a * b.  */
3144       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3145         return false;
3146
3147       /* While it is possible to validate whether or not the exact form
3148          that we've recognized is available in the backend, the assumption
3149          is that the transformation is never a loss.  For instance, suppose
3150          the target only has the plain FMA pattern available.  Consider
3151          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3152          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3153          still have 3 operations, but in the FMA form the two NEGs are
3154          independent and could be run in parallel.  */
3155     }
3156
3157   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3158     {
3159       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3160       enum tree_code use_code;
3161       tree addop, mulop1 = op1, result = mul_result;
3162       bool negate_p = false;
3163
3164       if (is_gimple_debug (use_stmt))
3165         continue;
3166
3167       use_code = gimple_assign_rhs_code (use_stmt);
3168       if (use_code == NEGATE_EXPR)
3169         {
3170           result = gimple_assign_lhs (use_stmt);
3171           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3172           gsi_remove (&gsi, true);
3173           release_defs (use_stmt);
3174
3175           use_stmt = neguse_stmt;
3176           gsi = gsi_for_stmt (use_stmt);
3177           use_code = gimple_assign_rhs_code (use_stmt);
3178           negate_p = true;
3179         }
3180
3181       if (gimple_assign_rhs1 (use_stmt) == result)
3182         {
3183           addop = gimple_assign_rhs2 (use_stmt);
3184           /* a * b - c -> a * b + (-c)  */
3185           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3186             addop = force_gimple_operand_gsi (&gsi,
3187                                               build1 (NEGATE_EXPR,
3188                                                       type, addop),
3189                                               true, NULL_TREE, true,
3190                                               GSI_SAME_STMT);
3191         }
3192       else
3193         {
3194           addop = gimple_assign_rhs1 (use_stmt);
3195           /* a - b * c -> (-b) * c + a */
3196           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3197             negate_p = !negate_p;
3198         }
3199
3200       if (negate_p)
3201         mulop1 = force_gimple_operand_gsi (&gsi,
3202                                            build1 (NEGATE_EXPR,
3203                                                    type, mulop1),
3204                                            true, NULL_TREE, true,
3205                                            GSI_SAME_STMT);
3206
3207       fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3208                                       FMA_EXPR, mulop1, op2, addop);
3209       gsi_replace (&gsi, fma_stmt, true);
3210       widen_mul_stats.fmas_inserted++;
3211     }
3212
3213   return true;
3214 }
3215
3216 /* Find integer multiplications where the operands are extended from
3217    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3218    where appropriate.  */
3219
3220 namespace {
3221
3222 const pass_data pass_data_optimize_widening_mul =
3223 {
3224   GIMPLE_PASS, /* type */
3225   "widening_mul", /* name */
3226   OPTGROUP_NONE, /* optinfo_flags */
3227   TV_NONE, /* tv_id */
3228   PROP_ssa, /* properties_required */
3229   0, /* properties_provided */
3230   0, /* properties_destroyed */
3231   0, /* todo_flags_start */
3232   TODO_update_ssa, /* todo_flags_finish */
3233 };
3234
3235 class pass_optimize_widening_mul : public gimple_opt_pass
3236 {
3237 public:
3238   pass_optimize_widening_mul (gcc::context *ctxt)
3239     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3240   {}
3241
3242   /* opt_pass methods: */
3243   virtual bool gate (function *)
3244     {
3245       return flag_expensive_optimizations && optimize;
3246     }
3247
3248   virtual unsigned int execute (function *);
3249
3250 }; // class pass_optimize_widening_mul
3251
3252 unsigned int
3253 pass_optimize_widening_mul::execute (function *fun)
3254 {
3255   basic_block bb;
3256   bool cfg_changed = false;
3257
3258   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3259
3260   FOR_EACH_BB_FN (bb, fun)
3261     {
3262       gimple_stmt_iterator gsi;
3263
3264       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3265         {
3266           gimple stmt = gsi_stmt (gsi);
3267           enum tree_code code;
3268
3269           if (is_gimple_assign (stmt))
3270             {
3271               code = gimple_assign_rhs_code (stmt);
3272               switch (code)
3273                 {
3274                 case MULT_EXPR:
3275                   if (!convert_mult_to_widen (stmt, &gsi)
3276                       && convert_mult_to_fma (stmt,
3277                                               gimple_assign_rhs1 (stmt),
3278                                               gimple_assign_rhs2 (stmt)))
3279                     {
3280                       gsi_remove (&gsi, true);
3281                       release_defs (stmt);
3282                       continue;
3283                     }
3284                   break;
3285
3286                 case PLUS_EXPR:
3287                 case MINUS_EXPR:
3288                   convert_plusminus_to_widen (&gsi, stmt, code);
3289                   break;
3290
3291                 default:;
3292                 }
3293             }
3294           else if (is_gimple_call (stmt)
3295                    && gimple_call_lhs (stmt))
3296             {
3297               tree fndecl = gimple_call_fndecl (stmt);
3298               if (fndecl
3299                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3300                 {
3301                   switch (DECL_FUNCTION_CODE (fndecl))
3302                     {
3303                       case BUILT_IN_POWF:
3304                       case BUILT_IN_POW:
3305                       case BUILT_IN_POWL:
3306                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3307                             && REAL_VALUES_EQUAL
3308                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3309                                   dconst2)
3310                             && convert_mult_to_fma (stmt,
3311                                                     gimple_call_arg (stmt, 0),
3312                                                     gimple_call_arg (stmt, 0)))
3313                           {
3314                             unlink_stmt_vdef (stmt);
3315                             if (gsi_remove (&gsi, true)
3316                                 && gimple_purge_dead_eh_edges (bb))
3317                               cfg_changed = true;
3318                             release_defs (stmt);
3319                             continue;
3320                           }
3321                           break;
3322
3323                       default:;
3324                     }
3325                 }
3326             }
3327           gsi_next (&gsi);
3328         }
3329     }
3330
3331   statistics_counter_event (fun, "widening multiplications inserted",
3332                             widen_mul_stats.widen_mults_inserted);
3333   statistics_counter_event (fun, "widening maccs inserted",
3334                             widen_mul_stats.maccs_inserted);
3335   statistics_counter_event (fun, "fused multiply-adds inserted",
3336                             widen_mul_stats.fmas_inserted);
3337
3338   return cfg_changed ? TODO_cleanup_cfg : 0;
3339 }
3340
3341 } // anon namespace
3342
3343 gimple_opt_pass *
3344 make_pass_optimize_widening_mul (gcc::context *ctxt)
3345 {
3346   return new pass_optimize_widening_mul (ctxt);
3347 }