1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
 20 /* The first mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
28 that can be optimized to
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
 43    First of all, experiments showed that the
 44    transformation is not always useful if there are only two divisions
 45    by the same divisor.  This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
 53    contain one.  (If divisions don't trap, we can instead insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
 59    that has fewer than N divisions *post-dominating* it.
61 The algorithm constructs a subset of the dominator tree, holding the
 62    blocks containing the divisions and their common dominators,
 63    and walks it twice.  The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "tm.h"
91 #include "flags.h"
92 #include "tree.h"
93 #include "gimple.h"
94 #include "gimple-iterator.h"
95 #include "gimplify-me.h"
96 #include "stor-layout.h"
97 #include "gimple-ssa.h"
98 #include "tree-cfg.h"
99 #include "tree-phinodes.h"
100 #include "ssa-iterators.h"
101 #include "stringpool.h"
102 #include "tree-ssanames.h"
103 #include "expr.h"
104 #include "tree-dfa.h"
105 #include "tree-ssa.h"
106 #include "tree-pass.h"
107 #include "alloc-pool.h"
108 #include "basic-block.h"
109 #include "target.h"
110 #include "gimple-pretty-print.h"
112 /* FIXME: RTL headers have to be included here for optabs. */
113 #include "rtl.h" /* Because optabs.h wants enum rtx_code. */
114 #include "expr.h" /* Because optabs.h wants sepops. */
115 #include "optabs.h"
117 /* This structure represents one basic block that either computes a
 118    division, or is a common dominator for basic blocks that compute a
119 division. */
120 struct occurrence {
121 /* The basic block represented by this structure. */
122 basic_block bb;
124 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
125 inserted in BB. */
126 tree recip_def;
128 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
129 was inserted in BB. */
130 gimple recip_def_stmt;
132 /* Pointer to a list of "struct occurrence"s for blocks dominated
133 by BB. */
134 struct occurrence *children;
 136   /* Pointer to the next "struct occurrence" in the list of blocks
137 sharing a common dominator. */
138 struct occurrence *next;
140 /* The number of divisions that are in BB before compute_merit. The
141 number of divisions that are in BB or post-dominate it after
142 compute_merit. */
143 int num_divisions;
145 /* True if the basic block has a division, false if it is a common
146 dominator for basic blocks that do. If it is false and trapping
147 math is active, BB is not a candidate for inserting a reciprocal. */
148 bool bb_has_division;
151 static struct
153 /* Number of 1.0/X ops inserted. */
154 int rdivs_inserted;
156 /* Number of 1.0/FUNC ops inserted. */
157 int rfuncs_inserted;
158 } reciprocal_stats;
160 static struct
162 /* Number of cexpi calls inserted. */
163 int inserted;
164 } sincos_stats;
166 static struct
168 /* Number of hand-written 16-bit bswaps found. */
169 int found_16bit;
171 /* Number of hand-written 32-bit bswaps found. */
172 int found_32bit;
174 /* Number of hand-written 64-bit bswaps found. */
175 int found_64bit;
176 } bswap_stats;
178 static struct
180 /* Number of widening multiplication ops inserted. */
181 int widen_mults_inserted;
183 /* Number of integer multiply-and-accumulate ops inserted. */
184 int maccs_inserted;
186 /* Number of fp fused multiply-add ops inserted. */
187 int fmas_inserted;
188 } widen_mul_stats;
190 /* The instance of "struct occurrence" representing the highest
191 interesting block in the dominator tree. */
192 static struct occurrence *occ_head;
194 /* Allocation pool for getting instances of "struct occurrence". */
195 static alloc_pool occ_pool;
 199 /* Allocate and return a new struct occurrence for basic block BB,
 200    whose children list is headed by CHILDREN.  */
201 static struct occurrence *
202 occ_new (basic_block bb, struct occurrence *children)
204 struct occurrence *occ;
206 bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
207 memset (occ, 0, sizeof (struct occurrence));
209 occ->bb = bb;
210 occ->children = children;
211 return occ;
215 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
216 list of "struct occurrence"s, one per basic block, having IDOM as
217 their common dominator.
219 We try to insert NEW_OCC as deep as possible in the tree, and we also
220 insert any other block that is a common dominator for BB and one
221 block already in the tree. */
223 static void
224 insert_bb (struct occurrence *new_occ, basic_block idom,
225 struct occurrence **p_head)
227 struct occurrence *occ, **p_occ;
229 for (p_occ = p_head; (occ = *p_occ) != NULL; )
231 basic_block bb = new_occ->bb, occ_bb = occ->bb;
232 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
233 if (dom == bb)
235 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
236 from its list. */
237 *p_occ = occ->next;
238 occ->next = new_occ->children;
239 new_occ->children = occ;
241 /* Try the next block (it may as well be dominated by BB). */
244 else if (dom == occ_bb)
246 /* OCC_BB dominates BB. Tail recurse to look deeper. */
247 insert_bb (new_occ, dom, &occ->children);
248 return;
251 else if (dom != idom)
253 gcc_assert (!dom->aux);
255 /* There is a dominator between IDOM and BB, add it and make
256 two children out of NEW_OCC and OCC. First, remove OCC from
257 its list. */
258 *p_occ = occ->next;
259 new_occ->next = occ;
260 occ->next = NULL;
262 /* None of the previous blocks has DOM as a dominator: if we tail
263 recursed, we would reexamine them uselessly. Just switch BB with
264 DOM, and go on looking for blocks dominated by DOM. */
265 new_occ = occ_new (dom, new_occ);
268 else
270 /* Nothing special, go on with the next element. */
271 p_occ = &occ->next;
275 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
276 new_occ->next = *p_head;
277 *p_head = new_occ;
280 /* Register that we found a division in BB. */
282 static inline void
283 register_division_in (basic_block bb)
285 struct occurrence *occ;
287 occ = (struct occurrence *) bb->aux;
288 if (!occ)
290 occ = occ_new (bb, NULL);
291 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
294 occ->bb_has_division = true;
295 occ->num_divisions++;
299 /* Compute the number of divisions that postdominate each block in OCC and
300 its children. */
302 static void
303 compute_merit (struct occurrence *occ)
305 struct occurrence *occ_child;
306 basic_block dom = occ->bb;
308 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
310 basic_block bb;
311 if (occ_child->children)
312 compute_merit (occ_child);
314 if (flag_exceptions)
315 bb = single_noncomplex_succ (dom);
316 else
317 bb = dom;
319 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
320 occ->num_divisions += occ_child->num_divisions;
325 /* Return whether USE_STMT is a floating-point division by DEF. */
326 static inline bool
327 is_division_by (gimple use_stmt, tree def)
329 return is_gimple_assign (use_stmt)
330 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
331 && gimple_assign_rhs2 (use_stmt) == def
 332          /* Do not recognize x / x as a valid division, as we are getting
 333             confused later by replacing all immediate uses of x in such
334 a stmt. */
335 && gimple_assign_rhs1 (use_stmt) != def;
338 /* Walk the subset of the dominator tree rooted at OCC, setting the
339 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
340 the given basic block. The field may be left NULL, of course,
341 if it is not possible or profitable to do the optimization.
 343    DEF_GSI is an iterator pointing at the statement defining DEF.
344 If RECIP_DEF is set, a dominator already has a computation that can
345 be used. */
347 static void
348 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
349 tree def, tree recip_def, int threshold)
351 tree type;
352 gimple new_stmt;
353 gimple_stmt_iterator gsi;
354 struct occurrence *occ_child;
356 if (!recip_def
357 && (occ->bb_has_division || !flag_trapping_math)
358 && occ->num_divisions >= threshold)
360 /* Make a variable with the replacement and substitute it. */
361 type = TREE_TYPE (def);
362 recip_def = create_tmp_reg (type, "reciptmp");
363 new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
364 build_one_cst (type), def);
366 if (occ->bb_has_division)
368 /* Case 1: insert before an existing division. */
369 gsi = gsi_after_labels (occ->bb);
370 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
371 gsi_next (&gsi);
373 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
375 else if (def_gsi && occ->bb == def_gsi->bb)
377 /* Case 2: insert right after the definition. Note that this will
378 never happen if the definition statement can throw, because in
379 that case the sole successor of the statement's basic block will
380 dominate all the uses as well. */
381 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
383 else
385 /* Case 3: insert in a basic block not containing defs/uses. */
386 gsi = gsi_after_labels (occ->bb);
387 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
390 reciprocal_stats.rdivs_inserted++;
392 occ->recip_def_stmt = new_stmt;
395 occ->recip_def = recip_def;
396 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
397 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
401 /* Replace the division at USE_P with a multiplication by the reciprocal, if
402 possible. */
404 static inline void
405 replace_reciprocal (use_operand_p use_p)
407 gimple use_stmt = USE_STMT (use_p);
408 basic_block bb = gimple_bb (use_stmt);
409 struct occurrence *occ = (struct occurrence *) bb->aux;
411 if (optimize_bb_for_speed_p (bb)
412 && occ->recip_def && use_stmt != occ->recip_def_stmt)
414 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
415 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
416 SET_USE (use_p, occ->recip_def);
417 fold_stmt_inplace (&gsi);
418 update_stmt (use_stmt);
423 /* Free OCC and return one more "struct occurrence" to be freed. */
425 static struct occurrence *
426 free_bb (struct occurrence *occ)
428 struct occurrence *child, *next;
430 /* First get the two pointers hanging off OCC. */
431 next = occ->next;
432 child = occ->children;
433 occ->bb->aux = NULL;
434 pool_free (occ_pool, occ);
436 /* Now ensure that we don't recurse unless it is necessary. */
437 if (!child)
438 return next;
439 else
441 while (next)
442 next = free_bb (next);
444 return child;
449 /* Look for floating-point divisions among DEF's uses, and try to
450 replace them by multiplications with the reciprocal. Add
451 as many statements computing the reciprocal as needed.
453 DEF must be a GIMPLE register of a floating-point type. */
455 static void
456 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
458 use_operand_p use_p;
459 imm_use_iterator use_iter;
460 struct occurrence *occ;
461 int count = 0, threshold;
463 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
465 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
467 gimple use_stmt = USE_STMT (use_p);
468 if (is_division_by (use_stmt, def))
470 register_division_in (gimple_bb (use_stmt));
471 count++;
475 /* Do the expensive part only if we can hope to optimize something. */
476 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
477 if (count >= threshold)
479 gimple use_stmt;
480 for (occ = occ_head; occ; occ = occ->next)
482 compute_merit (occ);
483 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
486 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
488 if (is_division_by (use_stmt, def))
490 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
491 replace_reciprocal (use_p);
496 for (occ = occ_head; occ; )
497 occ = free_bb (occ);
499 occ_head = NULL;
502 static bool
503 gate_cse_reciprocals (void)
505 return optimize && flag_reciprocal_math;
508 /* Go through all the floating-point SSA_NAMEs, and call
509 execute_cse_reciprocals_1 on each of them. */
510 static unsigned int
511 execute_cse_reciprocals (void)
513 basic_block bb;
514 tree arg;
516 occ_pool = create_alloc_pool ("dominators for recip",
517 sizeof (struct occurrence),
518 n_basic_blocks_for_fn (cfun) / 3 + 1);
520 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
521 calculate_dominance_info (CDI_DOMINATORS);
522 calculate_dominance_info (CDI_POST_DOMINATORS);
524 #ifdef ENABLE_CHECKING
525 FOR_EACH_BB (bb)
526 gcc_assert (!bb->aux);
527 #endif
529 for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
530 if (FLOAT_TYPE_P (TREE_TYPE (arg))
531 && is_gimple_reg (arg))
533 tree name = ssa_default_def (cfun, arg);
534 if (name)
535 execute_cse_reciprocals_1 (NULL, name);
538 FOR_EACH_BB (bb)
540 gimple_stmt_iterator gsi;
541 gimple phi;
542 tree def;
544 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
546 phi = gsi_stmt (gsi);
547 def = PHI_RESULT (phi);
548 if (! virtual_operand_p (def)
549 && FLOAT_TYPE_P (TREE_TYPE (def)))
550 execute_cse_reciprocals_1 (NULL, def);
553 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
555 gimple stmt = gsi_stmt (gsi);
557 if (gimple_has_lhs (stmt)
558 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
559 && FLOAT_TYPE_P (TREE_TYPE (def))
560 && TREE_CODE (def) == SSA_NAME)
561 execute_cse_reciprocals_1 (&gsi, def);
564 if (optimize_bb_for_size_p (bb))
565 continue;
567 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
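      /* An illustrative sketch of that transformation ("rsqrt" is only a
	 placeholder name here; the actual replacement function is whatever
	 the targetm.builtin_reciprocal hook returns for this target):

	     t = sqrt (b);               t = rsqrt (b);
	     a = x / t;          ==>     a = x * t;

	 The rewrite is only performed when every use of T is such a
	 division.  */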
568 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
570 gimple stmt = gsi_stmt (gsi);
571 tree fndecl;
573 if (is_gimple_assign (stmt)
574 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
576 tree arg1 = gimple_assign_rhs2 (stmt);
577 gimple stmt1;
579 if (TREE_CODE (arg1) != SSA_NAME)
580 continue;
582 stmt1 = SSA_NAME_DEF_STMT (arg1);
584 if (is_gimple_call (stmt1)
585 && gimple_call_lhs (stmt1)
586 && (fndecl = gimple_call_fndecl (stmt1))
587 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
588 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
590 enum built_in_function code;
591 bool md_code, fail;
592 imm_use_iterator ui;
593 use_operand_p use_p;
595 code = DECL_FUNCTION_CODE (fndecl);
596 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
598 fndecl = targetm.builtin_reciprocal (code, md_code, false);
599 if (!fndecl)
600 continue;
602 /* Check that all uses of the SSA name are divisions,
603 otherwise replacing the defining statement will do
604 the wrong thing. */
605 fail = false;
606 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
608 gimple stmt2 = USE_STMT (use_p);
609 if (is_gimple_debug (stmt2))
610 continue;
611 if (!is_gimple_assign (stmt2)
612 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
613 || gimple_assign_rhs1 (stmt2) == arg1
614 || gimple_assign_rhs2 (stmt2) != arg1)
616 fail = true;
617 break;
620 if (fail)
621 continue;
623 gimple_replace_ssa_lhs (stmt1, arg1);
624 gimple_call_set_fndecl (stmt1, fndecl);
625 update_stmt (stmt1);
626 reciprocal_stats.rfuncs_inserted++;
628 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
630 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
631 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
632 fold_stmt_inplace (&gsi);
633 update_stmt (stmt);
640 statistics_counter_event (cfun, "reciprocal divs inserted",
641 reciprocal_stats.rdivs_inserted);
642 statistics_counter_event (cfun, "reciprocal functions inserted",
643 reciprocal_stats.rfuncs_inserted);
645 free_dominance_info (CDI_DOMINATORS);
646 free_dominance_info (CDI_POST_DOMINATORS);
647 free_alloc_pool (occ_pool);
648 return 0;
651 namespace {
653 const pass_data pass_data_cse_reciprocals =
655 GIMPLE_PASS, /* type */
656 "recip", /* name */
657 OPTGROUP_NONE, /* optinfo_flags */
658 true, /* has_gate */
659 true, /* has_execute */
660 TV_NONE, /* tv_id */
661 PROP_ssa, /* properties_required */
662 0, /* properties_provided */
663 0, /* properties_destroyed */
664 0, /* todo_flags_start */
665 ( TODO_update_ssa | TODO_verify_ssa
666 | TODO_verify_stmts ), /* todo_flags_finish */
669 class pass_cse_reciprocals : public gimple_opt_pass
671 public:
672 pass_cse_reciprocals (gcc::context *ctxt)
673 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
676 /* opt_pass methods: */
677 bool gate () { return gate_cse_reciprocals (); }
678 unsigned int execute () { return execute_cse_reciprocals (); }
680 }; // class pass_cse_reciprocals
682 } // anon namespace
684 gimple_opt_pass *
685 make_pass_cse_reciprocals (gcc::context *ctxt)
687 return new pass_cse_reciprocals (ctxt);
 690 /* Records an occurrence at statement USE_STMT in the vector STMTS if
 691    it is dominated by *TOP_BB, dominates it, or *TOP_BB is not yet
 692    initialized.  Returns true if the occurrence was pushed on
693 the vector. Adjusts *TOP_BB to be the basic block dominating all
694 statements in the vector. */
696 static bool
697 maybe_record_sincos (vec<gimple> *stmts,
698 basic_block *top_bb, gimple use_stmt)
700 basic_block use_bb = gimple_bb (use_stmt);
701 if (*top_bb
702 && (*top_bb == use_bb
703 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
704 stmts->safe_push (use_stmt);
705 else if (!*top_bb
706 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
708 stmts->safe_push (use_stmt);
709 *top_bb = use_bb;
711 else
712 return false;
714 return true;
717 /* Look for sin, cos and cexpi calls with the same argument NAME and
718 create a single call to cexpi CSEing the result in this case.
719 We first walk over all immediate uses of the argument collecting
720 statements that we can CSE in a vector and in a second pass replace
721 the statement rhs with a REALPART or IMAGPART expression on the
722 result of the cexpi call we insert before the use statement that
723 dominates all other candidates. */
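/* For example, an illustrative sketch of the rewrite:

     c = __builtin_cos (x);               t = __builtin_cexpi (x);
     s = __builtin_sin (x);      ==>      c = __real__ t;
                                          s = __imag__ t;

   with the cexpi call placed so that it dominates both original calls.  */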
725 static bool
726 execute_cse_sincos_1 (tree name)
728 gimple_stmt_iterator gsi;
729 imm_use_iterator use_iter;
730 tree fndecl, res, type;
731 gimple def_stmt, use_stmt, stmt;
732 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
733 vec<gimple> stmts = vNULL;
734 basic_block top_bb = NULL;
735 int i;
736 bool cfg_changed = false;
738 type = TREE_TYPE (name);
739 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
741 if (gimple_code (use_stmt) != GIMPLE_CALL
742 || !gimple_call_lhs (use_stmt)
743 || !(fndecl = gimple_call_fndecl (use_stmt))
744 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
745 continue;
747 switch (DECL_FUNCTION_CODE (fndecl))
749 CASE_FLT_FN (BUILT_IN_COS):
750 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
751 break;
753 CASE_FLT_FN (BUILT_IN_SIN):
754 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
755 break;
757 CASE_FLT_FN (BUILT_IN_CEXPI):
758 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
759 break;
761 default:;
765 if (seen_cos + seen_sin + seen_cexpi <= 1)
767 stmts.release ();
768 return false;
771 /* Simply insert cexpi at the beginning of top_bb but not earlier than
772 the name def statement. */
773 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
774 if (!fndecl)
775 return false;
776 stmt = gimple_build_call (fndecl, 1, name);
777 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
778 gimple_call_set_lhs (stmt, res);
780 def_stmt = SSA_NAME_DEF_STMT (name);
781 if (!SSA_NAME_IS_DEFAULT_DEF (name)
782 && gimple_code (def_stmt) != GIMPLE_PHI
783 && gimple_bb (def_stmt) == top_bb)
785 gsi = gsi_for_stmt (def_stmt);
786 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
788 else
790 gsi = gsi_after_labels (top_bb);
791 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
793 sincos_stats.inserted++;
795 /* And adjust the recorded old call sites. */
796 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
798 tree rhs = NULL;
799 fndecl = gimple_call_fndecl (use_stmt);
801 switch (DECL_FUNCTION_CODE (fndecl))
803 CASE_FLT_FN (BUILT_IN_COS):
804 rhs = fold_build1 (REALPART_EXPR, type, res);
805 break;
807 CASE_FLT_FN (BUILT_IN_SIN):
808 rhs = fold_build1 (IMAGPART_EXPR, type, res);
809 break;
811 CASE_FLT_FN (BUILT_IN_CEXPI):
812 rhs = res;
813 break;
815 default:;
816 gcc_unreachable ();
819 /* Replace call with a copy. */
820 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
822 gsi = gsi_for_stmt (use_stmt);
823 gsi_replace (&gsi, stmt, true);
824 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
825 cfg_changed = true;
828 stmts.release ();
830 return cfg_changed;
833 /* To evaluate powi(x,n), the floating point value x raised to the
834 constant integer exponent n, we use a hybrid algorithm that
835 combines the "window method" with look-up tables. For an
836 introduction to exponentiation algorithms and "addition chains",
837 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
838 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
839 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
840 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
842 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
843 multiplications to inline before calling the system library's pow
844 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
845 so this default never requires calling pow, powf or powl. */
847 #ifndef POWI_MAX_MULTS
848 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
849 #endif
851 /* The size of the "optimal power tree" lookup table. All
852 exponents less than this value are simply looked up in the
853 powi_table below. This threshold is also used to size the
854 cache of pseudo registers that hold intermediate results. */
855 #define POWI_TABLE_SIZE 256
 857 /* The size, in bits, of the window used in the "window method"
858 exponentiation algorithm. This is equivalent to a radix of
859 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
860 #define POWI_WINDOW_SIZE 3
862 /* The following table is an efficient representation of an
863 "optimal power tree". For each value, i, the corresponding
 864    value, j, in the table states that an optimal evaluation
865 sequence for calculating pow(x,i) can be found by evaluating
866 pow(x,j)*pow(x,i-j). An optimal power tree for the first
867 100 integers is given in Knuth's "Seminumerical algorithms". */
869 static const unsigned char powi_table[POWI_TABLE_SIZE] =
871 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
872 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
873 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
874 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
875 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
876 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
877 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
878 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
879 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
880 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
881 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
882 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
883 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
884 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
885 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
886 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
887 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
888 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
889 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
890 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
891 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
892 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
893 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
894 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
895 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
896 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
897 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
898 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
899 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
900 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
901 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
902 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
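/* A worked example of using the table above (illustrative): powi_table[5]
   is 3, so pow(x,5) is evaluated as pow(x,3) * pow(x,2), which expands to

     t2 = x * x;
     t3 = t2 * x;
     t5 = t3 * t2;

   i.e. three multiplications, matching what powi_cost below computes.  */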
906 /* Return the number of multiplications required to calculate
907 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
908 subroutine of powi_cost. CACHE is an array indicating
909 which exponents have already been calculated. */
911 static int
912 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
914 /* If we've already calculated this exponent, then this evaluation
915 doesn't require any additional multiplications. */
916 if (cache[n])
917 return 0;
919 cache[n] = true;
920 return powi_lookup_cost (n - powi_table[n], cache)
921 + powi_lookup_cost (powi_table[n], cache) + 1;
924 /* Return the number of multiplications required to calculate
925 powi(x,n) for an arbitrary x, given the exponent N. This
926 function needs to be kept in sync with powi_as_mults below. */
928 static int
929 powi_cost (HOST_WIDE_INT n)
931 bool cache[POWI_TABLE_SIZE];
932 unsigned HOST_WIDE_INT digit;
933 unsigned HOST_WIDE_INT val;
934 int result;
936 if (n == 0)
937 return 0;
939 /* Ignore the reciprocal when calculating the cost. */
940 val = (n < 0) ? -n : n;
942 /* Initialize the exponent cache. */
943 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
944 cache[1] = true;
946 result = 0;
948 while (val >= POWI_TABLE_SIZE)
950 if (val & 1)
952 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
953 result += powi_lookup_cost (digit, cache)
954 + POWI_WINDOW_SIZE + 1;
955 val >>= POWI_WINDOW_SIZE;
957 else
959 val >>= 1;
960 result++;
964 return result + powi_lookup_cost (val, cache);
967 /* Recursive subroutine of powi_as_mults. This function takes the
968 array, CACHE, of already calculated exponents and an exponent N and
969 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
971 static tree
972 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
973 HOST_WIDE_INT n, tree *cache)
975 tree op0, op1, ssa_target;
976 unsigned HOST_WIDE_INT digit;
977 gimple mult_stmt;
979 if (n < POWI_TABLE_SIZE && cache[n])
980 return cache[n];
982 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
984 if (n < POWI_TABLE_SIZE)
986 cache[n] = ssa_target;
987 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
988 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
990 else if (n & 1)
992 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
993 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
994 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
996 else
998 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
999 op1 = op0;
1002 mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
1003 gimple_set_location (mult_stmt, loc);
1004 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1006 return ssa_target;
1009 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1010 This function needs to be kept in sync with powi_cost above. */
1012 static tree
1013 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1014 tree arg0, HOST_WIDE_INT n)
1016 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1017 gimple div_stmt;
1018 tree target;
1020 if (n == 0)
1021 return build_real (type, dconst1);
1023 memset (cache, 0, sizeof (cache));
1024 cache[1] = arg0;
1026 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1027 if (n >= 0)
1028 return result;
1030 /* If the original exponent was negative, reciprocate the result. */
1031 target = make_temp_ssa_name (type, NULL, "powmult");
1032 div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1033 build_real (type, dconst1),
1034 result);
1035 gimple_set_location (div_stmt, loc);
1036 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1038 return target;
1041 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1042 location info LOC. If the arguments are appropriate, create an
1043 equivalent sequence of statements prior to GSI using an optimal
 1044    number of multiplications, and return an expression holding the
1045 result. */
1047 static tree
1048 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1049 tree arg0, HOST_WIDE_INT n)
1051 /* Avoid largest negative number. */
1052 if (n != -n
1053 && ((n >= -1 && n <= 2)
1054 || (optimize_function_for_speed_p (cfun)
1055 && powi_cost (n) <= POWI_MAX_MULTS)))
1056 return powi_as_mults (gsi, loc, arg0, n);
1058 return NULL_TREE;
1061 /* Build a gimple call statement that calls FN with argument ARG.
1062 Set the lhs of the call statement to a fresh SSA name. Insert the
1063 statement prior to GSI's current position, and return the fresh
1064 SSA name. */
1066 static tree
1067 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1068 tree fn, tree arg)
1070 gimple call_stmt;
1071 tree ssa_target;
1073 call_stmt = gimple_build_call (fn, 1, arg);
1074 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1075 gimple_set_lhs (call_stmt, ssa_target);
1076 gimple_set_location (call_stmt, loc);
1077 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1079 return ssa_target;
1082 /* Build a gimple binary operation with the given CODE and arguments
1083 ARG0, ARG1, assigning the result to a new SSA name for variable
1084 TARGET. Insert the statement prior to GSI's current position, and
 1085    return the fresh SSA name.  */
1087 static tree
1088 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1089 const char *name, enum tree_code code,
1090 tree arg0, tree arg1)
1092 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1093 gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1094 gimple_set_location (stmt, loc);
1095 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1096 return result;
1099 /* Build a gimple reference operation with the given CODE and argument
1100 ARG, assigning the result to a new SSA name of TYPE with NAME.
1101 Insert the statement prior to GSI's current position, and return
1102 the fresh SSA name. */
1104 static inline tree
1105 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1106 const char *name, enum tree_code code, tree arg0)
1108 tree result = make_temp_ssa_name (type, NULL, name);
1109 gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1110 gimple_set_location (stmt, loc);
1111 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1112 return result;
1115 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1116 prior to GSI's current position, and return the fresh SSA name. */
1118 static tree
1119 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1120 tree type, tree val)
1122 tree result = make_ssa_name (type, NULL);
1123 gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1124 gimple_set_location (stmt, loc);
1125 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1126 return result;
1129 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1130 with location info LOC. If possible, create an equivalent and
1131 less expensive sequence of statements prior to GSI, and return an
 1132    expression holding the result.  */
1134 static tree
1135 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1136 tree arg0, tree arg1)
1138 REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1139 REAL_VALUE_TYPE c2, dconst3;
1140 HOST_WIDE_INT n;
1141 tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1142 enum machine_mode mode;
1143 bool hw_sqrt_exists, c_is_int, c2_is_int;
1145 /* If the exponent isn't a constant, there's nothing of interest
1146 to be done. */
1147 if (TREE_CODE (arg1) != REAL_CST)
1148 return NULL_TREE;
1150 /* If the exponent is equivalent to an integer, expand to an optimal
1151 multiplication sequence when profitable. */
1152 c = TREE_REAL_CST (arg1);
1153 n = real_to_integer (&c);
1154 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1155 c_is_int = real_identical (&c, &cint);
1157 if (c_is_int
1158 && ((n >= -1 && n <= 2)
1159 || (flag_unsafe_math_optimizations
1160 && optimize_insn_for_speed_p ()
1161 && powi_cost (n) <= POWI_MAX_MULTS)))
1162 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1164 /* Attempt various optimizations using sqrt and cbrt. */
1165 type = TREE_TYPE (arg0);
1166 mode = TYPE_MODE (type);
1167 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1169 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1170 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1171 sqrt(-0) = -0. */
1172 if (sqrtfn
1173 && REAL_VALUES_EQUAL (c, dconsthalf)
1174 && !HONOR_SIGNED_ZEROS (mode))
1175 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1177 /* Optimize pow(x,0.25) = sqrt(sqrt(x)). Assume on most machines that
1178 a builtin sqrt instruction is smaller than a call to pow with 0.25,
1179 so do this optimization even if -Os. Don't do this optimization
1180 if we don't have a hardware sqrt insn. */
1181 dconst1_4 = dconst1;
1182 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1183 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1185 if (flag_unsafe_math_optimizations
1186 && sqrtfn
1187 && REAL_VALUES_EQUAL (c, dconst1_4)
1188 && hw_sqrt_exists)
1190 /* sqrt(x) */
1191 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1193 /* sqrt(sqrt(x)) */
1194 return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1197 /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1198 optimizing for space. Don't do this optimization if we don't have
1199 a hardware sqrt insn. */
1200 real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1201 SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1203 if (flag_unsafe_math_optimizations
1204 && sqrtfn
1205 && optimize_function_for_speed_p (cfun)
1206 && REAL_VALUES_EQUAL (c, dconst3_4)
1207 && hw_sqrt_exists)
1209 /* sqrt(x) */
1210 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1212 /* sqrt(sqrt(x)) */
1213 sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1215 /* sqrt(x) * sqrt(sqrt(x)) */
1216 return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1217 sqrt_arg0, sqrt_sqrt);
1220 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1221 optimizations since 1./3. is not exactly representable. If x
1222 is negative and finite, the correct value of pow(x,1./3.) is
1223 a NaN with the "invalid" exception raised, because the value
1224 of 1./3. actually has an even denominator. The correct value
1225 of cbrt(x) is a negative real value. */
1226 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1227 dconst1_3 = real_value_truncate (mode, dconst_third ());
1229 if (flag_unsafe_math_optimizations
1230 && cbrtfn
1231 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1232 && REAL_VALUES_EQUAL (c, dconst1_3))
1233 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1235 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1236 if we don't have a hardware sqrt insn. */
1237 dconst1_6 = dconst1_3;
1238 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1240 if (flag_unsafe_math_optimizations
1241 && sqrtfn
1242 && cbrtfn
1243 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1244 && optimize_function_for_speed_p (cfun)
1245 && hw_sqrt_exists
1246 && REAL_VALUES_EQUAL (c, dconst1_6))
1248 /* sqrt(x) */
1249 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1251 /* cbrt(sqrt(x)) */
1252 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1255 /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1256 and c not an integer, into
1258 sqrt(x) * powi(x, n/2), n > 0;
1259 1.0 / (sqrt(x) * powi(x, abs(n/2))), n < 0.
1261 Do not calculate the powi factor when n/2 = 0. */
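  /* For instance (illustrative): for pow (x, 2.5) we get n = 5, so the
     call becomes sqrt (x) * powi (x, 2), i.e. sqrt (x) * x * x.  */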
1262 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1263 n = real_to_integer (&c2);
1264 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1265 c2_is_int = real_identical (&c2, &cint);
1267 if (flag_unsafe_math_optimizations
1268 && sqrtfn
1269 && c2_is_int
1270 && !c_is_int
1271 && optimize_function_for_speed_p (cfun))
1273 tree powi_x_ndiv2 = NULL_TREE;
1275 /* Attempt to fold powi(arg0, abs(n/2)) into multiplies. If not
1276 possible or profitable, give up. Skip the degenerate case when
1277 n is 1 or -1, where the result is always 1. */
1278 if (absu_hwi (n) != 1)
1280 powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1281 abs_hwi (n / 2));
1282 if (!powi_x_ndiv2)
1283 return NULL_TREE;
1286 /* Calculate sqrt(x). When n is not 1 or -1, multiply it by the
1287 result of the optimal multiply sequence just calculated. */
1288 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1290 if (absu_hwi (n) == 1)
1291 result = sqrt_arg0;
1292 else
1293 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1294 sqrt_arg0, powi_x_ndiv2);
1296 /* If n is negative, reciprocate the result. */
1297 if (n < 0)
1298 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1299 build_real (type, dconst1), result);
1300 return result;
1303 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1305 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1306 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1308 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1309 different from pow(x, 1./3.) due to rounding and behavior with
1310 negative x, we need to constrain this transformation to unsafe
1311 math and positive x or finite math. */
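  /* For instance (illustrative): for pow (x, 4./3.) we get n = 4, so the
     call becomes powi (x, 1) * cbrt (x), i.e. x * cbrt (x).  */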
1312 real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1313 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1314 real_round (&c2, mode, &c2);
1315 n = real_to_integer (&c2);
1316 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1317 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1318 real_convert (&c2, mode, &c2);
1320 if (flag_unsafe_math_optimizations
1321 && cbrtfn
1322 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1323 && real_identical (&c2, &c)
1324 && !c2_is_int
1325 && optimize_function_for_speed_p (cfun)
1326 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1328 tree powi_x_ndiv3 = NULL_TREE;
1330 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1331 possible or profitable, give up. Skip the degenerate case when
1332 abs(n) < 3, where the result is always 1. */
1333 if (absu_hwi (n) >= 3)
1335 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1336 abs_hwi (n / 3));
1337 if (!powi_x_ndiv3)
1338 return NULL_TREE;
1341 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1342 as that creates an unnecessary variable. Instead, just produce
1343 either cbrt(x) or cbrt(x) * cbrt(x). */
1344 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1346 if (absu_hwi (n) % 3 == 1)
1347 powi_cbrt_x = cbrt_x;
1348 else
1349 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1350 cbrt_x, cbrt_x);
1352 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1353 if (absu_hwi (n) < 3)
1354 result = powi_cbrt_x;
1355 else
1356 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1357 powi_x_ndiv3, powi_cbrt_x);
1359 /* If n is negative, reciprocate the result. */
1360 if (n < 0)
1361 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1362 build_real (type, dconst1), result);
1364 return result;
1367 /* No optimizations succeeded. */
1368 return NULL_TREE;
1371 /* ARG is the argument to a cabs builtin call in GSI with location info
1372 LOC. Create a sequence of statements prior to GSI that calculates
1373 sqrt(R*R + I*I), where R and I are the real and imaginary components
1374 of ARG, respectively. Return an expression holding the result. */
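/* For example (illustrative), under -funsafe-math-optimizations

     d = __builtin_cabs (z);

   becomes

     r = __real__ z;
     i = __imag__ z;
     d = sqrt (r*r + i*i);

   Unsafe math is required because the straightforward formula can
   overflow or underflow in the squares where cabs itself need not.  */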
1376 static tree
1377 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1379 tree real_part, imag_part, addend1, addend2, sum, result;
1380 tree type = TREE_TYPE (TREE_TYPE (arg));
1381 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1382 enum machine_mode mode = TYPE_MODE (type);
1384 if (!flag_unsafe_math_optimizations
1385 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1386 || !sqrtfn
1387 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1388 return NULL_TREE;
1390 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1391 REALPART_EXPR, arg);
1392 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1393 real_part, real_part);
1394 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1395 IMAGPART_EXPR, arg);
1396 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1397 imag_part, imag_part);
1398 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1399 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1401 return result;
1404 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1405 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1406 an optimal number of multiplies, when n is a constant. */
1408 static unsigned int
1409 execute_cse_sincos (void)
1411 basic_block bb;
1412 bool cfg_changed = false;
1414 calculate_dominance_info (CDI_DOMINATORS);
1415 memset (&sincos_stats, 0, sizeof (sincos_stats));
1417 FOR_EACH_BB (bb)
1419 gimple_stmt_iterator gsi;
1420 bool cleanup_eh = false;
1422 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1424 gimple stmt = gsi_stmt (gsi);
1425 tree fndecl;
 1427          /* Only the last stmt in a bb could throw; no need to call
1428 gimple_purge_dead_eh_edges if we change something in the middle
1429 of a basic block. */
1430 cleanup_eh = false;
1432 if (is_gimple_call (stmt)
1433 && gimple_call_lhs (stmt)
1434 && (fndecl = gimple_call_fndecl (stmt))
1435 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1437 tree arg, arg0, arg1, result;
1438 HOST_WIDE_INT n;
1439 location_t loc;
1441 switch (DECL_FUNCTION_CODE (fndecl))
1443 CASE_FLT_FN (BUILT_IN_COS):
1444 CASE_FLT_FN (BUILT_IN_SIN):
1445 CASE_FLT_FN (BUILT_IN_CEXPI):
1446 /* Make sure we have either sincos or cexp. */
1447 if (!targetm.libc_has_function (function_c99_math_complex)
1448 && !targetm.libc_has_function (function_sincos))
1449 break;
1451 arg = gimple_call_arg (stmt, 0);
1452 if (TREE_CODE (arg) == SSA_NAME)
1453 cfg_changed |= execute_cse_sincos_1 (arg);
1454 break;
1456 CASE_FLT_FN (BUILT_IN_POW):
1457 arg0 = gimple_call_arg (stmt, 0);
1458 arg1 = gimple_call_arg (stmt, 1);
1460 loc = gimple_location (stmt);
1461 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1463 if (result)
1465 tree lhs = gimple_get_lhs (stmt);
1466 gimple new_stmt = gimple_build_assign (lhs, result);
1467 gimple_set_location (new_stmt, loc);
1468 unlink_stmt_vdef (stmt);
1469 gsi_replace (&gsi, new_stmt, true);
1470 cleanup_eh = true;
1471 if (gimple_vdef (stmt))
1472 release_ssa_name (gimple_vdef (stmt));
1474 break;
1476 CASE_FLT_FN (BUILT_IN_POWI):
1477 arg0 = gimple_call_arg (stmt, 0);
1478 arg1 = gimple_call_arg (stmt, 1);
1479 loc = gimple_location (stmt);
1481 if (real_minus_onep (arg0))
1483 tree t0, t1, cond, one, minus_one;
1484 gimple stmt;
1486 t0 = TREE_TYPE (arg0);
1487 t1 = TREE_TYPE (arg1);
1488 one = build_real (t0, dconst1);
1489 minus_one = build_real (t0, dconstm1);
1491 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1492 stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, cond,
1493 arg1,
1494 build_int_cst (t1,
1495 1));
1496 gimple_set_location (stmt, loc);
1497 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1499 result = make_temp_ssa_name (t0, NULL, "powi");
1500 stmt = gimple_build_assign_with_ops (COND_EXPR, result,
1501 cond,
1502 minus_one, one);
1503 gimple_set_location (stmt, loc);
1504 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1506 else
1508 if (!tree_fits_shwi_p (arg1))
1509 break;
1511 n = TREE_INT_CST_LOW (arg1);
1512 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1515 if (result)
1517 tree lhs = gimple_get_lhs (stmt);
1518 gimple new_stmt = gimple_build_assign (lhs, result);
1519 gimple_set_location (new_stmt, loc);
1520 unlink_stmt_vdef (stmt);
1521 gsi_replace (&gsi, new_stmt, true);
1522 cleanup_eh = true;
1523 if (gimple_vdef (stmt))
1524 release_ssa_name (gimple_vdef (stmt));
1526 break;
1528 CASE_FLT_FN (BUILT_IN_CABS):
1529 arg0 = gimple_call_arg (stmt, 0);
1530 loc = gimple_location (stmt);
1531 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1533 if (result)
1535 tree lhs = gimple_get_lhs (stmt);
1536 gimple new_stmt = gimple_build_assign (lhs, result);
1537 gimple_set_location (new_stmt, loc);
1538 unlink_stmt_vdef (stmt);
1539 gsi_replace (&gsi, new_stmt, true);
1540 cleanup_eh = true;
1541 if (gimple_vdef (stmt))
1542 release_ssa_name (gimple_vdef (stmt));
1544 break;
1546 default:;
1550 if (cleanup_eh)
1551 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1554 statistics_counter_event (cfun, "sincos statements inserted",
1555 sincos_stats.inserted);
1557 free_dominance_info (CDI_DOMINATORS);
1558 return cfg_changed ? TODO_cleanup_cfg : 0;
1561 static bool
1562 gate_cse_sincos (void)
1564 /* We no longer require either sincos or cexp, since powi expansion
1565 piggybacks on this pass. */
1566 return optimize;
1569 namespace {
1571 const pass_data pass_data_cse_sincos =
1573 GIMPLE_PASS, /* type */
1574 "sincos", /* name */
1575 OPTGROUP_NONE, /* optinfo_flags */
1576 true, /* has_gate */
1577 true, /* has_execute */
1578 TV_NONE, /* tv_id */
1579 PROP_ssa, /* properties_required */
1580 0, /* properties_provided */
1581 0, /* properties_destroyed */
1582 0, /* todo_flags_start */
1583 ( TODO_update_ssa | TODO_verify_ssa
1584 | TODO_verify_stmts ), /* todo_flags_finish */
1587 class pass_cse_sincos : public gimple_opt_pass
1589 public:
1590 pass_cse_sincos (gcc::context *ctxt)
1591 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1594 /* opt_pass methods: */
1595 bool gate () { return gate_cse_sincos (); }
1596 unsigned int execute () { return execute_cse_sincos (); }
1598 }; // class pass_cse_sincos
1600 } // anon namespace
1602 gimple_opt_pass *
1603 make_pass_cse_sincos (gcc::context *ctxt)
1605 return new pass_cse_sincos (ctxt);
1608 /* A symbolic number is used to detect byte permutation and selection
1609 patterns. Therefore the field N contains an artificial number
1610 consisting of byte size markers:
1612 0 - byte has the value 0
 1613    1..size - byte contains the contents of the source byte whose
 1614             zero-based index is that value minus one  */
1616 struct symbolic_number {
1617 unsigned HOST_WIDEST_INT n;
1618 int size;
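/* For example (an illustrative sketch): for a 32-bit operand the initial
   symbolic number is 0x04030201 (lowest order byte marked 1).  A manual
   byte swap such as

     ((x & 0x000000ff) << 24) | ((x & 0x0000ff00) << 8)
       | ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff)

   evaluates symbolically to 0x01020304, which is what find_bswap below
   compares against to recognize a full byte swap.  */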
1621 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1622 number N. Return false if the requested operation is not permitted
1623 on a symbolic number. */
1625 static inline bool
1626 do_shift_rotate (enum tree_code code,
1627 struct symbolic_number *n,
1628 int count)
1630 if (count % 8 != 0)
1631 return false;
1633 /* Zero out the extra bits of N in order to avoid them being shifted
1634 into the significant bits. */
1635 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1636 n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1638 switch (code)
1640 case LSHIFT_EXPR:
1641 n->n <<= count;
1642 break;
1643 case RSHIFT_EXPR:
1644 n->n >>= count;
1645 break;
1646 case LROTATE_EXPR:
1647 n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1648 break;
1649 case RROTATE_EXPR:
1650 n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1651 break;
1652 default:
1653 return false;
1655 /* Zero unused bits for size. */
1656 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1657 n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1658 return true;
1661 /* Perform sanity checking for the symbolic number N and the gimple
1662 statement STMT. */
1664 static inline bool
1665 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1667 tree lhs_type;
1669 lhs_type = gimple_expr_type (stmt);
1671 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1672 return false;
1674 if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1675 return false;
1677 return true;
1680 /* find_bswap_1 invokes itself recursively with N and tries to perform
1681 the operation given by the rhs of STMT on the result. If the
 1682    operation could successfully be executed, the function returns the
1683 tree expression of the source operand and NULL otherwise. */
1685 static tree
1686 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1688 enum tree_code code;
1689 tree rhs1, rhs2 = NULL;
1690 gimple rhs1_stmt, rhs2_stmt;
1691 tree source_expr1;
1692 enum gimple_rhs_class rhs_class;
1694 if (!limit || !is_gimple_assign (stmt))
1695 return NULL_TREE;
1697 rhs1 = gimple_assign_rhs1 (stmt);
1699 if (TREE_CODE (rhs1) != SSA_NAME)
1700 return NULL_TREE;
1702 code = gimple_assign_rhs_code (stmt);
1703 rhs_class = gimple_assign_rhs_class (stmt);
1704 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1706 if (rhs_class == GIMPLE_BINARY_RHS)
1707 rhs2 = gimple_assign_rhs2 (stmt);
1709 /* Handle unary rhs and binary rhs with integer constants as second
1710 operand. */
1712 if (rhs_class == GIMPLE_UNARY_RHS
1713 || (rhs_class == GIMPLE_BINARY_RHS
1714 && TREE_CODE (rhs2) == INTEGER_CST))
1716 if (code != BIT_AND_EXPR
1717 && code != LSHIFT_EXPR
1718 && code != RSHIFT_EXPR
1719 && code != LROTATE_EXPR
1720 && code != RROTATE_EXPR
1721 && code != NOP_EXPR
1722 && code != CONVERT_EXPR)
1723 return NULL_TREE;
1725 source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
 1727       /* If find_bswap_1 returned NULL, STMT is a leaf node and we have
1728 to initialize the symbolic number. */
1729 if (!source_expr1)
1731 /* Set up the symbolic number N by setting each byte to a
1732 value between 1 and the byte size of rhs1. The highest
1733 order byte is set to n->size and the lowest order
1734 byte to 1. */
1735 n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1736 if (n->size % BITS_PER_UNIT != 0)
1737 return NULL_TREE;
1738 n->size /= BITS_PER_UNIT;
1739 n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1740 (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1742 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1743 n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1744 (n->size * BITS_PER_UNIT)) - 1;
1746 source_expr1 = rhs1;
1749 switch (code)
1751 case BIT_AND_EXPR:
1753 int i;
1754 unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1755 unsigned HOST_WIDEST_INT tmp = val;
1757 /* Only constants masking full bytes are allowed. */
1758 for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1759 if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1760 return NULL_TREE;
1762 n->n &= val;
1764 break;
1765 case LSHIFT_EXPR:
1766 case RSHIFT_EXPR:
1767 case LROTATE_EXPR:
1768 case RROTATE_EXPR:
1769 if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1770 return NULL_TREE;
1771 break;
1772 CASE_CONVERT:
1774 int type_size;
1776 type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1777 if (type_size % BITS_PER_UNIT != 0)
1778 return NULL_TREE;
1780 if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1782 /* If STMT casts to a smaller type mask out the bits not
1783 belonging to the target type. */
1784 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1786 n->size = type_size / BITS_PER_UNIT;
1788 break;
1789 default:
1790 return NULL_TREE;
1792 return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1795 /* Handle binary rhs. */
1797 if (rhs_class == GIMPLE_BINARY_RHS)
1799 struct symbolic_number n1, n2;
1800 tree source_expr2;
1802 if (code != BIT_IOR_EXPR)
1803 return NULL_TREE;
1805 if (TREE_CODE (rhs2) != SSA_NAME)
1806 return NULL_TREE;
1808 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1810 switch (code)
1812 case BIT_IOR_EXPR:
1813 source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1815 if (!source_expr1)
1816 return NULL_TREE;
1818 source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1820 if (source_expr1 != source_expr2
1821 || n1.size != n2.size)
1822 return NULL_TREE;
1824 n->size = n1.size;
1825 n->n = n1.n | n2.n;
1827 if (!verify_symbolic_number_p (n, stmt))
1828 return NULL_TREE;
1830 break;
1831 default:
1832 return NULL_TREE;
1834 return source_expr1;
1836 return NULL_TREE;
1839 /* Check if STMT completes a bswap implementation consisting of ORs,
1840 SHIFTs and ANDs. Return the source tree expression on which the
1841 byte swap is performed and NULL if no bswap was found. */
1843 static tree
1844 find_bswap (gimple stmt)
1846 /* The number which the find_bswap result should match in order to
 1847      have a full byte swap.  The number is shifted to the right according
1848 to the size of the symbolic number before using it. */
1849 unsigned HOST_WIDEST_INT cmp =
1850 sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1851 (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1853 struct symbolic_number n;
1854 tree source_expr;
1855 int limit;
1857 /* The last parameter determines the depth search limit. It usually
1858 correlates directly to the number of bytes to be touched. We
1859 increase that number by three here in order to also
 1860      cover signed -> unsigned conversions of the src operand as can be seen
1861 in libgcc, and for initial shift/and operation of the src operand. */
1862 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1863 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1864 source_expr = find_bswap_1 (stmt, &n, limit);
1866 if (!source_expr)
1867 return NULL_TREE;
1869 /* Zero out the extra bits of N and CMP. */
1870 if (n.size < (int)sizeof (HOST_WIDEST_INT))
1872 unsigned HOST_WIDEST_INT mask =
1873 ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1875 n.n &= mask;
1876 cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1879 /* A complete byte swap should make the symbolic number start
1880 with the largest digit in the highest order byte. */
1881 if (cmp != n.n)
1882 return NULL_TREE;
1884 return source_expr;
1887 /* Find manual byte swap implementations and turn them into a bswap
1888 builtin invocation. */
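/* Roughly (a sketch, not the exact GIMPLE produced), once find_bswap has
   matched the statement computing the final OR of a pattern like the
   32-bit sketch above, the whole sequence is replaced by a single call

     x = __builtin_bswap32 (src);

   with conversions of SRC and of the result inserted as needed so the
   types match the builtin's argument and return type.  The now-dead
   shifts, ANDs and ORs are left for later DCE.  */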
1890 static unsigned int
1891 execute_optimize_bswap (void)
1893 basic_block bb;
1894 bool bswap16_p, bswap32_p, bswap64_p;
1895 bool changed = false;
1896 tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1898 if (BITS_PER_UNIT != 8)
1899 return 0;
1901 if (sizeof (HOST_WIDEST_INT) < 8)
1902 return 0;
1904 bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
1905 && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
1906 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1907 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1908 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1909 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1910 || (bswap32_p && word_mode == SImode)));
1912 if (!bswap16_p && !bswap32_p && !bswap64_p)
1913 return 0;
1915 /* Determine the argument type of the builtins. The code later on
1916 assumes that the return and argument type are the same. */
1917 if (bswap16_p)
1919 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1920 bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1923 if (bswap32_p)
1925 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1926 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1929 if (bswap64_p)
1931 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1932 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1935 memset (&bswap_stats, 0, sizeof (bswap_stats));
1937 FOR_EACH_BB (bb)
1939 gimple_stmt_iterator gsi;
1941 /* We do a reverse scan for bswap patterns to make sure we get the
1942 widest match. As bswap pattern matching doesn't handle
1943 previously inserted smaller bswap replacements as sub-
1944 patterns, the wider variant wouldn't be detected. */
1945 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1947 gimple stmt = gsi_stmt (gsi);
1948 tree bswap_src, bswap_type;
1949 tree bswap_tmp;
1950 tree fndecl = NULL_TREE;
1951 int type_size;
1952 gimple call;
1954 if (!is_gimple_assign (stmt)
1955 || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1956 continue;
1958 type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1960 switch (type_size)
1962 case 16:
1963 if (bswap16_p)
1965 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1966 bswap_type = bswap16_type;
1968 break;
1969 case 32:
1970 if (bswap32_p)
1972 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1973 bswap_type = bswap32_type;
1975 break;
1976 case 64:
1977 if (bswap64_p)
1979 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1980 bswap_type = bswap64_type;
1982 break;
1983 default:
1984 continue;
1987 if (!fndecl)
1988 continue;
1990 bswap_src = find_bswap (stmt);
1992 if (!bswap_src)
1993 continue;
1995 changed = true;
1996 if (type_size == 16)
1997 bswap_stats.found_16bit++;
1998 else if (type_size == 32)
1999 bswap_stats.found_32bit++;
2000 else
2001 bswap_stats.found_64bit++;
2003 bswap_tmp = bswap_src;
2005 /* Convert the src expression if necessary. */
2006 if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2008 gimple convert_stmt;
2009 bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2010 convert_stmt = gimple_build_assign_with_ops
2011 (NOP_EXPR, bswap_tmp, bswap_src, NULL);
2012 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2015 call = gimple_build_call (fndecl, 1, bswap_tmp);
2017 bswap_tmp = gimple_assign_lhs (stmt);
2019 /* Convert the result if necessary. */
2020 if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2022 gimple convert_stmt;
2023 bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2024 convert_stmt = gimple_build_assign_with_ops
2025 (NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
2026 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2029 gimple_call_set_lhs (call, bswap_tmp);
2031 if (dump_file)
2033 fprintf (dump_file, "%d bit bswap implementation found at: ",
2034 (int)type_size);
2035 print_gimple_stmt (dump_file, stmt, 0, 0);
2038 gsi_insert_after (&gsi, call, GSI_SAME_STMT);
2039 gsi_remove (&gsi, true);
2043 statistics_counter_event (cfun, "16-bit bswap implementations found",
2044 bswap_stats.found_16bit);
2045 statistics_counter_event (cfun, "32-bit bswap implementations found",
2046 bswap_stats.found_32bit);
2047 statistics_counter_event (cfun, "64-bit bswap implementations found",
2048 bswap_stats.found_64bit);
2050 return (changed ? TODO_update_ssa | TODO_verify_ssa
2051 | TODO_verify_stmts : 0);
2054 static bool
2055 gate_optimize_bswap (void)
2057 return flag_expensive_optimizations && optimize;
2060 namespace {
2062 const pass_data pass_data_optimize_bswap =
2064 GIMPLE_PASS, /* type */
2065 "bswap", /* name */
2066 OPTGROUP_NONE, /* optinfo_flags */
2067 true, /* has_gate */
2068 true, /* has_execute */
2069 TV_NONE, /* tv_id */
2070 PROP_ssa, /* properties_required */
2071 0, /* properties_provided */
2072 0, /* properties_destroyed */
2073 0, /* todo_flags_start */
2074 0, /* todo_flags_finish */
2077 class pass_optimize_bswap : public gimple_opt_pass
2079 public:
2080 pass_optimize_bswap (gcc::context *ctxt)
2081 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2084 /* opt_pass methods: */
2085 bool gate () { return gate_optimize_bswap (); }
2086 unsigned int execute () { return execute_optimize_bswap (); }
2088 }; // class pass_optimize_bswap
2090 } // anon namespace
2092 gimple_opt_pass *
2093 make_pass_optimize_bswap (gcc::context *ctxt)
2095 return new pass_optimize_bswap (ctxt);
2098 /* Return true if stmt is a type conversion operation that can be stripped
2099 when used in a widening multiply operation. */
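/* For illustration (a sketch assuming 16-bit short and 32-bit int): in

     short a, b;
     int prod = (int) a * (int) b;

   the two conversions to int exist only to give the multiplication its
   result width.  They satisfy the conditions below and can be stripped,
   letting the multiplication be turned into a WIDEN_MULT_EXPR that
   extends the short operands as a by-product.  */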
2100 static bool
2101 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2103 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2105 if (TREE_CODE (result_type) == INTEGER_TYPE)
2107 tree op_type;
2108 tree inner_op_type;
2110 if (!CONVERT_EXPR_CODE_P (rhs_code))
2111 return false;
2113 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2115 /* If the type of OP has the same precision as the result, then
2116 we can strip this conversion. The multiply operation will be
2117 selected to create the correct extension as a by-product. */
2118 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2119 return true;
2121 /* We can also strip a conversion if it preserves the signed-ness of
2122 the operation and doesn't narrow the range. */
2123 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2125 /* If the inner-most type is unsigned, then we can strip any
2126 intermediate widening operation. If it's signed, then the
2127 intermediate widening operation must also be signed. */
2128 if ((TYPE_UNSIGNED (inner_op_type)
2129 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2130 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2131 return true;
2133 return false;
2136 return rhs_code == FIXED_CONVERT_EXPR;
2139 /* Return true if RHS is a suitable operand for a widening multiplication,
2140 assuming a target type of TYPE.
2141 There are two cases:
2143 - RHS widens some value to at least twice its width. Store that value
2144 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2146 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2147 and store NULL in *TYPE_OUT. */
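/* Sketch of the two accepted shapes, for a 32-bit target type (the SSA
   name s_1 below is purely hypothetical):

     - an SSA name that, after stripping a conversion, has a type at most
       half as wide, e.g. the (int) s_1 operand of a multiply where s_1
       has a 16-bit type; *NEW_RHS_OUT then holds s_1 and *TYPE_OUT its
       16-bit type;

     - a bare INTEGER_CST such as 123; *NEW_RHS_OUT holds the constant,
       *TYPE_OUT is NULL, and the caller checks that the value fits the
       type of the other operand.  */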
2149 static bool
2150 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2151 tree *new_rhs_out)
2153 gimple stmt;
2154 tree type1, rhs1;
2156 if (TREE_CODE (rhs) == SSA_NAME)
2158 stmt = SSA_NAME_DEF_STMT (rhs);
2159 if (is_gimple_assign (stmt))
2161 if (! widening_mult_conversion_strippable_p (type, stmt))
2162 rhs1 = rhs;
2163 else
2165 rhs1 = gimple_assign_rhs1 (stmt);
2167 if (TREE_CODE (rhs1) == INTEGER_CST)
2169 *new_rhs_out = rhs1;
2170 *type_out = NULL;
2171 return true;
2175 else
2176 rhs1 = rhs;
2178 type1 = TREE_TYPE (rhs1);
2180 if (TREE_CODE (type1) != TREE_CODE (type)
2181 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2182 return false;
2184 *new_rhs_out = rhs1;
2185 *type_out = type1;
2186 return true;
2189 if (TREE_CODE (rhs) == INTEGER_CST)
2191 *new_rhs_out = rhs;
2192 *type_out = NULL;
2193 return true;
2196 return false;
2199 /* Return true if STMT performs a widening multiplication, assuming the
2200 output type is TYPE. If so, store the unwidened types of the operands
2201 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2202 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2203 and *TYPE2_OUT would give the operands of the multiplication. */
2205 static bool
2206 is_widening_mult_p (gimple stmt,
2207 tree *type1_out, tree *rhs1_out,
2208 tree *type2_out, tree *rhs2_out)
2210 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2212 if (TREE_CODE (type) != INTEGER_TYPE
2213 && TREE_CODE (type) != FIXED_POINT_TYPE)
2214 return false;
2216 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2217 rhs1_out))
2218 return false;
2220 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2221 rhs2_out))
2222 return false;
2224 if (*type1_out == NULL)
2226 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2227 return false;
2228 *type1_out = *type2_out;
2231 if (*type2_out == NULL)
2233 if (!int_fits_type_p (*rhs2_out, *type1_out))
2234 return false;
2235 *type2_out = *type1_out;
2238 /* Ensure that the larger of the two operands comes first. */
2239 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2241 tree tmp;
2242 tmp = *type1_out;
2243 *type1_out = *type2_out;
2244 *type2_out = tmp;
2245 tmp = *rhs1_out;
2246 *rhs1_out = *rhs2_out;
2247 *rhs2_out = tmp;
2250 return true;
2253 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2254 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2255 value is true iff we converted the statement. */
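/* A source-level sketch of the transformation (assuming 32-bit int and
   64-bit long long):

     int a, b;
     long long r = (long long) a * (long long) b;

   The MULT_EXPR computing R is rewritten in place into a WIDEN_MULT_EXPR
   taking A and B directly, provided the target has a suitable widening
   multiply optab handler for the mode pair.  */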
2257 static bool
2258 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2260 tree lhs, rhs1, rhs2, type, type1, type2;
2261 enum insn_code handler;
2262 enum machine_mode to_mode, from_mode, actual_mode;
2263 optab op;
2264 int actual_precision;
2265 location_t loc = gimple_location (stmt);
2266 bool from_unsigned1, from_unsigned2;
2268 lhs = gimple_assign_lhs (stmt);
2269 type = TREE_TYPE (lhs);
2270 if (TREE_CODE (type) != INTEGER_TYPE)
2271 return false;
2273 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2274 return false;
2276 to_mode = TYPE_MODE (type);
2277 from_mode = TYPE_MODE (type1);
2278 from_unsigned1 = TYPE_UNSIGNED (type1);
2279 from_unsigned2 = TYPE_UNSIGNED (type2);
2281 if (from_unsigned1 && from_unsigned2)
2282 op = umul_widen_optab;
2283 else if (!from_unsigned1 && !from_unsigned2)
2284 op = smul_widen_optab;
2285 else
2286 op = usmul_widen_optab;
2288 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2289 0, &actual_mode);
2291 if (handler == CODE_FOR_nothing)
2293 if (op != smul_widen_optab)
2295 /* We can use a signed multiply with unsigned types as long as
2296 there is a wider mode to use, or it is the smaller of the two
2297 types that is unsigned. Note that type1 >= type2, always. */
2298 if ((TYPE_UNSIGNED (type1)
2299 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2300 || (TYPE_UNSIGNED (type2)
2301 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2303 from_mode = GET_MODE_WIDER_MODE (from_mode);
2304 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2305 return false;
2308 op = smul_widen_optab;
2309 handler = find_widening_optab_handler_and_mode (op, to_mode,
2310 from_mode, 0,
2311 &actual_mode);
2313 if (handler == CODE_FOR_nothing)
2314 return false;
2316 from_unsigned1 = from_unsigned2 = false;
2318 else
2319 return false;
2322 /* Ensure that the inputs to the handler are in the correct precision
2323 for the opcode. This will be the full mode size. */
2324 actual_precision = GET_MODE_PRECISION (actual_mode);
2325 if (2 * actual_precision > TYPE_PRECISION (type))
2326 return false;
2327 if (actual_precision != TYPE_PRECISION (type1)
2328 || from_unsigned1 != TYPE_UNSIGNED (type1))
2329 rhs1 = build_and_insert_cast (gsi, loc,
2330 build_nonstandard_integer_type
2331 (actual_precision, from_unsigned1), rhs1);
2332 if (actual_precision != TYPE_PRECISION (type2)
2333 || from_unsigned2 != TYPE_UNSIGNED (type2))
2334 rhs2 = build_and_insert_cast (gsi, loc,
2335 build_nonstandard_integer_type
2336 (actual_precision, from_unsigned2), rhs2);
2338 /* Handle constants. */
2339 if (TREE_CODE (rhs1) == INTEGER_CST)
2340 rhs1 = fold_convert (type1, rhs1);
2341 if (TREE_CODE (rhs2) == INTEGER_CST)
2342 rhs2 = fold_convert (type2, rhs2);
2344 gimple_assign_set_rhs1 (stmt, rhs1);
2345 gimple_assign_set_rhs2 (stmt, rhs2);
2346 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2347 update_stmt (stmt);
2348 widen_mul_stats.widen_mults_inserted++;
2349 return true;
2352 /* Process a single gimple statement STMT, which is found at the
2353 iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2354 rhs (given by CODE), and try to convert it into a
2355 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
2356 is true iff we converted the statement. */
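/* A source-level sketch (same type assumptions as above):

     int a, b;
     long long acc;
     acc = acc + (long long) a * (long long) b;

   becomes a single WIDEN_MULT_PLUS_EXPR <a, b, acc> when the target
   provides a widening multiply-accumulate pattern; with MINUS_EXPR the
   result is a WIDEN_MULT_MINUS_EXPR instead.  */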
2358 static bool
2359 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2360 enum tree_code code)
2362 gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2363 gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2364 tree type, type1, type2, optype;
2365 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2366 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2367 optab this_optab;
2368 enum tree_code wmult_code;
2369 enum insn_code handler;
2370 enum machine_mode to_mode, from_mode, actual_mode;
2371 location_t loc = gimple_location (stmt);
2372 int actual_precision;
2373 bool from_unsigned1, from_unsigned2;
2375 lhs = gimple_assign_lhs (stmt);
2376 type = TREE_TYPE (lhs);
2377 if (TREE_CODE (type) != INTEGER_TYPE
2378 && TREE_CODE (type) != FIXED_POINT_TYPE)
2379 return false;
2381 if (code == MINUS_EXPR)
2382 wmult_code = WIDEN_MULT_MINUS_EXPR;
2383 else
2384 wmult_code = WIDEN_MULT_PLUS_EXPR;
2386 rhs1 = gimple_assign_rhs1 (stmt);
2387 rhs2 = gimple_assign_rhs2 (stmt);
2389 if (TREE_CODE (rhs1) == SSA_NAME)
2391 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2392 if (is_gimple_assign (rhs1_stmt))
2393 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2396 if (TREE_CODE (rhs2) == SSA_NAME)
2398 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2399 if (is_gimple_assign (rhs2_stmt))
2400 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2403 /* Allow for one conversion statement between the multiply
2404 and addition/subtraction statement. If there are multiple
2405 conversions then we assume they would invalidate this
2406 transformation. If that's not the case then they should have
2407 been folded before now. */
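/* Schematically (a sketch; the names are hypothetical), the accepted
   shape with an intermediate conversion is

     prod = a * b;
     tmp  = (wider_type) prod;
     sum  = tmp + acc;

   i.e. exactly one conversion statement between the multiply and the
   addition or subtraction; further below it is checked that this
   conversion does not change the value being accumulated.  */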
2408 if (CONVERT_EXPR_CODE_P (rhs1_code))
2410 conv1_stmt = rhs1_stmt;
2411 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2412 if (TREE_CODE (rhs1) == SSA_NAME)
2414 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2415 if (is_gimple_assign (rhs1_stmt))
2416 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2418 else
2419 return false;
2421 if (CONVERT_EXPR_CODE_P (rhs2_code))
2423 conv2_stmt = rhs2_stmt;
2424 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2425 if (TREE_CODE (rhs2) == SSA_NAME)
2427 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2428 if (is_gimple_assign (rhs2_stmt))
2429 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2431 else
2432 return false;
2435 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2436 is_widening_mult_p, but we still need the returned rhs values.
2438 It might also appear that it would be sufficient to use the existing
2439 operands of the widening multiply, but that would limit the choice of
2440 multiply-and-accumulate instructions.
2442 If the widened-multiplication result has more than one use, it is
2443 probably wiser not to do the conversion. */
2444 if (code == PLUS_EXPR
2445 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2447 if (!has_single_use (rhs1)
2448 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2449 &type2, &mult_rhs2))
2450 return false;
2451 add_rhs = rhs2;
2452 conv_stmt = conv1_stmt;
2454 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2456 if (!has_single_use (rhs2)
2457 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2458 &type2, &mult_rhs2))
2459 return false;
2460 add_rhs = rhs1;
2461 conv_stmt = conv2_stmt;
2463 else
2464 return false;
2466 to_mode = TYPE_MODE (type);
2467 from_mode = TYPE_MODE (type1);
2468 from_unsigned1 = TYPE_UNSIGNED (type1);
2469 from_unsigned2 = TYPE_UNSIGNED (type2);
2470 optype = type1;
2472 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
2473 if (from_unsigned1 != from_unsigned2)
2475 if (!INTEGRAL_TYPE_P (type))
2476 return false;
2477 /* We can use a signed multiply with unsigned types as long as
2478 there is a wider mode to use, or it is the smaller of the two
2479 types that is unsigned. Note that type1 >= type2, always. */
2480 if ((from_unsigned1
2481 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2482 || (from_unsigned2
2483 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2485 from_mode = GET_MODE_WIDER_MODE (from_mode);
2486 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2487 return false;
2490 from_unsigned1 = from_unsigned2 = false;
2491 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2492 false);
2495 /* If there was a conversion between the multiply and addition
2496 then we need to make sure it fits a multiply-and-accumulate.
2497 It should be a single mode change which does not change the
2498 value. */
2499 if (conv_stmt)
2501 /* We use the original, unmodified data types for this. */
2502 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2503 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2504 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2505 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2507 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2509 /* Conversion is a truncate. */
2510 if (TYPE_PRECISION (to_type) < data_size)
2511 return false;
2513 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2515 /* Conversion is an extend. Check it's the right sort. */
2516 if (TYPE_UNSIGNED (from_type) != is_unsigned
2517 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2518 return false;
2520 /* else convert is a no-op for our purposes. */
2523 /* Verify that the machine can perform a widening multiply
2524 accumulate in this mode/signedness combination, otherwise
2525 this transformation is likely to pessimize code. */
2526 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2527 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2528 from_mode, 0, &actual_mode);
2530 if (handler == CODE_FOR_nothing)
2531 return false;
2533 /* Ensure that the inputs to the handler are in the correct precision
2534 for the opcode. This will be the full mode size. */
2535 actual_precision = GET_MODE_PRECISION (actual_mode);
2536 if (actual_precision != TYPE_PRECISION (type1)
2537 || from_unsigned1 != TYPE_UNSIGNED (type1))
2538 mult_rhs1 = build_and_insert_cast (gsi, loc,
2539 build_nonstandard_integer_type
2540 (actual_precision, from_unsigned1),
2541 mult_rhs1);
2542 if (actual_precision != TYPE_PRECISION (type2)
2543 || from_unsigned2 != TYPE_UNSIGNED (type2))
2544 mult_rhs2 = build_and_insert_cast (gsi, loc,
2545 build_nonstandard_integer_type
2546 (actual_precision, from_unsigned2),
2547 mult_rhs2);
2549 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2550 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2552 /* Handle constants. */
2553 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2554 mult_rhs1 = fold_convert (type1, mult_rhs1);
2555 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2556 mult_rhs2 = fold_convert (type2, mult_rhs2);
2558 gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2559 add_rhs);
2560 update_stmt (gsi_stmt (*gsi));
2561 widen_mul_stats.maccs_inserted++;
2562 return true;
2565 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2566 with uses in additions and subtractions to form fused multiply-add
2567 operations. Returns true if successful and MUL_STMT should be removed. */
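/* A source-level sketch, using the C library fma function to stand in
   for the FMA_EXPR that is actually built:

     double t = a * b;
     x = t + c;
     y = d - t;

   can become

     x = fma (a, b, c);
     y = fma (-a, b, d);

   once every use of T has been rewritten, so that the multiplication
   itself becomes dead.  */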
2569 static bool
2570 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2572 tree mul_result = gimple_get_lhs (mul_stmt);
2573 tree type = TREE_TYPE (mul_result);
2574 gimple use_stmt, neguse_stmt, fma_stmt;
2575 use_operand_p use_p;
2576 imm_use_iterator imm_iter;
2578 if (FLOAT_TYPE_P (type)
2579 && flag_fp_contract_mode == FP_CONTRACT_OFF)
2580 return false;
2582 /* We don't want to do bitfield reduction ops. */
2583 if (INTEGRAL_TYPE_P (type)
2584 && (TYPE_PRECISION (type)
2585 != GET_MODE_PRECISION (TYPE_MODE (type))))
2586 return false;
2588 /* If the target doesn't support it, don't generate it. We assume that
2589 if fma isn't available then fms, fnma or fnms are not either. */
2590 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2591 return false;
2593 /* If the multiplication has zero uses, it is kept around probably because
2594 of -fnon-call-exceptions. Don't optimize it away in that case,
2595 it is DCE's job. */
2596 if (has_zero_uses (mul_result))
2597 return false;
2599 /* Make sure that the multiplication statement becomes dead after
2600 the transformation, i.e. that all uses are transformed to FMAs.
2601 This means we assume that an FMA operation has the same cost
2602 as an addition. */
2603 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2605 enum tree_code use_code;
2606 tree result = mul_result;
2607 bool negate_p = false;
2609 use_stmt = USE_STMT (use_p);
2611 if (is_gimple_debug (use_stmt))
2612 continue;
2614 /* For now restrict this operation to single basic blocks. In theory
2615 we would want to support sinking the multiplication in
2616 m = a*b;
2617 if ()
2618 ma = m + c;
2619 else
2620 d = m;
2621 to form a fma in the then block and sink the multiplication to the
2622 else block. */
2623 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2624 return false;
2626 if (!is_gimple_assign (use_stmt))
2627 return false;
2629 use_code = gimple_assign_rhs_code (use_stmt);
2631 /* A negate on the multiplication leads to FNMA. */
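/* E.g. (sketch)  t = a * b;  n = -t;  x = n + c;  computes c - a * b,
   which maps to a fused negate-multiply-add.  */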
2632 if (use_code == NEGATE_EXPR)
2634 ssa_op_iter iter;
2635 use_operand_p usep;
2637 result = gimple_assign_lhs (use_stmt);
2639 /* Make sure the negate statement becomes dead with this
2640 single transformation. */
2641 if (!single_imm_use (gimple_assign_lhs (use_stmt),
2642 &use_p, &neguse_stmt))
2643 return false;
2645 /* Make sure the multiplication isn't also used on that stmt. */
2646 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2647 if (USE_FROM_PTR (usep) == mul_result)
2648 return false;
2650 /* Re-validate. */
2651 use_stmt = neguse_stmt;
2652 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2653 return false;
2654 if (!is_gimple_assign (use_stmt))
2655 return false;
2657 use_code = gimple_assign_rhs_code (use_stmt);
2658 negate_p = true;
2661 switch (use_code)
2663 case MINUS_EXPR:
2664 if (gimple_assign_rhs2 (use_stmt) == result)
2665 negate_p = !negate_p;
2666 break;
2667 case PLUS_EXPR:
2668 break;
2669 default:
2670 /* FMA can only be formed from PLUS and MINUS. */
2671 return false;
2674 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
2675 by a MULT_EXPR that we'll visit later, we might be able to
2676 get a more profitable match with fnma.
2677 OTOH, if we don't, a negate / fma pair likely has lower latency
2678 than a mult / subtract pair. */
2679 if (use_code == MINUS_EXPR && !negate_p
2680 && gimple_assign_rhs1 (use_stmt) == result
2681 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
2682 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
2684 tree rhs2 = gimple_assign_rhs2 (use_stmt);
2686 if (TREE_CODE (rhs2) == SSA_NAME)
2688 gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
2689 if (has_single_use (rhs2)
2690 && is_gimple_assign (stmt2)
2691 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
2692 return false;
2696 /* We can't handle a * b + a * b. */
2697 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2698 return false;
2700 /* While it is possible to validate whether or not the exact form
2701 that we've recognized is available in the backend, the assumption
2702 is that the transformation is never a loss. For instance, suppose
2703 the target only has the plain FMA pattern available. Consider
2704 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2705 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
2706 still have 3 operations, but in the FMA form the two NEGs are
2707 independent and could be run in parallel. */
2710 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2712 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2713 enum tree_code use_code;
2714 tree addop, mulop1 = op1, result = mul_result;
2715 bool negate_p = false;
2717 if (is_gimple_debug (use_stmt))
2718 continue;
2720 use_code = gimple_assign_rhs_code (use_stmt);
2721 if (use_code == NEGATE_EXPR)
2723 result = gimple_assign_lhs (use_stmt);
2724 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2725 gsi_remove (&gsi, true);
2726 release_defs (use_stmt);
2728 use_stmt = neguse_stmt;
2729 gsi = gsi_for_stmt (use_stmt);
2730 use_code = gimple_assign_rhs_code (use_stmt);
2731 negate_p = true;
2734 if (gimple_assign_rhs1 (use_stmt) == result)
2736 addop = gimple_assign_rhs2 (use_stmt);
2737 /* a * b - c -> a * b + (-c) */
2738 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2739 addop = force_gimple_operand_gsi (&gsi,
2740 build1 (NEGATE_EXPR,
2741 type, addop),
2742 true, NULL_TREE, true,
2743 GSI_SAME_STMT);
2745 else
2747 addop = gimple_assign_rhs1 (use_stmt);
2748 /* a - b * c -> (-b) * c + a */
2749 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2750 negate_p = !negate_p;
2753 if (negate_p)
2754 mulop1 = force_gimple_operand_gsi (&gsi,
2755 build1 (NEGATE_EXPR,
2756 type, mulop1),
2757 true, NULL_TREE, true,
2758 GSI_SAME_STMT);
2760 fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
2761 gimple_assign_lhs (use_stmt),
2762 mulop1, op2,
2763 addop);
2764 gsi_replace (&gsi, fma_stmt, true);
2765 widen_mul_stats.fmas_inserted++;
2768 return true;
2771 /* Find integer multiplications where the operands are extended from
2772 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2773 where appropriate. */
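/* Besides plain MULT_EXPRs, the walk below also hands PLUS_EXPR and
   MINUS_EXPR statements to convert_plusminus_to_widen, and treats calls
   to pow/powf/powl with a literal exponent of 2.0 as a multiplication of
   the argument by itself for FMA formation.  A sketch:

     y = __builtin_pow (x, 2.0) + c;   ->   y = fma (x, x, c)

   when the target has an fma pattern and -ffp-contract allows it.  */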
2775 static unsigned int
2776 execute_optimize_widening_mul (void)
2778 basic_block bb;
2779 bool cfg_changed = false;
2781 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2783 FOR_EACH_BB (bb)
2785 gimple_stmt_iterator gsi;
2787 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2789 gimple stmt = gsi_stmt (gsi);
2790 enum tree_code code;
2792 if (is_gimple_assign (stmt))
2794 code = gimple_assign_rhs_code (stmt);
2795 switch (code)
2797 case MULT_EXPR:
2798 if (!convert_mult_to_widen (stmt, &gsi)
2799 && convert_mult_to_fma (stmt,
2800 gimple_assign_rhs1 (stmt),
2801 gimple_assign_rhs2 (stmt)))
2803 gsi_remove (&gsi, true);
2804 release_defs (stmt);
2805 continue;
2807 break;
2809 case PLUS_EXPR:
2810 case MINUS_EXPR:
2811 convert_plusminus_to_widen (&gsi, stmt, code);
2812 break;
2814 default:;
2817 else if (is_gimple_call (stmt)
2818 && gimple_call_lhs (stmt))
2820 tree fndecl = gimple_call_fndecl (stmt);
2821 if (fndecl
2822 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2824 switch (DECL_FUNCTION_CODE (fndecl))
2826 case BUILT_IN_POWF:
2827 case BUILT_IN_POW:
2828 case BUILT_IN_POWL:
2829 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2830 && REAL_VALUES_EQUAL
2831 (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2832 dconst2)
2833 && convert_mult_to_fma (stmt,
2834 gimple_call_arg (stmt, 0),
2835 gimple_call_arg (stmt, 0)))
2837 unlink_stmt_vdef (stmt);
2838 if (gsi_remove (&gsi, true)
2839 && gimple_purge_dead_eh_edges (bb))
2840 cfg_changed = true;
2841 release_defs (stmt);
2842 continue;
2844 break;
2846 default:;
2850 gsi_next (&gsi);
2854 statistics_counter_event (cfun, "widening multiplications inserted",
2855 widen_mul_stats.widen_mults_inserted);
2856 statistics_counter_event (cfun, "widening maccs inserted",
2857 widen_mul_stats.maccs_inserted);
2858 statistics_counter_event (cfun, "fused multiply-adds inserted",
2859 widen_mul_stats.fmas_inserted);
2861 return cfg_changed ? TODO_cleanup_cfg : 0;
2864 static bool
2865 gate_optimize_widening_mul (void)
2867 return flag_expensive_optimizations && optimize;
2870 namespace {
2872 const pass_data pass_data_optimize_widening_mul =
2874 GIMPLE_PASS, /* type */
2875 "widening_mul", /* name */
2876 OPTGROUP_NONE, /* optinfo_flags */
2877 true, /* has_gate */
2878 true, /* has_execute */
2879 TV_NONE, /* tv_id */
2880 PROP_ssa, /* properties_required */
2881 0, /* properties_provided */
2882 0, /* properties_destroyed */
2883 0, /* todo_flags_start */
2884 ( TODO_verify_ssa | TODO_verify_stmts
2885 | TODO_update_ssa ), /* todo_flags_finish */
2888 class pass_optimize_widening_mul : public gimple_opt_pass
2890 public:
2891 pass_optimize_widening_mul (gcc::context *ctxt)
2892 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
2895 /* opt_pass methods: */
2896 bool gate () { return gate_optimize_widening_mul (); }
2897 unsigned int execute () { return execute_optimize_widening_mul (); }
2899 }; // class pass_optimize_widening_mul
2901 } // anon namespace
2903 gimple_opt_pass *
2904 make_pass_optimize_widening_mul (gcc::context *ctxt)
2906 return new pass_optimize_widening_mul (ctxt);