gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 3, or (at your option) any
  10 later version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  22    operations.  These are common in sequences such as this one:
  23
  24         modulus = sqrt(x*x + y*y + z*z);
  25         x = x / modulus;
  26         y = y / modulus;
  27         z = z / modulus;
  28
  29    that can be optimized to
  30
  31         modulus = sqrt(x*x + y*y + z*z);
  32         rmodulus = 1.0 / modulus;
  33         x = x * rmodulus;
  34         y = y * rmodulus;
  35         z = z * rmodulus;
  36
  37    We do this for loop invariant divisors, and with this pass whenever
  38    we notice that a division has the same divisor multiple times.
  39
  40    Of course, like in PRE, we don't insert a division if a dominator
  41    already has one.  However, this cannot be done as an extension of
  42    PRE for several reasons.
  43
  44    First of all, with some experiments it was found out that the
  45    transformation is not always useful if there are only two divisions
  46    hy the same divisor.  This is probably because modern processors
  47    can pipeline the divisions; on older, in-order processors it should
  48    still be effective to optimize two divisions by the same number.
  49    We make this a param, and it shall be called N in the remainder of
  50    this comment.
  51
  52    Second, if trapping math is active, we have less freedom on where
  53    to insert divisions: we can only do so in basic blocks that already
  54    contain one.  (If divisions don't trap, instead, we can insert
  55    divisions elsewhere, which will be in blocks that are common dominators
  56    of those that have the division).
  57
  58    We really don't want to compute the reciprocal unless a division will
  59    be found.  To do this, we won't insert the division in a basic block
  60    that has less than N divisions *post-dominating* it.
  61
  62    The algorithm constructs a subset of the dominator tree, holding the
  63    blocks containing the divisions and the common dominators to them,
  64    and walk it twice.  The first walk is in post-order, and it annotates
  65    each block with the number of divisions that post-dominate it: this
  66    gives information on where divisions can be inserted profitably.
  67    The second walk is in pre-order, and it inserts divisions as explained
  68    above, and replaces divisions by multiplications.
  69
  70    In the best case, the cost of the pass is O(n_statements).  In the
  71    worst-case, the cost is due to creating the dominator tree subset,
  72    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  73    for n_statements / n_basic_blocks statements.  So, the amortized cost
  74    of creating the dominator tree subset is O(n_basic_blocks) and the
  75    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  76
  77    More practically, the cost will be small because there are few
  78    divisions, and they tend to be in the same basic block, so insert_bb
  79    is called very few times.
  80
  81    If we did this using domwalk.c, an efficient implementation would have
  82    to work on all the variables in a single pass, because we could not
  83    work on just a subset of the dominator tree, as we do now, and the
  84    cost would also be something like O(n_statements * n_basic_blocks).
  85    The data structures would be more complex in order to work on all the
  86    variables in a single pass.  */
  87
  88 #include "config.h"
  89 #include "system.h"
  90 #include "coretypes.h"
  91 #include "tm.h"
  92 #include "flags.h"
  93 #include "tree.h"
  94 #include "tree-flow.h"
  95 #include "tree-pass.h"
  96 #include "alloc-pool.h"
  97 #include "basic-block.h"
  98 #include "target.h"
  99 #include "gimple-pretty-print.h"
 100
 101 /* FIXME: RTL headers have to be included here for optabs.  */
 102 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 103 #include "expr.h"               /* Because optabs.h wants sepops.  */
 104 #include "optabs.h"
 105
 106 /* This structure represents one basic block that either computes a
 107    division, or is a common dominator for basic block that compute a
 108    division.  */
 109 struct occurrence {
 110   /* The basic block represented by this structure.  */
 111   basic_block bb;
 112
 113   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 114      inserted in BB.  */
 115   tree recip_def;
 116
 117   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 118      was inserted in BB.  */
 119   gimple recip_def_stmt;
 120
 121   /* Pointer to a list of "struct occurrence"s for blocks dominated
 122      by BB.  */
 123   struct occurrence *children;
 124
 125   /* Pointer to the next "struct occurrence"s in the list of blocks
 126      sharing a common dominator.  */
 127   struct occurrence *next;
 128
 129   /* The number of divisions that are in BB before compute_merit.  The
 130      number of divisions that are in BB or post-dominate it after
 131      compute_merit.  */
 132   int num_divisions;
 133
 134   /* True if the basic block has a division, false if it is a common
 135      dominator for basic blocks that do.  If it is false and trapping
 136      math is active, BB is not a candidate for inserting a reciprocal.  */
 137   bool bb_has_division;
 138 };
 139
 140 static struct
 141 {
 142   /* Number of 1.0/X ops inserted.  */
 143   int rdivs_inserted;
 144
 145   /* Number of 1.0/FUNC ops inserted.  */
 146   int rfuncs_inserted;
 147 } reciprocal_stats;
 148
 149 static struct
 150 {
 151   /* Number of cexpi calls inserted.  */
 152   int inserted;
 153 } sincos_stats;
 154
 155 static struct
 156 {
 157   /* Number of hand-written 32-bit bswaps found.  */
 158   int found_32bit;
 159
 160   /* Number of hand-written 64-bit bswaps found.  */
 161   int found_64bit;
 162 } bswap_stats;
 163
 164 static struct
 165 {
 166   /* Number of widening multiplication ops inserted.  */
 167   int widen_mults_inserted;
 168
 169   /* Number of integer multiply-and-accumulate ops inserted.  */
 170   int maccs_inserted;
 171
 172   /* Number of fp fused multiply-add ops inserted.  */
 173   int fmas_inserted;
 174 } widen_mul_stats;
 175
 176 /* The instance of "struct occurrence" representing the highest
 177    interesting block in the dominator tree.  */
 178 static struct occurrence *occ_head;
 179
 180 /* Allocation pool for getting instances of "struct occurrence".  */
 181 static alloc_pool occ_pool;
 182
 183
 184
 185 /* Allocate and return a new struct occurrence for basic block BB, and
 186    whose children list is headed by CHILDREN.  */
 187 static struct occurrence *
 188 occ_new (basic_block bb, struct occurrence *children)
 189 {
 190   struct occurrence *occ;
 191
 192   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 193   memset (occ, 0, sizeof (struct occurrence));
 194
 195   occ->bb = bb;
 196   occ->children = children;
 197   return occ;
 198 }
 199
 200
 201 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 202    list of "struct occurrence"s, one per basic block, having IDOM as
 203    their common dominator.
 204
 205    We try to insert NEW_OCC as deep as possible in the tree, and we also
 206    insert any other block that is a common dominator for BB and one
 207    block already in the tree.  */
 208
 209 static void
 210 insert_bb (struct occurrence *new_occ, basic_block idom,
 211            struct occurrence **p_head)
 212 {
 213   struct occurrence *occ, **p_occ;
 214
 215   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 216     {
 217       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 218       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 219       if (dom == bb)
 220         {
 221           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 222              from its list.  */
 223           *p_occ = occ->next;
 224           occ->next = new_occ->children;
 225           new_occ->children = occ;
 226
 227           /* Try the next block (it may as well be dominated by BB).  */
 228         }
 229
 230       else if (dom == occ_bb)
 231         {
 232           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 233           insert_bb (new_occ, dom, &occ->children);
 234           return;
 235         }
 236
 237       else if (dom != idom)
 238         {
 239           gcc_assert (!dom->aux);
 240
 241           /* There is a dominator between IDOM and BB, add it and make
 242              two children out of NEW_OCC and OCC.  First, remove OCC from
 243              its list.  */
 244           *p_occ = occ->next;
 245           new_occ->next = occ;
 246           occ->next = NULL;
 247
 248           /* None of the previous blocks has DOM as a dominator: if we tail
 249              recursed, we would reexamine them uselessly. Just switch BB with
 250              DOM, and go on looking for blocks dominated by DOM.  */
 251           new_occ = occ_new (dom, new_occ);
 252         }
 253
 254       else
 255         {
 256           /* Nothing special, go on with the next element.  */
 257           p_occ = &occ->next;
 258         }
 259     }
 260
 261   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 262   new_occ->next = *p_head;
 263   *p_head = new_occ;
 264 }
 265
 266 /* Register that we found a division in BB.  */
 267
 268 static inline void
 269 register_division_in (basic_block bb)
 270 {
 271   struct occurrence *occ;
 272
 273   occ = (struct occurrence *) bb->aux;
 274   if (!occ)
 275     {
 276       occ = occ_new (bb, NULL);
 277       insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
 278     }
 279
 280   occ->bb_has_division = true;
 281   occ->num_divisions++;
 282 }
 283
 284
 285 /* Compute the number of divisions that postdominate each block in OCC and
 286    its children.  */
 287
 288 static void
 289 compute_merit (struct occurrence *occ)
 290 {
 291   struct occurrence *occ_child;
 292   basic_block dom = occ->bb;
 293
 294   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 295     {
 296       basic_block bb;
 297       if (occ_child->children)
 298         compute_merit (occ_child);
 299
 300       if (flag_exceptions)
 301         bb = single_noncomplex_succ (dom);
 302       else
 303         bb = dom;
 304
 305       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 306         occ->num_divisions += occ_child->num_divisions;
 307     }
 308 }
 309
 310
 311 /* Return whether USE_STMT is a floating-point division by DEF.  */
 312 static inline bool
 313 is_division_by (gimple use_stmt, tree def)
 314 {
 315   return is_gimple_assign (use_stmt)
 316          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 317          && gimple_assign_rhs2 (use_stmt) == def
 318          /* Do not recognize x / x as valid division, as we are getting
 319             confused later by replacing all immediate uses x in such
 320             a stmt.  */
 321          && gimple_assign_rhs1 (use_stmt) != def;
 322 }
 323
 324 /* Walk the subset of the dominator tree rooted at OCC, setting the
 325    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 326    the given basic block.  The field may be left NULL, of course,
 327    if it is not possible or profitable to do the optimization.
 328
 329    DEF_BSI is an iterator pointing at the statement defining DEF.
 330    If RECIP_DEF is set, a dominator already has a computation that can
 331    be used.  */
 332
 333 static void
 334 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 335                     tree def, tree recip_def, int threshold)
 336 {
 337   tree type;
 338   gimple new_stmt;
 339   gimple_stmt_iterator gsi;
 340   struct occurrence *occ_child;
 341
 342   if (!recip_def
 343       && (occ->bb_has_division || !flag_trapping_math)
 344       && occ->num_divisions >= threshold)
 345     {
 346       /* Make a variable with the replacement and substitute it.  */
 347       type = TREE_TYPE (def);
 348       recip_def = make_rename_temp (type, "reciptmp");
 349       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 350                                                build_one_cst (type), def);
 351
 352       if (occ->bb_has_division)
 353         {
 354           /* Case 1: insert before an existing division.  */
 355           gsi = gsi_after_labels (occ->bb);
 356           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 357             gsi_next (&gsi);
 358
 359           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 360         }
 361       else if (def_gsi && occ->bb == def_gsi->bb)
 362         {
 363           /* Case 2: insert right after the definition.  Note that this will
 364              never happen if the definition statement can throw, because in
 365              that case the sole successor of the statement's basic block will
 366              dominate all the uses as well.  */
 367           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 368         }
 369       else
 370         {
 371           /* Case 3: insert in a basic block not containing defs/uses.  */
 372           gsi = gsi_after_labels (occ->bb);
 373           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 374         }
 375
 376       reciprocal_stats.rdivs_inserted++;
 377
 378       occ->recip_def_stmt = new_stmt;
 379     }
 380
 381   occ->recip_def = recip_def;
 382   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 383     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 384 }
 385
 386
 387 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 388    possible.  */
 389
 390 static inline void
 391 replace_reciprocal (use_operand_p use_p)
 392 {
 393   gimple use_stmt = USE_STMT (use_p);
 394   basic_block bb = gimple_bb (use_stmt);
 395   struct occurrence *occ = (struct occurrence *) bb->aux;
 396
 397   if (optimize_bb_for_speed_p (bb)
 398       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 399     {
 400       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 401       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 402       SET_USE (use_p, occ->recip_def);
 403       fold_stmt_inplace (&gsi);
 404       update_stmt (use_stmt);
 405     }
 406 }
 407
 408
 409 /* Free OCC and return one more "struct occurrence" to be freed.  */
 410
 411 static struct occurrence *
 412 free_bb (struct occurrence *occ)
 413 {
 414   struct occurrence *child, *next;
 415
 416   /* First get the two pointers hanging off OCC.  */
 417   next = occ->next;
 418   child = occ->children;
 419   occ->bb->aux = NULL;
 420   pool_free (occ_pool, occ);
 421
 422   /* Now ensure that we don't recurse unless it is necessary.  */
 423   if (!child)
 424     return next;
 425   else
 426     {
 427       while (next)
 428         next = free_bb (next);
 429
 430       return child;
 431     }
 432 }
 433
 434
 435 /* Look for floating-point divisions among DEF's uses, and try to
 436    replace them by multiplications with the reciprocal.  Add
 437    as many statements computing the reciprocal as needed.
 438
 439    DEF must be a GIMPLE register of a floating-point type.  */
 440
 441 static void
 442 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 443 {
 444   use_operand_p use_p;
 445   imm_use_iterator use_iter;
 446   struct occurrence *occ;
 447   int count = 0, threshold;
 448
 449   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 450
 451   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 452     {
 453       gimple use_stmt = USE_STMT (use_p);
 454       if (is_division_by (use_stmt, def))
 455         {
 456           register_division_in (gimple_bb (use_stmt));
 457           count++;
 458         }
 459     }
 460
 461   /* Do the expensive part only if we can hope to optimize something.  */
 462   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 463   if (count >= threshold)
 464     {
 465       gimple use_stmt;
 466       for (occ = occ_head; occ; occ = occ->next)
 467         {
 468           compute_merit (occ);
 469           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 470         }
 471
 472       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 473         {
 474           if (is_division_by (use_stmt, def))
 475             {
 476               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 477                 replace_reciprocal (use_p);
 478             }
 479         }
 480     }
 481
 482   for (occ = occ_head; occ; )
 483     occ = free_bb (occ);
 484
 485   occ_head = NULL;
 486 }
 487
 488 static bool
 489 gate_cse_reciprocals (void)
 490 {
 491   return optimize && flag_reciprocal_math;
 492 }
 493
 494 /* Go through all the floating-point SSA_NAMEs, and call
 495    execute_cse_reciprocals_1 on each of them.  */
 496 static unsigned int
 497 execute_cse_reciprocals (void)
 498 {
 499   basic_block bb;
 500   tree arg;
 501
 502   occ_pool = create_alloc_pool ("dominators for recip",
 503                                 sizeof (struct occurrence),
 504                                 n_basic_blocks / 3 + 1);
 505
 506   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 507   calculate_dominance_info (CDI_DOMINATORS);
 508   calculate_dominance_info (CDI_POST_DOMINATORS);
 509
 510 #ifdef ENABLE_CHECKING
 511   FOR_EACH_BB (bb)
 512     gcc_assert (!bb->aux);
 513 #endif
 514
 515   for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
 516     if (gimple_default_def (cfun, arg)
 517         && FLOAT_TYPE_P (TREE_TYPE (arg))
 518         && is_gimple_reg (arg))
 519       execute_cse_reciprocals_1 (NULL, gimple_default_def (cfun, arg));
 520
 521   FOR_EACH_BB (bb)
 522     {
 523       gimple_stmt_iterator gsi;
 524       gimple phi;
 525       tree def;
 526
 527       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 528         {
 529           phi = gsi_stmt (gsi);
 530           def = PHI_RESULT (phi);
 531           if (FLOAT_TYPE_P (TREE_TYPE (def))
 532               && is_gimple_reg (def))
 533             execute_cse_reciprocals_1 (NULL, def);
 534         }
 535
 536       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 537         {
 538           gimple stmt = gsi_stmt (gsi);
 539
 540           if (gimple_has_lhs (stmt)
 541               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 542               && FLOAT_TYPE_P (TREE_TYPE (def))
 543               && TREE_CODE (def) == SSA_NAME)
 544             execute_cse_reciprocals_1 (&gsi, def);
 545         }
 546
 547       if (optimize_bb_for_size_p (bb))
 548         continue;
 549
 550       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 551       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 552         {
 553           gimple stmt = gsi_stmt (gsi);
 554           tree fndecl;
 555
 556           if (is_gimple_assign (stmt)
 557               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 558             {
 559               tree arg1 = gimple_assign_rhs2 (stmt);
 560               gimple stmt1;
 561
 562               if (TREE_CODE (arg1) != SSA_NAME)
 563                 continue;
 564
 565               stmt1 = SSA_NAME_DEF_STMT (arg1);
 566
 567               if (is_gimple_call (stmt1)
 568                   && gimple_call_lhs (stmt1)
 569                   && (fndecl = gimple_call_fndecl (stmt1))
 570                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 571                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 572                 {
 573                   enum built_in_function code;
 574                   bool md_code, fail;
 575                   imm_use_iterator ui;
 576                   use_operand_p use_p;
 577
 578                   code = DECL_FUNCTION_CODE (fndecl);
 579                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 580
 581                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 582                   if (!fndecl)
 583                     continue;
 584
 585                   /* Check that all uses of the SSA name are divisions,
 586                      otherwise replacing the defining statement will do
 587                      the wrong thing.  */
 588                   fail = false;
 589                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 590                     {
 591                       gimple stmt2 = USE_STMT (use_p);
 592                       if (is_gimple_debug (stmt2))
 593                         continue;
 594                       if (!is_gimple_assign (stmt2)
 595                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 596                           || gimple_assign_rhs1 (stmt2) == arg1
 597                           || gimple_assign_rhs2 (stmt2) != arg1)
 598                         {
 599                           fail = true;
 600                           break;
 601                         }
 602                     }
 603                   if (fail)
 604                     continue;
 605
 606                   gimple_replace_lhs (stmt1, arg1);
 607                   gimple_call_set_fndecl (stmt1, fndecl);
 608                   update_stmt (stmt1);
 609                   reciprocal_stats.rfuncs_inserted++;
 610
 611                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 612                     {
 613                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 614                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 615                       fold_stmt_inplace (&gsi);
 616                       update_stmt (stmt);
 617                     }
 618                 }
 619             }
 620         }
 621     }
 622
 623   statistics_counter_event (cfun, "reciprocal divs inserted",
 624                             reciprocal_stats.rdivs_inserted);
 625   statistics_counter_event (cfun, "reciprocal functions inserted",
 626                             reciprocal_stats.rfuncs_inserted);
 627
 628   free_dominance_info (CDI_DOMINATORS);
 629   free_dominance_info (CDI_POST_DOMINATORS);
 630   free_alloc_pool (occ_pool);
 631   return 0;
 632 }
 633
 634 struct gimple_opt_pass pass_cse_reciprocals =
 635 {
 636  {
 637   GIMPLE_PASS,
 638   "recip",                              /* name */
 639   gate_cse_reciprocals,                 /* gate */
 640   execute_cse_reciprocals,              /* execute */
 641   NULL,                                 /* sub */
 642   NULL,                                 /* next */
 643   0,                                    /* static_pass_number */
 644   TV_NONE,                              /* tv_id */
 645   PROP_ssa,                             /* properties_required */
 646   0,                                    /* properties_provided */
 647   0,                                    /* properties_destroyed */
 648   0,                                    /* todo_flags_start */
 649   TODO_update_ssa | TODO_verify_ssa
 650     | TODO_verify_stmts                /* todo_flags_finish */
 651  }
 652 };
 653
 654 /* Records an occurrence at statement USE_STMT in the vector of trees
 655    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 656    is not yet initialized.  Returns true if the occurrence was pushed on
 657    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 658    statements in the vector.  */
 659
 660 static bool
 661 maybe_record_sincos (VEC(gimple, heap) **stmts,
 662                      basic_block *top_bb, gimple use_stmt)
 663 {
 664   basic_block use_bb = gimple_bb (use_stmt);
 665   if (*top_bb
 666       && (*top_bb == use_bb
 667           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 668     VEC_safe_push (gimple, heap, *stmts, use_stmt);
 669   else if (!*top_bb
 670            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 671     {
 672       VEC_safe_push (gimple, heap, *stmts, use_stmt);
 673       *top_bb = use_bb;
 674     }
 675   else
 676     return false;
 677
 678   return true;
 679 }
 680
 681 /* Look for sin, cos and cexpi calls with the same argument NAME and
 682    create a single call to cexpi CSEing the result in this case.
 683    We first walk over all immediate uses of the argument collecting
 684    statements that we can CSE in a vector and in a second pass replace
 685    the statement rhs with a REALPART or IMAGPART expression on the
 686    result of the cexpi call we insert before the use statement that
 687    dominates all other candidates.  */
 688
 689 static bool
 690 execute_cse_sincos_1 (tree name)
 691 {
 692   gimple_stmt_iterator gsi;
 693   imm_use_iterator use_iter;
 694   tree fndecl, res, type;
 695   gimple def_stmt, use_stmt, stmt;
 696   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 697   VEC(gimple, heap) *stmts = NULL;
 698   basic_block top_bb = NULL;
 699   int i;
 700   bool cfg_changed = false;
 701
 702   type = TREE_TYPE (name);
 703   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 704     {
 705       if (gimple_code (use_stmt) != GIMPLE_CALL
 706           || !gimple_call_lhs (use_stmt)
 707           || !(fndecl = gimple_call_fndecl (use_stmt))
 708           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 709         continue;
 710
 711       switch (DECL_FUNCTION_CODE (fndecl))
 712         {
 713         CASE_FLT_FN (BUILT_IN_COS):
 714           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 715           break;
 716
 717         CASE_FLT_FN (BUILT_IN_SIN):
 718           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 719           break;
 720
 721         CASE_FLT_FN (BUILT_IN_CEXPI):
 722           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 723           break;
 724
 725         default:;
 726         }
 727     }
 728
 729   if (seen_cos + seen_sin + seen_cexpi <= 1)
 730     {
 731       VEC_free(gimple, heap, stmts);
 732       return false;
 733     }
 734
 735   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 736      the name def statement.  */
 737   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 738   if (!fndecl)
 739     return false;
 740   res = create_tmp_reg (TREE_TYPE (TREE_TYPE (fndecl)), "sincostmp");
 741   stmt = gimple_build_call (fndecl, 1, name);
 742   res = make_ssa_name (res, stmt);
 743   gimple_call_set_lhs (stmt, res);
 744
 745   def_stmt = SSA_NAME_DEF_STMT (name);
 746   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 747       && gimple_code (def_stmt) != GIMPLE_PHI
 748       && gimple_bb (def_stmt) == top_bb)
 749     {
 750       gsi = gsi_for_stmt (def_stmt);
 751       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 752     }
 753   else
 754     {
 755       gsi = gsi_after_labels (top_bb);
 756       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 757     }
 758   update_stmt (stmt);
 759   sincos_stats.inserted++;
 760
 761   /* And adjust the recorded old call sites.  */
 762   for (i = 0; VEC_iterate(gimple, stmts, i, use_stmt); ++i)
 763     {
 764       tree rhs = NULL;
 765       fndecl = gimple_call_fndecl (use_stmt);
 766
 767       switch (DECL_FUNCTION_CODE (fndecl))
 768         {
 769         CASE_FLT_FN (BUILT_IN_COS):
 770           rhs = fold_build1 (REALPART_EXPR, type, res);
 771           break;
 772
 773         CASE_FLT_FN (BUILT_IN_SIN):
 774           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 775           break;
 776
 777         CASE_FLT_FN (BUILT_IN_CEXPI):
 778           rhs = res;
 779           break;
 780
 781         default:;
 782           gcc_unreachable ();
 783         }
 784
 785         /* Replace call with a copy.  */
 786         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 787
 788         gsi = gsi_for_stmt (use_stmt);
 789         gsi_replace (&gsi, stmt, true);
 790         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 791           cfg_changed = true;
 792     }
 793
 794   VEC_free(gimple, heap, stmts);
 795
 796   return cfg_changed;
 797 }
 798
 799 /* To evaluate powi(x,n), the floating point value x raised to the
 800    constant integer exponent n, we use a hybrid algorithm that
 801    combines the "window method" with look-up tables.  For an
 802    introduction to exponentiation algorithms and "addition chains",
 803    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 804    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 805    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 806    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 807
 808 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 809    multiplications to inline before calling the system library's pow
 810    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 811    so this default never requires calling pow, powf or powl.  */
 812
 813 #ifndef POWI_MAX_MULTS
 814 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 815 #endif
 816
 817 /* The size of the "optimal power tree" lookup table.  All
 818    exponents less than this value are simply looked up in the
 819    powi_table below.  This threshold is also used to size the
 820    cache of pseudo registers that hold intermediate results.  */
 821 #define POWI_TABLE_SIZE 256
 822
 823 /* The size, in bits of the window, used in the "window method"
 824    exponentiation algorithm.  This is equivalent to a radix of
 825    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 826 #define POWI_WINDOW_SIZE 3
 827
 828 /* The following table is an efficient representation of an
 829    "optimal power tree".  For each value, i, the corresponding
 830    value, j, in the table states than an optimal evaluation
 831    sequence for calculating pow(x,i) can be found by evaluating
 832    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 833    100 integers is given in Knuth's "Seminumerical algorithms".  */
 834
 835 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 836   {
 837       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 838       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 839       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 840      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 841      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 842      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 843      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 844      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 845      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 846      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 847      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 848      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 849      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 850      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 851      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 852      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 853      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 854      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 855      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 856      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 857      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 858      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 859      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 860      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 861      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 862     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 863     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 864     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 865     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 866     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 867     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 868     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 869   };
 870
 871
 872 /* Return the number of multiplications required to calculate
 873    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 874    subroutine of powi_cost.  CACHE is an array indicating
 875    which exponents have already been calculated.  */
 876
 877 static int
 878 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 879 {
 880   /* If we've already calculated this exponent, then this evaluation
 881      doesn't require any additional multiplications.  */
 882   if (cache[n])
 883     return 0;
 884
 885   cache[n] = true;
 886   return powi_lookup_cost (n - powi_table[n], cache)
 887          + powi_lookup_cost (powi_table[n], cache) + 1;
 888 }
 889
 890 /* Return the number of multiplications required to calculate
 891    powi(x,n) for an arbitrary x, given the exponent N.  This
 892    function needs to be kept in sync with powi_as_mults below.  */
 893
 894 static int
 895 powi_cost (HOST_WIDE_INT n)
 896 {
 897   bool cache[POWI_TABLE_SIZE];
 898   unsigned HOST_WIDE_INT digit;
 899   unsigned HOST_WIDE_INT val;
 900   int result;
 901
 902   if (n == 0)
 903     return 0;
 904
 905   /* Ignore the reciprocal when calculating the cost.  */
 906   val = (n < 0) ? -n : n;
 907
 908   /* Initialize the exponent cache.  */
 909   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 910   cache[1] = true;
 911
 912   result = 0;
 913
 914   while (val >= POWI_TABLE_SIZE)
 915     {
 916       if (val & 1)
 917         {
 918           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 919           result += powi_lookup_cost (digit, cache)
 920                     + POWI_WINDOW_SIZE + 1;
 921           val >>= POWI_WINDOW_SIZE;
 922         }
 923       else
 924         {
 925           val >>= 1;
 926           result++;
 927         }
 928     }
 929
 930   return result + powi_lookup_cost (val, cache);
 931 }
 932
 933 /* Recursive subroutine of powi_as_mults.  This function takes the
 934    array, CACHE, of already calculated exponents and an exponent N and
 935    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 936
 937 static tree
 938 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 939                  HOST_WIDE_INT n, tree *cache, tree target)
 940 {
 941   tree op0, op1, ssa_target;
 942   unsigned HOST_WIDE_INT digit;
 943   gimple mult_stmt;
 944
 945   if (n < POWI_TABLE_SIZE && cache[n])
 946     return cache[n];
 947
 948   ssa_target = make_ssa_name (target, NULL);
 949
 950   if (n < POWI_TABLE_SIZE)
 951     {
 952       cache[n] = ssa_target;
 953       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache, target);
 954       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache, target);
 955     }
 956   else if (n & 1)
 957     {
 958       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 959       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache, target);
 960       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache, target);
 961     }
 962   else
 963     {
 964       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache, target);
 965       op1 = op0;
 966     }
 967
 968   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
 969   gimple_set_location (mult_stmt, loc);
 970   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
 971
 972   return ssa_target;
 973 }
 974
 975 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
 976    This function needs to be kept in sync with powi_cost above.  */
 977
 978 static tree
 979 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
 980                tree arg0, HOST_WIDE_INT n)
 981 {
 982   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0), target;
 983   gimple div_stmt;
 984
 985   if (n == 0)
 986     return build_real (type, dconst1);
 987
 988   memset (cache, 0,  sizeof (cache));
 989   cache[1] = arg0;
 990
 991   target = create_tmp_reg (type, "powmult");
 992   add_referenced_var (target);
 993
 994   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache, target);
 995
 996   if (n >= 0)
 997     return result;
 998
 999   /* If the original exponent was negative, reciprocate the result.  */
1000   target = make_ssa_name (target, NULL);
1001   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1002                                            build_real (type, dconst1),
1003                                            result);
1004   gimple_set_location (div_stmt, loc);
1005   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1006
1007   return target;
1008 }
1009
1010 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1011    location info LOC.  If the arguments are appropriate, create an
1012    equivalent sequence of statements prior to GSI using an optimal
1013    number of multiplications, and return an expession holding the
1014    result.  */
1015
1016 static tree
1017 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1018                             tree arg0, HOST_WIDE_INT n)
1019 {
1020   /* Avoid largest negative number.  */
1021   if (n != -n
1022       && ((n >= -1 && n <= 2)
1023           || (optimize_function_for_speed_p (cfun)
1024               && powi_cost (n) <= POWI_MAX_MULTS)))
1025     return powi_as_mults (gsi, loc, arg0, n);
1026
1027   return NULL_TREE;
1028 }
1029
1030 /* Build a gimple call statement that calls FN with argument ARG.
1031    Set the lhs of the call statement to a fresh SSA name for
1032    variable VAR.  If VAR is NULL, first allocate it.  Insert the
1033    statement prior to GSI's current position, and return the fresh
1034    SSA name.  */
1035
1036 static tree
1037 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1038                        tree *var, tree fn, tree arg)
1039 {
1040   gimple call_stmt;
1041   tree ssa_target;
1042
1043   if (!*var)
1044     {
1045       *var = create_tmp_reg (TREE_TYPE (arg), "powroot");
1046       add_referenced_var (*var);
1047     }
1048
1049   call_stmt = gimple_build_call (fn, 1, arg);
1050   ssa_target = make_ssa_name (*var, NULL);
1051   gimple_set_lhs (call_stmt, ssa_target);
1052   gimple_set_location (call_stmt, loc);
1053   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1054
1055   return ssa_target;
1056 }
1057
1058 /* Build a gimple binary operation with the given CODE and arguments
1059    ARG0, ARG1, assigning the result to a new SSA name for variable
1060    TARGET.  Insert the statement prior to GSI's current position, and
1061    return the fresh SSA name.*/
1062
1063 static tree
1064 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1065                         tree target, enum tree_code code, tree arg0, tree arg1)
1066 {
1067   tree result = make_ssa_name (target, NULL);
1068   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1069   gimple_set_location (stmt, loc);
1070   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1071   return result;
1072 }
1073
1074 /* Build a gimple reference operation with the given CODE and argument
1075    ARG, assigning the result to a new SSA name for variable TARGET.
1076    Insert the statement prior to GSI's current position, and return
1077    the fresh SSA name.  */
1078
1079 static inline tree
1080 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1081                       tree target, enum tree_code code, tree arg0)
1082 {
1083   tree result = make_ssa_name (target, NULL);
1084   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1085   gimple_set_location (stmt, loc);
1086   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1087   return result;
1088 }
1089
1090 /* Build a gimple assignment to cast VAL to TARGET.  Insert the statement
1091    prior to GSI's current position, and return the fresh SSA name.  */
1092
1093 static tree
1094 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1095                        tree target, tree val)
1096 {
1097   return build_and_insert_binop (gsi, loc, target, CONVERT_EXPR, val, NULL);
1098 }
1099
1100 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1101    with location info LOC.  If possible, create an equivalent and
1102    less expensive sequence of statements prior to GSI, and return an
1103    expession holding the result.  */
1104
1105 static tree
1106 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1107                            tree arg0, tree arg1)
1108 {
1109   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1110   REAL_VALUE_TYPE c2, dconst3;
1111   HOST_WIDE_INT n;
1112   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1113   tree target = NULL_TREE;
1114   enum machine_mode mode;
1115   bool hw_sqrt_exists;
1116
1117   /* If the exponent isn't a constant, there's nothing of interest
1118      to be done.  */
1119   if (TREE_CODE (arg1) != REAL_CST)
1120     return NULL_TREE;
1121
1122   /* If the exponent is equivalent to an integer, expand to an optimal
1123      multiplication sequence when profitable.  */
1124   c = TREE_REAL_CST (arg1);
1125   n = real_to_integer (&c);
1126   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1127
1128   if (real_identical (&c, &cint)
1129       && ((n >= -1 && n <= 2)
1130           || (flag_unsafe_math_optimizations
1131               && optimize_insn_for_speed_p ()
1132               && powi_cost (n) <= POWI_MAX_MULTS)))
1133     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1134
1135   /* Attempt various optimizations using sqrt and cbrt.  */
1136   type = TREE_TYPE (arg0);
1137   mode = TYPE_MODE (type);
1138   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1139
1140   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1141      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1142      sqrt(-0) = -0.  */
1143   if (sqrtfn
1144       && REAL_VALUES_EQUAL (c, dconsthalf)
1145       && !HONOR_SIGNED_ZEROS (mode))
1146     return build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1147
1148   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1149      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1150      so do this optimization even if -Os.  Don't do this optimization
1151      if we don't have a hardware sqrt insn.  */
1152   dconst1_4 = dconst1;
1153   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1154   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1155
1156   if (flag_unsafe_math_optimizations
1157       && sqrtfn
1158       && REAL_VALUES_EQUAL (c, dconst1_4)
1159       && hw_sqrt_exists)
1160     {
1161       /* sqrt(x)  */
1162       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1163
1164       /* sqrt(sqrt(x))  */
1165       return build_and_insert_call (gsi, loc, &target, sqrtfn, sqrt_arg0);
1166     }
1167
1168   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1169      optimizing for space.  Don't do this optimization if we don't have
1170      a hardware sqrt insn.  */
1171   real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1172   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1173
1174   if (flag_unsafe_math_optimizations
1175       && sqrtfn
1176       && optimize_function_for_speed_p (cfun)
1177       && REAL_VALUES_EQUAL (c, dconst3_4)
1178       && hw_sqrt_exists)
1179     {
1180       /* sqrt(x)  */
1181       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1182
1183       /* sqrt(sqrt(x))  */
1184       sqrt_sqrt = build_and_insert_call (gsi, loc, &target, sqrtfn, sqrt_arg0);
1185
1186       /* sqrt(x) * sqrt(sqrt(x))  */
1187       return build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1188                                      sqrt_arg0, sqrt_sqrt);
1189     }
1190
1191   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1192      optimizations since 1./3. is not exactly representable.  If x
1193      is negative and finite, the correct value of pow(x,1./3.) is
1194      a NaN with the "invalid" exception raised, because the value
1195      of 1./3. actually has an even denominator.  The correct value
1196      of cbrt(x) is a negative real value.  */
1197   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1198   dconst1_3 = real_value_truncate (mode, dconst_third ());
1199
1200   if (flag_unsafe_math_optimizations
1201       && cbrtfn
1202       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1203       && REAL_VALUES_EQUAL (c, dconst1_3))
1204     return build_and_insert_call (gsi, loc, &target, cbrtfn, arg0);
1205
1206   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1207      if we don't have a hardware sqrt insn.  */
1208   dconst1_6 = dconst1_3;
1209   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1210
1211   if (flag_unsafe_math_optimizations
1212       && sqrtfn
1213       && cbrtfn
1214       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1215       && optimize_function_for_speed_p (cfun)
1216       && hw_sqrt_exists
1217       && REAL_VALUES_EQUAL (c, dconst1_6))
1218     {
1219       /* sqrt(x)  */
1220       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1221
1222       /* cbrt(sqrt(x))  */
1223       return build_and_insert_call (gsi, loc, &target, cbrtfn, sqrt_arg0);
1224     }
1225
1226   /* Optimize pow(x,c), where n = 2c for some nonzero integer n, into
1227
1228        sqrt(x) * powi(x, n/2),                n > 0;
1229        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1230
1231      Do not calculate the powi factor when n/2 = 0.  */
1232   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1233   n = real_to_integer (&c2);
1234   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1235
1236   if (flag_unsafe_math_optimizations
1237       && sqrtfn
1238       && real_identical (&c2, &cint))
1239     {
1240       tree powi_x_ndiv2 = NULL_TREE;
1241
1242       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1243          possible or profitable, give up.  Skip the degenerate case when
1244          n is 1 or -1, where the result is always 1.  */
1245       if (absu_hwi (n) != 1)
1246         {
1247           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1248                                                      abs_hwi (n / 2));
1249           if (!powi_x_ndiv2)
1250             return NULL_TREE;
1251         }
1252
1253       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1254          result of the optimal multiply sequence just calculated.  */
1255       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1256
1257       if (absu_hwi (n) == 1)
1258         result = sqrt_arg0;
1259       else
1260         result = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1261                                          sqrt_arg0, powi_x_ndiv2);
1262
1263       /* If n is negative, reciprocate the result.  */
1264       if (n < 0)
1265         result = build_and_insert_binop (gsi, loc, target, RDIV_EXPR,
1266                                          build_real (type, dconst1), result);
1267       return result;
1268     }
1269
1270   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1271
1272      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1273      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1274
1275      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1276      different from pow(x, 1./3.) due to rounding and behavior with
1277      negative x, we need to constrain this transformation to unsafe
1278      math and positive x or finite math.  */
1279   real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1280   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1281   real_round (&c2, mode, &c2);
1282   n = real_to_integer (&c2);
1283   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1284   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1285   real_convert (&c2, mode, &c2);
1286
1287   if (flag_unsafe_math_optimizations
1288       && cbrtfn
1289       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1290       && real_identical (&c2, &c)
1291       && optimize_function_for_speed_p (cfun)
1292       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1293     {
1294       tree powi_x_ndiv3 = NULL_TREE;
1295
1296       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1297          possible or profitable, give up.  Skip the degenerate case when
1298          abs(n) < 3, where the result is always 1.  */
1299       if (absu_hwi (n) >= 3)
1300         {
1301           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1302                                                      abs_hwi (n / 3));
1303           if (!powi_x_ndiv3)
1304             return NULL_TREE;
1305         }
1306
1307       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1308          as that creates an unnecessary variable.  Instead, just produce
1309          either cbrt(x) or cbrt(x) * cbrt(x).  */
1310       cbrt_x = build_and_insert_call (gsi, loc, &target, cbrtfn, arg0);
1311
1312       if (absu_hwi (n) % 3 == 1)
1313         powi_cbrt_x = cbrt_x;
1314       else
1315         powi_cbrt_x = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1316                                               cbrt_x, cbrt_x);
1317
1318       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1319       if (absu_hwi (n) < 3)
1320         result = powi_cbrt_x;
1321       else
1322         result = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1323                                          powi_x_ndiv3, powi_cbrt_x);
1324
1325       /* If n is negative, reciprocate the result.  */
1326       if (n < 0)
1327         result = build_and_insert_binop (gsi, loc, target, RDIV_EXPR,
1328                                          build_real (type, dconst1), result);
1329
1330       return result;
1331     }
1332
1333   /* No optimizations succeeded.  */
1334   return NULL_TREE;
1335 }
1336
1337 /* ARG is the argument to a cabs builtin call in GSI with location info
1338    LOC.  Create a sequence of statements prior to GSI that calculates
1339    sqrt(R*R + I*I), where R and I are the real and imaginary components
1340    of ARG, respectively.  Return an expression holding the result.  */
1341
1342 static tree
1343 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1344 {
1345   tree target, real_part, imag_part, addend1, addend2, sum, result;
1346   tree type = TREE_TYPE (TREE_TYPE (arg));
1347   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1348   enum machine_mode mode = TYPE_MODE (type);
1349
1350   if (!flag_unsafe_math_optimizations
1351       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1352       || !sqrtfn
1353       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1354     return NULL_TREE;
1355
1356   target = create_tmp_reg (type, "cabs");
1357   add_referenced_var (target);
1358
1359   real_part = build_and_insert_ref (gsi, loc, type, target,
1360                                     REALPART_EXPR, arg);
1361   addend1 = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1362                                     real_part, real_part);
1363   imag_part = build_and_insert_ref (gsi, loc, type, target,
1364                                     IMAGPART_EXPR, arg);
1365   addend2 = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1366                                     imag_part, imag_part);
1367   sum = build_and_insert_binop (gsi, loc, target, PLUS_EXPR, addend1, addend2);
1368   result = build_and_insert_call (gsi, loc, &target, sqrtfn, sum);
1369
1370   return result;
1371 }
1372
1373 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1374    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1375    an optimal number of multiplies, when n is a constant.  */
1376
1377 static unsigned int
1378 execute_cse_sincos (void)
1379 {
1380   basic_block bb;
1381   bool cfg_changed = false;
1382
1383   calculate_dominance_info (CDI_DOMINATORS);
1384   memset (&sincos_stats, 0, sizeof (sincos_stats));
1385
1386   FOR_EACH_BB (bb)
1387     {
1388       gimple_stmt_iterator gsi;
1389
1390       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1391         {
1392           gimple stmt = gsi_stmt (gsi);
1393           tree fndecl;
1394
1395           if (is_gimple_call (stmt)
1396               && gimple_call_lhs (stmt)
1397               && (fndecl = gimple_call_fndecl (stmt))
1398               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1399             {
1400               tree arg, arg0, arg1, result;
1401               HOST_WIDE_INT n;
1402               location_t loc;
1403
1404               switch (DECL_FUNCTION_CODE (fndecl))
1405                 {
1406                 CASE_FLT_FN (BUILT_IN_COS):
1407                 CASE_FLT_FN (BUILT_IN_SIN):
1408                 CASE_FLT_FN (BUILT_IN_CEXPI):
1409                   /* Make sure we have either sincos or cexp.  */
1410                   if (!TARGET_HAS_SINCOS && !TARGET_C99_FUNCTIONS)
1411                     break;
1412
1413                   arg = gimple_call_arg (stmt, 0);
1414                   if (TREE_CODE (arg) == SSA_NAME)
1415                     cfg_changed |= execute_cse_sincos_1 (arg);
1416                   break;
1417
1418                 CASE_FLT_FN (BUILT_IN_POW):
1419                   arg0 = gimple_call_arg (stmt, 0);
1420                   arg1 = gimple_call_arg (stmt, 1);
1421
1422                   loc = gimple_location (stmt);
1423                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1424
1425                   if (result)
1426                     {
1427                       tree lhs = gimple_get_lhs (stmt);
1428                       gimple new_stmt = gimple_build_assign (lhs, result);
1429                       gimple_set_location (new_stmt, loc);
1430                       unlink_stmt_vdef (stmt);
1431                       gsi_replace (&gsi, new_stmt, true);
1432                       if (gimple_vdef (stmt))
1433                         release_ssa_name (gimple_vdef (stmt));
1434                     }
1435                   break;
1436
1437                 CASE_FLT_FN (BUILT_IN_POWI):
1438                   arg0 = gimple_call_arg (stmt, 0);
1439                   arg1 = gimple_call_arg (stmt, 1);
1440                   if (!host_integerp (arg1, 0))
1441                     break;
1442
1443                   n = TREE_INT_CST_LOW (arg1);
1444                   loc = gimple_location (stmt);
1445                   result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1446
1447                   if (result)
1448                     {
1449                       tree lhs = gimple_get_lhs (stmt);
1450                       gimple new_stmt = gimple_build_assign (lhs, result);
1451                       gimple_set_location (new_stmt, loc);
1452                       unlink_stmt_vdef (stmt);
1453                       gsi_replace (&gsi, new_stmt, true);
1454                       if (gimple_vdef (stmt))
1455                         release_ssa_name (gimple_vdef (stmt));
1456                     }
1457                   break;
1458
1459                 CASE_FLT_FN (BUILT_IN_CABS):
1460                   arg0 = gimple_call_arg (stmt, 0);
1461                   loc = gimple_location (stmt);
1462                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1463
1464                   if (result)
1465                     {
1466                       tree lhs = gimple_get_lhs (stmt);
1467                       gimple new_stmt = gimple_build_assign (lhs, result);
1468                       gimple_set_location (new_stmt, loc);
1469                       unlink_stmt_vdef (stmt);
1470                       gsi_replace (&gsi, new_stmt, true);
1471                       if (gimple_vdef (stmt))
1472                         release_ssa_name (gimple_vdef (stmt));
1473                     }
1474                   break;
1475
1476                 default:;
1477                 }
1478             }
1479         }
1480     }
1481
1482   statistics_counter_event (cfun, "sincos statements inserted",
1483                             sincos_stats.inserted);
1484
1485   free_dominance_info (CDI_DOMINATORS);
1486   return cfg_changed ? TODO_cleanup_cfg : 0;
1487 }
1488
1489 static bool
1490 gate_cse_sincos (void)
1491 {
1492   /* We no longer require either sincos or cexp, since powi expansion
1493      piggybacks on this pass.  */
1494   return optimize;
1495 }
1496
1497 struct gimple_opt_pass pass_cse_sincos =
1498 {
1499  {
1500   GIMPLE_PASS,
1501   "sincos",                             /* name */
1502   gate_cse_sincos,                      /* gate */
1503   execute_cse_sincos,                   /* execute */
1504   NULL,                                 /* sub */
1505   NULL,                                 /* next */
1506   0,                                    /* static_pass_number */
1507   TV_NONE,                              /* tv_id */
1508   PROP_ssa,                             /* properties_required */
1509   0,                                    /* properties_provided */
1510   0,                                    /* properties_destroyed */
1511   0,                                    /* todo_flags_start */
1512   TODO_update_ssa | TODO_verify_ssa
1513     | TODO_verify_stmts                 /* todo_flags_finish */
1514  }
1515 };
1516
1517 /* A symbolic number is used to detect byte permutation and selection
1518    patterns.  Therefore the field N contains an artificial number
1519    consisting of byte size markers:
1520
1521    0    - byte has the value 0
1522    1..size - byte contains the content of the byte
1523    number indexed with that value minus one  */
1524
1525 struct symbolic_number {
1526   unsigned HOST_WIDEST_INT n;
1527   int size;
1528 };
1529
1530 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1531    number N.  Return false if the requested operation is not permitted
1532    on a symbolic number.  */
1533
1534 static inline bool
1535 do_shift_rotate (enum tree_code code,
1536                  struct symbolic_number *n,
1537                  int count)
1538 {
1539   if (count % 8 != 0)
1540     return false;
1541
1542   /* Zero out the extra bits of N in order to avoid them being shifted
1543      into the significant bits.  */
1544   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1545     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1546
1547   switch (code)
1548     {
1549     case LSHIFT_EXPR:
1550       n->n <<= count;
1551       break;
1552     case RSHIFT_EXPR:
1553       n->n >>= count;
1554       break;
1555     case LROTATE_EXPR:
1556       n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1557       break;
1558     case RROTATE_EXPR:
1559       n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1560       break;
1561     default:
1562       return false;
1563     }
1564   /* Zero unused bits for size.  */
1565   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1566     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1567   return true;
1568 }
1569
1570 /* Perform sanity checking for the symbolic number N and the gimple
1571    statement STMT.  */
1572
1573 static inline bool
1574 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1575 {
1576   tree lhs_type;
1577
1578   lhs_type = gimple_expr_type (stmt);
1579
1580   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1581     return false;
1582
1583   if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1584     return false;
1585
1586   return true;
1587 }
1588
1589 /* find_bswap_1 invokes itself recursively with N and tries to perform
1590    the operation given by the rhs of STMT on the result.  If the
1591    operation could successfully be executed the function returns the
1592    tree expression of the source operand and NULL otherwise.  */
1593
1594 static tree
1595 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1596 {
1597   enum tree_code code;
1598   tree rhs1, rhs2 = NULL;
1599   gimple rhs1_stmt, rhs2_stmt;
1600   tree source_expr1;
1601   enum gimple_rhs_class rhs_class;
1602
1603   if (!limit || !is_gimple_assign (stmt))
1604     return NULL_TREE;
1605
1606   rhs1 = gimple_assign_rhs1 (stmt);
1607
1608   if (TREE_CODE (rhs1) != SSA_NAME)
1609     return NULL_TREE;
1610
1611   code = gimple_assign_rhs_code (stmt);
1612   rhs_class = gimple_assign_rhs_class (stmt);
1613   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1614
1615   if (rhs_class == GIMPLE_BINARY_RHS)
1616     rhs2 = gimple_assign_rhs2 (stmt);
1617
1618   /* Handle unary rhs and binary rhs with integer constants as second
1619      operand.  */
1620
1621   if (rhs_class == GIMPLE_UNARY_RHS
1622       || (rhs_class == GIMPLE_BINARY_RHS
1623           && TREE_CODE (rhs2) == INTEGER_CST))
1624     {
1625       if (code != BIT_AND_EXPR
1626           && code != LSHIFT_EXPR
1627           && code != RSHIFT_EXPR
1628           && code != LROTATE_EXPR
1629           && code != RROTATE_EXPR
1630           && code != NOP_EXPR
1631           && code != CONVERT_EXPR)
1632         return NULL_TREE;
1633
1634       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1635
1636       /* If find_bswap_1 returned NULL STMT is a leaf node and we have
1637          to initialize the symbolic number.  */
1638       if (!source_expr1)
1639         {
1640           /* Set up the symbolic number N by setting each byte to a
1641              value between 1 and the byte size of rhs1.  The highest
1642              order byte is set to n->size and the lowest order
1643              byte to 1.  */
1644           n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1645           if (n->size % BITS_PER_UNIT != 0)
1646             return NULL_TREE;
1647           n->size /= BITS_PER_UNIT;
1648           n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1649                   (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1650
1651           if (n->size < (int)sizeof (HOST_WIDEST_INT))
1652             n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1653                      (n->size * BITS_PER_UNIT)) - 1;
1654
1655           source_expr1 = rhs1;
1656         }
1657
1658       switch (code)
1659         {
1660         case BIT_AND_EXPR:
1661           {
1662             int i;
1663             unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1664             unsigned HOST_WIDEST_INT tmp = val;
1665
1666             /* Only constants masking full bytes are allowed.  */
1667             for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1668               if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1669                 return NULL_TREE;
1670
1671             n->n &= val;
1672           }
1673           break;
1674         case LSHIFT_EXPR:
1675         case RSHIFT_EXPR:
1676         case LROTATE_EXPR:
1677         case RROTATE_EXPR:
1678           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1679             return NULL_TREE;
1680           break;
1681         CASE_CONVERT:
1682           {
1683             int type_size;
1684
1685             type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1686             if (type_size % BITS_PER_UNIT != 0)
1687               return NULL_TREE;
1688
1689             if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1690               {
1691                 /* If STMT casts to a smaller type mask out the bits not
1692                    belonging to the target type.  */
1693                 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1694               }
1695             n->size = type_size / BITS_PER_UNIT;
1696           }
1697           break;
1698         default:
1699           return NULL_TREE;
1700         };
1701       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1702     }
1703
1704   /* Handle binary rhs.  */
1705
1706   if (rhs_class == GIMPLE_BINARY_RHS)
1707     {
1708       struct symbolic_number n1, n2;
1709       tree source_expr2;
1710
1711       if (code != BIT_IOR_EXPR)
1712         return NULL_TREE;
1713
1714       if (TREE_CODE (rhs2) != SSA_NAME)
1715         return NULL_TREE;
1716
1717       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1718
1719       switch (code)
1720         {
1721         case BIT_IOR_EXPR:
1722           source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1723
1724           if (!source_expr1)
1725             return NULL_TREE;
1726
1727           source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1728
1729           if (source_expr1 != source_expr2
1730               || n1.size != n2.size)
1731             return NULL_TREE;
1732
1733           n->size = n1.size;
1734           n->n = n1.n | n2.n;
1735
1736           if (!verify_symbolic_number_p (n, stmt))
1737             return NULL_TREE;
1738
1739           break;
1740         default:
1741           return NULL_TREE;
1742         }
1743       return source_expr1;
1744     }
1745   return NULL_TREE;
1746 }
1747
1748 /* Check if STMT completes a bswap implementation consisting of ORs,
1749    SHIFTs and ANDs.  Return the source tree expression on which the
1750    byte swap is performed and NULL if no bswap was found.  */
1751
1752 static tree
1753 find_bswap (gimple stmt)
1754 {
1755 /* The number which the find_bswap result should match in order to
1756    have a full byte swap.  The number is shifted to the left according
1757    to the size of the symbolic number before using it.  */
1758   unsigned HOST_WIDEST_INT cmp =
1759     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1760     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1761
1762   struct symbolic_number n;
1763   tree source_expr;
1764   int limit;
1765
1766   /* The last parameter determines the depth search limit.  It usually
1767      correlates directly to the number of bytes to be touched.  We
1768      increase that number by three  here in order to also
1769      cover signed -> unsigned converions of the src operand as can be seen
1770      in libgcc, and for initial shift/and operation of the src operand.  */
1771   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1772   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1773   source_expr =  find_bswap_1 (stmt, &n, limit);
1774
1775   if (!source_expr)
1776     return NULL_TREE;
1777
1778   /* Zero out the extra bits of N and CMP.  */
1779   if (n.size < (int)sizeof (HOST_WIDEST_INT))
1780     {
1781       unsigned HOST_WIDEST_INT mask =
1782         ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1783
1784       n.n &= mask;
1785       cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1786     }
1787
1788   /* A complete byte swap should make the symbolic number to start
1789      with the largest digit in the highest order byte.  */
1790   if (cmp != n.n)
1791     return NULL_TREE;
1792
1793   return source_expr;
1794 }
1795
1796 /* Find manual byte swap implementations and turn them into a bswap
1797    builtin invokation.  */
1798
1799 static unsigned int
1800 execute_optimize_bswap (void)
1801 {
1802   basic_block bb;
1803   bool bswap32_p, bswap64_p;
1804   bool changed = false;
1805   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1806
1807   if (BITS_PER_UNIT != 8)
1808     return 0;
1809
1810   if (sizeof (HOST_WIDEST_INT) < 8)
1811     return 0;
1812
1813   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1814                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1815   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1816                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1817                    || (bswap32_p && word_mode == SImode)));
1818
1819   if (!bswap32_p && !bswap64_p)
1820     return 0;
1821
1822   /* Determine the argument type of the builtins.  The code later on
1823      assumes that the return and argument type are the same.  */
1824   if (bswap32_p)
1825     {
1826       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1827       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1828     }
1829
1830   if (bswap64_p)
1831     {
1832       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1833       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1834     }
1835
1836   memset (&bswap_stats, 0, sizeof (bswap_stats));
1837
1838   FOR_EACH_BB (bb)
1839     {
1840       gimple_stmt_iterator gsi;
1841
1842       /* We do a reverse scan for bswap patterns to make sure we get the
1843          widest match. As bswap pattern matching doesn't handle
1844          previously inserted smaller bswap replacements as sub-
1845          patterns, the wider variant wouldn't be detected.  */
1846       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1847         {
1848           gimple stmt = gsi_stmt (gsi);
1849           tree bswap_src, bswap_type;
1850           tree bswap_tmp;
1851           tree fndecl = NULL_TREE;
1852           int type_size;
1853           gimple call;
1854
1855           if (!is_gimple_assign (stmt)
1856               || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1857             continue;
1858
1859           type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1860
1861           switch (type_size)
1862             {
1863             case 32:
1864               if (bswap32_p)
1865                 {
1866                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1867                   bswap_type = bswap32_type;
1868                 }
1869               break;
1870             case 64:
1871               if (bswap64_p)
1872                 {
1873                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1874                   bswap_type = bswap64_type;
1875                 }
1876               break;
1877             default:
1878               continue;
1879             }
1880
1881           if (!fndecl)
1882             continue;
1883
1884           bswap_src = find_bswap (stmt);
1885
1886           if (!bswap_src)
1887             continue;
1888
1889           changed = true;
1890           if (type_size == 32)
1891             bswap_stats.found_32bit++;
1892           else
1893             bswap_stats.found_64bit++;
1894
1895           bswap_tmp = bswap_src;
1896
1897           /* Convert the src expression if necessary.  */
1898           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1899             {
1900               gimple convert_stmt;
1901
1902               bswap_tmp = create_tmp_var (bswap_type, "bswapsrc");
1903               add_referenced_var (bswap_tmp);
1904               bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1905
1906               convert_stmt = gimple_build_assign_with_ops (
1907                                CONVERT_EXPR, bswap_tmp, bswap_src, NULL);
1908               gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1909             }
1910
1911           call = gimple_build_call (fndecl, 1, bswap_tmp);
1912
1913           bswap_tmp = gimple_assign_lhs (stmt);
1914
1915           /* Convert the result if necessary.  */
1916           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1917             {
1918               gimple convert_stmt;
1919
1920               bswap_tmp = create_tmp_var (bswap_type, "bswapdst");
1921               add_referenced_var (bswap_tmp);
1922               bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1923               convert_stmt = gimple_build_assign_with_ops (
1924                                CONVERT_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
1925               gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1926             }
1927
1928           gimple_call_set_lhs (call, bswap_tmp);
1929
1930           if (dump_file)
1931             {
1932               fprintf (dump_file, "%d bit bswap implementation found at: ",
1933                        (int)type_size);
1934               print_gimple_stmt (dump_file, stmt, 0, 0);
1935             }
1936
1937           gsi_insert_after (&gsi, call, GSI_SAME_STMT);
1938           gsi_remove (&gsi, true);
1939         }
1940     }
1941
1942   statistics_counter_event (cfun, "32-bit bswap implementations found",
1943                             bswap_stats.found_32bit);
1944   statistics_counter_event (cfun, "64-bit bswap implementations found",
1945                             bswap_stats.found_64bit);
1946
1947   return (changed ? TODO_update_ssa | TODO_verify_ssa
1948           | TODO_verify_stmts : 0);
1949 }
1950
1951 static bool
1952 gate_optimize_bswap (void)
1953 {
1954   return flag_expensive_optimizations && optimize;
1955 }
1956
1957 struct gimple_opt_pass pass_optimize_bswap =
1958 {
1959  {
1960   GIMPLE_PASS,
1961   "bswap",                              /* name */
1962   gate_optimize_bswap,                  /* gate */
1963   execute_optimize_bswap,               /* execute */
1964   NULL,                                 /* sub */
1965   NULL,                                 /* next */
1966   0,                                    /* static_pass_number */
1967   TV_NONE,                              /* tv_id */
1968   PROP_ssa,                             /* properties_required */
1969   0,                                    /* properties_provided */
1970   0,                                    /* properties_destroyed */
1971   0,                                    /* todo_flags_start */
1972   0                                     /* todo_flags_finish */
1973  }
1974 };
1975
1976 /* Return true if RHS is a suitable operand for a widening multiplication,
1977    assuming a target type of TYPE.
1978    There are two cases:
1979
1980      - RHS makes some value at least twice as wide.  Store that value
1981        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
1982
1983      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
1984        but leave *TYPE_OUT untouched.  */
1985
1986 static bool
1987 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
1988                         tree *new_rhs_out)
1989 {
1990   gimple stmt;
1991   tree type1, rhs1;
1992   enum tree_code rhs_code;
1993
1994   if (TREE_CODE (rhs) == SSA_NAME)
1995     {
1996       stmt = SSA_NAME_DEF_STMT (rhs);
1997       if (is_gimple_assign (stmt))
1998         {
1999           rhs_code = gimple_assign_rhs_code (stmt);
2000           if (TREE_CODE (type) == INTEGER_TYPE
2001               ? !CONVERT_EXPR_CODE_P (rhs_code)
2002               : rhs_code != FIXED_CONVERT_EXPR)
2003             rhs1 = rhs;
2004           else
2005             {
2006               rhs1 = gimple_assign_rhs1 (stmt);
2007
2008               if (TREE_CODE (rhs1) == INTEGER_CST)
2009                 {
2010                   *new_rhs_out = rhs1;
2011                   *type_out = NULL;
2012                   return true;
2013                 }
2014             }
2015         }
2016       else
2017         rhs1 = rhs;
2018
2019       type1 = TREE_TYPE (rhs1);
2020
2021       if (TREE_CODE (type1) != TREE_CODE (type)
2022           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2023         return false;
2024
2025       *new_rhs_out = rhs1;
2026       *type_out = type1;
2027       return true;
2028     }
2029
2030   if (TREE_CODE (rhs) == INTEGER_CST)
2031     {
2032       *new_rhs_out = rhs;
2033       *type_out = NULL;
2034       return true;
2035     }
2036
2037   return false;
2038 }
2039
2040 /* Return true if STMT performs a widening multiplication, assuming the
2041    output type is TYPE.  If so, store the unwidened types of the operands
2042    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2043    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2044    and *TYPE2_OUT would give the operands of the multiplication.  */
2045
2046 static bool
2047 is_widening_mult_p (gimple stmt,
2048                     tree *type1_out, tree *rhs1_out,
2049                     tree *type2_out, tree *rhs2_out)
2050 {
2051   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2052
2053   if (TREE_CODE (type) != INTEGER_TYPE
2054       && TREE_CODE (type) != FIXED_POINT_TYPE)
2055     return false;
2056
2057   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2058                                rhs1_out))
2059     return false;
2060
2061   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2062                                rhs2_out))
2063     return false;
2064
2065   if (*type1_out == NULL)
2066     {
2067       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2068         return false;
2069       *type1_out = *type2_out;
2070     }
2071
2072   if (*type2_out == NULL)
2073     {
2074       if (!int_fits_type_p (*rhs2_out, *type1_out))
2075         return false;
2076       *type2_out = *type1_out;
2077     }
2078
2079   /* Ensure that the larger of the two operands comes first. */
2080   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2081     {
2082       tree tmp;
2083       tmp = *type1_out;
2084       *type1_out = *type2_out;
2085       *type2_out = tmp;
2086       tmp = *rhs1_out;
2087       *rhs1_out = *rhs2_out;
2088       *rhs2_out = tmp;
2089     }
2090
2091   return true;
2092 }
2093
2094 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2095    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2096    value is true iff we converted the statement.  */
2097
2098 static bool
2099 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2100 {
2101   tree lhs, rhs1, rhs2, type, type1, type2, tmp = NULL;
2102   enum insn_code handler;
2103   enum machine_mode to_mode, from_mode, actual_mode;
2104   optab op;
2105   int actual_precision;
2106   location_t loc = gimple_location (stmt);
2107   bool from_unsigned1, from_unsigned2;
2108
2109   lhs = gimple_assign_lhs (stmt);
2110   type = TREE_TYPE (lhs);
2111   if (TREE_CODE (type) != INTEGER_TYPE)
2112     return false;
2113
2114   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2115     return false;
2116
2117   to_mode = TYPE_MODE (type);
2118   from_mode = TYPE_MODE (type1);
2119   from_unsigned1 = TYPE_UNSIGNED (type1);
2120   from_unsigned2 = TYPE_UNSIGNED (type2);
2121
2122   if (from_unsigned1 && from_unsigned2)
2123     op = umul_widen_optab;
2124   else if (!from_unsigned1 && !from_unsigned2)
2125     op = smul_widen_optab;
2126   else
2127     op = usmul_widen_optab;
2128
2129   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2130                                                   0, &actual_mode);
2131
2132   if (handler == CODE_FOR_nothing)
2133     {
2134       if (op != smul_widen_optab)
2135         {
2136           /* We can use a signed multiply with unsigned types as long as
2137              there is a wider mode to use, or it is the smaller of the two
2138              types that is unsigned.  Note that type1 >= type2, always.  */
2139           if ((TYPE_UNSIGNED (type1)
2140                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2141               || (TYPE_UNSIGNED (type2)
2142                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2143             {
2144               from_mode = GET_MODE_WIDER_MODE (from_mode);
2145               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2146                 return false;
2147             }
2148
2149           op = smul_widen_optab;
2150           handler = find_widening_optab_handler_and_mode (op, to_mode,
2151                                                           from_mode, 0,
2152                                                           &actual_mode);
2153
2154           if (handler == CODE_FOR_nothing)
2155             return false;
2156
2157           from_unsigned1 = from_unsigned2 = false;
2158         }
2159       else
2160         return false;
2161     }
2162
2163   /* Ensure that the inputs to the handler are in the correct precison
2164      for the opcode.  This will be the full mode size.  */
2165   actual_precision = GET_MODE_PRECISION (actual_mode);
2166   if (2 * actual_precision > TYPE_PRECISION (type))
2167     return false;
2168   if (actual_precision != TYPE_PRECISION (type1)
2169       || from_unsigned1 != TYPE_UNSIGNED (type1))
2170     {
2171       tmp = create_tmp_var (build_nonstandard_integer_type
2172                                 (actual_precision, from_unsigned1),
2173                             NULL);
2174       rhs1 = build_and_insert_cast (gsi, loc, tmp, rhs1);
2175     }
2176   if (actual_precision != TYPE_PRECISION (type2)
2177       || from_unsigned2 != TYPE_UNSIGNED (type2))
2178     {
2179       /* Reuse the same type info, if possible.  */
2180       if (!tmp || from_unsigned1 != from_unsigned2)
2181         tmp = create_tmp_var (build_nonstandard_integer_type
2182                                 (actual_precision, from_unsigned2),
2183                               NULL);
2184       rhs2 = build_and_insert_cast (gsi, loc, tmp, rhs2);
2185     }
2186
2187   /* Handle constants.  */
2188   if (TREE_CODE (rhs1) == INTEGER_CST)
2189     rhs1 = fold_convert (type1, rhs1);
2190   if (TREE_CODE (rhs2) == INTEGER_CST)
2191     rhs2 = fold_convert (type2, rhs2);
2192
2193   gimple_assign_set_rhs1 (stmt, rhs1);
2194   gimple_assign_set_rhs2 (stmt, rhs2);
2195   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2196   update_stmt (stmt);
2197   widen_mul_stats.widen_mults_inserted++;
2198   return true;
2199 }
2200
2201 /* Process a single gimple statement STMT, which is found at the
2202    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2203    rhs (given by CODE), and try to convert it into a
2204    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2205    is true iff we converted the statement.  */
2206
2207 static bool
2208 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2209                             enum tree_code code)
2210 {
2211   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2212   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2213   tree type, type1, type2, optype, tmp = NULL;
2214   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2215   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2216   optab this_optab;
2217   enum tree_code wmult_code;
2218   enum insn_code handler;
2219   enum machine_mode to_mode, from_mode, actual_mode;
2220   location_t loc = gimple_location (stmt);
2221   int actual_precision;
2222   bool from_unsigned1, from_unsigned2;
2223
2224   lhs = gimple_assign_lhs (stmt);
2225   type = TREE_TYPE (lhs);
2226   if (TREE_CODE (type) != INTEGER_TYPE
2227       && TREE_CODE (type) != FIXED_POINT_TYPE)
2228     return false;
2229
2230   if (code == MINUS_EXPR)
2231     wmult_code = WIDEN_MULT_MINUS_EXPR;
2232   else
2233     wmult_code = WIDEN_MULT_PLUS_EXPR;
2234
2235   rhs1 = gimple_assign_rhs1 (stmt);
2236   rhs2 = gimple_assign_rhs2 (stmt);
2237
2238   if (TREE_CODE (rhs1) == SSA_NAME)
2239     {
2240       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2241       if (is_gimple_assign (rhs1_stmt))
2242         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2243     }
2244
2245   if (TREE_CODE (rhs2) == SSA_NAME)
2246     {
2247       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2248       if (is_gimple_assign (rhs2_stmt))
2249         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2250     }
2251
2252   /* Allow for one conversion statement between the multiply
2253      and addition/subtraction statement.  If there are more than
2254      one conversions then we assume they would invalidate this
2255      transformation.  If that's not the case then they should have
2256      been folded before now.  */
2257   if (CONVERT_EXPR_CODE_P (rhs1_code))
2258     {
2259       conv1_stmt = rhs1_stmt;
2260       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2261       if (TREE_CODE (rhs1) == SSA_NAME)
2262         {
2263           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2264           if (is_gimple_assign (rhs1_stmt))
2265             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2266         }
2267       else
2268         return false;
2269     }
2270   if (CONVERT_EXPR_CODE_P (rhs2_code))
2271     {
2272       conv2_stmt = rhs2_stmt;
2273       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2274       if (TREE_CODE (rhs2) == SSA_NAME)
2275         {
2276           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2277           if (is_gimple_assign (rhs2_stmt))
2278             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2279         }
2280       else
2281         return false;
2282     }
2283
2284   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2285      is_widening_mult_p, but we still need the rhs returns.
2286
2287      It might also appear that it would be sufficient to use the existing
2288      operands of the widening multiply, but that would limit the choice of
2289      multiply-and-accumulate instructions.  */
2290   if (code == PLUS_EXPR
2291       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2292     {
2293       if (!is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2294                                &type2, &mult_rhs2))
2295         return false;
2296       add_rhs = rhs2;
2297       conv_stmt = conv1_stmt;
2298     }
2299   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2300     {
2301       if (!is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2302                                &type2, &mult_rhs2))
2303         return false;
2304       add_rhs = rhs1;
2305       conv_stmt = conv2_stmt;
2306     }
2307   else
2308     return false;
2309
2310   to_mode = TYPE_MODE (type);
2311   from_mode = TYPE_MODE (type1);
2312   from_unsigned1 = TYPE_UNSIGNED (type1);
2313   from_unsigned2 = TYPE_UNSIGNED (type2);
2314   optype = type1;
2315
2316   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2317   if (from_unsigned1 != from_unsigned2)
2318     {
2319       if (!INTEGRAL_TYPE_P (type))
2320         return false;
2321       /* We can use a signed multiply with unsigned types as long as
2322          there is a wider mode to use, or it is the smaller of the two
2323          types that is unsigned.  Note that type1 >= type2, always.  */
2324       if ((from_unsigned1
2325            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2326           || (from_unsigned2
2327               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2328         {
2329           from_mode = GET_MODE_WIDER_MODE (from_mode);
2330           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2331             return false;
2332         }
2333
2334       from_unsigned1 = from_unsigned2 = false;
2335       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2336                                                false);
2337     }
2338
2339   /* If there was a conversion between the multiply and addition
2340      then we need to make sure it fits a multiply-and-accumulate.
2341      The should be a single mode change which does not change the
2342      value.  */
2343   if (conv_stmt)
2344     {
2345       /* We use the original, unmodified data types for this.  */
2346       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2347       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2348       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2349       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2350
2351       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2352         {
2353           /* Conversion is a truncate.  */
2354           if (TYPE_PRECISION (to_type) < data_size)
2355             return false;
2356         }
2357       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2358         {
2359           /* Conversion is an extend.  Check it's the right sort.  */
2360           if (TYPE_UNSIGNED (from_type) != is_unsigned
2361               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2362             return false;
2363         }
2364       /* else convert is a no-op for our purposes.  */
2365     }
2366
2367   /* Verify that the machine can perform a widening multiply
2368      accumulate in this mode/signedness combination, otherwise
2369      this transformation is likely to pessimize code.  */
2370   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2371   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2372                                                   from_mode, 0, &actual_mode);
2373
2374   if (handler == CODE_FOR_nothing)
2375     return false;
2376
2377   /* Ensure that the inputs to the handler are in the correct precison
2378      for the opcode.  This will be the full mode size.  */
2379   actual_precision = GET_MODE_PRECISION (actual_mode);
2380   if (actual_precision != TYPE_PRECISION (type1)
2381       || from_unsigned1 != TYPE_UNSIGNED (type1))
2382     {
2383       tmp = create_tmp_var (build_nonstandard_integer_type
2384                                 (actual_precision, from_unsigned1),
2385                             NULL);
2386       mult_rhs1 = build_and_insert_cast (gsi, loc, tmp, mult_rhs1);
2387     }
2388   if (actual_precision != TYPE_PRECISION (type2)
2389       || from_unsigned2 != TYPE_UNSIGNED (type2))
2390     {
2391       if (!tmp || from_unsigned1 != from_unsigned2)
2392         tmp = create_tmp_var (build_nonstandard_integer_type
2393                                 (actual_precision, from_unsigned2),
2394                               NULL);
2395       mult_rhs2 = build_and_insert_cast (gsi, loc, tmp, mult_rhs2);
2396     }
2397
2398   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2399     add_rhs = build_and_insert_cast (gsi, loc, create_tmp_var (type, NULL),
2400                                      add_rhs);
2401
2402   /* Handle constants.  */
2403   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2404     mult_rhs1 = fold_convert (type1, mult_rhs1);
2405   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2406     mult_rhs2 = fold_convert (type2, mult_rhs2);
2407
2408   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2409                                     add_rhs);
2410   update_stmt (gsi_stmt (*gsi));
2411   widen_mul_stats.maccs_inserted++;
2412   return true;
2413 }
2414
2415 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2416    with uses in additions and subtractions to form fused multiply-add
2417    operations.  Returns true if successful and MUL_STMT should be removed.  */
2418
2419 static bool
2420 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2421 {
2422   tree mul_result = gimple_get_lhs (mul_stmt);
2423   tree type = TREE_TYPE (mul_result);
2424   gimple use_stmt, neguse_stmt, fma_stmt;
2425   use_operand_p use_p;
2426   imm_use_iterator imm_iter;
2427
2428   if (FLOAT_TYPE_P (type)
2429       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2430     return false;
2431
2432   /* We don't want to do bitfield reduction ops.  */
2433   if (INTEGRAL_TYPE_P (type)
2434       && (TYPE_PRECISION (type)
2435           != GET_MODE_PRECISION (TYPE_MODE (type))))
2436     return false;
2437
2438   /* If the target doesn't support it, don't generate it.  We assume that
2439      if fma isn't available then fms, fnma or fnms are not either.  */
2440   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2441     return false;
2442
2443   /* If the multiplication has zero uses, it is kept around probably because
2444      of -fnon-call-exceptions.  Don't optimize it away in that case,
2445      it is DCE job.  */
2446   if (has_zero_uses (mul_result))
2447     return false;
2448
2449   /* Make sure that the multiplication statement becomes dead after
2450      the transformation, thus that all uses are transformed to FMAs.
2451      This means we assume that an FMA operation has the same cost
2452      as an addition.  */
2453   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2454     {
2455       enum tree_code use_code;
2456       tree result = mul_result;
2457       bool negate_p = false;
2458
2459       use_stmt = USE_STMT (use_p);
2460
2461       if (is_gimple_debug (use_stmt))
2462         continue;
2463
2464       /* For now restrict this operations to single basic blocks.  In theory
2465          we would want to support sinking the multiplication in
2466          m = a*b;
2467          if ()
2468            ma = m + c;
2469          else
2470            d = m;
2471          to form a fma in the then block and sink the multiplication to the
2472          else block.  */
2473       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2474         return false;
2475
2476       if (!is_gimple_assign (use_stmt))
2477         return false;
2478
2479       use_code = gimple_assign_rhs_code (use_stmt);
2480
2481       /* A negate on the multiplication leads to FNMA.  */
2482       if (use_code == NEGATE_EXPR)
2483         {
2484           ssa_op_iter iter;
2485           use_operand_p usep;
2486
2487           result = gimple_assign_lhs (use_stmt);
2488
2489           /* Make sure the negate statement becomes dead with this
2490              single transformation.  */
2491           if (!single_imm_use (gimple_assign_lhs (use_stmt),
2492                                &use_p, &neguse_stmt))
2493             return false;
2494
2495           /* Make sure the multiplication isn't also used on that stmt.  */
2496           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2497             if (USE_FROM_PTR (usep) == mul_result)
2498               return false;
2499
2500           /* Re-validate.  */
2501           use_stmt = neguse_stmt;
2502           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2503             return false;
2504           if (!is_gimple_assign (use_stmt))
2505             return false;
2506
2507           use_code = gimple_assign_rhs_code (use_stmt);
2508           negate_p = true;
2509         }
2510
2511       switch (use_code)
2512         {
2513         case MINUS_EXPR:
2514           if (gimple_assign_rhs2 (use_stmt) == result)
2515             negate_p = !negate_p;
2516           break;
2517         case PLUS_EXPR:
2518           break;
2519         default:
2520           /* FMA can only be formed from PLUS and MINUS.  */
2521           return false;
2522         }
2523
2524       /* We can't handle a * b + a * b.  */
2525       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2526         return false;
2527
2528       /* While it is possible to validate whether or not the exact form
2529          that we've recognized is available in the backend, the assumption
2530          is that the transformation is never a loss.  For instance, suppose
2531          the target only has the plain FMA pattern available.  Consider
2532          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2533          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
2534          still have 3 operations, but in the FMA form the two NEGs are
2535          independent and could be run in parallel.  */
2536     }
2537
2538   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2539     {
2540       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2541       enum tree_code use_code;
2542       tree addop, mulop1 = op1, result = mul_result;
2543       bool negate_p = false;
2544
2545       if (is_gimple_debug (use_stmt))
2546         continue;
2547
2548       use_code = gimple_assign_rhs_code (use_stmt);
2549       if (use_code == NEGATE_EXPR)
2550         {
2551           result = gimple_assign_lhs (use_stmt);
2552           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2553           gsi_remove (&gsi, true);
2554           release_defs (use_stmt);
2555
2556           use_stmt = neguse_stmt;
2557           gsi = gsi_for_stmt (use_stmt);
2558           use_code = gimple_assign_rhs_code (use_stmt);
2559           negate_p = true;
2560         }
2561
2562       if (gimple_assign_rhs1 (use_stmt) == result)
2563         {
2564           addop = gimple_assign_rhs2 (use_stmt);
2565           /* a * b - c -> a * b + (-c)  */
2566           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2567             addop = force_gimple_operand_gsi (&gsi,
2568                                               build1 (NEGATE_EXPR,
2569                                                       type, addop),
2570                                               true, NULL_TREE, true,
2571                                               GSI_SAME_STMT);
2572         }
2573       else
2574         {
2575           addop = gimple_assign_rhs1 (use_stmt);
2576           /* a - b * c -> (-b) * c + a */
2577           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2578             negate_p = !negate_p;
2579         }
2580
2581       if (negate_p)
2582         mulop1 = force_gimple_operand_gsi (&gsi,
2583                                            build1 (NEGATE_EXPR,
2584                                                    type, mulop1),
2585                                            true, NULL_TREE, true,
2586                                            GSI_SAME_STMT);
2587
2588       fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
2589                                                 gimple_assign_lhs (use_stmt),
2590                                                 mulop1, op2,
2591                                                 addop);
2592       gsi_replace (&gsi, fma_stmt, true);
2593       widen_mul_stats.fmas_inserted++;
2594     }
2595
2596   return true;
2597 }
2598
2599 /* Find integer multiplications where the operands are extended from
2600    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2601    where appropriate.  */
2602
2603 static unsigned int
2604 execute_optimize_widening_mul (void)
2605 {
2606   basic_block bb;
2607   bool cfg_changed = false;
2608
2609   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2610
2611   FOR_EACH_BB (bb)
2612     {
2613       gimple_stmt_iterator gsi;
2614
2615       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2616         {
2617           gimple stmt = gsi_stmt (gsi);
2618           enum tree_code code;
2619
2620           if (is_gimple_assign (stmt))
2621             {
2622               code = gimple_assign_rhs_code (stmt);
2623               switch (code)
2624                 {
2625                 case MULT_EXPR:
2626                   if (!convert_mult_to_widen (stmt, &gsi)
2627                       && convert_mult_to_fma (stmt,
2628                                               gimple_assign_rhs1 (stmt),
2629                                               gimple_assign_rhs2 (stmt)))
2630                     {
2631                       gsi_remove (&gsi, true);
2632                       release_defs (stmt);
2633                       continue;
2634                     }
2635                   break;
2636
2637                 case PLUS_EXPR:
2638                 case MINUS_EXPR:
2639                   convert_plusminus_to_widen (&gsi, stmt, code);
2640                   break;
2641
2642                 default:;
2643                 }
2644             }
2645           else if (is_gimple_call (stmt)
2646                    && gimple_call_lhs (stmt))
2647             {
2648               tree fndecl = gimple_call_fndecl (stmt);
2649               if (fndecl
2650                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2651                 {
2652                   switch (DECL_FUNCTION_CODE (fndecl))
2653                     {
2654                       case BUILT_IN_POWF:
2655                       case BUILT_IN_POW:
2656                       case BUILT_IN_POWL:
2657                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2658                             && REAL_VALUES_EQUAL
2659                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2660                                   dconst2)
2661                             && convert_mult_to_fma (stmt,
2662                                                     gimple_call_arg (stmt, 0),
2663                                                     gimple_call_arg (stmt, 0)))
2664                           {
2665                             unlink_stmt_vdef (stmt);
2666                             if (gsi_remove (&gsi, true)
2667                                 && gimple_purge_dead_eh_edges (bb))
2668                               cfg_changed = true;
2669                             release_defs (stmt);
2670                             continue;
2671                           }
2672                           break;
2673
2674                       default:;
2675                     }
2676                 }
2677             }
2678           gsi_next (&gsi);
2679         }
2680     }
2681
2682   statistics_counter_event (cfun, "widening multiplications inserted",
2683                             widen_mul_stats.widen_mults_inserted);
2684   statistics_counter_event (cfun, "widening maccs inserted",
2685                             widen_mul_stats.maccs_inserted);
2686   statistics_counter_event (cfun, "fused multiply-adds inserted",
2687                             widen_mul_stats.fmas_inserted);
2688
2689   return cfg_changed ? TODO_cleanup_cfg : 0;
2690 }
2691
2692 static bool
2693 gate_optimize_widening_mul (void)
2694 {
2695   return flag_expensive_optimizations && optimize;
2696 }
2697
2698 struct gimple_opt_pass pass_optimize_widening_mul =
2699 {
2700  {
2701   GIMPLE_PASS,
2702   "widening_mul",                       /* name */
2703   gate_optimize_widening_mul,           /* gate */
2704   execute_optimize_widening_mul,        /* execute */
2705   NULL,                                 /* sub */
2706   NULL,                                 /* next */
2707   0,                                    /* static_pass_number */
2708   TV_NONE,                              /* tv_id */
2709   PROP_ssa,                             /* properties_required */
2710   0,                                    /* properties_provided */
2711   0,                                    /* properties_destroyed */
2712   0,                                    /* todo_flags_start */
2713   TODO_verify_ssa
2714   | TODO_verify_stmts
2715   | TODO_update_ssa                     /* todo_flags_finish */
2716  }
2717 };