gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 3, or (at your option) any
  10 later version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  22    operations.  These are common in sequences such as this one:
  23
  24         modulus = sqrt(x*x + y*y + z*z);
  25         x = x / modulus;
  26         y = y / modulus;
  27         z = z / modulus;
  28
  29    that can be optimized to
  30
  31         modulus = sqrt(x*x + y*y + z*z);
  32         rmodulus = 1.0 / modulus;
  33         x = x * rmodulus;
  34         y = y * rmodulus;
  35         z = z * rmodulus;
  36
  37    We do this for loop invariant divisors, and with this pass whenever
  38    we notice that a division has the same divisor multiple times.
  39
  40    Of course, like in PRE, we don't insert a division if a dominator
  41    already has one.  However, this cannot be done as an extension of
  42    PRE for several reasons.
  43
  44    First of all, with some experiments it was found out that the
  45    transformation is not always useful if there are only two divisions
  46    hy the same divisor.  This is probably because modern processors
  47    can pipeline the divisions; on older, in-order processors it should
  48    still be effective to optimize two divisions by the same number.
  49    We make this a param, and it shall be called N in the remainder of
  50    this comment.
  51
  52    Second, if trapping math is active, we have less freedom on where
  53    to insert divisions: we can only do so in basic blocks that already
  54    contain one.  (If divisions don't trap, instead, we can insert
  55    divisions elsewhere, which will be in blocks that are common dominators
  56    of those that have the division).
  57
  58    We really don't want to compute the reciprocal unless a division will
  59    be found.  To do this, we won't insert the division in a basic block
  60    that has less than N divisions *post-dominating* it.
  61
  62    The algorithm constructs a subset of the dominator tree, holding the
  63    blocks containing the divisions and the common dominators to them,
  64    and walk it twice.  The first walk is in post-order, and it annotates
  65    each block with the number of divisions that post-dominate it: this
  66    gives information on where divisions can be inserted profitably.
  67    The second walk is in pre-order, and it inserts divisions as explained
  68    above, and replaces divisions by multiplications.
  69
  70    In the best case, the cost of the pass is O(n_statements).  In the
  71    worst-case, the cost is due to creating the dominator tree subset,
  72    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  73    for n_statements / n_basic_blocks statements.  So, the amortized cost
  74    of creating the dominator tree subset is O(n_basic_blocks) and the
  75    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  76
  77    More practically, the cost will be small because there are few
  78    divisions, and they tend to be in the same basic block, so insert_bb
  79    is called very few times.
  80
  81    If we did this using domwalk.c, an efficient implementation would have
  82    to work on all the variables in a single pass, because we could not
  83    work on just a subset of the dominator tree, as we do now, and the
  84    cost would also be something like O(n_statements * n_basic_blocks).
  85    The data structures would be more complex in order to work on all the
  86    variables in a single pass.  */
  87
  88 #include "config.h"
  89 #include "system.h"
  90 #include "coretypes.h"
  91 #include "tm.h"
  92 #include "flags.h"
  93 #include "tree.h"
  94 #include "tree-flow.h"
  95 #include "timevar.h"
  96 #include "tree-pass.h"
  97 #include "alloc-pool.h"
  98 #include "basic-block.h"
  99 #include "target.h"
 100 #include "gimple-pretty-print.h"
 101
 102 /* FIXME: RTL headers have to be included here for optabs.  */
 103 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 104 #include "expr.h"               /* Because optabs.h wants sepops.  */
 105 #include "optabs.h"
 106
 107 /* This structure represents one basic block that either computes a
 108    division, or is a common dominator for basic block that compute a
 109    division.  */
 110 struct occurrence {
 111   /* The basic block represented by this structure.  */
 112   basic_block bb;
 113
 114   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 115      inserted in BB.  */
 116   tree recip_def;
 117
 118   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 119      was inserted in BB.  */
 120   gimple recip_def_stmt;
 121
 122   /* Pointer to a list of "struct occurrence"s for blocks dominated
 123      by BB.  */
 124   struct occurrence *children;
 125
 126   /* Pointer to the next "struct occurrence"s in the list of blocks
 127      sharing a common dominator.  */
 128   struct occurrence *next;
 129
 130   /* The number of divisions that are in BB before compute_merit.  The
 131      number of divisions that are in BB or post-dominate it after
 132      compute_merit.  */
 133   int num_divisions;
 134
 135   /* True if the basic block has a division, false if it is a common
 136      dominator for basic blocks that do.  If it is false and trapping
 137      math is active, BB is not a candidate for inserting a reciprocal.  */
 138   bool bb_has_division;
 139 };
 140
 141 static struct
 142 {
 143   /* Number of 1.0/X ops inserted.  */
 144   int rdivs_inserted;
 145
 146   /* Number of 1.0/FUNC ops inserted.  */
 147   int rfuncs_inserted;
 148 } reciprocal_stats;
 149
 150 static struct
 151 {
 152   /* Number of cexpi calls inserted.  */
 153   int inserted;
 154 } sincos_stats;
 155
 156 static struct
 157 {
 158   /* Number of hand-written 32-bit bswaps found.  */
 159   int found_32bit;
 160
 161   /* Number of hand-written 64-bit bswaps found.  */
 162   int found_64bit;
 163 } bswap_stats;
 164
 165 static struct
 166 {
 167   /* Number of widening multiplication ops inserted.  */
 168   int widen_mults_inserted;
 169
 170   /* Number of integer multiply-and-accumulate ops inserted.  */
 171   int maccs_inserted;
 172
 173   /* Number of fp fused multiply-add ops inserted.  */
 174   int fmas_inserted;
 175 } widen_mul_stats;
 176
 177 /* The instance of "struct occurrence" representing the highest
 178    interesting block in the dominator tree.  */
 179 static struct occurrence *occ_head;
 180
 181 /* Allocation pool for getting instances of "struct occurrence".  */
 182 static alloc_pool occ_pool;
 183
 184
 185
 186 /* Allocate and return a new struct occurrence for basic block BB, and
 187    whose children list is headed by CHILDREN.  */
 188 static struct occurrence *
 189 occ_new (basic_block bb, struct occurrence *children)
 190 {
 191   struct occurrence *occ;
 192
 193   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 194   memset (occ, 0, sizeof (struct occurrence));
 195
 196   occ->bb = bb;
 197   occ->children = children;
 198   return occ;
 199 }
 200
 201
 202 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 203    list of "struct occurrence"s, one per basic block, having IDOM as
 204    their common dominator.
 205
 206    We try to insert NEW_OCC as deep as possible in the tree, and we also
 207    insert any other block that is a common dominator for BB and one
 208    block already in the tree.  */
 209
 210 static void
 211 insert_bb (struct occurrence *new_occ, basic_block idom,
 212            struct occurrence **p_head)
 213 {
 214   struct occurrence *occ, **p_occ;
 215
 216   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 217     {
 218       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 219       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 220       if (dom == bb)
 221         {
 222           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 223              from its list.  */
 224           *p_occ = occ->next;
 225           occ->next = new_occ->children;
 226           new_occ->children = occ;
 227
 228           /* Try the next block (it may as well be dominated by BB).  */
 229         }
 230
 231       else if (dom == occ_bb)
 232         {
 233           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 234           insert_bb (new_occ, dom, &occ->children);
 235           return;
 236         }
 237
 238       else if (dom != idom)
 239         {
 240           gcc_assert (!dom->aux);
 241
 242           /* There is a dominator between IDOM and BB, add it and make
 243              two children out of NEW_OCC and OCC.  First, remove OCC from
 244              its list.  */
 245           *p_occ = occ->next;
 246           new_occ->next = occ;
 247           occ->next = NULL;
 248
 249           /* None of the previous blocks has DOM as a dominator: if we tail
 250              recursed, we would reexamine them uselessly. Just switch BB with
 251              DOM, and go on looking for blocks dominated by DOM.  */
 252           new_occ = occ_new (dom, new_occ);
 253         }
 254
 255       else
 256         {
 257           /* Nothing special, go on with the next element.  */
 258           p_occ = &occ->next;
 259         }
 260     }
 261
 262   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 263   new_occ->next = *p_head;
 264   *p_head = new_occ;
 265 }
 266
 267 /* Register that we found a division in BB.  */
 268
 269 static inline void
 270 register_division_in (basic_block bb)
 271 {
 272   struct occurrence *occ;
 273
 274   occ = (struct occurrence *) bb->aux;
 275   if (!occ)
 276     {
 277       occ = occ_new (bb, NULL);
 278       insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
 279     }
 280
 281   occ->bb_has_division = true;
 282   occ->num_divisions++;
 283 }
 284
 285
 286 /* Compute the number of divisions that postdominate each block in OCC and
 287    its children.  */
 288
 289 static void
 290 compute_merit (struct occurrence *occ)
 291 {
 292   struct occurrence *occ_child;
 293   basic_block dom = occ->bb;
 294
 295   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 296     {
 297       basic_block bb;
 298       if (occ_child->children)
 299         compute_merit (occ_child);
 300
 301       if (flag_exceptions)
 302         bb = single_noncomplex_succ (dom);
 303       else
 304         bb = dom;
 305
 306       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 307         occ->num_divisions += occ_child->num_divisions;
 308     }
 309 }
 310
 311
 312 /* Return whether USE_STMT is a floating-point division by DEF.  */
 313 static inline bool
 314 is_division_by (gimple use_stmt, tree def)
 315 {
 316   return is_gimple_assign (use_stmt)
 317          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 318          && gimple_assign_rhs2 (use_stmt) == def
 319          /* Do not recognize x / x as valid division, as we are getting
 320             confused later by replacing all immediate uses x in such
 321             a stmt.  */
 322          && gimple_assign_rhs1 (use_stmt) != def;
 323 }
 324
 325 /* Walk the subset of the dominator tree rooted at OCC, setting the
 326    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 327    the given basic block.  The field may be left NULL, of course,
 328    if it is not possible or profitable to do the optimization.
 329
 330    DEF_BSI is an iterator pointing at the statement defining DEF.
 331    If RECIP_DEF is set, a dominator already has a computation that can
 332    be used.  */
 333
 334 static void
 335 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 336                     tree def, tree recip_def, int threshold)
 337 {
 338   tree type;
 339   gimple new_stmt;
 340   gimple_stmt_iterator gsi;
 341   struct occurrence *occ_child;
 342
 343   if (!recip_def
 344       && (occ->bb_has_division || !flag_trapping_math)
 345       && occ->num_divisions >= threshold)
 346     {
 347       /* Make a variable with the replacement and substitute it.  */
 348       type = TREE_TYPE (def);
 349       recip_def = make_rename_temp (type, "reciptmp");
 350       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 351                                                build_one_cst (type), def);
 352
 353       if (occ->bb_has_division)
 354         {
 355           /* Case 1: insert before an existing division.  */
 356           gsi = gsi_after_labels (occ->bb);
 357           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 358             gsi_next (&gsi);
 359
 360           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 361         }
 362       else if (def_gsi && occ->bb == def_gsi->bb)
 363         {
 364           /* Case 2: insert right after the definition.  Note that this will
 365              never happen if the definition statement can throw, because in
 366              that case the sole successor of the statement's basic block will
 367              dominate all the uses as well.  */
 368           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 369         }
 370       else
 371         {
 372           /* Case 3: insert in a basic block not containing defs/uses.  */
 373           gsi = gsi_after_labels (occ->bb);
 374           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 375         }
 376
 377       reciprocal_stats.rdivs_inserted++;
 378
 379       occ->recip_def_stmt = new_stmt;
 380     }
 381
 382   occ->recip_def = recip_def;
 383   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 384     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 385 }
 386
 387
 388 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 389    possible.  */
 390
 391 static inline void
 392 replace_reciprocal (use_operand_p use_p)
 393 {
 394   gimple use_stmt = USE_STMT (use_p);
 395   basic_block bb = gimple_bb (use_stmt);
 396   struct occurrence *occ = (struct occurrence *) bb->aux;
 397
 398   if (optimize_bb_for_speed_p (bb)
 399       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 400     {
 401       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 402       SET_USE (use_p, occ->recip_def);
 403       fold_stmt_inplace (use_stmt);
 404       update_stmt (use_stmt);
 405     }
 406 }
 407
 408
 409 /* Free OCC and return one more "struct occurrence" to be freed.  */
 410
 411 static struct occurrence *
 412 free_bb (struct occurrence *occ)
 413 {
 414   struct occurrence *child, *next;
 415
 416   /* First get the two pointers hanging off OCC.  */
 417   next = occ->next;
 418   child = occ->children;
 419   occ->bb->aux = NULL;
 420   pool_free (occ_pool, occ);
 421
 422   /* Now ensure that we don't recurse unless it is necessary.  */
 423   if (!child)
 424     return next;
 425   else
 426     {
 427       while (next)
 428         next = free_bb (next);
 429
 430       return child;
 431     }
 432 }
 433
 434
 435 /* Look for floating-point divisions among DEF's uses, and try to
 436    replace them by multiplications with the reciprocal.  Add
 437    as many statements computing the reciprocal as needed.
 438
 439    DEF must be a GIMPLE register of a floating-point type.  */
 440
 441 static void
 442 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 443 {
 444   use_operand_p use_p;
 445   imm_use_iterator use_iter;
 446   struct occurrence *occ;
 447   int count = 0, threshold;
 448
 449   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 450
 451   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 452     {
 453       gimple use_stmt = USE_STMT (use_p);
 454       if (is_division_by (use_stmt, def))
 455         {
 456           register_division_in (gimple_bb (use_stmt));
 457           count++;
 458         }
 459     }
 460
 461   /* Do the expensive part only if we can hope to optimize something.  */
 462   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 463   if (count >= threshold)
 464     {
 465       gimple use_stmt;
 466       for (occ = occ_head; occ; occ = occ->next)
 467         {
 468           compute_merit (occ);
 469           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 470         }
 471
 472       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 473         {
 474           if (is_division_by (use_stmt, def))
 475             {
 476               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 477                 replace_reciprocal (use_p);
 478             }
 479         }
 480     }
 481
 482   for (occ = occ_head; occ; )
 483     occ = free_bb (occ);
 484
 485   occ_head = NULL;
 486 }
 487
 488 static bool
 489 gate_cse_reciprocals (void)
 490 {
 491   return optimize && flag_reciprocal_math;
 492 }
 493
 494 /* Go through all the floating-point SSA_NAMEs, and call
 495    execute_cse_reciprocals_1 on each of them.  */
 496 static unsigned int
 497 execute_cse_reciprocals (void)
 498 {
 499   basic_block bb;
 500   tree arg;
 501
 502   occ_pool = create_alloc_pool ("dominators for recip",
 503                                 sizeof (struct occurrence),
 504                                 n_basic_blocks / 3 + 1);
 505
 506   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 507   calculate_dominance_info (CDI_DOMINATORS);
 508   calculate_dominance_info (CDI_POST_DOMINATORS);
 509
 510 #ifdef ENABLE_CHECKING
 511   FOR_EACH_BB (bb)
 512     gcc_assert (!bb->aux);
 513 #endif
 514
 515   for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
 516     if (gimple_default_def (cfun, arg)
 517         && FLOAT_TYPE_P (TREE_TYPE (arg))
 518         && is_gimple_reg (arg))
 519       execute_cse_reciprocals_1 (NULL, gimple_default_def (cfun, arg));
 520
 521   FOR_EACH_BB (bb)
 522     {
 523       gimple_stmt_iterator gsi;
 524       gimple phi;
 525       tree def;
 526
 527       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 528         {
 529           phi = gsi_stmt (gsi);
 530           def = PHI_RESULT (phi);
 531           if (FLOAT_TYPE_P (TREE_TYPE (def))
 532               && is_gimple_reg (def))
 533             execute_cse_reciprocals_1 (NULL, def);
 534         }
 535
 536       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 537         {
 538           gimple stmt = gsi_stmt (gsi);
 539
 540           if (gimple_has_lhs (stmt)
 541               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 542               && FLOAT_TYPE_P (TREE_TYPE (def))
 543               && TREE_CODE (def) == SSA_NAME)
 544             execute_cse_reciprocals_1 (&gsi, def);
 545         }
 546
 547       if (optimize_bb_for_size_p (bb))
 548         continue;
 549
 550       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 551       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 552         {
 553           gimple stmt = gsi_stmt (gsi);
 554           tree fndecl;
 555
 556           if (is_gimple_assign (stmt)
 557               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 558             {
 559               tree arg1 = gimple_assign_rhs2 (stmt);
 560               gimple stmt1;
 561
 562               if (TREE_CODE (arg1) != SSA_NAME)
 563                 continue;
 564
 565               stmt1 = SSA_NAME_DEF_STMT (arg1);
 566
 567               if (is_gimple_call (stmt1)
 568                   && gimple_call_lhs (stmt1)
 569                   && (fndecl = gimple_call_fndecl (stmt1))
 570                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 571                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 572                 {
 573                   enum built_in_function code;
 574                   bool md_code, fail;
 575                   imm_use_iterator ui;
 576                   use_operand_p use_p;
 577
 578                   code = DECL_FUNCTION_CODE (fndecl);
 579                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 580
 581                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 582                   if (!fndecl)
 583                     continue;
 584
 585                   /* Check that all uses of the SSA name are divisions,
 586                      otherwise replacing the defining statement will do
 587                      the wrong thing.  */
 588                   fail = false;
 589                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 590                     {
 591                       gimple stmt2 = USE_STMT (use_p);
 592                       if (is_gimple_debug (stmt2))
 593                         continue;
 594                       if (!is_gimple_assign (stmt2)
 595                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 596                           || gimple_assign_rhs1 (stmt2) == arg1
 597                           || gimple_assign_rhs2 (stmt2) != arg1)
 598                         {
 599                           fail = true;
 600                           break;
 601                         }
 602                     }
 603                   if (fail)
 604                     continue;
 605
 606                   gimple_replace_lhs (stmt1, arg1);
 607                   gimple_call_set_fndecl (stmt1, fndecl);
 608                   update_stmt (stmt1);
 609                   reciprocal_stats.rfuncs_inserted++;
 610
 611                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 612                     {
 613                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 614                       fold_stmt_inplace (stmt);
 615                       update_stmt (stmt);
 616                     }
 617                 }
 618             }
 619         }
 620     }
 621
 622   statistics_counter_event (cfun, "reciprocal divs inserted",
 623                             reciprocal_stats.rdivs_inserted);
 624   statistics_counter_event (cfun, "reciprocal functions inserted",
 625                             reciprocal_stats.rfuncs_inserted);
 626
 627   free_dominance_info (CDI_DOMINATORS);
 628   free_dominance_info (CDI_POST_DOMINATORS);
 629   free_alloc_pool (occ_pool);
 630   return 0;
 631 }
 632
 633 struct gimple_opt_pass pass_cse_reciprocals =
 634 {
 635  {
 636   GIMPLE_PASS,
 637   "recip",                              /* name */
 638   gate_cse_reciprocals,                 /* gate */
 639   execute_cse_reciprocals,              /* execute */
 640   NULL,                                 /* sub */
 641   NULL,                                 /* next */
 642   0,                                    /* static_pass_number */
 643   TV_NONE,                              /* tv_id */
 644   PROP_ssa,                             /* properties_required */
 645   0,                                    /* properties_provided */
 646   0,                                    /* properties_destroyed */
 647   0,                                    /* todo_flags_start */
 648   TODO_update_ssa | TODO_verify_ssa
 649     | TODO_verify_stmts                /* todo_flags_finish */
 650  }
 651 };
 652
 653 /* Records an occurrence at statement USE_STMT in the vector of trees
 654    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 655    is not yet initialized.  Returns true if the occurrence was pushed on
 656    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 657    statements in the vector.  */
 658
 659 static bool
 660 maybe_record_sincos (VEC(gimple, heap) **stmts,
 661                      basic_block *top_bb, gimple use_stmt)
 662 {
 663   basic_block use_bb = gimple_bb (use_stmt);
 664   if (*top_bb
 665       && (*top_bb == use_bb
 666           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 667     VEC_safe_push (gimple, heap, *stmts, use_stmt);
 668   else if (!*top_bb
 669            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 670     {
 671       VEC_safe_push (gimple, heap, *stmts, use_stmt);
 672       *top_bb = use_bb;
 673     }
 674   else
 675     return false;
 676
 677   return true;
 678 }
 679
 680 /* Look for sin, cos and cexpi calls with the same argument NAME and
 681    create a single call to cexpi CSEing the result in this case.
 682    We first walk over all immediate uses of the argument collecting
 683    statements that we can CSE in a vector and in a second pass replace
 684    the statement rhs with a REALPART or IMAGPART expression on the
 685    result of the cexpi call we insert before the use statement that
 686    dominates all other candidates.  */
 687
 688 static bool
 689 execute_cse_sincos_1 (tree name)
 690 {
 691   gimple_stmt_iterator gsi;
 692   imm_use_iterator use_iter;
 693   tree fndecl, res, type;
 694   gimple def_stmt, use_stmt, stmt;
 695   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 696   VEC(gimple, heap) *stmts = NULL;
 697   basic_block top_bb = NULL;
 698   int i;
 699   bool cfg_changed = false;
 700
 701   type = TREE_TYPE (name);
 702   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 703     {
 704       if (gimple_code (use_stmt) != GIMPLE_CALL
 705           || !gimple_call_lhs (use_stmt)
 706           || !(fndecl = gimple_call_fndecl (use_stmt))
 707           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 708         continue;
 709
 710       switch (DECL_FUNCTION_CODE (fndecl))
 711         {
 712         CASE_FLT_FN (BUILT_IN_COS):
 713           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 714           break;
 715
 716         CASE_FLT_FN (BUILT_IN_SIN):
 717           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 718           break;
 719
 720         CASE_FLT_FN (BUILT_IN_CEXPI):
 721           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 722           break;
 723
 724         default:;
 725         }
 726     }
 727
 728   if (seen_cos + seen_sin + seen_cexpi <= 1)
 729     {
 730       VEC_free(gimple, heap, stmts);
 731       return false;
 732     }
 733
 734   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 735      the name def statement.  */
 736   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 737   if (!fndecl)
 738     return false;
 739   res = create_tmp_reg (TREE_TYPE (TREE_TYPE (fndecl)), "sincostmp");
 740   stmt = gimple_build_call (fndecl, 1, name);
 741   res = make_ssa_name (res, stmt);
 742   gimple_call_set_lhs (stmt, res);
 743
 744   def_stmt = SSA_NAME_DEF_STMT (name);
 745   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 746       && gimple_code (def_stmt) != GIMPLE_PHI
 747       && gimple_bb (def_stmt) == top_bb)
 748     {
 749       gsi = gsi_for_stmt (def_stmt);
 750       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 751     }
 752   else
 753     {
 754       gsi = gsi_after_labels (top_bb);
 755       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 756     }
 757   update_stmt (stmt);
 758   sincos_stats.inserted++;
 759
 760   /* And adjust the recorded old call sites.  */
 761   for (i = 0; VEC_iterate(gimple, stmts, i, use_stmt); ++i)
 762     {
 763       tree rhs = NULL;
 764       fndecl = gimple_call_fndecl (use_stmt);
 765
 766       switch (DECL_FUNCTION_CODE (fndecl))
 767         {
 768         CASE_FLT_FN (BUILT_IN_COS):
 769           rhs = fold_build1 (REALPART_EXPR, type, res);
 770           break;
 771
 772         CASE_FLT_FN (BUILT_IN_SIN):
 773           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 774           break;
 775
 776         CASE_FLT_FN (BUILT_IN_CEXPI):
 777           rhs = res;
 778           break;
 779
 780         default:;
 781           gcc_unreachable ();
 782         }
 783
 784         /* Replace call with a copy.  */
 785         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 786
 787         gsi = gsi_for_stmt (use_stmt);
 788         gsi_replace (&gsi, stmt, true);
 789         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 790           cfg_changed = true;
 791     }
 792
 793   VEC_free(gimple, heap, stmts);
 794
 795   return cfg_changed;
 796 }
 797
 798 /* To evaluate powi(x,n), the floating point value x raised to the
 799    constant integer exponent n, we use a hybrid algorithm that
 800    combines the "window method" with look-up tables.  For an
 801    introduction to exponentiation algorithms and "addition chains",
 802    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 803    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 804    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 805    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 806
 807 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 808    multiplications to inline before calling the system library's pow
 809    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 810    so this default never requires calling pow, powf or powl.  */
 811
 812 #ifndef POWI_MAX_MULTS
 813 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 814 #endif
 815
 816 /* The size of the "optimal power tree" lookup table.  All
 817    exponents less than this value are simply looked up in the
 818    powi_table below.  This threshold is also used to size the
 819    cache of pseudo registers that hold intermediate results.  */
 820 #define POWI_TABLE_SIZE 256
 821
 822 /* The size, in bits of the window, used in the "window method"
 823    exponentiation algorithm.  This is equivalent to a radix of
 824    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 825 #define POWI_WINDOW_SIZE 3
 826
 827 /* The following table is an efficient representation of an
 828    "optimal power tree".  For each value, i, the corresponding
 829    value, j, in the table states than an optimal evaluation
 830    sequence for calculating pow(x,i) can be found by evaluating
 831    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 832    100 integers is given in Knuth's "Seminumerical algorithms".  */
 833
 834 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 835   {
 836       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 837       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 838       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 839      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 840      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 841      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 842      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 843      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 844      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 845      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 846      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 847      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 848      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 849      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 850      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 851      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 852      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 853      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 854      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 855      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 856      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 857      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 858      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 859      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 860      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 861     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 862     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 863     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 864     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 865     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 866     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 867     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 868   };
 869
 870
 871 /* Return the number of multiplications required to calculate
 872    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 873    subroutine of powi_cost.  CACHE is an array indicating
 874    which exponents have already been calculated.  */
 875
 876 static int
 877 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 878 {
 879   /* If we've already calculated this exponent, then this evaluation
 880      doesn't require any additional multiplications.  */
 881   if (cache[n])
 882     return 0;
 883
 884   cache[n] = true;
 885   return powi_lookup_cost (n - powi_table[n], cache)
 886          + powi_lookup_cost (powi_table[n], cache) + 1;
 887 }
 888
 889 /* Return the number of multiplications required to calculate
 890    powi(x,n) for an arbitrary x, given the exponent N.  This
 891    function needs to be kept in sync with powi_as_mults below.  */
 892
 893 static int
 894 powi_cost (HOST_WIDE_INT n)
 895 {
 896   bool cache[POWI_TABLE_SIZE];
 897   unsigned HOST_WIDE_INT digit;
 898   unsigned HOST_WIDE_INT val;
 899   int result;
 900
 901   if (n == 0)
 902     return 0;
 903
 904   /* Ignore the reciprocal when calculating the cost.  */
 905   val = (n < 0) ? -n : n;
 906
 907   /* Initialize the exponent cache.  */
 908   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 909   cache[1] = true;
 910
 911   result = 0;
 912
 913   while (val >= POWI_TABLE_SIZE)
 914     {
 915       if (val & 1)
 916         {
 917           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 918           result += powi_lookup_cost (digit, cache)
 919                     + POWI_WINDOW_SIZE + 1;
 920           val >>= POWI_WINDOW_SIZE;
 921         }
 922       else
 923         {
 924           val >>= 1;
 925           result++;
 926         }
 927     }
 928
 929   return result + powi_lookup_cost (val, cache);
 930 }
 931
 932 /* Recursive subroutine of powi_as_mults.  This function takes the
 933    array, CACHE, of already calculated exponents and an exponent N and
 934    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 935
 936 static tree
 937 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 938                  HOST_WIDE_INT n, tree *cache, tree target)
 939 {
 940   tree op0, op1, ssa_target;
 941   unsigned HOST_WIDE_INT digit;
 942   gimple mult_stmt;
 943
 944   if (n < POWI_TABLE_SIZE && cache[n])
 945     return cache[n];
 946
 947   ssa_target = make_ssa_name (target, NULL);
 948
 949   if (n < POWI_TABLE_SIZE)
 950     {
 951       cache[n] = ssa_target;
 952       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache, target);
 953       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache, target);
 954     }
 955   else if (n & 1)
 956     {
 957       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 958       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache, target);
 959       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache, target);
 960     }
 961   else
 962     {
 963       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache, target);
 964       op1 = op0;
 965     }
 966
 967   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
 968   gimple_set_location (mult_stmt, loc);
 969   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
 970
 971   return ssa_target;
 972 }
 973
 974 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
 975    This function needs to be kept in sync with powi_cost above.  */
 976
 977 static tree
 978 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
 979                tree arg0, HOST_WIDE_INT n)
 980 {
 981   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0), target;
 982   gimple div_stmt;
 983
 984   if (n == 0)
 985     return build_real (type, dconst1);
 986
 987   memset (cache, 0,  sizeof (cache));
 988   cache[1] = arg0;
 989
 990   target = create_tmp_reg (type, "powmult");
 991   add_referenced_var (target);
 992
 993   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache, target);
 994
 995   if (n >= 0)
 996     return result;
 997
 998   /* If the original exponent was negative, reciprocate the result.  */
 999   target = make_ssa_name (target, NULL);
1000   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1001                                            build_real (type, dconst1),
1002                                            result);
1003   gimple_set_location (div_stmt, loc);
1004   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1005
1006   return target;
1007 }
1008
1009 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1010    location info LOC.  If the arguments are appropriate, create an
1011    equivalent sequence of statements prior to GSI using an optimal
1012    number of multiplications, and return an expession holding the
1013    result.  */
1014
1015 static tree
1016 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1017                             tree arg0, HOST_WIDE_INT n)
1018 {
1019   /* Avoid largest negative number.  */
1020   if (n != -n
1021       && ((n >= -1 && n <= 2)
1022           || (optimize_function_for_speed_p (cfun)
1023               && powi_cost (n) <= POWI_MAX_MULTS)))
1024     return powi_as_mults (gsi, loc, arg0, n);
1025
1026   return NULL_TREE;
1027 }
1028
1029 /* Build a gimple call statement that calls FN with argument ARG.
1030    Set the lhs of the call statement to a fresh SSA name for
1031    variable VAR.  If VAR is NULL, first allocate it.  Insert the
1032    statement prior to GSI's current position, and return the fresh
1033    SSA name.  */
1034
1035 static tree
1036 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1037                        tree *var, tree fn, tree arg)
1038 {
1039   gimple call_stmt;
1040   tree ssa_target;
1041
1042   if (!*var)
1043     {
1044       *var = create_tmp_reg (TREE_TYPE (arg), "powroot");
1045       add_referenced_var (*var);
1046     }
1047
1048   call_stmt = gimple_build_call (fn, 1, arg);
1049   ssa_target = make_ssa_name (*var, NULL);
1050   gimple_set_lhs (call_stmt, ssa_target);
1051   gimple_set_location (call_stmt, loc);
1052   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1053
1054   return ssa_target;
1055 }
1056
1057 /* Build a gimple binary operation with the given CODE and arguments
1058    ARG0, ARG1, assigning the result to a new SSA name for variable
1059    TARGET.  Insert the statement prior to GSI's current position, and
1060    return the fresh SSA name.*/
1061
1062 static tree
1063 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1064                         tree target, enum tree_code code, tree arg0, tree arg1)
1065 {
1066   tree result = make_ssa_name (target, NULL);
1067   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1068   gimple_set_location (stmt, loc);
1069   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1070   return result;
1071 }
1072
1073 /* Build a gimple reference operation with the given CODE and argument
1074    ARG, assigning the result to a new SSA name for variable TARGET.
1075    Insert the statement prior to GSI's current position, and return
1076    the fresh SSA name.  */
1077
1078 static inline tree
1079 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1080                       tree target, enum tree_code code, tree arg0)
1081 {
1082   tree result = make_ssa_name (target, NULL);
1083   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1084   gimple_set_location (stmt, loc);
1085   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1086   return result;
1087 }
1088
1089 /* Build a gimple assignment to cast VAL to TARGET.  Insert the statement
1090    prior to GSI's current position, and return the fresh SSA name.  */
1091
1092 static tree
1093 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1094                        tree target, tree val)
1095 {
1096   return build_and_insert_binop (gsi, loc, target, CONVERT_EXPR, val, NULL);
1097 }
1098
1099 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1100    with location info LOC.  If possible, create an equivalent and
1101    less expensive sequence of statements prior to GSI, and return an
1102    expession holding the result.  */
1103
1104 static tree
1105 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1106                            tree arg0, tree arg1)
1107 {
1108   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1109   REAL_VALUE_TYPE c2, dconst3;
1110   HOST_WIDE_INT n;
1111   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1112   tree target = NULL_TREE;
1113   enum machine_mode mode;
1114   bool hw_sqrt_exists;
1115
1116   /* If the exponent isn't a constant, there's nothing of interest
1117      to be done.  */
1118   if (TREE_CODE (arg1) != REAL_CST)
1119     return NULL_TREE;
1120
1121   /* If the exponent is equivalent to an integer, expand to an optimal
1122      multiplication sequence when profitable.  */
1123   c = TREE_REAL_CST (arg1);
1124   n = real_to_integer (&c);
1125   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1126
1127   if (real_identical (&c, &cint)
1128       && ((n >= -1 && n <= 2)
1129           || (flag_unsafe_math_optimizations
1130               && optimize_insn_for_speed_p ()
1131               && powi_cost (n) <= POWI_MAX_MULTS)))
1132     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1133
1134   /* Attempt various optimizations using sqrt and cbrt.  */
1135   type = TREE_TYPE (arg0);
1136   mode = TYPE_MODE (type);
1137   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1138
1139   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1140      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1141      sqrt(-0) = -0.  */
1142   if (sqrtfn
1143       && REAL_VALUES_EQUAL (c, dconsthalf)
1144       && !HONOR_SIGNED_ZEROS (mode))
1145     return build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1146
1147   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1148      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1149      so do this optimization even if -Os.  Don't do this optimization
1150      if we don't have a hardware sqrt insn.  */
1151   dconst1_4 = dconst1;
1152   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1153   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1154
1155   if (flag_unsafe_math_optimizations
1156       && sqrtfn
1157       && REAL_VALUES_EQUAL (c, dconst1_4)
1158       && hw_sqrt_exists)
1159     {
1160       /* sqrt(x)  */
1161       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1162
1163       /* sqrt(sqrt(x))  */
1164       return build_and_insert_call (gsi, loc, &target, sqrtfn, sqrt_arg0);
1165     }
1166
1167   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1168      optimizing for space.  Don't do this optimization if we don't have
1169      a hardware sqrt insn.  */
1170   real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1171   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1172
1173   if (flag_unsafe_math_optimizations
1174       && sqrtfn
1175       && optimize_function_for_speed_p (cfun)
1176       && REAL_VALUES_EQUAL (c, dconst3_4)
1177       && hw_sqrt_exists)
1178     {
1179       /* sqrt(x)  */
1180       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1181
1182       /* sqrt(sqrt(x))  */
1183       sqrt_sqrt = build_and_insert_call (gsi, loc, &target, sqrtfn, sqrt_arg0);
1184
1185       /* sqrt(x) * sqrt(sqrt(x))  */
1186       return build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1187                                      sqrt_arg0, sqrt_sqrt);
1188     }
1189
1190   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1191      optimizations since 1./3. is not exactly representable.  If x
1192      is negative and finite, the correct value of pow(x,1./3.) is
1193      a NaN with the "invalid" exception raised, because the value
1194      of 1./3. actually has an even denominator.  The correct value
1195      of cbrt(x) is a negative real value.  */
1196   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1197   dconst1_3 = real_value_truncate (mode, dconst_third ());
1198
1199   if (flag_unsafe_math_optimizations
1200       && cbrtfn
1201       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1202       && REAL_VALUES_EQUAL (c, dconst1_3))
1203     return build_and_insert_call (gsi, loc, &target, cbrtfn, arg0);
1204
1205   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1206      if we don't have a hardware sqrt insn.  */
1207   dconst1_6 = dconst1_3;
1208   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1209
1210   if (flag_unsafe_math_optimizations
1211       && sqrtfn
1212       && cbrtfn
1213       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1214       && optimize_function_for_speed_p (cfun)
1215       && hw_sqrt_exists
1216       && REAL_VALUES_EQUAL (c, dconst1_6))
1217     {
1218       /* sqrt(x)  */
1219       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1220
1221       /* cbrt(sqrt(x))  */
1222       return build_and_insert_call (gsi, loc, &target, cbrtfn, sqrt_arg0);
1223     }
1224
1225   /* Optimize pow(x,c), where n = 2c for some nonzero integer n, into
1226
1227        sqrt(x) * powi(x, n/2),                n > 0;
1228        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1229
1230      Do not calculate the powi factor when n/2 = 0.  */
1231   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1232   n = real_to_integer (&c2);
1233   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1234
1235   if (flag_unsafe_math_optimizations
1236       && sqrtfn
1237       && real_identical (&c2, &cint))
1238     {
1239       tree powi_x_ndiv2 = NULL_TREE;
1240
1241       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1242          possible or profitable, give up.  Skip the degenerate case when
1243          n is 1 or -1, where the result is always 1.  */
1244       if (absu_hwi (n) != 1)
1245         {
1246           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1247                                                      abs_hwi (n / 2));
1248           if (!powi_x_ndiv2)
1249             return NULL_TREE;
1250         }
1251
1252       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1253          result of the optimal multiply sequence just calculated.  */
1254       sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1255
1256       if (absu_hwi (n) == 1)
1257         result = sqrt_arg0;
1258       else
1259         result = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1260                                          sqrt_arg0, powi_x_ndiv2);
1261
1262       /* If n is negative, reciprocate the result.  */
1263       if (n < 0)
1264         result = build_and_insert_binop (gsi, loc, target, RDIV_EXPR,
1265                                          build_real (type, dconst1), result);
1266       return result;
1267     }
1268
1269   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1270
1271      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1272      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1273
1274      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1275      different from pow(x, 1./3.) due to rounding and behavior with
1276      negative x, we need to constrain this transformation to unsafe
1277      math and positive x or finite math.  */
1278   real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1279   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1280   real_round (&c2, mode, &c2);
1281   n = real_to_integer (&c2);
1282   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1283   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1284   real_convert (&c2, mode, &c2);
1285
1286   if (flag_unsafe_math_optimizations
1287       && cbrtfn
1288       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1289       && real_identical (&c2, &c)
1290       && optimize_function_for_speed_p (cfun)
1291       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1292     {
1293       tree powi_x_ndiv3 = NULL_TREE;
1294
1295       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1296          possible or profitable, give up.  Skip the degenerate case when
1297          abs(n) < 3, where the result is always 1.  */
1298       if (absu_hwi (n) >= 3)
1299         {
1300           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1301                                                      abs_hwi (n / 3));
1302           if (!powi_x_ndiv3)
1303             return NULL_TREE;
1304         }
1305
1306       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1307          as that creates an unnecessary variable.  Instead, just produce
1308          either cbrt(x) or cbrt(x) * cbrt(x).  */
1309       cbrt_x = build_and_insert_call (gsi, loc, &target, cbrtfn, arg0);
1310
1311       if (absu_hwi (n) % 3 == 1)
1312         powi_cbrt_x = cbrt_x;
1313       else
1314         powi_cbrt_x = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1315                                               cbrt_x, cbrt_x);
1316
1317       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1318       if (absu_hwi (n) < 3)
1319         result = powi_cbrt_x;
1320       else
1321         result = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1322                                          powi_x_ndiv3, powi_cbrt_x);
1323
1324       /* If n is negative, reciprocate the result.  */
1325       if (n < 0)
1326         result = build_and_insert_binop (gsi, loc, target, RDIV_EXPR,
1327                                          build_real (type, dconst1), result);
1328
1329       return result;
1330     }
1331
1332   /* No optimizations succeeded.  */
1333   return NULL_TREE;
1334 }
1335
1336 /* ARG is the argument to a cabs builtin call in GSI with location info
1337    LOC.  Create a sequence of statements prior to GSI that calculates
1338    sqrt(R*R + I*I), where R and I are the real and imaginary components
1339    of ARG, respectively.  Return an expression holding the result.  */
1340
1341 static tree
1342 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1343 {
1344   tree target, real_part, imag_part, addend1, addend2, sum, result;
1345   tree type = TREE_TYPE (TREE_TYPE (arg));
1346   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1347   enum machine_mode mode = TYPE_MODE (type);
1348
1349   if (!flag_unsafe_math_optimizations
1350       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1351       || !sqrtfn
1352       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1353     return NULL_TREE;
1354
1355   target = create_tmp_reg (type, "cabs");
1356   add_referenced_var (target);
1357
1358   real_part = build_and_insert_ref (gsi, loc, type, target,
1359                                     REALPART_EXPR, arg);
1360   addend1 = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1361                                     real_part, real_part);
1362   imag_part = build_and_insert_ref (gsi, loc, type, target,
1363                                     IMAGPART_EXPR, arg);
1364   addend2 = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1365                                     imag_part, imag_part);
1366   sum = build_and_insert_binop (gsi, loc, target, PLUS_EXPR, addend1, addend2);
1367   result = build_and_insert_call (gsi, loc, &target, sqrtfn, sum);
1368
1369   return result;
1370 }
1371
1372 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1373    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1374    an optimal number of multiplies, when n is a constant.  */
1375
1376 static unsigned int
1377 execute_cse_sincos (void)
1378 {
1379   basic_block bb;
1380   bool cfg_changed = false;
1381
1382   calculate_dominance_info (CDI_DOMINATORS);
1383   memset (&sincos_stats, 0, sizeof (sincos_stats));
1384
1385   FOR_EACH_BB (bb)
1386     {
1387       gimple_stmt_iterator gsi;
1388
1389       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1390         {
1391           gimple stmt = gsi_stmt (gsi);
1392           tree fndecl;
1393
1394           if (is_gimple_call (stmt)
1395               && gimple_call_lhs (stmt)
1396               && (fndecl = gimple_call_fndecl (stmt))
1397               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1398             {
1399               tree arg, arg0, arg1, result;
1400               HOST_WIDE_INT n;
1401               location_t loc;
1402
1403               switch (DECL_FUNCTION_CODE (fndecl))
1404                 {
1405                 CASE_FLT_FN (BUILT_IN_COS):
1406                 CASE_FLT_FN (BUILT_IN_SIN):
1407                 CASE_FLT_FN (BUILT_IN_CEXPI):
1408                   /* Make sure we have either sincos or cexp.  */
1409                   if (!TARGET_HAS_SINCOS && !TARGET_C99_FUNCTIONS)
1410                     break;
1411
1412                   arg = gimple_call_arg (stmt, 0);
1413                   if (TREE_CODE (arg) == SSA_NAME)
1414                     cfg_changed |= execute_cse_sincos_1 (arg);
1415                   break;
1416
1417                 CASE_FLT_FN (BUILT_IN_POW):
1418                   arg0 = gimple_call_arg (stmt, 0);
1419                   arg1 = gimple_call_arg (stmt, 1);
1420
1421                   loc = gimple_location (stmt);
1422                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1423
1424                   if (result)
1425                     {
1426                       tree lhs = gimple_get_lhs (stmt);
1427                       gimple new_stmt = gimple_build_assign (lhs, result);
1428                       gimple_set_location (new_stmt, loc);
1429                       unlink_stmt_vdef (stmt);
1430                       gsi_replace (&gsi, new_stmt, true);
1431                     }
1432                   break;
1433
1434                 CASE_FLT_FN (BUILT_IN_POWI):
1435                   arg0 = gimple_call_arg (stmt, 0);
1436                   arg1 = gimple_call_arg (stmt, 1);
1437                   if (!host_integerp (arg1, 0))
1438                     break;
1439
1440                   n = TREE_INT_CST_LOW (arg1);
1441                   loc = gimple_location (stmt);
1442                   result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1443
1444                   if (result)
1445                     {
1446                       tree lhs = gimple_get_lhs (stmt);
1447                       gimple new_stmt = gimple_build_assign (lhs, result);
1448                       gimple_set_location (new_stmt, loc);
1449                       unlink_stmt_vdef (stmt);
1450                       gsi_replace (&gsi, new_stmt, true);
1451                     }
1452                   break;
1453
1454                 CASE_FLT_FN (BUILT_IN_CABS):
1455                   arg0 = gimple_call_arg (stmt, 0);
1456                   loc = gimple_location (stmt);
1457                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1458
1459                   if (result)
1460                     {
1461                       tree lhs = gimple_get_lhs (stmt);
1462                       gimple new_stmt = gimple_build_assign (lhs, result);
1463                       gimple_set_location (new_stmt, loc);
1464                       unlink_stmt_vdef (stmt);
1465                       gsi_replace (&gsi, new_stmt, true);
1466                     }
1467                   break;
1468
1469                 default:;
1470                 }
1471             }
1472         }
1473     }
1474
1475   statistics_counter_event (cfun, "sincos statements inserted",
1476                             sincos_stats.inserted);
1477
1478   free_dominance_info (CDI_DOMINATORS);
1479   return cfg_changed ? TODO_cleanup_cfg : 0;
1480 }
1481
1482 static bool
1483 gate_cse_sincos (void)
1484 {
1485   /* We no longer require either sincos or cexp, since powi expansion
1486      piggybacks on this pass.  */
1487   return optimize;
1488 }
1489
1490 struct gimple_opt_pass pass_cse_sincos =
1491 {
1492  {
1493   GIMPLE_PASS,
1494   "sincos",                             /* name */
1495   gate_cse_sincos,                      /* gate */
1496   execute_cse_sincos,                   /* execute */
1497   NULL,                                 /* sub */
1498   NULL,                                 /* next */
1499   0,                                    /* static_pass_number */
1500   TV_NONE,                              /* tv_id */
1501   PROP_ssa,                             /* properties_required */
1502   0,                                    /* properties_provided */
1503   0,                                    /* properties_destroyed */
1504   0,                                    /* todo_flags_start */
1505   TODO_update_ssa | TODO_verify_ssa
1506     | TODO_verify_stmts                 /* todo_flags_finish */
1507  }
1508 };
1509
1510 /* A symbolic number is used to detect byte permutation and selection
1511    patterns.  Therefore the field N contains an artificial number
1512    consisting of byte size markers:
1513
1514    0    - byte has the value 0
1515    1..size - byte contains the content of the byte
1516    number indexed with that value minus one  */
1517
1518 struct symbolic_number {
1519   unsigned HOST_WIDEST_INT n;
1520   int size;
1521 };
1522
1523 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1524    number N.  Return false if the requested operation is not permitted
1525    on a symbolic number.  */
1526
1527 static inline bool
1528 do_shift_rotate (enum tree_code code,
1529                  struct symbolic_number *n,
1530                  int count)
1531 {
1532   if (count % 8 != 0)
1533     return false;
1534
1535   /* Zero out the extra bits of N in order to avoid them being shifted
1536      into the significant bits.  */
1537   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1538     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1539
1540   switch (code)
1541     {
1542     case LSHIFT_EXPR:
1543       n->n <<= count;
1544       break;
1545     case RSHIFT_EXPR:
1546       n->n >>= count;
1547       break;
1548     case LROTATE_EXPR:
1549       n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1550       break;
1551     case RROTATE_EXPR:
1552       n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1553       break;
1554     default:
1555       return false;
1556     }
1557   /* Zero unused bits for size.  */
1558   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1559     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1560   return true;
1561 }
1562
1563 /* Perform sanity checking for the symbolic number N and the gimple
1564    statement STMT.  */
1565
1566 static inline bool
1567 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1568 {
1569   tree lhs_type;
1570
1571   lhs_type = gimple_expr_type (stmt);
1572
1573   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1574     return false;
1575
1576   if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1577     return false;
1578
1579   return true;
1580 }
1581
1582 /* find_bswap_1 invokes itself recursively with N and tries to perform
1583    the operation given by the rhs of STMT on the result.  If the
1584    operation could successfully be executed the function returns the
1585    tree expression of the source operand and NULL otherwise.  */
1586
1587 static tree
1588 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1589 {
1590   enum tree_code code;
1591   tree rhs1, rhs2 = NULL;
1592   gimple rhs1_stmt, rhs2_stmt;
1593   tree source_expr1;
1594   enum gimple_rhs_class rhs_class;
1595
1596   if (!limit || !is_gimple_assign (stmt))
1597     return NULL_TREE;
1598
1599   rhs1 = gimple_assign_rhs1 (stmt);
1600
1601   if (TREE_CODE (rhs1) != SSA_NAME)
1602     return NULL_TREE;
1603
1604   code = gimple_assign_rhs_code (stmt);
1605   rhs_class = gimple_assign_rhs_class (stmt);
1606   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1607
1608   if (rhs_class == GIMPLE_BINARY_RHS)
1609     rhs2 = gimple_assign_rhs2 (stmt);
1610
1611   /* Handle unary rhs and binary rhs with integer constants as second
1612      operand.  */
1613
1614   if (rhs_class == GIMPLE_UNARY_RHS
1615       || (rhs_class == GIMPLE_BINARY_RHS
1616           && TREE_CODE (rhs2) == INTEGER_CST))
1617     {
1618       if (code != BIT_AND_EXPR
1619           && code != LSHIFT_EXPR
1620           && code != RSHIFT_EXPR
1621           && code != LROTATE_EXPR
1622           && code != RROTATE_EXPR
1623           && code != NOP_EXPR
1624           && code != CONVERT_EXPR)
1625         return NULL_TREE;
1626
1627       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1628
1629       /* If find_bswap_1 returned NULL STMT is a leaf node and we have
1630          to initialize the symbolic number.  */
1631       if (!source_expr1)
1632         {
1633           /* Set up the symbolic number N by setting each byte to a
1634              value between 1 and the byte size of rhs1.  The highest
1635              order byte is set to n->size and the lowest order
1636              byte to 1.  */
1637           n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1638           if (n->size % BITS_PER_UNIT != 0)
1639             return NULL_TREE;
1640           n->size /= BITS_PER_UNIT;
1641           n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1642                   (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1643
1644           if (n->size < (int)sizeof (HOST_WIDEST_INT))
1645             n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1646                      (n->size * BITS_PER_UNIT)) - 1;
1647
1648           source_expr1 = rhs1;
1649         }
1650
1651       switch (code)
1652         {
1653         case BIT_AND_EXPR:
1654           {
1655             int i;
1656             unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1657             unsigned HOST_WIDEST_INT tmp = val;
1658
1659             /* Only constants masking full bytes are allowed.  */
1660             for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1661               if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1662                 return NULL_TREE;
1663
1664             n->n &= val;
1665           }
1666           break;
1667         case LSHIFT_EXPR:
1668         case RSHIFT_EXPR:
1669         case LROTATE_EXPR:
1670         case RROTATE_EXPR:
1671           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1672             return NULL_TREE;
1673           break;
1674         CASE_CONVERT:
1675           {
1676             int type_size;
1677
1678             type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1679             if (type_size % BITS_PER_UNIT != 0)
1680               return NULL_TREE;
1681
1682             if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1683               {
1684                 /* If STMT casts to a smaller type mask out the bits not
1685                    belonging to the target type.  */
1686                 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1687               }
1688             n->size = type_size / BITS_PER_UNIT;
1689           }
1690           break;
1691         default:
1692           return NULL_TREE;
1693         };
1694       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1695     }
1696
1697   /* Handle binary rhs.  */
1698
1699   if (rhs_class == GIMPLE_BINARY_RHS)
1700     {
1701       struct symbolic_number n1, n2;
1702       tree source_expr2;
1703
1704       if (code != BIT_IOR_EXPR)
1705         return NULL_TREE;
1706
1707       if (TREE_CODE (rhs2) != SSA_NAME)
1708         return NULL_TREE;
1709
1710       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1711
1712       switch (code)
1713         {
1714         case BIT_IOR_EXPR:
1715           source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1716
1717           if (!source_expr1)
1718             return NULL_TREE;
1719
1720           source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1721
1722           if (source_expr1 != source_expr2
1723               || n1.size != n2.size)
1724             return NULL_TREE;
1725
1726           n->size = n1.size;
1727           n->n = n1.n | n2.n;
1728
1729           if (!verify_symbolic_number_p (n, stmt))
1730             return NULL_TREE;
1731
1732           break;
1733         default:
1734           return NULL_TREE;
1735         }
1736       return source_expr1;
1737     }
1738   return NULL_TREE;
1739 }
1740
1741 /* Check if STMT completes a bswap implementation consisting of ORs,
1742    SHIFTs and ANDs.  Return the source tree expression on which the
1743    byte swap is performed and NULL if no bswap was found.  */
1744
1745 static tree
1746 find_bswap (gimple stmt)
1747 {
1748 /* The number which the find_bswap result should match in order to
1749    have a full byte swap.  The number is shifted to the left according
1750    to the size of the symbolic number before using it.  */
1751   unsigned HOST_WIDEST_INT cmp =
1752     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1753     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1754
1755   struct symbolic_number n;
1756   tree source_expr;
1757   int limit;
1758
1759   /* The last parameter determines the depth search limit.  It usually
1760      correlates directly to the number of bytes to be touched.  We
1761      increase that number by three  here in order to also
1762      cover signed -> unsigned converions of the src operand as can be seen
1763      in libgcc, and for initial shift/and operation of the src operand.  */
1764   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1765   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1766   source_expr =  find_bswap_1 (stmt, &n, limit);
1767
1768   if (!source_expr)
1769     return NULL_TREE;
1770
1771   /* Zero out the extra bits of N and CMP.  */
1772   if (n.size < (int)sizeof (HOST_WIDEST_INT))
1773     {
1774       unsigned HOST_WIDEST_INT mask =
1775         ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1776
1777       n.n &= mask;
1778       cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1779     }
1780
1781   /* A complete byte swap should make the symbolic number to start
1782      with the largest digit in the highest order byte.  */
1783   if (cmp != n.n)
1784     return NULL_TREE;
1785
1786   return source_expr;
1787 }
1788
1789 /* Find manual byte swap implementations and turn them into a bswap
1790    builtin invokation.  */
1791
1792 static unsigned int
1793 execute_optimize_bswap (void)
1794 {
1795   basic_block bb;
1796   bool bswap32_p, bswap64_p;
1797   bool changed = false;
1798   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1799
1800   if (BITS_PER_UNIT != 8)
1801     return 0;
1802
1803   if (sizeof (HOST_WIDEST_INT) < 8)
1804     return 0;
1805
1806   bswap32_p = (built_in_decls[BUILT_IN_BSWAP32]
1807                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1808   bswap64_p = (built_in_decls[BUILT_IN_BSWAP64]
1809                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1810                    || (bswap32_p && word_mode == SImode)));
1811
1812   if (!bswap32_p && !bswap64_p)
1813     return 0;
1814
1815   /* Determine the argument type of the builtins.  The code later on
1816      assumes that the return and argument type are the same.  */
1817   if (bswap32_p)
1818     {
1819       tree fndecl = built_in_decls[BUILT_IN_BSWAP32];
1820       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1821     }
1822
1823   if (bswap64_p)
1824     {
1825       tree fndecl = built_in_decls[BUILT_IN_BSWAP64];
1826       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1827     }
1828
1829   memset (&bswap_stats, 0, sizeof (bswap_stats));
1830
1831   FOR_EACH_BB (bb)
1832     {
1833       gimple_stmt_iterator gsi;
1834
1835       /* We do a reverse scan for bswap patterns to make sure we get the
1836          widest match. As bswap pattern matching doesn't handle
1837          previously inserted smaller bswap replacements as sub-
1838          patterns, the wider variant wouldn't be detected.  */
1839       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1840         {
1841           gimple stmt = gsi_stmt (gsi);
1842           tree bswap_src, bswap_type;
1843           tree bswap_tmp;
1844           tree fndecl = NULL_TREE;
1845           int type_size;
1846           gimple call;
1847
1848           if (!is_gimple_assign (stmt)
1849               || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1850             continue;
1851
1852           type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1853
1854           switch (type_size)
1855             {
1856             case 32:
1857               if (bswap32_p)
1858                 {
1859                   fndecl = built_in_decls[BUILT_IN_BSWAP32];
1860                   bswap_type = bswap32_type;
1861                 }
1862               break;
1863             case 64:
1864               if (bswap64_p)
1865                 {
1866                   fndecl = built_in_decls[BUILT_IN_BSWAP64];
1867                   bswap_type = bswap64_type;
1868                 }
1869               break;
1870             default:
1871               continue;
1872             }
1873
1874           if (!fndecl)
1875             continue;
1876
1877           bswap_src = find_bswap (stmt);
1878
1879           if (!bswap_src)
1880             continue;
1881
1882           changed = true;
1883           if (type_size == 32)
1884             bswap_stats.found_32bit++;
1885           else
1886             bswap_stats.found_64bit++;
1887
1888           bswap_tmp = bswap_src;
1889
1890           /* Convert the src expression if necessary.  */
1891           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1892             {
1893               gimple convert_stmt;
1894
1895               bswap_tmp = create_tmp_var (bswap_type, "bswapsrc");
1896               add_referenced_var (bswap_tmp);
1897               bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1898
1899               convert_stmt = gimple_build_assign_with_ops (
1900                                CONVERT_EXPR, bswap_tmp, bswap_src, NULL);
1901               gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1902             }
1903
1904           call = gimple_build_call (fndecl, 1, bswap_tmp);
1905
1906           bswap_tmp = gimple_assign_lhs (stmt);
1907
1908           /* Convert the result if necessary.  */
1909           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1910             {
1911               gimple convert_stmt;
1912
1913               bswap_tmp = create_tmp_var (bswap_type, "bswapdst");
1914               add_referenced_var (bswap_tmp);
1915               bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1916               convert_stmt = gimple_build_assign_with_ops (
1917                                CONVERT_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
1918               gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1919             }
1920
1921           gimple_call_set_lhs (call, bswap_tmp);
1922
1923           if (dump_file)
1924             {
1925               fprintf (dump_file, "%d bit bswap implementation found at: ",
1926                        (int)type_size);
1927               print_gimple_stmt (dump_file, stmt, 0, 0);
1928             }
1929
1930           gsi_insert_after (&gsi, call, GSI_SAME_STMT);
1931           gsi_remove (&gsi, true);
1932         }
1933     }
1934
1935   statistics_counter_event (cfun, "32-bit bswap implementations found",
1936                             bswap_stats.found_32bit);
1937   statistics_counter_event (cfun, "64-bit bswap implementations found",
1938                             bswap_stats.found_64bit);
1939
1940   return (changed ? TODO_update_ssa | TODO_verify_ssa
1941           | TODO_verify_stmts : 0);
1942 }
1943
1944 static bool
1945 gate_optimize_bswap (void)
1946 {
1947   return flag_expensive_optimizations && optimize;
1948 }
1949
1950 struct gimple_opt_pass pass_optimize_bswap =
1951 {
1952  {
1953   GIMPLE_PASS,
1954   "bswap",                              /* name */
1955   gate_optimize_bswap,                  /* gate */
1956   execute_optimize_bswap,               /* execute */
1957   NULL,                                 /* sub */
1958   NULL,                                 /* next */
1959   0,                                    /* static_pass_number */
1960   TV_NONE,                              /* tv_id */
1961   PROP_ssa,                             /* properties_required */
1962   0,                                    /* properties_provided */
1963   0,                                    /* properties_destroyed */
1964   0,                                    /* todo_flags_start */
1965   0                                     /* todo_flags_finish */
1966  }
1967 };
1968
1969 /* Return true if RHS is a suitable operand for a widening multiplication,
1970    assuming a target type of TYPE.
1971    There are two cases:
1972
1973      - RHS makes some value at least twice as wide.  Store that value
1974        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
1975
1976      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
1977        but leave *TYPE_OUT untouched.  */
1978
1979 static bool
1980 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
1981                         tree *new_rhs_out)
1982 {
1983   gimple stmt;
1984   tree type1, rhs1;
1985   enum tree_code rhs_code;
1986
1987   if (TREE_CODE (rhs) == SSA_NAME)
1988     {
1989       stmt = SSA_NAME_DEF_STMT (rhs);
1990       if (is_gimple_assign (stmt))
1991         {
1992           rhs_code = gimple_assign_rhs_code (stmt);
1993           if (TREE_CODE (type) == INTEGER_TYPE
1994               ? !CONVERT_EXPR_CODE_P (rhs_code)
1995               : rhs_code != FIXED_CONVERT_EXPR)
1996             rhs1 = rhs;
1997           else
1998             {
1999               rhs1 = gimple_assign_rhs1 (stmt);
2000
2001               if (TREE_CODE (rhs1) == INTEGER_CST)
2002                 {
2003                   *new_rhs_out = rhs1;
2004                   *type_out = NULL;
2005                   return true;
2006                 }
2007             }
2008         }
2009       else
2010         rhs1 = rhs;
2011
2012       type1 = TREE_TYPE (rhs1);
2013
2014       if (TREE_CODE (type1) != TREE_CODE (type)
2015           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2016         return false;
2017
2018       *new_rhs_out = rhs1;
2019       *type_out = type1;
2020       return true;
2021     }
2022
2023   if (TREE_CODE (rhs) == INTEGER_CST)
2024     {
2025       *new_rhs_out = rhs;
2026       *type_out = NULL;
2027       return true;
2028     }
2029
2030   return false;
2031 }
2032
2033 /* Return true if STMT performs a widening multiplication, assuming the
2034    output type is TYPE.  If so, store the unwidened types of the operands
2035    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2036    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2037    and *TYPE2_OUT would give the operands of the multiplication.  */
2038
2039 static bool
2040 is_widening_mult_p (tree type, gimple stmt,
2041                     tree *type1_out, tree *rhs1_out,
2042                     tree *type2_out, tree *rhs2_out)
2043 {
2044   if (TREE_CODE (type) != INTEGER_TYPE
2045       && TREE_CODE (type) != FIXED_POINT_TYPE)
2046     return false;
2047
2048   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2049                                rhs1_out))
2050     return false;
2051
2052   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2053                                rhs2_out))
2054     return false;
2055
2056   if (*type1_out == NULL)
2057     {
2058       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2059         return false;
2060       *type1_out = *type2_out;
2061     }
2062
2063   if (*type2_out == NULL)
2064     {
2065       if (!int_fits_type_p (*rhs2_out, *type1_out))
2066         return false;
2067       *type2_out = *type1_out;
2068     }
2069
2070   /* Ensure that the larger of the two operands comes first. */
2071   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2072     {
2073       tree tmp;
2074       tmp = *type1_out;
2075       *type1_out = *type2_out;
2076       *type2_out = tmp;
2077       tmp = *rhs1_out;
2078       *rhs1_out = *rhs2_out;
2079       *rhs2_out = tmp;
2080     }
2081
2082   return true;
2083 }
2084
2085 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2086    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2087    value is true iff we converted the statement.  */
2088
2089 static bool
2090 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2091 {
2092   tree lhs, rhs1, rhs2, type, type1, type2, tmp = NULL;
2093   enum insn_code handler;
2094   enum machine_mode to_mode, from_mode, actual_mode;
2095   optab op;
2096   int actual_precision;
2097   location_t loc = gimple_location (stmt);
2098   bool from_unsigned1, from_unsigned2;
2099
2100   lhs = gimple_assign_lhs (stmt);
2101   type = TREE_TYPE (lhs);
2102   if (TREE_CODE (type) != INTEGER_TYPE)
2103     return false;
2104
2105   if (!is_widening_mult_p (type, stmt, &type1, &rhs1, &type2, &rhs2))
2106     return false;
2107
2108   to_mode = TYPE_MODE (type);
2109   from_mode = TYPE_MODE (type1);
2110   from_unsigned1 = TYPE_UNSIGNED (type1);
2111   from_unsigned2 = TYPE_UNSIGNED (type2);
2112
2113   if (from_unsigned1 && from_unsigned2)
2114     op = umul_widen_optab;
2115   else if (!from_unsigned1 && !from_unsigned2)
2116     op = smul_widen_optab;
2117   else
2118     op = usmul_widen_optab;
2119
2120   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2121                                                   0, &actual_mode);
2122
2123   if (handler == CODE_FOR_nothing)
2124     {
2125       if (op != smul_widen_optab)
2126         {
2127           /* We can use a signed multiply with unsigned types as long as
2128              there is a wider mode to use, or it is the smaller of the two
2129              types that is unsigned.  Note that type1 >= type2, always.  */
2130           if ((TYPE_UNSIGNED (type1)
2131                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2132               || (TYPE_UNSIGNED (type2)
2133                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2134             {
2135               from_mode = GET_MODE_WIDER_MODE (from_mode);
2136               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2137                 return false;
2138             }
2139
2140           op = smul_widen_optab;
2141           handler = find_widening_optab_handler_and_mode (op, to_mode,
2142                                                           from_mode, 0,
2143                                                           &actual_mode);
2144
2145           if (handler == CODE_FOR_nothing)
2146             return false;
2147
2148           from_unsigned1 = from_unsigned2 = false;
2149         }
2150       else
2151         return false;
2152     }
2153
2154   /* Ensure that the inputs to the handler are in the correct precison
2155      for the opcode.  This will be the full mode size.  */
2156   actual_precision = GET_MODE_PRECISION (actual_mode);
2157   if (actual_precision != TYPE_PRECISION (type1)
2158       || from_unsigned1 != TYPE_UNSIGNED (type1))
2159     {
2160       tmp = create_tmp_var (build_nonstandard_integer_type
2161                                 (actual_precision, from_unsigned1),
2162                             NULL);
2163       rhs1 = build_and_insert_cast (gsi, loc, tmp, rhs1);
2164     }
2165   if (actual_precision != TYPE_PRECISION (type2)
2166       || from_unsigned2 != TYPE_UNSIGNED (type2))
2167     {
2168       /* Reuse the same type info, if possible.  */
2169       if (!tmp || from_unsigned1 != from_unsigned2)
2170         tmp = create_tmp_var (build_nonstandard_integer_type
2171                                 (actual_precision, from_unsigned2),
2172                               NULL);
2173       rhs2 = build_and_insert_cast (gsi, loc, tmp, rhs2);
2174     }
2175
2176   /* Handle constants.  */
2177   if (TREE_CODE (rhs1) == INTEGER_CST)
2178     rhs1 = fold_convert (type1, rhs1);
2179   if (TREE_CODE (rhs2) == INTEGER_CST)
2180     rhs2 = fold_convert (type2, rhs2);
2181
2182   gimple_assign_set_rhs1 (stmt, rhs1);
2183   gimple_assign_set_rhs2 (stmt, rhs2);
2184   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2185   update_stmt (stmt);
2186   widen_mul_stats.widen_mults_inserted++;
2187   return true;
2188 }
2189
2190 /* Process a single gimple statement STMT, which is found at the
2191    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2192    rhs (given by CODE), and try to convert it into a
2193    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2194    is true iff we converted the statement.  */
2195
2196 static bool
2197 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2198                             enum tree_code code)
2199 {
2200   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2201   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2202   tree type, type1, type2, optype, tmp = NULL;
2203   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2204   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2205   optab this_optab;
2206   enum tree_code wmult_code;
2207   enum insn_code handler;
2208   enum machine_mode to_mode, from_mode, actual_mode;
2209   location_t loc = gimple_location (stmt);
2210   int actual_precision;
2211   bool from_unsigned1, from_unsigned2;
2212
2213   lhs = gimple_assign_lhs (stmt);
2214   type = TREE_TYPE (lhs);
2215   if (TREE_CODE (type) != INTEGER_TYPE
2216       && TREE_CODE (type) != FIXED_POINT_TYPE)
2217     return false;
2218
2219   if (code == MINUS_EXPR)
2220     wmult_code = WIDEN_MULT_MINUS_EXPR;
2221   else
2222     wmult_code = WIDEN_MULT_PLUS_EXPR;
2223
2224   rhs1 = gimple_assign_rhs1 (stmt);
2225   rhs2 = gimple_assign_rhs2 (stmt);
2226
2227   if (TREE_CODE (rhs1) == SSA_NAME)
2228     {
2229       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2230       if (is_gimple_assign (rhs1_stmt))
2231         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2232     }
2233
2234   if (TREE_CODE (rhs2) == SSA_NAME)
2235     {
2236       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2237       if (is_gimple_assign (rhs2_stmt))
2238         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2239     }
2240
2241   /* Allow for one conversion statement between the multiply
2242      and addition/subtraction statement.  If there are more than
2243      one conversions then we assume they would invalidate this
2244      transformation.  If that's not the case then they should have
2245      been folded before now.  */
2246   if (CONVERT_EXPR_CODE_P (rhs1_code))
2247     {
2248       conv1_stmt = rhs1_stmt;
2249       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2250       if (TREE_CODE (rhs1) == SSA_NAME)
2251         {
2252           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2253           if (is_gimple_assign (rhs1_stmt))
2254             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2255         }
2256       else
2257         return false;
2258     }
2259   if (CONVERT_EXPR_CODE_P (rhs2_code))
2260     {
2261       conv2_stmt = rhs2_stmt;
2262       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2263       if (TREE_CODE (rhs2) == SSA_NAME)
2264         {
2265           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2266           if (is_gimple_assign (rhs2_stmt))
2267             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2268         }
2269       else
2270         return false;
2271     }
2272
2273   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2274      is_widening_mult_p, but we still need the rhs returns.
2275
2276      It might also appear that it would be sufficient to use the existing
2277      operands of the widening multiply, but that would limit the choice of
2278      multiply-and-accumulate instructions.  */
2279   if (code == PLUS_EXPR
2280       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2281     {
2282       if (!is_widening_mult_p (type, rhs1_stmt, &type1, &mult_rhs1,
2283                                &type2, &mult_rhs2))
2284         return false;
2285       add_rhs = rhs2;
2286       conv_stmt = conv1_stmt;
2287     }
2288   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2289     {
2290       if (!is_widening_mult_p (type, rhs2_stmt, &type1, &mult_rhs1,
2291                                &type2, &mult_rhs2))
2292         return false;
2293       add_rhs = rhs1;
2294       conv_stmt = conv2_stmt;
2295     }
2296   else
2297     return false;
2298
2299   to_mode = TYPE_MODE (type);
2300   from_mode = TYPE_MODE (type1);
2301   from_unsigned1 = TYPE_UNSIGNED (type1);
2302   from_unsigned2 = TYPE_UNSIGNED (type2);
2303
2304   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2305   if (from_unsigned1 != from_unsigned2)
2306     {
2307       /* We can use a signed multiply with unsigned types as long as
2308          there is a wider mode to use, or it is the smaller of the two
2309          types that is unsigned.  Note that type1 >= type2, always.  */
2310       if ((from_unsigned1
2311            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2312           || (from_unsigned2
2313               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2314         {
2315           from_mode = GET_MODE_WIDER_MODE (from_mode);
2316           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2317             return false;
2318         }
2319
2320       from_unsigned1 = from_unsigned2 = false;
2321     }
2322
2323   /* If there was a conversion between the multiply and addition
2324      then we need to make sure it fits a multiply-and-accumulate.
2325      The should be a single mode change which does not change the
2326      value.  */
2327   if (conv_stmt)
2328     {
2329       /* We use the original, unmodified data types for this.  */
2330       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2331       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2332       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2333       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2334
2335       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2336         {
2337           /* Conversion is a truncate.  */
2338           if (TYPE_PRECISION (to_type) < data_size)
2339             return false;
2340         }
2341       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2342         {
2343           /* Conversion is an extend.  Check it's the right sort.  */
2344           if (TYPE_UNSIGNED (from_type) != is_unsigned
2345               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2346             return false;
2347         }
2348       /* else convert is a no-op for our purposes.  */
2349     }
2350
2351   /* Verify that the machine can perform a widening multiply
2352      accumulate in this mode/signedness combination, otherwise
2353      this transformation is likely to pessimize code.  */
2354   optype = build_nonstandard_integer_type (from_mode, from_unsigned1);
2355   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2356   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2357                                                   from_mode, 0, &actual_mode);
2358
2359   if (handler == CODE_FOR_nothing)
2360     return false;
2361
2362   /* Ensure that the inputs to the handler are in the correct precison
2363      for the opcode.  This will be the full mode size.  */
2364   actual_precision = GET_MODE_PRECISION (actual_mode);
2365   if (actual_precision != TYPE_PRECISION (type1)
2366       || from_unsigned1 != TYPE_UNSIGNED (type1))
2367     {
2368       tmp = create_tmp_var (build_nonstandard_integer_type
2369                                 (actual_precision, from_unsigned1),
2370                             NULL);
2371       mult_rhs1 = build_and_insert_cast (gsi, loc, tmp, mult_rhs1);
2372     }
2373   if (actual_precision != TYPE_PRECISION (type2)
2374       || from_unsigned2 != TYPE_UNSIGNED (type2))
2375     {
2376       if (!tmp || from_unsigned1 != from_unsigned2)
2377         tmp = create_tmp_var (build_nonstandard_integer_type
2378                                 (actual_precision, from_unsigned2),
2379                               NULL);
2380       mult_rhs2 = build_and_insert_cast (gsi, loc, tmp, mult_rhs2);
2381     }
2382
2383   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2384     add_rhs = build_and_insert_cast (gsi, loc, create_tmp_var (type, NULL),
2385                                      add_rhs);
2386
2387   /* Handle constants.  */
2388   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2389     rhs1 = fold_convert (type1, mult_rhs1);
2390   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2391     rhs2 = fold_convert (type2, mult_rhs2);
2392
2393   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2394                                     add_rhs);
2395   update_stmt (gsi_stmt (*gsi));
2396   widen_mul_stats.maccs_inserted++;
2397   return true;
2398 }
2399
2400 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2401    with uses in additions and subtractions to form fused multiply-add
2402    operations.  Returns true if successful and MUL_STMT should be removed.  */
2403
2404 static bool
2405 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2406 {
2407   tree mul_result = gimple_get_lhs (mul_stmt);
2408   tree type = TREE_TYPE (mul_result);
2409   gimple use_stmt, neguse_stmt, fma_stmt;
2410   use_operand_p use_p;
2411   imm_use_iterator imm_iter;
2412
2413   if (FLOAT_TYPE_P (type)
2414       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2415     return false;
2416
2417   /* We don't want to do bitfield reduction ops.  */
2418   if (INTEGRAL_TYPE_P (type)
2419       && (TYPE_PRECISION (type)
2420           != GET_MODE_PRECISION (TYPE_MODE (type))))
2421     return false;
2422
2423   /* If the target doesn't support it, don't generate it.  We assume that
2424      if fma isn't available then fms, fnma or fnms are not either.  */
2425   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2426     return false;
2427
2428   /* Make sure that the multiplication statement becomes dead after
2429      the transformation, thus that all uses are transformed to FMAs.
2430      This means we assume that an FMA operation has the same cost
2431      as an addition.  */
2432   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2433     {
2434       enum tree_code use_code;
2435       tree result = mul_result;
2436       bool negate_p = false;
2437
2438       use_stmt = USE_STMT (use_p);
2439
2440       if (is_gimple_debug (use_stmt))
2441         continue;
2442
2443       /* For now restrict this operations to single basic blocks.  In theory
2444          we would want to support sinking the multiplication in
2445          m = a*b;
2446          if ()
2447            ma = m + c;
2448          else
2449            d = m;
2450          to form a fma in the then block and sink the multiplication to the
2451          else block.  */
2452       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2453         return false;
2454
2455       if (!is_gimple_assign (use_stmt))
2456         return false;
2457
2458       use_code = gimple_assign_rhs_code (use_stmt);
2459
2460       /* A negate on the multiplication leads to FNMA.  */
2461       if (use_code == NEGATE_EXPR)
2462         {
2463           ssa_op_iter iter;
2464           use_operand_p usep;
2465
2466           result = gimple_assign_lhs (use_stmt);
2467
2468           /* Make sure the negate statement becomes dead with this
2469              single transformation.  */
2470           if (!single_imm_use (gimple_assign_lhs (use_stmt),
2471                                &use_p, &neguse_stmt))
2472             return false;
2473
2474           /* Make sure the multiplication isn't also used on that stmt.  */
2475           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2476             if (USE_FROM_PTR (usep) == mul_result)
2477               return false;
2478
2479           /* Re-validate.  */
2480           use_stmt = neguse_stmt;
2481           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2482             return false;
2483           if (!is_gimple_assign (use_stmt))
2484             return false;
2485
2486           use_code = gimple_assign_rhs_code (use_stmt);
2487           negate_p = true;
2488         }
2489
2490       switch (use_code)
2491         {
2492         case MINUS_EXPR:
2493           if (gimple_assign_rhs2 (use_stmt) == result)
2494             negate_p = !negate_p;
2495           break;
2496         case PLUS_EXPR:
2497           break;
2498         default:
2499           /* FMA can only be formed from PLUS and MINUS.  */
2500           return false;
2501         }
2502
2503       /* We can't handle a * b + a * b.  */
2504       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2505         return false;
2506
2507       /* While it is possible to validate whether or not the exact form
2508          that we've recognized is available in the backend, the assumption
2509          is that the transformation is never a loss.  For instance, suppose
2510          the target only has the plain FMA pattern available.  Consider
2511          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2512          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
2513          still have 3 operations, but in the FMA form the two NEGs are
2514          independant and could be run in parallel.  */
2515     }
2516
2517   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2518     {
2519       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2520       enum tree_code use_code;
2521       tree addop, mulop1 = op1, result = mul_result;
2522       bool negate_p = false;
2523
2524       if (is_gimple_debug (use_stmt))
2525         continue;
2526
2527       use_code = gimple_assign_rhs_code (use_stmt);
2528       if (use_code == NEGATE_EXPR)
2529         {
2530           result = gimple_assign_lhs (use_stmt);
2531           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2532           gsi_remove (&gsi, true);
2533           release_defs (use_stmt);
2534
2535           use_stmt = neguse_stmt;
2536           gsi = gsi_for_stmt (use_stmt);
2537           use_code = gimple_assign_rhs_code (use_stmt);
2538           negate_p = true;
2539         }
2540
2541       if (gimple_assign_rhs1 (use_stmt) == result)
2542         {
2543           addop = gimple_assign_rhs2 (use_stmt);
2544           /* a * b - c -> a * b + (-c)  */
2545           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2546             addop = force_gimple_operand_gsi (&gsi,
2547                                               build1 (NEGATE_EXPR,
2548                                                       type, addop),
2549                                               true, NULL_TREE, true,
2550                                               GSI_SAME_STMT);
2551         }
2552       else
2553         {
2554           addop = gimple_assign_rhs1 (use_stmt);
2555           /* a - b * c -> (-b) * c + a */
2556           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2557             negate_p = !negate_p;
2558         }
2559
2560       if (negate_p)
2561         mulop1 = force_gimple_operand_gsi (&gsi,
2562                                            build1 (NEGATE_EXPR,
2563                                                    type, mulop1),
2564                                            true, NULL_TREE, true,
2565                                            GSI_SAME_STMT);
2566
2567       fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
2568                                                 gimple_assign_lhs (use_stmt),
2569                                                 mulop1, op2,
2570                                                 addop);
2571       gsi_replace (&gsi, fma_stmt, true);
2572       widen_mul_stats.fmas_inserted++;
2573     }
2574
2575   return true;
2576 }
2577
2578 /* Find integer multiplications where the operands are extended from
2579    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2580    where appropriate.  */
2581
2582 static unsigned int
2583 execute_optimize_widening_mul (void)
2584 {
2585   basic_block bb;
2586   bool cfg_changed = false;
2587
2588   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2589
2590   FOR_EACH_BB (bb)
2591     {
2592       gimple_stmt_iterator gsi;
2593
2594       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2595         {
2596           gimple stmt = gsi_stmt (gsi);
2597           enum tree_code code;
2598
2599           if (is_gimple_assign (stmt))
2600             {
2601               code = gimple_assign_rhs_code (stmt);
2602               switch (code)
2603                 {
2604                 case MULT_EXPR:
2605                   if (!convert_mult_to_widen (stmt, &gsi)
2606                       && convert_mult_to_fma (stmt,
2607                                               gimple_assign_rhs1 (stmt),
2608                                               gimple_assign_rhs2 (stmt)))
2609                     {
2610                       gsi_remove (&gsi, true);
2611                       release_defs (stmt);
2612                       continue;
2613                     }
2614                   break;
2615
2616                 case PLUS_EXPR:
2617                 case MINUS_EXPR:
2618                   convert_plusminus_to_widen (&gsi, stmt, code);
2619                   break;
2620
2621                 default:;
2622                 }
2623             }
2624           else if (is_gimple_call (stmt)
2625                    && gimple_call_lhs (stmt))
2626             {
2627               tree fndecl = gimple_call_fndecl (stmt);
2628               if (fndecl
2629                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2630                 {
2631                   switch (DECL_FUNCTION_CODE (fndecl))
2632                     {
2633                       case BUILT_IN_POWF:
2634                       case BUILT_IN_POW:
2635                       case BUILT_IN_POWL:
2636                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2637                             && REAL_VALUES_EQUAL
2638                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2639                                   dconst2)
2640                             && convert_mult_to_fma (stmt,
2641                                                     gimple_call_arg (stmt, 0),
2642                                                     gimple_call_arg (stmt, 0)))
2643                           {
2644                             unlink_stmt_vdef (stmt);
2645                             gsi_remove (&gsi, true);
2646                             release_defs (stmt);
2647                             if (gimple_purge_dead_eh_edges (bb))
2648                               cfg_changed = true;
2649                             continue;
2650                           }
2651                           break;
2652
2653                       default:;
2654                     }
2655                 }
2656             }
2657           gsi_next (&gsi);
2658         }
2659     }
2660
2661   statistics_counter_event (cfun, "widening multiplications inserted",
2662                             widen_mul_stats.widen_mults_inserted);
2663   statistics_counter_event (cfun, "widening maccs inserted",
2664                             widen_mul_stats.maccs_inserted);
2665   statistics_counter_event (cfun, "fused multiply-adds inserted",
2666                             widen_mul_stats.fmas_inserted);
2667
2668   return cfg_changed ? TODO_cleanup_cfg : 0;
2669 }
2670
2671 static bool
2672 gate_optimize_widening_mul (void)
2673 {
2674   return flag_expensive_optimizations && optimize;
2675 }
2676
2677 struct gimple_opt_pass pass_optimize_widening_mul =
2678 {
2679  {
2680   GIMPLE_PASS,
2681   "widening_mul",                       /* name */
2682   gate_optimize_widening_mul,           /* gate */
2683   execute_optimize_widening_mul,        /* execute */
2684   NULL,                                 /* sub */
2685   NULL,                                 /* next */
2686   0,                                    /* static_pass_number */
2687   TV_NONE,                              /* tv_id */
2688   PROP_ssa,                             /* properties_required */
2689   0,                                    /* properties_provided */
2690   0,                                    /* properties_destroyed */
2691   0,                                    /* todo_flags_start */
2692   TODO_verify_ssa
2693   | TODO_verify_stmts
2694   | TODO_update_ssa                     /* todo_flags_finish */
2695  }
2696 };