gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
  45    by the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
  63    and walks it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "tm.h"
  91 #include "flags.h"
  92 #include "tree.h"
  93 #include "gimple.h"
  94 #include "gimple-ssa.h"
  95 #include "tree-cfg.h"
  96 #include "tree-phinodes.h"
  97 #include "ssa-iterators.h"
  98 #include "tree-ssanames.h"
  99 #include "tree-dfa.h"
 100 #include "tree-ssa.h"
 101 #include "tree-pass.h"
 102 #include "alloc-pool.h"
 103 #include "basic-block.h"
 104 #include "target.h"
 105 #include "gimple-pretty-print.h"
 106
 107 /* FIXME: RTL headers have to be included here for optabs.  */
 108 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 109 #include "expr.h"               /* Because optabs.h wants sepops.  */
 110 #include "optabs.h"
 111
 112 /* This structure represents one basic block that either computes a
 113    division, or is a common dominator for basic block that compute a
 114    division.  */
 115 struct occurrence {
 116   /* The basic block represented by this structure.  */
 117   basic_block bb;
 118
 119   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 120      inserted in BB.  */
 121   tree recip_def;
 122
 123   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 124      was inserted in BB.  */
 125   gimple recip_def_stmt;
 126
 127   /* Pointer to a list of "struct occurrence"s for blocks dominated
 128      by BB.  */
 129   struct occurrence *children;
 130
 131   /* Pointer to the next "struct occurrence"s in the list of blocks
 132      sharing a common dominator.  */
 133   struct occurrence *next;
 134
 135   /* The number of divisions that are in BB before compute_merit.  The
 136      number of divisions that are in BB or post-dominate it after
 137      compute_merit.  */
 138   int num_divisions;
 139
 140   /* True if the basic block has a division, false if it is a common
 141      dominator for basic blocks that do.  If it is false and trapping
 142      math is active, BB is not a candidate for inserting a reciprocal.  */
 143   bool bb_has_division;
 144 };
 145
 146 static struct
 147 {
 148   /* Number of 1.0/X ops inserted.  */
 149   int rdivs_inserted;
 150
 151   /* Number of 1.0/FUNC ops inserted.  */
 152   int rfuncs_inserted;
 153 } reciprocal_stats;
 154
 155 static struct
 156 {
 157   /* Number of cexpi calls inserted.  */
 158   int inserted;
 159 } sincos_stats;
 160
 161 static struct
 162 {
 163   /* Number of hand-written 16-bit bswaps found.  */
 164   int found_16bit;
 165
 166   /* Number of hand-written 32-bit bswaps found.  */
 167   int found_32bit;
 168
 169   /* Number of hand-written 64-bit bswaps found.  */
 170   int found_64bit;
 171 } bswap_stats;
 172
 173 static struct
 174 {
 175   /* Number of widening multiplication ops inserted.  */
 176   int widen_mults_inserted;
 177
 178   /* Number of integer multiply-and-accumulate ops inserted.  */
 179   int maccs_inserted;
 180
 181   /* Number of fp fused multiply-add ops inserted.  */
 182   int fmas_inserted;
 183 } widen_mul_stats;
 184
 185 /* The instance of "struct occurrence" representing the highest
 186    interesting block in the dominator tree.  */
 187 static struct occurrence *occ_head;
 188
 189 /* Allocation pool for getting instances of "struct occurrence".  */
 190 static alloc_pool occ_pool;
 191
 192
 193
 194 /* Allocate and return a new struct occurrence for basic block BB, and
 195    whose children list is headed by CHILDREN.  */
 196 static struct occurrence *
 197 occ_new (basic_block bb, struct occurrence *children)
 198 {
 199   struct occurrence *occ;
 200
 201   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 202   memset (occ, 0, sizeof (struct occurrence));
 203
 204   occ->bb = bb;
 205   occ->children = children;
 206   return occ;
 207 }
 208
 209
 210 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 211    list of "struct occurrence"s, one per basic block, having IDOM as
 212    their common dominator.
 213
 214    We try to insert NEW_OCC as deep as possible in the tree, and we also
 215    insert any other block that is a common dominator for BB and one
 216    block already in the tree.  */
 217
 218 static void
 219 insert_bb (struct occurrence *new_occ, basic_block idom,
 220            struct occurrence **p_head)
 221 {
 222   struct occurrence *occ, **p_occ;
 223
 224   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 225     {
 226       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 227       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 228       if (dom == bb)
 229         {
 230           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 231              from its list.  */
 232           *p_occ = occ->next;
 233           occ->next = new_occ->children;
 234           new_occ->children = occ;
 235
 236           /* Try the next block (it may as well be dominated by BB).  */
 237         }
 238
 239       else if (dom == occ_bb)
 240         {
 241           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 242           insert_bb (new_occ, dom, &occ->children);
 243           return;
 244         }
 245
 246       else if (dom != idom)
 247         {
 248           gcc_assert (!dom->aux);
 249
 250           /* There is a dominator between IDOM and BB, add it and make
 251              two children out of NEW_OCC and OCC.  First, remove OCC from
 252              its list.  */
 253           *p_occ = occ->next;
 254           new_occ->next = occ;
 255           occ->next = NULL;
 256
 257           /* None of the previous blocks has DOM as a dominator: if we tail
 258              recursed, we would reexamine them uselessly. Just switch BB with
 259              DOM, and go on looking for blocks dominated by DOM.  */
 260           new_occ = occ_new (dom, new_occ);
 261         }
 262
 263       else
 264         {
 265           /* Nothing special, go on with the next element.  */
 266           p_occ = &occ->next;
 267         }
 268     }
 269
 270   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 271   new_occ->next = *p_head;
 272   *p_head = new_occ;
 273 }
 274
 275 /* Register that we found a division in BB.  */
 276
 277 static inline void
 278 register_division_in (basic_block bb)
 279 {
 280   struct occurrence *occ;
 281
 282   occ = (struct occurrence *) bb->aux;
 283   if (!occ)
 284     {
 285       occ = occ_new (bb, NULL);
 286       insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
 287     }
 288
 289   occ->bb_has_division = true;
 290   occ->num_divisions++;
 291 }
 292
 293
 294 /* Compute the number of divisions that postdominate each block in OCC and
 295    its children.  */
 296
 297 static void
 298 compute_merit (struct occurrence *occ)
 299 {
 300   struct occurrence *occ_child;
 301   basic_block dom = occ->bb;
 302
 303   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 304     {
 305       basic_block bb;
 306       if (occ_child->children)
 307         compute_merit (occ_child);
 308
 309       if (flag_exceptions)
 310         bb = single_noncomplex_succ (dom);
 311       else
 312         bb = dom;
 313
 314       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 315         occ->num_divisions += occ_child->num_divisions;
 316     }
 317 }
 318
 319
 320 /* Return whether USE_STMT is a floating-point division by DEF.  */
 321 static inline bool
 322 is_division_by (gimple use_stmt, tree def)
 323 {
 324   return is_gimple_assign (use_stmt)
 325          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 326          && gimple_assign_rhs2 (use_stmt) == def
 327          /* Do not recognize x / x as valid division, as we are getting
 328             confused later by replacing all immediate uses x in such
 329             a stmt.  */
 330          && gimple_assign_rhs1 (use_stmt) != def;
 331 }
 332
 333 /* Walk the subset of the dominator tree rooted at OCC, setting the
 334    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 335    the given basic block.  The field may be left NULL, of course,
 336    if it is not possible or profitable to do the optimization.
 337
 338    DEF_BSI is an iterator pointing at the statement defining DEF.
 339    If RECIP_DEF is set, a dominator already has a computation that can
 340    be used.  */
 341
 342 static void
 343 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 344                     tree def, tree recip_def, int threshold)
 345 {
 346   tree type;
 347   gimple new_stmt;
 348   gimple_stmt_iterator gsi;
 349   struct occurrence *occ_child;
 350
 351   if (!recip_def
 352       && (occ->bb_has_division || !flag_trapping_math)
 353       && occ->num_divisions >= threshold)
 354     {
 355       /* Make a variable with the replacement and substitute it.  */
 356       type = TREE_TYPE (def);
 357       recip_def = create_tmp_reg (type, "reciptmp");
 358       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 359                                                build_one_cst (type), def);
 360
 361       if (occ->bb_has_division)
 362         {
 363           /* Case 1: insert before an existing division.  */
 364           gsi = gsi_after_labels (occ->bb);
 365           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 366             gsi_next (&gsi);
 367
 368           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 369         }
 370       else if (def_gsi && occ->bb == def_gsi->bb)
 371         {
 372           /* Case 2: insert right after the definition.  Note that this will
 373              never happen if the definition statement can throw, because in
 374              that case the sole successor of the statement's basic block will
 375              dominate all the uses as well.  */
 376           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 377         }
 378       else
 379         {
 380           /* Case 3: insert in a basic block not containing defs/uses.  */
 381           gsi = gsi_after_labels (occ->bb);
 382           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 383         }
 384
 385       reciprocal_stats.rdivs_inserted++;
 386
 387       occ->recip_def_stmt = new_stmt;
 388     }
 389
 390   occ->recip_def = recip_def;
 391   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 392     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 393 }
 394
 395
 396 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 397    possible.  */
 398
 399 static inline void
 400 replace_reciprocal (use_operand_p use_p)
 401 {
 402   gimple use_stmt = USE_STMT (use_p);
 403   basic_block bb = gimple_bb (use_stmt);
 404   struct occurrence *occ = (struct occurrence *) bb->aux;
 405
 406   if (optimize_bb_for_speed_p (bb)
 407       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 408     {
 409       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 410       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 411       SET_USE (use_p, occ->recip_def);
 412       fold_stmt_inplace (&gsi);
 413       update_stmt (use_stmt);
 414     }
 415 }
 416
 417
 418 /* Free OCC and return one more "struct occurrence" to be freed.  */
 419
 420 static struct occurrence *
 421 free_bb (struct occurrence *occ)
 422 {
 423   struct occurrence *child, *next;
 424
 425   /* First get the two pointers hanging off OCC.  */
 426   next = occ->next;
 427   child = occ->children;
 428   occ->bb->aux = NULL;
 429   pool_free (occ_pool, occ);
 430
 431   /* Now ensure that we don't recurse unless it is necessary.  */
 432   if (!child)
 433     return next;
 434   else
 435     {
 436       while (next)
 437         next = free_bb (next);
 438
 439       return child;
 440     }
 441 }
 442
 443
 444 /* Look for floating-point divisions among DEF's uses, and try to
 445    replace them by multiplications with the reciprocal.  Add
 446    as many statements computing the reciprocal as needed.
 447
 448    DEF must be a GIMPLE register of a floating-point type.  */
 449
 450 static void
 451 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 452 {
 453   use_operand_p use_p;
 454   imm_use_iterator use_iter;
 455   struct occurrence *occ;
 456   int count = 0, threshold;
 457
 458   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 459
 460   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 461     {
 462       gimple use_stmt = USE_STMT (use_p);
 463       if (is_division_by (use_stmt, def))
 464         {
 465           register_division_in (gimple_bb (use_stmt));
 466           count++;
 467         }
 468     }
 469
 470   /* Do the expensive part only if we can hope to optimize something.  */
 471   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 472   if (count >= threshold)
 473     {
 474       gimple use_stmt;
 475       for (occ = occ_head; occ; occ = occ->next)
 476         {
 477           compute_merit (occ);
 478           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 479         }
 480
 481       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 482         {
 483           if (is_division_by (use_stmt, def))
 484             {
 485               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 486                 replace_reciprocal (use_p);
 487             }
 488         }
 489     }
 490
 491   for (occ = occ_head; occ; )
 492     occ = free_bb (occ);
 493
 494   occ_head = NULL;
 495 }
 496
 497 static bool
 498 gate_cse_reciprocals (void)
 499 {
 500   return optimize && flag_reciprocal_math;
 501 }
 502
 503 /* Go through all the floating-point SSA_NAMEs, and call
 504    execute_cse_reciprocals_1 on each of them.  */
 505 static unsigned int
 506 execute_cse_reciprocals (void)
 507 {
 508   basic_block bb;
 509   tree arg;
 510
 511   occ_pool = create_alloc_pool ("dominators for recip",
 512                                 sizeof (struct occurrence),
 513                                 n_basic_blocks / 3 + 1);
 514
 515   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 516   calculate_dominance_info (CDI_DOMINATORS);
 517   calculate_dominance_info (CDI_POST_DOMINATORS);
 518
 519 #ifdef ENABLE_CHECKING
 520   FOR_EACH_BB (bb)
 521     gcc_assert (!bb->aux);
 522 #endif
 523
 524   for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
 525     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 526         && is_gimple_reg (arg))
 527       {
 528         tree name = ssa_default_def (cfun, arg);
 529         if (name)
 530           execute_cse_reciprocals_1 (NULL, name);
 531       }
 532
 533   FOR_EACH_BB (bb)
 534     {
 535       gimple_stmt_iterator gsi;
 536       gimple phi;
 537       tree def;
 538
 539       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 540         {
 541           phi = gsi_stmt (gsi);
 542           def = PHI_RESULT (phi);
 543           if (! virtual_operand_p (def)
 544               && FLOAT_TYPE_P (TREE_TYPE (def)))
 545             execute_cse_reciprocals_1 (NULL, def);
 546         }
 547
 548       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 549         {
 550           gimple stmt = gsi_stmt (gsi);
 551
 552           if (gimple_has_lhs (stmt)
 553               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 554               && FLOAT_TYPE_P (TREE_TYPE (def))
 555               && TREE_CODE (def) == SSA_NAME)
 556             execute_cse_reciprocals_1 (&gsi, def);
 557         }
 558
 559       if (optimize_bb_for_size_p (bb))
 560         continue;
 561
 562       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 563       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 564         {
 565           gimple stmt = gsi_stmt (gsi);
 566           tree fndecl;
 567
 568           if (is_gimple_assign (stmt)
 569               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 570             {
 571               tree arg1 = gimple_assign_rhs2 (stmt);
 572               gimple stmt1;
 573
 574               if (TREE_CODE (arg1) != SSA_NAME)
 575                 continue;
 576
 577               stmt1 = SSA_NAME_DEF_STMT (arg1);
 578
 579               if (is_gimple_call (stmt1)
 580                   && gimple_call_lhs (stmt1)
 581                   && (fndecl = gimple_call_fndecl (stmt1))
 582                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 583                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 584                 {
 585                   enum built_in_function code;
 586                   bool md_code, fail;
 587                   imm_use_iterator ui;
 588                   use_operand_p use_p;
 589
 590                   code = DECL_FUNCTION_CODE (fndecl);
 591                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 592
 593                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 594                   if (!fndecl)
 595                     continue;
 596
 597                   /* Check that all uses of the SSA name are divisions,
 598                      otherwise replacing the defining statement will do
 599                      the wrong thing.  */
 600                   fail = false;
 601                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 602                     {
 603                       gimple stmt2 = USE_STMT (use_p);
 604                       if (is_gimple_debug (stmt2))
 605                         continue;
 606                       if (!is_gimple_assign (stmt2)
 607                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 608                           || gimple_assign_rhs1 (stmt2) == arg1
 609                           || gimple_assign_rhs2 (stmt2) != arg1)
 610                         {
 611                           fail = true;
 612                           break;
 613                         }
 614                     }
 615                   if (fail)
 616                     continue;
 617
 618                   gimple_replace_ssa_lhs (stmt1, arg1);
 619                   gimple_call_set_fndecl (stmt1, fndecl);
 620                   update_stmt (stmt1);
 621                   reciprocal_stats.rfuncs_inserted++;
 622
 623                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 624                     {
 625                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 626                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 627                       fold_stmt_inplace (&gsi);
 628                       update_stmt (stmt);
 629                     }
 630                 }
 631             }
 632         }
 633     }
 634
 635   statistics_counter_event (cfun, "reciprocal divs inserted",
 636                             reciprocal_stats.rdivs_inserted);
 637   statistics_counter_event (cfun, "reciprocal functions inserted",
 638                             reciprocal_stats.rfuncs_inserted);
 639
 640   free_dominance_info (CDI_DOMINATORS);
 641   free_dominance_info (CDI_POST_DOMINATORS);
 642   free_alloc_pool (occ_pool);
 643   return 0;
 644 }
 645
 646 namespace {
 647
 648 const pass_data pass_data_cse_reciprocals =
 649 {
 650   GIMPLE_PASS, /* type */
 651   "recip", /* name */
 652   OPTGROUP_NONE, /* optinfo_flags */
 653   true, /* has_gate */
 654   true, /* has_execute */
 655   TV_NONE, /* tv_id */
 656   PROP_ssa, /* properties_required */
 657   0, /* properties_provided */
 658   0, /* properties_destroyed */
 659   0, /* todo_flags_start */
 660   ( TODO_update_ssa | TODO_verify_ssa
 661     | TODO_verify_stmts ), /* todo_flags_finish */
 662 };
 663
 664 class pass_cse_reciprocals : public gimple_opt_pass
 665 {
 666 public:
 667   pass_cse_reciprocals (gcc::context *ctxt)
 668     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 669   {}
 670
 671   /* opt_pass methods: */
 672   bool gate () { return gate_cse_reciprocals (); }
 673   unsigned int execute () { return execute_cse_reciprocals (); }
 674
 675 }; // class pass_cse_reciprocals
 676
 677 } // anon namespace
 678
 679 gimple_opt_pass *
 680 make_pass_cse_reciprocals (gcc::context *ctxt)
 681 {
 682   return new pass_cse_reciprocals (ctxt);
 683 }
 684
 685 /* Records an occurrence at statement USE_STMT in the vector of trees
 686    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 687    is not yet initialized.  Returns true if the occurrence was pushed on
 688    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 689    statements in the vector.  */
 690
 691 static bool
 692 maybe_record_sincos (vec<gimple> *stmts,
 693                      basic_block *top_bb, gimple use_stmt)
 694 {
 695   basic_block use_bb = gimple_bb (use_stmt);
 696   if (*top_bb
 697       && (*top_bb == use_bb
 698           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 699     stmts->safe_push (use_stmt);
 700   else if (!*top_bb
 701            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 702     {
 703       stmts->safe_push (use_stmt);
 704       *top_bb = use_bb;
 705     }
 706   else
 707     return false;
 708
 709   return true;
 710 }
 711
 712 /* Look for sin, cos and cexpi calls with the same argument NAME and
 713    create a single call to cexpi CSEing the result in this case.
 714    We first walk over all immediate uses of the argument collecting
 715    statements that we can CSE in a vector and in a second pass replace
 716    the statement rhs with a REALPART or IMAGPART expression on the
 717    result of the cexpi call we insert before the use statement that
 718    dominates all other candidates.  */
 719
 720 static bool
 721 execute_cse_sincos_1 (tree name)
 722 {
 723   gimple_stmt_iterator gsi;
 724   imm_use_iterator use_iter;
 725   tree fndecl, res, type;
 726   gimple def_stmt, use_stmt, stmt;
 727   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 728   vec<gimple> stmts = vNULL;
 729   basic_block top_bb = NULL;
 730   int i;
 731   bool cfg_changed = false;
 732
 733   type = TREE_TYPE (name);
 734   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 735     {
 736       if (gimple_code (use_stmt) != GIMPLE_CALL
 737           || !gimple_call_lhs (use_stmt)
 738           || !(fndecl = gimple_call_fndecl (use_stmt))
 739           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 740         continue;
 741
 742       switch (DECL_FUNCTION_CODE (fndecl))
 743         {
 744         CASE_FLT_FN (BUILT_IN_COS):
 745           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 746           break;
 747
 748         CASE_FLT_FN (BUILT_IN_SIN):
 749           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 750           break;
 751
 752         CASE_FLT_FN (BUILT_IN_CEXPI):
 753           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 754           break;
 755
 756         default:;
 757         }
 758     }
 759
 760   if (seen_cos + seen_sin + seen_cexpi <= 1)
 761     {
 762       stmts.release ();
 763       return false;
 764     }
 765
 766   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 767      the name def statement.  */
 768   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 769   if (!fndecl)
 770     return false;
 771   stmt = gimple_build_call (fndecl, 1, name);
 772   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 773   gimple_call_set_lhs (stmt, res);
 774
 775   def_stmt = SSA_NAME_DEF_STMT (name);
 776   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 777       && gimple_code (def_stmt) != GIMPLE_PHI
 778       && gimple_bb (def_stmt) == top_bb)
 779     {
 780       gsi = gsi_for_stmt (def_stmt);
 781       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 782     }
 783   else
 784     {
 785       gsi = gsi_after_labels (top_bb);
 786       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 787     }
 788   sincos_stats.inserted++;
 789
 790   /* And adjust the recorded old call sites.  */
 791   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 792     {
 793       tree rhs = NULL;
 794       fndecl = gimple_call_fndecl (use_stmt);
 795
 796       switch (DECL_FUNCTION_CODE (fndecl))
 797         {
 798         CASE_FLT_FN (BUILT_IN_COS):
 799           rhs = fold_build1 (REALPART_EXPR, type, res);
 800           break;
 801
 802         CASE_FLT_FN (BUILT_IN_SIN):
 803           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 804           break;
 805
 806         CASE_FLT_FN (BUILT_IN_CEXPI):
 807           rhs = res;
 808           break;
 809
 810         default:;
 811           gcc_unreachable ();
 812         }
 813
 814         /* Replace call with a copy.  */
 815         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 816
 817         gsi = gsi_for_stmt (use_stmt);
 818         gsi_replace (&gsi, stmt, true);
 819         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 820           cfg_changed = true;
 821     }
 822
 823   stmts.release ();
 824
 825   return cfg_changed;
 826 }
 827
 828 /* To evaluate powi(x,n), the floating point value x raised to the
 829    constant integer exponent n, we use a hybrid algorithm that
 830    combines the "window method" with look-up tables.  For an
 831    introduction to exponentiation algorithms and "addition chains",
 832    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 833    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 834    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 835    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 836
 837 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 838    multiplications to inline before calling the system library's pow
 839    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 840    so this default never requires calling pow, powf or powl.  */
 841
 842 #ifndef POWI_MAX_MULTS
 843 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 844 #endif
 845
 846 /* The size of the "optimal power tree" lookup table.  All
 847    exponents less than this value are simply looked up in the
 848    powi_table below.  This threshold is also used to size the
 849    cache of pseudo registers that hold intermediate results.  */
 850 #define POWI_TABLE_SIZE 256
 851
 852 /* The size, in bits of the window, used in the "window method"
 853    exponentiation algorithm.  This is equivalent to a radix of
 854    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 855 #define POWI_WINDOW_SIZE 3
 856
/* The following table is an efficient representation of an
   "optimal power tree".  For each value, i, the corresponding
   value, j, in the table states that an optimal evaluation
   sequence for calculating pow(x,i) can be found by evaluating
   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
   100 integers is given in Knuth's "Seminumerical algorithms".

   powi_lookup_cost and powi_as_mults_1 both split an exponent N
   into powi_table[N] and N - powi_table[N] using this table.  */

static const unsigned char powi_table[POWI_TABLE_SIZE] =
  {
      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
  };
 899
 900
 901 /* Return the number of multiplications required to calculate
 902    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 903    subroutine of powi_cost.  CACHE is an array indicating
 904    which exponents have already been calculated.  */
 905
 906 static int
 907 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 908 {
 909   /* If we've already calculated this exponent, then this evaluation
 910      doesn't require any additional multiplications.  */
 911   if (cache[n])
 912     return 0;
 913
 914   cache[n] = true;
 915   return powi_lookup_cost (n - powi_table[n], cache)
 916          + powi_lookup_cost (powi_table[n], cache) + 1;
 917 }
 918
 919 /* Return the number of multiplications required to calculate
 920    powi(x,n) for an arbitrary x, given the exponent N.  This
 921    function needs to be kept in sync with powi_as_mults below.  */
 922
 923 static int
 924 powi_cost (HOST_WIDE_INT n)
 925 {
 926   bool cache[POWI_TABLE_SIZE];
 927   unsigned HOST_WIDE_INT digit;
 928   unsigned HOST_WIDE_INT val;
 929   int result;
 930
 931   if (n == 0)
 932     return 0;
 933
 934   /* Ignore the reciprocal when calculating the cost.  */
 935   val = (n < 0) ? -n : n;
 936
 937   /* Initialize the exponent cache.  */
 938   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 939   cache[1] = true;
 940
 941   result = 0;
 942
 943   while (val >= POWI_TABLE_SIZE)
 944     {
 945       if (val & 1)
 946         {
 947           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 948           result += powi_lookup_cost (digit, cache)
 949                     + POWI_WINDOW_SIZE + 1;
 950           val >>= POWI_WINDOW_SIZE;
 951         }
 952       else
 953         {
 954           val >>= 1;
 955           result++;
 956         }
 957     }
 958
 959   return result + powi_lookup_cost (val, cache);
 960 }
 961
 962 /* Recursive subroutine of powi_as_mults.  This function takes the
 963    array, CACHE, of already calculated exponents and an exponent N and
 964    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 965
 966 static tree
 967 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 968                  HOST_WIDE_INT n, tree *cache)
 969 {
 970   tree op0, op1, ssa_target;
 971   unsigned HOST_WIDE_INT digit;
 972   gimple mult_stmt;
 973
 974   if (n < POWI_TABLE_SIZE && cache[n])
 975     return cache[n];
 976
 977   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 978
 979   if (n < POWI_TABLE_SIZE)
 980     {
 981       cache[n] = ssa_target;
 982       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 983       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
 984     }
 985   else if (n & 1)
 986     {
 987       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 988       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
 989       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
 990     }
 991   else
 992     {
 993       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
 994       op1 = op0;
 995     }
 996
 997   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
 998   gimple_set_location (mult_stmt, loc);
 999   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1000
1001   return ssa_target;
1002 }
1003
1004 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1005    This function needs to be kept in sync with powi_cost above.  */
1006
1007 static tree
1008 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1009                tree arg0, HOST_WIDE_INT n)
1010 {
1011   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1012   gimple div_stmt;
1013   tree target;
1014
1015   if (n == 0)
1016     return build_real (type, dconst1);
1017
1018   memset (cache, 0,  sizeof (cache));
1019   cache[1] = arg0;
1020
1021   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1022   if (n >= 0)
1023     return result;
1024
1025   /* If the original exponent was negative, reciprocate the result.  */
1026   target = make_temp_ssa_name (type, NULL, "powmult");
1027   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1028                                            build_real (type, dconst1),
1029                                            result);
1030   gimple_set_location (div_stmt, loc);
1031   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1032
1033   return target;
1034 }
1035
1036 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1037    location info LOC.  If the arguments are appropriate, create an
1038    equivalent sequence of statements prior to GSI using an optimal
1039    number of multiplications, and return an expession holding the
1040    result.  */
1041
1042 static tree
1043 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1044                             tree arg0, HOST_WIDE_INT n)
1045 {
1046   /* Avoid largest negative number.  */
1047   if (n != -n
1048       && ((n >= -1 && n <= 2)
1049           || (optimize_function_for_speed_p (cfun)
1050               && powi_cost (n) <= POWI_MAX_MULTS)))
1051     return powi_as_mults (gsi, loc, arg0, n);
1052
1053   return NULL_TREE;
1054 }
1055
1056 /* Build a gimple call statement that calls FN with argument ARG.
1057    Set the lhs of the call statement to a fresh SSA name.  Insert the
1058    statement prior to GSI's current position, and return the fresh
1059    SSA name.  */
1060
1061 static tree
1062 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1063                        tree fn, tree arg)
1064 {
1065   gimple call_stmt;
1066   tree ssa_target;
1067
1068   call_stmt = gimple_build_call (fn, 1, arg);
1069   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1070   gimple_set_lhs (call_stmt, ssa_target);
1071   gimple_set_location (call_stmt, loc);
1072   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1073
1074   return ssa_target;
1075 }
1076
1077 /* Build a gimple binary operation with the given CODE and arguments
1078    ARG0, ARG1, assigning the result to a new SSA name for variable
1079    TARGET.  Insert the statement prior to GSI's current position, and
1080    return the fresh SSA name.*/
1081
1082 static tree
1083 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1084                         const char *name, enum tree_code code,
1085                         tree arg0, tree arg1)
1086 {
1087   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1088   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1089   gimple_set_location (stmt, loc);
1090   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1091   return result;
1092 }
1093
1094 /* Build a gimple reference operation with the given CODE and argument
1095    ARG, assigning the result to a new SSA name of TYPE with NAME.
1096    Insert the statement prior to GSI's current position, and return
1097    the fresh SSA name.  */
1098
1099 static inline tree
1100 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1101                       const char *name, enum tree_code code, tree arg0)
1102 {
1103   tree result = make_temp_ssa_name (type, NULL, name);
1104   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1105   gimple_set_location (stmt, loc);
1106   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1107   return result;
1108 }
1109
1110 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1111    prior to GSI's current position, and return the fresh SSA name.  */
1112
1113 static tree
1114 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1115                        tree type, tree val)
1116 {
1117   tree result = make_ssa_name (type, NULL);
1118   gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1119   gimple_set_location (stmt, loc);
1120   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1121   return result;
1122 }
1123
/* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
   with location info LOC.  If possible, create an equivalent and
   less expensive sequence of statements prior to GSI, and return an
   expression holding the result.  Return NULL_TREE when ARG1 is not a
   REAL_CST or when none of the transformations below applies.

   The cases tried, in order, are: integral exponents (expanded via
   gimple_expand_builtin_powi), 0.5, 0.25, 0.75, 1/3, 1/6, exponents
   that are an odd multiple of 1/2, and exponents that are a multiple
   of 1/3.  Apart from small integral exponents and the 0.5 case, all
   of them require flag_unsafe_math_optimizations.  */

static tree
gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
                           tree arg0, tree arg1)
{
  REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
  REAL_VALUE_TYPE c2, dconst3;
  HOST_WIDE_INT n;
  tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
  enum machine_mode mode;
  bool hw_sqrt_exists, c_is_int, c2_is_int;

  /* If the exponent isn't a constant, there's nothing of interest
     to be done.  */
  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  /* If the exponent is equivalent to an integer, expand to an optimal
     multiplication sequence when profitable.  C is integral when
     converting it to an integer and back gives an identical value.  */
  c = TREE_REAL_CST (arg1);
  n = real_to_integer (&c);
  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
  c_is_int = real_identical (&c, &cint);

  if (c_is_int
      && ((n >= -1 && n <= 2)
          || (flag_unsafe_math_optimizations
              && optimize_insn_for_speed_p ()
              && powi_cost (n) <= POWI_MAX_MULTS)))
    return gimple_expand_builtin_powi (gsi, loc, arg0, n);

  /* Attempt various optimizations using sqrt and cbrt.  */
  type = TREE_TYPE (arg0);
  mode = TYPE_MODE (type);
  sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);

  /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
     unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
     sqrt(-0) = -0.  */
  if (sqrtfn
      && REAL_VALUES_EQUAL (c, dconsthalf)
      && !HONOR_SIGNED_ZEROS (mode))
    return build_and_insert_call (gsi, loc, sqrtfn, arg0);

  /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
     a builtin sqrt instruction is smaller than a call to pow with 0.25,
     so do this optimization even if -Os.  Don't do this optimization
     if we don't have a hardware sqrt insn.  The constant 0.25 is formed
     from 1.0 by lowering its exponent by two.  */
  dconst1_4 = dconst1;
  SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
  hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && REAL_VALUES_EQUAL (c, dconst1_4)
      && hw_sqrt_exists)
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* sqrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
    }

  /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
     optimizing for space.  Don't do this optimization if we don't have
     a hardware sqrt insn.  The constant 0.75 is formed from 3.0 by
     lowering its exponent by two.  */
  real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
  SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && optimize_function_for_speed_p (cfun)
      && REAL_VALUES_EQUAL (c, dconst3_4)
      && hw_sqrt_exists)
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* sqrt(sqrt(x))  */
      sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);

      /* sqrt(x) * sqrt(sqrt(x))  */
      return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                     sqrt_arg0, sqrt_sqrt);
    }

  /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
     optimizations since 1./3. is not exactly representable.  If x
     is negative and finite, the correct value of pow(x,1./3.) is
     a NaN with the "invalid" exception raised, because the value
     of 1./3. actually has an even denominator.  The correct value
     of cbrt(x) is a negative real value.  */
  cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
  dconst1_3 = real_value_truncate (mode, dconst_third ());

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && REAL_VALUES_EQUAL (c, dconst1_3))
    return build_and_insert_call (gsi, loc, cbrtfn, arg0);

  /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
     if we don't have a hardware sqrt insn.  The constant 1./6. is
     formed from the truncated 1./3. by lowering its exponent by one.  */
  dconst1_6 = dconst1_3;
  SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && optimize_function_for_speed_p (cfun)
      && hw_sqrt_exists
      && REAL_VALUES_EQUAL (c, dconst1_6))
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* cbrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
    }

  /* Optimize pow(x,c), where n = 2c for some nonzero integer n
     and c not an integer, into

       sqrt(x) * powi(x, n/2),                n > 0;
       1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.

     Do not calculate the powi factor when n/2 = 0.  */
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
  c2_is_int = real_identical (&c2, &cint);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && c2_is_int
      && !c_is_int
      && optimize_function_for_speed_p (cfun))
    {
      tree powi_x_ndiv2 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
         possible or profitable, give up.  Skip the degenerate case when
         n is 1 or -1, where the result is always 1.  */
      if (absu_hwi (n) != 1)
        {
          powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
                                                     abs_hwi (n / 2));
          if (!powi_x_ndiv2)
            return NULL_TREE;
        }

      /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
         result of the optimal multiply sequence just calculated.  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      if (absu_hwi (n) == 1)
        result = sqrt_arg0;
      else
        result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                         sqrt_arg0, powi_x_ndiv2);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
        result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
                                         build_real (type, dconst1), result);
      return result;
    }

  /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into

     powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
     1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.

     Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
     different from pow(x, 1./3.) due to rounding and behavior with
     negative x, we need to constrain this transformation to unsafe
     math and positive x or finite math.

     N is 3c rounded to an integer; C2 is then recomputed as N/3 in
     MODE so that the real_identical test below can confirm that C
     really is that quotient.  */
  real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
  real_round (&c2, mode, &c2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
  real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
  real_convert (&c2, mode, &c2);

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && real_identical (&c2, &c)
      && !c2_is_int
      && optimize_function_for_speed_p (cfun)
      && powi_cost (n / 3) <= POWI_MAX_MULTS)
    {
      tree powi_x_ndiv3 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
         possible or profitable, give up.  Skip the degenerate case when
         abs(n) < 3, where the result is always 1.  */
      if (absu_hwi (n) >= 3)
        {
          powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
                                                     abs_hwi (n / 3));
          if (!powi_x_ndiv3)
            return NULL_TREE;
        }

      /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
         as that creates an unnecessary variable.  Instead, just produce
         either cbrt(x) or cbrt(x) * cbrt(x).  */
      cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);

      if (absu_hwi (n) % 3 == 1)
        powi_cbrt_x = cbrt_x;
      else
        powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                              cbrt_x, cbrt_x);

      /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
      if (absu_hwi (n) < 3)
        result = powi_cbrt_x;
      else
        result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                         powi_x_ndiv3, powi_cbrt_x);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
        result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
                                         build_real (type, dconst1), result);

      return result;
    }

  /* No optimizations succeeded.  */
  return NULL_TREE;
}
1365
1366 /* ARG is the argument to a cabs builtin call in GSI with location info
1367    LOC.  Create a sequence of statements prior to GSI that calculates
1368    sqrt(R*R + I*I), where R and I are the real and imaginary components
1369    of ARG, respectively.  Return an expression holding the result.  */
1370
1371 static tree
1372 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1373 {
1374   tree real_part, imag_part, addend1, addend2, sum, result;
1375   tree type = TREE_TYPE (TREE_TYPE (arg));
1376   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1377   enum machine_mode mode = TYPE_MODE (type);
1378
1379   if (!flag_unsafe_math_optimizations
1380       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1381       || !sqrtfn
1382       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1383     return NULL_TREE;
1384
1385   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1386                                     REALPART_EXPR, arg);
1387   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1388                                     real_part, real_part);
1389   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1390                                     IMAGPART_EXPR, arg);
1391   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1392                                     imag_part, imag_part);
1393   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1394   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1395
1396   return result;
1397 }
1398
/* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
   on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
   an optimal number of multiplies, when n is a constant, and try to
   replace calls to pow and cabs with cheaper sequences through
   gimple_expand_builtin_pow and gimple_expand_builtin_cabs.

   Only calls to normal builtins whose result is used (that have a
   lhs) are considered.  Returns TODO_cleanup_cfg when the CFG was
   changed and 0 otherwise.  */

static unsigned int
execute_cse_sincos (void)
{
  basic_block bb;
  bool cfg_changed = false;

  calculate_dominance_info (CDI_DOMINATORS);
  memset (&sincos_stats, 0, sizeof (sincos_stats));

  FOR_EACH_BB (bb)
    {
      gimple_stmt_iterator gsi;
      bool cleanup_eh = false;

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gimple stmt = gsi_stmt (gsi);
          tree fndecl;

          /* Only the last stmt in a bb could throw, no need to call
             gimple_purge_dead_eh_edges if we change something in the middle
             of a basic block.  */
          cleanup_eh = false;

          if (is_gimple_call (stmt)
              && gimple_call_lhs (stmt)
              && (fndecl = gimple_call_fndecl (stmt))
              && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
            {
              tree arg, arg0, arg1, result;
              HOST_WIDE_INT n;
              location_t loc;

              switch (DECL_FUNCTION_CODE (fndecl))
                {
                CASE_FLT_FN (BUILT_IN_COS):
                CASE_FLT_FN (BUILT_IN_SIN):
                CASE_FLT_FN (BUILT_IN_CEXPI):
                  /* Make sure we have either sincos or cexp.  */
                  if (!targetm.libc_has_function (function_c99_math_complex)
                      && !targetm.libc_has_function (function_sincos))
                    break;

                  arg = gimple_call_arg (stmt, 0);
                  if (TREE_CODE (arg) == SSA_NAME)
                    cfg_changed |= execute_cse_sincos_1 (arg);
                  break;

                CASE_FLT_FN (BUILT_IN_POW):
                  arg0 = gimple_call_arg (stmt, 0);
                  arg1 = gimple_call_arg (stmt, 1);

                  loc = gimple_location (stmt);
                  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);

                  /* On success the replacement value has been computed by
                     statements inserted before the call; turn the call
                     itself into a plain copy of that value.  */
                  if (result)
                    {
                      tree lhs = gimple_get_lhs (stmt);
                      gimple new_stmt = gimple_build_assign (lhs, result);
                      gimple_set_location (new_stmt, loc);
                      unlink_stmt_vdef (stmt);
                      gsi_replace (&gsi, new_stmt, true);
                      cleanup_eh = true;
                      if (gimple_vdef (stmt))
                        release_ssa_name (gimple_vdef (stmt));
                    }
                  break;

                CASE_FLT_FN (BUILT_IN_POWI):
                  arg0 = gimple_call_arg (stmt, 0);
                  arg1 = gimple_call_arg (stmt, 1);
                  loc = gimple_location (stmt);

                  /* powi(-1.0, n) only depends on the low bit of n:
                     compute n & 1 and select -1.0 or 1.0 from it.  */
                  if (real_minus_onep (arg0))
                    {
                      tree t0, t1, cond, one, minus_one;
                      /* This STMT shadows the call statement, but only
                         within this block.  */
                      gimple stmt;

                      t0 = TREE_TYPE (arg0);
                      t1 = TREE_TYPE (arg1);
                      one = build_real (t0, dconst1);
                      minus_one = build_real (t0, dconstm1);

                      cond = make_temp_ssa_name (t1, NULL, "powi_cond");
                      stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, cond,
                                                           arg1,
                                                           build_int_cst (t1,
                                                                          1));
                      gimple_set_location (stmt, loc);
                      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);

                      result = make_temp_ssa_name (t0, NULL, "powi");
                      stmt = gimple_build_assign_with_ops (COND_EXPR, result,
                                                           cond,
                                                           minus_one, one);
                      gimple_set_location (stmt, loc);
                      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
                    }
                  else
                    {
                      /* Otherwise only a constant exponent that fits a
                         signed HOST_WIDE_INT can be expanded.  */
                      if (!host_integerp (arg1, 0))
                        break;

                      n = TREE_INT_CST_LOW (arg1);
                      result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
                    }

                  if (result)
                    {
                      tree lhs = gimple_get_lhs (stmt);
                      gimple new_stmt = gimple_build_assign (lhs, result);
                      gimple_set_location (new_stmt, loc);
                      unlink_stmt_vdef (stmt);
                      gsi_replace (&gsi, new_stmt, true);
                      cleanup_eh = true;
                      if (gimple_vdef (stmt))
                        release_ssa_name (gimple_vdef (stmt));
                    }
                  break;

                CASE_FLT_FN (BUILT_IN_CABS):
                  arg0 = gimple_call_arg (stmt, 0);
                  loc = gimple_location (stmt);
                  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);

                  if (result)
                    {
                      tree lhs = gimple_get_lhs (stmt);
                      gimple new_stmt = gimple_build_assign (lhs, result);
                      gimple_set_location (new_stmt, loc);
                      unlink_stmt_vdef (stmt);
                      gsi_replace (&gsi, new_stmt, true);
                      cleanup_eh = true;
                      if (gimple_vdef (stmt))
                        release_ssa_name (gimple_vdef (stmt));
                    }
                  break;

                default:;
                }
            }
        }
      /* CLEANUP_EH is reset for every statement, so it is still set here
         only if the last statement visited in BB was replaced.  */
      if (cleanup_eh)
        cfg_changed |= gimple_purge_dead_eh_edges (bb);
    }

  statistics_counter_event (cfun, "sincos statements inserted",
                            sincos_stats.inserted);

  free_dominance_info (CDI_DOMINATORS);
  return cfg_changed ? TODO_cleanup_cfg : 0;
}
1555
1556 static bool
1557 gate_cse_sincos (void)
1558 {
1559   /* We no longer require either sincos or cexp, since powi expansion
1560      piggybacks on this pass.  */
1561   return optimize;
1562 }
1563
/* Descriptor and pass class for the "sincos" GIMPLE pass.  The class
   only forwards its gate and execute hooks to gate_cse_sincos and
   execute_cse_sincos.  */

namespace {

const pass_data pass_data_cse_sincos =
{
  GIMPLE_PASS, /* type */
  "sincos", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  true, /* has_gate */
  true, /* has_execute */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  ( TODO_update_ssa | TODO_verify_ssa
    | TODO_verify_stmts ), /* todo_flags_finish */
};

class pass_cse_sincos : public gimple_opt_pass
{
public:
  /* Bind the pass to its descriptor and to the context CTXT.  */
  pass_cse_sincos (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_sincos, ctxt)
  {}

  /* opt_pass methods: */
  bool gate () { return gate_cse_sincos (); }
  unsigned int execute () { return execute_cse_sincos (); }

}; // class pass_cse_sincos

} // anon namespace
1596
1597 gimple_opt_pass *
1598 make_pass_cse_sincos (gcc::context *ctxt)
1599 {
1600   return new pass_cse_sincos (ctxt);
1601 }
1602
/* A symbolic number is used to detect byte permutation and selection
   patterns.  Therefore the field N contains an artificial number
   consisting of byte size markers:

   0    - byte has the value 0
   1..size - byte contains the content of the byte
   number indexed with that value minus one  */

struct symbolic_number {
  /* The byte markers described above.  */
  unsigned HOST_WIDEST_INT n;
  /* Width of the number; do_shift_rotate and verify_symbolic_number_p
     multiply it by BITS_PER_UNIT to obtain a bit count.  */
  int size;
};
1615
1616 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1617    number N.  Return false if the requested operation is not permitted
1618    on a symbolic number.  */
1619
1620 static inline bool
1621 do_shift_rotate (enum tree_code code,
1622                  struct symbolic_number *n,
1623                  int count)
1624 {
1625   if (count % 8 != 0)
1626     return false;
1627
1628   /* Zero out the extra bits of N in order to avoid them being shifted
1629      into the significant bits.  */
1630   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1631     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1632
1633   switch (code)
1634     {
1635     case LSHIFT_EXPR:
1636       n->n <<= count;
1637       break;
1638     case RSHIFT_EXPR:
1639       n->n >>= count;
1640       break;
1641     case LROTATE_EXPR:
1642       n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1643       break;
1644     case RROTATE_EXPR:
1645       n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1646       break;
1647     default:
1648       return false;
1649     }
1650   /* Zero unused bits for size.  */
1651   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1652     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1653   return true;
1654 }
1655
1656 /* Perform sanity checking for the symbolic number N and the gimple
1657    statement STMT.  */
1658
1659 static inline bool
1660 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1661 {
1662   tree lhs_type;
1663
1664   lhs_type = gimple_expr_type (stmt);
1665
1666   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1667     return false;
1668
1669   if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1670     return false;
1671
1672   return true;
1673 }
1674
1675 /* find_bswap_1 invokes itself recursively with N and tries to perform
1676    the operation given by the rhs of STMT on the result.  If the
1677    operation could successfully be executed the function returns the
1678    tree expression of the source operand and NULL otherwise.  */
1679
1680 static tree
1681 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1682 {
1683   enum tree_code code;
1684   tree rhs1, rhs2 = NULL;
1685   gimple rhs1_stmt, rhs2_stmt;
1686   tree source_expr1;
1687   enum gimple_rhs_class rhs_class;
1688
1689   if (!limit || !is_gimple_assign (stmt))
1690     return NULL_TREE;
1691
1692   rhs1 = gimple_assign_rhs1 (stmt);
1693
1694   if (TREE_CODE (rhs1) != SSA_NAME)
1695     return NULL_TREE;
1696
1697   code = gimple_assign_rhs_code (stmt);
1698   rhs_class = gimple_assign_rhs_class (stmt);
1699   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1700
1701   if (rhs_class == GIMPLE_BINARY_RHS)
1702     rhs2 = gimple_assign_rhs2 (stmt);
1703
1704   /* Handle unary rhs and binary rhs with integer constants as second
1705      operand.  */
1706
1707   if (rhs_class == GIMPLE_UNARY_RHS
1708       || (rhs_class == GIMPLE_BINARY_RHS
1709           && TREE_CODE (rhs2) == INTEGER_CST))
1710     {
1711       if (code != BIT_AND_EXPR
1712           && code != LSHIFT_EXPR
1713           && code != RSHIFT_EXPR
1714           && code != LROTATE_EXPR
1715           && code != RROTATE_EXPR
1716           && code != NOP_EXPR
1717           && code != CONVERT_EXPR)
1718         return NULL_TREE;
1719
1720       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1721
1722       /* If find_bswap_1 returned NULL STMT is a leaf node and we have
1723          to initialize the symbolic number.  */
1724       if (!source_expr1)
1725         {
1726           /* Set up the symbolic number N by setting each byte to a
1727              value between 1 and the byte size of rhs1.  The highest
1728              order byte is set to n->size and the lowest order
1729              byte to 1.  */
1730           n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1731           if (n->size % BITS_PER_UNIT != 0)
1732             return NULL_TREE;
1733           n->size /= BITS_PER_UNIT;
1734           n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1735                   (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1736
1737           if (n->size < (int)sizeof (HOST_WIDEST_INT))
1738             n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1739                      (n->size * BITS_PER_UNIT)) - 1;
1740
1741           source_expr1 = rhs1;
1742         }
1743
1744       switch (code)
1745         {
1746         case BIT_AND_EXPR:
1747           {
1748             int i;
1749             unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1750             unsigned HOST_WIDEST_INT tmp = val;
1751
1752             /* Only constants masking full bytes are allowed.  */
1753             for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1754               if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1755                 return NULL_TREE;
1756
1757             n->n &= val;
1758           }
1759           break;
1760         case LSHIFT_EXPR:
1761         case RSHIFT_EXPR:
1762         case LROTATE_EXPR:
1763         case RROTATE_EXPR:
1764           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1765             return NULL_TREE;
1766           break;
1767         CASE_CONVERT:
1768           {
1769             int type_size;
1770
1771             type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1772             if (type_size % BITS_PER_UNIT != 0)
1773               return NULL_TREE;
1774
1775             if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1776               {
1777                 /* If STMT casts to a smaller type mask out the bits not
1778                    belonging to the target type.  */
1779                 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1780               }
1781             n->size = type_size / BITS_PER_UNIT;
1782           }
1783           break;
1784         default:
1785           return NULL_TREE;
1786         };
1787       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1788     }
1789
1790   /* Handle binary rhs.  */
1791
1792   if (rhs_class == GIMPLE_BINARY_RHS)
1793     {
1794       struct symbolic_number n1, n2;
1795       tree source_expr2;
1796
1797       if (code != BIT_IOR_EXPR)
1798         return NULL_TREE;
1799
1800       if (TREE_CODE (rhs2) != SSA_NAME)
1801         return NULL_TREE;
1802
1803       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1804
1805       switch (code)
1806         {
1807         case BIT_IOR_EXPR:
1808           source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1809
1810           if (!source_expr1)
1811             return NULL_TREE;
1812
1813           source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1814
1815           if (source_expr1 != source_expr2
1816               || n1.size != n2.size)
1817             return NULL_TREE;
1818
1819           n->size = n1.size;
1820           n->n = n1.n | n2.n;
1821
1822           if (!verify_symbolic_number_p (n, stmt))
1823             return NULL_TREE;
1824
1825           break;
1826         default:
1827           return NULL_TREE;
1828         }
1829       return source_expr1;
1830     }
1831   return NULL_TREE;
1832 }
1833
1834 /* Check if STMT completes a bswap implementation consisting of ORs,
1835    SHIFTs and ANDs.  Return the source tree expression on which the
1836    byte swap is performed and NULL if no bswap was found.  */
1837
1838 static tree
1839 find_bswap (gimple stmt)
1840 {
1841 /* The number which the find_bswap result should match in order to
1842    have a full byte swap.  The number is shifted to the left according
1843    to the size of the symbolic number before using it.  */
1844   unsigned HOST_WIDEST_INT cmp =
1845     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1846     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1847
1848   struct symbolic_number n;
1849   tree source_expr;
1850   int limit;
1851
1852   /* The last parameter determines the depth search limit.  It usually
1853      correlates directly to the number of bytes to be touched.  We
1854      increase that number by three  here in order to also
1855      cover signed -> unsigned converions of the src operand as can be seen
1856      in libgcc, and for initial shift/and operation of the src operand.  */
1857   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1858   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1859   source_expr =  find_bswap_1 (stmt, &n, limit);
1860
1861   if (!source_expr)
1862     return NULL_TREE;
1863
1864   /* Zero out the extra bits of N and CMP.  */
1865   if (n.size < (int)sizeof (HOST_WIDEST_INT))
1866     {
1867       unsigned HOST_WIDEST_INT mask =
1868         ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1869
1870       n.n &= mask;
1871       cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1872     }
1873
1874   /* A complete byte swap should make the symbolic number to start
1875      with the largest digit in the highest order byte.  */
1876   if (cmp != n.n)
1877     return NULL_TREE;
1878
1879   return source_expr;
1880 }
1881
1882 /* Find manual byte swap implementations and turn them into a bswap
1883    builtin invokation.  */
1884
1885 static unsigned int
1886 execute_optimize_bswap (void)
1887 {
1888   basic_block bb;
1889   bool bswap16_p, bswap32_p, bswap64_p;
1890   bool changed = false;
1891   tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1892
1893   if (BITS_PER_UNIT != 8)
1894     return 0;
1895
1896   if (sizeof (HOST_WIDEST_INT) < 8)
1897     return 0;
1898
1899   bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
1900                && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
1901   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1902                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1903   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1904                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1905                    || (bswap32_p && word_mode == SImode)));
1906
1907   if (!bswap16_p && !bswap32_p && !bswap64_p)
1908     return 0;
1909
1910   /* Determine the argument type of the builtins.  The code later on
1911      assumes that the return and argument type are the same.  */
1912   if (bswap16_p)
1913     {
1914       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1915       bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1916     }
1917
1918   if (bswap32_p)
1919     {
1920       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1921       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1922     }
1923
1924   if (bswap64_p)
1925     {
1926       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1927       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1928     }
1929
1930   memset (&bswap_stats, 0, sizeof (bswap_stats));
1931
1932   FOR_EACH_BB (bb)
1933     {
1934       gimple_stmt_iterator gsi;
1935
1936       /* We do a reverse scan for bswap patterns to make sure we get the
1937          widest match. As bswap pattern matching doesn't handle
1938          previously inserted smaller bswap replacements as sub-
1939          patterns, the wider variant wouldn't be detected.  */
1940       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1941         {
1942           gimple stmt = gsi_stmt (gsi);
1943           tree bswap_src, bswap_type;
1944           tree bswap_tmp;
1945           tree fndecl = NULL_TREE;
1946           int type_size;
1947           gimple call;
1948
1949           if (!is_gimple_assign (stmt)
1950               || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1951             continue;
1952
1953           type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1954
1955           switch (type_size)
1956             {
1957             case 16:
1958               if (bswap16_p)
1959                 {
1960                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1961                   bswap_type = bswap16_type;
1962                 }
1963               break;
1964             case 32:
1965               if (bswap32_p)
1966                 {
1967                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1968                   bswap_type = bswap32_type;
1969                 }
1970               break;
1971             case 64:
1972               if (bswap64_p)
1973                 {
1974                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1975                   bswap_type = bswap64_type;
1976                 }
1977               break;
1978             default:
1979               continue;
1980             }
1981
1982           if (!fndecl)
1983             continue;
1984
1985           bswap_src = find_bswap (stmt);
1986
1987           if (!bswap_src)
1988             continue;
1989
1990           changed = true;
1991           if (type_size == 16)
1992             bswap_stats.found_16bit++;
1993           else if (type_size == 32)
1994             bswap_stats.found_32bit++;
1995           else
1996             bswap_stats.found_64bit++;
1997
1998           bswap_tmp = bswap_src;
1999
2000           /* Convert the src expression if necessary.  */
2001           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2002             {
2003               gimple convert_stmt;
2004               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2005               convert_stmt = gimple_build_assign_with_ops
2006                                 (NOP_EXPR, bswap_tmp, bswap_src, NULL);
2007               gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2008             }
2009
2010           call = gimple_build_call (fndecl, 1, bswap_tmp);
2011
2012           bswap_tmp = gimple_assign_lhs (stmt);
2013
2014           /* Convert the result if necessary.  */
2015           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
2016             {
2017               gimple convert_stmt;
2018               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2019               convert_stmt = gimple_build_assign_with_ops
2020                         (NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
2021               gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2022             }
2023
2024           gimple_call_set_lhs (call, bswap_tmp);
2025
2026           if (dump_file)
2027             {
2028               fprintf (dump_file, "%d bit bswap implementation found at: ",
2029                        (int)type_size);
2030               print_gimple_stmt (dump_file, stmt, 0, 0);
2031             }
2032
2033           gsi_insert_after (&gsi, call, GSI_SAME_STMT);
2034           gsi_remove (&gsi, true);
2035         }
2036     }
2037
2038   statistics_counter_event (cfun, "16-bit bswap implementations found",
2039                             bswap_stats.found_16bit);
2040   statistics_counter_event (cfun, "32-bit bswap implementations found",
2041                             bswap_stats.found_32bit);
2042   statistics_counter_event (cfun, "64-bit bswap implementations found",
2043                             bswap_stats.found_64bit);
2044
2045   return (changed ? TODO_update_ssa | TODO_verify_ssa
2046           | TODO_verify_stmts : 0);
2047 }
2048
2049 static bool
2050 gate_optimize_bswap (void)
2051 {
2052   return flag_expensive_optimizations && optimize;
2053 }
2054
namespace {

/* Static description of the "bswap" GIMPLE pass.  It has both a gate
   and an execute hook, requires SSA form, and requests no TODO flags
   up front; execute_optimize_bswap returns the flags it needs.  */
const pass_data pass_data_optimize_bswap =
{
  GIMPLE_PASS, /* type */
  "bswap", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  true, /* has_gate */
  true, /* has_execute */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

/* Pass object for the bswap pass.  It only forwards to the file-local
   gate_optimize_bswap and execute_optimize_bswap functions.  */
class pass_optimize_bswap : public gimple_opt_pass
{
public:
  pass_optimize_bswap (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
  {}

  /* opt_pass methods: */
  bool gate () { return gate_optimize_bswap (); }
  unsigned int execute () { return execute_optimize_bswap (); }

}; // class pass_optimize_bswap

} // anon namespace
2086
2087 gimple_opt_pass *
2088 make_pass_optimize_bswap (gcc::context *ctxt)
2089 {
2090   return new pass_optimize_bswap (ctxt);
2091 }
2092
2093 /* Return true if stmt is a type conversion operation that can be stripped
2094    when used in a widening multiply operation.  */
2095 static bool
2096 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2097 {
2098   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2099
2100   if (TREE_CODE (result_type) == INTEGER_TYPE)
2101     {
2102       tree op_type;
2103       tree inner_op_type;
2104
2105       if (!CONVERT_EXPR_CODE_P (rhs_code))
2106         return false;
2107
2108       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2109
2110       /* If the type of OP has the same precision as the result, then
2111          we can strip this conversion.  The multiply operation will be
2112          selected to create the correct extension as a by-product.  */
2113       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2114         return true;
2115
2116       /* We can also strip a conversion if it preserves the signed-ness of
2117          the operation and doesn't narrow the range.  */
2118       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2119
2120       /* If the inner-most type is unsigned, then we can strip any
2121          intermediate widening operation.  If it's signed, then the
2122          intermediate widening operation must also be signed.  */
2123       if ((TYPE_UNSIGNED (inner_op_type)
2124            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2125           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2126         return true;
2127
2128       return false;
2129     }
2130
2131   return rhs_code == FIXED_CONVERT_EXPR;
2132 }
2133
2134 /* Return true if RHS is a suitable operand for a widening multiplication,
2135    assuming a target type of TYPE.
2136    There are two cases:
2137
2138      - RHS makes some value at least twice as wide.  Store that value
2139        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2140
2141      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2142        but leave *TYPE_OUT untouched.  */
2143
2144 static bool
2145 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2146                         tree *new_rhs_out)
2147 {
2148   gimple stmt;
2149   tree type1, rhs1;
2150
2151   if (TREE_CODE (rhs) == SSA_NAME)
2152     {
2153       stmt = SSA_NAME_DEF_STMT (rhs);
2154       if (is_gimple_assign (stmt))
2155         {
2156           if (! widening_mult_conversion_strippable_p (type, stmt))
2157             rhs1 = rhs;
2158           else
2159             {
2160               rhs1 = gimple_assign_rhs1 (stmt);
2161
2162               if (TREE_CODE (rhs1) == INTEGER_CST)
2163                 {
2164                   *new_rhs_out = rhs1;
2165                   *type_out = NULL;
2166                   return true;
2167                 }
2168             }
2169         }
2170       else
2171         rhs1 = rhs;
2172
2173       type1 = TREE_TYPE (rhs1);
2174
2175       if (TREE_CODE (type1) != TREE_CODE (type)
2176           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2177         return false;
2178
2179       *new_rhs_out = rhs1;
2180       *type_out = type1;
2181       return true;
2182     }
2183
2184   if (TREE_CODE (rhs) == INTEGER_CST)
2185     {
2186       *new_rhs_out = rhs;
2187       *type_out = NULL;
2188       return true;
2189     }
2190
2191   return false;
2192 }
2193
2194 /* Return true if STMT performs a widening multiplication, assuming the
2195    output type is TYPE.  If so, store the unwidened types of the operands
2196    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2197    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2198    and *TYPE2_OUT would give the operands of the multiplication.  */
2199
2200 static bool
2201 is_widening_mult_p (gimple stmt,
2202                     tree *type1_out, tree *rhs1_out,
2203                     tree *type2_out, tree *rhs2_out)
2204 {
2205   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2206
2207   if (TREE_CODE (type) != INTEGER_TYPE
2208       && TREE_CODE (type) != FIXED_POINT_TYPE)
2209     return false;
2210
2211   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2212                                rhs1_out))
2213     return false;
2214
2215   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2216                                rhs2_out))
2217     return false;
2218
2219   if (*type1_out == NULL)
2220     {
2221       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2222         return false;
2223       *type1_out = *type2_out;
2224     }
2225
2226   if (*type2_out == NULL)
2227     {
2228       if (!int_fits_type_p (*rhs2_out, *type1_out))
2229         return false;
2230       *type2_out = *type1_out;
2231     }
2232
2233   /* Ensure that the larger of the two operands comes first. */
2234   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2235     {
2236       tree tmp;
2237       tmp = *type1_out;
2238       *type1_out = *type2_out;
2239       *type2_out = tmp;
2240       tmp = *rhs1_out;
2241       *rhs1_out = *rhs2_out;
2242       *rhs2_out = tmp;
2243     }
2244
2245   return true;
2246 }
2247
2248 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2249    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2250    value is true iff we converted the statement.  */
2251
2252 static bool
2253 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2254 {
2255   tree lhs, rhs1, rhs2, type, type1, type2;
2256   enum insn_code handler;
2257   enum machine_mode to_mode, from_mode, actual_mode;
2258   optab op;
2259   int actual_precision;
2260   location_t loc = gimple_location (stmt);
2261   bool from_unsigned1, from_unsigned2;
2262
2263   lhs = gimple_assign_lhs (stmt);
2264   type = TREE_TYPE (lhs);
2265   if (TREE_CODE (type) != INTEGER_TYPE)
2266     return false;
2267
2268   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2269     return false;
2270
2271   to_mode = TYPE_MODE (type);
2272   from_mode = TYPE_MODE (type1);
2273   from_unsigned1 = TYPE_UNSIGNED (type1);
2274   from_unsigned2 = TYPE_UNSIGNED (type2);
2275
2276   if (from_unsigned1 && from_unsigned2)
2277     op = umul_widen_optab;
2278   else if (!from_unsigned1 && !from_unsigned2)
2279     op = smul_widen_optab;
2280   else
2281     op = usmul_widen_optab;
2282
2283   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2284                                                   0, &actual_mode);
2285
2286   if (handler == CODE_FOR_nothing)
2287     {
2288       if (op != smul_widen_optab)
2289         {
2290           /* We can use a signed multiply with unsigned types as long as
2291              there is a wider mode to use, or it is the smaller of the two
2292              types that is unsigned.  Note that type1 >= type2, always.  */
2293           if ((TYPE_UNSIGNED (type1)
2294                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2295               || (TYPE_UNSIGNED (type2)
2296                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2297             {
2298               from_mode = GET_MODE_WIDER_MODE (from_mode);
2299               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2300                 return false;
2301             }
2302
2303           op = smul_widen_optab;
2304           handler = find_widening_optab_handler_and_mode (op, to_mode,
2305                                                           from_mode, 0,
2306                                                           &actual_mode);
2307
2308           if (handler == CODE_FOR_nothing)
2309             return false;
2310
2311           from_unsigned1 = from_unsigned2 = false;
2312         }
2313       else
2314         return false;
2315     }
2316
2317   /* Ensure that the inputs to the handler are in the correct precison
2318      for the opcode.  This will be the full mode size.  */
2319   actual_precision = GET_MODE_PRECISION (actual_mode);
2320   if (2 * actual_precision > TYPE_PRECISION (type))
2321     return false;
2322   if (actual_precision != TYPE_PRECISION (type1)
2323       || from_unsigned1 != TYPE_UNSIGNED (type1))
2324     rhs1 = build_and_insert_cast (gsi, loc,
2325                                   build_nonstandard_integer_type
2326                                     (actual_precision, from_unsigned1), rhs1);
2327   if (actual_precision != TYPE_PRECISION (type2)
2328       || from_unsigned2 != TYPE_UNSIGNED (type2))
2329     rhs2 = build_and_insert_cast (gsi, loc,
2330                                   build_nonstandard_integer_type
2331                                     (actual_precision, from_unsigned2), rhs2);
2332
2333   /* Handle constants.  */
2334   if (TREE_CODE (rhs1) == INTEGER_CST)
2335     rhs1 = fold_convert (type1, rhs1);
2336   if (TREE_CODE (rhs2) == INTEGER_CST)
2337     rhs2 = fold_convert (type2, rhs2);
2338
2339   gimple_assign_set_rhs1 (stmt, rhs1);
2340   gimple_assign_set_rhs2 (stmt, rhs2);
2341   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2342   update_stmt (stmt);
2343   widen_mul_stats.widen_mults_inserted++;
2344   return true;
2345 }
2346
/* Process a single gimple statement STMT, which is found at the
   iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
   rhs (given by CODE), and try to convert it into a
   WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
   is true iff we converted the statement.  */

static bool
convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
                            enum tree_code code)
{
  gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
  gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
  tree type, type1, type2, optype;
  tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
  enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
  optab this_optab;
  enum tree_code wmult_code;
  enum insn_code handler;
  enum machine_mode to_mode, from_mode, actual_mode;
  location_t loc = gimple_location (stmt);
  int actual_precision;
  bool from_unsigned1, from_unsigned2;

  lhs = gimple_assign_lhs (stmt);
  type = TREE_TYPE (lhs);
  if (TREE_CODE (type) != INTEGER_TYPE
      && TREE_CODE (type) != FIXED_POINT_TYPE)
    return false;

  if (code == MINUS_EXPR)
    wmult_code = WIDEN_MULT_MINUS_EXPR;
  else
    wmult_code = WIDEN_MULT_PLUS_EXPR;

  rhs1 = gimple_assign_rhs1 (stmt);
  rhs2 = gimple_assign_rhs2 (stmt);

  /* Record the defining statement of each SSA operand and, when that
     is an assignment, its rhs code.  The codes stay ERROR_MARK
     otherwise.  */
  if (TREE_CODE (rhs1) == SSA_NAME)
    {
      rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
      if (is_gimple_assign (rhs1_stmt))
        rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
    }

  if (TREE_CODE (rhs2) == SSA_NAME)
    {
      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
      if (is_gimple_assign (rhs2_stmt))
        rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
    }

  /* Allow for one conversion statement between the multiply
     and addition/subtraction statement.  If there are more than
     one conversions then we assume they would invalidate this
     transformation.  If that's not the case then they should have
     been folded before now.  */
  if (CONVERT_EXPR_CODE_P (rhs1_code))
    {
      conv1_stmt = rhs1_stmt;
      rhs1 = gimple_assign_rhs1 (rhs1_stmt);
      if (TREE_CODE (rhs1) == SSA_NAME)
        {
          rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
          /* If the new definition is not an assignment, rhs1_code
             keeps the conversion code, which matches neither of the
             multiply codes tested below.  */
          if (is_gimple_assign (rhs1_stmt))
            rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
        }
      else
        return false;
    }
  if (CONVERT_EXPR_CODE_P (rhs2_code))
    {
      conv2_stmt = rhs2_stmt;
      rhs2 = gimple_assign_rhs1 (rhs2_stmt);
      if (TREE_CODE (rhs2) == SSA_NAME)
        {
          rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
          if (is_gimple_assign (rhs2_stmt))
            rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
        }
      else
        return false;
    }

  /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
     is_widening_mult_p, but we still need the rhs returns.

     It might also appear that it would be sufficient to use the existing
     operands of the widening multiply, but that would limit the choice of
     multiply-and-accumulate instructions.

     If the widened-multiplication result has more than one uses, it is
     probably wiser not to do the conversion.

     The multiplication is taken from the first operand only for an
     addition; for a subtraction it has to be the second operand.  */
  if (code == PLUS_EXPR
      && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
    {
      if (!has_single_use (rhs1)
          || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;
      add_rhs = rhs2;
      conv_stmt = conv1_stmt;
    }
  else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
    {
      if (!has_single_use (rhs2)
          || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;
      add_rhs = rhs1;
      conv_stmt = conv2_stmt;
    }
  else
    return false;

  to_mode = TYPE_MODE (type);
  from_mode = TYPE_MODE (type1);
  from_unsigned1 = TYPE_UNSIGNED (type1);
  from_unsigned2 = TYPE_UNSIGNED (type2);
  optype = type1;

  /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
  if (from_unsigned1 != from_unsigned2)
    {
      if (!INTEGRAL_TYPE_P (type))
        return false;
      /* We can use a signed multiply with unsigned types as long as
         there is a wider mode to use, or it is the smaller of the two
         types that is unsigned.  Note that type1 >= type2, always.  */
      if ((from_unsigned1
           && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
          || (from_unsigned2
              && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
        {
          from_mode = GET_MODE_WIDER_MODE (from_mode);
          if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
            return false;
        }

      /* Both operands are treated as signed from here on, and the
         optab is looked up for a signed type of FROM_MODE's
         precision.  */
      from_unsigned1 = from_unsigned2 = false;
      optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
                                               false);
    }

  /* If there was a conversion between the multiply and addition
     then we need to make sure it fits a multiply-and-accumulate.
     There should be a single mode change which does not change the
     value.  */
  if (conv_stmt)
    {
      /* We use the original, unmodified data types for this.  */
      tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
      tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
      /* Number of bits the full product of the two operands can
         occupy.  */
      int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
      bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);

      if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
        {
          /* Conversion is a truncate.  */
          if (TYPE_PRECISION (to_type) < data_size)
            return false;
        }
      else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
        {
          /* Conversion is an extend.  Check it's the right sort.  */
          if (TYPE_UNSIGNED (from_type) != is_unsigned
              && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
            return false;
        }
      /* else convert is a no-op for our purposes.  */
    }

  /* Verify that the machine can perform a widening multiply
     accumulate in this mode/signedness combination, otherwise
     this transformation is likely to pessimize code.  */
  this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
  handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
                                                  from_mode, 0, &actual_mode);

  if (handler == CODE_FOR_nothing)
    return false;

  /* Ensure that the inputs to the handler are in the correct precision
     for the opcode.  This will be the full mode size.  */
  actual_precision = GET_MODE_PRECISION (actual_mode);
  if (actual_precision != TYPE_PRECISION (type1)
      || from_unsigned1 != TYPE_UNSIGNED (type1))
    mult_rhs1 = build_and_insert_cast (gsi, loc,
                                       build_nonstandard_integer_type
                                         (actual_precision, from_unsigned1),
                                       mult_rhs1);
  if (actual_precision != TYPE_PRECISION (type2)
      || from_unsigned2 != TYPE_UNSIGNED (type2))
    mult_rhs2 = build_and_insert_cast (gsi, loc,
                                       build_nonstandard_integer_type
                                         (actual_precision, from_unsigned2),
                                       mult_rhs2);

  /* The accumulator operand must have the type of the result.  */
  if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
    add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);

  /* Handle constants.  */
  if (TREE_CODE (mult_rhs1) == INTEGER_CST)
    mult_rhs1 = fold_convert (type1, mult_rhs1);
  if (TREE_CODE (mult_rhs2) == INTEGER_CST)
    mult_rhs2 = fold_convert (type2, mult_rhs2);

  /* Replace the rhs of the statement at GSI with the fused operation
     and update whatever statement GSI refers to afterwards.  */
  gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
                                    add_rhs);
  update_stmt (gsi_stmt (*gsi));
  widen_mul_stats.maccs_inserted++;
  return true;
}
2559
2560 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2561    with uses in additions and subtractions to form fused multiply-add
2562    operations.  Returns true if successful and MUL_STMT should be removed.  */
2563
2564 static bool
2565 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2566 {
2567   tree mul_result = gimple_get_lhs (mul_stmt);
2568   tree type = TREE_TYPE (mul_result);
2569   gimple use_stmt, neguse_stmt, fma_stmt;
2570   use_operand_p use_p;
2571   imm_use_iterator imm_iter;
2572
2573   if (FLOAT_TYPE_P (type)
2574       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2575     return false;
2576
2577   /* We don't want to do bitfield reduction ops.  */
2578   if (INTEGRAL_TYPE_P (type)
2579       && (TYPE_PRECISION (type)
2580           != GET_MODE_PRECISION (TYPE_MODE (type))))
2581     return false;
2582
2583   /* If the target doesn't support it, don't generate it.  We assume that
2584      if fma isn't available then fms, fnma or fnms are not either.  */
2585   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2586     return false;
2587
2588   /* If the multiplication has zero uses, it is kept around probably because
2589      of -fnon-call-exceptions.  Don't optimize it away in that case,
2590      it is DCE job.  */
2591   if (has_zero_uses (mul_result))
2592     return false;
2593
2594   /* Make sure that the multiplication statement becomes dead after
2595      the transformation, thus that all uses are transformed to FMAs.
2596      This means we assume that an FMA operation has the same cost
2597      as an addition.  */
2598   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2599     {
2600       enum tree_code use_code;
2601       tree result = mul_result;
2602       bool negate_p = false;
2603
2604       use_stmt = USE_STMT (use_p);
2605
2606       if (is_gimple_debug (use_stmt))
2607         continue;
2608
2609       /* For now restrict this operations to single basic blocks.  In theory
2610          we would want to support sinking the multiplication in
2611          m = a*b;
2612          if ()
2613            ma = m + c;
2614          else
2615            d = m;
2616          to form a fma in the then block and sink the multiplication to the
2617          else block.  */
2618       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2619         return false;
2620
2621       if (!is_gimple_assign (use_stmt))
2622         return false;
2623
2624       use_code = gimple_assign_rhs_code (use_stmt);
2625
2626       /* A negate on the multiplication leads to FNMA.  */
2627       if (use_code == NEGATE_EXPR)
2628         {
2629           ssa_op_iter iter;
2630           use_operand_p usep;
2631
2632           result = gimple_assign_lhs (use_stmt);
2633
2634           /* Make sure the negate statement becomes dead with this
2635              single transformation.  */
2636           if (!single_imm_use (gimple_assign_lhs (use_stmt),
2637                                &use_p, &neguse_stmt))
2638             return false;
2639
2640           /* Make sure the multiplication isn't also used on that stmt.  */
2641           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2642             if (USE_FROM_PTR (usep) == mul_result)
2643               return false;
2644
2645           /* Re-validate.  */
2646           use_stmt = neguse_stmt;
2647           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2648             return false;
2649           if (!is_gimple_assign (use_stmt))
2650             return false;
2651
2652           use_code = gimple_assign_rhs_code (use_stmt);
2653           negate_p = true;
2654         }
2655
2656       switch (use_code)
2657         {
2658         case MINUS_EXPR:
2659           if (gimple_assign_rhs2 (use_stmt) == result)
2660             negate_p = !negate_p;
2661           break;
2662         case PLUS_EXPR:
2663           break;
2664         default:
2665           /* FMA can only be formed from PLUS and MINUS.  */
2666           return false;
2667         }
2668
2669       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
2670          by a MULT_EXPR that we'll visit later, we might be able to
2671          get a more profitable match with fnma.
2672          OTOH, if we don't, a negate / fma pair has likely lower latency
2673          that a mult / subtract pair.  */
2674       if (use_code == MINUS_EXPR && !negate_p
2675           && gimple_assign_rhs1 (use_stmt) == result
2676           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
2677           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
2678         {
2679           tree rhs2 = gimple_assign_rhs2 (use_stmt);
2680
2681           if (TREE_CODE (rhs2) == SSA_NAME)
2682             {
2683               gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
2684               if (has_single_use (rhs2)
2685                   && is_gimple_assign (stmt2)
2686                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
2687               return false;
2688             }
2689         }
2690
2691       /* We can't handle a * b + a * b.  */
2692       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2693         return false;
2694
2695       /* While it is possible to validate whether or not the exact form
2696          that we've recognized is available in the backend, the assumption
2697          is that the transformation is never a loss.  For instance, suppose
2698          the target only has the plain FMA pattern available.  Consider
2699          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2700          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
2701          still have 3 operations, but in the FMA form the two NEGs are
2702          independent and could be run in parallel.  */
2703     }
2704
2705   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2706     {
2707       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2708       enum tree_code use_code;
2709       tree addop, mulop1 = op1, result = mul_result;
2710       bool negate_p = false;
2711
2712       if (is_gimple_debug (use_stmt))
2713         continue;
2714
2715       use_code = gimple_assign_rhs_code (use_stmt);
2716       if (use_code == NEGATE_EXPR)
2717         {
2718           result = gimple_assign_lhs (use_stmt);
2719           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2720           gsi_remove (&gsi, true);
2721           release_defs (use_stmt);
2722
2723           use_stmt = neguse_stmt;
2724           gsi = gsi_for_stmt (use_stmt);
2725           use_code = gimple_assign_rhs_code (use_stmt);
2726           negate_p = true;
2727         }
2728
2729       if (gimple_assign_rhs1 (use_stmt) == result)
2730         {
2731           addop = gimple_assign_rhs2 (use_stmt);
2732           /* a * b - c -> a * b + (-c)  */
2733           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2734             addop = force_gimple_operand_gsi (&gsi,
2735                                               build1 (NEGATE_EXPR,
2736                                                       type, addop),
2737                                               true, NULL_TREE, true,
2738                                               GSI_SAME_STMT);
2739         }
2740       else
2741         {
2742           addop = gimple_assign_rhs1 (use_stmt);
2743           /* a - b * c -> (-b) * c + a */
2744           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2745             negate_p = !negate_p;
2746         }
2747
2748       if (negate_p)
2749         mulop1 = force_gimple_operand_gsi (&gsi,
2750                                            build1 (NEGATE_EXPR,
2751                                                    type, mulop1),
2752                                            true, NULL_TREE, true,
2753                                            GSI_SAME_STMT);
2754
2755       fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
2756                                                gimple_assign_lhs (use_stmt),
2757                                                mulop1, op2,
2758                                                addop);
2759       gsi_replace (&gsi, fma_stmt, true);
2760       widen_mul_stats.fmas_inserted++;
2761     }
2762
2763   return true;
2764 }
2765
2766 /* Find integer multiplications where the operands are extended from
2767    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2768    where appropriate.  */
2769
2770 static unsigned int
2771 execute_optimize_widening_mul (void)
2772 {
2773   basic_block bb;
2774   bool cfg_changed = false;
2775
2776   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2777
2778   FOR_EACH_BB (bb)
2779     {
2780       gimple_stmt_iterator gsi;
2781
2782       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2783         {
2784           gimple stmt = gsi_stmt (gsi);
2785           enum tree_code code;
2786
2787           if (is_gimple_assign (stmt))
2788             {
2789               code = gimple_assign_rhs_code (stmt);
2790               switch (code)
2791                 {
2792                 case MULT_EXPR:
2793                   if (!convert_mult_to_widen (stmt, &gsi)
2794                       && convert_mult_to_fma (stmt,
2795                                               gimple_assign_rhs1 (stmt),
2796                                               gimple_assign_rhs2 (stmt)))
2797                     {
2798                       gsi_remove (&gsi, true);
2799                       release_defs (stmt);
2800                       continue;
2801                     }
2802                   break;
2803
2804                 case PLUS_EXPR:
2805                 case MINUS_EXPR:
2806                   convert_plusminus_to_widen (&gsi, stmt, code);
2807                   break;
2808
2809                 default:;
2810                 }
2811             }
2812           else if (is_gimple_call (stmt)
2813                    && gimple_call_lhs (stmt))
2814             {
2815               tree fndecl = gimple_call_fndecl (stmt);
2816               if (fndecl
2817                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2818                 {
2819                   switch (DECL_FUNCTION_CODE (fndecl))
2820                     {
2821                       case BUILT_IN_POWF:
2822                       case BUILT_IN_POW:
2823                       case BUILT_IN_POWL:
2824                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2825                             && REAL_VALUES_EQUAL
2826                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2827                                   dconst2)
2828                             && convert_mult_to_fma (stmt,
2829                                                     gimple_call_arg (stmt, 0),
2830                                                     gimple_call_arg (stmt, 0)))
2831                           {
2832                             unlink_stmt_vdef (stmt);
2833                             if (gsi_remove (&gsi, true)
2834                                 && gimple_purge_dead_eh_edges (bb))
2835                               cfg_changed = true;
2836                             release_defs (stmt);
2837                             continue;
2838                           }
2839                           break;
2840
2841                       default:;
2842                     }
2843                 }
2844             }
2845           gsi_next (&gsi);
2846         }
2847     }
2848
2849   statistics_counter_event (cfun, "widening multiplications inserted",
2850                             widen_mul_stats.widen_mults_inserted);
2851   statistics_counter_event (cfun, "widening maccs inserted",
2852                             widen_mul_stats.maccs_inserted);
2853   statistics_counter_event (cfun, "fused multiply-adds inserted",
2854                             widen_mul_stats.fmas_inserted);
2855
2856   return cfg_changed ? TODO_cleanup_cfg : 0;
2857 }
2858
2859 static bool
2860 gate_optimize_widening_mul (void)
2861 {
2862   return flag_expensive_optimizations && optimize;
2863 }
2864
2865 namespace {
2866
2867 const pass_data pass_data_optimize_widening_mul =
2868 {
2869   GIMPLE_PASS, /* type */
2870   "widening_mul", /* name */
2871   OPTGROUP_NONE, /* optinfo_flags */
2872   true, /* has_gate */
2873   true, /* has_execute */
2874   TV_NONE, /* tv_id */
2875   PROP_ssa, /* properties_required */
2876   0, /* properties_provided */
2877   0, /* properties_destroyed */
2878   0, /* todo_flags_start */
2879   ( TODO_verify_ssa | TODO_verify_stmts
2880     | TODO_update_ssa ), /* todo_flags_finish */
2881 };
2882
2883 class pass_optimize_widening_mul : public gimple_opt_pass
2884 {
2885 public:
2886   pass_optimize_widening_mul (gcc::context *ctxt)
2887     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
2888   {}
2889
2890   /* opt_pass methods: */
2891   bool gate () { return gate_optimize_widening_mul (); }
2892   unsigned int execute () { return execute_optimize_widening_mul (); }
2893
2894 }; // class pass_optimize_widening_mul
2895
2896 } // anon namespace
2897
2898 gimple_opt_pass *
2899 make_pass_optimize_widening_mul (gcc::context *ctxt)
2900 {
2901   return new pass_optimize_widening_mul (ctxt);
2902 }