/* Global, SSA-based optimizations using mathematical identities.
   Copyright (C) 2005-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
/* Currently, the first mini-pass in this file tries to CSE reciprocal
   operations.  These are common in sequences such as this one:

	modulus = sqrt(x*x + y*y + z*z);
	x = x / modulus;
	y = y / modulus;
	z = z / modulus;

   that can be optimized to

	modulus = sqrt(x*x + y*y + z*z);
	rmodulus = 1.0 / modulus;
	x = x * rmodulus;
	y = y * rmodulus;
	z = z * rmodulus;

   We do this for loop invariant divisors, and with this pass whenever
   we notice that a division has the same divisor multiple times.

   Of course, like in PRE, we don't insert a division if a dominator
   already has one.  However, this cannot be done as an extension of
   PRE for several reasons.

   First of all, experiments showed that the transformation is not
   always useful if there are only two divisions by the same divisor.
   This is probably because modern processors can pipeline the
   divisions; on older, in-order processors it should still be
   effective to optimize two divisions by the same number.
   We make this a param, and it shall be called N in the remainder of
   this comment.

   Second, if trapping math is active, we have less freedom on where
   to insert divisions: we can only do so in basic blocks that already
   contain one.  (If divisions don't trap, instead, we can insert
   divisions elsewhere, which will be in blocks that are common dominators
   of those that have the division).

   We really don't want to compute the reciprocal unless a division will
   be found.  To do this, we won't insert the division in a basic block
   that has less than N divisions *post-dominating* it.

   The algorithm constructs a subset of the dominator tree, holding the
   blocks containing the divisions and the common dominators to them,
   and walks it twice.  The first walk is in post-order, and it annotates
   each block with the number of divisions that post-dominate it: this
   gives information on where divisions can be inserted profitably.
   The second walk is in pre-order, and it inserts divisions as explained
   above, and replaces divisions by multiplications.

   In the best case, the cost of the pass is O(n_statements).  In the
   worst-case, the cost is due to creating the dominator tree subset,
   with a cost of O(n_basic_blocks ^ 2); however this can only happen
   for n_statements / n_basic_blocks statements.  So, the amortized cost
   of creating the dominator tree subset is O(n_basic_blocks) and the
   worst-case cost of the pass is O(n_statements * n_basic_blocks).

   More practically, the cost will be small because there are few
   divisions, and they tend to be in the same basic block, so insert_bb
   is called very few times.

   If we did this using domwalk.c, an efficient implementation would have
   to work on all the variables in a single pass, because we could not
   work on just a subset of the dominator tree, as we do now, and the
   cost would also be something like O(n_statements * n_basic_blocks).
   The data structures would be more complex in order to work on all the
   variables in a single pass.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "alloc-pool.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "gimple-pretty-print.h"
#include "alias.h"
#include "fold-const.h"
#include "gimple-fold.h"
#include "gimple-iterator.h"
#include "gimplify.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-dfa.h"
#include "tree-ssa.h"
#include "builtins.h"
#include "internal-fn.h"
#include "case-cfn-macros.h"
#include "optabs-libfuncs.h"
#include "tree-eh.h"
#include "targhooks.h"
#include "domwalk.h"
/* This structure represents one basic block that either computes a
   division, or is a common dominator for basic blocks that compute a
   division.  */
struct occurrence {
  /* The basic block represented by this structure.  */
  basic_block bb = basic_block();

  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
     inserted in BB.  */
  tree recip_def = tree();

  /* If non-NULL, the SSA_NAME holding the definition for a squared
     reciprocal inserted in BB.  */
  tree square_recip_def = tree();

  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
     was inserted in BB.  */
  gimple *recip_def_stmt = nullptr;

  /* Pointer to a list of "struct occurrence"s for blocks dominated
     by BB.  */
  struct occurrence *children = nullptr;

  /* Pointer to the next "struct occurrence"s in the list of blocks
     sharing a common dominator.  */
  struct occurrence *next = nullptr;

  /* The number of divisions that are in BB before compute_merit.  The
     number of divisions that are in BB or post-dominate it after
     compute_merit.  */
  int num_divisions = 0;

  /* True if the basic block has a division, false if it is a common
     dominator for basic blocks that do.  If it is false and trapping
     math is active, BB is not a candidate for inserting a reciprocal.  */
  bool bb_has_division = false;

  /* Construct a struct occurrence for basic block BB, and whose
     children list is headed by CHILDREN.  */
  occurrence (basic_block bb, struct occurrence *children)
  : bb (bb), children (children)
  {
    bb->aux = this;
  }

  /* Destroy a struct occurrence and remove it from its basic block.  */
  ~occurrence ()
  {
    bb->aux = nullptr;
  }

  /* Allocate memory for a struct occurrence from OCC_POOL.  */
  static void* operator new (size_t);

  /* Return memory for a struct occurrence to OCC_POOL.  */
  static void operator delete (void*, size_t);
};
static struct
{
  /* Number of 1.0/X ops inserted.  */
  int rdivs_inserted;

  /* Number of 1.0/FUNC ops inserted.  */
  int rfuncs_inserted;
} reciprocal_stats;

static struct
{
  /* Number of cexpi calls inserted.  */
  int inserted;
} sincos_stats;

static struct
{
  /* Number of widening multiplication ops inserted.  */
  int widen_mults_inserted;

  /* Number of integer multiply-and-accumulate ops inserted.  */
  int maccs_inserted;

  /* Number of fp fused multiply-add ops inserted.  */
  int fmas_inserted;

  /* Number of divmod calls inserted.  */
  int divmod_calls_inserted;
} widen_mul_stats;
/* The instance of "struct occurrence" representing the highest
   interesting block in the dominator tree.  */
static struct occurrence *occ_head;

/* Allocation pool for getting instances of "struct occurrence".  */
static object_allocator<occurrence> *occ_pool;

void* occurrence::operator new (size_t n)
{
  gcc_assert (n == sizeof(occurrence));
  return occ_pool->allocate_raw ();
}

void occurrence::operator delete (void *occ, size_t n)
{
  gcc_assert (n == sizeof(occurrence));
  occ_pool->remove_raw (occ);
}
/* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
   list of "struct occurrence"s, one per basic block, having IDOM as
   their common dominator.

   We try to insert NEW_OCC as deep as possible in the tree, and we also
   insert any other block that is a common dominator for BB and one
   block already in the tree.  */

static void
insert_bb (struct occurrence *new_occ, basic_block idom,
	   struct occurrence **p_head)
{
  struct occurrence *occ, **p_occ;

  for (p_occ = p_head; (occ = *p_occ) != NULL; )
    {
      basic_block bb = new_occ->bb, occ_bb = occ->bb;
      basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
      if (dom == bb)
	{
	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
	     from its list.  */
	  *p_occ = occ->next;
	  occ->next = new_occ->children;
	  new_occ->children = occ;

	  /* Try the next block (it may as well be dominated by BB).  */
	}

      else if (dom == occ_bb)
	{
	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
	  insert_bb (new_occ, dom, &occ->children);
	  return;
	}

      else if (dom != idom)
	{
	  gcc_assert (!dom->aux);

	  /* There is a dominator between IDOM and BB, add it and make
	     two children out of NEW_OCC and OCC.  First, remove OCC from
	     its list.  */
	  *p_occ = occ->next;
	  new_occ->next = occ;
	  occ->next = NULL;

	  /* None of the previous blocks has DOM as a dominator: if we tail
	     recursed, we would reexamine them uselessly.  Just switch BB with
	     DOM, and go on looking for blocks dominated by DOM.  */
	  new_occ = new occurrence (dom, new_occ);
	}

      else
	{
	  /* Nothing special, go on with the next element.  */
	  p_occ = &occ->next;
	}
    }

  /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
  new_occ->next = *p_head;
  *p_head = new_occ;
}
/* Register that we found a division in BB.
   IMPORTANCE is a measure of how much weighting to give
   that division.  Use IMPORTANCE = 2 to register a single
   division.  If the division is going to be found multiple
   times use 1 (as it is with squares).  */

static inline void
register_division_in (basic_block bb, int importance)
{
  struct occurrence *occ;

  occ = (struct occurrence *) bb->aux;
  if (!occ)
    {
      occ = new occurrence (bb, NULL);
      insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
    }

  occ->bb_has_division = true;
  occ->num_divisions += importance;
}
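/* For example (an illustrative reading of the callers, not a quote from
   them): a single statement "t = a / x" in BB is registered once with
   IMPORTANCE == 2, while a division by a square, "t = a / (x * x)", is
   registered twice with IMPORTANCE == 1.  Either way BB's num_divisions
   ends up at 2 per division, which is why insert_reciprocals divides the
   count by two before comparing it against the target's threshold.  */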
/* Compute the number of divisions that postdominate each block in OCC and
   its children.  */

static void
compute_merit (struct occurrence *occ)
{
  struct occurrence *occ_child;
  basic_block dom = occ->bb;

  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    {
      basic_block bb;
      if (occ_child->children)
	compute_merit (occ_child);

      if (flag_exceptions)
	bb = single_noncomplex_succ (dom);
      else
	bb = dom;

      if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
	occ->num_divisions += occ_child->num_divisions;
    }
}
/* Return whether USE_STMT is a floating-point division by DEF.  */
static inline bool
is_division_by (gimple *use_stmt, tree def)
{
  return is_gimple_assign (use_stmt)
	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
	 && gimple_assign_rhs2 (use_stmt) == def
	 /* Do not recognize x / x as valid division, as we are getting
	    confused later by replacing all immediate uses x in such
	    a stmt.  */
	 && gimple_assign_rhs1 (use_stmt) != def
	 && !stmt_can_throw_internal (cfun, use_stmt);
}
/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
static inline bool
is_mult_by (gimple *use_stmt, tree def, tree a)
{
  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
    {
      tree op0 = gimple_assign_rhs1 (use_stmt);
      tree op1 = gimple_assign_rhs2 (use_stmt);

      return (op0 == def && op1 == a)
	      || (op0 == a && op1 == def);
    }
  return false;
}

/* Return whether USE_STMT is DEF * DEF.  */
static inline bool
is_square_of (gimple *use_stmt, tree def)
{
  return is_mult_by (use_stmt, def, def);
}

/* Return whether USE_STMT is a floating-point division by
   DEF * DEF.  */
static inline bool
is_division_by_square (gimple *use_stmt, tree def)
{
  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
      && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
      && gimple_assign_rhs1 (use_stmt) != gimple_assign_rhs2 (use_stmt)
      && !stmt_can_throw_internal (cfun, use_stmt))
    {
      tree denominator = gimple_assign_rhs2 (use_stmt);
      if (TREE_CODE (denominator) == SSA_NAME)
	return is_square_of (SSA_NAME_DEF_STMT (denominator), def);
    }
  return false;
}
/* Walk the subset of the dominator tree rooted at OCC, setting the
   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
   the given basic block.  The field may be left NULL, of course,
   if it is not possible or profitable to do the optimization.

   DEF_GSI is an iterator pointing at the statement defining DEF.
   If RECIP_DEF is set, a dominator already has a computation that can
   be used.

   If SHOULD_INSERT_SQUARE_RECIP is set, then this also inserts
   the square of the reciprocal immediately after the definition
   of the reciprocal.  */

static void
insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
		    tree def, tree recip_def, tree square_recip_def,
		    int should_insert_square_recip, int threshold)
{
  tree type;
  gassign *new_stmt, *new_square_stmt;
  gimple_stmt_iterator gsi;
  struct occurrence *occ_child;

  if (!recip_def
      && (occ->bb_has_division || !flag_trapping_math)
      /* Divide by two as all divisions are counted twice in
	 the costing loop.  */
      && occ->num_divisions / 2 >= threshold)
    {
      /* Make a variable with the replacement and substitute it.  */
      type = TREE_TYPE (def);
      recip_def = create_tmp_reg (type, "reciptmp");
      new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
				      build_one_cst (type), def);

      if (should_insert_square_recip)
	{
	  square_recip_def = create_tmp_reg (type, "powmult_reciptmp");
	  new_square_stmt = gimple_build_assign (square_recip_def, MULT_EXPR,
						 recip_def, recip_def);
	}

      if (occ->bb_has_division)
	{
	  /* Case 1: insert before an existing division.  */
	  gsi = gsi_after_labels (occ->bb);
	  while (!gsi_end_p (gsi)
		 && !is_division_by (gsi_stmt (gsi), def)
		 && !is_division_by_square (gsi_stmt (gsi), def))
	    gsi_next (&gsi);

	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
	}
      else if (def_gsi && occ->bb == gsi_bb (*def_gsi))
	{
	  /* Case 2: insert right after the definition.  Note that this will
	     never happen if the definition statement can throw, because in
	     that case the sole successor of the statement's basic block will
	     dominate all the uses as well.  */
	  gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_after (def_gsi, new_square_stmt, GSI_NEW_STMT);
	}
      else
	{
	  /* Case 3: insert in a basic block not containing defs/uses.  */
	  gsi = gsi_after_labels (occ->bb);
	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
	}

      reciprocal_stats.rdivs_inserted++;

      occ->recip_def_stmt = new_stmt;
    }

  occ->recip_def = recip_def;
  occ->square_recip_def = square_recip_def;
  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    insert_reciprocals (def_gsi, occ_child, def, recip_def,
			square_recip_def, should_insert_square_recip,
			threshold);
}
/* Replace occurrences of expr / (x * x) with expr * ((1 / x) * (1 / x)).
   Take as argument the use for (x * x).  */
static inline void
replace_reciprocal_squares (use_operand_p use_p)
{
  gimple *use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb) && occ->square_recip_def
      && occ->recip_def)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      gimple_assign_set_rhs2 (use_stmt, occ->square_recip_def);
      SET_USE (use_p, occ->square_recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}

/* Replace the division at USE_P with a multiplication by the reciprocal, if
   possible.  */

static inline void
replace_reciprocal (use_operand_p use_p)
{
  gimple *use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb)
      && occ->recip_def && use_stmt != occ->recip_def_stmt)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      SET_USE (use_p, occ->recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}
/* Free OCC and return one more "struct occurrence" to be freed.  */

static struct occurrence *
free_bb (struct occurrence *occ)
{
  struct occurrence *child, *next;

  /* First get the two pointers hanging off OCC.  */
  next = occ->next;
  child = occ->children;
  delete occ;

  /* Now ensure that we don't recurse unless it is necessary.  */
  if (!child)
    return next;
  else
    {
      while (next)
	next = free_bb (next);

      return child;
    }
}
/* Transform sequences like
     t = sqrt (a)
     x = 1.0 / t;
     r1 = x * x;
     r2 = a * x;
   into:
     t = sqrt (a)
     r1 = 1.0 / a;
     r2 = t;
     x = r1 * r2;
   depending on the uses of x, r1, r2.  This removes one multiplication and
   allows the sqrt and division operations to execute in parallel.
   DEF_GSI is the gsi of the initial division by sqrt that defines
   DEF (x in the example above).  */

static void
optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
{
  gimple *use_stmt;
  imm_use_iterator use_iter;
  gimple *stmt = gsi_stmt (*def_gsi);
  tree x = def;
  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
  tree div_rhs1 = gimple_assign_rhs1 (stmt);

  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
      || TREE_CODE (div_rhs1) != REAL_CST
      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
    return;

  gcall *sqrt_stmt
    = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));

  if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
    return;

  switch (gimple_call_combined_fn (sqrt_stmt))
    {
    CASE_CFN_SQRT:
    CASE_CFN_SQRT_FN:
      break;

    default:
      return;
    }
  tree a = gimple_call_arg (sqrt_stmt, 0);

  /* We have 'a' and 'x'.  Now analyze the uses of 'x'.  */

  /* Statements that use x in x * x.  */
  auto_vec<gimple *> sqr_stmts;
  /* Statements that use x in a * x.  */
  auto_vec<gimple *> mult_stmts;
  bool has_other_use = false;
  bool mult_on_main_path = false;

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
    {
      if (is_gimple_debug (use_stmt))
	continue;
      if (is_square_of (use_stmt, x))
	{
	  sqr_stmts.safe_push (use_stmt);
	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
	    mult_on_main_path = true;
	}
      else if (is_mult_by (use_stmt, x, a))
	{
	  mult_stmts.safe_push (use_stmt);
	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
	    mult_on_main_path = true;
	}
      else
	has_other_use = true;
    }

  /* In the x * x and a * x cases we just rewire stmt operands or
     remove multiplications.  In the has_other_use case we introduce
     a multiplication so make sure we don't introduce a multiplication
     on a path where there was none.  */
  if (has_other_use && !mult_on_main_path)
    return;

  if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
    return;

  /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
     to be able to compose it from the sqr and mult cases.  */
  if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
    return;

  if (dump_file)
    {
      fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
      print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
      print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
      fprintf (dump_file, "\n");
    }

  bool delete_div = !has_other_use;
  tree sqr_ssa_name = NULL_TREE;
  if (!sqr_stmts.is_empty ())
    {
      /* r1 = x * x.  Transform the original
	 x = 1.0 / t
	 into
	 tmp1 = 1.0 / a
	 r1 = tmp1.  */

      sqr_ssa_name
	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");

      if (dump_file)
	{
	  fprintf (dump_file, "Replacing original division\n");
	  print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
	  fprintf (dump_file, "with new division\n");
	}
      stmt
	= gimple_build_assign (sqr_ssa_name, gimple_assign_rhs_code (stmt),
			       gimple_assign_rhs1 (stmt), a);
      gsi_insert_before (def_gsi, stmt, GSI_SAME_STMT);
      gsi_remove (def_gsi, true);
      *def_gsi = gsi_for_stmt (stmt);
      fold_stmt_inplace (def_gsi);
      update_stmt (stmt);

      if (dump_file)
	print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);

      delete_div = false;
      gimple *sqr_stmt;
      unsigned int i;
      FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
	{
	  gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
	  gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
	  update_stmt (sqr_stmt);
	}
    }
  if (!mult_stmts.is_empty ())
    {
      /* r2 = a * x.  Transform this into:
	 r2 = t (the original sqrt (a)).  */
      unsigned int i;
      gimple *mult_stmt = NULL;
      FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
	{
	  gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);

	  if (dump_file)
	    {
	      fprintf (dump_file, "Replacing squaring multiplication\n");
	      print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
	      fprintf (dump_file, "with assignment\n");
	    }
	  gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
	  fold_stmt_inplace (&gsi2);
	  update_stmt (mult_stmt);
	  if (dump_file)
	    print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
	}
    }

  if (has_other_use)
    {
      /* Using the two temporaries tmp1, tmp2 from above
	 the original x is now:
	 x = tmp1 * tmp2.  */
      gcc_assert (orig_sqrt_ssa_name);
      gcc_assert (sqr_ssa_name);

      gimple *new_stmt
	= gimple_build_assign (x, MULT_EXPR,
			       orig_sqrt_ssa_name, sqr_ssa_name);
      gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
      update_stmt (stmt);
    }
  else if (delete_div)
    {
      /* Remove the original division.  */
      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
      gsi_remove (&gsi2, true);
      release_defs (stmt);
    }
  else
    release_ssa_name (x);
}
/* Look for floating-point divisions among DEF's uses, and try to
   replace them by multiplications with the reciprocal.  Add
   as many statements computing the reciprocal as needed.

   DEF must be a GIMPLE register of a floating-point type.  */
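/* As an illustration (a sketch of the intended effect when the target's
   threshold is met, not literal pass output), for

     d = x * x;
     t0 = a / d;
     t1 = b / d;
     t2 = c / x;

   the pass can insert "reciptmp = 1.0 / x" and
   "powmult_reciptmp = reciptmp * reciptmp" and rewrite the divisions as

     t0 = a * powmult_reciptmp;
     t1 = b * powmult_reciptmp;
     t2 = c * reciptmp;

   so that only one division remains.  */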
static void
execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
{
  use_operand_p use_p, square_use_p;
  imm_use_iterator use_iter, square_use_iter;
  tree square_def;
  struct occurrence *occ;
  int count = 0;
  int threshold;
  int square_recip_count = 0;
  int sqrt_recip_count = 0;

  gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
  threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));

  /* If DEF is a square (x * x), count the number of divisions by x.
     If there are more divisions by x than by (DEF * DEF), prefer to optimize
     the reciprocal of x instead of DEF.  This improves cases like:
       def = x * x
       t0 = a / def
       t1 = b / def
       t2 = c / x
     Reciprocal optimization of x results in 1 division rather than 2 or 3.  */
  gimple *def_stmt = SSA_NAME_DEF_STMT (def);

  if (is_gimple_assign (def_stmt)
      && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
      && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
      && gimple_assign_rhs1 (def_stmt) == gimple_assign_rhs2 (def_stmt))
    {
      tree op0 = gimple_assign_rhs1 (def_stmt);

      FOR_EACH_IMM_USE_FAST (use_p, use_iter, op0)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_division_by (use_stmt, op0))
	    sqrt_recip_count++;
	}
    }

  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_division_by (use_stmt, def))
	{
	  register_division_in (gimple_bb (use_stmt), 2);
	  count++;
	}

      if (is_square_of (use_stmt, def))
	{
	  square_def = gimple_assign_lhs (use_stmt);
	  FOR_EACH_IMM_USE_FAST (square_use_p, square_use_iter, square_def)
	    {
	      gimple *square_use_stmt = USE_STMT (square_use_p);
	      if (is_division_by (square_use_stmt, square_def))
		{
		  /* This is executed twice for each division by a square.  */
		  register_division_in (gimple_bb (square_use_stmt), 1);
		  square_recip_count++;
		}
	    }
	}
    }

  /* Square reciprocals were counted twice above.  */
  square_recip_count /= 2;

  /* If it is more profitable to optimize 1 / x, don't optimize 1 / (x * x).  */
  if (sqrt_recip_count > square_recip_count)
    goto out;

  /* Do the expensive part only if we can hope to optimize something.  */
  if (count + square_recip_count >= threshold && count >= 1)
    {
      gimple *use_stmt;
      for (occ = occ_head; occ; occ = occ->next)
	{
	  compute_merit (occ);
	  insert_reciprocals (def_gsi, occ, def, NULL, NULL,
			      square_recip_count, threshold);
	}

      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
	{
	  if (is_division_by (use_stmt, def))
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
		replace_reciprocal (use_p);
	    }
	  else if (square_recip_count > 0 && is_square_of (use_stmt, def))
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
		{
		  /* Find all uses of the square that are divisions and
		     replace them by multiplications with the inverse.  */
		  imm_use_iterator square_iterator;
		  gimple *powmult_use_stmt = USE_STMT (use_p);
		  tree powmult_def_name = gimple_assign_lhs (powmult_use_stmt);

		  FOR_EACH_IMM_USE_STMT (powmult_use_stmt,
					 square_iterator, powmult_def_name)
		    FOR_EACH_IMM_USE_ON_STMT (square_use_p, square_iterator)
		      {
			gimple *powmult_use_stmt = USE_STMT (square_use_p);
			if (is_division_by (powmult_use_stmt, powmult_def_name))
			  replace_reciprocal_squares (square_use_p);
		      }
		}
	    }
	}
    }

out:
  for (occ = occ_head; occ; )
    occ = free_bb (occ);

  occ_head = NULL;
}
/* Return an internal function that implements the reciprocal of CALL,
   or IFN_LAST if there is no such function that the target supports.  */

internal_fn
internal_fn_reciprocal (gcall *call)
{
  internal_fn ifn;

  switch (gimple_call_combined_fn (call))
    {
    CASE_CFN_SQRT:
    CASE_CFN_SQRT_FN:
      ifn = IFN_RSQRT;
      break;

    default:
      return IFN_LAST;
    }

  tree_pair types = direct_internal_fn_types (ifn, call);
  if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
    return IFN_LAST;

  return ifn;
}
/* Go through all the floating-point SSA_NAMEs, and call
   execute_cse_reciprocals_1 on each of them.  */
namespace {

const pass_data pass_data_cse_reciprocals =
{
  GIMPLE_PASS, /* type */
  "recip", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_TREE_RECIP, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_cse_reciprocals : public gimple_opt_pass
{
public:
  pass_cse_reciprocals (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
  virtual unsigned int execute (function *);

}; // class pass_cse_reciprocals
unsigned int
pass_cse_reciprocals::execute (function *fun)
{
  basic_block bb;
  tree arg;

  occ_pool = new object_allocator<occurrence> ("dominators for recip");

  memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
  calculate_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_POST_DOMINATORS);

  if (flag_checking)
    FOR_EACH_BB_FN (bb, fun)
      gcc_assert (!bb->aux);

  for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
    if (FLOAT_TYPE_P (TREE_TYPE (arg))
	&& is_gimple_reg (arg))
      {
	tree name = ssa_default_def (fun, arg);
	if (name)
	  execute_cse_reciprocals_1 (NULL, name);
      }

  FOR_EACH_BB_FN (bb, fun)
    {
      tree def;

      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  def = PHI_RESULT (phi);
	  if (! virtual_operand_p (def)
	      && FLOAT_TYPE_P (TREE_TYPE (def)))
	    execute_cse_reciprocals_1 (NULL, def);
	}

      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_has_lhs (stmt)
	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
	      && FLOAT_TYPE_P (TREE_TYPE (def))
	      && TREE_CODE (def) == SSA_NAME)
	    {
	      execute_cse_reciprocals_1 (&gsi, def);
	      stmt = gsi_stmt (gsi);
	      if (flag_unsafe_math_optimizations
		  && is_gimple_assign (stmt)
		  && gimple_assign_lhs (stmt) == def
		  && !stmt_can_throw_internal (cfun, stmt)
		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
		optimize_recip_sqrt (&gsi, def);
	    }
	}

      if (optimize_bb_for_size_p (bb))
	continue;

      /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (is_gimple_assign (stmt)
	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
	    {
	      tree arg1 = gimple_assign_rhs2 (stmt);
	      gimple *stmt1;

	      if (TREE_CODE (arg1) != SSA_NAME)
		continue;

	      stmt1 = SSA_NAME_DEF_STMT (arg1);

	      if (is_gimple_call (stmt1)
		  && gimple_call_lhs (stmt1))
		{
		  bool fail;
		  imm_use_iterator ui;
		  use_operand_p use_p;
		  tree fndecl = NULL_TREE;

		  gcall *call = as_a <gcall *> (stmt1);
		  internal_fn ifn = internal_fn_reciprocal (call);
		  if (ifn == IFN_LAST)
		    {
		      fndecl = gimple_call_fndecl (call);
		      if (!fndecl
			  || !fndecl_built_in_p (fndecl, BUILT_IN_MD))
			continue;
		      fndecl = targetm.builtin_reciprocal (fndecl);
		      if (!fndecl)
			continue;
		    }

		  /* Check that all uses of the SSA name are divisions,
		     otherwise replacing the defining statement will do
		     the wrong thing.  */
		  fail = false;
		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
		    {
		      gimple *stmt2 = USE_STMT (use_p);
		      if (is_gimple_debug (stmt2))
			continue;
		      if (!is_gimple_assign (stmt2)
			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
			  || gimple_assign_rhs1 (stmt2) == arg1
			  || gimple_assign_rhs2 (stmt2) != arg1)
			{
			  fail = true;
			  break;
			}
		    }
		  if (fail)
		    continue;

		  gimple_replace_ssa_lhs (call, arg1);
		  if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
		    {
		      auto_vec<tree, 4> args;
		      for (unsigned int i = 0;
			   i < gimple_call_num_args (call); i++)
			args.safe_push (gimple_call_arg (call, i));
		      gcall *stmt2;
		      if (ifn == IFN_LAST)
			stmt2 = gimple_build_call_vec (fndecl, args);
		      else
			stmt2 = gimple_build_call_internal_vec (ifn, args);
		      gimple_call_set_lhs (stmt2, arg1);
		      gimple_move_vops (stmt2, call);
		      gimple_call_set_nothrow (stmt2,
					       gimple_call_nothrow_p (call));
		      gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
		      gsi_replace (&gsi2, stmt2, true);
		    }
		  else
		    {
		      if (ifn == IFN_LAST)
			gimple_call_set_fndecl (call, fndecl);
		      else
			gimple_call_set_internal_fn (call, ifn);
		      update_stmt (call);
		    }
		  reciprocal_stats.rfuncs_inserted++;

		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
		    {
		      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
		      fold_stmt_inplace (&gsi);
		      update_stmt (stmt);
		    }
		}
	    }
	}
    }

  statistics_counter_event (fun, "reciprocal divs inserted",
			    reciprocal_stats.rdivs_inserted);
  statistics_counter_event (fun, "reciprocal functions inserted",
			    reciprocal_stats.rfuncs_inserted);

  free_dominance_info (CDI_DOMINATORS);
  free_dominance_info (CDI_POST_DOMINATORS);
  delete occ_pool;
  return 0;
}
} // anon namespace

gimple_opt_pass *
make_pass_cse_reciprocals (gcc::context *ctxt)
{
  return new pass_cse_reciprocals (ctxt);
}
/* Records an occurrence at statement USE_STMT in the vector of statements
   STMTS if it is dominated by *TOP_BB or dominates it or this basic block
   is not yet initialized.  Returns true if the occurrence was pushed on
   the vector.  Adjusts *TOP_BB to be the basic block dominating all
   statements in the vector.  */

static bool
maybe_record_sincos (vec<gimple *> *stmts,
		     basic_block *top_bb, gimple *use_stmt)
{
  basic_block use_bb = gimple_bb (use_stmt);
  if (*top_bb
      && (*top_bb == use_bb
	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
    stmts->safe_push (use_stmt);
  else if (!*top_bb
	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
    {
      stmts->safe_push (use_stmt);
      *top_bb = use_bb;
    }
  else
    return false;

  return true;
}
/* Look for sin, cos and cexpi calls with the same argument NAME and
   create a single call to cexpi CSEing the result in this case.
   We first walk over all immediate uses of the argument collecting
   statements that we can CSE in a vector and in a second pass replace
   the statement rhs with a REALPART or IMAGPART expression on the
   result of the cexpi call we insert before the use statement that
   dominates all other candidates.  */
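/* For example (a sketch of the intended effect, not literal dump output):

     c = cos (x);              sincostmp = cexpi (x);
     s = sin (x);      =>      c = REALPART_EXPR <sincostmp>;
                               s = IMAGPART_EXPR <sincostmp>;

   so that the sin and cos calls share a single evaluation.  */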
static bool
execute_cse_sincos_1 (tree name)
{
  gimple_stmt_iterator gsi;
  imm_use_iterator use_iter;
  tree fndecl, res, type = NULL_TREE;
  gimple *def_stmt, *use_stmt, *stmt;
  int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
  auto_vec<gimple *> stmts;
  basic_block top_bb = NULL;
  int i;
  bool cfg_changed = false;

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
    {
      if (gimple_code (use_stmt) != GIMPLE_CALL
	  || !gimple_call_lhs (use_stmt))
	continue;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_SIN:
	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_CEXPI:
	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	default:
	  continue;
	}

      tree t = mathfn_built_in_type (gimple_call_combined_fn (use_stmt));
      if (!type)
	{
	  type = t;
	  t = TREE_TYPE (name);
	}
      /* This checks that NAME has the right type in the first round,
	 and, in subsequent rounds, that the built_in type is the same
	 type, or a compatible type.  */
      if (type != t && !types_compatible_p (type, t))
	return false;
    }
  if (seen_cos + seen_sin + seen_cexpi <= 1)
    return false;

  /* Simply insert cexpi at the beginning of top_bb but not earlier than
     the name def statement.  */
  fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
  if (!fndecl)
    return false;
  stmt = gimple_build_call (fndecl, 1, name);
  res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
  gimple_call_set_lhs (stmt, res);

  def_stmt = SSA_NAME_DEF_STMT (name);
  if (!SSA_NAME_IS_DEFAULT_DEF (name)
      && gimple_code (def_stmt) != GIMPLE_PHI
      && gimple_bb (def_stmt) == top_bb)
    {
      gsi = gsi_for_stmt (def_stmt);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
    }
  else
    {
      gsi = gsi_after_labels (top_bb);
      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
    }
  sincos_stats.inserted++;

  /* And adjust the recorded old call sites.  */
  for (i = 0; stmts.iterate (i, &use_stmt); ++i)
    {
      tree rhs = NULL;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  rhs = fold_build1 (REALPART_EXPR, type, res);
	  break;

	CASE_CFN_SIN:
	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
	  break;

	CASE_CFN_CEXPI:
	  rhs = res;
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Replace call with a copy.  */
      stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);

      gsi = gsi_for_stmt (use_stmt);
      gsi_replace (&gsi, stmt, true);
      if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
	cfg_changed = true;
    }

  return cfg_changed;
}
/* To evaluate powi(x,n), the floating point value x raised to the
   constant integer exponent n, we use a hybrid algorithm that
   combines the "window method" with look-up tables.  For an
   introduction to exponentiation algorithms and "addition chains",
   see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
   "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
   3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
   Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */

/* Provide a default value for POWI_MAX_MULTS, the maximum number of
   multiplications to inline before calling the system library's pow
   function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
   so this default never requires calling pow, powf or powl.  */

#ifndef POWI_MAX_MULTS
#define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
#endif

/* The size of the "optimal power tree" lookup table.  All
   exponents less than this value are simply looked up in the
   powi_table below.  This threshold is also used to size the
   cache of pseudo registers that hold intermediate results.  */
#define POWI_TABLE_SIZE 256

/* The size, in bits of the window, used in the "window method"
   exponentiation algorithm.  This is equivalent to a radix of
   (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
#define POWI_WINDOW_SIZE 3

/* The following table is an efficient representation of an
   "optimal power tree".  For each value, i, the corresponding
   value, j, in the table states that an optimal evaluation
   sequence for calculating pow(x,i) can be found by evaluating
   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
   100 integers is given in Knuth's "Seminumerical algorithms".  */
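/* As an illustrative reading of the table: powi_table[5] == 3, so
   pow(x,5) is best computed as pow(x,3) * pow(x,2), which takes three
   multiplications in total: t1 = x*x; t2 = t1*x; result = t2*t1.  */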
static const unsigned char powi_table[POWI_TABLE_SIZE] =
  {
      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
  };
/* Return the number of multiplications required to calculate
   powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
   subroutine of powi_cost.  CACHE is an array indicating
   which exponents have already been calculated.  */

static int
powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
{
  /* If we've already calculated this exponent, then this evaluation
     doesn't require any additional multiplications.  */
  if (cache[n])
    return 0;

  cache[n] = true;
  return powi_lookup_cost (n - powi_table[n], cache)
	 + powi_lookup_cost (powi_table[n], cache) + 1;
}
/* Return the number of multiplications required to calculate
   powi(x,n) for an arbitrary x, given the exponent N.  This
   function needs to be kept in sync with powi_as_mults below.  */

static int
powi_cost (HOST_WIDE_INT n)
{
  bool cache[POWI_TABLE_SIZE];
  unsigned HOST_WIDE_INT digit;
  unsigned HOST_WIDE_INT val;
  int result;

  if (n == 0)
    return 0;

  /* Ignore the reciprocal when calculating the cost.  */
  val = (n < 0) ? -n : n;

  /* Initialize the exponent cache.  */
  memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
  cache[1] = true;

  result = 0;

  while (val >= POWI_TABLE_SIZE)
    {
      if (val & 1)
	{
	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
	  result += powi_lookup_cost (digit, cache)
		    + POWI_WINDOW_SIZE + 1;
	  val >>= POWI_WINDOW_SIZE;
	}
      else
	{
	  val >>= 1;
	  result++;
	}
    }

  return result + powi_lookup_cost (val, cache);
}
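/* A worked example (illustrative): powi_cost (5) is 3.  Since 5 is below
   POWI_TABLE_SIZE the while loop is skipped, and the cost is just
   powi_lookup_cost (5): powi_table[5] == 3 gives the chain x**2, x**3
   and finally x**5 = x**3 * x**2, i.e. three multiplications.  */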
/* Recursive subroutine of powi_as_mults.  This function takes the
   array, CACHE, of already calculated exponents and an exponent N and
   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */

static tree
powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
		 HOST_WIDE_INT n, tree *cache)
{
  tree op0, op1, ssa_target;
  unsigned HOST_WIDE_INT digit;
  gassign *mult_stmt;

  if (n < POWI_TABLE_SIZE && cache[n])
    return cache[n];

  ssa_target = make_temp_ssa_name (type, NULL, "powmult");

  if (n < POWI_TABLE_SIZE)
    {
      cache[n] = ssa_target;
      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
    }
  else if (n & 1)
    {
      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
    }
  else
    {
      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
      op1 = op0;
    }

  mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
  gimple_set_location (mult_stmt, loc);
  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);

  return ssa_target;
}
/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
   This function needs to be kept in sync with powi_cost above.  */

static tree
powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
	       tree arg0, HOST_WIDE_INT n)
{
  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
  gassign *div_stmt;
  tree target;

  if (n == 0)
    return build_real (type, dconst1);

  memset (cache, 0, sizeof (cache));
  cache[1] = arg0;

  result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
  if (n >= 0)
    return result;

  /* If the original exponent was negative, reciprocate the result.  */
  target = make_temp_ssa_name (type, NULL, "powmult");
  div_stmt = gimple_build_assign (target, RDIV_EXPR,
				  build_real (type, dconst1), result);
  gimple_set_location (div_stmt, loc);
  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);

  return target;
}
/* ARG0 and N are the two arguments to a powi builtin in GSI with
   location info LOC.  If the arguments are appropriate, create an
   equivalent sequence of statements prior to GSI using an optimal
   number of multiplications, and return an expression holding the
   result.  */

static tree
gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
			    tree arg0, HOST_WIDE_INT n)
{
  /* Avoid largest negative number.  */
  if (n != -n
      && ((n >= -1 && n <= 2)
	  || (optimize_function_for_speed_p (cfun)
	      && powi_cost (n) <= POWI_MAX_MULTS)))
    return powi_as_mults (gsi, loc, arg0, n);

  return NULL_TREE;
}
/* Build a gimple call statement that calls FN with argument ARG.
   Set the lhs of the call statement to a fresh SSA name.  Insert the
   statement prior to GSI's current position, and return the fresh
   SSA name.  */

static tree
build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
		       tree fn, tree arg)
{
  gcall *call_stmt;
  tree ssa_target;

  call_stmt = gimple_build_call (fn, 1, arg);
  ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
  gimple_set_lhs (call_stmt, ssa_target);
  gimple_set_location (call_stmt, loc);
  gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);

  return ssa_target;
}
/* Build a gimple binary operation with the given CODE and arguments
   ARG0, ARG1, assigning the result to a new SSA name named NAME.
   Insert the statement prior to GSI's current position, and
   return the fresh SSA name.  */

static tree
build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
			const char *name, enum tree_code code,
			tree arg0, tree arg1)
{
  tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
  gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}
/* Build a gimple reference operation with the given CODE and argument
   ARG, assigning the result to a new SSA name of TYPE with NAME.
   Insert the statement prior to GSI's current position, and return
   the fresh SSA name.  */

static inline tree
build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
		      const char *name, enum tree_code code, tree arg0)
{
  tree result = make_temp_ssa_name (type, NULL, name);
  gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
   prior to GSI's current position, and return the fresh SSA name.  */

static tree
build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
		       tree type, tree val)
{
  tree result = make_ssa_name (type);
  gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}
struct pow_synth_sqrt_info
{
  bool *factors;
  unsigned int deepest;
  unsigned int num_mults;
};
/* Return true iff the real value C can be represented as a
   sum of powers of 0.5 up to N.  That is:
   C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
   Record in INFO the various parameters of the synthesis algorithm such
   as the factors a[i], the maximum 0.5 power and the number of
   multiplications that will be required.  */
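/* For example (illustrative): C == 0.625 == 0.5 + 0.125
   == 1*(0.5**1) + 0*(0.5**2) + 1*(0.5**3), so for N >= 3 this returns
   true with factors == {1, 0, 1}, deepest == 3 and num_mults == 1
   (one multiplication combines sqrt(x) with sqrt(sqrt(sqrt(x)))).  */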
bool
representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
				struct pow_synth_sqrt_info *info)
{
  REAL_VALUE_TYPE factor = dconsthalf;
  REAL_VALUE_TYPE remainder = c;

  info->deepest = 0;
  info->num_mults = 0;
  memset (info->factors, 0, n * sizeof (bool));

  for (unsigned i = 0; i < n; i++)
    {
      REAL_VALUE_TYPE res;

      /* If something inexact happened bail out now.  */
      if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
	return false;

      /* We have hit zero.  The number is representable as a sum
	 of powers of 0.5.  */
      if (real_equal (&res, &dconst0))
	{
	  info->factors[i] = true;
	  info->deepest = i + 1;
	  return true;
	}
      else if (!REAL_VALUE_NEGATIVE (res))
	{
	  remainder = res;
	  info->factors[i] = true;
	  info->num_mults++;
	}
      else
	info->factors[i] = false;

      real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
    }
  return false;
}
/* Return the tree corresponding to FN being applied
   to ARG N times at GSI and LOC.
   Look up previous results from CACHE if need be.
   cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times.  */

static tree
get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
	      tree fn, location_t loc, tree *cache)
{
  tree res = cache[n];
  if (!res)
    {
      tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
      res = build_and_insert_call (gsi, loc, fn, prev);
      cache[n] = res;
    }

  return res;
}
/* Print to STREAM the repeated application of function FNAME to ARG
   N times.  So, for FNAME = "foo", ARG = "x", N = 2 it would print:
   "foo (foo (x))".  */

static void
print_nested_fn (FILE* stream, const char *fname, const char* arg,
		 unsigned int n)
{
  if (n == 0)
    fprintf (stream, "%s", arg);
  else
    {
      fprintf (stream, "%s (", fname);
      print_nested_fn (stream, fname, arg, n - 1);
      fprintf (stream, ")");
    }
}
/* Print to STREAM the fractional sequence of sqrt chains
   applied to ARG, described by INFO.  Used for the dump file.  */

static void
dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
			       struct pow_synth_sqrt_info *info)
{
  for (unsigned int i = 0; i < info->deepest; i++)
    {
      bool is_set = info->factors[i];
      if (is_set)
	{
	  print_nested_fn (stream, "sqrt", arg, i + 1);
	  if (i != info->deepest - 1)
	    fprintf (stream, " * ");
	}
    }
}

/* Print to STREAM a representation of raising ARG to an integer
   power N.  Used for the dump file.  */

static void
dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
{
  if (n > 1)
    fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
  else if (n == 1)
    fprintf (stream, "%s", arg);
}
/* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
   square roots.  Place at GSI and LOC.  Limit the maximum depth
   of the sqrt chains to MAX_DEPTH.  Return the tree holding the
   result of the expanded sequence or NULL_TREE if the expansion failed.

   This routine assumes that ARG1 is a real number with a fractional part
   (the integer exponent case will have been handled earlier in
   gimple_expand_builtin_pow).

   For ARG1 > 0.0:
   * For ARG1 composed of a whole part WHOLE_PART and a fractional part
     FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
     FRAC_PART == ARG1 - WHOLE_PART:
     Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
     POW (ARG0, FRAC_PART) is expanded as a product of square root chains
     if it can be expressed as such, that is if FRAC_PART satisfies:
     FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
     where integer a[i] is either 0 or 1.

     Example:
     POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
     --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))

   For ARG1 < 0.0 there are two approaches:
   * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
	 is calculated as above.

     Example:
     POW (x, -5.625) == 1.0 / POW (x, 5.625)
     --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))

   * (B) : WHOLE_PART := - ceil (abs (ARG1))
	   FRAC_PART := ARG1 - WHOLE_PART
	   and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
     Example:
     POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
     --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))

   For ARG1 < 0.0 we choose between (A) and (B) depending on
   how many multiplications we'd have to do.
   So, for the example in (B): POW (x, -5.875), if we were to
   follow algorithm (A) we would produce:
   1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
   which contains more multiplications than approach (B).

   Hopefully, this approach will eliminate potentially expensive POW library
   calls when unsafe floating point math is enabled and allow the compiler to
   further optimise the multiplies, square roots and divides produced by this
   function.  */
static tree
expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
		     tree arg0, tree arg1, HOST_WIDE_INT max_depth)
{
  tree type = TREE_TYPE (arg0);
  machine_mode mode = TYPE_MODE (type);
  tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
  bool one_over = true;

  if (!sqrtfn)
    return NULL_TREE;

  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);

  gcc_assert (max_depth > 0);
  tree *cache = XALLOCAVEC (tree, max_depth + 1);

  struct pow_synth_sqrt_info synth_info;
  synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
  synth_info.deepest = 0;
  synth_info.num_mults = 0;

  bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
  REAL_VALUE_TYPE exp = real_value_abs (&exp_init);

  /* The whole and fractional parts of exp.  */
  REAL_VALUE_TYPE whole_part;
  REAL_VALUE_TYPE frac_part;

  real_floor (&whole_part, mode, &exp);
  real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);

  REAL_VALUE_TYPE ceil_whole = dconst0;
  REAL_VALUE_TYPE ceil_fract = dconst0;

  if (neg_exp)
    {
      real_ceil (&ceil_whole, mode, &exp);
      real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
    }

  if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
    return NULL_TREE;

  /* Check whether it's more profitable to not use 1.0 / ...  */
  if (neg_exp)
    {
      struct pow_synth_sqrt_info alt_synth_info;
      alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
      alt_synth_info.deepest = 0;
      alt_synth_info.num_mults = 0;

      if (representable_as_half_series_p (ceil_fract, max_depth,
					  &alt_synth_info)
	  && alt_synth_info.deepest <= synth_info.deepest
	  && alt_synth_info.num_mults < synth_info.num_mults)
	{
	  whole_part = ceil_whole;
	  frac_part = ceil_fract;
	  synth_info.deepest = alt_synth_info.deepest;
	  synth_info.num_mults = alt_synth_info.num_mults;
	  memcpy (synth_info.factors, alt_synth_info.factors,
		  (max_depth + 1) * sizeof (bool));
	  one_over = false;
	}
    }

  HOST_WIDE_INT n = real_to_integer (&whole_part);
  REAL_VALUE_TYPE cint;
  real_from_integer (&cint, VOIDmode, n, SIGNED);

  if (!real_identical (&whole_part, &cint))
    return NULL_TREE;

  if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
    return NULL_TREE;

  memset (cache, 0, (max_depth + 1) * sizeof (tree));

  tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;

  /* Calculate the integer part of the exponent.  */
  if (n > 1)
    {
      integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
      if (!integer_res)
	return NULL_TREE;
    }

  if (dump_file)
    {
      char string[64];

      real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
      fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);

      if (neg_exp)
	{
	  if (one_over)
	    {
	      fprintf (dump_file, "1.0 / (");
	      dump_integer_part (dump_file, "x", n);
	      if (n > 0)
		fprintf (dump_file, " * ");
	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	      fprintf (dump_file, ")");
	    }
	  else
	    {
	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	      fprintf (dump_file, " / (");
	      dump_integer_part (dump_file, "x", n);
	      fprintf (dump_file, ")");
	    }
	}
      else
	{
	  dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	  if (n > 0)
	    fprintf (dump_file, " * ");
	  dump_integer_part (dump_file, "x", n);
	}

      fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
    }

  tree fract_res = NULL_TREE;
  cache[0] = arg0;

  /* Calculate the fractional part of the exponent.  */
  for (unsigned i = 0; i < synth_info.deepest; i++)
    {
      if (synth_info.factors[i])
	{
	  tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);

	  if (!fract_res)
	    fract_res = sqrt_chain;
	  else
	    fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
						fract_res, sqrt_chain);
	}
    }

  tree res = NULL_TREE;

  if (neg_exp)
    {
      if (one_over)
	{
	  if (n > 0)
	    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
					  fract_res, integer_res);
	  else
	    res = fract_res;

	  res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
					build_real (type, dconst1), res);
	}
      else
	res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
				      fract_res, integer_res);
    }
  else
    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
				  fract_res, integer_res);
  return res;
}
1894 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1895 with location info LOC. If possible, create an equivalent and
1896 less expensive sequence of statements prior to GSI, and return an
1897 expression holding the result. */
1899 static tree
1900 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1901 tree arg0, tree arg1)
1903 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1904 REAL_VALUE_TYPE c2, dconst3;
1905 HOST_WIDE_INT n;
1906 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1907 machine_mode mode;
1908 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1909 bool hw_sqrt_exists, c_is_int, c2_is_int;
1911 dconst1_4 = dconst1;
1912 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1914 /* If the exponent isn't a constant, there's nothing of interest
1915 to be done. */
1916 if (TREE_CODE (arg1) != REAL_CST)
1917 return NULL_TREE;
1919 /* Don't perform the operation if flag_signaling_nans is on
1920 and the operand is a signaling NaN. */
1921 if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1922 && ((TREE_CODE (arg0) == REAL_CST
1923 && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1924 || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1925 return NULL_TREE;
1927 /* If the exponent is equivalent to an integer, expand to an optimal
1928 multiplication sequence when profitable. */
1929 c = TREE_REAL_CST (arg1);
1930 n = real_to_integer (&c);
1931 real_from_integer (&cint, VOIDmode, n, SIGNED);
1932 c_is_int = real_identical (&c, &cint);
1934 if (c_is_int
1935 && ((n >= -1 && n <= 2)
1936 || (flag_unsafe_math_optimizations
1937 && speed_p
1938 && powi_cost (n) <= POWI_MAX_MULTS)))
1939 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1941 /* Attempt various optimizations using sqrt and cbrt. */
1942 type = TREE_TYPE (arg0);
1943 mode = TYPE_MODE (type);
1944 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1946 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1947 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1948 sqrt(-0) = -0. */
1949 if (sqrtfn
1950 && real_equal (&c, &dconsthalf)
1951 && !HONOR_SIGNED_ZEROS (mode))
1952 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1954 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1956 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1957 optimizations since 1./3. is not exactly representable. If x
1958 is negative and finite, the correct value of pow(x,1./3.) is
1959 a NaN with the "invalid" exception raised, because the value
1960 of 1./3. actually has an even denominator. The correct value
1961 of cbrt(x) is a negative real value. */
1962 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1963 dconst1_3 = real_value_truncate (mode, dconst_third ());
1965 if (flag_unsafe_math_optimizations
1966 && cbrtfn
1967 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1968 && real_equal (&c, &dconst1_3))
1969 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1971 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1972 if we don't have a hardware sqrt insn. */
1973 dconst1_6 = dconst1_3;
1974 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1976 if (flag_unsafe_math_optimizations
1977 && sqrtfn
1978 && cbrtfn
1979 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1980 && speed_p
1981 && hw_sqrt_exists
1982 && real_equal (&c, &dconst1_6))
1984 /* sqrt(x) */
1985 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1987 /* cbrt(sqrt(x)) */
1988 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1992 /* Attempt to expand the POW as a product of square root chains.
1993 Expand the 0.25 case even when optimizing for size. */
1994 if (flag_unsafe_math_optimizations
1995 && sqrtfn
1996 && hw_sqrt_exists
1997 && (speed_p || real_equal (&c, &dconst1_4))
1998 && !HONOR_SIGNED_ZEROS (mode))
2000 unsigned int max_depth = speed_p
2001 ? param_max_pow_sqrt_depth
2002 : 2;
2004 tree expand_with_sqrts
2005 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
2007 if (expand_with_sqrts)
2008 return expand_with_sqrts;
2011 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
2012 n = real_to_integer (&c2);
2013 real_from_integer (&cint, VOIDmode, n, SIGNED);
2014 c2_is_int = real_identical (&c2, &cint);
2016 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
2018 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
2019 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
2021 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
2022 different from pow(x, 1./3.) due to rounding and behavior with
2023 negative x, we need to constrain this transformation to unsafe
2024 math and positive x or finite math. */
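  /* A worked instance of the transform described above, assuming unsafe
     math and nonnegative x: for pow (x, 5./3.) we get n = 5, so n/3 = 1
     and n%3 = 2, and the call becomes

       c = cbrt (x);
       res = x * (c * c);   x^1 * x^(2/3) == x^(5/3)

     For pow (x, -5./3.) the same product is built and then reciprocated
     as 1.0 / (x * cbrt (x) * cbrt (x)).  */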
2025 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
2026 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
2027 real_round (&c2, mode, &c2);
2028 n = real_to_integer (&c2);
2029 real_from_integer (&cint, VOIDmode, n, SIGNED);
2030 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
2031 real_convert (&c2, mode, &c2);
2033 if (flag_unsafe_math_optimizations
2034 && cbrtfn
2035 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
2036 && real_identical (&c2, &c)
2037 && !c2_is_int
2038 && optimize_function_for_speed_p (cfun)
2039 && powi_cost (n / 3) <= POWI_MAX_MULTS)
2041 tree powi_x_ndiv3 = NULL_TREE;
2043 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
2044 possible or profitable, give up. Skip the degenerate case when
2045 abs(n) < 3, where the result is always 1. */
2046 if (absu_hwi (n) >= 3)
2048 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
2049 abs_hwi (n / 3));
2050 if (!powi_x_ndiv3)
2051 return NULL_TREE;
2054 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
2055 as that creates an unnecessary variable. Instead, just produce
2056 either cbrt(x) or cbrt(x) * cbrt(x). */
2057 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
2059 if (absu_hwi (n) % 3 == 1)
2060 powi_cbrt_x = cbrt_x;
2061 else
2062 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2063 cbrt_x, cbrt_x);
2065 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
2066 if (absu_hwi (n) < 3)
2067 result = powi_cbrt_x;
2068 else
2069 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2070 powi_x_ndiv3, powi_cbrt_x);
2072 /* If n is negative, reciprocate the result. */
2073 if (n < 0)
2074 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
2075 build_real (type, dconst1), result);
2077 return result;
2080 /* No optimizations succeeded. */
2081 return NULL_TREE;
2084 /* ARG is the argument to a cabs builtin call in GSI with location info
2085 LOC. Create a sequence of statements prior to GSI that calculates
2086 sqrt(R*R + I*I), where R and I are the real and imaginary components
2087 of ARG, respectively. Return an expression holding the result. */
2089 static tree
2090 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
2092 tree real_part, imag_part, addend1, addend2, sum, result;
2093 tree type = TREE_TYPE (TREE_TYPE (arg));
2094 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
2095 machine_mode mode = TYPE_MODE (type);
2097 if (!flag_unsafe_math_optimizations
2098 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
2099 || !sqrtfn
2100 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
2101 return NULL_TREE;
2103 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
2104 REALPART_EXPR, arg);
2105 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2106 real_part, real_part);
2107 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
2108 IMAGPART_EXPR, arg);
2109 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2110 imag_part, imag_part);
2111 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
2112 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
2114 return result;
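/* For illustration, a sketch of the expansion above: with
   -funsafe-math-optimizations, a hardware sqrt and a block optimized
   for speed,

     double f (_Complex double z) { return cabs (z); }

   is emitted inline as

     r = __real z;  i = __imag z;
     res = sqrt (r*r + i*i);

   trading the library call for two multiplies, an add and a sqrt.  The
   transform is unsafe because the naive formula can overflow in the
   squares where cabs itself would not.  */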
2117 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
2118 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
2119 an optimal number of multiplies, when n is a constant. */
2121 namespace {
2123 const pass_data pass_data_cse_sincos =
2125 GIMPLE_PASS, /* type */
2126 "sincos", /* name */
2127 OPTGROUP_NONE, /* optinfo_flags */
2128 TV_TREE_SINCOS, /* tv_id */
2129 PROP_ssa, /* properties_required */
2130 PROP_gimple_opt_math, /* properties_provided */
2131 0, /* properties_destroyed */
2132 0, /* todo_flags_start */
2133 TODO_update_ssa, /* todo_flags_finish */
2136 class pass_cse_sincos : public gimple_opt_pass
2138 public:
2139 pass_cse_sincos (gcc::context *ctxt)
2140 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
2143 /* opt_pass methods: */
2144 virtual bool gate (function *)
2146 /* We no longer require either sincos or cexp, since powi expansion
2147 piggybacks on this pass. */
2148 return optimize;
2151 virtual unsigned int execute (function *);
2153 }; // class pass_cse_sincos
2155 unsigned int
2156 pass_cse_sincos::execute (function *fun)
2158 basic_block bb;
2159 bool cfg_changed = false;
2161 calculate_dominance_info (CDI_DOMINATORS);
2162 memset (&sincos_stats, 0, sizeof (sincos_stats));
2164 FOR_EACH_BB_FN (bb, fun)
2166 gimple_stmt_iterator gsi;
2167 bool cleanup_eh = false;
2169 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2171 gimple *stmt = gsi_stmt (gsi);
2173 /* Only the last stmt in a bb could throw, no need to call
2174 gimple_purge_dead_eh_edges if we change something in the middle
2175 of a basic block. */
2176 cleanup_eh = false;
2178 if (is_gimple_call (stmt)
2179 && gimple_call_lhs (stmt))
2181 tree arg, arg0, arg1, result;
2182 HOST_WIDE_INT n;
2183 location_t loc;
2185 switch (gimple_call_combined_fn (stmt))
2187 CASE_CFN_COS:
2188 CASE_CFN_SIN:
2189 CASE_CFN_CEXPI:
2190 arg = gimple_call_arg (stmt, 0);
2191 /* Make sure we have either sincos or cexp. */
2192 if (!targetm.libc_has_function (function_c99_math_complex,
2193 TREE_TYPE (arg))
2194 && !targetm.libc_has_function (function_sincos,
2195 TREE_TYPE (arg)))
2196 break;
2198 if (TREE_CODE (arg) == SSA_NAME)
2199 cfg_changed |= execute_cse_sincos_1 (arg);
2200 break;
2202 CASE_CFN_POW:
2203 arg0 = gimple_call_arg (stmt, 0);
2204 arg1 = gimple_call_arg (stmt, 1);
2206 loc = gimple_location (stmt);
2207 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
2209 if (result)
2211 tree lhs = gimple_get_lhs (stmt);
2212 gassign *new_stmt = gimple_build_assign (lhs, result);
2213 gimple_set_location (new_stmt, loc);
2214 unlink_stmt_vdef (stmt);
2215 gsi_replace (&gsi, new_stmt, true);
2216 cleanup_eh = true;
2217 if (gimple_vdef (stmt))
2218 release_ssa_name (gimple_vdef (stmt));
2220 break;
2222 CASE_CFN_POWI:
2223 arg0 = gimple_call_arg (stmt, 0);
2224 arg1 = gimple_call_arg (stmt, 1);
2225 loc = gimple_location (stmt);
2227 if (real_minus_onep (arg0))
2229 tree t0, t1, cond, one, minus_one;
2230 gassign *stmt;
2232 t0 = TREE_TYPE (arg0);
2233 t1 = TREE_TYPE (arg1);
2234 one = build_real (t0, dconst1);
2235 minus_one = build_real (t0, dconstm1);
2237 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
2238 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
2239 arg1, build_int_cst (t1, 1));
2240 gimple_set_location (stmt, loc);
2241 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2243 result = make_temp_ssa_name (t0, NULL, "powi");
2244 stmt = gimple_build_assign (result, COND_EXPR, cond,
2245 minus_one, one);
2246 gimple_set_location (stmt, loc);
2247 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2249 else
2251 if (!tree_fits_shwi_p (arg1))
2252 break;
2254 n = tree_to_shwi (arg1);
2255 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
2258 if (result)
2260 tree lhs = gimple_get_lhs (stmt);
2261 gassign *new_stmt = gimple_build_assign (lhs, result);
2262 gimple_set_location (new_stmt, loc);
2263 unlink_stmt_vdef (stmt);
2264 gsi_replace (&gsi, new_stmt, true);
2265 cleanup_eh = true;
2266 if (gimple_vdef (stmt))
2267 release_ssa_name (gimple_vdef (stmt));
2269 break;
2271 CASE_CFN_CABS:
2272 arg0 = gimple_call_arg (stmt, 0);
2273 loc = gimple_location (stmt);
2274 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
2276 if (result)
2278 tree lhs = gimple_get_lhs (stmt);
2279 gassign *new_stmt = gimple_build_assign (lhs, result);
2280 gimple_set_location (new_stmt, loc);
2281 unlink_stmt_vdef (stmt);
2282 gsi_replace (&gsi, new_stmt, true);
2283 cleanup_eh = true;
2284 if (gimple_vdef (stmt))
2285 release_ssa_name (gimple_vdef (stmt));
2287 break;
2289 default:;
2293 if (cleanup_eh)
2294 cfg_changed |= gimple_purge_dead_eh_edges (bb);
2297 statistics_counter_event (fun, "sincos statements inserted",
2298 sincos_stats.inserted);
2300 return cfg_changed ? TODO_cleanup_cfg : 0;
2303 } // anon namespace
2305 gimple_opt_pass *
2306 make_pass_cse_sincos (gcc::context *ctxt)
2308 return new pass_cse_sincos (ctxt);
2311 /* Return true if stmt is a type conversion operation that can be stripped
2312 when used in a widening multiply operation. */
2313 static bool
2314 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2316 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2318 if (TREE_CODE (result_type) == INTEGER_TYPE)
2320 tree op_type;
2321 tree inner_op_type;
2323 if (!CONVERT_EXPR_CODE_P (rhs_code))
2324 return false;
2326 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2328 /* If the type of OP has the same precision as the result, then
2329 we can strip this conversion. The multiply operation will be
2330 selected to create the correct extension as a by-product. */
2331 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2332 return true;
2334 /* We can also strip a conversion if it preserves the signed-ness of
2335 the operation and doesn't narrow the range. */
2336 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2338 /* If the inner-most type is unsigned, then we can strip any
2339 intermediate widening operation. If it's signed, then the
2340 intermediate widening operation must also be signed. */
2341 if ((TYPE_UNSIGNED (inner_op_type)
2342 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2343 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2344 return true;
2346 return false;
2349 return rhs_code == FIXED_CONVERT_EXPR;
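/* A hypothetical example of the check above: given short a, b and

     _1 = (int) a;
     _2 = (int) b;
     c = _1 * _2;

   both conversions are strippable, because the widening multiply that
   is selected later recreates the extension as a by-product, so the
   narrow a and b can feed the multiply directly.  */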
2352 /* Return true if RHS is a suitable operand for a widening multiplication,
2353 assuming a target type of TYPE.
2354 There are two cases:
2356 - RHS makes some value at least twice as wide. Store that value
2357 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2359 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2360 but leave *TYPE_OUT untouched. */
2362 static bool
2363 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2364 tree *new_rhs_out)
2366 gimple *stmt;
2367 tree type1, rhs1;
2369 if (TREE_CODE (rhs) == SSA_NAME)
2371 stmt = SSA_NAME_DEF_STMT (rhs);
2372 if (is_gimple_assign (stmt))
2374 if (! widening_mult_conversion_strippable_p (type, stmt))
2375 rhs1 = rhs;
2376 else
2378 rhs1 = gimple_assign_rhs1 (stmt);
2380 if (TREE_CODE (rhs1) == INTEGER_CST)
2382 *new_rhs_out = rhs1;
2383 *type_out = NULL;
2384 return true;
2388 else
2389 rhs1 = rhs;
2391 type1 = TREE_TYPE (rhs1);
2393 if (TREE_CODE (type1) != TREE_CODE (type)
2394 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2395 return false;
2397 *new_rhs_out = rhs1;
2398 *type_out = type1;
2399 return true;
2402 if (TREE_CODE (rhs) == INTEGER_CST)
2404 *new_rhs_out = rhs;
2405 *type_out = NULL;
2406 return true;
2409 return false;
2412 /* Return true if STMT performs a widening multiplication, assuming the
2413 output type is TYPE. If so, store the unwidened types of the operands
2414 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2415 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2416 and *TYPE2_OUT would give the operands of the multiplication. */
2418 static bool
2419 is_widening_mult_p (gimple *stmt,
2420 tree *type1_out, tree *rhs1_out,
2421 tree *type2_out, tree *rhs2_out)
2423 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2425 if (TREE_CODE (type) == INTEGER_TYPE)
2427 if (TYPE_OVERFLOW_TRAPS (type))
2428 return false;
2430 else if (TREE_CODE (type) != FIXED_POINT_TYPE)
2431 return false;
2433 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2434 rhs1_out))
2435 return false;
2437 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2438 rhs2_out))
2439 return false;
2441 if (*type1_out == NULL)
2443 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2444 return false;
2445 *type1_out = *type2_out;
2448 if (*type2_out == NULL)
2450 if (!int_fits_type_p (*rhs2_out, *type1_out))
2451 return false;
2452 *type2_out = *type1_out;
2455 /* Ensure that the larger of the two operands comes first. */
2456 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2458 std::swap (*type1_out, *type2_out);
2459 std::swap (*rhs1_out, *rhs2_out);
2462 return true;
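/* As a hypothetical example of what the predicate accepts: for

     long long c = (long long) i * (long long) j;   (int i, j)

   it returns true with *TYPE1_OUT = *TYPE2_OUT = int and *RHS1_OUT = i,
   *RHS2_OUT = j.  A mixed form such as (long long) i * 100 also passes,
   the constant being validated with int_fits_type_p against int.  */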
2465 /* Check to see if the CALL statement is an invocation of copysign
2466 with 1. being the first argument. */
2467 static bool
2468 is_copysign_call_with_1 (gimple *call)
2470 gcall *c = dyn_cast <gcall *> (call);
2471 if (! c)
2472 return false;
2474 enum combined_fn code = gimple_call_combined_fn (c);
2476 if (code == CFN_LAST)
2477 return false;
2479 if (builtin_fn_p (code))
2481 switch (as_builtin_fn (code))
2483 CASE_FLT_FN (BUILT_IN_COPYSIGN):
2484 CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
2485 return real_onep (gimple_call_arg (c, 0));
2486 default:
2487 return false;
2491 if (internal_fn_p (code))
2493 switch (as_internal_fn (code))
2495 case IFN_COPYSIGN:
2496 return real_onep (gimple_call_arg (c, 0));
2497 default:
2498 return false;
2502 return false;
2505 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
2506 This only happens when the xorsign optab is defined. If the
2507 pattern is not a xorsign pattern or if expansion fails, FALSE is
2508 returned; otherwise TRUE is returned. */
2509 static bool
2510 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
2512 tree treeop0, treeop1, lhs, type;
2513 location_t loc = gimple_location (stmt);
2514 lhs = gimple_assign_lhs (stmt);
2515 treeop0 = gimple_assign_rhs1 (stmt);
2516 treeop1 = gimple_assign_rhs2 (stmt);
2517 type = TREE_TYPE (lhs);
2518 machine_mode mode = TYPE_MODE (type);
2520 if (HONOR_SNANS (type))
2521 return false;
2523 if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
2525 gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
2526 if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
2528 call0 = SSA_NAME_DEF_STMT (treeop1);
2529 if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
2530 return false;
2532 treeop1 = treeop0;
2534 if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
2535 return false;
2537 gcall *c = as_a<gcall*> (call0);
2538 treeop0 = gimple_call_arg (c, 1);
2540 gcall *call_stmt
2541 = gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
2542 gimple_set_lhs (call_stmt, lhs);
2543 gimple_set_location (call_stmt, loc);
2544 gsi_replace (gsi, call_stmt, true);
2545 return true;
2548 return false;
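/* For illustration, assuming the target defines the xorsign optab:

     z = x * __builtin_copysign (1.0, y);

   becomes z = .XORSIGN (x, y), i.e. x with its sign bit XORed with the
   sign bit of y -- a pure bitwise operation with no multiply.  The
   HONOR_SNANS bail-out above exists because the multiply would quiet a
   signaling NaN operand while the bitwise form would not.  */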
2551 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2552 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2553 value is true iff we converted the statement. */
2555 static bool
2556 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2558 tree lhs, rhs1, rhs2, type, type1, type2;
2559 enum insn_code handler;
2560 scalar_int_mode to_mode, from_mode, actual_mode;
2561 optab op;
2562 int actual_precision;
2563 location_t loc = gimple_location (stmt);
2564 bool from_unsigned1, from_unsigned2;
2566 lhs = gimple_assign_lhs (stmt);
2567 type = TREE_TYPE (lhs);
2568 if (TREE_CODE (type) != INTEGER_TYPE)
2569 return false;
2571 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2572 return false;
2574 to_mode = SCALAR_INT_TYPE_MODE (type);
2575 from_mode = SCALAR_INT_TYPE_MODE (type1);
2576 if (to_mode == from_mode)
2577 return false;
2579 from_unsigned1 = TYPE_UNSIGNED (type1);
2580 from_unsigned2 = TYPE_UNSIGNED (type2);
2582 if (from_unsigned1 && from_unsigned2)
2583 op = umul_widen_optab;
2584 else if (!from_unsigned1 && !from_unsigned2)
2585 op = smul_widen_optab;
2586 else
2587 op = usmul_widen_optab;
2589 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2590 &actual_mode);
2592 if (handler == CODE_FOR_nothing)
2594 if (op != smul_widen_optab)
2596 /* We can use a signed multiply with unsigned types as long as
2597 there is a wider mode to use, or it is the smaller of the two
2598 types that is unsigned. Note that type1 >= type2, always. */
2599 if ((TYPE_UNSIGNED (type1)
2600 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2601 || (TYPE_UNSIGNED (type2)
2602 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2604 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2605 || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2606 return false;
2609 op = smul_widen_optab;
2610 handler = find_widening_optab_handler_and_mode (op, to_mode,
2611 from_mode,
2612 &actual_mode);
2614 if (handler == CODE_FOR_nothing)
2615 return false;
2617 from_unsigned1 = from_unsigned2 = false;
2619 else
2620 return false;
2623 /* Ensure that the inputs to the handler are in the correct precision
2624 for the opcode. This will be the full mode size. */
2625 actual_precision = GET_MODE_PRECISION (actual_mode);
2626 if (2 * actual_precision > TYPE_PRECISION (type))
2627 return false;
2628 if (actual_precision != TYPE_PRECISION (type1)
2629 || from_unsigned1 != TYPE_UNSIGNED (type1))
2630 rhs1 = build_and_insert_cast (gsi, loc,
2631 build_nonstandard_integer_type
2632 (actual_precision, from_unsigned1), rhs1);
2633 if (actual_precision != TYPE_PRECISION (type2)
2634 || from_unsigned2 != TYPE_UNSIGNED (type2))
2635 rhs2 = build_and_insert_cast (gsi, loc,
2636 build_nonstandard_integer_type
2637 (actual_precision, from_unsigned2), rhs2);
2639 /* Handle constants. */
2640 if (TREE_CODE (rhs1) == INTEGER_CST)
2641 rhs1 = fold_convert (type1, rhs1);
2642 if (TREE_CODE (rhs2) == INTEGER_CST)
2643 rhs2 = fold_convert (type2, rhs2);
2645 gimple_assign_set_rhs1 (stmt, rhs1);
2646 gimple_assign_set_rhs2 (stmt, rhs2);
2647 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2648 update_stmt (stmt);
2649 widen_mul_stats.widen_mults_inserted++;
2650 return true;
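/* A sketch of the effect, target permitting: with unsigned short a, b,

     _1 = (unsigned int) a;
     _2 = (unsigned int) b;
     c = _1 * _2;

   is rewritten in place to

     c = a w* b;          (WIDEN_MULT_EXPR via umul_widen_optab)

   When only a signed widening multiply exists, the fallback above
   retries smul_widen_optab in a wider FROM_MODE where that is safe.  */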
2653 /* Process a single gimple statement STMT, which is found at the
2654 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2655 rhs (given by CODE), and try to convert it into a
2656 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
2657 is true iff we converted the statement. */
2659 static bool
2660 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
2661 enum tree_code code)
2663 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
2664 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
2665 tree type, type1, type2, optype;
2666 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2667 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2668 optab this_optab;
2669 enum tree_code wmult_code;
2670 enum insn_code handler;
2671 scalar_mode to_mode, from_mode, actual_mode;
2672 location_t loc = gimple_location (stmt);
2673 int actual_precision;
2674 bool from_unsigned1, from_unsigned2;
2676 lhs = gimple_assign_lhs (stmt);
2677 type = TREE_TYPE (lhs);
2678 if (TREE_CODE (type) != INTEGER_TYPE
2679 && TREE_CODE (type) != FIXED_POINT_TYPE)
2680 return false;
2682 if (code == MINUS_EXPR)
2683 wmult_code = WIDEN_MULT_MINUS_EXPR;
2684 else
2685 wmult_code = WIDEN_MULT_PLUS_EXPR;
2687 rhs1 = gimple_assign_rhs1 (stmt);
2688 rhs2 = gimple_assign_rhs2 (stmt);
2690 if (TREE_CODE (rhs1) == SSA_NAME)
2692 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2693 if (is_gimple_assign (rhs1_stmt))
2694 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2697 if (TREE_CODE (rhs2) == SSA_NAME)
2699 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2700 if (is_gimple_assign (rhs2_stmt))
2701 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2704 /* Allow for one conversion statement between the multiply
2705 and addition/subtraction statement. If there are more than
2706 one conversions then we assume they would invalidate this
2707 transformation. If that's not the case then they should have
2708 been folded before now. */
2709 if (CONVERT_EXPR_CODE_P (rhs1_code))
2711 conv1_stmt = rhs1_stmt;
2712 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2713 if (TREE_CODE (rhs1) == SSA_NAME)
2715 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2716 if (is_gimple_assign (rhs1_stmt))
2717 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2719 else
2720 return false;
2722 if (CONVERT_EXPR_CODE_P (rhs2_code))
2724 conv2_stmt = rhs2_stmt;
2725 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2726 if (TREE_CODE (rhs2) == SSA_NAME)
2728 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2729 if (is_gimple_assign (rhs2_stmt))
2730 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2732 else
2733 return false;
2736 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2737 is_widening_mult_p, but we still need the rhs values it returns.
2739 It might also appear that it would be sufficient to use the existing
2740 operands of the widening multiply, but that would limit the choice of
2741 multiply-and-accumulate instructions.
2743 If the widened-multiplication result has more than one use, it is
2744 probably wiser not to do the conversion. Also restrict this operation
2745 to a single basic block to avoid moving the multiply to a different block
2746 with a higher execution frequency. */
2747 if (code == PLUS_EXPR
2748 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2750 if (!has_single_use (rhs1)
2751 || gimple_bb (rhs1_stmt) != gimple_bb (stmt)
2752 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2753 &type2, &mult_rhs2))
2754 return false;
2755 add_rhs = rhs2;
2756 conv_stmt = conv1_stmt;
2758 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2760 if (!has_single_use (rhs2)
2761 || gimple_bb (rhs2_stmt) != gimple_bb (stmt)
2762 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2763 &type2, &mult_rhs2))
2764 return false;
2765 add_rhs = rhs1;
2766 conv_stmt = conv2_stmt;
2768 else
2769 return false;
2771 to_mode = SCALAR_TYPE_MODE (type);
2772 from_mode = SCALAR_TYPE_MODE (type1);
2773 if (to_mode == from_mode)
2774 return false;
2776 from_unsigned1 = TYPE_UNSIGNED (type1);
2777 from_unsigned2 = TYPE_UNSIGNED (type2);
2778 optype = type1;
2780 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
2781 if (from_unsigned1 != from_unsigned2)
2783 if (!INTEGRAL_TYPE_P (type))
2784 return false;
2785 /* We can use a signed multiply with unsigned types as long as
2786 there is a wider mode to use, or it is the smaller of the two
2787 types that is unsigned. Note that type1 >= type2, always. */
2788 if ((from_unsigned1
2789 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2790 || (from_unsigned2
2791 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2793 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2794 || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2795 return false;
2798 from_unsigned1 = from_unsigned2 = false;
2799 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2800 false);
2803 /* If there was a conversion between the multiply and addition
2804 then we need to make sure it fits a multiply-and-accumulate.
2805 There should be a single mode change which does not change the
2806 value. */
2807 if (conv_stmt)
2809 /* We use the original, unmodified data types for this. */
2810 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2811 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2812 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2813 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2815 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2817 /* Conversion is a truncate. */
2818 if (TYPE_PRECISION (to_type) < data_size)
2819 return false;
2821 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2823 /* Conversion is an extend. Check it's the right sort. */
2824 if (TYPE_UNSIGNED (from_type) != is_unsigned
2825 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2826 return false;
2828 /* else convert is a no-op for our purposes. */
2831 /* Verify that the machine can perform a widening multiply
2832 accumulate in this mode/signedness combination, otherwise
2833 this transformation is likely to pessimize code. */
2834 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2835 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2836 from_mode, &actual_mode);
2838 if (handler == CODE_FOR_nothing)
2839 return false;
2841 /* Ensure that the inputs to the handler are in the correct precision
2842 for the opcode. This will be the full mode size. */
2843 actual_precision = GET_MODE_PRECISION (actual_mode);
2844 if (actual_precision != TYPE_PRECISION (type1)
2845 || from_unsigned1 != TYPE_UNSIGNED (type1))
2846 mult_rhs1 = build_and_insert_cast (gsi, loc,
2847 build_nonstandard_integer_type
2848 (actual_precision, from_unsigned1),
2849 mult_rhs1);
2850 if (actual_precision != TYPE_PRECISION (type2)
2851 || from_unsigned2 != TYPE_UNSIGNED (type2))
2852 mult_rhs2 = build_and_insert_cast (gsi, loc,
2853 build_nonstandard_integer_type
2854 (actual_precision, from_unsigned2),
2855 mult_rhs2);
2857 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2858 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2860 /* Handle constants. */
2861 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2862 mult_rhs1 = fold_convert (type1, mult_rhs1);
2863 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2864 mult_rhs2 = fold_convert (type2, mult_rhs2);
2866 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2867 add_rhs);
2868 update_stmt (gsi_stmt (*gsi));
2869 widen_mul_stats.maccs_inserted++;
2870 return true;
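/* For example, assuming the target provides a widening
   multiply-accumulate pattern: the pair

     _1 = a w* b;
     acc_2 = _1 + acc_3;

   becomes the single statement

     acc_2 = WIDEN_MULT_PLUS_EXPR <a, b, acc_3>;

   which can map to multiply-accumulate instructions such as AArch64
   smaddl/umaddl.  */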
2873 /* Given a result MUL_RESULT which is a result of a multiplication of OP1 and
2874 OP2 and which we know is used in statements that can be, together with the
2875 multiplication, converted to FMAs, perform the transformation. */
2877 static void
2878 convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
2880 tree type = TREE_TYPE (mul_result);
2881 gimple *use_stmt;
2882 imm_use_iterator imm_iter;
2883 gcall *fma_stmt;
2885 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2887 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2888 tree addop, mulop1 = op1, result = mul_result;
2889 bool negate_p = false;
2890 gimple_seq seq = NULL;
2892 if (is_gimple_debug (use_stmt))
2893 continue;
2895 if (is_gimple_assign (use_stmt)
2896 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
2898 result = gimple_assign_lhs (use_stmt);
2899 use_operand_p use_p;
2900 gimple *neguse_stmt;
2901 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2902 gsi_remove (&gsi, true);
2903 release_defs (use_stmt);
2905 use_stmt = neguse_stmt;
2906 gsi = gsi_for_stmt (use_stmt);
2907 negate_p = true;
2910 tree cond, else_value, ops[3];
2911 tree_code code;
2912 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code,
2913 ops, &else_value))
2914 gcc_unreachable ();
2915 addop = ops[0] == result ? ops[1] : ops[0];
2917 if (code == MINUS_EXPR)
2919 if (ops[0] == result)
2920 /* a * b - c -> a * b + (-c) */
2921 addop = gimple_build (&seq, NEGATE_EXPR, type, addop);
2922 else
2923 /* a - b * c -> (-b) * c + a */
2924 negate_p = !negate_p;
2927 if (negate_p)
2928 mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
2930 if (seq)
2931 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
2933 if (cond)
2934 fma_stmt = gimple_build_call_internal (IFN_COND_FMA, 5, cond, mulop1,
2935 op2, addop, else_value);
2936 else
2937 fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop);
2938 gimple_set_lhs (fma_stmt, gimple_get_lhs (use_stmt));
2939 gimple_call_set_nothrow (fma_stmt, !stmt_can_throw_internal (cfun,
2940 use_stmt));
2941 gsi_replace (&gsi, fma_stmt, true);
2942 /* Follow all SSA edges so that we generate FMS, FNMA and FNMS
2943 regardless of where the negation occurs. */
2944 gimple *orig_stmt = gsi_stmt (gsi);
2945 if (fold_stmt (&gsi, follow_all_ssa_edges))
2947 if (maybe_clean_or_replace_eh_stmt (orig_stmt, gsi_stmt (gsi)))
2948 gcc_unreachable ();
2949 update_stmt (gsi_stmt (gsi));
2952 if (dump_file && (dump_flags & TDF_DETAILS))
2954 fprintf (dump_file, "Generated FMA ");
2955 print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2956 fprintf (dump_file, "\n");
2959 /* If the FMA result is negated in a single use, fold the negation
2960 too. */
2961 orig_stmt = gsi_stmt (gsi);
2962 use_operand_p use_p;
2963 gimple *neg_stmt;
2964 if (is_gimple_call (orig_stmt)
2965 && gimple_call_internal_p (orig_stmt)
2966 && gimple_call_lhs (orig_stmt)
2967 && TREE_CODE (gimple_call_lhs (orig_stmt)) == SSA_NAME
2968 && single_imm_use (gimple_call_lhs (orig_stmt), &use_p, &neg_stmt)
2969 && is_gimple_assign (neg_stmt)
2970 && gimple_assign_rhs_code (neg_stmt) == NEGATE_EXPR
2971 && !stmt_could_throw_p (cfun, neg_stmt))
2973 gsi = gsi_for_stmt (neg_stmt);
2974 if (fold_stmt (&gsi, follow_all_ssa_edges))
2976 if (maybe_clean_or_replace_eh_stmt (neg_stmt, gsi_stmt (gsi)))
2977 gcc_unreachable ();
2978 update_stmt (gsi_stmt (gsi));
2979 if (dump_file && (dump_flags & TDF_DETAILS))
2981 fprintf (dump_file, "Folded FMA negation ");
2982 print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2983 fprintf (dump_file, "\n");
2988 widen_mul_stats.fmas_inserted++;
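/* Combined with the negate handling above, the fold_stmt call lets this
   one routine produce all four fused forms (a sketch, target
   permitting):

     a*b + c     ->  .FMA (a, b, c)
     a*b - c     ->  .FMA (a, b, -c)    folded to .FMS  (a, b, c)
     -(a*b) + c  ->  .FMA (-a, b, c)    folded to .FNMA (a, b, c)
     -(a*b) - c  ->  .FMA (-a, b, -c)   folded to .FNMS (a, b, c)  */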
2992 /* Data necessary to perform the actual transformation from a multiplication
2993 and an addition to an FMA once the decision to do it has been taken, and
2994 to then delete the multiplication statement from the function IL. */
2996 struct fma_transformation_info
2998 gimple *mul_stmt;
2999 tree mul_result;
3000 tree op1;
3001 tree op2;
3004 /* Structure containing the current state of FMA deferring, i.e. whether we are
3005 deferring, whether to continue deferring, and all data necessary to come
3006 back and perform all deferred transformations. */
3008 class fma_deferring_state
3010 public:
3011 /* Class constructor. Pass true as PERFORM_DEFERRING in order to actually
3012 do any deferring. */
3014 fma_deferring_state (bool perform_deferring)
3015 : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
3016 m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
3018 /* List of FMA candidates for which the transformation has been determined
3019 to be possible but which, at this point in the BB analysis, we do not
3020 consider beneficial. */
3021 auto_vec<fma_transformation_info, 8> m_candidates;
3023 /* Set of results of multiplication that are part of an already deferred FMA
3024 candidates. */
3025 hash_set<tree> m_mul_result_set;
3027 /* The PHI that supposedly feeds back the result of one FMA to another
3028 over the loop boundary. */
3029 gphi *m_initial_phi;
3031 /* Result of the last produced FMA candidate or NULL if there has not been
3032 one. */
3033 tree m_last_result;
3035 /* If true, deferring might still be profitable. If false, transform all
3036 candidates and no longer defer. */
3037 bool m_deferring_p;
3040 /* Transform all deferred FMA candidates and mark STATE as no longer
3041 deferring. */
3043 static void
3044 cancel_fma_deferring (fma_deferring_state *state)
3046 if (!state->m_deferring_p)
3047 return;
3049 for (unsigned i = 0; i < state->m_candidates.length (); i++)
3051 if (dump_file && (dump_flags & TDF_DETAILS))
3052 fprintf (dump_file, "Generating deferred FMA\n");
3054 const fma_transformation_info &fti = state->m_candidates[i];
3055 convert_mult_to_fma_1 (fti.mul_result, fti.op1, fti.op2);
3057 gimple_stmt_iterator gsi = gsi_for_stmt (fti.mul_stmt);
3058 gsi_remove (&gsi, true);
3059 release_defs (fti.mul_stmt);
3061 state->m_deferring_p = false;
3064 /* If OP is an SSA name defined by a PHI node, return the PHI statement.
3065 Otherwise return NULL. */
3067 static gphi *
3068 result_of_phi (tree op)
3070 if (TREE_CODE (op) != SSA_NAME)
3071 return NULL;
3073 return dyn_cast <gphi *> (SSA_NAME_DEF_STMT (op));
3076 /* After processing statements of a BB and recording STATE, return true if the
3077 initial phi is fed by the last FMA candidate result or one such result from
3078 previously processed BBs marked in LAST_RESULT_SET. */
3080 static bool
3081 last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
3082 hash_set<tree> *last_result_set)
3084 ssa_op_iter iter;
3085 use_operand_p use;
3086 FOR_EACH_PHI_ARG (use, state->m_initial_phi, iter, SSA_OP_USE)
3088 tree t = USE_FROM_PTR (use);
3089 if (t == state->m_last_result
3090 || last_result_set->contains (t))
3091 return true;
3094 return false;
3097 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3098 with uses in additions and subtractions to form fused multiply-add
3099 operations. Returns true if successful and MUL_STMT should be removed.
3100 If MUL_COND is nonnull, the multiplication in MUL_STMT is conditional
3101 on MUL_COND, otherwise it is unconditional.
3103 If STATE indicates that we are deferring FMA transformation, that means
3104 that we do not produce FMAs for basic blocks which look like:
3106 <bb 6>
3107 # accumulator_111 = PHI <0.0(5), accumulator_66(6)>
3108 _65 = _14 * _16;
3109 accumulator_66 = _65 + accumulator_111;
3111 or its unrolled version, i.e. with several FMA candidates that feed result
3112 of one into the addend of another. Instead, we add them to a list in STATE
3113 and if we later discover an FMA candidate that is not part of such a chain,
3114 we go back and perform all deferred past candidates. */
3116 static bool
3117 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
3118 fma_deferring_state *state, tree mul_cond = NULL_TREE)
3120 tree mul_result = gimple_get_lhs (mul_stmt);
3121 tree type = TREE_TYPE (mul_result);
3122 gimple *use_stmt, *neguse_stmt;
3123 use_operand_p use_p;
3124 imm_use_iterator imm_iter;
3126 if (FLOAT_TYPE_P (type)
3127 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3128 return false;
3130 /* We don't want to do bitfield reduction ops. */
3131 if (INTEGRAL_TYPE_P (type)
3132 && (!type_has_mode_precision_p (type) || TYPE_OVERFLOW_TRAPS (type)))
3133 return false;
3135 /* If the target doesn't support it, don't generate it. We assume that
3136 if fma isn't available then fms, fnma or fnms are not either. */
3137 optimization_type opt_type = bb_optimization_type (gimple_bb (mul_stmt));
3138 if (!direct_internal_fn_supported_p (IFN_FMA, type, opt_type))
3139 return false;
3141 /* If the multiplication has zero uses, it is kept around probably because
3142 of -fnon-call-exceptions. Don't optimize it away in that case;
3143 that is DCE's job. */
3144 if (has_zero_uses (mul_result))
3145 return false;
3147 bool check_defer
3148 = (state->m_deferring_p
3149 && (tree_to_shwi (TYPE_SIZE (type))
3150 <= param_avoid_fma_max_bits));
3151 bool defer = check_defer;
3152 bool seen_negate_p = false;
3153 /* Make sure that the multiplication statement becomes dead after
3154 the transformation, thus that all uses are transformed to FMAs.
3155 This means we assume that an FMA operation has the same cost
3156 as an addition. */
3157 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3159 tree result = mul_result;
3160 bool negate_p = false;
3162 use_stmt = USE_STMT (use_p);
3164 if (is_gimple_debug (use_stmt))
3165 continue;
3167 /* For now restrict this operation to single basic blocks. In theory
3168 we would want to support sinking the multiplication in
3169 m = a*b;
3170 if ()
3171 ma = m + c;
3172 else
3173 d = m;
3174 to form a fma in the then block and sink the multiplication to the
3175 else block. */
3176 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3177 return false;
3179 /* A negate on the multiplication leads to FNMA. */
3180 if (is_gimple_assign (use_stmt)
3181 && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
3183 ssa_op_iter iter;
3184 use_operand_p usep;
3186 /* If (due to earlier missed optimizations) we have two
3187 negates of the same value, treat them as equivalent
3188 to a single negate with multiple uses. */
3189 if (seen_negate_p)
3190 return false;
3192 result = gimple_assign_lhs (use_stmt);
3194 /* Make sure the negate statement becomes dead with this
3195 single transformation. */
3196 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3197 &use_p, &neguse_stmt))
3198 return false;
3200 /* Make sure the multiplication isn't also used on that stmt. */
3201 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3202 if (USE_FROM_PTR (usep) == mul_result)
3203 return false;
3205 /* Re-validate. */
3206 use_stmt = neguse_stmt;
3207 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3208 return false;
3210 negate_p = seen_negate_p = true;
3213 tree cond, else_value, ops[3];
3214 tree_code code;
3215 if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code, ops,
3216 &else_value))
3217 return false;
3219 switch (code)
3221 case MINUS_EXPR:
3222 if (ops[1] == result)
3223 negate_p = !negate_p;
3224 break;
3225 case PLUS_EXPR:
3226 break;
3227 default:
3228 /* FMA can only be formed from PLUS and MINUS. */
3229 return false;
3232 if (mul_cond && cond != mul_cond)
3233 return false;
3235 if (cond)
3237 if (cond == result || else_value == result)
3238 return false;
3239 if (!direct_internal_fn_supported_p (IFN_COND_FMA, type, opt_type))
3240 return false;
3243 /* If the subtrahend (OPS[1]) is computed by a MULT_EXPR that
3244 we'll visit later, we might be able to get a more profitable
3245 match with fnma.
3246 OTOH, if we don't, a negate / fma pair has likely lower latency
3247 than a mult / subtract pair. */
3248 if (code == MINUS_EXPR
3249 && !negate_p
3250 && ops[0] == result
3251 && !direct_internal_fn_supported_p (IFN_FMS, type, opt_type)
3252 && direct_internal_fn_supported_p (IFN_FNMA, type, opt_type)
3253 && TREE_CODE (ops[1]) == SSA_NAME
3254 && has_single_use (ops[1]))
3256 gimple *stmt2 = SSA_NAME_DEF_STMT (ops[1]);
3257 if (is_gimple_assign (stmt2)
3258 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3259 return false;
3262 /* We can't handle a * b + a * b. */
3263 if (ops[0] == ops[1])
3264 return false;
3265 /* If deferring, make sure we are not looking at an instruction that
3266 wouldn't have existed if we were not. */
3267 if (state->m_deferring_p
3268 && (state->m_mul_result_set.contains (ops[0])
3269 || state->m_mul_result_set.contains (ops[1])))
3270 return false;
3272 if (check_defer)
3274 tree use_lhs = gimple_get_lhs (use_stmt);
3275 if (state->m_last_result)
3277 if (ops[1] == state->m_last_result
3278 || ops[0] == state->m_last_result)
3279 defer = true;
3280 else
3281 defer = false;
3283 else
3285 gcc_checking_assert (!state->m_initial_phi);
3286 gphi *phi;
3287 if (ops[0] == result)
3288 phi = result_of_phi (ops[1]);
3289 else
3291 gcc_assert (ops[1] == result);
3292 phi = result_of_phi (ops[0]);
3295 if (phi)
3297 state->m_initial_phi = phi;
3298 defer = true;
3300 else
3301 defer = false;
3304 state->m_last_result = use_lhs;
3305 check_defer = false;
3307 else
3308 defer = false;
3310 /* While it is possible to validate whether or not the exact form that
3311 we've recognized is available in the backend, the assumption is that
3312 if the deferring logic above did not trigger, the transformation is
3313 never a loss. For instance, suppose the target only has the plain FMA
3314 pattern available. Consider a*b-c -> fma(a,b,-c): we've exchanged
3315 MUL+SUB for FMA+NEG, which is still two operations. Consider
3316 -(a*b)-c -> fma(-a,b,-c): we still have 3 operations, but in the FMA
3317 form the two NEGs are independent and could be run in parallel. */
3320 if (defer)
3322 fma_transformation_info fti;
3323 fti.mul_stmt = mul_stmt;
3324 fti.mul_result = mul_result;
3325 fti.op1 = op1;
3326 fti.op2 = op2;
3327 state->m_candidates.safe_push (fti);
3328 state->m_mul_result_set.add (mul_result);
3330 if (dump_file && (dump_flags & TDF_DETAILS))
3332 fprintf (dump_file, "Deferred generating FMA for multiplication ");
3333 print_gimple_stmt (dump_file, mul_stmt, 0, TDF_NONE);
3334 fprintf (dump_file, "\n");
3337 return false;
3339 else
3341 if (state->m_deferring_p)
3342 cancel_fma_deferring (state);
3343 convert_mult_to_fma_1 (mul_result, op1, op2);
3344 return true;
3349 /* Helper function of match_uaddsub_overflow. Return 1
3350 if USE_STMT is unsigned overflow check ovf != 0 for
3351 STMT, -1 if USE_STMT is unsigned overflow check ovf == 0
3352 and 0 otherwise. */
3354 static int
3355 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3357 enum tree_code ccode = ERROR_MARK;
3358 tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3359 if (gimple_code (use_stmt) == GIMPLE_COND)
3361 ccode = gimple_cond_code (use_stmt);
3362 crhs1 = gimple_cond_lhs (use_stmt);
3363 crhs2 = gimple_cond_rhs (use_stmt);
3365 else if (is_gimple_assign (use_stmt))
3367 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3369 ccode = gimple_assign_rhs_code (use_stmt);
3370 crhs1 = gimple_assign_rhs1 (use_stmt);
3371 crhs2 = gimple_assign_rhs2 (use_stmt);
3373 else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3375 tree cond = gimple_assign_rhs1 (use_stmt);
3376 if (COMPARISON_CLASS_P (cond))
3378 ccode = TREE_CODE (cond);
3379 crhs1 = TREE_OPERAND (cond, 0);
3380 crhs2 = TREE_OPERAND (cond, 1);
3382 else
3383 return 0;
3385 else
3386 return 0;
3388 else
3389 return 0;
3391 if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3392 return 0;
3394 enum tree_code code = gimple_assign_rhs_code (stmt);
3395 tree lhs = gimple_assign_lhs (stmt);
3396 tree rhs1 = gimple_assign_rhs1 (stmt);
3397 tree rhs2 = gimple_assign_rhs2 (stmt);
3399 switch (ccode)
3401 case GT_EXPR:
3402 case LE_EXPR:
3403 /* r = a - b; r > a or r <= a
3404 r = a + b; a > r or a <= r or b > r or b <= r. */
3405 if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3406 || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3407 && crhs2 == lhs))
3408 return ccode == GT_EXPR ? 1 : -1;
3409 break;
3410 case LT_EXPR:
3411 case GE_EXPR:
3412 /* r = a - b; a < r or a >= r
3413 r = a + b; r < a or r >= a or r < b or r >= b. */
3414 if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3415 || (code == PLUS_EXPR && crhs1 == lhs
3416 && (crhs2 == rhs1 || crhs2 == rhs2)))
3417 return ccode == LT_EXPR ? 1 : -1;
3418 break;
3419 default:
3420 break;
3422 return 0;
3425 /* Recognize for unsigned x
3426 x = y - z;
3427 if (x > y)
3428 where there are other uses of x and replace it with
3429 _7 = SUB_OVERFLOW (y, z);
3430 x = REALPART_EXPR <_7>;
3431 _8 = IMAGPART_EXPR <_7>;
3432 if (_8)
3433 and similarly for addition. */
3435 static bool
3436 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3437 enum tree_code code)
3439 tree lhs = gimple_assign_lhs (stmt);
3440 tree type = TREE_TYPE (lhs);
3441 use_operand_p use_p;
3442 imm_use_iterator iter;
3443 bool use_seen = false;
3444 bool ovf_use_seen = false;
3445 gimple *use_stmt;
3447 gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3448 if (!INTEGRAL_TYPE_P (type)
3449 || !TYPE_UNSIGNED (type)
3450 || has_zero_uses (lhs)
3451 || has_single_use (lhs)
3452 || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3453 TYPE_MODE (type)) == CODE_FOR_nothing)
3454 return false;
3456 FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3458 use_stmt = USE_STMT (use_p);
3459 if (is_gimple_debug (use_stmt))
3460 continue;
3462 if (uaddsub_overflow_check_p (stmt, use_stmt))
3463 ovf_use_seen = true;
3464 else
3465 use_seen = true;
3466 if (ovf_use_seen && use_seen)
3467 break;
3470 if (!ovf_use_seen || !use_seen)
3471 return false;
3473 tree ctype = build_complex_type (type);
3474 tree rhs1 = gimple_assign_rhs1 (stmt);
3475 tree rhs2 = gimple_assign_rhs2 (stmt);
3476 gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3477 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3478 2, rhs1, rhs2);
3479 tree ctmp = make_ssa_name (ctype);
3480 gimple_call_set_lhs (g, ctmp);
3481 gsi_insert_before (gsi, g, GSI_SAME_STMT);
3482 gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3483 build1 (REALPART_EXPR, type, ctmp));
3484 gsi_replace (gsi, g2, true);
3485 tree ovf = make_ssa_name (type);
3486 g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3487 build1 (IMAGPART_EXPR, type, ctmp));
3488 gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3490 FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3492 if (is_gimple_debug (use_stmt))
3493 continue;
3495 int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3496 if (ovf_use == 0)
3497 continue;
3498 if (gimple_code (use_stmt) == GIMPLE_COND)
3500 gcond *cond_stmt = as_a <gcond *> (use_stmt);
3501 gimple_cond_set_lhs (cond_stmt, ovf);
3502 gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3503 gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3505 else
3507 gcc_checking_assert (is_gimple_assign (use_stmt));
3508 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3510 gimple_assign_set_rhs1 (use_stmt, ovf);
3511 gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3512 gimple_assign_set_rhs_code (use_stmt,
3513 ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3515 else
3517 gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3518 == COND_EXPR);
3519 tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3520 boolean_type_node, ovf,
3521 build_int_cst (type, 0));
3522 gimple_assign_set_rhs1 (use_stmt, cond);
3525 update_stmt (use_stmt);
3527 return true;
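/* A concrete C-level trigger for the above (hypothetical, assuming the
   target implements usubv4):

     int f (unsigned y, unsigned z, unsigned *out)
     {
       unsigned x = y - z;
       *out = x;               ordinary use of x
       return x > y;           overflow-check use of x
     }

   Both kinds of use must be present, as tested above; with only the
   comparison the statement has a single use and is left alone.  */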
3530 /* Return true if target has support for divmod. */
3532 static bool
3533 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3535 /* If target supports hardware divmod insn, use it for divmod. */
3536 if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3537 return true;
3539 /* Check if libfunc for divmod is available. */
3540 rtx libfunc = optab_libfunc (divmod_optab, mode);
3541 if (libfunc != NULL_RTX)
3543 /* If optab_handler exists for div_optab, perhaps in a wider mode,
3544 we don't want to use the libfunc even if it exists for the given mode. */
3545 machine_mode div_mode;
3546 FOR_EACH_MODE_FROM (div_mode, mode)
3547 if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3548 return false;
3550 return targetm.expand_divmod_libfunc != NULL;
3553 return false;
3556 /* Check if stmt is candidate for divmod transform. */
3558 static bool
3559 divmod_candidate_p (gassign *stmt)
3561 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3562 machine_mode mode = TYPE_MODE (type);
3563 optab divmod_optab, div_optab;
3565 if (TYPE_UNSIGNED (type))
3567 divmod_optab = udivmod_optab;
3568 div_optab = udiv_optab;
3570 else
3572 divmod_optab = sdivmod_optab;
3573 div_optab = sdiv_optab;
3576 tree op1 = gimple_assign_rhs1 (stmt);
3577 tree op2 = gimple_assign_rhs2 (stmt);
3579 /* Disable the transform if either operand is a constant, since
3580 division-by-constant may have specialized expansion. */
3581 if (CONSTANT_CLASS_P (op1))
3582 return false;
3584 if (CONSTANT_CLASS_P (op2))
3586 if (integer_pow2p (op2))
3587 return false;
3589 if (TYPE_PRECISION (type) <= HOST_BITS_PER_WIDE_INT
3590 && TYPE_PRECISION (type) <= BITS_PER_WORD)
3591 return false;
3593 /* If the divisor is not a power of 2 and the precision is wider than
3594 HWI, expand_divmod punts on that, so in that case it is better
3595 to use the divmod optab or libfunc. Similarly if choose_multiplier
3596 might need pre/post shifts of BITS_PER_WORD or more. */
3599 /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3600 expand using the [su]divv optabs. */
3601 if (TYPE_OVERFLOW_TRAPS (type))
3602 return false;
3604 if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3605 return false;
3607 return true;
3610 /* This function looks for:
3611 t1 = a TRUNC_DIV_EXPR b;
3612 t2 = a TRUNC_MOD_EXPR b;
3613 and transforms it to the following sequence:
3614 complex_tmp = DIVMOD (a, b);
3615 t1 = REALPART_EXPR(complex_tmp);
3616 t2 = IMAGPART_EXPR(complex_tmp);
3617 For conditions enabling the transform see divmod_candidate_p().
3619 The pass has three parts:
3620 1) Find top_stmt, which is a trunc_div or trunc_mod stmt that dominates all
3621 other trunc_div_expr and trunc_mod_expr stmts.
3622 2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
3623 to the stmts vector.
3624 3) Insert the DIVMOD call just before top_stmt and update entries in the
3625 stmts vector to use the return value of DIVMOD (REALPART_EXPR for div,
3626 IMAGPART_EXPR for mod). */
3628 static bool
3629 convert_to_divmod (gassign *stmt)
3631 if (stmt_can_throw_internal (cfun, stmt)
3632 || !divmod_candidate_p (stmt))
3633 return false;
3635 tree op1 = gimple_assign_rhs1 (stmt);
3636 tree op2 = gimple_assign_rhs2 (stmt);
3638 imm_use_iterator use_iter;
3639 gimple *use_stmt;
3640 auto_vec<gimple *> stmts;
3642 gimple *top_stmt = stmt;
3643 basic_block top_bb = gimple_bb (stmt);
3645 /* Part 1: Try to set top_stmt to "topmost" stmt that dominates
3646 at least stmt and possibly other trunc_div/trunc_mod stmts
3647 having same operands as stmt. */
3649 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
3651 if (is_gimple_assign (use_stmt)
3652 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3653 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3654 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
3655 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
3657 if (stmt_can_throw_internal (cfun, use_stmt))
3658 continue;
3660 basic_block bb = gimple_bb (use_stmt);
3662 if (bb == top_bb)
3664 if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
3665 top_stmt = use_stmt;
3667 else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
3669 top_bb = bb;
3670 top_stmt = use_stmt;
3675 tree top_op1 = gimple_assign_rhs1 (top_stmt);
3676 tree top_op2 = gimple_assign_rhs2 (top_stmt);
3678 stmts.safe_push (top_stmt);
3679 bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
3681 /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
3682 to the stmts vector. The 2nd loop will always add stmt to the stmts
3683 vector, since gimple_bb (top_stmt) dominates gimple_bb (stmt), so the
3684 2nd loop ends up adding at least a single trunc_mod_expr stmt. */
3686 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
3688 if (is_gimple_assign (use_stmt)
3689 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3690 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3691 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
3692 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
3694 if (use_stmt == top_stmt
3695 || stmt_can_throw_internal (cfun, use_stmt)
3696 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
3697 continue;
3699 stmts.safe_push (use_stmt);
3700 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
3701 div_seen = true;
3705 if (!div_seen)
3706 return false;
3708 /* Part 3: Create a call to the internal fn DIVMOD:
3709 divmod_tmp = DIVMOD (op1, op2). */
3711 gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
3712 tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
3713 call_stmt, "divmod_tmp");
3714 gimple_call_set_lhs (call_stmt, res);
3715 /* We rejected throwing statements above. */
3716 gimple_call_set_nothrow (call_stmt, true);
3718 /* Insert the call before top_stmt. */
3719 gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
3720 gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
3722 widen_mul_stats.divmod_calls_inserted++;
3724 /* Update all statements in stmts vector:
3725 lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
3726 lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>. */
3728 for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
3730 tree new_rhs;
3732 switch (gimple_assign_rhs_code (use_stmt))
3734 case TRUNC_DIV_EXPR:
3735 new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
3736 break;
3738 case TRUNC_MOD_EXPR:
3739 new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
3740 break;
3742 default:
3743 gcc_unreachable ();
3746 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3747 gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
3748 update_stmt (use_stmt);
3751 return true;
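/* A concrete C-level trigger (hypothetical input):

     void f (int n, int d, int *q, int *r)
     {
       *q = n / d;
       *r = n % d;
     }

   With non-constant operands, the division dominating the modulo is
   replaced by a DIVMOD internal call whose REALPART_EXPR/IMAGPART_EXPR
   feed both stores, so targets with a combined divide/modulo
   instruction or libfunc compute the pair in one operation.  */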
3754 /* Find integer multiplications where the operands are extended from
3755 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3756 where appropriate. */

namespace {

const pass_data pass_data_optimize_widening_mul =
{
  GIMPLE_PASS, /* type */
  "widening_mul", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_TREE_WIDEN_MUL, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_optimize_widening_mul : public gimple_opt_pass
{
public:
  pass_optimize_widening_mul (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return flag_expensive_optimizations && optimize;
    }

  virtual unsigned int execute (function *);

}; // class pass_optimize_widening_mul

/* Walker class to perform the transformation in reverse dominance order.  */

class math_opts_dom_walker : public dom_walker
{
public:
  /* Constructor, CFG_CHANGED is a pointer to a boolean flag that will be set
     if walking modifies the CFG.  */

  math_opts_dom_walker (bool *cfg_changed_p)
    : dom_walker (CDI_DOMINATORS), m_last_result_set (),
      m_cfg_changed_p (cfg_changed_p) {}

  /* The actual actions performed in the walk.  */

  virtual void after_dom_children (basic_block);

  /* Set of results of chains of multiply and add statement combinations that
     were not transformed into FMAs because of active deferring.  */
  hash_set<tree> m_last_result_set;

  /* Pointer to a flag of the user that needs to be set if CFG has been
     modified.  */
  bool *m_cfg_changed_p;
};

void
math_opts_dom_walker::after_dom_children (basic_block bb)
{
  gimple_stmt_iterator gsi;

  fma_deferring_state fma_state (param_avoid_fma_max_bits > 0);

  for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
    {
      gimple *stmt = gsi_stmt (gsi);
      enum tree_code code;

      if (is_gimple_assign (stmt))
        {
          code = gimple_assign_rhs_code (stmt);
          switch (code)
            {
            case MULT_EXPR:
              if (!convert_mult_to_widen (stmt, &gsi)
                  && !convert_expand_mult_copysign (stmt, &gsi)
                  && convert_mult_to_fma (stmt,
                                          gimple_assign_rhs1 (stmt),
                                          gimple_assign_rhs2 (stmt),
                                          &fma_state))
                {
                  gsi_remove (&gsi, true);
                  release_defs (stmt);
                  continue;
                }
              break;

            case PLUS_EXPR:
            case MINUS_EXPR:
              if (!convert_plusminus_to_widen (&gsi, stmt, code))
                match_uaddsub_overflow (&gsi, stmt, code);
              break;

            case TRUNC_MOD_EXPR:
              convert_to_divmod (as_a<gassign *> (stmt));
              break;

            default:;
            }
        }
      else if (is_gimple_call (stmt))
        {
          switch (gimple_call_combined_fn (stmt))
            {
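            /* pow (x, 2.0) computes x * x, so it can feed an FMA just
               like an explicit multiplication would.  */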
            CASE_CFN_POW:
              if (gimple_call_lhs (stmt)
                  && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
                  && real_equal (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
                                 &dconst2)
                  && convert_mult_to_fma (stmt,
                                          gimple_call_arg (stmt, 0),
                                          gimple_call_arg (stmt, 0),
                                          &fma_state))
                {
                  unlink_stmt_vdef (stmt);
                  if (gsi_remove (&gsi, true)
                      && gimple_purge_dead_eh_edges (bb))
                    *m_cfg_changed_p = true;
                  release_defs (stmt);
                  continue;
                }
              break;
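
            /* A conditional multiplication (IFN_COND_MUL) can likewise
               be fused into a conditional FMA; the mask operand is
               passed through to convert_mult_to_fma.  */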
            case CFN_COND_MUL:
              if (convert_mult_to_fma (stmt,
                                       gimple_call_arg (stmt, 1),
                                       gimple_call_arg (stmt, 2),
                                       &fma_state,
                                       gimple_call_arg (stmt, 0)))
                {
                  gsi_remove (&gsi, true);
                  release_defs (stmt);
                  continue;
                }
              break;
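
            /* A call that does not map to any combined function ends the
               current chain of deferred multiply-add candidates.  */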
            case CFN_LAST:
              cancel_fma_deferring (&fma_state);
              break;

            default:
              break;
            }
        }
      gsi_next (&gsi);
    }

  if (fma_state.m_deferring_p
      && fma_state.m_initial_phi)
    {
      gcc_checking_assert (fma_state.m_last_result);
      if (!last_fma_candidate_feeds_initial_phi (&fma_state,
                                                 &m_last_result_set))
        cancel_fma_deferring (&fma_state);
      else
        m_last_result_set.add (fma_state.m_last_result);
    }
}

unsigned int
pass_optimize_widening_mul::execute (function *fun)
{
  bool cfg_changed = false;

  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
  calculate_dominance_info (CDI_DOMINATORS);
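  /* Statement uids give an ordering within each block; convert_to_divmod
     uses them to pick the topmost div/mod statement in a block.  */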
  renumber_gimple_stmt_uids (cfun);

  math_opts_dom_walker (&cfg_changed).walk (ENTRY_BLOCK_PTR_FOR_FN (cfun));

  statistics_counter_event (fun, "widening multiplications inserted",
                            widen_mul_stats.widen_mults_inserted);
  statistics_counter_event (fun, "widening maccs inserted",
                            widen_mul_stats.maccs_inserted);
  statistics_counter_event (fun, "fused multiply-adds inserted",
                            widen_mul_stats.fmas_inserted);
  statistics_counter_event (fun, "divmod calls inserted",
                            widen_mul_stats.divmod_calls_inserted);

  return cfg_changed ? TODO_cleanup_cfg : 0;
}

} // anon namespace

gimple_opt_pass *
make_pass_optimize_widening_mul (gcc::context *ctxt)
{
  return new pass_optimize_widening_mul (ctxt);
}