1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2015 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
28 that can be optimized to
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
43 First of all, experiments have shown that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has fewer than N divisions *post-dominating* it.
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and their common dominators,
63 and walks it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "alias.h"
102 #include "fold-const.h"
103 #include "gimple-fold.h"
104 #include "gimple-iterator.h"
105 #include "gimplify.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-dfa.h"
110 #include "tree-ssa.h"
111 #include "builtins.h"
112 #include "params.h"
113 #include "case-cfn-macros.h"
115 /* This structure represents one basic block that either computes a
116 division, or is a common dominator for basic blocks that compute a
117 division. */
118 struct occurrence {
119 /* The basic block represented by this structure. */
120 basic_block bb;
122 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
123 inserted in BB. */
124 tree recip_def;
126 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
127 was inserted in BB. */
128 gimple *recip_def_stmt;
130 /* Pointer to a list of "struct occurrence"s for blocks dominated
131 by BB. */
132 struct occurrence *children;
134 /* Pointer to the next "struct occurrence" in the list of blocks
135 sharing a common dominator. */
136 struct occurrence *next;
138 /* The number of divisions that are in BB before compute_merit. The
139 number of divisions that are in BB or post-dominate it after
140 compute_merit. */
141 int num_divisions;
143 /* True if the basic block has a division, false if it is a common
144 dominator for basic blocks that do. If it is false and trapping
145 math is active, BB is not a candidate for inserting a reciprocal. */
146 bool bb_has_division;
149 static struct
151 /* Number of 1.0/X ops inserted. */
152 int rdivs_inserted;
154 /* Number of 1.0/FUNC ops inserted. */
155 int rfuncs_inserted;
156 } reciprocal_stats;
158 static struct
160 /* Number of cexpi calls inserted. */
161 int inserted;
162 } sincos_stats;
164 static struct
166 /* Number of hand-written 16-bit nop / bswaps found. */
167 int found_16bit;
169 /* Number of hand-written 32-bit nop / bswaps found. */
170 int found_32bit;
172 /* Number of hand-written 64-bit nop / bswaps found. */
173 int found_64bit;
174 } nop_stats, bswap_stats;
176 static struct
178 /* Number of widening multiplication ops inserted. */
179 int widen_mults_inserted;
181 /* Number of integer multiply-and-accumulate ops inserted. */
182 int maccs_inserted;
184 /* Number of fp fused multiply-add ops inserted. */
185 int fmas_inserted;
186 } widen_mul_stats;
188 /* The instance of "struct occurrence" representing the highest
189 interesting block in the dominator tree. */
190 static struct occurrence *occ_head;
192 /* Allocation pool for getting instances of "struct occurrence". */
193 static object_allocator<occurrence> *occ_pool;
197 /* Allocate and return a new struct occurrence for basic block BB,
198 whose children list is headed by CHILDREN. */
199 static struct occurrence *
200 occ_new (basic_block bb, struct occurrence *children)
202 struct occurrence *occ;
204 bb->aux = occ = occ_pool->allocate ();
205 memset (occ, 0, sizeof (struct occurrence));
207 occ->bb = bb;
208 occ->children = children;
209 return occ;
213 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
214 list of "struct occurrence"s, one per basic block, having IDOM as
215 their common dominator.
217 We try to insert NEW_OCC as deep as possible in the tree, and we also
218 insert any other block that is a common dominator for BB and one
219 block already in the tree. */
221 static void
222 insert_bb (struct occurrence *new_occ, basic_block idom,
223 struct occurrence **p_head)
225 struct occurrence *occ, **p_occ;
227 for (p_occ = p_head; (occ = *p_occ) != NULL; )
229 basic_block bb = new_occ->bb, occ_bb = occ->bb;
230 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
231 if (dom == bb)
233 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
234 from its list. */
235 *p_occ = occ->next;
236 occ->next = new_occ->children;
237 new_occ->children = occ;
239 /* Try the next block (it may as well be dominated by BB). */
242 else if (dom == occ_bb)
244 /* OCC_BB dominates BB. Tail recurse to look deeper. */
245 insert_bb (new_occ, dom, &occ->children);
246 return;
249 else if (dom != idom)
251 gcc_assert (!dom->aux);
253 /* There is a dominator between IDOM and BB, add it and make
254 two children out of NEW_OCC and OCC. First, remove OCC from
255 its list. */
256 *p_occ = occ->next;
257 new_occ->next = occ;
258 occ->next = NULL;
260 /* None of the previous blocks has DOM as a dominator: if we tail
261 recursed, we would reexamine them uselessly. Just switch BB with
262 DOM, and go on looking for blocks dominated by DOM. */
263 new_occ = occ_new (dom, new_occ);
266 else
268 /* Nothing special, go on with the next element. */
269 p_occ = &occ->next;
273 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
274 new_occ->next = *p_head;
275 *p_head = new_occ;
278 /* Register that we found a division in BB. */
280 static inline void
281 register_division_in (basic_block bb)
283 struct occurrence *occ;
285 occ = (struct occurrence *) bb->aux;
286 if (!occ)
288 occ = occ_new (bb, NULL);
289 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
292 occ->bb_has_division = true;
293 occ->num_divisions++;
297 /* Compute the number of divisions that postdominate each block in OCC and
298 its children. */
300 static void
301 compute_merit (struct occurrence *occ)
303 struct occurrence *occ_child;
304 basic_block dom = occ->bb;
306 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
308 basic_block bb;
309 if (occ_child->children)
310 compute_merit (occ_child);
312 if (flag_exceptions)
313 bb = single_noncomplex_succ (dom);
314 else
315 bb = dom;
317 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
318 occ->num_divisions += occ_child->num_divisions;
323 /* Return whether USE_STMT is a floating-point division by DEF. */
324 static inline bool
325 is_division_by (gimple *use_stmt, tree def)
327 return is_gimple_assign (use_stmt)
328 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
329 && gimple_assign_rhs2 (use_stmt) == def
330 /* Do not recognize x / x as valid division, as we are getting
331 confused later by replacing all immediate uses x in such
332 a stmt. */
333 && gimple_assign_rhs1 (use_stmt) != def;
336 /* Walk the subset of the dominator tree rooted at OCC, setting the
337 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
338 the given basic block. The field may be left NULL, of course,
339 if it is not possible or profitable to do the optimization.
341 DEF_GSI is an iterator pointing at the statement defining DEF.
342 If RECIP_DEF is set, a dominator already has a computation that can
343 be used. */
345 static void
346 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
347 tree def, tree recip_def, int threshold)
349 tree type;
350 gassign *new_stmt;
351 gimple_stmt_iterator gsi;
352 struct occurrence *occ_child;
354 if (!recip_def
355 && (occ->bb_has_division || !flag_trapping_math)
356 && occ->num_divisions >= threshold)
358 /* Make a variable with the replacement and substitute it. */
359 type = TREE_TYPE (def);
360 recip_def = create_tmp_reg (type, "reciptmp");
361 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
362 build_one_cst (type), def);
364 if (occ->bb_has_division)
366 /* Case 1: insert before an existing division. */
367 gsi = gsi_after_labels (occ->bb);
368 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
369 gsi_next (&gsi);
371 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
373 else if (def_gsi && occ->bb == def_gsi->bb)
375 /* Case 2: insert right after the definition. Note that this will
376 never happen if the definition statement can throw, because in
377 that case the sole successor of the statement's basic block will
378 dominate all the uses as well. */
379 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
381 else
383 /* Case 3: insert in a basic block not containing defs/uses. */
384 gsi = gsi_after_labels (occ->bb);
385 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
388 reciprocal_stats.rdivs_inserted++;
390 occ->recip_def_stmt = new_stmt;
393 occ->recip_def = recip_def;
394 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
395 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
399 /* Replace the division at USE_P with a multiplication by the reciprocal, if
400 possible. */
402 static inline void
403 replace_reciprocal (use_operand_p use_p)
405 gimple *use_stmt = USE_STMT (use_p);
406 basic_block bb = gimple_bb (use_stmt);
407 struct occurrence *occ = (struct occurrence *) bb->aux;
409 if (optimize_bb_for_speed_p (bb)
410 && occ->recip_def && use_stmt != occ->recip_def_stmt)
412 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
413 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
414 SET_USE (use_p, occ->recip_def);
415 fold_stmt_inplace (&gsi);
416 update_stmt (use_stmt);
421 /* Free OCC and return one more "struct occurrence" to be freed. */
423 static struct occurrence *
424 free_bb (struct occurrence *occ)
426 struct occurrence *child, *next;
428 /* First get the two pointers hanging off OCC. */
429 next = occ->next;
430 child = occ->children;
431 occ->bb->aux = NULL;
432 occ_pool->remove (occ);
434 /* Now ensure that we don't recurse unless it is necessary. */
435 if (!child)
436 return next;
437 else
439 while (next)
440 next = free_bb (next);
442 return child;
447 /* Look for floating-point divisions among DEF's uses, and try to
448 replace them by multiplications with the reciprocal. Add
449 as many statements computing the reciprocal as needed.
451 DEF must be a GIMPLE register of a floating-point type. */
453 static void
454 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
456 use_operand_p use_p;
457 imm_use_iterator use_iter;
458 struct occurrence *occ;
459 int count = 0, threshold;
461 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
463 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
465 gimple *use_stmt = USE_STMT (use_p);
466 if (is_division_by (use_stmt, def))
468 register_division_in (gimple_bb (use_stmt));
469 count++;
473 /* Do the expensive part only if we can hope to optimize something. */
474 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
475 if (count >= threshold)
477 gimple *use_stmt;
478 for (occ = occ_head; occ; occ = occ->next)
480 compute_merit (occ);
481 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
484 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
486 if (is_division_by (use_stmt, def))
488 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
489 replace_reciprocal (use_p);
494 for (occ = occ_head; occ; )
495 occ = free_bb (occ);
497 occ_head = NULL;
500 /* Go through all the floating-point SSA_NAMEs, and call
501 execute_cse_reciprocals_1 on each of them. */
502 namespace {
504 const pass_data pass_data_cse_reciprocals =
506 GIMPLE_PASS, /* type */
507 "recip", /* name */
508 OPTGROUP_NONE, /* optinfo_flags */
509 TV_NONE, /* tv_id */
510 PROP_ssa, /* properties_required */
511 0, /* properties_provided */
512 0, /* properties_destroyed */
513 0, /* todo_flags_start */
514 TODO_update_ssa, /* todo_flags_finish */
517 class pass_cse_reciprocals : public gimple_opt_pass
519 public:
520 pass_cse_reciprocals (gcc::context *ctxt)
521 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
524 /* opt_pass methods: */
525 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
526 virtual unsigned int execute (function *);
528 }; // class pass_cse_reciprocals
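/* Illustrative sketch, not part of the pass: assuming a target whose
   min_divisions_for_recip_mul threshold is at most 3, compiling

     void normalize (float *x, float *y, float *z, float modulus)
     {
       *x = *x / modulus;
       *y = *y / modulus;
       *z = *z / modulus;
     }

   with -O2 -freciprocal-math (implied by -ffast-math) gives three
   divisions by the same parameter.  The DECL_ARGUMENTS loop in
   execute () below therefore hands the default definition of MODULUS
   to execute_cse_reciprocals_1, which inserts "reciptmp = 1.0f / modulus"
   once and turns each division into a multiplication by RECIPTMP.  */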
530 unsigned int
531 pass_cse_reciprocals::execute (function *fun)
533 basic_block bb;
534 tree arg;
536 occ_pool = new object_allocator<occurrence> ("dominators for recip");
538 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
539 calculate_dominance_info (CDI_DOMINATORS);
540 calculate_dominance_info (CDI_POST_DOMINATORS);
542 if (flag_checking)
543 FOR_EACH_BB_FN (bb, fun)
544 gcc_assert (!bb->aux);
546 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
547 if (FLOAT_TYPE_P (TREE_TYPE (arg))
548 && is_gimple_reg (arg))
550 tree name = ssa_default_def (fun, arg);
551 if (name)
552 execute_cse_reciprocals_1 (NULL, name);
555 FOR_EACH_BB_FN (bb, fun)
557 tree def;
559 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
560 gsi_next (&gsi))
562 gphi *phi = gsi.phi ();
563 def = PHI_RESULT (phi);
564 if (! virtual_operand_p (def)
565 && FLOAT_TYPE_P (TREE_TYPE (def)))
566 execute_cse_reciprocals_1 (NULL, def);
569 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
570 gsi_next (&gsi))
572 gimple *stmt = gsi_stmt (gsi);
574 if (gimple_has_lhs (stmt)
575 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
576 && FLOAT_TYPE_P (TREE_TYPE (def))
577 && TREE_CODE (def) == SSA_NAME)
578 execute_cse_reciprocals_1 (&gsi, def);
581 if (optimize_bb_for_size_p (bb))
582 continue;
584 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
585 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
586 gsi_next (&gsi))
588 gimple *stmt = gsi_stmt (gsi);
589 tree fndecl;
591 if (is_gimple_assign (stmt)
592 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
594 tree arg1 = gimple_assign_rhs2 (stmt);
595 gimple *stmt1;
597 if (TREE_CODE (arg1) != SSA_NAME)
598 continue;
600 stmt1 = SSA_NAME_DEF_STMT (arg1);
602 if (is_gimple_call (stmt1)
603 && gimple_call_lhs (stmt1)
604 && (fndecl = gimple_call_fndecl (stmt1))
605 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
606 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
608 enum built_in_function code;
609 bool md_code, fail;
610 imm_use_iterator ui;
611 use_operand_p use_p;
613 code = DECL_FUNCTION_CODE (fndecl);
614 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
616 fndecl = targetm.builtin_reciprocal (code, md_code, false);
617 if (!fndecl)
618 continue;
620 /* Check that all uses of the SSA name are divisions,
621 otherwise replacing the defining statement will do
622 the wrong thing. */
623 fail = false;
624 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
626 gimple *stmt2 = USE_STMT (use_p);
627 if (is_gimple_debug (stmt2))
628 continue;
629 if (!is_gimple_assign (stmt2)
630 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
631 || gimple_assign_rhs1 (stmt2) == arg1
632 || gimple_assign_rhs2 (stmt2) != arg1)
634 fail = true;
635 break;
638 if (fail)
639 continue;
641 gimple_replace_ssa_lhs (stmt1, arg1);
642 gimple_call_set_fndecl (stmt1, fndecl);
643 update_stmt (stmt1);
644 reciprocal_stats.rfuncs_inserted++;
646 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
648 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
649 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
650 fold_stmt_inplace (&gsi);
651 update_stmt (stmt);
658 statistics_counter_event (fun, "reciprocal divs inserted",
659 reciprocal_stats.rdivs_inserted);
660 statistics_counter_event (fun, "reciprocal functions inserted",
661 reciprocal_stats.rfuncs_inserted);
663 free_dominance_info (CDI_DOMINATORS);
664 free_dominance_info (CDI_POST_DOMINATORS);
665 delete occ_pool;
666 return 0;
669 } // anon namespace
671 gimple_opt_pass *
672 make_pass_cse_reciprocals (gcc::context *ctxt)
674 return new pass_cse_reciprocals (ctxt);
677 /* Records an occurrence at statement USE_STMT in the statement vector
678 STMTS if its basic block is dominated by *TOP_BB, dominates it, or if
679 *TOP_BB is not yet initialized. Returns true if the occurrence was pushed on
680 the vector. Adjusts *TOP_BB to be the basic block dominating all
681 statements in the vector. */
683 static bool
684 maybe_record_sincos (vec<gimple *> *stmts,
685 basic_block *top_bb, gimple *use_stmt)
687 basic_block use_bb = gimple_bb (use_stmt);
688 if (*top_bb
689 && (*top_bb == use_bb
690 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
691 stmts->safe_push (use_stmt);
692 else if (!*top_bb
693 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
695 stmts->safe_push (use_stmt);
696 *top_bb = use_bb;
698 else
699 return false;
701 return true;
704 /* Look for sin, cos and cexpi calls with the same argument NAME and
705 create a single call to cexpi CSEing the result in this case.
706 We first walk over all immediate uses of the argument collecting
707 statements that we can CSE in a vector and in a second pass replace
708 the statement rhs with a REALPART or IMAGPART expression on the
709 result of the cexpi call we insert before the use statement that
710 dominates all other candidates. */
712 static bool
713 execute_cse_sincos_1 (tree name)
715 gimple_stmt_iterator gsi;
716 imm_use_iterator use_iter;
717 tree fndecl, res, type;
718 gimple *def_stmt, *use_stmt, *stmt;
719 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
720 auto_vec<gimple *> stmts;
721 basic_block top_bb = NULL;
722 int i;
723 bool cfg_changed = false;
725 type = TREE_TYPE (name);
726 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
728 if (gimple_code (use_stmt) != GIMPLE_CALL
729 || !gimple_call_lhs (use_stmt))
730 continue;
732 switch (gimple_call_combined_fn (use_stmt))
734 CASE_CFN_COS:
735 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
736 break;
738 CASE_CFN_SIN:
739 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
740 break;
742 CASE_CFN_CEXPI:
743 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
744 break;
746 default:;
750 if (seen_cos + seen_sin + seen_cexpi <= 1)
751 return false;
753 /* Simply insert cexpi at the beginning of top_bb but not earlier than
754 the name def statement. */
755 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
756 if (!fndecl)
757 return false;
758 stmt = gimple_build_call (fndecl, 1, name);
759 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
760 gimple_call_set_lhs (stmt, res);
762 def_stmt = SSA_NAME_DEF_STMT (name);
763 if (!SSA_NAME_IS_DEFAULT_DEF (name)
764 && gimple_code (def_stmt) != GIMPLE_PHI
765 && gimple_bb (def_stmt) == top_bb)
767 gsi = gsi_for_stmt (def_stmt);
768 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
770 else
772 gsi = gsi_after_labels (top_bb);
773 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
775 sincos_stats.inserted++;
777 /* And adjust the recorded old call sites. */
778 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
780 tree rhs = NULL;
782 switch (gimple_call_combined_fn (use_stmt))
784 CASE_CFN_COS:
785 rhs = fold_build1 (REALPART_EXPR, type, res);
786 break;
788 CASE_CFN_SIN:
789 rhs = fold_build1 (IMAGPART_EXPR, type, res);
790 break;
792 CASE_CFN_CEXPI:
793 rhs = res;
794 break;
796 default:;
797 gcc_unreachable ();
800 /* Replace call with a copy. */
801 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
803 gsi = gsi_for_stmt (use_stmt);
804 gsi_replace (&gsi, stmt, true);
805 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
806 cfg_changed = true;
809 return cfg_changed;
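/* Illustrative sketch of the rewrite performed above (assuming the
   target libc provides cexpi or sincos; see the gate of the sincos
   pass below).  Starting from

     s_1 = sin (x_2);
     c_3 = cos (x_2);

   a single cexpi call is inserted in the dominating block and the old
   calls become component extractions:

     sincostmp_4 = cexpi (x_2);
     s_1 = IMAGPART_EXPR <sincostmp_4>;
     c_3 = REALPART_EXPR <sincostmp_4>;

   The SSA numbering here is invented for the example.  */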
812 /* To evaluate powi(x,n), the floating point value x raised to the
813 constant integer exponent n, we use a hybrid algorithm that
814 combines the "window method" with look-up tables. For an
815 introduction to exponentiation algorithms and "addition chains",
816 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
817 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
818 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
819 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
821 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
822 multiplications to inline before calling the system library's pow
823 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
824 so this default never requires calling pow, powf or powl. */
826 #ifndef POWI_MAX_MULTS
827 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
828 #endif
830 /* The size of the "optimal power tree" lookup table. All
831 exponents less than this value are simply looked up in the
832 powi_table below. This threshold is also used to size the
833 cache of pseudo registers that hold intermediate results. */
834 #define POWI_TABLE_SIZE 256
836 /* The size, in bits, of the window used in the "window method"
837 exponentiation algorithm. This is equivalent to a radix of
838 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
839 #define POWI_WINDOW_SIZE 3
841 /* The following table is an efficient representation of an
842 "optimal power tree". For each value, i, the corresponding
843 value, j, in the table states that an optimal evaluation
844 sequence for calculating pow(x,i) can be found by evaluating
845 pow(x,j)*pow(x,i-j). An optimal power tree for the first
846 100 integers is given in Knuth's "Seminumerical Algorithms". */
848 static const unsigned char powi_table[POWI_TABLE_SIZE] =
850 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
851 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
852 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
853 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
854 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
855 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
856 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
857 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
858 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
859 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
860 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
861 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
862 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
863 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
864 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
865 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
866 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
867 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
868 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
869 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
870 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
871 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
872 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
873 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
874 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
875 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
876 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
877 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
878 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
879 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
880 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
881 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
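/* Worked example for the table above: powi_table[7] is 4, so x**7 is
   evaluated as x**4 * x**3; recursing, powi_table[4] is 2 and
   powi_table[3] is 2, giving the four multiplications

     t2 = x  * x;
     t3 = t2 * x;
     t4 = t2 * t2;
     t7 = t4 * t3;

   which is also the value powi_lookup_cost (below) returns for an
   exponent of 7 once CACHE[1] has been seeded.  */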
885 /* Return the number of multiplications required to calculate
886 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
887 subroutine of powi_cost. CACHE is an array indicating
888 which exponents have already been calculated. */
890 static int
891 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
893 /* If we've already calculated this exponent, then this evaluation
894 doesn't require any additional multiplications. */
895 if (cache[n])
896 return 0;
898 cache[n] = true;
899 return powi_lookup_cost (n - powi_table[n], cache)
900 + powi_lookup_cost (powi_table[n], cache) + 1;
903 /* Return the number of multiplications required to calculate
904 powi(x,n) for an arbitrary x, given the exponent N. This
905 function needs to be kept in sync with powi_as_mults below. */
907 static int
908 powi_cost (HOST_WIDE_INT n)
910 bool cache[POWI_TABLE_SIZE];
911 unsigned HOST_WIDE_INT digit;
912 unsigned HOST_WIDE_INT val;
913 int result;
915 if (n == 0)
916 return 0;
918 /* Ignore the reciprocal when calculating the cost. */
919 val = (n < 0) ? -n : n;
921 /* Initialize the exponent cache. */
922 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
923 cache[1] = true;
925 result = 0;
927 while (val >= POWI_TABLE_SIZE)
929 if (val & 1)
931 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
932 result += powi_lookup_cost (digit, cache)
933 + POWI_WINDOW_SIZE + 1;
934 val >>= POWI_WINDOW_SIZE;
936 else
938 val >>= 1;
939 result++;
943 return result + powi_lookup_cost (val, cache);
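/* Worked example for powi_cost: N = 257 (binary 100000001).  VAL is odd,
   so the loop extracts the 3-bit window 1 (already cached, lookup cost 0)
   and charges POWI_WINDOW_SIZE + 1 = 4, then shifts VAL down to 32.
   powi_lookup_cost (32, cache) adds the five squarings for x**2, x**4,
   ..., x**32, for a total of 9, matching the eight squarings plus one
   extra multiplication of the classical square-and-multiply evaluation
   of x**257.  */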
946 /* Recursive subroutine of powi_as_mults. This function takes the
947 array, CACHE, of already calculated exponents and an exponent N and
948 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
950 static tree
951 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
952 HOST_WIDE_INT n, tree *cache)
954 tree op0, op1, ssa_target;
955 unsigned HOST_WIDE_INT digit;
956 gassign *mult_stmt;
958 if (n < POWI_TABLE_SIZE && cache[n])
959 return cache[n];
961 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
963 if (n < POWI_TABLE_SIZE)
965 cache[n] = ssa_target;
966 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
967 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
969 else if (n & 1)
971 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
972 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
973 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
975 else
977 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
978 op1 = op0;
981 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
982 gimple_set_location (mult_stmt, loc);
983 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
985 return ssa_target;
988 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
989 This function needs to be kept in sync with powi_cost above. */
991 static tree
992 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
993 tree arg0, HOST_WIDE_INT n)
995 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
996 gassign *div_stmt;
997 tree target;
999 if (n == 0)
1000 return build_real (type, dconst1);
1002 memset (cache, 0, sizeof (cache));
1003 cache[1] = arg0;
1005 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1006 if (n >= 0)
1007 return result;
1009 /* If the original exponent was negative, reciprocate the result. */
1010 target = make_temp_ssa_name (type, NULL, "powmult");
1011 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1012 build_real (type, dconst1), result);
1013 gimple_set_location (div_stmt, loc);
1014 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1016 return target;
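/* Illustrative sketch of the statements the two functions above emit for
   ARG0 = x and N = -5 (temporary numbering invented for the example; the
   real names are "powmult" temporaries):

     powmult_1 = x * x;                   x**2
     powmult_2 = x * powmult_1;           x**3, since powi_table[5] == 3
     powmult_3 = powmult_1 * powmult_2;   x**5
     powmult_4 = 1.0 / powmult_3;         reciprocal taken because N < 0

   powi_cost (-5) counts the same three multiplications, ignoring the
   final division as documented there.  */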
1019 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1020 location info LOC. If the arguments are appropriate, create an
1021 equivalent sequence of statements prior to GSI using an optimal
1022 number of multiplications, and return an expression holding the
1023 result. */
1025 static tree
1026 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1027 tree arg0, HOST_WIDE_INT n)
1029 /* Avoid the largest negative number, whose negation is not representable. */
1030 if (n != -n
1031 && ((n >= -1 && n <= 2)
1032 || (optimize_function_for_speed_p (cfun)
1033 && powi_cost (n) <= POWI_MAX_MULTS)))
1034 return powi_as_mults (gsi, loc, arg0, n);
1036 return NULL_TREE;
1039 /* Build a gimple call statement that calls FN with argument ARG.
1040 Set the lhs of the call statement to a fresh SSA name. Insert the
1041 statement prior to GSI's current position, and return the fresh
1042 SSA name. */
1044 static tree
1045 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1046 tree fn, tree arg)
1048 gcall *call_stmt;
1049 tree ssa_target;
1051 call_stmt = gimple_build_call (fn, 1, arg);
1052 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1053 gimple_set_lhs (call_stmt, ssa_target);
1054 gimple_set_location (call_stmt, loc);
1055 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1057 return ssa_target;
1060 /* Build a gimple binary operation with the given CODE and arguments
1061 ARG0, ARG1, assigning the result to a new SSA name for variable
1062 TARGET. Insert the statement prior to GSI's current position, and
1063 return the fresh SSA name.*/
1065 static tree
1066 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1067 const char *name, enum tree_code code,
1068 tree arg0, tree arg1)
1070 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1071 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1072 gimple_set_location (stmt, loc);
1073 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1074 return result;
1077 /* Build a gimple reference operation with the given CODE and argument
1078 ARG, assigning the result to a new SSA name of TYPE with NAME.
1079 Insert the statement prior to GSI's current position, and return
1080 the fresh SSA name. */
1082 static inline tree
1083 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1084 const char *name, enum tree_code code, tree arg0)
1086 tree result = make_temp_ssa_name (type, NULL, name);
1087 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1088 gimple_set_location (stmt, loc);
1089 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1090 return result;
1093 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1094 prior to GSI's current position, and return the fresh SSA name. */
1096 static tree
1097 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1098 tree type, tree val)
1100 tree result = make_ssa_name (type);
1101 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1102 gimple_set_location (stmt, loc);
1103 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1104 return result;
1107 struct pow_synth_sqrt_info
1109 bool *factors;
1110 unsigned int deepest;
1111 unsigned int num_mults;
1114 /* Return true iff the real value C can be represented as a
1115 sum of powers of 0.5 up to N. That is:
1116 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1117 Record in INFO the various parameters of the synthesis algorithm such
1118 as the factors a[i], the maximum 0.5 power and the number of
1119 multiplications that will be required. */
1121 bool
1122 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1123 struct pow_synth_sqrt_info *info)
1125 REAL_VALUE_TYPE factor = dconsthalf;
1126 REAL_VALUE_TYPE remainder = c;
1128 info->deepest = 0;
1129 info->num_mults = 0;
1130 memset (info->factors, 0, n * sizeof (bool));
1132 for (unsigned i = 0; i < n; i++)
1134 REAL_VALUE_TYPE res;
1136 /* If something inexact happened, bail out now. */
1137 if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1138 return false;
1140 /* We have hit zero. The number is representable as a sum
1141 of powers of 0.5. */
1142 if (real_equal (&res, &dconst0))
1144 info->factors[i] = true;
1145 info->deepest = i + 1;
1146 return true;
1148 else if (!REAL_VALUE_NEGATIVE (res))
1150 remainder = res;
1151 info->factors[i] = true;
1152 info->num_mults++;
1154 else
1155 info->factors[i] = false;
1157 real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1159 return false;
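/* Worked example for the routine above: C = 0.625 and N >= 3.
   Iteration 1: 0.625 - 0.5   =  0.125, nonnegative, so factors[0] = true
                and NUM_MULTS becomes 1.
   Iteration 2: 0.125 - 0.25  <  0, so factors[1] = false.
   Iteration 3: 0.125 - 0.125 =  0, so factors[2] = true, DEEPEST = 3 and
                the function returns true.
   In other words x**0.625 == sqrt (x) * sqrt (sqrt (sqrt (x))), one
   multiplication on top of the square-root chain, which is the shape
   used in the pow (x, 3.625) example further below.  */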
1162 /* Return the tree corresponding to FN being applied
1163 to ARG N times at GSI and LOC.
1164 Look up previous results from CACHE if need be.
1165 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1167 static tree
1168 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1169 tree fn, location_t loc, tree *cache)
1171 tree res = cache[n];
1172 if (!res)
1174 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1175 res = build_and_insert_call (gsi, loc, fn, prev);
1176 cache[n] = res;
1179 return res;
1182 /* Print to STREAM the repeated application of function FNAME to ARG
1183 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1184 "foo (foo (x))". */
1186 static void
1187 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1188 unsigned int n)
1190 if (n == 0)
1191 fprintf (stream, "%s", arg);
1192 else
1194 fprintf (stream, "%s (", fname);
1195 print_nested_fn (stream, fname, arg, n - 1);
1196 fprintf (stream, ")");
1200 /* Print to STREAM the fractional sequence of sqrt chains
1201 applied to ARG, described by INFO. Used for the dump file. */
1203 static void
1204 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1205 struct pow_synth_sqrt_info *info)
1207 for (unsigned int i = 0; i < info->deepest; i++)
1209 bool is_set = info->factors[i];
1210 if (is_set)
1212 print_nested_fn (stream, "sqrt", arg, i + 1);
1213 if (i != info->deepest - 1)
1214 fprintf (stream, " * ");
1219 /* Print to STREAM a representation of raising ARG to an integer
1220 power N. Used for the dump file. */
1222 static void
1223 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1225 if (n > 1)
1226 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1227 else if (n == 1)
1228 fprintf (stream, "%s", arg);
1231 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1232 square roots. Place at GSI and LOC. Limit the maximum depth
1233 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1234 result of the expanded sequence or NULL_TREE if the expansion failed.
1236 This routine assumes that ARG1 is a real number with a fractional part
1237 (the integer exponent case will have been handled earlier in
1238 gimple_expand_builtin_pow).
1240 For ARG1 > 0.0:
1241 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1242 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1243 FRAC_PART == ARG1 - WHOLE_PART:
1244 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1245 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1246 if it can be expressed as such, that is if FRAC_PART satisfies:
1247 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1248 where integer a[i] is either 0 or 1.
1250 Example:
1251 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1252 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1254 For ARG1 < 0.0 there are two approaches:
1255 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1256 is calculated as above.
1258 Example:
1259 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1260 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1262 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1263 FRAC_PART := ARG1 - WHOLE_PART
1264 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1265 Example:
1266 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1267 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1269 For ARG1 < 0.0 we choose between (A) and (B) depending on
1270 how many multiplications we'd have to do.
1271 So, for the example in (B): POW (x, -5.875), if we were to
1272 follow algorithm (A) we would produce:
1273 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1274 which contains more multiplications than approach (B).
1276 Hopefully, this approach will eliminate potentially expensive POW library
1277 calls when unsafe floating point math is enabled and allow the compiler to
1278 further optimise the multiplies, square roots and divides produced by this
1279 function. */
1281 static tree
1282 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1283 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1285 tree type = TREE_TYPE (arg0);
1286 machine_mode mode = TYPE_MODE (type);
1287 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1288 bool one_over = true;
1290 if (!sqrtfn)
1291 return NULL_TREE;
1293 if (TREE_CODE (arg1) != REAL_CST)
1294 return NULL_TREE;
1296 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1298 gcc_assert (max_depth > 0);
1299 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1301 struct pow_synth_sqrt_info synth_info;
1302 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1303 synth_info.deepest = 0;
1304 synth_info.num_mults = 0;
1306 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1307 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1309 /* The whole and fractional parts of exp. */
1310 REAL_VALUE_TYPE whole_part;
1311 REAL_VALUE_TYPE frac_part;
1313 real_floor (&whole_part, mode, &exp);
1314 real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1317 REAL_VALUE_TYPE ceil_whole = dconst0;
1318 REAL_VALUE_TYPE ceil_fract = dconst0;
1320 if (neg_exp)
1322 real_ceil (&ceil_whole, mode, &exp);
1323 real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1326 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1327 return NULL_TREE;
1329 /* Check whether it's more profitable to not use 1.0 / ... */
1330 if (neg_exp)
1332 struct pow_synth_sqrt_info alt_synth_info;
1333 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1334 alt_synth_info.deepest = 0;
1335 alt_synth_info.num_mults = 0;
1337 if (representable_as_half_series_p (ceil_fract, max_depth,
1338 &alt_synth_info)
1339 && alt_synth_info.deepest <= synth_info.deepest
1340 && alt_synth_info.num_mults < synth_info.num_mults)
1342 whole_part = ceil_whole;
1343 frac_part = ceil_fract;
1344 synth_info.deepest = alt_synth_info.deepest;
1345 synth_info.num_mults = alt_synth_info.num_mults;
1346 memcpy (synth_info.factors, alt_synth_info.factors,
1347 (max_depth + 1) * sizeof (bool));
1348 one_over = false;
1352 HOST_WIDE_INT n = real_to_integer (&whole_part);
1353 REAL_VALUE_TYPE cint;
1354 real_from_integer (&cint, VOIDmode, n, SIGNED);
1356 if (!real_identical (&whole_part, &cint))
1357 return NULL_TREE;
1359 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1360 return NULL_TREE;
1362 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1364 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1366 /* Calculate the integer part of the exponent. */
1367 if (n > 1)
1369 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1370 if (!integer_res)
1371 return NULL_TREE;
1374 if (dump_file)
1376 char string[64];
1378 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1379 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1381 if (neg_exp)
1383 if (one_over)
1385 fprintf (dump_file, "1.0 / (");
1386 dump_integer_part (dump_file, "x", n);
1387 if (n > 0)
1388 fprintf (dump_file, " * ");
1389 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1390 fprintf (dump_file, ")");
1392 else
1394 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1395 fprintf (dump_file, " / (");
1396 dump_integer_part (dump_file, "x", n);
1397 fprintf (dump_file, ")");
1400 else
1402 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1403 if (n > 0)
1404 fprintf (dump_file, " * ");
1405 dump_integer_part (dump_file, "x", n);
1408 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1412 tree fract_res = NULL_TREE;
1413 cache[0] = arg0;
1415 /* Calculate the fractional part of the exponent. */
1416 for (unsigned i = 0; i < synth_info.deepest; i++)
1418 if (synth_info.factors[i])
1420 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1422 if (!fract_res)
1423 fract_res = sqrt_chain;
1425 else
1426 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1427 fract_res, sqrt_chain);
1431 tree res = NULL_TREE;
1433 if (neg_exp)
1435 if (one_over)
1437 if (n > 0)
1438 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1439 fract_res, integer_res);
1440 else
1441 res = fract_res;
1443 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1444 build_real (type, dconst1), res);
1446 else
1448 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1449 fract_res, integer_res);
1452 else
1453 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1454 fract_res, integer_res);
1455 return res;
1458 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1459 with location info LOC. If possible, create an equivalent and
1460 less expensive sequence of statements prior to GSI, and return an
1461 expression holding the result. */
1463 static tree
1464 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1465 tree arg0, tree arg1)
1467 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1468 REAL_VALUE_TYPE c2, dconst3;
1469 HOST_WIDE_INT n;
1470 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1471 machine_mode mode;
1472 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1473 bool hw_sqrt_exists, c_is_int, c2_is_int;
1475 dconst1_4 = dconst1;
1476 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1478 /* If the exponent isn't a constant, there's nothing of interest
1479 to be done. */
1480 if (TREE_CODE (arg1) != REAL_CST)
1481 return NULL_TREE;
1483 /* If the exponent is equivalent to an integer, expand to an optimal
1484 multiplication sequence when profitable. */
1485 c = TREE_REAL_CST (arg1);
1486 n = real_to_integer (&c);
1487 real_from_integer (&cint, VOIDmode, n, SIGNED);
1488 c_is_int = real_identical (&c, &cint);
1490 if (c_is_int
1491 && ((n >= -1 && n <= 2)
1492 || (flag_unsafe_math_optimizations
1493 && speed_p
1494 && powi_cost (n) <= POWI_MAX_MULTS)))
1495 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1497 /* Attempt various optimizations using sqrt and cbrt. */
1498 type = TREE_TYPE (arg0);
1499 mode = TYPE_MODE (type);
1500 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1502 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1503 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1504 sqrt(-0) = -0. */
1505 if (sqrtfn
1506 && real_equal (&c, &dconsthalf)
1507 && !HONOR_SIGNED_ZEROS (mode))
1508 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1510 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1512 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1513 optimizations since 1./3. is not exactly representable. If x
1514 is negative and finite, the correct value of pow(x,1./3.) is
1515 a NaN with the "invalid" exception raised, because the value
1516 of 1./3. actually has an even denominator. The correct value
1517 of cbrt(x) is a negative real value. */
1518 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1519 dconst1_3 = real_value_truncate (mode, dconst_third ());
1521 if (flag_unsafe_math_optimizations
1522 && cbrtfn
1523 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1524 && real_equal (&c, &dconst1_3))
1525 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1527 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1528 if we don't have a hardware sqrt insn. */
1529 dconst1_6 = dconst1_3;
1530 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1532 if (flag_unsafe_math_optimizations
1533 && sqrtfn
1534 && cbrtfn
1535 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1536 && speed_p
1537 && hw_sqrt_exists
1538 && real_equal (&c, &dconst1_6))
1540 /* sqrt(x) */
1541 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1543 /* cbrt(sqrt(x)) */
1544 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1548 /* Attempt to expand the POW as a product of square root chains.
1549 Expand the 0.25 case even when optimising for size. */
1550 if (flag_unsafe_math_optimizations
1551 && sqrtfn
1552 && hw_sqrt_exists
1553 && (speed_p || real_equal (&c, &dconst1_4))
1554 && !HONOR_SIGNED_ZEROS (mode))
1556 unsigned int max_depth = speed_p
1557 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1558 : 2;
1560 tree expand_with_sqrts
1561 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1563 if (expand_with_sqrts)
1564 return expand_with_sqrts;
1567 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1568 n = real_to_integer (&c2);
1569 real_from_integer (&cint, VOIDmode, n, SIGNED);
1570 c2_is_int = real_identical (&c2, &cint);
1572 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1574 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1575 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1577 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1578 different from pow(x, 1./3.) due to rounding and behavior with
1579 negative x, we need to constrain this transformation to unsafe
1580 math and positive x or finite math. */
1581 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1582 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1583 real_round (&c2, mode, &c2);
1584 n = real_to_integer (&c2);
1585 real_from_integer (&cint, VOIDmode, n, SIGNED);
1586 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1587 real_convert (&c2, mode, &c2);
1589 if (flag_unsafe_math_optimizations
1590 && cbrtfn
1591 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1592 && real_identical (&c2, &c)
1593 && !c2_is_int
1594 && optimize_function_for_speed_p (cfun)
1595 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1597 tree powi_x_ndiv3 = NULL_TREE;
1599 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1600 possible or profitable, give up. Skip the degenerate case when
1601 abs(n) < 3, where the result is always 1. */
1602 if (absu_hwi (n) >= 3)
1604 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1605 abs_hwi (n / 3));
1606 if (!powi_x_ndiv3)
1607 return NULL_TREE;
1610 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1611 as that creates an unnecessary variable. Instead, just produce
1612 either cbrt(x) or cbrt(x) * cbrt(x). */
1613 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1615 if (absu_hwi (n) % 3 == 1)
1616 powi_cbrt_x = cbrt_x;
1617 else
1618 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1619 cbrt_x, cbrt_x);
1621 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1622 if (absu_hwi (n) < 3)
1623 result = powi_cbrt_x;
1624 else
1625 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1626 powi_x_ndiv3, powi_cbrt_x);
1628 /* If n is negative, reciprocate the result. */
1629 if (n < 0)
1630 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1631 build_real (type, dconst1), result);
1633 return result;
1636 /* No optimizations succeeded. */
1637 return NULL_TREE;
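/* Illustrative summary of the shapes produced above; every case is
   additionally guarded by the flag, NaN and signed-zero checks in the
   code, so this list is indicative only:

     pow (x, 2.0)    -> x * x
     pow (x, 0.5)    -> sqrt (x)
     pow (x, 1./3.)  -> cbrt (x)
     pow (x, 1./6.)  -> cbrt (sqrt (x))
     pow (x, 0.25)   -> sqrt (sqrt (x))
     pow (x, 5./3.)  -> x * cbrt (x) * cbrt (x)

   Anything that matches none of the patterns is left for the library
   pow call.  */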
1640 /* ARG is the argument to a cabs builtin call in GSI with location info
1641 LOC. Create a sequence of statements prior to GSI that calculates
1642 sqrt(R*R + I*I), where R and I are the real and imaginary components
1643 of ARG, respectively. Return an expression holding the result. */
1645 static tree
1646 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1648 tree real_part, imag_part, addend1, addend2, sum, result;
1649 tree type = TREE_TYPE (TREE_TYPE (arg));
1650 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1651 machine_mode mode = TYPE_MODE (type);
1653 if (!flag_unsafe_math_optimizations
1654 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1655 || !sqrtfn
1656 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1657 return NULL_TREE;
1659 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1660 REALPART_EXPR, arg);
1661 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1662 real_part, real_part);
1663 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1664 IMAGPART_EXPR, arg);
1665 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1666 imag_part, imag_part);
1667 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1668 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1670 return result;
1673 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1674 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1675 an optimal number of multiplies, when n is a constant. */
1677 namespace {
1679 const pass_data pass_data_cse_sincos =
1681 GIMPLE_PASS, /* type */
1682 "sincos", /* name */
1683 OPTGROUP_NONE, /* optinfo_flags */
1684 TV_NONE, /* tv_id */
1685 PROP_ssa, /* properties_required */
1686 PROP_gimple_opt_math, /* properties_provided */
1687 0, /* properties_destroyed */
1688 0, /* todo_flags_start */
1689 TODO_update_ssa, /* todo_flags_finish */
1692 class pass_cse_sincos : public gimple_opt_pass
1694 public:
1695 pass_cse_sincos (gcc::context *ctxt)
1696 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1699 /* opt_pass methods: */
1700 virtual bool gate (function *)
1702 /* We no longer require either sincos or cexp, since powi expansion
1703 piggybacks on this pass. */
1704 return optimize;
1707 virtual unsigned int execute (function *);
1709 }; // class pass_cse_sincos
1711 unsigned int
1712 pass_cse_sincos::execute (function *fun)
1714 basic_block bb;
1715 bool cfg_changed = false;
1717 calculate_dominance_info (CDI_DOMINATORS);
1718 memset (&sincos_stats, 0, sizeof (sincos_stats));
1720 FOR_EACH_BB_FN (bb, fun)
1722 gimple_stmt_iterator gsi;
1723 bool cleanup_eh = false;
1725 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1727 gimple *stmt = gsi_stmt (gsi);
1729 /* Only the last stmt in a bb could throw; no need to call
1730 gimple_purge_dead_eh_edges if we change something in the middle
1731 of a basic block. */
1732 cleanup_eh = false;
1734 if (is_gimple_call (stmt)
1735 && gimple_call_lhs (stmt))
1737 tree arg, arg0, arg1, result;
1738 HOST_WIDE_INT n;
1739 location_t loc;
1741 switch (gimple_call_combined_fn (stmt))
1743 CASE_CFN_COS:
1744 CASE_CFN_SIN:
1745 CASE_CFN_CEXPI:
1746 /* Make sure we have either sincos or cexp. */
1747 if (!targetm.libc_has_function (function_c99_math_complex)
1748 && !targetm.libc_has_function (function_sincos))
1749 break;
1751 arg = gimple_call_arg (stmt, 0);
1752 if (TREE_CODE (arg) == SSA_NAME)
1753 cfg_changed |= execute_cse_sincos_1 (arg);
1754 break;
1756 CASE_CFN_POW:
1757 arg0 = gimple_call_arg (stmt, 0);
1758 arg1 = gimple_call_arg (stmt, 1);
1760 loc = gimple_location (stmt);
1761 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1763 if (result)
1765 tree lhs = gimple_get_lhs (stmt);
1766 gassign *new_stmt = gimple_build_assign (lhs, result);
1767 gimple_set_location (new_stmt, loc);
1768 unlink_stmt_vdef (stmt);
1769 gsi_replace (&gsi, new_stmt, true);
1770 cleanup_eh = true;
1771 if (gimple_vdef (stmt))
1772 release_ssa_name (gimple_vdef (stmt));
1774 break;
1776 CASE_CFN_POWI:
1777 arg0 = gimple_call_arg (stmt, 0);
1778 arg1 = gimple_call_arg (stmt, 1);
1779 loc = gimple_location (stmt);
1781 if (real_minus_onep (arg0))
1783 tree t0, t1, cond, one, minus_one;
1784 gassign *stmt;
1786 t0 = TREE_TYPE (arg0);
1787 t1 = TREE_TYPE (arg1);
1788 one = build_real (t0, dconst1);
1789 minus_one = build_real (t0, dconstm1);
1791 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1792 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1793 arg1, build_int_cst (t1, 1));
1794 gimple_set_location (stmt, loc);
1795 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1797 result = make_temp_ssa_name (t0, NULL, "powi");
1798 stmt = gimple_build_assign (result, COND_EXPR, cond,
1799 minus_one, one);
1800 gimple_set_location (stmt, loc);
1801 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1803 else
1805 if (!tree_fits_shwi_p (arg1))
1806 break;
1808 n = tree_to_shwi (arg1);
1809 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1812 if (result)
1814 tree lhs = gimple_get_lhs (stmt);
1815 gassign *new_stmt = gimple_build_assign (lhs, result);
1816 gimple_set_location (new_stmt, loc);
1817 unlink_stmt_vdef (stmt);
1818 gsi_replace (&gsi, new_stmt, true);
1819 cleanup_eh = true;
1820 if (gimple_vdef (stmt))
1821 release_ssa_name (gimple_vdef (stmt));
1823 break;
1825 CASE_CFN_CABS:
1826 arg0 = gimple_call_arg (stmt, 0);
1827 loc = gimple_location (stmt);
1828 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1830 if (result)
1832 tree lhs = gimple_get_lhs (stmt);
1833 gassign *new_stmt = gimple_build_assign (lhs, result);
1834 gimple_set_location (new_stmt, loc);
1835 unlink_stmt_vdef (stmt);
1836 gsi_replace (&gsi, new_stmt, true);
1837 cleanup_eh = true;
1838 if (gimple_vdef (stmt))
1839 release_ssa_name (gimple_vdef (stmt));
1841 break;
1843 default:;
1847 if (cleanup_eh)
1848 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1851 statistics_counter_event (fun, "sincos statements inserted",
1852 sincos_stats.inserted);
1854 return cfg_changed ? TODO_cleanup_cfg : 0;
1857 } // anon namespace
1859 gimple_opt_pass *
1860 make_pass_cse_sincos (gcc::context *ctxt)
1862 return new pass_cse_sincos (ctxt);
1865 /* A symbolic number is used to detect byte permutation and selection
1866 patterns. Therefore the field N contains an artificial number
1867 consisting of octet sized markers:
1869 0 - target byte has the value 0
1870 FF - target byte has an unknown value (e.g. due to sign extension)
1871 1..size - marker value is one plus the index of the source byte it comes from (the least significant source byte has marker 1).
1873 To detect permutations on memory sources (arrays and structures), a symbolic
1874 number is also associated with a base address (the array or structure the load
1875 is made from), an offset from the base address and a range which gives the
1876 difference between the highest and lowest accessed memory locations making up
1877 the symbolic number. The range is thus different from size, which reflects
1878 the size of the type of the current expression. Note that for a non-memory
1879 source, range holds the same value as size.
1881 For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1882 a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1883 still have a size of 2 but this time a range of 1. */
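/* For illustration, consider a 32-bit unsigned value X: initially each of its
   bytes carries its own index plus one, i.e. the markers read 0x04030201.
   Symbolically evaluating
     (X >> 24) | ((X >> 8) & 0xff00) | ((X << 8) & 0xff0000) | (X << 24)
   permutes the markers into 0x01020304, the pattern of a full byte swap.  */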
1885 struct symbolic_number {
1886 uint64_t n;
1887 tree type;
1888 tree base_addr;
1889 tree offset;
1890 HOST_WIDE_INT bytepos;
1891 tree alias_set;
1892 tree vuse;
1893 unsigned HOST_WIDE_INT range;
1896 #define BITS_PER_MARKER 8
1897 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1898 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1899 #define HEAD_MARKER(n, size) \
1900 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1902 /* The number which the find_bswap_or_nop_1 result should match in
1903 order to have a nop. The number is masked according to the size of
1904 the symbolic number before using it. */
1905 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1906 (uint64_t)0x08070605 << 32 | 0x04030201)
1908 /* The number which the find_bswap_or_nop_1 result should match in
1909 order to have a byte swap.  The number is shifted right according to the
1910 size of the symbolic number before using it. */
1911 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1912 (uint64_t)0x01020304 << 32 | 0x05060708)
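/* For instance, for a 2-byte value these comparison constants reduce to 0x0201
   (CMPNOP masked) and 0x0102 (CMPXCHG shifted): a symbolic number equal to the
   former denotes a read in target byte order, the latter a byte swap.  */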
1914 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1915 number N. Return false if the requested operation is not permitted
1916 on a symbolic number. */
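/* For instance, for the 4-byte symbolic number 0x04030201 a left shift by 8
   bits yields 0x03020100 (the lowest byte becomes zero), while a right shift
   by 8 bits yields 0x00040302 for an unsigned type and 0xff040302 for a signed
   type whose head marker is set (sign extension makes that byte unknown).  */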
1918 static inline bool
1919 do_shift_rotate (enum tree_code code,
1920 struct symbolic_number *n,
1921 int count)
1923 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1924 unsigned head_marker;
1926 if (count % BITS_PER_UNIT != 0)
1927 return false;
1928 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1930 /* Zero out the extra bits of N in order to avoid them being shifted
1931 into the significant bits. */
1932 if (size < 64 / BITS_PER_MARKER)
1933 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1935 switch (code)
1937 case LSHIFT_EXPR:
1938 n->n <<= count;
1939 break;
1940 case RSHIFT_EXPR:
1941 head_marker = HEAD_MARKER (n->n, size);
1942 n->n >>= count;
1943 /* Arithmetic shift of signed type: result is dependent on the value. */
1944 if (!TYPE_UNSIGNED (n->type) && head_marker)
1945 for (i = 0; i < count / BITS_PER_MARKER; i++)
1946 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1947 << ((size - 1 - i) * BITS_PER_MARKER);
1948 break;
1949 case LROTATE_EXPR:
1950 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1951 break;
1952 case RROTATE_EXPR:
1953 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1954 break;
1955 default:
1956 return false;
1958 /* Zero unused bits for size. */
1959 if (size < 64 / BITS_PER_MARKER)
1960 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1961 return true;
1964 /* Perform sanity checking for the symbolic number N and the gimple
1965 statement STMT. */
1967 static inline bool
1968 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
1970 tree lhs_type;
1972 lhs_type = gimple_expr_type (stmt);
1974 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1975 return false;
1977 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1978 return false;
1980 return true;
1983 /* Initialize the symbolic number N for the bswap pass from the base element
1984 SRC manipulated by the bitwise OR expression. */
1986 static bool
1987 init_symbolic_number (struct symbolic_number *n, tree src)
1989 int size;
1991 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1993 /* Set up the symbolic number N by setting each byte to a value between 1 and
1994 the byte size of rhs1.  The highest order byte is set to that byte size and the
1995 lowest order byte to 1. */
1996 n->type = TREE_TYPE (src);
1997 size = TYPE_PRECISION (n->type);
1998 if (size % BITS_PER_UNIT != 0)
1999 return false;
2000 size /= BITS_PER_UNIT;
2001 if (size > 64 / BITS_PER_MARKER)
2002 return false;
2003 n->range = size;
2004 n->n = CMPNOP;
2006 if (size < 64 / BITS_PER_MARKER)
2007 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2009 return true;
2012 /* Check if STMT might be a byte swap or a nop from a memory source and return
2013 the answer.  If so, REF is that memory source, and the base of the memory area
2014 accessed and the offset of the access from that base are recorded in N.  */
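/* For instance, for a statement loading a 16-bit value located two bytes past
   the start of an array A, N records A as base address, a byte position of 2,
   markers 0x0201, and both size and range equal to 2.  */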
2016 bool
2017 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2019 /* Leaf node is an array or component ref.  Memorize its base and
2020 offset from base to compare to other such leaf nodes. */
2021 HOST_WIDE_INT bitsize, bitpos;
2022 machine_mode mode;
2023 int unsignedp, reversep, volatilep;
2024 tree offset, base_addr;
2026 /* Not prepared to handle PDP endian. */
2027 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2028 return false;
2030 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2031 return false;
2033 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2034 &unsignedp, &reversep, &volatilep, false);
2036 if (TREE_CODE (base_addr) == MEM_REF)
2038 offset_int bit_offset = 0;
2039 tree off = TREE_OPERAND (base_addr, 1);
2041 if (!integer_zerop (off))
2043 offset_int boff, coff = mem_ref_offset (base_addr);
2044 boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
2045 bit_offset += boff;
2048 base_addr = TREE_OPERAND (base_addr, 0);
2050 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2051 if (wi::neg_p (bit_offset))
2053 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2054 offset_int tem = bit_offset.and_not (mask);
2055 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2056 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2057 bit_offset -= tem;
2058 tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
2059 if (offset)
2060 offset = size_binop (PLUS_EXPR, offset,
2061 wide_int_to_tree (sizetype, tem));
2062 else
2063 offset = wide_int_to_tree (sizetype, tem);
2066 bitpos += bit_offset.to_shwi ();
2069 if (bitpos % BITS_PER_UNIT)
2070 return false;
2071 if (bitsize % BITS_PER_UNIT)
2072 return false;
2073 if (reversep)
2074 return false;
2076 if (!init_symbolic_number (n, ref))
2077 return false;
2078 n->base_addr = base_addr;
2079 n->offset = offset;
2080 n->bytepos = bitpos / BITS_PER_UNIT;
2081 n->alias_set = reference_alias_ptr_type (ref);
2082 n->vuse = gimple_vuse (stmt);
2083 return true;
2086 /* Compute the symbolic number N representing the result of a bitwise OR on the
2087 two symbolic numbers N1 and N2, whose source statements are respectively
2088 SOURCE_STMT1 and SOURCE_STMT2. */
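/* For instance, on a little-endian target, merging the symbolic numbers of
   (short) a[0] (markers 0x01, byte position 0) and (short) a[1] << 8 (markers
   0x0100, byte position 1) rebases the marker of the second load to 2, giving
   markers 0x0201 and a range of 2, i.e. a plain 16-bit load in target order.  */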
2090 static gimple *
2091 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2092 gimple *source_stmt2, struct symbolic_number *n2,
2093 struct symbolic_number *n)
2095 int i, size;
2096 uint64_t mask;
2097 gimple *source_stmt;
2098 struct symbolic_number *n_start;
2100 /* The sources are different; cancel the bswap if they are not memory
2101 locations with the same base (array, structure, ...). */
2102 if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
2104 uint64_t inc;
2105 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2106 struct symbolic_number *toinc_n_ptr, *n_end;
2108 if (!n1->base_addr || !n2->base_addr
2109 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2110 return NULL;
2112 if (!n1->offset != !n2->offset
2113 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2114 return NULL;
2116 if (n1->bytepos < n2->bytepos)
2118 n_start = n1;
2119 start_sub = n2->bytepos - n1->bytepos;
2120 source_stmt = source_stmt1;
2122 else
2124 n_start = n2;
2125 start_sub = n1->bytepos - n2->bytepos;
2126 source_stmt = source_stmt2;
2129 /* Find the highest address at which a load is performed and
2130 compute related info. */
2131 end1 = n1->bytepos + (n1->range - 1);
2132 end2 = n2->bytepos + (n2->range - 1);
2133 if (end1 < end2)
2135 end = end2;
2136 end_sub = end2 - end1;
2138 else
2140 end = end1;
2141 end_sub = end1 - end2;
2143 n_end = (end2 > end1) ? n2 : n1;
2145 /* Find symbolic number whose lsb is the most significant. */
2146 if (BYTES_BIG_ENDIAN)
2147 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2148 else
2149 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2151 n->range = end - n_start->bytepos + 1;
2153 /* Check that the range of memory covered can be represented by
2154 a symbolic number. */
2155 if (n->range > 64 / BITS_PER_MARKER)
2156 return NULL;
2158 /* Reinterpret the byte markers in the symbolic number holding the value of
2159 bigger weight according to target endianness. */
2160 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2161 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2162 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2164 unsigned marker
2165 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2166 if (marker && marker != MARKER_BYTE_UNKNOWN)
2167 toinc_n_ptr->n += inc;
2170 else
2172 n->range = n1->range;
2173 n_start = n1;
2174 source_stmt = source_stmt1;
2177 if (!n1->alias_set
2178 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2179 n->alias_set = n1->alias_set;
2180 else
2181 n->alias_set = ptr_type_node;
2182 n->vuse = n_start->vuse;
2183 n->base_addr = n_start->base_addr;
2184 n->offset = n_start->offset;
2185 n->bytepos = n_start->bytepos;
2186 n->type = n_start->type;
2187 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2189 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2191 uint64_t masked1, masked2;
2193 masked1 = n1->n & mask;
2194 masked2 = n2->n & mask;
2195 if (masked1 && masked2 && masked1 != masked2)
2196 return NULL;
2198 n->n = n1->n | n2->n;
2200 return source_stmt;
2203 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2204 the operation given by the rhs of STMT on the result.  If the operation
2205 could successfully be executed, the function returns a gimple stmt whose
2206 rhs's first tree is the expression of the source operand; otherwise it
2207 returns NULL. */
2209 static gimple *
2210 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2212 enum tree_code code;
2213 tree rhs1, rhs2 = NULL;
2214 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2215 enum gimple_rhs_class rhs_class;
2217 if (!limit || !is_gimple_assign (stmt))
2218 return NULL;
2220 rhs1 = gimple_assign_rhs1 (stmt);
2222 if (find_bswap_or_nop_load (stmt, rhs1, n))
2223 return stmt;
2225 if (TREE_CODE (rhs1) != SSA_NAME)
2226 return NULL;
2228 code = gimple_assign_rhs_code (stmt);
2229 rhs_class = gimple_assign_rhs_class (stmt);
2230 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2232 if (rhs_class == GIMPLE_BINARY_RHS)
2233 rhs2 = gimple_assign_rhs2 (stmt);
2235 /* Handle unary rhs and binary rhs with integer constants as second
2236 operand. */
2238 if (rhs_class == GIMPLE_UNARY_RHS
2239 || (rhs_class == GIMPLE_BINARY_RHS
2240 && TREE_CODE (rhs2) == INTEGER_CST))
2242 if (code != BIT_AND_EXPR
2243 && code != LSHIFT_EXPR
2244 && code != RSHIFT_EXPR
2245 && code != LROTATE_EXPR
2246 && code != RROTATE_EXPR
2247 && !CONVERT_EXPR_CODE_P (code))
2248 return NULL;
2250 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2252 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2253 we have to initialize the symbolic number. */
2254 if (!source_stmt1)
2256 if (gimple_assign_load_p (stmt)
2257 || !init_symbolic_number (n, rhs1))
2258 return NULL;
2259 source_stmt1 = stmt;
2262 switch (code)
2264 case BIT_AND_EXPR:
2266 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2267 uint64_t val = int_cst_value (rhs2), mask = 0;
2268 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2270 /* Only constants masking full bytes are allowed. */
2271 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2272 if ((val & tmp) != 0 && (val & tmp) != tmp)
2273 return NULL;
2274 else if (val & tmp)
2275 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2277 n->n &= mask;
2279 break;
2280 case LSHIFT_EXPR:
2281 case RSHIFT_EXPR:
2282 case LROTATE_EXPR:
2283 case RROTATE_EXPR:
2284 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2285 return NULL;
2286 break;
2287 CASE_CONVERT:
2289 int i, type_size, old_type_size;
2290 tree type;
2292 type = gimple_expr_type (stmt);
2293 type_size = TYPE_PRECISION (type);
2294 if (type_size % BITS_PER_UNIT != 0)
2295 return NULL;
2296 type_size /= BITS_PER_UNIT;
2297 if (type_size > 64 / BITS_PER_MARKER)
2298 return NULL;
2300 /* Sign extension: result is dependent on the value. */
2301 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2302 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2303 && HEAD_MARKER (n->n, old_type_size))
2304 for (i = 0; i < type_size - old_type_size; i++)
2305 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2306 << ((type_size - 1 - i) * BITS_PER_MARKER);
2308 if (type_size < 64 / BITS_PER_MARKER)
2310 /* If STMT casts to a smaller type mask out the bits not
2311 belonging to the target type. */
2312 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2314 n->type = type;
2315 if (!n->base_addr)
2316 n->range = type_size;
2318 break;
2319 default:
2320 return NULL;
2322 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2325 /* Handle binary rhs. */
2327 if (rhs_class == GIMPLE_BINARY_RHS)
2329 struct symbolic_number n1, n2;
2330 gimple *source_stmt, *source_stmt2;
2332 if (code != BIT_IOR_EXPR)
2333 return NULL;
2335 if (TREE_CODE (rhs2) != SSA_NAME)
2336 return NULL;
2338 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2340 switch (code)
2342 case BIT_IOR_EXPR:
2343 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2345 if (!source_stmt1)
2346 return NULL;
2348 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2350 if (!source_stmt2)
2351 return NULL;
2353 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2354 return NULL;
2356 if (!n1.vuse != !n2.vuse
2357 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2358 return NULL;
2360 source_stmt
2361 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2363 if (!source_stmt)
2364 return NULL;
2366 if (!verify_symbolic_number_p (n, stmt))
2367 return NULL;
2369 break;
2370 default:
2371 return NULL;
2373 return source_stmt;
2375 return NULL;
2378 /* Check if STMT completes a bswap implementation or a read in a given
2379 endianness consisting of ORs, SHIFTs and ANDs, and set *BSWAP
2380 accordingly.  It also sets N to represent the kind of operations
2381 performed: the size of the resulting expression and whether it works on
2382 a memory source, and if so the alias set and vuse.  Finally, the
2383 function returns a stmt whose rhs's first tree is the source
2384 expression. */
2386 static gimple *
2387 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2389 /* The number which the find_bswap_or_nop_1 result should match in order
2390 to have a full byte swap. The number is shifted to the right
2391 according to the size of the symbolic number before using it. */
2392 uint64_t cmpxchg = CMPXCHG;
2393 uint64_t cmpnop = CMPNOP;
2395 gimple *source_stmt;
2396 int limit;
2398 /* The last parameter determines the search depth limit.  It usually
2399 correlates directly to the number n of bytes to be touched.  We
2400 increase that number by log2(n) + 1 here in order to also
2401 cover signed -> unsigned conversions of the src operand as can be seen
2402 in libgcc, and an initial shift/and operation of the src operand. */
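/* For instance, for a 4-byte (32-bit) expression the limit works out to
   4 + 1 + ceil_log2 (4) = 7 statements.  */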
2403 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2404 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2405 source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2407 if (!source_stmt)
2408 return NULL;
2410 /* Find real size of result (highest non-zero byte). */
2411 if (n->base_addr)
2413 int rsize;
2414 uint64_t tmpn;
2416 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2417 n->range = rsize;
2420 /* Zero out the extra bits of N and CMP*. */
2421 if (n->range < (int) sizeof (int64_t))
2423 uint64_t mask;
2425 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2426 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2427 cmpnop &= mask;
2430 /* A complete byte swap should make the symbolic number start with
2431 the largest digit in the highest order byte.  An unchanged symbolic
2432 number indicates a read with the same endianness as the target architecture. */
2433 if (n->n == cmpnop)
2434 *bswap = false;
2435 else if (n->n == cmpxchg)
2436 *bswap = true;
2437 else
2438 return NULL;
2440 /* Useless bit manipulation performed by code. */
2441 if (!n->base_addr && n->n == cmpnop)
2442 return NULL;
2444 n->range *= BITS_PER_UNIT;
2445 return source_stmt;
2448 namespace {
2450 const pass_data pass_data_optimize_bswap =
2452 GIMPLE_PASS, /* type */
2453 "bswap", /* name */
2454 OPTGROUP_NONE, /* optinfo_flags */
2455 TV_NONE, /* tv_id */
2456 PROP_ssa, /* properties_required */
2457 0, /* properties_provided */
2458 0, /* properties_destroyed */
2459 0, /* todo_flags_start */
2460 0, /* todo_flags_finish */
2463 class pass_optimize_bswap : public gimple_opt_pass
2465 public:
2466 pass_optimize_bswap (gcc::context *ctxt)
2467 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2470 /* opt_pass methods: */
2471 virtual bool gate (function *)
2473 return flag_expensive_optimizations && optimize;
2476 virtual unsigned int execute (function *);
2478 }; // class pass_optimize_bswap
2480 /* Perform the bswap optimization: replace the expression computed in the rhs
2481 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2482 Which of these alternatives replaces the rhs is given by N->base_addr (non
2483 null if a load is needed) and BSWAP.  The type, VUSE and alias set of the
2484 load to perform are also given in N, while the builtin bswap to invoke is
2485 given in FNDECL.  Finally, if a load is involved, SRC_STMT refers to one of
2486 the load statements used to construct the rhs of CUR_STMT and N->range gives
2487 the size of the rhs expression for maintaining some statistics.
2489 Note that if the replacement involves a load, CUR_STMT is moved just after
2490 SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2491 changing basic block. */
2493 static bool
2494 bswap_replace (gimple *cur_stmt, gimple *src_stmt, tree fndecl,
2495 tree bswap_type, tree load_type, struct symbolic_number *n,
2496 bool bswap)
2498 gimple_stmt_iterator gsi;
2499 tree src, tmp, tgt;
2500 gimple *bswap_stmt;
2502 gsi = gsi_for_stmt (cur_stmt);
2503 src = gimple_assign_rhs1 (src_stmt);
2504 tgt = gimple_assign_lhs (cur_stmt);
2506 /* Need to load the value from memory first. */
2507 if (n->base_addr)
2509 gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2510 tree addr_expr, addr_tmp, val_expr, val_tmp;
2511 tree load_offset_ptr, aligned_load_type;
2512 gimple *addr_stmt, *load_stmt;
2513 unsigned align;
2514 HOST_WIDE_INT load_offset = 0;
2516 align = get_object_alignment (src);
2517 /* If the new access is smaller than the original one, we need
2518 to perform big endian adjustment. */
2519 if (BYTES_BIG_ENDIAN)
2521 HOST_WIDE_INT bitsize, bitpos;
2522 machine_mode mode;
2523 int unsignedp, reversep, volatilep;
2524 tree offset;
2526 get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2527 &unsignedp, &reversep, &volatilep, false);
2528 if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2530 load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2531 unsigned HOST_WIDE_INT l
2532 = (load_offset * BITS_PER_UNIT) & (align - 1);
2533 if (l)
2534 align = l & -l;
2538 if (bswap
2539 && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2540 && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2541 return false;
2543 /* Move cur_stmt just before one of the loads of the original expression
2544 to ensure it has the same VUSE. See PR61517 for what could
2545 go wrong. */
2546 gsi_move_before (&gsi, &gsi_ins);
2547 gsi = gsi_for_stmt (cur_stmt);
2549 /* Compute address to load from and cast according to the size
2550 of the load. */
2551 addr_expr = build_fold_addr_expr (unshare_expr (src));
2552 if (is_gimple_mem_ref_addr (addr_expr))
2553 addr_tmp = addr_expr;
2554 else
2556 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2557 "load_src");
2558 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2559 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2562 /* Perform the load. */
2563 aligned_load_type = load_type;
2564 if (align < TYPE_ALIGN (load_type))
2565 aligned_load_type = build_aligned_type (load_type, align);
2566 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2567 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2568 load_offset_ptr);
2570 if (!bswap)
2572 if (n->range == 16)
2573 nop_stats.found_16bit++;
2574 else if (n->range == 32)
2575 nop_stats.found_32bit++;
2576 else
2578 gcc_assert (n->range == 64);
2579 nop_stats.found_64bit++;
2582 /* Convert the result of load if necessary. */
2583 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2585 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2586 "load_dst");
2587 load_stmt = gimple_build_assign (val_tmp, val_expr);
2588 gimple_set_vuse (load_stmt, n->vuse);
2589 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2590 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2592 else
2594 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2595 gimple_set_vuse (cur_stmt, n->vuse);
2597 update_stmt (cur_stmt);
2599 if (dump_file)
2601 fprintf (dump_file,
2602 "%d bit load in target endianness found at: ",
2603 (int) n->range);
2604 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2606 return true;
2608 else
2610 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2611 load_stmt = gimple_build_assign (val_tmp, val_expr);
2612 gimple_set_vuse (load_stmt, n->vuse);
2613 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2615 src = val_tmp;
2618 if (n->range == 16)
2619 bswap_stats.found_16bit++;
2620 else if (n->range == 32)
2621 bswap_stats.found_32bit++;
2622 else
2624 gcc_assert (n->range == 64);
2625 bswap_stats.found_64bit++;
2628 tmp = src;
2630 /* Convert the src expression if necessary. */
2631 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2633 gimple *convert_stmt;
2635 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2636 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2637 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2640 /* The canonical form for a 16-bit bswap is a rotate expression.  Only 16-bit
2641 values are handled this way, as a rotation of a 2N-bit value by N bits is
2642 generally not equivalent to a bswap.  Consider for instance 0x01020304 r>> 16,
2643 which gives 0x03040102 while a bswap of that value is 0x04030201. */
2644 if (bswap && n->range == 16)
2646 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2647 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2648 bswap_stmt = gimple_build_assign (NULL, src);
2650 else
2651 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2653 tmp = tgt;
2655 /* Convert the result if necessary. */
2656 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2658 gimple *convert_stmt;
2660 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2661 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2662 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2665 gimple_set_lhs (bswap_stmt, tmp);
2667 if (dump_file)
2669 fprintf (dump_file, "%d bit bswap implementation found at: ",
2670 (int) n->range);
2671 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2674 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2675 gsi_remove (&gsi, true);
2676 return true;
2679 /* Find manual byte swap implementations as well as loads in a given
2680 endianness.  Byte swaps are turned into a bswap builtin invocation,
2681 while endian loads are converted to a bswap builtin invocation or a
2682 simple load according to the target endianness. */
2684 unsigned int
2685 pass_optimize_bswap::execute (function *fun)
2687 basic_block bb;
2688 bool bswap32_p, bswap64_p;
2689 bool changed = false;
2690 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2692 if (BITS_PER_UNIT != 8)
2693 return 0;
2695 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2696 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2697 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2698 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2699 || (bswap32_p && word_mode == SImode)));
2701 /* Determine the argument type of the builtins. The code later on
2702 assumes that the return and argument type are the same. */
2703 if (bswap32_p)
2705 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2706 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2709 if (bswap64_p)
2711 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2712 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2715 memset (&nop_stats, 0, sizeof (nop_stats));
2716 memset (&bswap_stats, 0, sizeof (bswap_stats));
2718 FOR_EACH_BB_FN (bb, fun)
2720 gimple_stmt_iterator gsi;
2722 /* We do a reverse scan for bswap patterns to make sure we get the
2723 widest match. As bswap pattern matching doesn't handle previously
2724 inserted smaller bswap replacements as sub-patterns, the wider
2725 variant wouldn't be detected. */
2726 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2728 gimple *src_stmt, *cur_stmt = gsi_stmt (gsi);
2729 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2730 enum tree_code code;
2731 struct symbolic_number n;
2732 bool bswap;
2734 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2735 might be moved to a different basic block by bswap_replace and gsi
2736 must not point to it if that's the case.  Moving the gsi_prev
2737 there makes sure that gsi points to the statement previous to
2738 cur_stmt while still making sure that all statements are
2739 considered in this basic block. */
2740 gsi_prev (&gsi);
2742 if (!is_gimple_assign (cur_stmt))
2743 continue;
2745 code = gimple_assign_rhs_code (cur_stmt);
2746 switch (code)
2748 case LROTATE_EXPR:
2749 case RROTATE_EXPR:
2750 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2751 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2752 % BITS_PER_UNIT)
2753 continue;
2754 /* Fall through. */
2755 case BIT_IOR_EXPR:
2756 break;
2757 default:
2758 continue;
2761 src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2763 if (!src_stmt)
2764 continue;
2766 switch (n.range)
2768 case 16:
2769 /* Already in canonical form, nothing to do. */
2770 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2771 continue;
2772 load_type = bswap_type = uint16_type_node;
2773 break;
2774 case 32:
2775 load_type = uint32_type_node;
2776 if (bswap32_p)
2778 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2779 bswap_type = bswap32_type;
2781 break;
2782 case 64:
2783 load_type = uint64_type_node;
2784 if (bswap64_p)
2786 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2787 bswap_type = bswap64_type;
2789 break;
2790 default:
2791 continue;
2794 if (bswap && !fndecl && n.range != 16)
2795 continue;
2797 if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2798 &n, bswap))
2799 changed = true;
2803 statistics_counter_event (fun, "16-bit nop implementations found",
2804 nop_stats.found_16bit);
2805 statistics_counter_event (fun, "32-bit nop implementations found",
2806 nop_stats.found_32bit);
2807 statistics_counter_event (fun, "64-bit nop implementations found",
2808 nop_stats.found_64bit);
2809 statistics_counter_event (fun, "16-bit bswap implementations found",
2810 bswap_stats.found_16bit);
2811 statistics_counter_event (fun, "32-bit bswap implementations found",
2812 bswap_stats.found_32bit);
2813 statistics_counter_event (fun, "64-bit bswap implementations found",
2814 bswap_stats.found_64bit);
2816 return (changed ? TODO_update_ssa : 0);
2819 } // anon namespace
2821 gimple_opt_pass *
2822 make_pass_optimize_bswap (gcc::context *ctxt)
2824 return new pass_optimize_bswap (ctxt);
2827 /* Return true if stmt is a type conversion operation that can be stripped
2828 when used in a widening multiply operation. */
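/* For instance, with a 64-bit multiplication result, a conversion whose
   destination type is also 64 bits wide can be stripped, as can a widening
   conversion that starts from an unsigned type or preserves signedness; the
   widening multiply itself recreates the required extension.  */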
2829 static bool
2830 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2832 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2834 if (TREE_CODE (result_type) == INTEGER_TYPE)
2836 tree op_type;
2837 tree inner_op_type;
2839 if (!CONVERT_EXPR_CODE_P (rhs_code))
2840 return false;
2842 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2844 /* If the type of OP has the same precision as the result, then
2845 we can strip this conversion. The multiply operation will be
2846 selected to create the correct extension as a by-product. */
2847 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2848 return true;
2850 /* We can also strip a conversion if it preserves the signed-ness of
2851 the operation and doesn't narrow the range. */
2852 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2854 /* If the inner-most type is unsigned, then we can strip any
2855 intermediate widening operation. If it's signed, then the
2856 intermediate widening operation must also be signed. */
2857 if ((TYPE_UNSIGNED (inner_op_type)
2858 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2859 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2860 return true;
2862 return false;
2865 return rhs_code == FIXED_CONVERT_EXPR;
2868 /* Return true if RHS is a suitable operand for a widening multiplication,
2869 assuming a target type of TYPE.
2870 There are two cases:
2872 - RHS makes some value at least twice as wide. Store that value
2873 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2875 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2876 and set *TYPE_OUT to NULL. */
2878 static bool
2879 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2880 tree *new_rhs_out)
2882 gimple *stmt;
2883 tree type1, rhs1;
2885 if (TREE_CODE (rhs) == SSA_NAME)
2887 stmt = SSA_NAME_DEF_STMT (rhs);
2888 if (is_gimple_assign (stmt))
2890 if (! widening_mult_conversion_strippable_p (type, stmt))
2891 rhs1 = rhs;
2892 else
2894 rhs1 = gimple_assign_rhs1 (stmt);
2896 if (TREE_CODE (rhs1) == INTEGER_CST)
2898 *new_rhs_out = rhs1;
2899 *type_out = NULL;
2900 return true;
2904 else
2905 rhs1 = rhs;
2907 type1 = TREE_TYPE (rhs1);
2909 if (TREE_CODE (type1) != TREE_CODE (type)
2910 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2911 return false;
2913 *new_rhs_out = rhs1;
2914 *type_out = type1;
2915 return true;
2918 if (TREE_CODE (rhs) == INTEGER_CST)
2920 *new_rhs_out = rhs;
2921 *type_out = NULL;
2922 return true;
2925 return false;
2928 /* Return true if STMT performs a widening multiplication, assuming the
2929 output type is TYPE. If so, store the unwidened types of the operands
2930 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2931 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2932 and *TYPE2_OUT would give the operands of the multiplication. */
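/* For instance, for long r = (long) i1 * (long) i2 with i1 and i2 of type int
   and long twice as wide as int, the conversions are stripped and the function
   reports int as *TYPE1_OUT and *TYPE2_OUT with i1 and i2 as *RHS1_OUT and
   *RHS2_OUT.  */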
2934 static bool
2935 is_widening_mult_p (gimple *stmt,
2936 tree *type1_out, tree *rhs1_out,
2937 tree *type2_out, tree *rhs2_out)
2939 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2941 if (TREE_CODE (type) != INTEGER_TYPE
2942 && TREE_CODE (type) != FIXED_POINT_TYPE)
2943 return false;
2945 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2946 rhs1_out))
2947 return false;
2949 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2950 rhs2_out))
2951 return false;
2953 if (*type1_out == NULL)
2955 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2956 return false;
2957 *type1_out = *type2_out;
2960 if (*type2_out == NULL)
2962 if (!int_fits_type_p (*rhs2_out, *type1_out))
2963 return false;
2964 *type2_out = *type1_out;
2967 /* Ensure that the larger of the two operands comes first. */
2968 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2970 std::swap (*type1_out, *type2_out);
2971 std::swap (*rhs1_out, *rhs2_out);
2974 return true;
2977 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2978 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2979 value is true iff we converted the statement. */
2981 static bool
2982 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2984 tree lhs, rhs1, rhs2, type, type1, type2;
2985 enum insn_code handler;
2986 machine_mode to_mode, from_mode, actual_mode;
2987 optab op;
2988 int actual_precision;
2989 location_t loc = gimple_location (stmt);
2990 bool from_unsigned1, from_unsigned2;
2992 lhs = gimple_assign_lhs (stmt);
2993 type = TREE_TYPE (lhs);
2994 if (TREE_CODE (type) != INTEGER_TYPE)
2995 return false;
2997 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2998 return false;
3000 to_mode = TYPE_MODE (type);
3001 from_mode = TYPE_MODE (type1);
3002 from_unsigned1 = TYPE_UNSIGNED (type1);
3003 from_unsigned2 = TYPE_UNSIGNED (type2);
3005 if (from_unsigned1 && from_unsigned2)
3006 op = umul_widen_optab;
3007 else if (!from_unsigned1 && !from_unsigned2)
3008 op = smul_widen_optab;
3009 else
3010 op = usmul_widen_optab;
3012 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3013 0, &actual_mode);
3015 if (handler == CODE_FOR_nothing)
3017 if (op != smul_widen_optab)
3019 /* We can use a signed multiply with unsigned types as long as
3020 there is a wider mode to use, or it is the smaller of the two
3021 types that is unsigned. Note that type1 >= type2, always. */
3022 if ((TYPE_UNSIGNED (type1)
3023 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3024 || (TYPE_UNSIGNED (type2)
3025 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3027 from_mode = GET_MODE_WIDER_MODE (from_mode);
3028 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3029 return false;
3032 op = smul_widen_optab;
3033 handler = find_widening_optab_handler_and_mode (op, to_mode,
3034 from_mode, 0,
3035 &actual_mode);
3037 if (handler == CODE_FOR_nothing)
3038 return false;
3040 from_unsigned1 = from_unsigned2 = false;
3042 else
3043 return false;
3046 /* Ensure that the inputs to the handler are in the correct precision
3047 for the opcode. This will be the full mode size. */
3048 actual_precision = GET_MODE_PRECISION (actual_mode);
3049 if (2 * actual_precision > TYPE_PRECISION (type))
3050 return false;
3051 if (actual_precision != TYPE_PRECISION (type1)
3052 || from_unsigned1 != TYPE_UNSIGNED (type1))
3053 rhs1 = build_and_insert_cast (gsi, loc,
3054 build_nonstandard_integer_type
3055 (actual_precision, from_unsigned1), rhs1);
3056 if (actual_precision != TYPE_PRECISION (type2)
3057 || from_unsigned2 != TYPE_UNSIGNED (type2))
3058 rhs2 = build_and_insert_cast (gsi, loc,
3059 build_nonstandard_integer_type
3060 (actual_precision, from_unsigned2), rhs2);
3062 /* Handle constants. */
3063 if (TREE_CODE (rhs1) == INTEGER_CST)
3064 rhs1 = fold_convert (type1, rhs1);
3065 if (TREE_CODE (rhs2) == INTEGER_CST)
3066 rhs2 = fold_convert (type2, rhs2);
3068 gimple_assign_set_rhs1 (stmt, rhs1);
3069 gimple_assign_set_rhs2 (stmt, rhs2);
3070 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3071 update_stmt (stmt);
3072 widen_mul_stats.widen_mults_inserted++;
3073 return true;
3076 /* Process a single gimple statement STMT, which is found at the
3077 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
3078 rhs (given by CODE), and try to convert it into a
3079 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3080 is true iff we converted the statement. */
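/* For instance, assuming the target provides a matching widening
   multiply-and-accumulate pattern, acc = (long) i1 * (long) i2 + acc with i1
   and i2 of a type half as wide as acc is rewritten as
   acc = WIDEN_MULT_PLUS_EXPR <i1, i2, acc>.  */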
3082 static bool
3083 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3084 enum tree_code code)
3086 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3087 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3088 tree type, type1, type2, optype;
3089 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3090 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3091 optab this_optab;
3092 enum tree_code wmult_code;
3093 enum insn_code handler;
3094 machine_mode to_mode, from_mode, actual_mode;
3095 location_t loc = gimple_location (stmt);
3096 int actual_precision;
3097 bool from_unsigned1, from_unsigned2;
3099 lhs = gimple_assign_lhs (stmt);
3100 type = TREE_TYPE (lhs);
3101 if (TREE_CODE (type) != INTEGER_TYPE
3102 && TREE_CODE (type) != FIXED_POINT_TYPE)
3103 return false;
3105 if (code == MINUS_EXPR)
3106 wmult_code = WIDEN_MULT_MINUS_EXPR;
3107 else
3108 wmult_code = WIDEN_MULT_PLUS_EXPR;
3110 rhs1 = gimple_assign_rhs1 (stmt);
3111 rhs2 = gimple_assign_rhs2 (stmt);
3113 if (TREE_CODE (rhs1) == SSA_NAME)
3115 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3116 if (is_gimple_assign (rhs1_stmt))
3117 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3120 if (TREE_CODE (rhs2) == SSA_NAME)
3122 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3123 if (is_gimple_assign (rhs2_stmt))
3124 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3127 /* Allow for one conversion statement between the multiply
3128 and addition/subtraction statement.  If there is more than
3129 one conversion, we assume it would invalidate this
3130 transformation.  If that's not the case then it should have
3131 been folded before now. */
3132 if (CONVERT_EXPR_CODE_P (rhs1_code))
3134 conv1_stmt = rhs1_stmt;
3135 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3136 if (TREE_CODE (rhs1) == SSA_NAME)
3138 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3139 if (is_gimple_assign (rhs1_stmt))
3140 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3142 else
3143 return false;
3145 if (CONVERT_EXPR_CODE_P (rhs2_code))
3147 conv2_stmt = rhs2_stmt;
3148 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3149 if (TREE_CODE (rhs2) == SSA_NAME)
3151 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3152 if (is_gimple_assign (rhs2_stmt))
3153 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3155 else
3156 return false;
3159 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3160 is_widening_mult_p, but we still need the operand values it returns.
3162 It might also appear that it would be sufficient to use the existing
3163 operands of the widening multiply, but that would limit the choice of
3164 multiply-and-accumulate instructions.
3166 If the widened-multiplication result has more than one use, it is
3167 probably wiser not to do the conversion. */
3168 if (code == PLUS_EXPR
3169 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3171 if (!has_single_use (rhs1)
3172 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3173 &type2, &mult_rhs2))
3174 return false;
3175 add_rhs = rhs2;
3176 conv_stmt = conv1_stmt;
3178 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3180 if (!has_single_use (rhs2)
3181 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3182 &type2, &mult_rhs2))
3183 return false;
3184 add_rhs = rhs1;
3185 conv_stmt = conv2_stmt;
3187 else
3188 return false;
3190 to_mode = TYPE_MODE (type);
3191 from_mode = TYPE_MODE (type1);
3192 from_unsigned1 = TYPE_UNSIGNED (type1);
3193 from_unsigned2 = TYPE_UNSIGNED (type2);
3194 optype = type1;
3196 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3197 if (from_unsigned1 != from_unsigned2)
3199 if (!INTEGRAL_TYPE_P (type))
3200 return false;
3201 /* We can use a signed multiply with unsigned types as long as
3202 there is a wider mode to use, or it is the smaller of the two
3203 types that is unsigned. Note that type1 >= type2, always. */
3204 if ((from_unsigned1
3205 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3206 || (from_unsigned2
3207 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3209 from_mode = GET_MODE_WIDER_MODE (from_mode);
3210 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3211 return false;
3214 from_unsigned1 = from_unsigned2 = false;
3215 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3216 false);
3219 /* If there was a conversion between the multiply and addition
3220 then we need to make sure it fits a multiply-and-accumulate.
3221 There should be a single mode change which does not change the
3222 value. */
3223 if (conv_stmt)
3225 /* We use the original, unmodified data types for this. */
3226 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3227 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3228 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3229 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3231 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3233 /* Conversion is a truncate. */
3234 if (TYPE_PRECISION (to_type) < data_size)
3235 return false;
3237 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3239 /* Conversion is an extend. Check it's the right sort. */
3240 if (TYPE_UNSIGNED (from_type) != is_unsigned
3241 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3242 return false;
3244 /* else convert is a no-op for our purposes. */
3247 /* Verify that the machine can perform a widening multiply
3248 accumulate in this mode/signedness combination, otherwise
3249 this transformation is likely to pessimize code. */
3250 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3251 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3252 from_mode, 0, &actual_mode);
3254 if (handler == CODE_FOR_nothing)
3255 return false;
3257 /* Ensure that the inputs to the handler are in the correct precision
3258 for the opcode. This will be the full mode size. */
3259 actual_precision = GET_MODE_PRECISION (actual_mode);
3260 if (actual_precision != TYPE_PRECISION (type1)
3261 || from_unsigned1 != TYPE_UNSIGNED (type1))
3262 mult_rhs1 = build_and_insert_cast (gsi, loc,
3263 build_nonstandard_integer_type
3264 (actual_precision, from_unsigned1),
3265 mult_rhs1);
3266 if (actual_precision != TYPE_PRECISION (type2)
3267 || from_unsigned2 != TYPE_UNSIGNED (type2))
3268 mult_rhs2 = build_and_insert_cast (gsi, loc,
3269 build_nonstandard_integer_type
3270 (actual_precision, from_unsigned2),
3271 mult_rhs2);
3273 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3274 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3276 /* Handle constants. */
3277 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3278 mult_rhs1 = fold_convert (type1, mult_rhs1);
3279 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3280 mult_rhs2 = fold_convert (type2, mult_rhs2);
3282 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3283 add_rhs);
3284 update_stmt (gsi_stmt (*gsi));
3285 widen_mul_stats.maccs_inserted++;
3286 return true;
3289 /* Combine the multiplication at MUL_STMT with operands OP1 and OP2
3290 with uses in additions and subtractions to form fused multiply-add
3291 operations. Returns true if successful and MUL_STMT should be removed. */
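/* For instance, with t = a * b used only in r = t + c, the pair is replaced by
   r = FMA <a, b, c>; a MINUS_EXPR use or a NEGATE_EXPR of the product is
   handled by negating the addend or the first multiplicand instead.  */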
3293 static bool
3294 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3296 tree mul_result = gimple_get_lhs (mul_stmt);
3297 tree type = TREE_TYPE (mul_result);
3298 gimple *use_stmt, *neguse_stmt;
3299 gassign *fma_stmt;
3300 use_operand_p use_p;
3301 imm_use_iterator imm_iter;
3303 if (FLOAT_TYPE_P (type)
3304 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3305 return false;
3307 /* We don't want to do bitfield reduction ops. */
3308 if (INTEGRAL_TYPE_P (type)
3309 && (TYPE_PRECISION (type)
3310 != GET_MODE_PRECISION (TYPE_MODE (type))))
3311 return false;
3313 /* If the target doesn't support it, don't generate it. We assume that
3314 if fma isn't available then fms, fnma or fnms are not either. */
3315 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3316 return false;
3318 /* If the multiplication has zero uses, it is kept around probably because
3319 of -fnon-call-exceptions. Don't optimize it away in that case,
3320 it is DCE's job. */
3321 if (has_zero_uses (mul_result))
3322 return false;
3324 /* Make sure that the multiplication statement becomes dead after
3325 the transformation, i.e. that all uses are transformed to FMAs.
3326 This means we assume that an FMA operation has the same cost
3327 as an addition. */
3328 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3330 enum tree_code use_code;
3331 tree result = mul_result;
3332 bool negate_p = false;
3334 use_stmt = USE_STMT (use_p);
3336 if (is_gimple_debug (use_stmt))
3337 continue;
3339 /* For now restrict this operation to single basic blocks. In theory
3340 we would want to support sinking the multiplication in
3341 m = a*b;
3342 if ()
3343 ma = m + c;
3344 else
3345 d = m;
3346 to form a fma in the then block and sink the multiplication to the
3347 else block. */
3348 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3349 return false;
3351 if (!is_gimple_assign (use_stmt))
3352 return false;
3354 use_code = gimple_assign_rhs_code (use_stmt);
3356 /* A negate on the multiplication leads to FNMA. */
3357 if (use_code == NEGATE_EXPR)
3359 ssa_op_iter iter;
3360 use_operand_p usep;
3362 result = gimple_assign_lhs (use_stmt);
3364 /* Make sure the negate statement becomes dead with this
3365 single transformation. */
3366 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3367 &use_p, &neguse_stmt))
3368 return false;
3370 /* Make sure the multiplication isn't also used on that stmt. */
3371 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3372 if (USE_FROM_PTR (usep) == mul_result)
3373 return false;
3375 /* Re-validate. */
3376 use_stmt = neguse_stmt;
3377 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3378 return false;
3379 if (!is_gimple_assign (use_stmt))
3380 return false;
3382 use_code = gimple_assign_rhs_code (use_stmt);
3383 negate_p = true;
3386 switch (use_code)
3388 case MINUS_EXPR:
3389 if (gimple_assign_rhs2 (use_stmt) == result)
3390 negate_p = !negate_p;
3391 break;
3392 case PLUS_EXPR:
3393 break;
3394 default:
3395 /* FMA can only be formed from PLUS and MINUS. */
3396 return false;
3399 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3400 by a MULT_EXPR that we'll visit later, we might be able to
3401 get a more profitable match with fnma.
3402 OTOH, if we don't, a negate / fma pair has likely lower latency
3403 than a mult / subtract pair. */
3404 if (use_code == MINUS_EXPR && !negate_p
3405 && gimple_assign_rhs1 (use_stmt) == result
3406 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3407 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3409 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3411 if (TREE_CODE (rhs2) == SSA_NAME)
3413 gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3414 if (has_single_use (rhs2)
3415 && is_gimple_assign (stmt2)
3416 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3417 return false;
3421 /* We can't handle a * b + a * b. */
3422 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3423 return false;
3425 /* While it is possible to validate whether or not the exact form
3426 that we've recognized is available in the backend, the assumption
3427 is that the transformation is never a loss. For instance, suppose
3428 the target only has the plain FMA pattern available. Consider
3429 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3430 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3431 still have 3 operations, but in the FMA form the two NEGs are
3432 independent and could be run in parallel. */
3435 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3437 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3438 enum tree_code use_code;
3439 tree addop, mulop1 = op1, result = mul_result;
3440 bool negate_p = false;
3442 if (is_gimple_debug (use_stmt))
3443 continue;
3445 use_code = gimple_assign_rhs_code (use_stmt);
3446 if (use_code == NEGATE_EXPR)
3448 result = gimple_assign_lhs (use_stmt);
3449 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3450 gsi_remove (&gsi, true);
3451 release_defs (use_stmt);
3453 use_stmt = neguse_stmt;
3454 gsi = gsi_for_stmt (use_stmt);
3455 use_code = gimple_assign_rhs_code (use_stmt);
3456 negate_p = true;
3459 if (gimple_assign_rhs1 (use_stmt) == result)
3461 addop = gimple_assign_rhs2 (use_stmt);
3462 /* a * b - c -> a * b + (-c) */
3463 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3464 addop = force_gimple_operand_gsi (&gsi,
3465 build1 (NEGATE_EXPR,
3466 type, addop),
3467 true, NULL_TREE, true,
3468 GSI_SAME_STMT);
3470 else
3472 addop = gimple_assign_rhs1 (use_stmt);
3473 /* a - b * c -> (-b) * c + a */
3474 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3475 negate_p = !negate_p;
3478 if (negate_p)
3479 mulop1 = force_gimple_operand_gsi (&gsi,
3480 build1 (NEGATE_EXPR,
3481 type, mulop1),
3482 true, NULL_TREE, true,
3483 GSI_SAME_STMT);
3485 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3486 FMA_EXPR, mulop1, op2, addop);
3487 gsi_replace (&gsi, fma_stmt, true);
3488 widen_mul_stats.fmas_inserted++;
3491 return true;
3494 /* Find integer multiplications where the operands are extended from
3495 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3496 where appropriate. */
3498 namespace {
3500 const pass_data pass_data_optimize_widening_mul =
3502 GIMPLE_PASS, /* type */
3503 "widening_mul", /* name */
3504 OPTGROUP_NONE, /* optinfo_flags */
3505 TV_NONE, /* tv_id */
3506 PROP_ssa, /* properties_required */
3507 0, /* properties_provided */
3508 0, /* properties_destroyed */
3509 0, /* todo_flags_start */
3510 TODO_update_ssa, /* todo_flags_finish */
3513 class pass_optimize_widening_mul : public gimple_opt_pass
3515 public:
3516 pass_optimize_widening_mul (gcc::context *ctxt)
3517 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3520 /* opt_pass methods: */
3521 virtual bool gate (function *)
3523 return flag_expensive_optimizations && optimize;
3526 virtual unsigned int execute (function *);
3528 }; // class pass_optimize_widening_mul
3530 unsigned int
3531 pass_optimize_widening_mul::execute (function *fun)
3533 basic_block bb;
3534 bool cfg_changed = false;
3536 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3538 FOR_EACH_BB_FN (bb, fun)
3540 gimple_stmt_iterator gsi;
3542 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3544 gimple *stmt = gsi_stmt (gsi);
3545 enum tree_code code;
3547 if (is_gimple_assign (stmt))
3549 code = gimple_assign_rhs_code (stmt);
3550 switch (code)
3552 case MULT_EXPR:
3553 if (!convert_mult_to_widen (stmt, &gsi)
3554 && convert_mult_to_fma (stmt,
3555 gimple_assign_rhs1 (stmt),
3556 gimple_assign_rhs2 (stmt)))
3558 gsi_remove (&gsi, true);
3559 release_defs (stmt);
3560 continue;
3562 break;
3564 case PLUS_EXPR:
3565 case MINUS_EXPR:
3566 convert_plusminus_to_widen (&gsi, stmt, code);
3567 break;
3569 default:;
3572 else if (is_gimple_call (stmt)
3573 && gimple_call_lhs (stmt))
3575 tree fndecl = gimple_call_fndecl (stmt);
3576 if (fndecl
3577 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3579 switch (DECL_FUNCTION_CODE (fndecl))
3581 case BUILT_IN_POWF:
3582 case BUILT_IN_POW:
3583 case BUILT_IN_POWL:
3584 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3585 && real_equal
3586 (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3587 &dconst2)
3588 && convert_mult_to_fma (stmt,
3589 gimple_call_arg (stmt, 0),
3590 gimple_call_arg (stmt, 0)))
3592 unlink_stmt_vdef (stmt);
3593 if (gsi_remove (&gsi, true)
3594 && gimple_purge_dead_eh_edges (bb))
3595 cfg_changed = true;
3596 release_defs (stmt);
3597 continue;
3599 break;
3601 default:;
3605 gsi_next (&gsi);
3609 statistics_counter_event (fun, "widening multiplications inserted",
3610 widen_mul_stats.widen_mults_inserted);
3611 statistics_counter_event (fun, "widening maccs inserted",
3612 widen_mul_stats.maccs_inserted);
3613 statistics_counter_event (fun, "fused multiply-adds inserted",
3614 widen_mul_stats.fmas_inserted);
3616 return cfg_changed ? TODO_cleanup_cfg : 0;
3619 } // anon namespace
3621 gimple_opt_pass *
3622 make_pass_optimize_widening_mul (gcc::context *ctxt)
3624 return new pass_optimize_widening_mul (ctxt);