gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005-2015 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  21    operations.  These are common in sequences such as this one:
  22
  23         modulus = sqrt(x*x + y*y + z*z);
  24         x = x / modulus;
  25         y = y / modulus;
  26         z = z / modulus;
  27
  28    that can be optimized to
  29
  30         modulus = sqrt(x*x + y*y + z*z);
  31         rmodulus = 1.0 / modulus;
  32         x = x * rmodulus;
  33         y = y * rmodulus;
  34         z = z * rmodulus;
  35
  36    We do this for loop invariant divisors, and with this pass whenever
  37    we notice that a division has the same divisor multiple times.
  38
  39    Of course, like in PRE, we don't insert a division if a dominator
  40    already has one.  However, this cannot be done as an extension of
  41    PRE for several reasons.
  42
  43    First of all, with some experiments it was found out that the
  44    transformation is not always useful if there are only two divisions
   45    by the same divisor.  This is probably because modern processors
  46    can pipeline the divisions; on older, in-order processors it should
  47    still be effective to optimize two divisions by the same number.
  48    We make this a param, and it shall be called N in the remainder of
  49    this comment.
  50
  51    Second, if trapping math is active, we have less freedom on where
  52    to insert divisions: we can only do so in basic blocks that already
  53    contain one.  (If divisions don't trap, instead, we can insert
  54    divisions elsewhere, which will be in blocks that are common dominators
  55    of those that have the division).
  56
  57    We really don't want to compute the reciprocal unless a division will
  58    be found.  To do this, we won't insert the division in a basic block
  59    that has less than N divisions *post-dominating* it.
  60
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and the common dominators to them,
   63    and walks it twice.  The first walk is in post-order, and it annotates
  64    each block with the number of divisions that post-dominate it: this
  65    gives information on where divisions can be inserted profitably.
  66    The second walk is in pre-order, and it inserts divisions as explained
  67    above, and replaces divisions by multiplications.
  68
  69    In the best case, the cost of the pass is O(n_statements).  In the
  70    worst-case, the cost is due to creating the dominator tree subset,
  71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  72    for n_statements / n_basic_blocks statements.  So, the amortized cost
  73    of creating the dominator tree subset is O(n_basic_blocks) and the
  74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  75
  76    More practically, the cost will be small because there are few
  77    divisions, and they tend to be in the same basic block, so insert_bb
  78    is called very few times.
  79
  80    If we did this using domwalk.c, an efficient implementation would have
  81    to work on all the variables in a single pass, because we could not
  82    work on just a subset of the dominator tree, as we do now, and the
  83    cost would also be something like O(n_statements * n_basic_blocks).
  84    The data structures would be more complex in order to work on all the
  85    variables in a single pass.  */
  86
  87 #include "config.h"
  88 #include "system.h"
  89 #include "coretypes.h"
  90 #include "backend.h"
  91 #include "target.h"
  92 #include "rtl.h"
  93 #include "tree.h"
  94 #include "gimple.h"
  95 #include "predict.h"
  96 #include "alloc-pool.h"
  97 #include "tree-pass.h"
  98 #include "ssa.h"
  99 #include "optabs-tree.h"
 100 #include "gimple-pretty-print.h"
 101 #include "alias.h"
 102 #include "fold-const.h"
 103 #include "gimple-fold.h"
 104 #include "gimple-iterator.h"
 105 #include "gimplify.h"
 106 #include "gimplify-me.h"
 107 #include "stor-layout.h"
 108 #include "tree-cfg.h"
 109 #include "tree-dfa.h"
 110 #include "tree-ssa.h"
 111 #include "builtins.h"
 112 #include "params.h"
 113
 114 /* This structure represents one basic block that either computes a
 115    division, or is a common dominator for basic block that compute a
 116    division.  */
 117 struct occurrence {
 118   /* The basic block represented by this structure.  */
 119   basic_block bb;
 120
 121   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 122      inserted in BB.  */
 123   tree recip_def;
 124
 125   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 126      was inserted in BB.  */
 127   gimple *recip_def_stmt;
 128
 129   /* Pointer to a list of "struct occurrence"s for blocks dominated
 130      by BB.  */
 131   struct occurrence *children;
 132
 133   /* Pointer to the next "struct occurrence"s in the list of blocks
 134      sharing a common dominator.  */
 135   struct occurrence *next;
 136
 137   /* The number of divisions that are in BB before compute_merit.  The
 138      number of divisions that are in BB or post-dominate it after
 139      compute_merit.  */
 140   int num_divisions;
 141
 142   /* True if the basic block has a division, false if it is a common
 143      dominator for basic blocks that do.  If it is false and trapping
 144      math is active, BB is not a candidate for inserting a reciprocal.  */
 145   bool bb_has_division;
 146 };
 147
 148 static struct
 149 {
 150   /* Number of 1.0/X ops inserted.  */
 151   int rdivs_inserted;
 152
 153   /* Number of 1.0/FUNC ops inserted.  */
 154   int rfuncs_inserted;
 155 } reciprocal_stats;
 156
 157 static struct
 158 {
 159   /* Number of cexpi calls inserted.  */
 160   int inserted;
 161 } sincos_stats;
 162
 163 static struct
 164 {
 165   /* Number of hand-written 16-bit nop / bswaps found.  */
 166   int found_16bit;
 167
 168   /* Number of hand-written 32-bit nop / bswaps found.  */
 169   int found_32bit;
 170
 171   /* Number of hand-written 64-bit nop / bswaps found.  */
 172   int found_64bit;
 173 } nop_stats, bswap_stats;
 174
 175 static struct
 176 {
 177   /* Number of widening multiplication ops inserted.  */
 178   int widen_mults_inserted;
 179
 180   /* Number of integer multiply-and-accumulate ops inserted.  */
 181   int maccs_inserted;
 182
 183   /* Number of fp fused multiply-add ops inserted.  */
 184   int fmas_inserted;
 185 } widen_mul_stats;
 186
 187 /* The instance of "struct occurrence" representing the highest
 188    interesting block in the dominator tree.  */
 189 static struct occurrence *occ_head;
 190
 191 /* Allocation pool for getting instances of "struct occurrence".  */
 192 static object_allocator<occurrence> *occ_pool;
 193
 194
 195
 196 /* Allocate and return a new struct occurrence for basic block BB, and
 197    whose children list is headed by CHILDREN.  */
 198 static struct occurrence *
 199 occ_new (basic_block bb, struct occurrence *children)
 200 {
 201   struct occurrence *occ;
 202
 203   bb->aux = occ = occ_pool->allocate ();
 204   memset (occ, 0, sizeof (struct occurrence));
 205
 206   occ->bb = bb;
 207   occ->children = children;
 208   return occ;
 209 }
 210
 211
 212 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 213    list of "struct occurrence"s, one per basic block, having IDOM as
 214    their common dominator.
 215
 216    We try to insert NEW_OCC as deep as possible in the tree, and we also
 217    insert any other block that is a common dominator for BB and one
 218    block already in the tree.  */
 219
 220 static void
 221 insert_bb (struct occurrence *new_occ, basic_block idom,
 222            struct occurrence **p_head)
 223 {
 224   struct occurrence *occ, **p_occ;
 225
 226   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 227     {
 228       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 229       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 230       if (dom == bb)
 231         {
 232           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 233              from its list.  */
 234           *p_occ = occ->next;
 235           occ->next = new_occ->children;
 236           new_occ->children = occ;
 237
 238           /* Try the next block (it may as well be dominated by BB).  */
 239         }
 240
 241       else if (dom == occ_bb)
 242         {
 243           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 244           insert_bb (new_occ, dom, &occ->children);
 245           return;
 246         }
 247
 248       else if (dom != idom)
 249         {
 250           gcc_assert (!dom->aux);
 251
 252           /* There is a dominator between IDOM and BB, add it and make
 253              two children out of NEW_OCC and OCC.  First, remove OCC from
 254              its list.  */
 255           *p_occ = occ->next;
 256           new_occ->next = occ;
 257           occ->next = NULL;
 258
 259           /* None of the previous blocks has DOM as a dominator: if we tail
 260              recursed, we would reexamine them uselessly. Just switch BB with
 261              DOM, and go on looking for blocks dominated by DOM.  */
 262           new_occ = occ_new (dom, new_occ);
 263         }
 264
 265       else
 266         {
 267           /* Nothing special, go on with the next element.  */
 268           p_occ = &occ->next;
 269         }
 270     }
 271
 272   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 273   new_occ->next = *p_head;
 274   *p_head = new_occ;
 275 }
 276
 277 /* Register that we found a division in BB.  */
 278
 279 static inline void
 280 register_division_in (basic_block bb)
 281 {
 282   struct occurrence *occ;
 283
 284   occ = (struct occurrence *) bb->aux;
 285   if (!occ)
 286     {
 287       occ = occ_new (bb, NULL);
 288       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
 289     }
 290
 291   occ->bb_has_division = true;
 292   occ->num_divisions++;
 293 }
 294
 295
 296 /* Compute the number of divisions that postdominate each block in OCC and
 297    its children.  */
 298
 299 static void
 300 compute_merit (struct occurrence *occ)
 301 {
 302   struct occurrence *occ_child;
 303   basic_block dom = occ->bb;
 304
 305   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 306     {
 307       basic_block bb;
 308       if (occ_child->children)
 309         compute_merit (occ_child);
 310
 311       if (flag_exceptions)
 312         bb = single_noncomplex_succ (dom);
 313       else
 314         bb = dom;
 315
 316       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 317         occ->num_divisions += occ_child->num_divisions;
 318     }
 319 }
 320
 321
 322 /* Return whether USE_STMT is a floating-point division by DEF.  */
 323 static inline bool
 324 is_division_by (gimple *use_stmt, tree def)
 325 {
 326   return is_gimple_assign (use_stmt)
 327          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 328          && gimple_assign_rhs2 (use_stmt) == def
 329          /* Do not recognize x / x as valid division, as we are getting
 330             confused later by replacing all immediate uses x in such
 331             a stmt.  */
 332          && gimple_assign_rhs1 (use_stmt) != def;
 333 }
 334
 335 /* Walk the subset of the dominator tree rooted at OCC, setting the
 336    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 337    the given basic block.  The field may be left NULL, of course,
 338    if it is not possible or profitable to do the optimization.
 339
 340    DEF_BSI is an iterator pointing at the statement defining DEF.
 341    If RECIP_DEF is set, a dominator already has a computation that can
 342    be used.  */
 343
 344 static void
 345 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 346                     tree def, tree recip_def, int threshold)
 347 {
 348   tree type;
 349   gassign *new_stmt;
 350   gimple_stmt_iterator gsi;
 351   struct occurrence *occ_child;
 352
 353   if (!recip_def
 354       && (occ->bb_has_division || !flag_trapping_math)
 355       && occ->num_divisions >= threshold)
 356     {
 357       /* Make a variable with the replacement and substitute it.  */
 358       type = TREE_TYPE (def);
 359       recip_def = create_tmp_reg (type, "reciptmp");
 360       new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
 361                                       build_one_cst (type), def);
 362
 363       if (occ->bb_has_division)
 364         {
 365           /* Case 1: insert before an existing division.  */
 366           gsi = gsi_after_labels (occ->bb);
 367           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 368             gsi_next (&gsi);
 369
 370           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 371         }
 372       else if (def_gsi && occ->bb == def_gsi->bb)
 373         {
 374           /* Case 2: insert right after the definition.  Note that this will
 375              never happen if the definition statement can throw, because in
 376              that case the sole successor of the statement's basic block will
 377              dominate all the uses as well.  */
 378           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 379         }
 380       else
 381         {
 382           /* Case 3: insert in a basic block not containing defs/uses.  */
 383           gsi = gsi_after_labels (occ->bb);
 384           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 385         }
 386
 387       reciprocal_stats.rdivs_inserted++;
 388
 389       occ->recip_def_stmt = new_stmt;
 390     }
 391
 392   occ->recip_def = recip_def;
 393   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 394     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 395 }
 396
 397
 398 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 399    possible.  */
 400
 401 static inline void
 402 replace_reciprocal (use_operand_p use_p)
 403 {
 404   gimple *use_stmt = USE_STMT (use_p);
 405   basic_block bb = gimple_bb (use_stmt);
 406   struct occurrence *occ = (struct occurrence *) bb->aux;
 407
 408   if (optimize_bb_for_speed_p (bb)
 409       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 410     {
 411       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 412       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 413       SET_USE (use_p, occ->recip_def);
 414       fold_stmt_inplace (&gsi);
 415       update_stmt (use_stmt);
 416     }
 417 }
 418
 419
 420 /* Free OCC and return one more "struct occurrence" to be freed.  */
 421
 422 static struct occurrence *
 423 free_bb (struct occurrence *occ)
 424 {
 425   struct occurrence *child, *next;
 426
 427   /* First get the two pointers hanging off OCC.  */
 428   next = occ->next;
 429   child = occ->children;
 430   occ->bb->aux = NULL;
 431   occ_pool->remove (occ);
 432
 433   /* Now ensure that we don't recurse unless it is necessary.  */
 434   if (!child)
 435     return next;
 436   else
 437     {
 438       while (next)
 439         next = free_bb (next);
 440
 441       return child;
 442     }
 443 }
 444
 445
 446 /* Look for floating-point divisions among DEF's uses, and try to
 447    replace them by multiplications with the reciprocal.  Add
 448    as many statements computing the reciprocal as needed.
 449
 450    DEF must be a GIMPLE register of a floating-point type.  */
 451
 452 static void
 453 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 454 {
 455   use_operand_p use_p;
 456   imm_use_iterator use_iter;
 457   struct occurrence *occ;
 458   int count = 0, threshold;
 459
 460   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 461
 462   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 463     {
 464       gimple *use_stmt = USE_STMT (use_p);
 465       if (is_division_by (use_stmt, def))
 466         {
 467           register_division_in (gimple_bb (use_stmt));
 468           count++;
 469         }
 470     }
 471
 472   /* Do the expensive part only if we can hope to optimize something.  */
 473   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 474   if (count >= threshold)
 475     {
 476       gimple *use_stmt;
 477       for (occ = occ_head; occ; occ = occ->next)
 478         {
 479           compute_merit (occ);
 480           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 481         }
 482
 483       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 484         {
 485           if (is_division_by (use_stmt, def))
 486             {
 487               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 488                 replace_reciprocal (use_p);
 489             }
 490         }
 491     }
 492
 493   for (occ = occ_head; occ; )
 494     occ = free_bb (occ);
 495
 496   occ_head = NULL;
 497 }
 498
 499 /* Go through all the floating-point SSA_NAMEs, and call
 500    execute_cse_reciprocals_1 on each of them.  */
 501 namespace {
 502
 503 const pass_data pass_data_cse_reciprocals =
 504 {
 505   GIMPLE_PASS, /* type */
 506   "recip", /* name */
 507   OPTGROUP_NONE, /* optinfo_flags */
 508   TV_NONE, /* tv_id */
 509   PROP_ssa, /* properties_required */
 510   0, /* properties_provided */
 511   0, /* properties_destroyed */
 512   0, /* todo_flags_start */
 513   TODO_update_ssa, /* todo_flags_finish */
 514 };
 515
 516 class pass_cse_reciprocals : public gimple_opt_pass
 517 {
 518 public:
 519   pass_cse_reciprocals (gcc::context *ctxt)
 520     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
 521   {}
 522
 523   /* opt_pass methods: */
 524   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
 525   virtual unsigned int execute (function *);
 526
 527 }; // class pass_cse_reciprocals
 528
 529 unsigned int
 530 pass_cse_reciprocals::execute (function *fun)
 531 {
 532   basic_block bb;
 533   tree arg;
 534
 535   occ_pool = new object_allocator<occurrence> ("dominators for recip");
 536
 537   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 538   calculate_dominance_info (CDI_DOMINATORS);
 539   calculate_dominance_info (CDI_POST_DOMINATORS);
 540
 541   if (flag_checking)
 542     FOR_EACH_BB_FN (bb, fun)
 543       gcc_assert (!bb->aux);
 544
 545   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
 546     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 547         && is_gimple_reg (arg))
 548       {
 549         tree name = ssa_default_def (fun, arg);
 550         if (name)
 551           execute_cse_reciprocals_1 (NULL, name);
 552       }
 553
 554   FOR_EACH_BB_FN (bb, fun)
 555     {
 556       tree def;
 557
 558       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 559            gsi_next (&gsi))
 560         {
 561           gphi *phi = gsi.phi ();
 562           def = PHI_RESULT (phi);
 563           if (! virtual_operand_p (def)
 564               && FLOAT_TYPE_P (TREE_TYPE (def)))
 565             execute_cse_reciprocals_1 (NULL, def);
 566         }
 567
 568       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 569            gsi_next (&gsi))
 570         {
 571           gimple *stmt = gsi_stmt (gsi);
 572
 573           if (gimple_has_lhs (stmt)
 574               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 575               && FLOAT_TYPE_P (TREE_TYPE (def))
 576               && TREE_CODE (def) == SSA_NAME)
 577             execute_cse_reciprocals_1 (&gsi, def);
 578         }
 579
 580       if (optimize_bb_for_size_p (bb))
 581         continue;
 582
 583       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 584       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 585            gsi_next (&gsi))
 586         {
 587           gimple *stmt = gsi_stmt (gsi);
 588           tree fndecl;
 589
 590           if (is_gimple_assign (stmt)
 591               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 592             {
 593               tree arg1 = gimple_assign_rhs2 (stmt);
 594               gimple *stmt1;
 595
 596               if (TREE_CODE (arg1) != SSA_NAME)
 597                 continue;
 598
 599               stmt1 = SSA_NAME_DEF_STMT (arg1);
 600
 601               if (is_gimple_call (stmt1)
 602                   && gimple_call_lhs (stmt1)
 603                   && (fndecl = gimple_call_fndecl (stmt1))
 604                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 605                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 606                 {
 607                   enum built_in_function code;
 608                   bool md_code, fail;
 609                   imm_use_iterator ui;
 610                   use_operand_p use_p;
 611
 612                   code = DECL_FUNCTION_CODE (fndecl);
 613                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 614
 615                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 616                   if (!fndecl)
 617                     continue;
 618
 619                   /* Check that all uses of the SSA name are divisions,
 620                      otherwise replacing the defining statement will do
 621                      the wrong thing.  */
 622                   fail = false;
 623                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 624                     {
 625                       gimple *stmt2 = USE_STMT (use_p);
 626                       if (is_gimple_debug (stmt2))
 627                         continue;
 628                       if (!is_gimple_assign (stmt2)
 629                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 630                           || gimple_assign_rhs1 (stmt2) == arg1
 631                           || gimple_assign_rhs2 (stmt2) != arg1)
 632                         {
 633                           fail = true;
 634                           break;
 635                         }
 636                     }
 637                   if (fail)
 638                     continue;
 639
 640                   gimple_replace_ssa_lhs (stmt1, arg1);
 641                   gimple_call_set_fndecl (stmt1, fndecl);
 642                   update_stmt (stmt1);
 643                   reciprocal_stats.rfuncs_inserted++;
 644
 645                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 646                     {
 647                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 648                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 649                       fold_stmt_inplace (&gsi);
 650                       update_stmt (stmt);
 651                     }
 652                 }
 653             }
 654         }
 655     }
 656
 657   statistics_counter_event (fun, "reciprocal divs inserted",
 658                             reciprocal_stats.rdivs_inserted);
 659   statistics_counter_event (fun, "reciprocal functions inserted",
 660                             reciprocal_stats.rfuncs_inserted);
 661
 662   free_dominance_info (CDI_DOMINATORS);
 663   free_dominance_info (CDI_POST_DOMINATORS);
 664   delete occ_pool;
 665   return 0;
 666 }
 667
 668 } // anon namespace
 669
 670 gimple_opt_pass *
 671 make_pass_cse_reciprocals (gcc::context *ctxt)
 672 {
 673   return new pass_cse_reciprocals (ctxt);
 674 }
 675
 676 /* Records an occurrence at statement USE_STMT in the vector of trees
 677    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 678    is not yet initialized.  Returns true if the occurrence was pushed on
 679    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 680    statements in the vector.  */
 681
 682 static bool
 683 maybe_record_sincos (vec<gimple *> *stmts,
 684                      basic_block *top_bb, gimple *use_stmt)
 685 {
 686   basic_block use_bb = gimple_bb (use_stmt);
 687   if (*top_bb
 688       && (*top_bb == use_bb
 689           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 690     stmts->safe_push (use_stmt);
 691   else if (!*top_bb
 692            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 693     {
 694       stmts->safe_push (use_stmt);
 695       *top_bb = use_bb;
 696     }
 697   else
 698     return false;
 699
 700   return true;
 701 }
 702
 703 /* Look for sin, cos and cexpi calls with the same argument NAME and
 704    create a single call to cexpi CSEing the result in this case.
 705    We first walk over all immediate uses of the argument collecting
 706    statements that we can CSE in a vector and in a second pass replace
 707    the statement rhs with a REALPART or IMAGPART expression on the
 708    result of the cexpi call we insert before the use statement that
 709    dominates all other candidates.  */
 710
 711 static bool
 712 execute_cse_sincos_1 (tree name)
 713 {
 714   gimple_stmt_iterator gsi;
 715   imm_use_iterator use_iter;
 716   tree fndecl, res, type;
 717   gimple *def_stmt, *use_stmt, *stmt;
 718   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 719   auto_vec<gimple *> stmts;
 720   basic_block top_bb = NULL;
 721   int i;
 722   bool cfg_changed = false;
 723
 724   type = TREE_TYPE (name);
 725   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 726     {
 727       if (gimple_code (use_stmt) != GIMPLE_CALL
 728           || !gimple_call_lhs (use_stmt)
 729           || !(fndecl = gimple_call_fndecl (use_stmt))
 730           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 731         continue;
 732
 733       switch (DECL_FUNCTION_CODE (fndecl))
 734         {
 735         CASE_FLT_FN (BUILT_IN_COS):
 736           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 737           break;
 738
 739         CASE_FLT_FN (BUILT_IN_SIN):
 740           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 741           break;
 742
 743         CASE_FLT_FN (BUILT_IN_CEXPI):
 744           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 745           break;
 746
 747         default:;
 748         }
 749     }
 750
 751   if (seen_cos + seen_sin + seen_cexpi <= 1)
 752     return false;
 753
 754   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 755      the name def statement.  */
 756   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 757   if (!fndecl)
 758     return false;
 759   stmt = gimple_build_call (fndecl, 1, name);
 760   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 761   gimple_call_set_lhs (stmt, res);
 762
 763   def_stmt = SSA_NAME_DEF_STMT (name);
 764   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 765       && gimple_code (def_stmt) != GIMPLE_PHI
 766       && gimple_bb (def_stmt) == top_bb)
 767     {
 768       gsi = gsi_for_stmt (def_stmt);
 769       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 770     }
 771   else
 772     {
 773       gsi = gsi_after_labels (top_bb);
 774       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 775     }
 776   sincos_stats.inserted++;
 777
 778   /* And adjust the recorded old call sites.  */
 779   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
 780     {
 781       tree rhs = NULL;
 782       fndecl = gimple_call_fndecl (use_stmt);
 783
 784       switch (DECL_FUNCTION_CODE (fndecl))
 785         {
 786         CASE_FLT_FN (BUILT_IN_COS):
 787           rhs = fold_build1 (REALPART_EXPR, type, res);
 788           break;
 789
 790         CASE_FLT_FN (BUILT_IN_SIN):
 791           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 792           break;
 793
 794         CASE_FLT_FN (BUILT_IN_CEXPI):
 795           rhs = res;
 796           break;
 797
 798         default:;
 799           gcc_unreachable ();
 800         }
 801
 802         /* Replace call with a copy.  */
 803         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 804
 805         gsi = gsi_for_stmt (use_stmt);
 806         gsi_replace (&gsi, stmt, true);
 807         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 808           cfg_changed = true;
 809     }
 810
 811   return cfg_changed;
 812 }
 813
 814 /* To evaluate powi(x,n), the floating point value x raised to the
 815    constant integer exponent n, we use a hybrid algorithm that
 816    combines the "window method" with look-up tables.  For an
 817    introduction to exponentiation algorithms and "addition chains",
 818    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 819    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 820    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 821    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 822
 823 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 824    multiplications to inline before calling the system library's pow
 825    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 826    so this default never requires calling pow, powf or powl.  */
 827
 828 #ifndef POWI_MAX_MULTS
 829 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 830 #endif
 831
 832 /* The size of the "optimal power tree" lookup table.  All
 833    exponents less than this value are simply looked up in the
 834    powi_table below.  This threshold is also used to size the
 835    cache of pseudo registers that hold intermediate results.  */
 836 #define POWI_TABLE_SIZE 256
 837
 838 /* The size, in bits of the window, used in the "window method"
 839    exponentiation algorithm.  This is equivalent to a radix of
 840    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 841 #define POWI_WINDOW_SIZE 3
 842
/* The following table is an efficient representation of an
   "optimal power tree".  For each value, i, the corresponding
   value, j, in the table states that an optimal evaluation
   sequence for calculating pow(x,i) can be found by evaluating
   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
   100 integers is given in Knuth's "Seminumerical algorithms".

   The table is indexed by the exponent and has POWI_TABLE_SIZE
   entries; the trailing comment on each row gives the range of
   exponents that the row covers.  */

static const unsigned char powi_table[POWI_TABLE_SIZE] =
  {
      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
  };
 885
 886
 887 /* Return the number of multiplications required to calculate
 888    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 889    subroutine of powi_cost.  CACHE is an array indicating
 890    which exponents have already been calculated.  */
 891
 892 static int
 893 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 894 {
 895   /* If we've already calculated this exponent, then this evaluation
 896      doesn't require any additional multiplications.  */
 897   if (cache[n])
 898     return 0;
 899
 900   cache[n] = true;
 901   return powi_lookup_cost (n - powi_table[n], cache)
 902          + powi_lookup_cost (powi_table[n], cache) + 1;
 903 }
 904
 905 /* Return the number of multiplications required to calculate
 906    powi(x,n) for an arbitrary x, given the exponent N.  This
 907    function needs to be kept in sync with powi_as_mults below.  */
 908
 909 static int
 910 powi_cost (HOST_WIDE_INT n)
 911 {
 912   bool cache[POWI_TABLE_SIZE];
 913   unsigned HOST_WIDE_INT digit;
 914   unsigned HOST_WIDE_INT val;
 915   int result;
 916
 917   if (n == 0)
 918     return 0;
 919
 920   /* Ignore the reciprocal when calculating the cost.  */
 921   val = (n < 0) ? -n : n;
 922
 923   /* Initialize the exponent cache.  */
 924   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 925   cache[1] = true;
 926
 927   result = 0;
 928
 929   while (val >= POWI_TABLE_SIZE)
 930     {
 931       if (val & 1)
 932         {
 933           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 934           result += powi_lookup_cost (digit, cache)
 935                     + POWI_WINDOW_SIZE + 1;
 936           val >>= POWI_WINDOW_SIZE;
 937         }
 938       else
 939         {
 940           val >>= 1;
 941           result++;
 942         }
 943     }
 944
 945   return result + powi_lookup_cost (val, cache);
 946 }
 947
 948 /* Recursive subroutine of powi_as_mults.  This function takes the
 949    array, CACHE, of already calculated exponents and an exponent N and
 950    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 951
 952 static tree
 953 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 954                  HOST_WIDE_INT n, tree *cache)
 955 {
 956   tree op0, op1, ssa_target;
 957   unsigned HOST_WIDE_INT digit;
 958   gassign *mult_stmt;
 959
 960   if (n < POWI_TABLE_SIZE && cache[n])
 961     return cache[n];
 962
 963   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 964
 965   if (n < POWI_TABLE_SIZE)
 966     {
 967       cache[n] = ssa_target;
 968       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 969       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
 970     }
 971   else if (n & 1)
 972     {
 973       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 974       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
 975       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
 976     }
 977   else
 978     {
 979       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
 980       op1 = op0;
 981     }
 982
 983   mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
 984   gimple_set_location (mult_stmt, loc);
 985   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
 986
 987   return ssa_target;
 988 }
 989
 990 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
 991    This function needs to be kept in sync with powi_cost above.  */
 992
 993 static tree
 994 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
 995                tree arg0, HOST_WIDE_INT n)
 996 {
 997   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
 998   gassign *div_stmt;
 999   tree target;
1000
1001   if (n == 0)
1002     return build_real (type, dconst1);
1003
1004   memset (cache, 0,  sizeof (cache));
1005   cache[1] = arg0;
1006
1007   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1008   if (n >= 0)
1009     return result;
1010
1011   /* If the original exponent was negative, reciprocate the result.  */
1012   target = make_temp_ssa_name (type, NULL, "powmult");
1013   div_stmt = gimple_build_assign (target, RDIV_EXPR,
1014                                   build_real (type, dconst1), result);
1015   gimple_set_location (div_stmt, loc);
1016   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1017
1018   return target;
1019 }
1020
1021 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1022    location info LOC.  If the arguments are appropriate, create an
1023    equivalent sequence of statements prior to GSI using an optimal
1024    number of multiplications, and return an expession holding the
1025    result.  */
1026
1027 static tree
1028 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1029                             tree arg0, HOST_WIDE_INT n)
1030 {
1031   /* Avoid largest negative number.  */
1032   if (n != -n
1033       && ((n >= -1 && n <= 2)
1034           || (optimize_function_for_speed_p (cfun)
1035               && powi_cost (n) <= POWI_MAX_MULTS)))
1036     return powi_as_mults (gsi, loc, arg0, n);
1037
1038   return NULL_TREE;
1039 }
1040
1041 /* Build a gimple call statement that calls FN with argument ARG.
1042    Set the lhs of the call statement to a fresh SSA name.  Insert the
1043    statement prior to GSI's current position, and return the fresh
1044    SSA name.  */
1045
1046 static tree
1047 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1048                        tree fn, tree arg)
1049 {
1050   gcall *call_stmt;
1051   tree ssa_target;
1052
1053   call_stmt = gimple_build_call (fn, 1, arg);
1054   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1055   gimple_set_lhs (call_stmt, ssa_target);
1056   gimple_set_location (call_stmt, loc);
1057   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1058
1059   return ssa_target;
1060 }
1061
1062 /* Build a gimple binary operation with the given CODE and arguments
1063    ARG0, ARG1, assigning the result to a new SSA name for variable
1064    TARGET.  Insert the statement prior to GSI's current position, and
1065    return the fresh SSA name.*/
1066
1067 static tree
1068 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1069                         const char *name, enum tree_code code,
1070                         tree arg0, tree arg1)
1071 {
1072   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1073   gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1074   gimple_set_location (stmt, loc);
1075   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1076   return result;
1077 }
1078
1079 /* Build a gimple reference operation with the given CODE and argument
1080    ARG, assigning the result to a new SSA name of TYPE with NAME.
1081    Insert the statement prior to GSI's current position, and return
1082    the fresh SSA name.  */
1083
1084 static inline tree
1085 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1086                       const char *name, enum tree_code code, tree arg0)
1087 {
1088   tree result = make_temp_ssa_name (type, NULL, name);
1089   gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1090   gimple_set_location (stmt, loc);
1091   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1092   return result;
1093 }
1094
1095 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1096    prior to GSI's current position, and return the fresh SSA name.  */
1097
1098 static tree
1099 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1100                        tree type, tree val)
1101 {
1102   tree result = make_ssa_name (type);
1103   gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1104   gimple_set_location (stmt, loc);
1105   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1106   return result;
1107 }
1108
/* Describes how a fractional exponent decomposes into a sum of powers
   of 0.5, as filled in by representable_as_half_series_p.  */
struct pow_synth_sqrt_info
{
  /* FACTORS[i] is true when 0.5**(i+1) takes part in the sum.  The
     storage is supplied by the user of the structure.  */
  bool *factors;
  /* One more than the index of the last set entry in FACTORS; zero
     until a representation has been found.  */
  unsigned int deepest;
  /* Incremented for each set factor other than the deepest one.  */
  unsigned int num_mults;
};
1115
1116 /* Return true iff the real value C can be represented as a
1117    sum of powers of 0.5 up to N.  That is:
1118    C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1119    Record in INFO the various parameters of the synthesis algorithm such
1120    as the factors a[i], the maximum 0.5 power and the number of
1121    multiplications that will be required.  */
1122
1123 bool
1124 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1125                                  struct pow_synth_sqrt_info *info)
1126 {
1127   REAL_VALUE_TYPE factor = dconsthalf;
1128   REAL_VALUE_TYPE remainder = c;
1129
1130   info->deepest = 0;
1131   info->num_mults = 0;
1132   memset (info->factors, 0, n * sizeof (bool));
1133
1134   for (unsigned i = 0; i < n; i++)
1135     {
1136       REAL_VALUE_TYPE res;
1137
1138       /* If something inexact happened bail out now.  */
1139       if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1140         return false;
1141
1142       /* We have hit zero.  The number is representable as a sum
1143          of powers of 0.5.  */
1144       if (real_equal (&res, &dconst0))
1145         {
1146           info->factors[i] = true;
1147           info->deepest = i + 1;
1148           return true;
1149         }
1150       else if (!REAL_VALUE_NEGATIVE (res))
1151         {
1152           remainder = res;
1153           info->factors[i] = true;
1154           info->num_mults++;
1155         }
1156       else
1157         info->factors[i] = false;
1158
1159       real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1160     }
1161   return false;
1162 }
1163
1164 /* Return the tree corresponding to FN being applied
1165    to ARG N times at GSI and LOC.
1166    Look up previous results from CACHE if need be.
1167    cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times.  */
1168
1169 static tree
1170 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1171               tree fn, location_t loc, tree *cache)
1172 {
1173   tree res = cache[n];
1174   if (!res)
1175     {
1176       tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1177       res = build_and_insert_call (gsi, loc, fn, prev);
1178       cache[n] = res;
1179     }
1180
1181   return res;
1182 }
1183
/* Write to STREAM the text of FNAME applied N times in a row to ARG.
   For FNAME = "foo", ARG = "x" and N = 2 the output is
   "foo (foo (x))"; for N = 0 it is just ARG.  */

static void
print_nested_fn (FILE *stream, const char *fname, const char *arg,
                 unsigned int n)
{
  unsigned int level;

  /* One opening "FNAME (" per application.  */
  for (level = 0; level < n; level++)
    fprintf (stream, "%s (", fname);

  fprintf (stream, "%s", arg);

  /* Then the matching closing parentheses.  */
  for (level = 0; level < n; level++)
    fprintf (stream, ")");
}
1201
1202 /* Print to STREAM the fractional sequence of sqrt chains
1203    applied to ARG, described by INFO.  Used for the dump file.  */
1204
1205 static void
1206 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1207                                 struct pow_synth_sqrt_info *info)
1208 {
1209   for (unsigned int i = 0; i < info->deepest; i++)
1210     {
1211       bool is_set = info->factors[i];
1212       if (is_set)
1213         {
1214           print_nested_fn (stream, "sqrt", arg, i + 1);
1215           if (i != info->deepest - 1)
1216             fprintf (stream, " * ");
1217         }
1218     }
1219 }
1220
1221 /* Print to STREAM a representation of raising ARG to an integer
1222    power N.  Used for the dump file.  */
1223
1224 static void
1225 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1226 {
1227   if (n > 1)
1228     fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1229   else if (n == 1)
1230     fprintf (stream, "%s", arg);
1231 }
1232
1233 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1234    square roots.  Place at GSI and LOC.  Limit the maximum depth
1235    of the sqrt chains to MAX_DEPTH.  Return the tree holding the
1236    result of the expanded sequence or NULL_TREE if the expansion failed.
1237
1238    This routine assumes that ARG1 is a real number with a fractional part
1239    (the integer exponent case will have been handled earlier in
1240    gimple_expand_builtin_pow).
1241
1242    For ARG1 > 0.0:
1243    * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1244      FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1245                     FRAC_PART == ARG1 - WHOLE_PART:
1246      Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1247      POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1248      if it can be expressed as such, that is if FRAC_PART satisfies:
1249      FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1250      where integer a[i] is either 0 or 1.
1251
1252      Example:
1253      POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1254        --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1255
1256    For ARG1 < 0.0 there are two approaches:
1257    * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1258          is calculated as above.
1259
1260      Example:
1261      POW (x, -5.625) == 1.0 / POW (x, 5.625)
1262        -->  1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1263
1264    * (B) : WHOLE_PART := - ceil (abs (ARG1))
1265            FRAC_PART  := ARG1 - WHOLE_PART
1266      and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1267      Example:
1268      POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1269        --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1270
1271    For ARG1 < 0.0 we choose between (A) and (B) depending on
1272    how many multiplications we'd have to do.
1273    So, for the example in (B): POW (x, -5.875), if we were to
1274    follow algorithm (A) we would produce:
1275    1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1276    which contains more multiplications than approach (B).
1277
1278    Hopefully, this approach will eliminate potentially expensive POW library
1279    calls when unsafe floating point math is enabled and allow the compiler to
1280    further optimise the multiplies, square roots and divides produced by this
1281    function.  */
1282
1283 static tree
1284 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1285                      tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1286 {
1287   tree type = TREE_TYPE (arg0);
1288   machine_mode mode = TYPE_MODE (type);
1289   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1290   bool one_over = true;
1291
1292   if (!sqrtfn)
1293     return NULL_TREE;
1294
1295   if (TREE_CODE (arg1) != REAL_CST)
1296     return NULL_TREE;
1297
1298   REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1299
1300   gcc_assert (max_depth > 0);
1301   tree *cache = XALLOCAVEC (tree, max_depth + 1);
1302
1303   struct pow_synth_sqrt_info synth_info;
1304   synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1305   synth_info.deepest = 0;
1306   synth_info.num_mults = 0;
1307
1308   bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1309   REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1310
1311   /* The whole and fractional parts of exp.  */
1312   REAL_VALUE_TYPE whole_part;
1313   REAL_VALUE_TYPE frac_part;
1314
1315   real_floor (&whole_part, mode, &exp);
1316   real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1317
1318
1319   REAL_VALUE_TYPE ceil_whole = dconst0;
1320   REAL_VALUE_TYPE ceil_fract = dconst0;
1321
1322   if (neg_exp)
1323     {
1324       real_ceil (&ceil_whole, mode, &exp);
1325       real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1326     }
1327
1328   if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1329     return NULL_TREE;
1330
1331   /* Check whether it's more profitable to not use 1.0 / ...  */
1332   if (neg_exp)
1333     {
1334       struct pow_synth_sqrt_info alt_synth_info;
1335       alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1336       alt_synth_info.deepest = 0;
1337       alt_synth_info.num_mults = 0;
1338
1339       if (representable_as_half_series_p (ceil_fract, max_depth,
1340                                            &alt_synth_info)
1341           && alt_synth_info.deepest <= synth_info.deepest
1342           && alt_synth_info.num_mults < synth_info.num_mults)
1343         {
1344           whole_part = ceil_whole;
1345           frac_part = ceil_fract;
1346           synth_info.deepest = alt_synth_info.deepest;
1347           synth_info.num_mults = alt_synth_info.num_mults;
1348           memcpy (synth_info.factors, alt_synth_info.factors,
1349                   (max_depth + 1) * sizeof (bool));
1350           one_over = false;
1351         }
1352     }
1353
1354   HOST_WIDE_INT n = real_to_integer (&whole_part);
1355   REAL_VALUE_TYPE cint;
1356   real_from_integer (&cint, VOIDmode, n, SIGNED);
1357
1358   if (!real_identical (&whole_part, &cint))
1359     return NULL_TREE;
1360
1361   if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1362     return NULL_TREE;
1363
1364   memset (cache, 0, (max_depth + 1) * sizeof (tree));
1365
1366   tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1367
1368   /* Calculate the integer part of the exponent.  */
1369   if (n > 1)
1370     {
1371       integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1372       if (!integer_res)
1373         return NULL_TREE;
1374     }
1375
1376   if (dump_file)
1377     {
1378       char string[64];
1379
1380       real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1381       fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1382
1383       if (neg_exp)
1384         {
1385           if (one_over)
1386             {
1387               fprintf (dump_file, "1.0 / (");
1388               dump_integer_part (dump_file, "x", n);
1389               if (n > 0)
1390                 fprintf (dump_file, " * ");
1391               dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1392               fprintf (dump_file, ")");
1393             }
1394           else
1395             {
1396               dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1397               fprintf (dump_file, " / (");
1398               dump_integer_part (dump_file, "x", n);
1399               fprintf (dump_file, ")");
1400             }
1401         }
1402       else
1403         {
1404           dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1405           if (n > 0)
1406             fprintf (dump_file, " * ");
1407           dump_integer_part (dump_file, "x", n);
1408         }
1409
1410       fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1411     }
1412
1413
1414   tree fract_res = NULL_TREE;
1415   cache[0] = arg0;
1416
1417   /* Calculate the fractional part of the exponent.  */
1418   for (unsigned i = 0; i < synth_info.deepest; i++)
1419     {
1420       if (synth_info.factors[i])
1421         {
1422           tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1423
1424           if (!fract_res)
1425               fract_res = sqrt_chain;
1426
1427           else
1428             fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1429                                            fract_res, sqrt_chain);
1430         }
1431     }
1432
1433   tree res = NULL_TREE;
1434
1435   if (neg_exp)
1436     {
1437       if (one_over)
1438         {
1439           if (n > 0)
1440             res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1441                                            fract_res, integer_res);
1442           else
1443             res = fract_res;
1444
1445           res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1446                                           build_real (type, dconst1), res);
1447         }
1448       else
1449         {
1450           res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1451                                          fract_res, integer_res);
1452         }
1453     }
1454   else
1455     res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1456                                    fract_res, integer_res);
1457   return res;
1458 }
1459
/* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
   with location info LOC.  If possible, create an equivalent and
   less expensive sequence of statements prior to GSI, and return an
   expression holding the result.  Return NULL_TREE when none of the
   expansions below applies.  */

static tree
gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
                           tree arg0, tree arg1)
{
  REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
  REAL_VALUE_TYPE c2, dconst3;
  HOST_WIDE_INT n;
  tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
  machine_mode mode;
  bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
  bool hw_sqrt_exists, c_is_int, c2_is_int;

  /* Build the constant 0.25 by lowering the exponent of 1.0 by two.  */
  dconst1_4 = dconst1;
  SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);

  /* If the exponent isn't a constant, there's nothing of interest
     to be done.  */
  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  /* If the exponent is equivalent to an integer, expand to an optimal
     multiplication sequence when profitable.  */
  c = TREE_REAL_CST (arg1);
  n = real_to_integer (&c);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  c_is_int = real_identical (&c, &cint);

  if (c_is_int
      && ((n >= -1 && n <= 2)
          || (flag_unsafe_math_optimizations
              && speed_p
              && powi_cost (n) <= POWI_MAX_MULTS)))
    return gimple_expand_builtin_powi (gsi, loc, arg0, n);

  /* Attempt various optimizations using sqrt and cbrt.  */
  type = TREE_TYPE (arg0);
  mode = TYPE_MODE (type);
  sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);

  /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
     unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
     sqrt(-0) = -0.  */
  if (sqrtfn
      && real_equal (&c, &dconsthalf)
      && !HONOR_SIGNED_ZEROS (mode))
    return build_and_insert_call (gsi, loc, sqrtfn, arg0);

  hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;

  /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
     optimizations since 1./3. is not exactly representable.  If x
     is negative and finite, the correct value of pow(x,1./3.) is
     a NaN with the "invalid" exception raised, because the value
     of 1./3. actually has an even denominator.  The correct value
     of cbrt(x) is a negative real value.  */
  cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
  dconst1_3 = real_value_truncate (mode, dconst_third ());

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
      && real_equal (&c, &dconst1_3))
    return build_and_insert_call (gsi, loc, cbrtfn, arg0);

  /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
     if we don't have a hardware sqrt insn.  The constant 1./6. is
     derived from 1./3. by lowering its exponent by one.  */
  dconst1_6 = dconst1_3;
  SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && cbrtfn
      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
      && speed_p
      && hw_sqrt_exists
      && real_equal (&c, &dconst1_6))
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* cbrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
    }


  /* Attempt to expand the POW as a product of square root chains.
     Expand the 0.25 case even when optimising for size.  */
  if (flag_unsafe_math_optimizations
      && sqrtfn
      && hw_sqrt_exists
      && (speed_p || real_equal (&c, &dconst1_4))
      && !HONOR_SIGNED_ZEROS (mode))
    {
      /* When optimising for size only the 0.25 case gets here, and a
         chain depth of two is allowed for it.  */
      unsigned int max_depth = speed_p
                                ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
                                : 2;

      tree expand_with_sqrts
        = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);

      if (expand_with_sqrts)
        return expand_with_sqrts;
    }

  /* Determine whether 2*C is an integer.  Such exponents are excluded
     from the cbrt expansion below.  */
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  c2_is_int = real_identical (&c2, &cint);

  /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into

     powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
     1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.

     Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
     different from pow(x, 1./3.) due to rounding and behavior with
     negative x, we need to constrain this transformation to unsafe
     math and positive x or finite math.

     To test for 3c = n, round 3*C with real_round to get N, then
     recompute N/3 converted to MODE; the transformation applies when
     that value is identical to C.  */
  real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
  real_round (&c2, mode, &c2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
  real_convert (&c2, mode, &c2);

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
      && real_identical (&c2, &c)
      && !c2_is_int
      && optimize_function_for_speed_p (cfun)
      && powi_cost (n / 3) <= POWI_MAX_MULTS)
    {
      tree powi_x_ndiv3 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
         possible or profitable, give up.  Skip the degenerate case when
         abs(n) < 3, where the result is always 1.  */
      if (absu_hwi (n) >= 3)
        {
          powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
                                                     abs_hwi (n / 3));
          if (!powi_x_ndiv3)
            return NULL_TREE;
        }

      /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
         as that creates an unnecessary variable.  Instead, just produce
         either cbrt(x) or cbrt(x) * cbrt(x).  */
      cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);

      if (absu_hwi (n) % 3 == 1)
        powi_cbrt_x = cbrt_x;
      else
        powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                              cbrt_x, cbrt_x);

      /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
      if (absu_hwi (n) < 3)
        result = powi_cbrt_x;
      else
        result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
                                         powi_x_ndiv3, powi_cbrt_x);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
        result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
                                         build_real (type, dconst1), result);

      return result;
    }

  /* No optimizations succeeded.  */
  return NULL_TREE;
}
1641
1642 /* ARG is the argument to a cabs builtin call in GSI with location info
1643    LOC.  Create a sequence of statements prior to GSI that calculates
1644    sqrt(R*R + I*I), where R and I are the real and imaginary components
1645    of ARG, respectively.  Return an expression holding the result.  */
1646
1647 static tree
1648 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1649 {
1650   tree real_part, imag_part, addend1, addend2, sum, result;
1651   tree type = TREE_TYPE (TREE_TYPE (arg));
1652   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1653   machine_mode mode = TYPE_MODE (type);
1654
1655   if (!flag_unsafe_math_optimizations
1656       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1657       || !sqrtfn
1658       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1659     return NULL_TREE;
1660
1661   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1662                                     REALPART_EXPR, arg);
1663   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1664                                     real_part, real_part);
1665   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1666                                     IMAGPART_EXPR, arg);
1667   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1668                                     imag_part, imag_part);
1669   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1670   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1671
1672   return result;
1673 }
1674
1675 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1676    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1677    an optimal number of multiplies, when n is a constant.  */
1678
1679 namespace {
1680
/* Static description of the "sincos" pass: a GIMPLE pass that
   requires SSA form, provides PROP_gimple_opt_math and requests
   TODO_update_ssa when it finishes.  */
const pass_data pass_data_cse_sincos =
{
  GIMPLE_PASS, /* type */
  "sincos", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  PROP_gimple_opt_math, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
1693
/* The "sincos" pass object, described by pass_data_cse_sincos.  Its
   gate enables the pass whenever the `optimize' flag is nonzero;
   execute is defined out of line.  */
class pass_cse_sincos : public gimple_opt_pass
{
public:
  pass_cse_sincos (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_sincos, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      /* We no longer require either sincos or cexp, since powi expansion
         piggybacks on this pass.  */
      return optimize;
    }

  virtual unsigned int execute (function *);

}; // class pass_cse_sincos
1712
1713 unsigned int
1714 pass_cse_sincos::execute (function *fun)
1715 {
1716   basic_block bb;
1717   bool cfg_changed = false;
1718
1719   calculate_dominance_info (CDI_DOMINATORS);
1720   memset (&sincos_stats, 0, sizeof (sincos_stats));
1721
1722   FOR_EACH_BB_FN (bb, fun)
1723     {
1724       gimple_stmt_iterator gsi;
1725       bool cleanup_eh = false;
1726
1727       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1728         {
1729           gimple *stmt = gsi_stmt (gsi);
1730           tree fndecl;
1731
1732           /* Only the last stmt in a bb could throw, no need to call
1733              gimple_purge_dead_eh_edges if we change something in the middle
1734              of a basic block.  */
1735           cleanup_eh = false;
1736
1737           if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)
1738               && gimple_call_lhs (stmt))
1739             {
1740               tree arg, arg0, arg1, result;
1741               HOST_WIDE_INT n;
1742               location_t loc;
1743
1744               fndecl = gimple_call_fndecl (stmt);
1745               switch (DECL_FUNCTION_CODE (fndecl))
1746                 {
1747                 CASE_FLT_FN (BUILT_IN_COS):
1748                 CASE_FLT_FN (BUILT_IN_SIN):
1749                 CASE_FLT_FN (BUILT_IN_CEXPI):
1750                   /* Make sure we have either sincos or cexp.  */
1751                   if (!targetm.libc_has_function (function_c99_math_complex)
1752                       && !targetm.libc_has_function (function_sincos))
1753                     break;
1754
1755                   arg = gimple_call_arg (stmt, 0);
1756                   if (TREE_CODE (arg) == SSA_NAME)
1757                     cfg_changed |= execute_cse_sincos_1 (arg);
1758                   break;
1759
1760                 CASE_FLT_FN (BUILT_IN_POW):
1761                   arg0 = gimple_call_arg (stmt, 0);
1762                   arg1 = gimple_call_arg (stmt, 1);
1763
1764                   loc = gimple_location (stmt);
1765                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1766
1767                   if (result)
1768                     {
1769                       tree lhs = gimple_get_lhs (stmt);
1770                       gassign *new_stmt = gimple_build_assign (lhs, result);
1771                       gimple_set_location (new_stmt, loc);
1772                       unlink_stmt_vdef (stmt);
1773                       gsi_replace (&gsi, new_stmt, true);
1774                       cleanup_eh = true;
1775                       if (gimple_vdef (stmt))
1776                         release_ssa_name (gimple_vdef (stmt));
1777                     }
1778                   break;
1779
1780                 CASE_FLT_FN (BUILT_IN_POWI):
1781                   arg0 = gimple_call_arg (stmt, 0);
1782                   arg1 = gimple_call_arg (stmt, 1);
1783                   loc = gimple_location (stmt);
1784
1785                   if (real_minus_onep (arg0))
1786                     {
1787                       tree t0, t1, cond, one, minus_one;
1788                       gassign *stmt;
1789
1790                       t0 = TREE_TYPE (arg0);
1791                       t1 = TREE_TYPE (arg1);
1792                       one = build_real (t0, dconst1);
1793                       minus_one = build_real (t0, dconstm1);
1794
1795                       cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1796                       stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1797                                                   arg1, build_int_cst (t1, 1));
1798                       gimple_set_location (stmt, loc);
1799                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1800
1801                       result = make_temp_ssa_name (t0, NULL, "powi");
1802                       stmt = gimple_build_assign (result, COND_EXPR, cond,
1803                                                   minus_one, one);
1804                       gimple_set_location (stmt, loc);
1805                       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1806                     }
1807                   else
1808                     {
1809                       if (!tree_fits_shwi_p (arg1))
1810                         break;
1811
1812                       n = tree_to_shwi (arg1);
1813                       result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1814                     }
1815
1816                   if (result)
1817                     {
1818                       tree lhs = gimple_get_lhs (stmt);
1819                       gassign *new_stmt = gimple_build_assign (lhs, result);
1820                       gimple_set_location (new_stmt, loc);
1821                       unlink_stmt_vdef (stmt);
1822                       gsi_replace (&gsi, new_stmt, true);
1823                       cleanup_eh = true;
1824                       if (gimple_vdef (stmt))
1825                         release_ssa_name (gimple_vdef (stmt));
1826                     }
1827                   break;
1828
1829                 CASE_FLT_FN (BUILT_IN_CABS):
1830                   arg0 = gimple_call_arg (stmt, 0);
1831                   loc = gimple_location (stmt);
1832                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1833
1834                   if (result)
1835                     {
1836                       tree lhs = gimple_get_lhs (stmt);
1837                       gassign *new_stmt = gimple_build_assign (lhs, result);
1838                       gimple_set_location (new_stmt, loc);
1839                       unlink_stmt_vdef (stmt);
1840                       gsi_replace (&gsi, new_stmt, true);
1841                       cleanup_eh = true;
1842                       if (gimple_vdef (stmt))
1843                         release_ssa_name (gimple_vdef (stmt));
1844                     }
1845                   break;
1846
1847                 default:;
1848                 }
1849             }
1850         }
1851       if (cleanup_eh)
1852         cfg_changed |= gimple_purge_dead_eh_edges (bb);
1853     }
1854
1855   statistics_counter_event (fun, "sincos statements inserted",
1856                             sincos_stats.inserted);
1857
1858   return cfg_changed ? TODO_cleanup_cfg : 0;
1859 }
1860
1861 } // anon namespace
1862
1863 gimple_opt_pass *
1864 make_pass_cse_sincos (gcc::context *ctxt)
1865 {
1866   return new pass_cse_sincos (ctxt);
1867 }
1868
1869 /* A symbolic number is used to detect byte permutation and selection
1870    patterns.  Therefore the field N contains an artificial number
1871    consisting of octet sized markers:
1872
1873    0    - target byte has the value 0
1874    FF   - target byte has an unknown value (eg. due to sign extension)
1875    1..size - marker value is the target byte index minus one.
1876
1877    To detect permutations on memory sources (arrays and structures), a symbolic
1878    number is also associated a base address (the array or structure the load is
1879    made from), an offset from the base address and a range which gives the
1880    difference between the highest and lowest accessed memory location to make
1881    such a symbolic number. The range is thus different from size which reflects
1882    the size of the type of current expression. Note that for non memory source,
1883    range holds the same value as size.
1884
1885    For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1886    a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1887    still have a size of 2 but this time a range of 1.  */
1888
1889 struct symbolic_number {
1890   uint64_t n;
1891   tree type;
1892   tree base_addr;
1893   tree offset;
1894   HOST_WIDE_INT bytepos;
1895   tree alias_set;
1896   tree vuse;
1897   unsigned HOST_WIDE_INT range;
1898 };
1899
/* Number of bits used by one marker of a symbolic number.  */
#define BITS_PER_MARKER 8
/* Mask selecting the least significant marker.  */
#define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
/* Marker denoting a byte whose value is unknown.  */
#define MARKER_BYTE_UNKNOWN MARKER_MASK
/* The most significant marker, left in place, of the symbolic number N
   made of SIZE markers.  */
#define HEAD_MARKER(n, size) \
  ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))

/* The number which the find_bswap_or_nop_1 result should match in
   order to have a nop.  The number is masked according to the size of
   the symbolic number before using it.  */
#define CMPNOP \
  (sizeof (int64_t) >= 8 \
   ? ((uint64_t) 0x08070605 << 32) | 0x04030201 \
   : 0)

/* The number which the find_bswap_or_nop_1 result should match in
   order to have a byte swap.  The number is masked according to the
   size of the symbolic number before using it.  */
#define CMPXCHG \
  (sizeof (int64_t) >= 8 \
   ? ((uint64_t) 0x01020304 << 32) | 0x05060708 \
   : 0)
1917
1918 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1919    number N.  Return false if the requested operation is not permitted
1920    on a symbolic number.  */
1921
1922 static inline bool
1923 do_shift_rotate (enum tree_code code,
1924                  struct symbolic_number *n,
1925                  int count)
1926 {
1927   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1928   unsigned head_marker;
1929
1930   if (count % BITS_PER_UNIT != 0)
1931     return false;
1932   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1933
1934   /* Zero out the extra bits of N in order to avoid them being shifted
1935      into the significant bits.  */
1936   if (size < 64 / BITS_PER_MARKER)
1937     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1938
1939   switch (code)
1940     {
1941     case LSHIFT_EXPR:
1942       n->n <<= count;
1943       break;
1944     case RSHIFT_EXPR:
1945       head_marker = HEAD_MARKER (n->n, size);
1946       n->n >>= count;
1947       /* Arithmetic shift of signed type: result is dependent on the value.  */
1948       if (!TYPE_UNSIGNED (n->type) && head_marker)
1949         for (i = 0; i < count / BITS_PER_MARKER; i++)
1950           n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1951                   << ((size - 1 - i) * BITS_PER_MARKER);
1952       break;
1953     case LROTATE_EXPR:
1954       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1955       break;
1956     case RROTATE_EXPR:
1957       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1958       break;
1959     default:
1960       return false;
1961     }
1962   /* Zero unused bits for size.  */
1963   if (size < 64 / BITS_PER_MARKER)
1964     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1965   return true;
1966 }
1967
1968 /* Perform sanity checking for the symbolic number N and the gimple
1969    statement STMT.  */
1970
1971 static inline bool
1972 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
1973 {
1974   tree lhs_type;
1975
1976   lhs_type = gimple_expr_type (stmt);
1977
1978   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1979     return false;
1980
1981   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1982     return false;
1983
1984   return true;
1985 }
1986
1987 /* Initialize the symbolic number N for the bswap pass from the base element
1988    SRC manipulated by the bitwise OR expression.  */
1989
1990 static bool
1991 init_symbolic_number (struct symbolic_number *n, tree src)
1992 {
1993   int size;
1994
1995   n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1996
1997   /* Set up the symbolic number N by setting each byte to a value between 1 and
1998      the byte size of rhs1.  The highest order byte is set to n->size and the
1999      lowest order byte to 1.  */
2000   n->type = TREE_TYPE (src);
2001   size = TYPE_PRECISION (n->type);
2002   if (size % BITS_PER_UNIT != 0)
2003     return false;
2004   size /= BITS_PER_UNIT;
2005   if (size > 64 / BITS_PER_MARKER)
2006     return false;
2007   n->range = size;
2008   n->n = CMPNOP;
2009
2010   if (size < 64 / BITS_PER_MARKER)
2011     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2012
2013   return true;
2014 }
2015
2016 /* Check if STMT might be a byte swap or a nop from a memory source and returns
2017    the answer. If so, REF is that memory source and the base of the memory area
2018    accessed and the offset of the access from that base are recorded in N.  */
2019
2020 bool
2021 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2022 {
2023   /* Leaf node is an array or component ref. Memorize its base and
2024      offset from base to compare to other such leaf node.  */
2025   HOST_WIDE_INT bitsize, bitpos;
2026   machine_mode mode;
2027   int unsignedp, reversep, volatilep;
2028   tree offset, base_addr;
2029
2030   /* Not prepared to handle PDP endian.  */
2031   if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2032     return false;
2033
2034   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2035     return false;
2036
2037   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2038                                    &unsignedp, &reversep, &volatilep, false);
2039
2040   if (TREE_CODE (base_addr) == MEM_REF)
2041     {
2042       offset_int bit_offset = 0;
2043       tree off = TREE_OPERAND (base_addr, 1);
2044
2045       if (!integer_zerop (off))
2046         {
2047           offset_int boff, coff = mem_ref_offset (base_addr);
2048           boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
2049           bit_offset += boff;
2050         }
2051
2052       base_addr = TREE_OPERAND (base_addr, 0);
2053
2054       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
2055       if (wi::neg_p (bit_offset))
2056         {
2057           offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2058           offset_int tem = bit_offset.and_not (mask);
2059           /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2060              Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
2061           bit_offset -= tem;
2062           tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
2063           if (offset)
2064             offset = size_binop (PLUS_EXPR, offset,
2065                                     wide_int_to_tree (sizetype, tem));
2066           else
2067             offset = wide_int_to_tree (sizetype, tem);
2068         }
2069
2070       bitpos += bit_offset.to_shwi ();
2071     }
2072
2073   if (bitpos % BITS_PER_UNIT)
2074     return false;
2075   if (bitsize % BITS_PER_UNIT)
2076     return false;
2077   if (reversep)
2078     return false;
2079
2080   if (!init_symbolic_number (n, ref))
2081     return false;
2082   n->base_addr = base_addr;
2083   n->offset = offset;
2084   n->bytepos = bitpos / BITS_PER_UNIT;
2085   n->alias_set = reference_alias_ptr_type (ref);
2086   n->vuse = gimple_vuse (stmt);
2087   return true;
2088 }
2089
2090 /* Compute the symbolic number N representing the result of a bitwise OR on 2
2091    symbolic number N1 and N2 whose source statements are respectively
2092    SOURCE_STMT1 and SOURCE_STMT2.  */
2093
2094 static gimple *
2095 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2096                         gimple *source_stmt2, struct symbolic_number *n2,
2097                         struct symbolic_number *n)
2098 {
2099   int i, size;
2100   uint64_t mask;
2101   gimple *source_stmt;
2102   struct symbolic_number *n_start;
2103
2104   /* Sources are different, cancel bswap if they are not memory location with
2105      the same base (array, structure, ...).  */
2106   if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
2107     {
2108       uint64_t inc;
2109       HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2110       struct symbolic_number *toinc_n_ptr, *n_end;
2111
2112       if (!n1->base_addr || !n2->base_addr
2113           || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2114         return NULL;
2115
2116       if (!n1->offset != !n2->offset
2117           || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2118         return NULL;
2119
2120       if (n1->bytepos < n2->bytepos)
2121         {
2122           n_start = n1;
2123           start_sub = n2->bytepos - n1->bytepos;
2124           source_stmt = source_stmt1;
2125         }
2126       else
2127         {
2128           n_start = n2;
2129           start_sub = n1->bytepos - n2->bytepos;
2130           source_stmt = source_stmt2;
2131         }
2132
2133       /* Find the highest address at which a load is performed and
2134          compute related info.  */
2135       end1 = n1->bytepos + (n1->range - 1);
2136       end2 = n2->bytepos + (n2->range - 1);
2137       if (end1 < end2)
2138         {
2139           end = end2;
2140           end_sub = end2 - end1;
2141         }
2142       else
2143         {
2144           end = end1;
2145           end_sub = end1 - end2;
2146         }
2147       n_end = (end2 > end1) ? n2 : n1;
2148
2149       /* Find symbolic number whose lsb is the most significant.  */
2150       if (BYTES_BIG_ENDIAN)
2151         toinc_n_ptr = (n_end == n1) ? n2 : n1;
2152       else
2153         toinc_n_ptr = (n_start == n1) ? n2 : n1;
2154
2155       n->range = end - n_start->bytepos + 1;
2156
2157       /* Check that the range of memory covered can be represented by
2158          a symbolic number.  */
2159       if (n->range > 64 / BITS_PER_MARKER)
2160         return NULL;
2161
2162       /* Reinterpret byte marks in symbolic number holding the value of
2163          bigger weight according to target endianness.  */
2164       inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2165       size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2166       for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2167         {
2168           unsigned marker
2169             = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2170           if (marker && marker != MARKER_BYTE_UNKNOWN)
2171             toinc_n_ptr->n += inc;
2172         }
2173     }
2174   else
2175     {
2176       n->range = n1->range;
2177       n_start = n1;
2178       source_stmt = source_stmt1;
2179     }
2180
2181   if (!n1->alias_set
2182       || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2183     n->alias_set = n1->alias_set;
2184   else
2185     n->alias_set = ptr_type_node;
2186   n->vuse = n_start->vuse;
2187   n->base_addr = n_start->base_addr;
2188   n->offset = n_start->offset;
2189   n->bytepos = n_start->bytepos;
2190   n->type = n_start->type;
2191   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2192
2193   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2194     {
2195       uint64_t masked1, masked2;
2196
2197       masked1 = n1->n & mask;
2198       masked2 = n2->n & mask;
2199       if (masked1 && masked2 && masked1 != masked2)
2200         return NULL;
2201     }
2202   n->n = n1->n | n2->n;
2203
2204   return source_stmt;
2205 }
2206
2207 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2208    the operation given by the rhs of STMT on the result.  If the operation
2209    could successfully be executed the function returns a gimple stmt whose
2210    rhs's first tree is the expression of the source operand and NULL
2211    otherwise.  */
2212
2213 static gimple *
2214 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2215 {
2216   enum tree_code code;
2217   tree rhs1, rhs2 = NULL;
2218   gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2219   enum gimple_rhs_class rhs_class;
2220
2221   if (!limit || !is_gimple_assign (stmt))
2222     return NULL;
2223
2224   rhs1 = gimple_assign_rhs1 (stmt);
2225
2226   if (find_bswap_or_nop_load (stmt, rhs1, n))
2227     return stmt;
2228
2229   if (TREE_CODE (rhs1) != SSA_NAME)
2230     return NULL;
2231
2232   code = gimple_assign_rhs_code (stmt);
2233   rhs_class = gimple_assign_rhs_class (stmt);
2234   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2235
2236   if (rhs_class == GIMPLE_BINARY_RHS)
2237     rhs2 = gimple_assign_rhs2 (stmt);
2238
2239   /* Handle unary rhs and binary rhs with integer constants as second
2240      operand.  */
2241
2242   if (rhs_class == GIMPLE_UNARY_RHS
2243       || (rhs_class == GIMPLE_BINARY_RHS
2244           && TREE_CODE (rhs2) == INTEGER_CST))
2245     {
2246       if (code != BIT_AND_EXPR
2247           && code != LSHIFT_EXPR
2248           && code != RSHIFT_EXPR
2249           && code != LROTATE_EXPR
2250           && code != RROTATE_EXPR
2251           && !CONVERT_EXPR_CODE_P (code))
2252         return NULL;
2253
2254       source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2255
2256       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2257          we have to initialize the symbolic number.  */
2258       if (!source_stmt1)
2259         {
2260           if (gimple_assign_load_p (stmt)
2261               || !init_symbolic_number (n, rhs1))
2262             return NULL;
2263           source_stmt1 = stmt;
2264         }
2265
2266       switch (code)
2267         {
2268         case BIT_AND_EXPR:
2269           {
2270             int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2271             uint64_t val = int_cst_value (rhs2), mask = 0;
2272             uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2273
2274             /* Only constants masking full bytes are allowed.  */
2275             for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2276               if ((val & tmp) != 0 && (val & tmp) != tmp)
2277                 return NULL;
2278               else if (val & tmp)
2279                 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2280
2281             n->n &= mask;
2282           }
2283           break;
2284         case LSHIFT_EXPR:
2285         case RSHIFT_EXPR:
2286         case LROTATE_EXPR:
2287         case RROTATE_EXPR:
2288           if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2289             return NULL;
2290           break;
2291         CASE_CONVERT:
2292           {
2293             int i, type_size, old_type_size;
2294             tree type;
2295
2296             type = gimple_expr_type (stmt);
2297             type_size = TYPE_PRECISION (type);
2298             if (type_size % BITS_PER_UNIT != 0)
2299               return NULL;
2300             type_size /= BITS_PER_UNIT;
2301             if (type_size > 64 / BITS_PER_MARKER)
2302               return NULL;
2303
2304             /* Sign extension: result is dependent on the value.  */
2305             old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2306             if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2307                 && HEAD_MARKER (n->n, old_type_size))
2308               for (i = 0; i < type_size - old_type_size; i++)
2309                 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2310                         << ((type_size - 1 - i) * BITS_PER_MARKER);
2311
2312             if (type_size < 64 / BITS_PER_MARKER)
2313               {
2314                 /* If STMT casts to a smaller type mask out the bits not
2315                    belonging to the target type.  */
2316                 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2317               }
2318             n->type = type;
2319             if (!n->base_addr)
2320               n->range = type_size;
2321           }
2322           break;
2323         default:
2324           return NULL;
2325         };
2326       return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2327     }
2328
2329   /* Handle binary rhs.  */
2330
2331   if (rhs_class == GIMPLE_BINARY_RHS)
2332     {
2333       struct symbolic_number n1, n2;
2334       gimple *source_stmt, *source_stmt2;
2335
2336       if (code != BIT_IOR_EXPR)
2337         return NULL;
2338
2339       if (TREE_CODE (rhs2) != SSA_NAME)
2340         return NULL;
2341
2342       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2343
2344       switch (code)
2345         {
2346         case BIT_IOR_EXPR:
2347           source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2348
2349           if (!source_stmt1)
2350             return NULL;
2351
2352           source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2353
2354           if (!source_stmt2)
2355             return NULL;
2356
2357           if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2358             return NULL;
2359
2360           if (!n1.vuse != !n2.vuse
2361               || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2362             return NULL;
2363
2364           source_stmt
2365             = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2366
2367           if (!source_stmt)
2368             return NULL;
2369
2370           if (!verify_symbolic_number_p (n, stmt))
2371             return NULL;
2372
2373           break;
2374         default:
2375           return NULL;
2376         }
2377       return source_stmt;
2378     }
2379   return NULL;
2380 }
2381
2382 /* Check if STMT completes a bswap implementation or a read in a given
2383    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2384    accordingly.  It also sets N to represent the kind of operations
2385    performed: size of the resulting expression and whether it works on
2386    a memory source, and if so alias-set and vuse.  At last, the
2387    function returns a stmt whose rhs's first tree is the source
2388    expression.  */
2389
2390 static gimple *
2391 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2392 {
2393 /* The number which the find_bswap_or_nop_1 result should match in order
2394    to have a full byte swap.  The number is shifted to the right
2395    according to the size of the symbolic number before using it.  */
2396   uint64_t cmpxchg = CMPXCHG;
2397   uint64_t cmpnop = CMPNOP;
2398
2399   gimple *source_stmt;
2400   int limit;
2401
2402   /* The last parameter determines the depth search limit.  It usually
2403      correlates directly to the number n of bytes to be touched.  We
2404      increase that number by log2(n) + 1 here in order to also
2405      cover signed -> unsigned conversions of the src operand as can be seen
2406      in libgcc, and for initial shift/and operation of the src operand.  */
2407   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2408   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2409   source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2410
2411   if (!source_stmt)
2412     return NULL;
2413
2414   /* Find real size of result (highest non-zero byte).  */
2415   if (n->base_addr)
2416     {
2417       int rsize;
2418       uint64_t tmpn;
2419
2420       for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2421       n->range = rsize;
2422     }
2423
2424   /* Zero out the extra bits of N and CMP*.  */
2425   if (n->range < (int) sizeof (int64_t))
2426     {
2427       uint64_t mask;
2428
2429       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2430       cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2431       cmpnop &= mask;
2432     }
2433
2434   /* A complete byte swap should make the symbolic number to start with
2435      the largest digit in the highest order byte. Unchanged symbolic
2436      number indicates a read with same endianness as target architecture.  */
2437   if (n->n == cmpnop)
2438     *bswap = false;
2439   else if (n->n == cmpxchg)
2440     *bswap = true;
2441   else
2442     return NULL;
2443
2444   /* Useless bit manipulation performed by code.  */
2445   if (!n->base_addr && n->n == cmpnop)
2446     return NULL;
2447
2448   n->range *= BITS_PER_UNIT;
2449   return source_stmt;
2450 }
2451
2452 namespace {
2453
2454 const pass_data pass_data_optimize_bswap =
2455 {
2456   GIMPLE_PASS, /* type */
2457   "bswap", /* name */
2458   OPTGROUP_NONE, /* optinfo_flags */
2459   TV_NONE, /* tv_id */
2460   PROP_ssa, /* properties_required */
2461   0, /* properties_provided */
2462   0, /* properties_destroyed */
2463   0, /* todo_flags_start */
2464   0, /* todo_flags_finish */
2465 };
2466
2467 class pass_optimize_bswap : public gimple_opt_pass
2468 {
2469 public:
2470   pass_optimize_bswap (gcc::context *ctxt)
2471     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2472   {}
2473
2474   /* opt_pass methods: */
2475   virtual bool gate (function *)
2476     {
2477       return flag_expensive_optimizations && optimize;
2478     }
2479
2480   virtual unsigned int execute (function *);
2481
2482 }; // class pass_optimize_bswap
2483
/* Perform the bswap optimization: replace the expression computed in the rhs
   of CUR_STMT by an equivalent bswap, load or load + bswap expression.
   Which of these alternatives replaces the rhs is given by N->base_addr (non
   null if a load is needed) and BSWAP.  The type, VUSE and alias set of the
   load to perform are also given in N while the builtin bswap to invoke is
   given in FNDECL.  Finally, if a load is involved, SRC_STMT refers to one of
   the load statements involved to construct the rhs in CUR_STMT and N->range
   gives the size of the rhs expression for maintaining some statistics.

   Note that if the replacement involves a load, CUR_STMT is moved just
   before SRC_STMT to do the load with the same VUSE, which can lead to
   CUR_STMT changing basic block.  */
2496
2497 static bool
2498 bswap_replace (gimple *cur_stmt, gimple *src_stmt, tree fndecl,
2499                tree bswap_type, tree load_type, struct symbolic_number *n,
2500                bool bswap)
2501 {
2502   gimple_stmt_iterator gsi;
2503   tree src, tmp, tgt;
2504   gimple *bswap_stmt;
2505
2506   gsi = gsi_for_stmt (cur_stmt);
2507   src = gimple_assign_rhs1 (src_stmt);
2508   tgt = gimple_assign_lhs (cur_stmt);
2509
2510   /* Need to load the value from memory first.  */
2511   if (n->base_addr)
2512     {
2513       gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2514       tree addr_expr, addr_tmp, val_expr, val_tmp;
2515       tree load_offset_ptr, aligned_load_type;
2516       gimple *addr_stmt, *load_stmt;
2517       unsigned align;
2518       HOST_WIDE_INT load_offset = 0;
2519
2520       align = get_object_alignment (src);
2521       /* If the new access is smaller than the original one, we need
2522          to perform big endian adjustment.  */
2523       if (BYTES_BIG_ENDIAN)
2524         {
2525           HOST_WIDE_INT bitsize, bitpos;
2526           machine_mode mode;
2527           int unsignedp, reversep, volatilep;
2528           tree offset;
2529
2530           get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2531                                &unsignedp, &reversep, &volatilep, false);
2532           if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2533             {
2534               load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2535               unsigned HOST_WIDE_INT l
2536                 = (load_offset * BITS_PER_UNIT) & (align - 1);
2537               if (l)
2538                 align = l & -l;
2539             }
2540         }
2541
2542       if (bswap
2543           && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2544           && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2545         return false;
2546
2547       /* Move cur_stmt just before  one of the load of the original
2548          to ensure it has the same VUSE.  See PR61517 for what could
2549          go wrong.  */
2550       gsi_move_before (&gsi, &gsi_ins);
2551       gsi = gsi_for_stmt (cur_stmt);
2552
2553       /* Compute address to load from and cast according to the size
2554          of the load.  */
2555       addr_expr = build_fold_addr_expr (unshare_expr (src));
2556       if (is_gimple_mem_ref_addr (addr_expr))
2557         addr_tmp = addr_expr;
2558       else
2559         {
2560           addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2561                                          "load_src");
2562           addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2563           gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2564         }
2565
2566       /* Perform the load.  */
2567       aligned_load_type = load_type;
2568       if (align < TYPE_ALIGN (load_type))
2569         aligned_load_type = build_aligned_type (load_type, align);
2570       load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2571       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2572                               load_offset_ptr);
2573
2574       if (!bswap)
2575         {
2576           if (n->range == 16)
2577             nop_stats.found_16bit++;
2578           else if (n->range == 32)
2579             nop_stats.found_32bit++;
2580           else
2581             {
2582               gcc_assert (n->range == 64);
2583               nop_stats.found_64bit++;
2584             }
2585
2586           /* Convert the result of load if necessary.  */
2587           if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2588             {
2589               val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2590                                             "load_dst");
2591               load_stmt = gimple_build_assign (val_tmp, val_expr);
2592               gimple_set_vuse (load_stmt, n->vuse);
2593               gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2594               gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2595             }
2596           else
2597             {
2598               gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2599               gimple_set_vuse (cur_stmt, n->vuse);
2600             }
2601           update_stmt (cur_stmt);
2602
2603           if (dump_file)
2604             {
2605               fprintf (dump_file,
2606                        "%d bit load in target endianness found at: ",
2607                        (int) n->range);
2608               print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2609             }
2610           return true;
2611         }
2612       else
2613         {
2614           val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2615           load_stmt = gimple_build_assign (val_tmp, val_expr);
2616           gimple_set_vuse (load_stmt, n->vuse);
2617           gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2618         }
2619       src = val_tmp;
2620     }
2621
2622   if (n->range == 16)
2623     bswap_stats.found_16bit++;
2624   else if (n->range == 32)
2625     bswap_stats.found_32bit++;
2626   else
2627     {
2628       gcc_assert (n->range == 64);
2629       bswap_stats.found_64bit++;
2630     }
2631
2632   tmp = src;
2633
2634   /* Convert the src expression if necessary.  */
2635   if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2636     {
2637       gimple *convert_stmt;
2638
2639       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2640       convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2641       gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2642     }
2643
2644   /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit values
2645      are considered as rotation of 2N bit values by N bits is generally not
2646      equivalent to a bswap.  Consider for instance 0x01020304 r>> 16 which
2647      gives 0x03040102 while a bswap for that value is 0x04030201.  */
2648   if (bswap && n->range == 16)
2649     {
2650       tree count = build_int_cst (NULL, BITS_PER_UNIT);
2651       src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2652       bswap_stmt = gimple_build_assign (NULL, src);
2653     }
2654   else
2655     bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2656
2657   tmp = tgt;
2658
2659   /* Convert the result if necessary.  */
2660   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2661     {
2662       gimple *convert_stmt;
2663
2664       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2665       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2666       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2667     }
2668
2669   gimple_set_lhs (bswap_stmt, tmp);
2670
2671   if (dump_file)
2672     {
2673       fprintf (dump_file, "%d bit bswap implementation found at: ",
2674                (int) n->range);
2675       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2676     }
2677
2678   gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2679   gsi_remove (&gsi, true);
2680   return true;
2681 }
2682
2683 /* Find manual byte swap implementations as well as load in a given
2684    endianness. Byte swaps are turned into a bswap builtin invokation
2685    while endian loads are converted to bswap builtin invokation or
2686    simple load according to the target endianness.  */
2687
2688 unsigned int
2689 pass_optimize_bswap::execute (function *fun)
2690 {
2691   basic_block bb;
2692   bool bswap32_p, bswap64_p;
2693   bool changed = false;
2694   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2695
2696   if (BITS_PER_UNIT != 8)
2697     return 0;
2698
2699   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2700                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2701   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2702                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2703                    || (bswap32_p && word_mode == SImode)));
2704
2705   /* Determine the argument type of the builtins.  The code later on
2706      assumes that the return and argument type are the same.  */
2707   if (bswap32_p)
2708     {
2709       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2710       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2711     }
2712
2713   if (bswap64_p)
2714     {
2715       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2716       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2717     }
2718
2719   memset (&nop_stats, 0, sizeof (nop_stats));
2720   memset (&bswap_stats, 0, sizeof (bswap_stats));
2721
2722   FOR_EACH_BB_FN (bb, fun)
2723     {
2724       gimple_stmt_iterator gsi;
2725
2726       /* We do a reverse scan for bswap patterns to make sure we get the
2727          widest match. As bswap pattern matching doesn't handle previously
2728          inserted smaller bswap replacements as sub-patterns, the wider
2729          variant wouldn't be detected.  */
2730       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2731         {
2732           gimple *src_stmt, *cur_stmt = gsi_stmt (gsi);
2733           tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2734           enum tree_code code;
2735           struct symbolic_number n;
2736           bool bswap;
2737
2738           /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2739              might be moved to a different basic block by bswap_replace and gsi
2740              must not points to it if that's the case.  Moving the gsi_prev
2741              there make sure that gsi points to the statement previous to
2742              cur_stmt while still making sure that all statements are
2743              considered in this basic block.  */
2744           gsi_prev (&gsi);
2745
2746           if (!is_gimple_assign (cur_stmt))
2747             continue;
2748
2749           code = gimple_assign_rhs_code (cur_stmt);
2750           switch (code)
2751             {
2752             case LROTATE_EXPR:
2753             case RROTATE_EXPR:
2754               if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2755                   || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2756                      % BITS_PER_UNIT)
2757                 continue;
2758               /* Fall through.  */
2759             case BIT_IOR_EXPR:
2760               break;
2761             default:
2762               continue;
2763             }
2764
2765           src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2766
2767           if (!src_stmt)
2768             continue;
2769
2770           switch (n.range)
2771             {
2772             case 16:
2773               /* Already in canonical form, nothing to do.  */
2774               if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2775                 continue;
2776               load_type = bswap_type = uint16_type_node;
2777               break;
2778             case 32:
2779               load_type = uint32_type_node;
2780               if (bswap32_p)
2781                 {
2782                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2783                   bswap_type = bswap32_type;
2784                 }
2785               break;
2786             case 64:
2787               load_type = uint64_type_node;
2788               if (bswap64_p)
2789                 {
2790                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2791                   bswap_type = bswap64_type;
2792                 }
2793               break;
2794             default:
2795               continue;
2796             }
2797
2798           if (bswap && !fndecl && n.range != 16)
2799             continue;
2800
2801           if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2802                              &n, bswap))
2803             changed = true;
2804         }
2805     }
2806
2807   statistics_counter_event (fun, "16-bit nop implementations found",
2808                             nop_stats.found_16bit);
2809   statistics_counter_event (fun, "32-bit nop implementations found",
2810                             nop_stats.found_32bit);
2811   statistics_counter_event (fun, "64-bit nop implementations found",
2812                             nop_stats.found_64bit);
2813   statistics_counter_event (fun, "16-bit bswap implementations found",
2814                             bswap_stats.found_16bit);
2815   statistics_counter_event (fun, "32-bit bswap implementations found",
2816                             bswap_stats.found_32bit);
2817   statistics_counter_event (fun, "64-bit bswap implementations found",
2818                             bswap_stats.found_64bit);
2819
2820   return (changed ? TODO_update_ssa : 0);
2821 }
2822
2823 } // anon namespace
2824
2825 gimple_opt_pass *
2826 make_pass_optimize_bswap (gcc::context *ctxt)
2827 {
2828   return new pass_optimize_bswap (ctxt);
2829 }
2830
2831 /* Return true if stmt is a type conversion operation that can be stripped
2832    when used in a widening multiply operation.  */
2833 static bool
2834 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2835 {
2836   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2837
2838   if (TREE_CODE (result_type) == INTEGER_TYPE)
2839     {
2840       tree op_type;
2841       tree inner_op_type;
2842
2843       if (!CONVERT_EXPR_CODE_P (rhs_code))
2844         return false;
2845
2846       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2847
2848       /* If the type of OP has the same precision as the result, then
2849          we can strip this conversion.  The multiply operation will be
2850          selected to create the correct extension as a by-product.  */
2851       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2852         return true;
2853
2854       /* We can also strip a conversion if it preserves the signed-ness of
2855          the operation and doesn't narrow the range.  */
2856       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2857
2858       /* If the inner-most type is unsigned, then we can strip any
2859          intermediate widening operation.  If it's signed, then the
2860          intermediate widening operation must also be signed.  */
2861       if ((TYPE_UNSIGNED (inner_op_type)
2862            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2863           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2864         return true;
2865
2866       return false;
2867     }
2868
2869   return rhs_code == FIXED_CONVERT_EXPR;
2870 }
2871
2872 /* Return true if RHS is a suitable operand for a widening multiplication,
2873    assuming a target type of TYPE.
2874    There are two cases:
2875
2876      - RHS makes some value at least twice as wide.  Store that value
2877        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2878
2879      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2880        but leave *TYPE_OUT untouched.  */
2881
2882 static bool
2883 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2884                         tree *new_rhs_out)
2885 {
2886   gimple *stmt;
2887   tree type1, rhs1;
2888
2889   if (TREE_CODE (rhs) == SSA_NAME)
2890     {
2891       stmt = SSA_NAME_DEF_STMT (rhs);
2892       if (is_gimple_assign (stmt))
2893         {
2894           if (! widening_mult_conversion_strippable_p (type, stmt))
2895             rhs1 = rhs;
2896           else
2897             {
2898               rhs1 = gimple_assign_rhs1 (stmt);
2899
2900               if (TREE_CODE (rhs1) == INTEGER_CST)
2901                 {
2902                   *new_rhs_out = rhs1;
2903                   *type_out = NULL;
2904                   return true;
2905                 }
2906             }
2907         }
2908       else
2909         rhs1 = rhs;
2910
2911       type1 = TREE_TYPE (rhs1);
2912
2913       if (TREE_CODE (type1) != TREE_CODE (type)
2914           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2915         return false;
2916
2917       *new_rhs_out = rhs1;
2918       *type_out = type1;
2919       return true;
2920     }
2921
2922   if (TREE_CODE (rhs) == INTEGER_CST)
2923     {
2924       *new_rhs_out = rhs;
2925       *type_out = NULL;
2926       return true;
2927     }
2928
2929   return false;
2930 }
2931
2932 /* Return true if STMT performs a widening multiplication, assuming the
2933    output type is TYPE.  If so, store the unwidened types of the operands
2934    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2935    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2936    and *TYPE2_OUT would give the operands of the multiplication.  */
2937
2938 static bool
2939 is_widening_mult_p (gimple *stmt,
2940                     tree *type1_out, tree *rhs1_out,
2941                     tree *type2_out, tree *rhs2_out)
2942 {
2943   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2944
2945   if (TREE_CODE (type) != INTEGER_TYPE
2946       && TREE_CODE (type) != FIXED_POINT_TYPE)
2947     return false;
2948
2949   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2950                                rhs1_out))
2951     return false;
2952
2953   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2954                                rhs2_out))
2955     return false;
2956
2957   if (*type1_out == NULL)
2958     {
2959       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2960         return false;
2961       *type1_out = *type2_out;
2962     }
2963
2964   if (*type2_out == NULL)
2965     {
2966       if (!int_fits_type_p (*rhs2_out, *type1_out))
2967         return false;
2968       *type2_out = *type1_out;
2969     }
2970
2971   /* Ensure that the larger of the two operands comes first. */
2972   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2973     {
2974       std::swap (*type1_out, *type2_out);
2975       std::swap (*rhs1_out, *rhs2_out);
2976     }
2977
2978   return true;
2979 }
2980
2981 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2982    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2983    value is true iff we converted the statement.  */
2984
2985 static bool
2986 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2987 {
2988   tree lhs, rhs1, rhs2, type, type1, type2;
2989   enum insn_code handler;
2990   machine_mode to_mode, from_mode, actual_mode;
2991   optab op;
2992   int actual_precision;
2993   location_t loc = gimple_location (stmt);
2994   bool from_unsigned1, from_unsigned2;
2995
2996   lhs = gimple_assign_lhs (stmt);
2997   type = TREE_TYPE (lhs);
2998   if (TREE_CODE (type) != INTEGER_TYPE)
2999     return false;
3000
3001   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3002     return false;
3003
3004   to_mode = TYPE_MODE (type);
3005   from_mode = TYPE_MODE (type1);
3006   from_unsigned1 = TYPE_UNSIGNED (type1);
3007   from_unsigned2 = TYPE_UNSIGNED (type2);
3008
3009   if (from_unsigned1 && from_unsigned2)
3010     op = umul_widen_optab;
3011   else if (!from_unsigned1 && !from_unsigned2)
3012     op = smul_widen_optab;
3013   else
3014     op = usmul_widen_optab;
3015
3016   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3017                                                   0, &actual_mode);
3018
3019   if (handler == CODE_FOR_nothing)
3020     {
3021       if (op != smul_widen_optab)
3022         {
3023           /* We can use a signed multiply with unsigned types as long as
3024              there is a wider mode to use, or it is the smaller of the two
3025              types that is unsigned.  Note that type1 >= type2, always.  */
3026           if ((TYPE_UNSIGNED (type1)
3027                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3028               || (TYPE_UNSIGNED (type2)
3029                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3030             {
3031               from_mode = GET_MODE_WIDER_MODE (from_mode);
3032               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3033                 return false;
3034             }
3035
3036           op = smul_widen_optab;
3037           handler = find_widening_optab_handler_and_mode (op, to_mode,
3038                                                           from_mode, 0,
3039                                                           &actual_mode);
3040
3041           if (handler == CODE_FOR_nothing)
3042             return false;
3043
3044           from_unsigned1 = from_unsigned2 = false;
3045         }
3046       else
3047         return false;
3048     }
3049
3050   /* Ensure that the inputs to the handler are in the correct precison
3051      for the opcode.  This will be the full mode size.  */
3052   actual_precision = GET_MODE_PRECISION (actual_mode);
3053   if (2 * actual_precision > TYPE_PRECISION (type))
3054     return false;
3055   if (actual_precision != TYPE_PRECISION (type1)
3056       || from_unsigned1 != TYPE_UNSIGNED (type1))
3057     rhs1 = build_and_insert_cast (gsi, loc,
3058                                   build_nonstandard_integer_type
3059                                     (actual_precision, from_unsigned1), rhs1);
3060   if (actual_precision != TYPE_PRECISION (type2)
3061       || from_unsigned2 != TYPE_UNSIGNED (type2))
3062     rhs2 = build_and_insert_cast (gsi, loc,
3063                                   build_nonstandard_integer_type
3064                                     (actual_precision, from_unsigned2), rhs2);
3065
3066   /* Handle constants.  */
3067   if (TREE_CODE (rhs1) == INTEGER_CST)
3068     rhs1 = fold_convert (type1, rhs1);
3069   if (TREE_CODE (rhs2) == INTEGER_CST)
3070     rhs2 = fold_convert (type2, rhs2);
3071
3072   gimple_assign_set_rhs1 (stmt, rhs1);
3073   gimple_assign_set_rhs2 (stmt, rhs2);
3074   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3075   update_stmt (stmt);
3076   widen_mul_stats.widen_mults_inserted++;
3077   return true;
3078 }
3079
/* Process a single gimple statement STMT, which is found at the
   iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
   rhs (given by CODE), and try to convert it into a
   WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.

   One operand of STMT must be defined, possibly through a single
   conversion statement, by a single-use MULT_EXPR or WIDEN_MULT_EXPR
   that is_widening_mult_p accepts; the other operand becomes the
   addend.  For a MINUS_EXPR only a multiplication feeding the second
   operand is matched.  Casts needed for the multiplication operands
   or the addend are built with build_and_insert_cast at GSI.

   The return value is true iff we converted the statement.  */

static bool
convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
                            enum tree_code code)
{
  gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
  gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
  tree type, type1, type2, optype;
  tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
  enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
  optab this_optab;
  enum tree_code wmult_code;
  enum insn_code handler;
  machine_mode to_mode, from_mode, actual_mode;
  location_t loc = gimple_location (stmt);
  int actual_precision;
  bool from_unsigned1, from_unsigned2;

  /* Only integer and fixed-point results are handled.  */
  lhs = gimple_assign_lhs (stmt);
  type = TREE_TYPE (lhs);
  if (TREE_CODE (type) != INTEGER_TYPE
      && TREE_CODE (type) != FIXED_POINT_TYPE)
    return false;

  if (code == MINUS_EXPR)
    wmult_code = WIDEN_MULT_MINUS_EXPR;
  else
    wmult_code = WIDEN_MULT_PLUS_EXPR;

  rhs1 = gimple_assign_rhs1 (stmt);
  rhs2 = gimple_assign_rhs2 (stmt);

  /* Look up the defining statement and rhs code of each SSA operand.
     The codes stay ERROR_MARK for operands that are not defined by an
     assignment.  */
  if (TREE_CODE (rhs1) == SSA_NAME)
    {
      rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
      if (is_gimple_assign (rhs1_stmt))
        rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
    }

  if (TREE_CODE (rhs2) == SSA_NAME)
    {
      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
      if (is_gimple_assign (rhs2_stmt))
        rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
    }

  /* Allow for one conversion statement between the multiply
     and addition/subtraction statement.  If there is more than
     one conversion then we assume they would invalidate this
     transformation.  If that's not the case then they should have
     been folded before now.

     When an operand is defined by a conversion, remember that
     conversion and continue with the operand it converts; give up if
     that operand is not an SSA name.  */
  if (CONVERT_EXPR_CODE_P (rhs1_code))
    {
      conv1_stmt = rhs1_stmt;
      rhs1 = gimple_assign_rhs1 (rhs1_stmt);
      if (TREE_CODE (rhs1) == SSA_NAME)
        {
          rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
          if (is_gimple_assign (rhs1_stmt))
            rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
        }
      else
        return false;
    }
  if (CONVERT_EXPR_CODE_P (rhs2_code))
    {
      conv2_stmt = rhs2_stmt;
      rhs2 = gimple_assign_rhs1 (rhs2_stmt);
      if (TREE_CODE (rhs2) == SSA_NAME)
        {
          rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
          if (is_gimple_assign (rhs2_stmt))
            rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
        }
      else
        return false;
    }

  /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
     is_widening_mult_p, but we still need the rhs returns.

     It might also appear that it would be sufficient to use the existing
     operands of the widening multiply, but that would limit the choice of
     multiply-and-accumulate instructions.

     If the widened-multiplication result has more than one use, it is
     probably wiser not to do the conversion.

     The first operand is only considered as the multiplication for a
     PLUS_EXPR; the second operand is considered for both codes.  The
     operand that is not the multiplication becomes ADD_RHS, and
     CONV_STMT is the conversion (if any) found on the multiplication
     side.  */
  if (code == PLUS_EXPR
      && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
    {
      if (!has_single_use (rhs1)
          || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;
      add_rhs = rhs2;
      conv_stmt = conv1_stmt;
    }
  else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
    {
      if (!has_single_use (rhs2)
          || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
                                  &type2, &mult_rhs2))
        return false;
      add_rhs = rhs1;
      conv_stmt = conv2_stmt;
    }
  else
    return false;

  to_mode = TYPE_MODE (type);
  from_mode = TYPE_MODE (type1);
  from_unsigned1 = TYPE_UNSIGNED (type1);
  from_unsigned2 = TYPE_UNSIGNED (type2);
  optype = type1;

  /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
  if (from_unsigned1 != from_unsigned2)
    {
      if (!INTEGRAL_TYPE_P (type))
        return false;
      /* We can use a signed multiply with unsigned types as long as
         there is a wider mode to use, or it is the smaller of the two
         types that is unsigned.  Note that type1 >= type2, always.  */
      if ((from_unsigned1
           && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
          || (from_unsigned2
              && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
        {
          from_mode = GET_MODE_WIDER_MODE (from_mode);
          if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
            return false;
        }

      /* From here on both operands are treated as signed, and the optab
         is looked up with a signed type of FROM_MODE's precision.  */
      from_unsigned1 = from_unsigned2 = false;
      optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
                                               false);
    }

  /* If there was a conversion between the multiply and addition
     then we need to make sure it fits a multiply-and-accumulate.
     There should be a single mode change which does not change the
     value.  */
  if (conv_stmt)
    {
      /* We use the original, unmodified data types for this.  */
      tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
      tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
      /* DATA_SIZE is the sum of the two operand precisions; the product
         is treated as unsigned only when both operands are.  */
      int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
      bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);

      if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
        {
          /* Conversion is a truncate.  It must keep at least DATA_SIZE
             bits.  */
          if (TYPE_PRECISION (to_type) < data_size)
            return false;
        }
      else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
        {
          /* Conversion is an extend.  Check it's the right sort: its
             signedness must match that of the product, except that an
             unsigned product may be extended from a signed type that is
             wider than DATA_SIZE.  */
          if (TYPE_UNSIGNED (from_type) != is_unsigned
              && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
            return false;
        }
      /* else convert is a no-op for our purposes.  */
    }

  /* Verify that the machine can perform a widening multiply
     accumulate in this mode/signedness combination, otherwise
     this transformation is likely to pessimize code.  */
  this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
  handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
                                                  from_mode, 0, &actual_mode);

  if (handler == CODE_FOR_nothing)
    return false;

  /* Ensure that the inputs to the handler are in the correct precision
     for the opcode.  This will be the full mode size.  */
  actual_precision = GET_MODE_PRECISION (actual_mode);
  if (actual_precision != TYPE_PRECISION (type1)
      || from_unsigned1 != TYPE_UNSIGNED (type1))
    mult_rhs1 = build_and_insert_cast (gsi, loc,
                                       build_nonstandard_integer_type
                                         (actual_precision, from_unsigned1),
                                       mult_rhs1);
  if (actual_precision != TYPE_PRECISION (type2)
      || from_unsigned2 != TYPE_UNSIGNED (type2))
    mult_rhs2 = build_and_insert_cast (gsi, loc,
                                       build_nonstandard_integer_type
                                         (actual_precision, from_unsigned2),
                                       mult_rhs2);

  /* The addend must have the type of the result.  */
  if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
    add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);

  /* Handle constants.  */
  if (TREE_CODE (mult_rhs1) == INTEGER_CST)
    mult_rhs1 = fold_convert (type1, mult_rhs1);
  if (TREE_CODE (mult_rhs2) == INTEGER_CST)
    mult_rhs2 = fold_convert (type2, mult_rhs2);

  /* Replace the rhs of the statement at GSI with the three-operand
     widening operation and refresh the statement found there.  */
  gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
                                  add_rhs);
  update_stmt (gsi_stmt (*gsi));
  widen_mul_stats.maccs_inserted++;
  return true;
}
3292
3293 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3294    with uses in additions and subtractions to form fused multiply-add
3295    operations.  Returns true if successful and MUL_STMT should be removed.  */
3296
3297 static bool
3298 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3299 {
3300   tree mul_result = gimple_get_lhs (mul_stmt);
3301   tree type = TREE_TYPE (mul_result);
3302   gimple *use_stmt, *neguse_stmt;
3303   gassign *fma_stmt;
3304   use_operand_p use_p;
3305   imm_use_iterator imm_iter;
3306
3307   if (FLOAT_TYPE_P (type)
3308       && flag_fp_contract_mode == FP_CONTRACT_OFF)
3309     return false;
3310
3311   /* We don't want to do bitfield reduction ops.  */
3312   if (INTEGRAL_TYPE_P (type)
3313       && (TYPE_PRECISION (type)
3314           != GET_MODE_PRECISION (TYPE_MODE (type))))
3315     return false;
3316
3317   /* If the target doesn't support it, don't generate it.  We assume that
3318      if fma isn't available then fms, fnma or fnms are not either.  */
3319   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3320     return false;
3321
3322   /* If the multiplication has zero uses, it is kept around probably because
3323      of -fnon-call-exceptions.  Don't optimize it away in that case,
3324      it is DCE job.  */
3325   if (has_zero_uses (mul_result))
3326     return false;
3327
3328   /* Make sure that the multiplication statement becomes dead after
3329      the transformation, thus that all uses are transformed to FMAs.
3330      This means we assume that an FMA operation has the same cost
3331      as an addition.  */
3332   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3333     {
3334       enum tree_code use_code;
3335       tree result = mul_result;
3336       bool negate_p = false;
3337
3338       use_stmt = USE_STMT (use_p);
3339
3340       if (is_gimple_debug (use_stmt))
3341         continue;
3342
3343       /* For now restrict this operations to single basic blocks.  In theory
3344          we would want to support sinking the multiplication in
3345          m = a*b;
3346          if ()
3347            ma = m + c;
3348          else
3349            d = m;
3350          to form a fma in the then block and sink the multiplication to the
3351          else block.  */
3352       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3353         return false;
3354
3355       if (!is_gimple_assign (use_stmt))
3356         return false;
3357
3358       use_code = gimple_assign_rhs_code (use_stmt);
3359
3360       /* A negate on the multiplication leads to FNMA.  */
3361       if (use_code == NEGATE_EXPR)
3362         {
3363           ssa_op_iter iter;
3364           use_operand_p usep;
3365
3366           result = gimple_assign_lhs (use_stmt);
3367
3368           /* Make sure the negate statement becomes dead with this
3369              single transformation.  */
3370           if (!single_imm_use (gimple_assign_lhs (use_stmt),
3371                                &use_p, &neguse_stmt))
3372             return false;
3373
3374           /* Make sure the multiplication isn't also used on that stmt.  */
3375           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3376             if (USE_FROM_PTR (usep) == mul_result)
3377               return false;
3378
3379           /* Re-validate.  */
3380           use_stmt = neguse_stmt;
3381           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3382             return false;
3383           if (!is_gimple_assign (use_stmt))
3384             return false;
3385
3386           use_code = gimple_assign_rhs_code (use_stmt);
3387           negate_p = true;
3388         }
3389
3390       switch (use_code)
3391         {
3392         case MINUS_EXPR:
3393           if (gimple_assign_rhs2 (use_stmt) == result)
3394             negate_p = !negate_p;
3395           break;
3396         case PLUS_EXPR:
3397           break;
3398         default:
3399           /* FMA can only be formed from PLUS and MINUS.  */
3400           return false;
3401         }
3402
3403       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3404          by a MULT_EXPR that we'll visit later, we might be able to
3405          get a more profitable match with fnma.
3406          OTOH, if we don't, a negate / fma pair likely has lower latency
3407          than a mult / subtract pair.  */
3408       if (use_code == MINUS_EXPR && !negate_p
3409           && gimple_assign_rhs1 (use_stmt) == result
3410           && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3411           && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3412         {
3413           tree rhs2 = gimple_assign_rhs2 (use_stmt);
3414
3415           if (TREE_CODE (rhs2) == SSA_NAME)
3416             {
3417               gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3418               if (has_single_use (rhs2)
3419                   && is_gimple_assign (stmt2)
3420                   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3421               return false;
3422             }
3423         }
3424
3425       /* We can't handle a * b + a * b.  */
3426       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3427         return false;
3428
3429       /* While it is possible to validate whether or not the exact form
3430          that we've recognized is available in the backend, the assumption
3431          is that the transformation is never a loss.  For instance, suppose
3432          the target only has the plain FMA pattern available.  Consider
3433          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3434          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3435          still have 3 operations, but in the FMA form the two NEGs are
3436          independent and could be run in parallel.  */
3437     }
3438
3439   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3440     {
3441       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3442       enum tree_code use_code;
3443       tree addop, mulop1 = op1, result = mul_result;
3444       bool negate_p = false;
3445
3446       if (is_gimple_debug (use_stmt))
3447         continue;
3448
3449       use_code = gimple_assign_rhs_code (use_stmt);
3450       if (use_code == NEGATE_EXPR)
3451         {
3452           result = gimple_assign_lhs (use_stmt);
3453           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3454           gsi_remove (&gsi, true);
3455           release_defs (use_stmt);
3456
3457           use_stmt = neguse_stmt;
3458           gsi = gsi_for_stmt (use_stmt);
3459           use_code = gimple_assign_rhs_code (use_stmt);
3460           negate_p = true;
3461         }
3462
3463       if (gimple_assign_rhs1 (use_stmt) == result)
3464         {
3465           addop = gimple_assign_rhs2 (use_stmt);
3466           /* a * b - c -> a * b + (-c)  */
3467           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3468             addop = force_gimple_operand_gsi (&gsi,
3469                                               build1 (NEGATE_EXPR,
3470                                                       type, addop),
3471                                               true, NULL_TREE, true,
3472                                               GSI_SAME_STMT);
3473         }
3474       else
3475         {
3476           addop = gimple_assign_rhs1 (use_stmt);
3477           /* a - b * c -> (-b) * c + a */
3478           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3479             negate_p = !negate_p;
3480         }
3481
3482       if (negate_p)
3483         mulop1 = force_gimple_operand_gsi (&gsi,
3484                                            build1 (NEGATE_EXPR,
3485                                                    type, mulop1),
3486                                            true, NULL_TREE, true,
3487                                            GSI_SAME_STMT);
3488
3489       fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3490                                       FMA_EXPR, mulop1, op2, addop);
3491       gsi_replace (&gsi, fma_stmt, true);
3492       widen_mul_stats.fmas_inserted++;
3493     }
3494
3495   return true;
3496 }
3497
3498 /* Find integer multiplications where the operands are extended from
3499    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3500    where appropriate.  */
3501
3502 namespace {
3503
3504 const pass_data pass_data_optimize_widening_mul =
3505 {
3506   GIMPLE_PASS, /* type */
3507   "widening_mul", /* name */
3508   OPTGROUP_NONE, /* optinfo_flags */
3509   TV_NONE, /* tv_id */
3510   PROP_ssa, /* properties_required */
3511   0, /* properties_provided */
3512   0, /* properties_destroyed */
3513   0, /* todo_flags_start */
3514   TODO_update_ssa, /* todo_flags_finish */
3515 };
3516
3517 class pass_optimize_widening_mul : public gimple_opt_pass
3518 {
3519 public:
3520   pass_optimize_widening_mul (gcc::context *ctxt)
3521     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3522   {}
3523
3524   /* opt_pass methods: */
3525   virtual bool gate (function *)
3526     {
3527       return flag_expensive_optimizations && optimize;
3528     }
3529
3530   virtual unsigned int execute (function *);
3531
3532 }; // class pass_optimize_widening_mul
3533
3534 unsigned int
3535 pass_optimize_widening_mul::execute (function *fun)
3536 {
3537   basic_block bb;
3538   bool cfg_changed = false;
3539
3540   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3541
3542   FOR_EACH_BB_FN (bb, fun)
3543     {
3544       gimple_stmt_iterator gsi;
3545
3546       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3547         {
3548           gimple *stmt = gsi_stmt (gsi);
3549           enum tree_code code;
3550
3551           if (is_gimple_assign (stmt))
3552             {
3553               code = gimple_assign_rhs_code (stmt);
3554               switch (code)
3555                 {
3556                 case MULT_EXPR:
3557                   if (!convert_mult_to_widen (stmt, &gsi)
3558                       && convert_mult_to_fma (stmt,
3559                                               gimple_assign_rhs1 (stmt),
3560                                               gimple_assign_rhs2 (stmt)))
3561                     {
3562                       gsi_remove (&gsi, true);
3563                       release_defs (stmt);
3564                       continue;
3565                     }
3566                   break;
3567
3568                 case PLUS_EXPR:
3569                 case MINUS_EXPR:
3570                   convert_plusminus_to_widen (&gsi, stmt, code);
3571                   break;
3572
3573                 default:;
3574                 }
3575             }
3576           else if (is_gimple_call (stmt)
3577                    && gimple_call_lhs (stmt))
3578             {
3579               tree fndecl = gimple_call_fndecl (stmt);
3580               if (fndecl
3581                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3582                 {
3583                   switch (DECL_FUNCTION_CODE (fndecl))
3584                     {
3585                       case BUILT_IN_POWF:
3586                       case BUILT_IN_POW:
3587                       case BUILT_IN_POWL:
3588                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3589                             && real_equal
3590                                  (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3591                                   &dconst2)
3592                             && convert_mult_to_fma (stmt,
3593                                                     gimple_call_arg (stmt, 0),
3594                                                     gimple_call_arg (stmt, 0)))
3595                           {
3596                             unlink_stmt_vdef (stmt);
3597                             if (gsi_remove (&gsi, true)
3598                                 && gimple_purge_dead_eh_edges (bb))
3599                               cfg_changed = true;
3600                             release_defs (stmt);
3601                             continue;
3602                           }
3603                           break;
3604
3605                       default:;
3606                     }
3607                 }
3608             }
3609           gsi_next (&gsi);
3610         }
3611     }
3612
3613   statistics_counter_event (fun, "widening multiplications inserted",
3614                             widen_mul_stats.widen_mults_inserted);
3615   statistics_counter_event (fun, "widening maccs inserted",
3616                             widen_mul_stats.maccs_inserted);
3617   statistics_counter_event (fun, "fused multiply-adds inserted",
3618                             widen_mul_stats.fmas_inserted);
3619
3620   return cfg_changed ? TODO_cleanup_cfg : 0;
3621 }
3622
3623 } // anon namespace
3624
3625 gimple_opt_pass *
3626 make_pass_optimize_widening_mul (gcc::context *ctxt)
3627 {
3628   return new pass_optimize_widening_mul (ctxt);
3629 }