1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2015 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
28 that can be optimized to
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
43 First of all, with some experiments it was found out that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, we can instead insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has less than N divisions *post-dominating* it.
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and their common dominators,
63 and walks it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "predict.h"
92 #include "tree.h"
93 #include "gimple.h"
94 #include "rtl.h"
95 #include "ssa.h"
96 #include "flags.h"
97 #include "alias.h"
98 #include "fold-const.h"
99 #include "internal-fn.h"
100 #include "gimple-fold.h"
101 #include "gimple-iterator.h"
102 #include "gimplify.h"
103 #include "gimplify-me.h"
104 #include "stor-layout.h"
105 #include "tree-cfg.h"
106 #include "tree-dfa.h"
107 #include "tree-ssa.h"
108 #include "tree-pass.h"
109 #include "alloc-pool.h"
110 #include "target.h"
111 #include "gimple-pretty-print.h"
112 #include "builtins.h"
113 #include "params.h"
114 #include "insn-codes.h"
115 #include "optabs-tree.h"
117 /* This structure represents one basic block that either computes a
118 division, or is a common dominator for basic blocks that compute a
119 division. */
120 struct occurrence {
121 /* The basic block represented by this structure. */
122 basic_block bb;
124 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
125 inserted in BB. */
126 tree recip_def;
128 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
129 was inserted in BB. */
130 gimple *recip_def_stmt;
132 /* Pointer to a list of "struct occurrence"s for blocks dominated
133 by BB. */
134 struct occurrence *children;
136 /* Pointer to the next "struct occurrence" in the list of blocks
137 sharing a common dominator. */
138 struct occurrence *next;
140 /* The number of divisions that are in BB before compute_merit. The
141 number of divisions that are in BB or post-dominate it after
142 compute_merit. */
143 int num_divisions;
145 /* True if the basic block has a division, false if it is a common
146 dominator for basic blocks that do. If it is false and trapping
147 math is active, BB is not a candidate for inserting a reciprocal. */
148 bool bb_has_division;
151 static struct
153 /* Number of 1.0/X ops inserted. */
154 int rdivs_inserted;
156 /* Number of 1.0/FUNC ops inserted. */
157 int rfuncs_inserted;
158 } reciprocal_stats;
160 static struct
162 /* Number of cexpi calls inserted. */
163 int inserted;
164 } sincos_stats;
166 static struct
168 /* Number of hand-written 16-bit nop / bswaps found. */
169 int found_16bit;
171 /* Number of hand-written 32-bit nop / bswaps found. */
172 int found_32bit;
174 /* Number of hand-written 64-bit nop / bswaps found. */
175 int found_64bit;
176 } nop_stats, bswap_stats;
178 static struct
180 /* Number of widening multiplication ops inserted. */
181 int widen_mults_inserted;
183 /* Number of integer multiply-and-accumulate ops inserted. */
184 int maccs_inserted;
186 /* Number of fp fused multiply-add ops inserted. */
187 int fmas_inserted;
188 } widen_mul_stats;
190 /* The instance of "struct occurrence" representing the highest
191 interesting block in the dominator tree. */
192 static struct occurrence *occ_head;
194 /* Allocation pool for getting instances of "struct occurrence". */
195 static object_allocator<occurrence> *occ_pool;
199 /* Allocate and return a new struct occurrence for basic block BB,
200 whose children list is headed by CHILDREN. */
201 static struct occurrence *
202 occ_new (basic_block bb, struct occurrence *children)
204 struct occurrence *occ;
206 bb->aux = occ = occ_pool->allocate ();
207 memset (occ, 0, sizeof (struct occurrence));
209 occ->bb = bb;
210 occ->children = children;
211 return occ;
215 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
216 list of "struct occurrence"s, one per basic block, having IDOM as
217 their common dominator.
219 We try to insert NEW_OCC as deep as possible in the tree, and we also
220 insert any other block that is a common dominator for BB and one
221 block already in the tree. */
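/* An illustrative example: suppose the list under IDOM currently holds an
   occurrence for block B5 and we are inserting one for block B7, and their
   nearest common dominator is a block B3 distinct from IDOM.  A fresh
   occurrence for B3 is then created with B5 and B7 as its children, and the
   walk continues, trying to place B3 under IDOM in their stead.  */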
223 static void
224 insert_bb (struct occurrence *new_occ, basic_block idom,
225 struct occurrence **p_head)
227 struct occurrence *occ, **p_occ;
229 for (p_occ = p_head; (occ = *p_occ) != NULL; )
231 basic_block bb = new_occ->bb, occ_bb = occ->bb;
232 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
233 if (dom == bb)
235 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
236 from its list. */
237 *p_occ = occ->next;
238 occ->next = new_occ->children;
239 new_occ->children = occ;
241 /* Try the next block (it may as well be dominated by BB). */
244 else if (dom == occ_bb)
246 /* OCC_BB dominates BB. Tail recurse to look deeper. */
247 insert_bb (new_occ, dom, &occ->children);
248 return;
251 else if (dom != idom)
253 gcc_assert (!dom->aux);
255 /* There is a dominator between IDOM and BB, add it and make
256 two children out of NEW_OCC and OCC. First, remove OCC from
257 its list. */
258 *p_occ = occ->next;
259 new_occ->next = occ;
260 occ->next = NULL;
262 /* None of the previous blocks has DOM as a dominator: if we tail
263 recursed, we would reexamine them uselessly. Just switch BB with
264 DOM, and go on looking for blocks dominated by DOM. */
265 new_occ = occ_new (dom, new_occ);
268 else
270 /* Nothing special, go on with the next element. */
271 p_occ = &occ->next;
275 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
276 new_occ->next = *p_head;
277 *p_head = new_occ;
280 /* Register that we found a division in BB. */
282 static inline void
283 register_division_in (basic_block bb)
285 struct occurrence *occ;
287 occ = (struct occurrence *) bb->aux;
288 if (!occ)
290 occ = occ_new (bb, NULL);
291 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
294 occ->bb_has_division = true;
295 occ->num_divisions++;
299 /* Compute the number of divisions that postdominate each block in OCC and
300 its children. */
302 static void
303 compute_merit (struct occurrence *occ)
305 struct occurrence *occ_child;
306 basic_block dom = occ->bb;
308 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
310 basic_block bb;
311 if (occ_child->children)
312 compute_merit (occ_child);
314 if (flag_exceptions)
315 bb = single_noncomplex_succ (dom);
316 else
317 bb = dom;
319 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
320 occ->num_divisions += occ_child->num_divisions;
325 /* Return whether USE_STMT is a floating-point division by DEF. */
326 static inline bool
327 is_division_by (gimple *use_stmt, tree def)
329 return is_gimple_assign (use_stmt)
330 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
331 && gimple_assign_rhs2 (use_stmt) == def
332 /* Do not recognize x / x as valid division, as we are getting
333 confused later by replacing all immediate uses of x in such
334 a stmt. */
335 && gimple_assign_rhs1 (use_stmt) != def;
338 /* Walk the subset of the dominator tree rooted at OCC, setting the
339 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
340 the given basic block. The field may be left NULL, of course,
341 if it is not possible or profitable to do the optimization.
343 DEF_BSI is an iterator pointing at the statement defining DEF.
344 If RECIP_DEF is set, a dominator already has a computation that can
345 be used. */
347 static void
348 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
349 tree def, tree recip_def, int threshold)
351 tree type;
352 gassign *new_stmt;
353 gimple_stmt_iterator gsi;
354 struct occurrence *occ_child;
356 if (!recip_def
357 && (occ->bb_has_division || !flag_trapping_math)
358 && occ->num_divisions >= threshold)
360 /* Make a variable with the replacement and substitute it. */
361 type = TREE_TYPE (def);
362 recip_def = create_tmp_reg (type, "reciptmp");
363 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
364 build_one_cst (type), def);
366 if (occ->bb_has_division)
368 /* Case 1: insert before an existing division. */
369 gsi = gsi_after_labels (occ->bb);
370 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
371 gsi_next (&gsi);
373 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
375 else if (def_gsi && occ->bb == def_gsi->bb)
377 /* Case 2: insert right after the definition. Note that this will
378 never happen if the definition statement can throw, because in
379 that case the sole successor of the statement's basic block will
380 dominate all the uses as well. */
381 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
383 else
385 /* Case 3: insert in a basic block not containing defs/uses. */
386 gsi = gsi_after_labels (occ->bb);
387 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
390 reciprocal_stats.rdivs_inserted++;
392 occ->recip_def_stmt = new_stmt;
395 occ->recip_def = recip_def;
396 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
397 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
401 /* Replace the division at USE_P with a multiplication by the reciprocal, if
402 possible. */
404 static inline void
405 replace_reciprocal (use_operand_p use_p)
407 gimple *use_stmt = USE_STMT (use_p);
408 basic_block bb = gimple_bb (use_stmt);
409 struct occurrence *occ = (struct occurrence *) bb->aux;
411 if (optimize_bb_for_speed_p (bb)
412 && occ->recip_def && use_stmt != occ->recip_def_stmt)
414 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
415 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
416 SET_USE (use_p, occ->recip_def);
417 fold_stmt_inplace (&gsi);
418 update_stmt (use_stmt);
423 /* Free OCC and return one more "struct occurrence" to be freed. */
425 static struct occurrence *
426 free_bb (struct occurrence *occ)
428 struct occurrence *child, *next;
430 /* First get the two pointers hanging off OCC. */
431 next = occ->next;
432 child = occ->children;
433 occ->bb->aux = NULL;
434 occ_pool->remove (occ);
436 /* Now ensure that we don't recurse unless it is necessary. */
437 if (!child)
438 return next;
439 else
441 while (next)
442 next = free_bb (next);
444 return child;
449 /* Look for floating-point divisions among DEF's uses, and try to
450 replace them by multiplications with the reciprocal. Add
451 as many statements computing the reciprocal as needed.
453 DEF must be a GIMPLE register of a floating-point type. */
455 static void
456 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
458 use_operand_p use_p;
459 imm_use_iterator use_iter;
460 struct occurrence *occ;
461 int count = 0, threshold;
463 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
465 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
467 gimple *use_stmt = USE_STMT (use_p);
468 if (is_division_by (use_stmt, def))
470 register_division_in (gimple_bb (use_stmt));
471 count++;
475 /* Do the expensive part only if we can hope to optimize something. */
476 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
477 if (count >= threshold)
479 gimple *use_stmt;
480 for (occ = occ_head; occ; occ = occ->next)
482 compute_merit (occ);
483 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
486 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
488 if (is_division_by (use_stmt, def))
490 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
491 replace_reciprocal (use_p);
496 for (occ = occ_head; occ; )
497 occ = free_bb (occ);
499 occ_head = NULL;
502 /* Go through all the floating-point SSA_NAMEs, and call
503 execute_cse_reciprocals_1 on each of them. */
504 namespace {
506 const pass_data pass_data_cse_reciprocals =
508 GIMPLE_PASS, /* type */
509 "recip", /* name */
510 OPTGROUP_NONE, /* optinfo_flags */
511 TV_NONE, /* tv_id */
512 PROP_ssa, /* properties_required */
513 0, /* properties_provided */
514 0, /* properties_destroyed */
515 0, /* todo_flags_start */
516 TODO_update_ssa, /* todo_flags_finish */
519 class pass_cse_reciprocals : public gimple_opt_pass
521 public:
522 pass_cse_reciprocals (gcc::context *ctxt)
523 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
526 /* opt_pass methods: */
527 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
528 virtual unsigned int execute (function *);
530 }; // class pass_cse_reciprocals
532 unsigned int
533 pass_cse_reciprocals::execute (function *fun)
535 basic_block bb;
536 tree arg;
538 occ_pool = new object_allocator<occurrence> ("dominators for recip");
540 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
541 calculate_dominance_info (CDI_DOMINATORS);
542 calculate_dominance_info (CDI_POST_DOMINATORS);
544 #ifdef ENABLE_CHECKING
545 FOR_EACH_BB_FN (bb, fun)
546 gcc_assert (!bb->aux);
547 #endif
549 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
550 if (FLOAT_TYPE_P (TREE_TYPE (arg))
551 && is_gimple_reg (arg))
553 tree name = ssa_default_def (fun, arg);
554 if (name)
555 execute_cse_reciprocals_1 (NULL, name);
558 FOR_EACH_BB_FN (bb, fun)
560 tree def;
562 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
563 gsi_next (&gsi))
565 gphi *phi = gsi.phi ();
566 def = PHI_RESULT (phi);
567 if (! virtual_operand_p (def)
568 && FLOAT_TYPE_P (TREE_TYPE (def)))
569 execute_cse_reciprocals_1 (NULL, def);
572 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
573 gsi_next (&gsi))
575 gimple *stmt = gsi_stmt (gsi);
577 if (gimple_has_lhs (stmt)
578 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
579 && FLOAT_TYPE_P (TREE_TYPE (def))
580 && TREE_CODE (def) == SSA_NAME)
581 execute_cse_reciprocals_1 (&gsi, def);
584 if (optimize_bb_for_size_p (bb))
585 continue;
587 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
588 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
589 gsi_next (&gsi))
591 gimple *stmt = gsi_stmt (gsi);
592 tree fndecl;
594 if (is_gimple_assign (stmt)
595 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
597 tree arg1 = gimple_assign_rhs2 (stmt);
598 gimple *stmt1;
600 if (TREE_CODE (arg1) != SSA_NAME)
601 continue;
603 stmt1 = SSA_NAME_DEF_STMT (arg1);
605 if (is_gimple_call (stmt1)
606 && gimple_call_lhs (stmt1)
607 && (fndecl = gimple_call_fndecl (stmt1))
608 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
609 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
611 enum built_in_function code;
612 bool md_code, fail;
613 imm_use_iterator ui;
614 use_operand_p use_p;
616 code = DECL_FUNCTION_CODE (fndecl);
617 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
619 fndecl = targetm.builtin_reciprocal (code, md_code, false);
620 if (!fndecl)
621 continue;
623 /* Check that all uses of the SSA name are divisions,
624 otherwise replacing the defining statement will do
625 the wrong thing. */
626 fail = false;
627 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
629 gimple *stmt2 = USE_STMT (use_p);
630 if (is_gimple_debug (stmt2))
631 continue;
632 if (!is_gimple_assign (stmt2)
633 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
634 || gimple_assign_rhs1 (stmt2) == arg1
635 || gimple_assign_rhs2 (stmt2) != arg1)
637 fail = true;
638 break;
641 if (fail)
642 continue;
644 gimple_replace_ssa_lhs (stmt1, arg1);
645 gimple_call_set_fndecl (stmt1, fndecl);
646 update_stmt (stmt1);
647 reciprocal_stats.rfuncs_inserted++;
649 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
651 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
652 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
653 fold_stmt_inplace (&gsi);
654 update_stmt (stmt);
661 statistics_counter_event (fun, "reciprocal divs inserted",
662 reciprocal_stats.rdivs_inserted);
663 statistics_counter_event (fun, "reciprocal functions inserted",
664 reciprocal_stats.rfuncs_inserted);
666 free_dominance_info (CDI_DOMINATORS);
667 free_dominance_info (CDI_POST_DOMINATORS);
668 delete occ_pool;
669 return 0;
672 } // anon namespace
674 gimple_opt_pass *
675 make_pass_cse_reciprocals (gcc::context *ctxt)
677 return new pass_cse_reciprocals (ctxt);
680 /* Records an occurrence at statement USE_STMT in the vector of statements
681 STMTS if it is dominated by *TOP_BB, dominates it, or if *TOP_BB is
682 not yet initialized. Returns true if the occurrence was pushed on
683 the vector. Adjusts *TOP_BB to be the basic block dominating all
684 statements in the vector. */
686 static bool
687 maybe_record_sincos (vec<gimple *> *stmts,
688 basic_block *top_bb, gimple *use_stmt)
690 basic_block use_bb = gimple_bb (use_stmt);
691 if (*top_bb
692 && (*top_bb == use_bb
693 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
694 stmts->safe_push (use_stmt);
695 else if (!*top_bb
696 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
698 stmts->safe_push (use_stmt);
699 *top_bb = use_bb;
701 else
702 return false;
704 return true;
707 /* Look for sin, cos and cexpi calls with the same argument NAME and
708 create a single call to cexpi CSEing the result in this case.
709 We first walk over all immediate uses of the argument, collecting
710 statements that we can CSE in a vector, and in a second pass replace
711 the statement rhs with a REALPART or IMAGPART expression on the
712 result of the cexpi call we insert before the use statement that
713 dominates all other candidates. */
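/* An illustrative sketch of the rewrite (hypothetical SSA names):

     s_2 = sin (x_1);                  t_4 = cexpi (x_1);
     c_3 = cos (x_1);          ==>     s_2 = IMAGPART_EXPR <t_4>;
                                       c_3 = REALPART_EXPR <t_4>;  */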
715 static bool
716 execute_cse_sincos_1 (tree name)
718 gimple_stmt_iterator gsi;
719 imm_use_iterator use_iter;
720 tree fndecl, res, type;
721 gimple *def_stmt, *use_stmt, *stmt;
722 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
723 auto_vec<gimple *> stmts;
724 basic_block top_bb = NULL;
725 int i;
726 bool cfg_changed = false;
728 type = TREE_TYPE (name);
729 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
731 if (gimple_code (use_stmt) != GIMPLE_CALL
732 || !gimple_call_lhs (use_stmt)
733 || !(fndecl = gimple_call_fndecl (use_stmt))
734 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
735 continue;
737 switch (DECL_FUNCTION_CODE (fndecl))
739 CASE_FLT_FN (BUILT_IN_COS):
740 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
741 break;
743 CASE_FLT_FN (BUILT_IN_SIN):
744 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
745 break;
747 CASE_FLT_FN (BUILT_IN_CEXPI):
748 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
749 break;
751 default:;
755 if (seen_cos + seen_sin + seen_cexpi <= 1)
756 return false;
758 /* Simply insert cexpi at the beginning of top_bb but not earlier than
759 the name def statement. */
760 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
761 if (!fndecl)
762 return false;
763 stmt = gimple_build_call (fndecl, 1, name);
764 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
765 gimple_call_set_lhs (stmt, res);
767 def_stmt = SSA_NAME_DEF_STMT (name);
768 if (!SSA_NAME_IS_DEFAULT_DEF (name)
769 && gimple_code (def_stmt) != GIMPLE_PHI
770 && gimple_bb (def_stmt) == top_bb)
772 gsi = gsi_for_stmt (def_stmt);
773 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
775 else
777 gsi = gsi_after_labels (top_bb);
778 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
780 sincos_stats.inserted++;
782 /* And adjust the recorded old call sites. */
783 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
785 tree rhs = NULL;
786 fndecl = gimple_call_fndecl (use_stmt);
788 switch (DECL_FUNCTION_CODE (fndecl))
790 CASE_FLT_FN (BUILT_IN_COS):
791 rhs = fold_build1 (REALPART_EXPR, type, res);
792 break;
794 CASE_FLT_FN (BUILT_IN_SIN):
795 rhs = fold_build1 (IMAGPART_EXPR, type, res);
796 break;
798 CASE_FLT_FN (BUILT_IN_CEXPI):
799 rhs = res;
800 break;
802 default:;
803 gcc_unreachable ();
806 /* Replace call with a copy. */
807 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
809 gsi = gsi_for_stmt (use_stmt);
810 gsi_replace (&gsi, stmt, true);
811 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
812 cfg_changed = true;
815 return cfg_changed;
818 /* To evaluate powi(x,n), the floating point value x raised to the
819 constant integer exponent n, we use a hybrid algorithm that
820 combines the "window method" with look-up tables. For an
821 introduction to exponentiation algorithms and "addition chains",
822 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
823 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
824 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
825 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
827 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
828 multiplications to inline before calling the system library's pow
829 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
830 so this default never requires calling pow, powf or powl. */
832 #ifndef POWI_MAX_MULTS
833 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
834 #endif
836 /* The size of the "optimal power tree" lookup table. All
837 exponents less than this value are simply looked up in the
838 powi_table below. This threshold is also used to size the
839 cache of pseudo registers that hold intermediate results. */
840 #define POWI_TABLE_SIZE 256
842 /* The size, in bits, of the window used in the "window method"
843 exponentiation algorithm. This is equivalent to a radix of
844 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
845 #define POWI_WINDOW_SIZE 3
847 /* The following table is an efficient representation of an
848 "optimal power tree". For each value, i, the corresponding
849 value, j, in the table states that an optimal evaluation
850 sequence for calculating pow(x,i) can be found by evaluating
851 pow(x,j)*pow(x,i-j). An optimal power tree for the first
852 100 integers is given in Knuth's "Seminumerical algorithms". */
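/* A worked example: powi_table[5] == 3, so x**5 is evaluated as
   x**3 * x**2, where in turn x**3 == x**2 * x and x**2 == x * x:

     t1 = x * x;       (x**2)
     t2 = t1 * x;      (x**3)
     t3 = t2 * t1;     (x**5)

   i.e. three multiplications in total, which is also what powi_cost (5)
   returns.  */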
854 static const unsigned char powi_table[POWI_TABLE_SIZE] =
856 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
857 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
858 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
859 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
860 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
861 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
862 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
863 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
864 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
865 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
866 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
867 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
868 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
869 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
870 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
871 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
872 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
873 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
874 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
875 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
876 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
877 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
878 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
879 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
880 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
881 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
882 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
883 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
884 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
885 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
886 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
887 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
891 /* Return the number of multiplications required to calculate
892 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
893 subroutine of powi_cost. CACHE is an array indicating
894 which exponents have already been calculated. */
896 static int
897 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
899 /* If we've already calculated this exponent, then this evaluation
900 doesn't require any additional multiplications. */
901 if (cache[n])
902 return 0;
904 cache[n] = true;
905 return powi_lookup_cost (n - powi_table[n], cache)
906 + powi_lookup_cost (powi_table[n], cache) + 1;
909 /* Return the number of multiplications required to calculate
910 powi(x,n) for an arbitrary x, given the exponent N. This
911 function needs to be kept in sync with powi_as_mults below. */
913 static int
914 powi_cost (HOST_WIDE_INT n)
916 bool cache[POWI_TABLE_SIZE];
917 unsigned HOST_WIDE_INT digit;
918 unsigned HOST_WIDE_INT val;
919 int result;
921 if (n == 0)
922 return 0;
924 /* Ignore the reciprocal when calculating the cost. */
925 val = (n < 0) ? -n : n;
927 /* Initialize the exponent cache. */
928 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
929 cache[1] = true;
931 result = 0;
933 while (val >= POWI_TABLE_SIZE)
935 if (val & 1)
937 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
938 result += powi_lookup_cost (digit, cache)
939 + POWI_WINDOW_SIZE + 1;
940 val >>= POWI_WINDOW_SIZE;
942 else
944 val >>= 1;
945 result++;
949 return result + powi_lookup_cost (val, cache);
952 /* Recursive subroutine of powi_as_mults. This function takes the
953 array, CACHE, of already calculated exponents and an exponent N and
954 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
956 static tree
957 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
958 HOST_WIDE_INT n, tree *cache)
960 tree op0, op1, ssa_target;
961 unsigned HOST_WIDE_INT digit;
962 gassign *mult_stmt;
964 if (n < POWI_TABLE_SIZE && cache[n])
965 return cache[n];
967 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
969 if (n < POWI_TABLE_SIZE)
971 cache[n] = ssa_target;
972 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
973 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
975 else if (n & 1)
977 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
978 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
979 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
981 else
983 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
984 op1 = op0;
987 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
988 gimple_set_location (mult_stmt, loc);
989 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
991 return ssa_target;
994 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
995 This function needs to be kept in sync with powi_cost above. */
997 static tree
998 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
999 tree arg0, HOST_WIDE_INT n)
1001 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1002 gassign *div_stmt;
1003 tree target;
1005 if (n == 0)
1006 return build_real (type, dconst1);
1008 memset (cache, 0, sizeof (cache));
1009 cache[1] = arg0;
1011 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1012 if (n >= 0)
1013 return result;
1015 /* If the original exponent was negative, reciprocate the result. */
1016 target = make_temp_ssa_name (type, NULL, "powmult");
1017 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1018 build_real (type, dconst1), result);
1019 gimple_set_location (div_stmt, loc);
1020 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1022 return target;
1025 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1026 location info LOC. If the arguments are appropriate, create an
1027 equivalent sequence of statements prior to GSI using an optimal
1028 number of multiplications, and return an expression holding the
1029 result. */
1031 static tree
1032 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1033 tree arg0, HOST_WIDE_INT n)
1035 /* Avoid largest negative number. */
1036 if (n != -n
1037 && ((n >= -1 && n <= 2)
1038 || (optimize_function_for_speed_p (cfun)
1039 && powi_cost (n) <= POWI_MAX_MULTS)))
1040 return powi_as_mults (gsi, loc, arg0, n);
1042 return NULL_TREE;
1045 /* Build a gimple call statement that calls FN with argument ARG.
1046 Set the lhs of the call statement to a fresh SSA name. Insert the
1047 statement prior to GSI's current position, and return the fresh
1048 SSA name. */
1050 static tree
1051 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1052 tree fn, tree arg)
1054 gcall *call_stmt;
1055 tree ssa_target;
1057 call_stmt = gimple_build_call (fn, 1, arg);
1058 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1059 gimple_set_lhs (call_stmt, ssa_target);
1060 gimple_set_location (call_stmt, loc);
1061 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1063 return ssa_target;
1066 /* Build a gimple binary operation with the given CODE and arguments
1067 ARG0, ARG1, assigning the result to a new SSA name for variable
1068 TARGET. Insert the statement prior to GSI's current position, and
1069 return the fresh SSA name. */
1071 static tree
1072 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1073 const char *name, enum tree_code code,
1074 tree arg0, tree arg1)
1076 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1077 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1078 gimple_set_location (stmt, loc);
1079 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1080 return result;
1083 /* Build a gimple reference operation with the given CODE and argument
1084 ARG, assigning the result to a new SSA name of TYPE with NAME.
1085 Insert the statement prior to GSI's current position, and return
1086 the fresh SSA name. */
1088 static inline tree
1089 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1090 const char *name, enum tree_code code, tree arg0)
1092 tree result = make_temp_ssa_name (type, NULL, name);
1093 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1094 gimple_set_location (stmt, loc);
1095 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1096 return result;
1099 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1100 prior to GSI's current position, and return the fresh SSA name. */
1102 static tree
1103 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1104 tree type, tree val)
1106 tree result = make_ssa_name (type);
1107 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1108 gimple_set_location (stmt, loc);
1109 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1110 return result;
1113 struct pow_synth_sqrt_info
1115 bool *factors;
1116 unsigned int deepest;
1117 unsigned int num_mults;
1120 /* Return true iff the real value C can be represented as a
1121 sum of powers of 0.5 up to N. That is:
1122 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1123 Record in INFO the various parameters of the synthesis algorithm such
1124 as the factors a[i], the maximum 0.5 power and the number of
1125 multiplications that will be required. */
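/* For example, C == 0.625 == 0.5 + 0.125 == 0.5**1 + 0.5**3 yields
   factors == {1, 0, 1}, deepest == 3 and num_mults == 1; the fractional
   power is then synthesized as sqrt (x) * sqrt (sqrt (sqrt (x))), as in
   the pow (x, 3.625) example further below.  */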
1127 bool
1128 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1129 struct pow_synth_sqrt_info *info)
1131 REAL_VALUE_TYPE factor = dconsthalf;
1132 REAL_VALUE_TYPE remainder = c;
1134 info->deepest = 0;
1135 info->num_mults = 0;
1136 memset (info->factors, 0, n * sizeof (bool));
1138 for (unsigned i = 0; i < n; i++)
1140 REAL_VALUE_TYPE res;
1142 /* If something inexact happened bail out now. */
1143 if (REAL_ARITHMETIC (res, MINUS_EXPR, remainder, factor))
1144 return false;
1146 /* We have hit zero. The number is representable as a sum
1147 of powers of 0.5. */
1148 if (REAL_VALUES_EQUAL (res, dconst0))
1150 info->factors[i] = true;
1151 info->deepest = i + 1;
1152 return true;
1154 else if (!REAL_VALUE_NEGATIVE (res))
1156 remainder = res;
1157 info->factors[i] = true;
1158 info->num_mults++;
1160 else
1161 info->factors[i] = false;
1163 REAL_ARITHMETIC (factor, MULT_EXPR, factor, dconsthalf);
1165 return false;
1168 /* Return the tree corresponding to FN being applied
1169 to ARG N times at GSI and LOC.
1170 Look up previous results from CACHE if need be.
1171 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
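/* For example, get_fn_chain (x, 2, gsi, sqrtfn, loc, cache) emits
   t1 = sqrt (x); t2 = sqrt (t1); and records both in CACHE, so a later
   request for a shallower chain reuses t1 instead of emitting a new call.  */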
1173 static tree
1174 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1175 tree fn, location_t loc, tree *cache)
1177 tree res = cache[n];
1178 if (!res)
1180 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1181 res = build_and_insert_call (gsi, loc, fn, prev);
1182 cache[n] = res;
1185 return res;
1188 /* Print to STREAM the repeated application of function FNAME to ARG
1189 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1190 "foo (foo (x))". */
1192 static void
1193 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1194 unsigned int n)
1196 if (n == 0)
1197 fprintf (stream, "%s", arg);
1198 else
1200 fprintf (stream, "%s (", fname);
1201 print_nested_fn (stream, fname, arg, n - 1);
1202 fprintf (stream, ")");
1206 /* Print to STREAM the fractional sequence of sqrt chains
1207 applied to ARG, described by INFO. Used for the dump file. */
1209 static void
1210 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1211 struct pow_synth_sqrt_info *info)
1213 for (unsigned int i = 0; i < info->deepest; i++)
1215 bool is_set = info->factors[i];
1216 if (is_set)
1218 print_nested_fn (stream, "sqrt", arg, i + 1);
1219 if (i != info->deepest - 1)
1220 fprintf (stream, " * ");
1225 /* Print to STREAM a representation of raising ARG to an integer
1226 power N. Used for the dump file. */
1228 static void
1229 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1231 if (n > 1)
1232 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1233 else if (n == 1)
1234 fprintf (stream, "%s", arg);
1237 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1238 square roots. Place at GSI and LOC. Limit the maximum depth
1239 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1240 result of the expanded sequence or NULL_TREE if the expansion failed.
1242 This routine assumes that ARG1 is a real number with a fractional part
1243 (the integer exponent case will have been handled earlier in
1244 gimple_expand_builtin_pow).
1246 For ARG1 > 0.0:
1247 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1248 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1249 FRAC_PART == ARG1 - WHOLE_PART:
1250 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1251 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1252 if it can be expressed as such, that is if FRAC_PART satisfies:
1253 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1254 where integer a[i] is either 0 or 1.
1256 Example:
1257 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1258 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1260 For ARG1 < 0.0 there are two approaches:
1261 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1262 is calculated as above.
1264 Example:
1265 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1266 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1268 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1269 FRAC_PART := ARG1 - WHOLE_PART
1270 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1271 Example:
1272 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1273 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1275 For ARG1 < 0.0 we choose between (A) and (B) depending on
1276 how many multiplications we'd have to do.
1277 So, for the example in (B): POW (x, -5.875), if we were to
1278 follow algorithm (A) we would produce:
1279 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1280 which contains more multiplications than approach (B).
1282 Hopefully, this approach will eliminate potentially expensive POW library
1283 calls when unsafe floating point math is enabled and allow the compiler to
1284 further optimise the multiplies, square roots and divides produced by this
1285 function. */
1287 static tree
1288 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1289 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1291 tree type = TREE_TYPE (arg0);
1292 machine_mode mode = TYPE_MODE (type);
1293 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1294 bool one_over = true;
1296 if (!sqrtfn)
1297 return NULL_TREE;
1299 if (TREE_CODE (arg1) != REAL_CST)
1300 return NULL_TREE;
1302 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1304 gcc_assert (max_depth > 0);
1305 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1307 struct pow_synth_sqrt_info synth_info;
1308 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1309 synth_info.deepest = 0;
1310 synth_info.num_mults = 0;
1312 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1313 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1315 /* The whole and fractional parts of exp. */
1316 REAL_VALUE_TYPE whole_part;
1317 REAL_VALUE_TYPE frac_part;
1319 real_floor (&whole_part, mode, &exp);
1320 REAL_ARITHMETIC (frac_part, MINUS_EXPR, exp, whole_part);
1323 REAL_VALUE_TYPE ceil_whole = dconst0;
1324 REAL_VALUE_TYPE ceil_fract = dconst0;
1326 if (neg_exp)
1328 real_ceil (&ceil_whole, mode, &exp);
1329 REAL_ARITHMETIC (ceil_fract, MINUS_EXPR, ceil_whole, exp);
1332 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1333 return NULL_TREE;
1335 /* Check whether it's more profitable to not use 1.0 / ... */
1336 if (neg_exp)
1338 struct pow_synth_sqrt_info alt_synth_info;
1339 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1340 alt_synth_info.deepest = 0;
1341 alt_synth_info.num_mults = 0;
1343 if (representable_as_half_series_p (ceil_fract, max_depth,
1344 &alt_synth_info)
1345 && alt_synth_info.deepest <= synth_info.deepest
1346 && alt_synth_info.num_mults < synth_info.num_mults)
1348 whole_part = ceil_whole;
1349 frac_part = ceil_fract;
1350 synth_info.deepest = alt_synth_info.deepest;
1351 synth_info.num_mults = alt_synth_info.num_mults;
1352 memcpy (synth_info.factors, alt_synth_info.factors,
1353 (max_depth + 1) * sizeof (bool));
1354 one_over = false;
1358 HOST_WIDE_INT n = real_to_integer (&whole_part);
1359 REAL_VALUE_TYPE cint;
1360 real_from_integer (&cint, VOIDmode, n, SIGNED);
1362 if (!real_identical (&whole_part, &cint))
1363 return NULL_TREE;
1365 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1366 return NULL_TREE;
1368 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1370 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1372 /* Calculate the integer part of the exponent. */
1373 if (n > 1)
1375 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1376 if (!integer_res)
1377 return NULL_TREE;
1380 if (dump_file)
1382 char string[64];
1384 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1385 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1387 if (neg_exp)
1389 if (one_over)
1391 fprintf (dump_file, "1.0 / (");
1392 dump_integer_part (dump_file, "x", n);
1393 if (n > 0)
1394 fprintf (dump_file, " * ");
1395 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1396 fprintf (dump_file, ")");
1398 else
1400 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1401 fprintf (dump_file, " / (");
1402 dump_integer_part (dump_file, "x", n);
1403 fprintf (dump_file, ")");
1406 else
1408 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1409 if (n > 0)
1410 fprintf (dump_file, " * ");
1411 dump_integer_part (dump_file, "x", n);
1414 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1418 tree fract_res = NULL_TREE;
1419 cache[0] = arg0;
1421 /* Calculate the fractional part of the exponent. */
1422 for (unsigned i = 0; i < synth_info.deepest; i++)
1424 if (synth_info.factors[i])
1426 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1428 if (!fract_res)
1429 fract_res = sqrt_chain;
1431 else
1432 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1433 fract_res, sqrt_chain);
1437 tree res = NULL_TREE;
1439 if (neg_exp)
1441 if (one_over)
1443 if (n > 0)
1444 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1445 fract_res, integer_res);
1446 else
1447 res = fract_res;
1449 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1450 build_real (type, dconst1), res);
1452 else
1454 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1455 fract_res, integer_res);
1458 else
1459 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1460 fract_res, integer_res);
1461 return res;
1464 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1465 with location info LOC. If possible, create an equivalent and
1466 less expensive sequence of statements prior to GSI, and return an
1467 expression holding the result. */
1469 static tree
1470 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1471 tree arg0, tree arg1)
1473 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1474 REAL_VALUE_TYPE c2, dconst3;
1475 HOST_WIDE_INT n;
1476 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1477 machine_mode mode;
1478 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1479 bool hw_sqrt_exists, c_is_int, c2_is_int;
1481 dconst1_4 = dconst1;
1482 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1484 /* If the exponent isn't a constant, there's nothing of interest
1485 to be done. */
1486 if (TREE_CODE (arg1) != REAL_CST)
1487 return NULL_TREE;
1489 /* If the exponent is equivalent to an integer, expand to an optimal
1490 multiplication sequence when profitable. */
1491 c = TREE_REAL_CST (arg1);
1492 n = real_to_integer (&c);
1493 real_from_integer (&cint, VOIDmode, n, SIGNED);
1494 c_is_int = real_identical (&c, &cint);
1496 if (c_is_int
1497 && ((n >= -1 && n <= 2)
1498 || (flag_unsafe_math_optimizations
1499 && speed_p
1500 && powi_cost (n) <= POWI_MAX_MULTS)))
1501 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1503 /* Attempt various optimizations using sqrt and cbrt. */
1504 type = TREE_TYPE (arg0);
1505 mode = TYPE_MODE (type);
1506 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1508 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1509 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1510 sqrt(-0) = -0. */
1511 if (sqrtfn
1512 && REAL_VALUES_EQUAL (c, dconsthalf)
1513 && !HONOR_SIGNED_ZEROS (mode))
1514 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1516 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1518 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1519 optimizations since 1./3. is not exactly representable. If x
1520 is negative and finite, the correct value of pow(x,1./3.) is
1521 a NaN with the "invalid" exception raised, because the value
1522 of 1./3. actually has an even denominator. The correct value
1523 of cbrt(x) is a negative real value. */
1524 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1525 dconst1_3 = real_value_truncate (mode, dconst_third ());
1527 if (flag_unsafe_math_optimizations
1528 && cbrtfn
1529 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1530 && REAL_VALUES_EQUAL (c, dconst1_3))
1531 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1533 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1534 if we don't have a hardware sqrt insn. */
1535 dconst1_6 = dconst1_3;
1536 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1538 if (flag_unsafe_math_optimizations
1539 && sqrtfn
1540 && cbrtfn
1541 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1542 && speed_p
1543 && hw_sqrt_exists
1544 && REAL_VALUES_EQUAL (c, dconst1_6))
1546 /* sqrt(x) */
1547 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1549 /* cbrt(sqrt(x)) */
1550 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1554 /* Attempt to expand the POW as a product of square root chains.
1555 Expand the 0.25 case even when optimising for size. */
1556 if (flag_unsafe_math_optimizations
1557 && sqrtfn
1558 && hw_sqrt_exists
1559 && (speed_p || REAL_VALUES_EQUAL (c, dconst1_4))
1560 && !HONOR_SIGNED_ZEROS (mode))
1562 unsigned int max_depth = speed_p
1563 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1564 : 2;
1566 tree expand_with_sqrts
1567 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1569 if (expand_with_sqrts)
1570 return expand_with_sqrts;
1573 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1574 n = real_to_integer (&c2);
1575 real_from_integer (&cint, VOIDmode, n, SIGNED);
1576 c2_is_int = real_identical (&c2, &cint);
1578 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1580 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1581 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1583 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1584 different from pow(x, 1./3.) due to rounding and behavior with
1585 negative x, we need to constrain this transformation to unsafe
1586 math and positive x or finite math. */
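/* For example, with c == 5./3. the rounded value of 3*c is n == 5, and the
   call is rewritten as powi (x, 5/3) * powi (cbrt (x), 5%3), i.e.
   x * cbrt (x) * cbrt (x).  */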
1587 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1588 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1589 real_round (&c2, mode, &c2);
1590 n = real_to_integer (&c2);
1591 real_from_integer (&cint, VOIDmode, n, SIGNED);
1592 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1593 real_convert (&c2, mode, &c2);
1595 if (flag_unsafe_math_optimizations
1596 && cbrtfn
1597 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1598 && real_identical (&c2, &c)
1599 && !c2_is_int
1600 && optimize_function_for_speed_p (cfun)
1601 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1603 tree powi_x_ndiv3 = NULL_TREE;
1605 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1606 possible or profitable, give up. Skip the degenerate case when
1607 abs(n) < 3, where the result is always 1. */
1608 if (absu_hwi (n) >= 3)
1610 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1611 abs_hwi (n / 3));
1612 if (!powi_x_ndiv3)
1613 return NULL_TREE;
1616 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1617 as that creates an unnecessary variable. Instead, just produce
1618 either cbrt(x) or cbrt(x) * cbrt(x). */
1619 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1621 if (absu_hwi (n) % 3 == 1)
1622 powi_cbrt_x = cbrt_x;
1623 else
1624 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1625 cbrt_x, cbrt_x);
1627 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1628 if (absu_hwi (n) < 3)
1629 result = powi_cbrt_x;
1630 else
1631 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1632 powi_x_ndiv3, powi_cbrt_x);
1634 /* If n is negative, reciprocate the result. */
1635 if (n < 0)
1636 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1637 build_real (type, dconst1), result);
1639 return result;
1642 /* No optimizations succeeded. */
1643 return NULL_TREE;
1646 /* ARG is the argument to a cabs builtin call in GSI with location info
1647 LOC. Create a sequence of statements prior to GSI that calculates
1648 sqrt(R*R + I*I), where R and I are the real and imaginary components
1649 of ARG, respectively. Return an expression holding the result. */
1651 static tree
1652 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1654 tree real_part, imag_part, addend1, addend2, sum, result;
1655 tree type = TREE_TYPE (TREE_TYPE (arg));
1656 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1657 machine_mode mode = TYPE_MODE (type);
1659 if (!flag_unsafe_math_optimizations
1660 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1661 || !sqrtfn
1662 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1663 return NULL_TREE;
1665 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1666 REALPART_EXPR, arg);
1667 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1668 real_part, real_part);
1669 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1670 IMAGPART_EXPR, arg);
1671 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1672 imag_part, imag_part);
1673 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1674 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1676 return result;
1679 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1680 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1681 an optimal number of multiplies, when n is a constant. */
1683 namespace {
1685 const pass_data pass_data_cse_sincos =
1687 GIMPLE_PASS, /* type */
1688 "sincos", /* name */
1689 OPTGROUP_NONE, /* optinfo_flags */
1690 TV_NONE, /* tv_id */
1691 PROP_ssa, /* properties_required */
1692 0, /* properties_provided */
1693 0, /* properties_destroyed */
1694 0, /* todo_flags_start */
1695 TODO_update_ssa, /* todo_flags_finish */
1698 class pass_cse_sincos : public gimple_opt_pass
1700 public:
1701 pass_cse_sincos (gcc::context *ctxt)
1702 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1705 /* opt_pass methods: */
1706 virtual bool gate (function *)
1708 /* We no longer require either sincos or cexp, since powi expansion
1709 piggybacks on this pass. */
1710 return optimize;
1713 virtual unsigned int execute (function *);
1715 }; // class pass_cse_sincos
1717 unsigned int
1718 pass_cse_sincos::execute (function *fun)
1720 basic_block bb;
1721 bool cfg_changed = false;
1723 calculate_dominance_info (CDI_DOMINATORS);
1724 memset (&sincos_stats, 0, sizeof (sincos_stats));
1726 FOR_EACH_BB_FN (bb, fun)
1728 gimple_stmt_iterator gsi;
1729 bool cleanup_eh = false;
1731 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1733 gimple *stmt = gsi_stmt (gsi);
1734 tree fndecl;
1736 /* Only the last stmt in a bb could throw, no need to call
1737 gimple_purge_dead_eh_edges if we change something in the middle
1738 of a basic block. */
1739 cleanup_eh = false;
1741 if (is_gimple_call (stmt)
1742 && gimple_call_lhs (stmt)
1743 && (fndecl = gimple_call_fndecl (stmt))
1744 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1746 tree arg, arg0, arg1, result;
1747 HOST_WIDE_INT n;
1748 location_t loc;
1750 switch (DECL_FUNCTION_CODE (fndecl))
1752 CASE_FLT_FN (BUILT_IN_COS):
1753 CASE_FLT_FN (BUILT_IN_SIN):
1754 CASE_FLT_FN (BUILT_IN_CEXPI):
1755 /* Make sure we have either sincos or cexp. */
1756 if (!targetm.libc_has_function (function_c99_math_complex)
1757 && !targetm.libc_has_function (function_sincos))
1758 break;
1760 arg = gimple_call_arg (stmt, 0);
1761 if (TREE_CODE (arg) == SSA_NAME)
1762 cfg_changed |= execute_cse_sincos_1 (arg);
1763 break;
1765 CASE_FLT_FN (BUILT_IN_POW):
1766 arg0 = gimple_call_arg (stmt, 0);
1767 arg1 = gimple_call_arg (stmt, 1);
1769 loc = gimple_location (stmt);
1770 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1772 if (result)
1774 tree lhs = gimple_get_lhs (stmt);
1775 gassign *new_stmt = gimple_build_assign (lhs, result);
1776 gimple_set_location (new_stmt, loc);
1777 unlink_stmt_vdef (stmt);
1778 gsi_replace (&gsi, new_stmt, true);
1779 cleanup_eh = true;
1780 if (gimple_vdef (stmt))
1781 release_ssa_name (gimple_vdef (stmt));
1783 break;
1785 CASE_FLT_FN (BUILT_IN_POWI):
1786 arg0 = gimple_call_arg (stmt, 0);
1787 arg1 = gimple_call_arg (stmt, 1);
1788 loc = gimple_location (stmt);
1790 if (real_minus_onep (arg0))
1792 tree t0, t1, cond, one, minus_one;
1793 gassign *stmt;
1795 t0 = TREE_TYPE (arg0);
1796 t1 = TREE_TYPE (arg1);
1797 one = build_real (t0, dconst1);
1798 minus_one = build_real (t0, dconstm1);
1800 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1801 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1802 arg1, build_int_cst (t1, 1));
1803 gimple_set_location (stmt, loc);
1804 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1806 result = make_temp_ssa_name (t0, NULL, "powi");
1807 stmt = gimple_build_assign (result, COND_EXPR, cond,
1808 minus_one, one);
1809 gimple_set_location (stmt, loc);
1810 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1812 else
1814 if (!tree_fits_shwi_p (arg1))
1815 break;
1817 n = tree_to_shwi (arg1);
1818 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1821 if (result)
1823 tree lhs = gimple_get_lhs (stmt);
1824 gassign *new_stmt = gimple_build_assign (lhs, result);
1825 gimple_set_location (new_stmt, loc);
1826 unlink_stmt_vdef (stmt);
1827 gsi_replace (&gsi, new_stmt, true);
1828 cleanup_eh = true;
1829 if (gimple_vdef (stmt))
1830 release_ssa_name (gimple_vdef (stmt));
1832 break;
1834 CASE_FLT_FN (BUILT_IN_CABS):
1835 arg0 = gimple_call_arg (stmt, 0);
1836 loc = gimple_location (stmt);
1837 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1839 if (result)
1841 tree lhs = gimple_get_lhs (stmt);
1842 gassign *new_stmt = gimple_build_assign (lhs, result);
1843 gimple_set_location (new_stmt, loc);
1844 unlink_stmt_vdef (stmt);
1845 gsi_replace (&gsi, new_stmt, true);
1846 cleanup_eh = true;
1847 if (gimple_vdef (stmt))
1848 release_ssa_name (gimple_vdef (stmt));
1850 break;
1852 default:;
1856 if (cleanup_eh)
1857 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1860 statistics_counter_event (fun, "sincos statements inserted",
1861 sincos_stats.inserted);
1863 free_dominance_info (CDI_DOMINATORS);
1864 return cfg_changed ? TODO_cleanup_cfg : 0;
1867 } // anon namespace
1869 gimple_opt_pass *
1870 make_pass_cse_sincos (gcc::context *ctxt)
1872 return new pass_cse_sincos (ctxt);
1875 /* A symbolic number is used to detect byte permutation and selection
1876 patterns. Therefore the field N contains an artificial number
1877 consisting of octet sized markers:
1879 0 - target byte has the value 0
1880 FF - target byte has an unknown value (e.g. due to sign extension)
1881 1..size - marker value is the source byte index plus one (1 denotes the lsb).
1883 To detect permutations on memory sources (arrays and structures), a symbolic
1884 number is also associated with a base address (the array or structure the load is
1885 made from), an offset from that base address, and a range which gives the
1886 difference between the highest and lowest accessed memory locations making up
1887 such a symbolic number. The range is thus different from size, which reflects
1888 the size of the type of the current expression. Note that for a non-memory source,
1889 range holds the same value as size.
1891 For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1892 a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1893 still have a size of 2 but this time a range of 1. */
1895 struct symbolic_number {
1896 uint64_t n;
1897 tree type;
1898 tree base_addr;
1899 tree offset;
1900 HOST_WIDE_INT bytepos;
1901 tree alias_set;
1902 tree vuse;
1903 unsigned HOST_WIDE_INT range;
1906 #define BITS_PER_MARKER 8
1907 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1908 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1909 #define HEAD_MARKER(n, size) \
1910 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1912 /* The number which the find_bswap_or_nop_1 result should match in
1913 order to have a nop. The number is masked according to the size of
1914 the symbolic number before using it. */
1915 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1916 (uint64_t)0x08070605 << 32 | 0x04030201)
1918 /* The number which the find_bswap_or_nop_1 result should match in
1919 order to have a byte swap. The number is masked according to the
1920 size of the symbolic number before using it. */
1921 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1922 (uint64_t)0x01020304 << 32 | 0x05060708)
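/* For illustration, for a 4-byte symbolic number the two reference values
   reduce to

     CMPNOP  masked to 4 markers:   0x04030201   (identity, native order)
     CMPXCHG shifted to 4 markers:  0x01020304   (full byte swap)

   so a computation whose final symbolic number equals 0x01020304 is
   recognised as a 32-bit bswap, while 0x04030201 denotes a plain
   native-order read.  */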
1924 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1925 number N. Return false if the requested operation is not permitted
1926 on a symbolic number. */
1928 static inline bool
1929 do_shift_rotate (enum tree_code code,
1930 struct symbolic_number *n,
1931 int count)
1933 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1934 unsigned head_marker;
1936 if (count % BITS_PER_UNIT != 0)
1937 return false;
1938 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1940 /* Zero out the extra bits of N in order to avoid them being shifted
1941 into the significant bits. */
1942 if (size < 64 / BITS_PER_MARKER)
1943 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1945 switch (code)
1947 case LSHIFT_EXPR:
1948 n->n <<= count;
1949 break;
1950 case RSHIFT_EXPR:
1951 head_marker = HEAD_MARKER (n->n, size);
1952 n->n >>= count;
1953 /* Arithmetic shift of signed type: result is dependent on the value. */
1954 if (!TYPE_UNSIGNED (n->type) && head_marker)
1955 for (i = 0; i < count / BITS_PER_MARKER; i++)
1956 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1957 << ((size - 1 - i) * BITS_PER_MARKER);
1958 break;
1959 case LROTATE_EXPR:
1960 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1961 break;
1962 case RROTATE_EXPR:
1963 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1964 break;
1965 default:
1966 return false;
1968 /* Zero unused bits for size. */
1969 if (size < 64 / BITS_PER_MARKER)
1970 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1971 return true;
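/* A small worked example: for a 4-byte unsigned symbolic number whose
   markers are 0x04030201, a shift x << 8 moves every marker up one byte
   position and gives 0x03020100 (the least significant byte is now known
   to be zero), while x >> 8 gives 0x00040302.  Shift or rotate counts that
   are not a whole number of bytes are rejected above.  */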
1974 /* Perform sanity checking for the symbolic number N and the gimple
1975 statement STMT. */
1977 static inline bool
1978 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
1980 tree lhs_type;
1982 lhs_type = gimple_expr_type (stmt);
1984 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1985 return false;
1987 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1988 return false;
1990 return true;
1993 /* Initialize the symbolic number N for the bswap pass from the base element
1994 SRC manipulated by the bitwise OR expression. */
1996 static bool
1997 init_symbolic_number (struct symbolic_number *n, tree src)
1999 int size;
2001 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2003 /* Set up the symbolic number N by setting each byte to a value between 1 and
2004 the byte size of rhs1. The highest order byte is set to that byte size and the
2005 lowest order byte to 1. */
2006 n->type = TREE_TYPE (src);
2007 size = TYPE_PRECISION (n->type);
2008 if (size % BITS_PER_UNIT != 0)
2009 return false;
2010 size /= BITS_PER_UNIT;
2011 if (size > 64 / BITS_PER_MARKER)
2012 return false;
2013 n->range = size;
2014 n->n = CMPNOP;
2016 if (size < 64 / BITS_PER_MARKER)
2017 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2019 return true;
2022 /* Check if STMT might be a byte swap or a nop from a memory source and return
2023 the answer. If so, REF is that memory source and the base of the memory area
2024 accessed and the offset of the access from that base are recorded in N. */
2026 bool
2027 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2029 /* The leaf node is an array or component ref. Memorize its base and
2030 offset from the base to compare to other such leaf nodes. */
2031 HOST_WIDE_INT bitsize, bitpos;
2032 machine_mode mode;
2033 int unsignedp, volatilep;
2034 tree offset, base_addr;
2036 /* Not prepared to handle PDP endian. */
2037 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2038 return false;
2040 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2041 return false;
2043 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2044 &unsignedp, &volatilep, false);
2046 if (TREE_CODE (base_addr) == MEM_REF)
2048 offset_int bit_offset = 0;
2049 tree off = TREE_OPERAND (base_addr, 1);
2051 if (!integer_zerop (off))
2053 offset_int boff, coff = mem_ref_offset (base_addr);
2054 boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
2055 bit_offset += boff;
2058 base_addr = TREE_OPERAND (base_addr, 0);
2060 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2061 if (wi::neg_p (bit_offset))
2063 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2064 offset_int tem = bit_offset.and_not (mask);
2065 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2066 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2067 bit_offset -= tem;
2068 tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
2069 if (offset)
2070 offset = size_binop (PLUS_EXPR, offset,
2071 wide_int_to_tree (sizetype, tem));
2072 else
2073 offset = wide_int_to_tree (sizetype, tem);
2076 bitpos += bit_offset.to_shwi ();
2079 if (bitpos % BITS_PER_UNIT)
2080 return false;
2081 if (bitsize % BITS_PER_UNIT)
2082 return false;
2084 if (!init_symbolic_number (n, ref))
2085 return false;
2086 n->base_addr = base_addr;
2087 n->offset = offset;
2088 n->bytepos = bitpos / BITS_PER_UNIT;
2089 n->alias_set = reference_alias_ptr_type (ref);
2090 n->vuse = gimple_vuse (stmt);
2091 return true;
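/* As a rough example, for a load of a[3] where a is a global char array,
   the recorded information would be base_addr = a, offset = NULL_TREE,
   bytepos = 3, and the symbolic number itself is initialised as for any
   1-byte value; the bytepos values of several such loads are what later
   lets perform_symbolic_merge recognise adjacent accesses.  */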
2094 /* Compute the symbolic number N representing the result of a bitwise OR on the
2095 two symbolic numbers N1 and N2 whose source statements are respectively
2096 SOURCE_STMT1 and SOURCE_STMT2. */
2098 static gimple *
2099 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2100 gimple *source_stmt2, struct symbolic_number *n2,
2101 struct symbolic_number *n)
2103 int i, size;
2104 uint64_t mask;
2105 gimple *source_stmt;
2106 struct symbolic_number *n_start;
2108 /* Sources are different, cancel bswap if they are not memory locations with
2109 the same base (array, structure, ...). */
2110 if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
2112 uint64_t inc;
2113 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2114 struct symbolic_number *toinc_n_ptr, *n_end;
2116 if (!n1->base_addr || !n2->base_addr
2117 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2118 return NULL;
2120 if (!n1->offset != !n2->offset
2121 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2122 return NULL;
2124 if (n1->bytepos < n2->bytepos)
2126 n_start = n1;
2127 start_sub = n2->bytepos - n1->bytepos;
2128 source_stmt = source_stmt1;
2130 else
2132 n_start = n2;
2133 start_sub = n1->bytepos - n2->bytepos;
2134 source_stmt = source_stmt2;
2137 /* Find the highest address at which a load is performed and
2138 compute related info. */
2139 end1 = n1->bytepos + (n1->range - 1);
2140 end2 = n2->bytepos + (n2->range - 1);
2141 if (end1 < end2)
2143 end = end2;
2144 end_sub = end2 - end1;
2146 else
2148 end = end1;
2149 end_sub = end1 - end2;
2151 n_end = (end2 > end1) ? n2 : n1;
2153 /* Find symbolic number whose lsb is the most significant. */
2154 if (BYTES_BIG_ENDIAN)
2155 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2156 else
2157 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2159 n->range = end - n_start->bytepos + 1;
2161 /* Check that the range of memory covered can be represented by
2162 a symbolic number. */
2163 if (n->range > 64 / BITS_PER_MARKER)
2164 return NULL;
2166 /* Reinterpret the byte marks in the symbolic number holding the value of
2167 bigger weight according to the target endianness. */
2168 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2169 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2170 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2172 unsigned marker
2173 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2174 if (marker && marker != MARKER_BYTE_UNKNOWN)
2175 toinc_n_ptr->n += inc;
2178 else
2180 n->range = n1->range;
2181 n_start = n1;
2182 source_stmt = source_stmt1;
2185 if (!n1->alias_set
2186 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2187 n->alias_set = n1->alias_set;
2188 else
2189 n->alias_set = ptr_type_node;
2190 n->vuse = n_start->vuse;
2191 n->base_addr = n_start->base_addr;
2192 n->offset = n_start->offset;
2193 n->bytepos = n_start->bytepos;
2194 n->type = n_start->type;
2195 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2197 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2199 uint64_t masked1, masked2;
2201 masked1 = n1->n & mask;
2202 masked2 = n2->n & mask;
2203 if (masked1 && masked2 && masked1 != masked2)
2204 return NULL;
2206 n->n = n1->n | n2->n;
2208 return source_stmt;
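/* A sketch of the merge, assuming a little-endian target and unsigned char
   loads (the exact intermediate promotions are omitted): for

     x = a[0] | (a[1] << 8);

   the two operands yield symbolic numbers with bytepos 0 and 1.  After the
   shift the second number carries its marker in byte 1; each non-zero
   marker of the number loaded at the higher address is then incremented by
   start_sub (1 here), and the OR produces n = 0x0201 with range 2, i.e. a
   native-endian 16-bit read that find_bswap_or_nop classifies as a nop.  */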
2211 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2212 the operation given by the rhs of STMT on the result. If the operation
2213 could be executed successfully, the function returns a gimple stmt whose
2214 rhs's first tree is the expression of the source operand, and NULL
2215 otherwise. */
2217 static gimple *
2218 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2220 enum tree_code code;
2221 tree rhs1, rhs2 = NULL;
2222 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2223 enum gimple_rhs_class rhs_class;
2225 if (!limit || !is_gimple_assign (stmt))
2226 return NULL;
2228 rhs1 = gimple_assign_rhs1 (stmt);
2230 if (find_bswap_or_nop_load (stmt, rhs1, n))
2231 return stmt;
2233 if (TREE_CODE (rhs1) != SSA_NAME)
2234 return NULL;
2236 code = gimple_assign_rhs_code (stmt);
2237 rhs_class = gimple_assign_rhs_class (stmt);
2238 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2240 if (rhs_class == GIMPLE_BINARY_RHS)
2241 rhs2 = gimple_assign_rhs2 (stmt);
2243 /* Handle unary rhs and binary rhs with integer constants as second
2244 operand. */
2246 if (rhs_class == GIMPLE_UNARY_RHS
2247 || (rhs_class == GIMPLE_BINARY_RHS
2248 && TREE_CODE (rhs2) == INTEGER_CST))
2250 if (code != BIT_AND_EXPR
2251 && code != LSHIFT_EXPR
2252 && code != RSHIFT_EXPR
2253 && code != LROTATE_EXPR
2254 && code != RROTATE_EXPR
2255 && !CONVERT_EXPR_CODE_P (code))
2256 return NULL;
2258 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2260 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2261 we have to initialize the symbolic number. */
2262 if (!source_stmt1)
2264 if (gimple_assign_load_p (stmt)
2265 || !init_symbolic_number (n, rhs1))
2266 return NULL;
2267 source_stmt1 = stmt;
2270 switch (code)
2272 case BIT_AND_EXPR:
2274 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2275 uint64_t val = int_cst_value (rhs2), mask = 0;
2276 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2278 /* Only constants masking full bytes are allowed. */
2279 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2280 if ((val & tmp) != 0 && (val & tmp) != tmp)
2281 return NULL;
2282 else if (val & tmp)
2283 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2285 n->n &= mask;
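	  /* For example, masking a 4-byte value with 0x00ff00ff keeps the
	     markers of bytes 0 and 2 and zeroes those of bytes 1 and 3,
	     whereas a constant such as 0x0fff is rejected by the loop above
	     because it covers byte 1 only partially.  */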
2287 break;
2288 case LSHIFT_EXPR:
2289 case RSHIFT_EXPR:
2290 case LROTATE_EXPR:
2291 case RROTATE_EXPR:
2292 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2293 return NULL;
2294 break;
2295 CASE_CONVERT:
2297 int i, type_size, old_type_size;
2298 tree type;
2300 type = gimple_expr_type (stmt);
2301 type_size = TYPE_PRECISION (type);
2302 if (type_size % BITS_PER_UNIT != 0)
2303 return NULL;
2304 type_size /= BITS_PER_UNIT;
2305 if (type_size > 64 / BITS_PER_MARKER)
2306 return NULL;
2308 /* Sign extension: result is dependent on the value. */
2309 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2310 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2311 && HEAD_MARKER (n->n, old_type_size))
2312 for (i = 0; i < type_size - old_type_size; i++)
2313 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2314 << ((type_size - 1 - i) * BITS_PER_MARKER);
2316 if (type_size < 64 / BITS_PER_MARKER)
2318 /* If STMT casts to a smaller type mask out the bits not
2319 belonging to the target type. */
2320 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2322 n->type = type;
2323 if (!n->base_addr)
2324 n->range = type_size;
2326 break;
2327 default:
2328 return NULL;
2330 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2333 /* Handle binary rhs. */
2335 if (rhs_class == GIMPLE_BINARY_RHS)
2337 struct symbolic_number n1, n2;
2338 gimple *source_stmt, *source_stmt2;
2340 if (code != BIT_IOR_EXPR)
2341 return NULL;
2343 if (TREE_CODE (rhs2) != SSA_NAME)
2344 return NULL;
2346 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2348 switch (code)
2350 case BIT_IOR_EXPR:
2351 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2353 if (!source_stmt1)
2354 return NULL;
2356 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2358 if (!source_stmt2)
2359 return NULL;
2361 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2362 return NULL;
2364 if (!n1.vuse != !n2.vuse
2365 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2366 return NULL;
2368 source_stmt
2369 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2371 if (!source_stmt)
2372 return NULL;
2374 if (!verify_symbolic_number_p (n, stmt))
2375 return NULL;
2377 break;
2378 default:
2379 return NULL;
2381 return source_stmt;
2383 return NULL;
2386 /* Check if STMT completes a bswap implementation or a read in a given
2387 endianness consisting of ORs, SHIFTs and ANDs, and set *BSWAP
2388 accordingly. It also sets N to represent the kind of operations
2389 performed: the size of the resulting expression and whether it works on
2390 a memory source, and if so its alias set and vuse. Finally, the
2391 function returns a stmt whose rhs's first tree is the source
2392 expression. */
2394 static gimple *
2395 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2397 /* The number which the find_bswap_or_nop_1 result should match in order
2398 to have a full byte swap. The number is shifted to the right
2399 according to the size of the symbolic number before using it. */
2400 uint64_t cmpxchg = CMPXCHG;
2401 uint64_t cmpnop = CMPNOP;
2403 gimple *source_stmt;
2404 int limit;
2406 /* The last parameter determines the search depth limit. It usually
2407 correlates directly to the number n of bytes to be touched. We
2408 increase that number by log2(n) + 1 here in order to also
2409 cover signed -> unsigned conversions of the src operand, as can be seen
2410 in libgcc, and an initial shift/and operation on the src operand. */
2411 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2412 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2413 source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2415 if (!source_stmt)
2416 return NULL;
2418 /* Find real size of result (highest non-zero byte). */
2419 if (n->base_addr)
2421 int rsize;
2422 uint64_t tmpn;
2424 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2425 n->range = rsize;
2428 /* Zero out the extra bits of N and CMP*. */
2429 if (n->range < (int) sizeof (int64_t))
2431 uint64_t mask;
2433 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2434 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2435 cmpnop &= mask;
2438 /* A complete byte swap should make the symbolic number start with
2439 the largest digit in the highest order byte. An unchanged symbolic
2440 number indicates a read with the same endianness as the target architecture. */
2441 if (n->n == cmpnop)
2442 *bswap = false;
2443 else if (n->n == cmpxchg)
2444 *bswap = true;
2445 else
2446 return NULL;
2448 /* The code performs useless bit manipulation: a nop without a load
2449 involved, so there is nothing to replace. */
2449 if (!n->base_addr && n->n == cmpnop)
2450 return NULL;
2452 n->range *= BITS_PER_UNIT;
2453 return source_stmt;
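/* As an illustration of what the walk above recognises, a manual 32-bit
   byte swap written as a plain C function such as

     uint32_t
     swap32 (uint32_t x)
     {
       return (x >> 24)
	      | ((x >> 8) & 0x0000ff00)
	      | ((x << 8) & 0x00ff0000)
	      | (x << 24);
     }

   yields a symbolic number equal to CMPXCHG and is reported with *BSWAP set
   to true, while combining bytes from memory in source order, e.g.
   a[0] | a[1] << 8 | a[2] << 16 | a[3] << 24 on a little-endian target,
   matches CMPNOP and is treated as a simple read in target endianness.  */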
2456 namespace {
2458 const pass_data pass_data_optimize_bswap =
2460 GIMPLE_PASS, /* type */
2461 "bswap", /* name */
2462 OPTGROUP_NONE, /* optinfo_flags */
2463 TV_NONE, /* tv_id */
2464 PROP_ssa, /* properties_required */
2465 0, /* properties_provided */
2466 0, /* properties_destroyed */
2467 0, /* todo_flags_start */
2468 0, /* todo_flags_finish */
2471 class pass_optimize_bswap : public gimple_opt_pass
2473 public:
2474 pass_optimize_bswap (gcc::context *ctxt)
2475 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2478 /* opt_pass methods: */
2479 virtual bool gate (function *)
2481 return flag_expensive_optimizations && optimize;
2484 virtual unsigned int execute (function *);
2486 }; // class pass_optimize_bswap
2488 /* Perform the bswap optimization: replace the expression computed in the rhs
2489 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2490 Which of these alternatives replaces the rhs is given by N->base_addr (non
2491 null if a load is needed) and BSWAP. The type, VUSE and alias set of the
2492 load to perform are also given in N, while the builtin bswap invocation is given
2493 in FNDECL. Finally, if a load is involved, SRC_STMT refers to one of the
2494 load statements involved in constructing the rhs of CUR_STMT and N->range gives
2495 the size of the rhs expression for maintaining some statistics.
2497 Note that if the replacement involves a load, CUR_STMT is moved just after
2498 SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2499 changing basic block. */
2501 static bool
2502 bswap_replace (gimple *cur_stmt, gimple *src_stmt, tree fndecl,
2503 tree bswap_type, tree load_type, struct symbolic_number *n,
2504 bool bswap)
2506 gimple_stmt_iterator gsi;
2507 tree src, tmp, tgt;
2508 gimple *bswap_stmt;
2510 gsi = gsi_for_stmt (cur_stmt);
2511 src = gimple_assign_rhs1 (src_stmt);
2512 tgt = gimple_assign_lhs (cur_stmt);
2514 /* Need to load the value from memory first. */
2515 if (n->base_addr)
2517 gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2518 tree addr_expr, addr_tmp, val_expr, val_tmp;
2519 tree load_offset_ptr, aligned_load_type;
2520 gimple *addr_stmt, *load_stmt;
2521 unsigned align;
2522 HOST_WIDE_INT load_offset = 0;
2524 align = get_object_alignment (src);
2525 /* If the new access is smaller than the original one, we need
2526 to perform a big-endian adjustment. */
2527 if (BYTES_BIG_ENDIAN)
2529 HOST_WIDE_INT bitsize, bitpos;
2530 machine_mode mode;
2531 int unsignedp, volatilep;
2532 tree offset;
2534 get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2535 &unsignedp, &volatilep, false);
2536 if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2538 load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2539 unsigned HOST_WIDE_INT l
2540 = (load_offset * BITS_PER_UNIT) & (align - 1);
2541 if (l)
2542 align = l & -l;
2546 if (bswap
2547 && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2548 && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2549 return false;
2551 /* Move cur_stmt just before one of the loads of the original
2552 expression to ensure it has the same VUSE. See PR61517 for what could
2553 go wrong. */
2554 gsi_move_before (&gsi, &gsi_ins);
2555 gsi = gsi_for_stmt (cur_stmt);
2557 /* Compute address to load from and cast according to the size
2558 of the load. */
2559 addr_expr = build_fold_addr_expr (unshare_expr (src));
2560 if (is_gimple_mem_ref_addr (addr_expr))
2561 addr_tmp = addr_expr;
2562 else
2564 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2565 "load_src");
2566 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2567 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2570 /* Perform the load. */
2571 aligned_load_type = load_type;
2572 if (align < TYPE_ALIGN (load_type))
2573 aligned_load_type = build_aligned_type (load_type, align);
2574 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2575 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2576 load_offset_ptr);
2578 if (!bswap)
2580 if (n->range == 16)
2581 nop_stats.found_16bit++;
2582 else if (n->range == 32)
2583 nop_stats.found_32bit++;
2584 else
2586 gcc_assert (n->range == 64);
2587 nop_stats.found_64bit++;
2590 /* Convert the result of load if necessary. */
2591 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2593 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2594 "load_dst");
2595 load_stmt = gimple_build_assign (val_tmp, val_expr);
2596 gimple_set_vuse (load_stmt, n->vuse);
2597 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2598 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2600 else
2602 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2603 gimple_set_vuse (cur_stmt, n->vuse);
2605 update_stmt (cur_stmt);
2607 if (dump_file)
2609 fprintf (dump_file,
2610 "%d bit load in target endianness found at: ",
2611 (int) n->range);
2612 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2614 return true;
2616 else
2618 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2619 load_stmt = gimple_build_assign (val_tmp, val_expr);
2620 gimple_set_vuse (load_stmt, n->vuse);
2621 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2623 src = val_tmp;
2626 if (n->range == 16)
2627 bswap_stats.found_16bit++;
2628 else if (n->range == 32)
2629 bswap_stats.found_32bit++;
2630 else
2632 gcc_assert (n->range == 64);
2633 bswap_stats.found_64bit++;
2636 tmp = src;
2638 /* Convert the src expression if necessary. */
2639 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2641 gimple *convert_stmt;
2643 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2644 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2645 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2648 /* Canonical form for a 16-bit bswap is a rotate expression. Only 16-bit values
2649 are handled this way, as rotating a 2N-bit value by N bits is generally not
2650 equivalent to a bswap. Consider for instance 0x01020304 r>> 16, which
2651 gives 0x03040102 while a bswap of that value gives 0x04030201. */
2652 if (bswap && n->range == 16)
2654 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2655 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2656 bswap_stmt = gimple_build_assign (NULL, src);
2658 else
2659 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2661 tmp = tgt;
2663 /* Convert the result if necessary. */
2664 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2666 gimple *convert_stmt;
2668 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2669 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2670 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2673 gimple_set_lhs (bswap_stmt, tmp);
2675 if (dump_file)
2677 fprintf (dump_file, "%d bit bswap implementation found at: ",
2678 (int) n->range);
2679 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2682 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2683 gsi_remove (&gsi, true);
2684 return true;
2687 /* Find manual byte swap implementations as well as loads in a given
2688 endianness. Byte swaps are turned into a bswap builtin invocation,
2689 while endian loads are converted to a bswap builtin invocation or a
2690 simple load according to the target endianness. */
2692 unsigned int
2693 pass_optimize_bswap::execute (function *fun)
2695 basic_block bb;
2696 bool bswap32_p, bswap64_p;
2697 bool changed = false;
2698 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2700 if (BITS_PER_UNIT != 8)
2701 return 0;
2703 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2704 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2705 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2706 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2707 || (bswap32_p && word_mode == SImode)));
2709 /* Determine the argument type of the builtins. The code later on
2710 assumes that the return and argument type are the same. */
2711 if (bswap32_p)
2713 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2714 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2717 if (bswap64_p)
2719 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2720 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2723 memset (&nop_stats, 0, sizeof (nop_stats));
2724 memset (&bswap_stats, 0, sizeof (bswap_stats));
2726 FOR_EACH_BB_FN (bb, fun)
2728 gimple_stmt_iterator gsi;
2730 /* We do a reverse scan for bswap patterns to make sure we get the
2731 widest match. As bswap pattern matching doesn't handle previously
2732 inserted smaller bswap replacements as sub-patterns, the wider
2733 variant wouldn't be detected. */
2734 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2736 gimple *src_stmt, *cur_stmt = gsi_stmt (gsi);
2737 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2738 enum tree_code code;
2739 struct symbolic_number n;
2740 bool bswap;
2742 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2743 might be moved to a different basic block by bswap_replace and gsi
2744 must not point to it in that case. Doing the gsi_prev here
2745 makes sure that gsi points to the statement previous to
2746 cur_stmt while still making sure that all statements in
2747 this basic block are considered. */
2748 gsi_prev (&gsi);
2750 if (!is_gimple_assign (cur_stmt))
2751 continue;
2753 code = gimple_assign_rhs_code (cur_stmt);
2754 switch (code)
2756 case LROTATE_EXPR:
2757 case RROTATE_EXPR:
2758 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2759 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2760 % BITS_PER_UNIT)
2761 continue;
2762 /* Fall through. */
2763 case BIT_IOR_EXPR:
2764 break;
2765 default:
2766 continue;
2769 src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2771 if (!src_stmt)
2772 continue;
2774 switch (n.range)
2776 case 16:
2777 /* Already in canonical form, nothing to do. */
2778 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2779 continue;
2780 load_type = bswap_type = uint16_type_node;
2781 break;
2782 case 32:
2783 load_type = uint32_type_node;
2784 if (bswap32_p)
2786 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2787 bswap_type = bswap32_type;
2789 break;
2790 case 64:
2791 load_type = uint64_type_node;
2792 if (bswap64_p)
2794 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2795 bswap_type = bswap64_type;
2797 break;
2798 default:
2799 continue;
2802 if (bswap && !fndecl && n.range != 16)
2803 continue;
2805 if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2806 &n, bswap))
2807 changed = true;
2811 statistics_counter_event (fun, "16-bit nop implementations found",
2812 nop_stats.found_16bit);
2813 statistics_counter_event (fun, "32-bit nop implementations found",
2814 nop_stats.found_32bit);
2815 statistics_counter_event (fun, "64-bit nop implementations found",
2816 nop_stats.found_64bit);
2817 statistics_counter_event (fun, "16-bit bswap implementations found",
2818 bswap_stats.found_16bit);
2819 statistics_counter_event (fun, "32-bit bswap implementations found",
2820 bswap_stats.found_32bit);
2821 statistics_counter_event (fun, "64-bit bswap implementations found",
2822 bswap_stats.found_64bit);
2824 return (changed ? TODO_update_ssa : 0);
2827 } // anon namespace
2829 gimple_opt_pass *
2830 make_pass_optimize_bswap (gcc::context *ctxt)
2832 return new pass_optimize_bswap (ctxt);
2835 /* Return true if stmt is a type conversion operation that can be stripped
2836 when used in a widening multiply operation. */
2837 static bool
2838 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2840 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2842 if (TREE_CODE (result_type) == INTEGER_TYPE)
2844 tree op_type;
2845 tree inner_op_type;
2847 if (!CONVERT_EXPR_CODE_P (rhs_code))
2848 return false;
2850 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2852 /* If the type of OP has the same precision as the result, then
2853 we can strip this conversion. The multiply operation will be
2854 selected to create the correct extension as a by-product. */
2855 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2856 return true;
2858 /* We can also strip a conversion if it preserves the signed-ness of
2859 the operation and doesn't narrow the range. */
2860 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2862 /* If the inner-most type is unsigned, then we can strip any
2863 intermediate widening operation. If it's signed, then the
2864 intermediate widening operation must also be signed. */
2865 if ((TYPE_UNSIGNED (inner_op_type)
2866 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2867 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2868 return true;
2870 return false;
2873 return rhs_code == FIXED_CONVERT_EXPR;
2876 /* Return true if RHS is a suitable operand for a widening multiplication,
2877 assuming a target type of TYPE.
2878 There are two cases:
2880 - RHS makes some value at least twice as wide. Store that value
2881 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2883 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2884 but leave *TYPE_OUT untouched. */
2886 static bool
2887 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2888 tree *new_rhs_out)
2890 gimple *stmt;
2891 tree type1, rhs1;
2893 if (TREE_CODE (rhs) == SSA_NAME)
2895 stmt = SSA_NAME_DEF_STMT (rhs);
2896 if (is_gimple_assign (stmt))
2898 if (! widening_mult_conversion_strippable_p (type, stmt))
2899 rhs1 = rhs;
2900 else
2902 rhs1 = gimple_assign_rhs1 (stmt);
2904 if (TREE_CODE (rhs1) == INTEGER_CST)
2906 *new_rhs_out = rhs1;
2907 *type_out = NULL;
2908 return true;
2912 else
2913 rhs1 = rhs;
2915 type1 = TREE_TYPE (rhs1);
2917 if (TREE_CODE (type1) != TREE_CODE (type)
2918 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2919 return false;
2921 *new_rhs_out = rhs1;
2922 *type_out = type1;
2923 return true;
2926 if (TREE_CODE (rhs) == INTEGER_CST)
2928 *new_rhs_out = rhs;
2929 *type_out = NULL;
2930 return true;
2933 return false;
2936 /* Return true if STMT performs a widening multiplication, assuming the
2937 output type is TYPE. If so, store the unwidened types of the operands
2938 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2939 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2940 and *TYPE2_OUT would give the operands of the multiplication. */
2942 static bool
2943 is_widening_mult_p (gimple *stmt,
2944 tree *type1_out, tree *rhs1_out,
2945 tree *type2_out, tree *rhs2_out)
2947 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2949 if (TREE_CODE (type) != INTEGER_TYPE
2950 && TREE_CODE (type) != FIXED_POINT_TYPE)
2951 return false;
2953 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2954 rhs1_out))
2955 return false;
2957 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2958 rhs2_out))
2959 return false;
2961 if (*type1_out == NULL)
2963 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2964 return false;
2965 *type1_out = *type2_out;
2968 if (*type2_out == NULL)
2970 if (!int_fits_type_p (*rhs2_out, *type1_out))
2971 return false;
2972 *type2_out = *type1_out;
2975 /* Ensure that the larger of the two operands comes first. */
2976 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2978 std::swap (*type1_out, *type2_out);
2979 std::swap (*rhs1_out, *rhs2_out);
2982 return true;
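/* For example, with 32-bit int operands feeding a 64-bit result,

     long long
     widen_mul (int a, int b)
     {
       return (long long) a * (long long) b;
     }

   both conversions are strippable, *TYPE1_OUT and *TYPE2_OUT are int, and
   the MULT_EXPR is a candidate for WIDEN_MULT_EXPR (assuming the usual
   ILP32/LP64 type sizes; the function is purely illustrative).  */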
2985 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2986 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2987 value is true iff we converted the statement. */
2989 static bool
2990 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2992 tree lhs, rhs1, rhs2, type, type1, type2;
2993 enum insn_code handler;
2994 machine_mode to_mode, from_mode, actual_mode;
2995 optab op;
2996 int actual_precision;
2997 location_t loc = gimple_location (stmt);
2998 bool from_unsigned1, from_unsigned2;
3000 lhs = gimple_assign_lhs (stmt);
3001 type = TREE_TYPE (lhs);
3002 if (TREE_CODE (type) != INTEGER_TYPE)
3003 return false;
3005 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3006 return false;
3008 to_mode = TYPE_MODE (type);
3009 from_mode = TYPE_MODE (type1);
3010 from_unsigned1 = TYPE_UNSIGNED (type1);
3011 from_unsigned2 = TYPE_UNSIGNED (type2);
3013 if (from_unsigned1 && from_unsigned2)
3014 op = umul_widen_optab;
3015 else if (!from_unsigned1 && !from_unsigned2)
3016 op = smul_widen_optab;
3017 else
3018 op = usmul_widen_optab;
3020 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3021 0, &actual_mode);
3023 if (handler == CODE_FOR_nothing)
3025 if (op != smul_widen_optab)
3027 /* We can use a signed multiply with unsigned types as long as
3028 there is a wider mode to use, or it is the smaller of the two
3029 types that is unsigned. Note that type1 >= type2, always. */
3030 if ((TYPE_UNSIGNED (type1)
3031 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3032 || (TYPE_UNSIGNED (type2)
3033 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3035 from_mode = GET_MODE_WIDER_MODE (from_mode);
3036 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3037 return false;
3040 op = smul_widen_optab;
3041 handler = find_widening_optab_handler_and_mode (op, to_mode,
3042 from_mode, 0,
3043 &actual_mode);
3045 if (handler == CODE_FOR_nothing)
3046 return false;
3048 from_unsigned1 = from_unsigned2 = false;
3050 else
3051 return false;
3054 /* Ensure that the inputs to the handler are in the correct precision
3055 for the opcode. This will be the full mode size. */
3056 actual_precision = GET_MODE_PRECISION (actual_mode);
3057 if (2 * actual_precision > TYPE_PRECISION (type))
3058 return false;
3059 if (actual_precision != TYPE_PRECISION (type1)
3060 || from_unsigned1 != TYPE_UNSIGNED (type1))
3061 rhs1 = build_and_insert_cast (gsi, loc,
3062 build_nonstandard_integer_type
3063 (actual_precision, from_unsigned1), rhs1);
3064 if (actual_precision != TYPE_PRECISION (type2)
3065 || from_unsigned2 != TYPE_UNSIGNED (type2))
3066 rhs2 = build_and_insert_cast (gsi, loc,
3067 build_nonstandard_integer_type
3068 (actual_precision, from_unsigned2), rhs2);
3070 /* Handle constants. */
3071 if (TREE_CODE (rhs1) == INTEGER_CST)
3072 rhs1 = fold_convert (type1, rhs1);
3073 if (TREE_CODE (rhs2) == INTEGER_CST)
3074 rhs2 = fold_convert (type2, rhs2);
3076 gimple_assign_set_rhs1 (stmt, rhs1);
3077 gimple_assign_set_rhs2 (stmt, rhs2);
3078 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3079 update_stmt (stmt);
3080 widen_mul_stats.widen_mults_inserted++;
3081 return true;
3084 /* Process a single gimple statement STMT, which is found at the
3085 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
3086 rhs (given by CODE), and try to convert it into a
3087 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3088 is true iff we converted the statement. */
3090 static bool
3091 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3092 enum tree_code code)
3094 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3095 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3096 tree type, type1, type2, optype;
3097 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3098 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3099 optab this_optab;
3100 enum tree_code wmult_code;
3101 enum insn_code handler;
3102 machine_mode to_mode, from_mode, actual_mode;
3103 location_t loc = gimple_location (stmt);
3104 int actual_precision;
3105 bool from_unsigned1, from_unsigned2;
3107 lhs = gimple_assign_lhs (stmt);
3108 type = TREE_TYPE (lhs);
3109 if (TREE_CODE (type) != INTEGER_TYPE
3110 && TREE_CODE (type) != FIXED_POINT_TYPE)
3111 return false;
3113 if (code == MINUS_EXPR)
3114 wmult_code = WIDEN_MULT_MINUS_EXPR;
3115 else
3116 wmult_code = WIDEN_MULT_PLUS_EXPR;
3118 rhs1 = gimple_assign_rhs1 (stmt);
3119 rhs2 = gimple_assign_rhs2 (stmt);
3121 if (TREE_CODE (rhs1) == SSA_NAME)
3123 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3124 if (is_gimple_assign (rhs1_stmt))
3125 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3128 if (TREE_CODE (rhs2) == SSA_NAME)
3130 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3131 if (is_gimple_assign (rhs2_stmt))
3132 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3135 /* Allow for one conversion statement between the multiply
3136 and addition/subtraction statement. If there is more than
3137 one conversion then we assume they would invalidate this
3138 transformation. If that's not the case then they should have
3139 been folded before now. */
3140 if (CONVERT_EXPR_CODE_P (rhs1_code))
3142 conv1_stmt = rhs1_stmt;
3143 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3144 if (TREE_CODE (rhs1) == SSA_NAME)
3146 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3147 if (is_gimple_assign (rhs1_stmt))
3148 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3150 else
3151 return false;
3153 if (CONVERT_EXPR_CODE_P (rhs2_code))
3155 conv2_stmt = rhs2_stmt;
3156 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3157 if (TREE_CODE (rhs2) == SSA_NAME)
3159 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3160 if (is_gimple_assign (rhs2_stmt))
3161 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3163 else
3164 return false;
3167 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3168 is_widening_mult_p, but we still need the rhs values it returns.
3170 It might also appear that it would be sufficient to use the existing
3171 operands of the widening multiply, but that would limit the choice of
3172 multiply-and-accumulate instructions.
3174 If the widened-multiplication result has more than one use, it is
3175 probably wiser not to do the conversion. */
3176 if (code == PLUS_EXPR
3177 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3179 if (!has_single_use (rhs1)
3180 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3181 &type2, &mult_rhs2))
3182 return false;
3183 add_rhs = rhs2;
3184 conv_stmt = conv1_stmt;
3186 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3188 if (!has_single_use (rhs2)
3189 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3190 &type2, &mult_rhs2))
3191 return false;
3192 add_rhs = rhs1;
3193 conv_stmt = conv2_stmt;
3195 else
3196 return false;
3198 to_mode = TYPE_MODE (type);
3199 from_mode = TYPE_MODE (type1);
3200 from_unsigned1 = TYPE_UNSIGNED (type1);
3201 from_unsigned2 = TYPE_UNSIGNED (type2);
3202 optype = type1;
3204 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3205 if (from_unsigned1 != from_unsigned2)
3207 if (!INTEGRAL_TYPE_P (type))
3208 return false;
3209 /* We can use a signed multiply with unsigned types as long as
3210 there is a wider mode to use, or it is the smaller of the two
3211 types that is unsigned. Note that type1 >= type2, always. */
3212 if ((from_unsigned1
3213 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3214 || (from_unsigned2
3215 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3217 from_mode = GET_MODE_WIDER_MODE (from_mode);
3218 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3219 return false;
3222 from_unsigned1 = from_unsigned2 = false;
3223 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3224 false);
3227 /* If there was a conversion between the multiply and addition
3228 then we need to make sure it fits a multiply-and-accumulate.
3229 There should be a single mode change which does not change the
3230 value. */
3231 if (conv_stmt)
3233 /* We use the original, unmodified data types for this. */
3234 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3235 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3236 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3237 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3239 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3241 /* Conversion is a truncate. */
3242 if (TYPE_PRECISION (to_type) < data_size)
3243 return false;
3245 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3247 /* Conversion is an extend. Check it's the right sort. */
3248 if (TYPE_UNSIGNED (from_type) != is_unsigned
3249 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3250 return false;
3252 /* else convert is a no-op for our purposes. */
3255 /* Verify that the machine can perform a widening multiply
3256 accumulate in this mode/signedness combination, otherwise
3257 this transformation is likely to pessimize code. */
3258 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3259 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3260 from_mode, 0, &actual_mode);
3262 if (handler == CODE_FOR_nothing)
3263 return false;
3265 /* Ensure that the inputs to the handler are in the correct precision
3266 for the opcode. This will be the full mode size. */
3267 actual_precision = GET_MODE_PRECISION (actual_mode);
3268 if (actual_precision != TYPE_PRECISION (type1)
3269 || from_unsigned1 != TYPE_UNSIGNED (type1))
3270 mult_rhs1 = build_and_insert_cast (gsi, loc,
3271 build_nonstandard_integer_type
3272 (actual_precision, from_unsigned1),
3273 mult_rhs1);
3274 if (actual_precision != TYPE_PRECISION (type2)
3275 || from_unsigned2 != TYPE_UNSIGNED (type2))
3276 mult_rhs2 = build_and_insert_cast (gsi, loc,
3277 build_nonstandard_integer_type
3278 (actual_precision, from_unsigned2),
3279 mult_rhs2);
3281 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3282 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3284 /* Handle constants. */
3285 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3286 mult_rhs1 = fold_convert (type1, mult_rhs1);
3287 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3288 mult_rhs2 = fold_convert (type2, mult_rhs2);
3290 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3291 add_rhs);
3292 update_stmt (gsi_stmt (*gsi));
3293 widen_mul_stats.maccs_inserted++;
3294 return true;
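/* For example, on a target providing a widening multiply-accumulate
   pattern, an accumulation such as (illustrative function)

     long long
     macc (long long acc, int a, int b)
     {
       return acc + (long long) a * b;
     }

   is rewritten to use WIDEN_MULT_PLUS_EXPR, with the analogous MINUS_EXPR
   form mapping to WIDEN_MULT_MINUS_EXPR.  */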
3297 /* Combine the multiplication at MUL_STMT with operands OP1 and OP2
3298 with uses in additions and subtractions to form fused multiply-add
3299 operations. Returns true if successful and MUL_STMT should be removed. */
3301 static bool
3302 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3304 tree mul_result = gimple_get_lhs (mul_stmt);
3305 tree type = TREE_TYPE (mul_result);
3306 gimple *use_stmt, *neguse_stmt;
3307 gassign *fma_stmt;
3308 use_operand_p use_p;
3309 imm_use_iterator imm_iter;
3311 if (FLOAT_TYPE_P (type)
3312 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3313 return false;
3315 /* We don't want to do bitfield reduction ops. */
3316 if (INTEGRAL_TYPE_P (type)
3317 && (TYPE_PRECISION (type)
3318 != GET_MODE_PRECISION (TYPE_MODE (type))))
3319 return false;
3321 /* If the target doesn't support it, don't generate it. We assume that
3322 if fma isn't available then fms, fnma or fnms are not either. */
3323 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3324 return false;
3326 /* If the multiplication has zero uses, it is kept around probably because
3327 of -fnon-call-exceptions. Don't optimize it away in that case,
3328 it is DCE's job. */
3329 if (has_zero_uses (mul_result))
3330 return false;
3332 /* Make sure that the multiplication statement becomes dead after
3333 the transformation, i.e. that all uses are transformed to FMAs.
3334 This means we assume that an FMA operation has the same cost
3335 as an addition. */
3336 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3338 enum tree_code use_code;
3339 tree result = mul_result;
3340 bool negate_p = false;
3342 use_stmt = USE_STMT (use_p);
3344 if (is_gimple_debug (use_stmt))
3345 continue;
3347 /* For now restrict these operations to single basic blocks. In theory
3348 we would want to support sinking the multiplication in
3349 m = a*b;
3350 if ()
3351 ma = m + c;
3352 else
3353 d = m;
3354 to form a fma in the then block and sink the multiplication to the
3355 else block. */
3356 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3357 return false;
3359 if (!is_gimple_assign (use_stmt))
3360 return false;
3362 use_code = gimple_assign_rhs_code (use_stmt);
3364 /* A negate on the multiplication leads to FNMA. */
3365 if (use_code == NEGATE_EXPR)
3367 ssa_op_iter iter;
3368 use_operand_p usep;
3370 result = gimple_assign_lhs (use_stmt);
3372 /* Make sure the negate statement becomes dead with this
3373 single transformation. */
3374 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3375 &use_p, &neguse_stmt))
3376 return false;
3378 /* Make sure the multiplication isn't also used on that stmt. */
3379 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3380 if (USE_FROM_PTR (usep) == mul_result)
3381 return false;
3383 /* Re-validate. */
3384 use_stmt = neguse_stmt;
3385 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3386 return false;
3387 if (!is_gimple_assign (use_stmt))
3388 return false;
3390 use_code = gimple_assign_rhs_code (use_stmt);
3391 negate_p = true;
3394 switch (use_code)
3396 case MINUS_EXPR:
3397 if (gimple_assign_rhs2 (use_stmt) == result)
3398 negate_p = !negate_p;
3399 break;
3400 case PLUS_EXPR:
3401 break;
3402 default:
3403 /* FMA can only be formed from PLUS and MINUS. */
3404 return false;
3407 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3408 by a MULT_EXPR that we'll visit later, we might be able to
3409 get a more profitable match with fnma.
3410 OTOH, if we don't, a negate / fma pair has likely lower latency
3411 than a mult / subtract pair. */
3412 if (use_code == MINUS_EXPR && !negate_p
3413 && gimple_assign_rhs1 (use_stmt) == result
3414 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3415 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3417 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3419 if (TREE_CODE (rhs2) == SSA_NAME)
3421 gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3422 if (has_single_use (rhs2)
3423 && is_gimple_assign (stmt2)
3424 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3425 return false;
3429 /* We can't handle a * b + a * b. */
3430 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3431 return false;
3433 /* While it is possible to validate whether or not the exact form
3434 that we've recognized is available in the backend, the assumption
3435 is that the transformation is never a loss. For instance, suppose
3436 the target only has the plain FMA pattern available. Consider
3437 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3438 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3439 still have 3 operations, but in the FMA form the two NEGs are
3440 independent and could be run in parallel. */
3443 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3445 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3446 enum tree_code use_code;
3447 tree addop, mulop1 = op1, result = mul_result;
3448 bool negate_p = false;
3450 if (is_gimple_debug (use_stmt))
3451 continue;
3453 use_code = gimple_assign_rhs_code (use_stmt);
3454 if (use_code == NEGATE_EXPR)
3456 result = gimple_assign_lhs (use_stmt);
3457 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3458 gsi_remove (&gsi, true);
3459 release_defs (use_stmt);
3461 use_stmt = neguse_stmt;
3462 gsi = gsi_for_stmt (use_stmt);
3463 use_code = gimple_assign_rhs_code (use_stmt);
3464 negate_p = true;
3467 if (gimple_assign_rhs1 (use_stmt) == result)
3469 addop = gimple_assign_rhs2 (use_stmt);
3470 /* a * b - c -> a * b + (-c) */
3471 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3472 addop = force_gimple_operand_gsi (&gsi,
3473 build1 (NEGATE_EXPR,
3474 type, addop),
3475 true, NULL_TREE, true,
3476 GSI_SAME_STMT);
3478 else
3480 addop = gimple_assign_rhs1 (use_stmt);
3481 /* a - b * c -> (-b) * c + a */
3482 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3483 negate_p = !negate_p;
3486 if (negate_p)
3487 mulop1 = force_gimple_operand_gsi (&gsi,
3488 build1 (NEGATE_EXPR,
3489 type, mulop1),
3490 true, NULL_TREE, true,
3491 GSI_SAME_STMT);
3493 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3494 FMA_EXPR, mulop1, op2, addop);
3495 gsi_replace (&gsi, fma_stmt, true);
3496 widen_mul_stats.fmas_inserted++;
3499 return true;
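/* For example, with FP contraction enabled and an fma pattern available for
   the mode, an expression such as (illustrative function)

     double
     fmadd (double a, double b, double c)
     {
       return a * b + c;
     }

   becomes a single FMA_EXPR (a, b, c); a * b - c becomes FMA (a, b, -c) and
   -(a * b) + c becomes FMA (-a, b, c), matching the negate handling
   above.  */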
3502 /* Find integer multiplications where the operands are extended from
3503 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3504 where appropriate. */
3506 namespace {
3508 const pass_data pass_data_optimize_widening_mul =
3510 GIMPLE_PASS, /* type */
3511 "widening_mul", /* name */
3512 OPTGROUP_NONE, /* optinfo_flags */
3513 TV_NONE, /* tv_id */
3514 PROP_ssa, /* properties_required */
3515 0, /* properties_provided */
3516 0, /* properties_destroyed */
3517 0, /* todo_flags_start */
3518 TODO_update_ssa, /* todo_flags_finish */
3521 class pass_optimize_widening_mul : public gimple_opt_pass
3523 public:
3524 pass_optimize_widening_mul (gcc::context *ctxt)
3525 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3528 /* opt_pass methods: */
3529 virtual bool gate (function *)
3531 return flag_expensive_optimizations && optimize;
3534 virtual unsigned int execute (function *);
3536 }; // class pass_optimize_widening_mul
3538 unsigned int
3539 pass_optimize_widening_mul::execute (function *fun)
3541 basic_block bb;
3542 bool cfg_changed = false;
3544 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3546 FOR_EACH_BB_FN (bb, fun)
3548 gimple_stmt_iterator gsi;
3550 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3552 gimple *stmt = gsi_stmt (gsi);
3553 enum tree_code code;
3555 if (is_gimple_assign (stmt))
3557 code = gimple_assign_rhs_code (stmt);
3558 switch (code)
3560 case MULT_EXPR:
3561 if (!convert_mult_to_widen (stmt, &gsi)
3562 && convert_mult_to_fma (stmt,
3563 gimple_assign_rhs1 (stmt),
3564 gimple_assign_rhs2 (stmt)))
3566 gsi_remove (&gsi, true);
3567 release_defs (stmt);
3568 continue;
3570 break;
3572 case PLUS_EXPR:
3573 case MINUS_EXPR:
3574 convert_plusminus_to_widen (&gsi, stmt, code);
3575 break;
3577 default:;
3580 else if (is_gimple_call (stmt)
3581 && gimple_call_lhs (stmt))
3583 tree fndecl = gimple_call_fndecl (stmt);
3584 if (fndecl
3585 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3587 switch (DECL_FUNCTION_CODE (fndecl))
3589 case BUILT_IN_POWF:
3590 case BUILT_IN_POW:
3591 case BUILT_IN_POWL:
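	      /* pow (x, 2.0) is just x * x, so if that square feeds an
		 addition we can form an FMA directly from the call, e.g.
		 pow (a, 2.0) + c becomes FMA (a, a, c).  */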
3592 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3593 && REAL_VALUES_EQUAL
3594 (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3595 dconst2)
3596 && convert_mult_to_fma (stmt,
3597 gimple_call_arg (stmt, 0),
3598 gimple_call_arg (stmt, 0)))
3600 unlink_stmt_vdef (stmt);
3601 if (gsi_remove (&gsi, true)
3602 && gimple_purge_dead_eh_edges (bb))
3603 cfg_changed = true;
3604 release_defs (stmt);
3605 continue;
3607 break;
3609 default:;
3613 gsi_next (&gsi);
3617 statistics_counter_event (fun, "widening multiplications inserted",
3618 widen_mul_stats.widen_mults_inserted);
3619 statistics_counter_event (fun, "widening maccs inserted",
3620 widen_mul_stats.maccs_inserted);
3621 statistics_counter_event (fun, "fused multiply-adds inserted",
3622 widen_mul_stats.fmas_inserted);
3624 return cfg_changed ? TODO_cleanup_cfg : 0;
3627 } // anon namespace
3629 gimple_opt_pass *
3630 make_pass_optimize_widening_mul (gcc::context *ctxt)
3632 return new pass_optimize_widening_mul (ctxt);