2005-04-29 Jim Tison <jtison@us.ibm.com>
[official-gcc.git] / gcc / modulo-sched.c
blob2dffc3170a116f88c9dca2420e2022c8a60c043d
1 /* Swing Modulo Scheduling implementation.
2 Copyright (C) 2004, 2005
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
21 02111-1307, USA. */
24 #include "config.h"
25 #include "system.h"
26 #include "coretypes.h"
27 #include "tm.h"
28 #include "toplev.h"
29 #include "rtl.h"
30 #include "tm_p.h"
31 #include "hard-reg-set.h"
32 #include "regs.h"
33 #include "function.h"
34 #include "flags.h"
35 #include "insn-config.h"
36 #include "insn-attr.h"
37 #include "except.h"
38 #include "toplev.h"
39 #include "recog.h"
40 #include "sched-int.h"
41 #include "target.h"
42 #include "cfglayout.h"
43 #include "cfgloop.h"
44 #include "cfghooks.h"
45 #include "expr.h"
46 #include "params.h"
47 #include "gcov-io.h"
48 #include "df.h"
49 #include "ddg.h"
51 #ifdef INSN_SCHEDULING
53 /* This file contains the implementation of the Swing Modulo Scheduler,
54 described in the following references:
55 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
56 Lifetime--sensitive modulo scheduling in a production environment.
57 IEEE Trans. on Comps., 50(3), March 2001
58 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
59 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
60 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
62 The basic structure is:
63 1. Build a data-dependence graph (DDG) for each loop.
64 2. Use the DDG to order the insns of a loop (not in topological order
65 necessarily, but rather) trying to place each insn after all its
66 predecessors _or_ after all its successors.
67 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
68 4. Use the ordering to perform list-scheduling of the loop:
69 1. Set II = MII. We will try to schedule the loop within II cycles.
70 2. Try to schedule the insns one by one according to the ordering.
71 For each insn compute an interval of cycles by considering already-
72 scheduled preds and succs (and associated latencies); try to place
73 the insn in the cycles of this window checking for potential
74 resource conflicts (using the DFA interface).
75 Note: this is different from the cycle-scheduling of schedule_insns;
76 here the insns are not scheduled monotonically top-down (nor bottom-
77 up).
78 3. If failed in scheduling all insns - bump II++ and try again, unless
79 II reaches an upper bound MaxII, in which case report failure.
80 5. If we succeeded in scheduling the loop within II cycles, we now
81 generate prolog and epilog, decrease the counter of the loop, and
82 perform modulo variable expansion for live ranges that span more than
83 II cycles (i.e. use register copies to prevent a def from overwriting
84 itself before reaching the use).
88 /* This page defines partial-schedule structures and functions for
89 modulo scheduling. */
91 typedef struct partial_schedule *partial_schedule_ptr;
92 typedef struct ps_insn *ps_insn_ptr;
94 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
95 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
97 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
98 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
100 /* Perform signed modulo, always returning a non-negative value. */
/* NOTE: X and Y are each evaluated more than once; do not pass
   expressions with side effects.  */
101 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
103 /* The number of different iterations the nodes in ps span, assuming
104 the stage boundaries are placed efficiently. */
/* I.e. the ceiling of (cycle span) / ii, computed with the usual
   (a + b - 1) / b idiom.  */
105 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
106 + 1 + (ps)->ii - 1) / (ps)->ii)
108 /* A single instruction in the partial schedule. */
109 struct ps_insn
111 /* The corresponding DDG_NODE. */
112 ddg_node_ptr node;
114 /* The (absolute) cycle in which the PS instruction is scheduled.
115 Same as SCHED_TIME (node). */
116 int cycle;
118 /* The next/prev PS_INSN in the same row. */
119 ps_insn_ptr next_in_row,
120 prev_in_row;
122 /* The number of nodes in the same row that come after this node. */
/* NOTE(review): presumably compared against issue_rate when inserting
   insns into a row -- the users of this field are not visible in this
   chunk; confirm against the ps_insn insertion code.  */
123 int row_rest_count;
126 /* Holds the partial schedule as an array of II rows. Each entry of the
127 array points to a linked list of PS_INSNs, which represents the
128 instructions that are scheduled for that row. */
129 struct partial_schedule
131 int ii; /* Number of rows in the partial schedule. */
132 int history; /* Threshold for conflict checking using DFA. */
134 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
/* An insn scheduled in absolute cycle C presumably lives in row
   SMODULO (C, ii) -- confirm against the insertion code (not in this
   chunk).  */
135 ps_insn_ptr *rows;
137 /* The earliest absolute cycle of an insn in the partial schedule. */
138 int min_cycle;
140 /* The latest absolute cycle of an insn in the partial schedule. */
141 int max_cycle;
143 ddg_ptr g; /* The DDG of the insns in the partial schedule. */
146 /* We use this to record all the register replacements we do in
147 the kernel so we can undo SMS if it is not profitable. */
148 struct undo_replace_buff_elem
/* The insn in which ORIG_REG was replaced by NEW_REG.  */
150 rtx insn;
/* The register originally used by INSN.  */
151 rtx orig_reg;
/* The register-move copy substituted for ORIG_REG.  */
152 rtx new_reg;
/* Singly-linked list; new elements are pushed at the head by
   generate_reg_moves and walked by undo/free routines.  */
153 struct undo_replace_buff_elem *next;
/* Forward declarations of the partial-schedule API; the corresponding
   definitions are later in this file (outside this chunk).  */
158 partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
159 void free_partial_schedule (partial_schedule_ptr);
160 void reset_partial_schedule (partial_schedule_ptr, int new_ii);
161 void print_partial_schedule (partial_schedule_ptr, FILE *);
162 static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
163 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
164 ddg_node_ptr node, int cycle,
165 sbitmap must_precede,
166 sbitmap must_follow);
167 static void rotate_partial_schedule (partial_schedule_ptr, int);
168 void set_row_column_for_ps (partial_schedule_ptr);
169 static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr );
172 /* This page defines constants and structures for the modulo scheduling
173 driver. */
175 /* As in haifa-sched.c: */
176 /* issue_rate is the number of insns that can be scheduled in the same
177 machine cycle. It can be defined in the config/mach/mach.h file,
178 otherwise we set it to 1. */
180 static int issue_rate;
182 /* For printing statistics. */
/* NOTE: sms_schedule simply sets this to its dump_file argument, so in
   practice stats_file and the dump file are the same stream.  */
183 static FILE *stats_file;
/* Forward declarations of the driver helpers defined in this file.  */
185 static int sms_order_nodes (ddg_ptr, int, int * result);
186 static void set_node_sched_params (ddg_ptr);
187 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int,
188 int *, FILE*);
189 static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
190 static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
191 static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
192 int from_stage, int to_stage,
193 int is_prolog);
/* Accessors for the node_sched_params record that set_node_sched_params
   attaches to each ddg_node X through its aux.info field.  Note that ->
   binds tighter than the cast, so the cast applies to (x)->aux.info,
   not to x itself.  */
195 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
196 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
197 #define SCHED_FIRST_REG_MOVE(x) \
198 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
199 #define SCHED_NREG_MOVES(x) \
200 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
201 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
202 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
203 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
205 /* The scheduling parameters held for each node. */
/* Accessed through the SCHED_* macros above via the node's aux.info
   pointer; one record per DDG node is allocated by
   set_node_sched_params.  */
206 typedef struct node_sched_params
208 int asap; /* A lower-bound on the absolute scheduling cycle. */
209 int time; /* The absolute scheduling cycle (time >= asap). */
211 /* The following field (first_reg_move) is a pointer to the first
212 register-move instruction added to handle the modulo-variable-expansion
213 of the register defined by this node. This register-move copies the
214 original register defined by the node. */
215 rtx first_reg_move;
217 /* The number of register-move instructions added, immediately preceding
218 first_reg_move. */
219 int nreg_moves;
221 int row; /* Holds time % ii. */
222 int stage; /* Holds time / ii. */
224 /* The column of a node inside the ps. If nodes u, v are on the same row,
225 u will precede v if column (u) < column (v). */
226 int column;
227 } *node_sched_params_ptr;
230 /* The following three functions are copied from the current scheduler
231 code in order to use sched_analyze() for computing the dependencies.
232 They are used when initializing the sched_info structure. */
233 static const char *
234 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
236 static char tmp[80];
238 sprintf (tmp, "i%4d", INSN_UID (insn));
239 return tmp;
242 static int
243 contributes_to_priority (rtx next, rtx insn)
245 return BLOCK_NUM (next) == BLOCK_NUM (insn);
/* Callback required by the sched_info interface.  SMS adds no extra
   register dependencies for jumps, so the body is intentionally
   empty.  */
248 static void
249 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
250 regset cond_exec ATTRIBUTE_UNUSED,
251 regset used ATTRIBUTE_UNUSED,
252 regset set ATTRIBUTE_UNUSED)
/* Scheduler description used only so sched_analyze can be driven to
   compute dependencies; the list-scheduling callbacks SMS does not need
   are left NULL.  The initializer order must match struct sched_info in
   sched-int.h.  */
256 static struct sched_info sms_sched_info =
258 NULL,
259 NULL,
260 NULL,
261 NULL,
262 NULL,
263 sms_print_insn,
264 contributes_to_priority,
265 compute_jump_reg_dependencies,
266 NULL, NULL,
267 NULL, NULL,
268 0, 0, 0
272 /* Return the register decremented and tested or zero if it is not a decrement
273 and branch jump insn (similar to doloop_condition_get). */
274 static rtx
275 doloop_register_get (rtx insn, rtx *comp)
277 rtx pattern, cmp, inc, reg, condition;
279 if (!JUMP_P (insn))
280 return NULL_RTX;
281 pattern = PATTERN (insn);
283 /* The canonical doloop pattern we expect is:
285 (parallel [(set (pc) (if_then_else (condition)
286 (label_ref (label))
287 (pc)))
288 (set (reg) (plus (reg) (const_int -1)))
289 (additional clobbers and uses)])
291 where condition is further restricted to be
292 (ne (reg) (const_int 1)). */
294 if (GET_CODE (pattern) != PARALLEL)
295 return NULL_RTX;
297 cmp = XVECEXP (pattern, 0, 0);
298 inc = XVECEXP (pattern, 0, 1);
299 /* Return the compare rtx. */
300 *comp = cmp;
302 /* Check for (set (reg) (something)). */
303 if (GET_CODE (inc) != SET || ! REG_P (SET_DEST (inc)))
304 return NULL_RTX;
306 /* Extract loop counter register. */
307 reg = SET_DEST (inc);
309 /* Check if something = (plus (reg) (const_int -1)). */
310 if (GET_CODE (SET_SRC (inc)) != PLUS
311 || XEXP (SET_SRC (inc), 0) != reg
312 || XEXP (SET_SRC (inc), 1) != constm1_rtx)
313 return NULL_RTX;
315 /* Check for (set (pc) (if_then_else (condition)
316 (label_ref (label))
317 (pc))). */
318 if (GET_CODE (cmp) != SET
319 || SET_DEST (cmp) != pc_rtx
320 || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
321 || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
322 || XEXP (SET_SRC (cmp), 2) != pc_rtx)
323 return NULL_RTX;
325 /* Extract loop termination condition. */
326 condition = XEXP (SET_SRC (cmp), 0);
328 /* Check if condition = (ne (reg) (const_int 1)), which is more
329 restrictive than the check in doloop_condition_get:
330 if ((GET_CODE (condition) != GE && GET_CODE (condition) != NE)
331 || GET_CODE (XEXP (condition, 1)) != CONST_INT). */
332 if (GET_CODE (condition) != NE
333 || XEXP (condition, 1) != const1_rtx)
334 return NULL_RTX;
336 if (XEXP (condition, 0) == reg)
337 return reg;
339 return NULL_RTX;
342 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
343 that the number of iterations is a compile-time constant. If so,
344 return the rtx that sets COUNT_REG to a constant, and set COUNT to
345 this constant. Otherwise return 0. */
346 static rtx
347 const_iteration_count (rtx count_reg, basic_block pre_header,
348 HOST_WIDEST_INT * count)
350 rtx insn;
351 rtx head, tail;
353 if (! pre_header)
354 return NULL_RTX;
356 get_block_head_tail (pre_header->index, &head, &tail);
358 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
359 if (INSN_P (insn) && single_set (insn) &&
360 rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
362 rtx pat = single_set (insn);
364 if (GET_CODE (SET_SRC (pat)) == CONST_INT)
366 *count = INTVAL (SET_SRC (pat));
367 return insn;
370 return NULL_RTX;
373 return NULL_RTX;
376 /* A very simple resource-based lower bound on the initiation interval.
377 ??? Improve the accuracy of this bound by considering the
378 utilization of various units. */
379 static int
380 res_MII (ddg_ptr g)
382 return (g->num_nodes / issue_rate);
386 /* Points to the array that contains the sched data for each node. */
387 static node_sched_params_ptr node_sched_params;
389 /* Allocate sched_params for each node and initialize it. Assumes that
390 the aux field of each node contain the asap bound (computed earlier),
391 and copies it into the sched_params field. */
392 static void
393 set_node_sched_params (ddg_ptr g)
395 int i;
397 /* Allocate for each node in the DDG a place to hold the "sched_data". */
398 /* Initialize ASAP/ALAP/HIGHT to zero. */
399 node_sched_params = (node_sched_params_ptr)
400 xcalloc (g->num_nodes,
401 sizeof (struct node_sched_params));
403 /* Set the pointer of the general data of the node to point to the
404 appropriate sched_params structure. */
405 for (i = 0; i < g->num_nodes; i++)
407 /* Watch out for aliasing problems? */
408 node_sched_params[i].asap = g->nodes[i].aux.count;
409 g->nodes[i].aux.info = &node_sched_params[i];
413 static void
414 print_node_sched_params (FILE * dump_file, int num_nodes)
416 int i;
418 if (! dump_file)
419 return;
420 for (i = 0; i < num_nodes; i++)
422 node_sched_params_ptr nsp = &node_sched_params[i];
423 rtx reg_move = nsp->first_reg_move;
424 int j;
426 fprintf (dump_file, "Node %d:\n", i);
427 fprintf (dump_file, " asap = %d:\n", nsp->asap);
428 fprintf (dump_file, " time = %d:\n", nsp->time);
429 fprintf (dump_file, " nreg_moves = %d:\n", nsp->nreg_moves);
430 for (j = 0; j < nsp->nreg_moves; j++)
432 fprintf (dump_file, " reg_move = ");
433 print_rtl_single (dump_file, reg_move);
434 reg_move = PREV_INSN (reg_move);
439 /* Calculate an upper bound for II. SMS should not schedule the loop if it
440 requires more cycles than this bound. Currently set to the sum of the
441 longest latency edge for each node. Reset based on experiments. */
442 static int
443 calculate_maxii (ddg_ptr g)
445 int i;
446 int maxii = 0;
448 for (i = 0; i < g->num_nodes; i++)
450 ddg_node_ptr u = &g->nodes[i];
451 ddg_edge_ptr e;
452 int max_edge_latency = 0;
454 for (e = u->out; e; e = e->next_out)
455 max_edge_latency = MAX (max_edge_latency, e->latency);
457 maxii += max_edge_latency;
459 return maxii;
463 Breaking intra-loop register anti-dependences:
464 Each intra-loop register anti-dependence implies a cross-iteration true
465 dependence of distance 1. Therefore, we can remove such false dependencies
466 and figure out if the partial schedule broke them by checking if (for a
467 true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
468 if so generate a register move. The number of such moves is equal to:
469 SCHED_TIME (use) - SCHED_TIME (def) { 0 broken
470 nreg_moves = ----------------------------------- + 1 - { dependence.
471 ii { 1 if not.
473 static struct undo_replace_buff_elem *
474 generate_reg_moves (partial_schedule_ptr ps)
476 ddg_ptr g = ps->g;
477 int ii = ps->ii;
478 int i;
479 struct undo_replace_buff_elem *reg_move_replaces = NULL;
481 for (i = 0; i < g->num_nodes; i++)
483 ddg_node_ptr u = &g->nodes[i];
484 ddg_edge_ptr e;
485 int nreg_moves = 0, i_reg_move;
486 sbitmap *uses_of_defs;
487 rtx last_reg_move;
488 rtx prev_reg, old_reg;
490 /* Compute the number of reg_moves needed for u, by looking at life
491 ranges started at u (excluding self-loops). */
492 for (e = u->out; e; e = e->next_out)
493 if (e->type == TRUE_DEP && e->dest != e->src)
495 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
497 if (e->distance == 1)
498 nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
500 /* If dest precedes src in the schedule of the kernel, then dest
501 will read before src writes and we can save one reg_copy. */
502 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
503 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
504 nreg_moves4e--;
506 nreg_moves = MAX (nreg_moves, nreg_moves4e);
509 if (nreg_moves == 0)
510 continue;
512 /* Every use of the register defined by node may require a different
513 copy of this register, depending on the time the use is scheduled.
514 Set a bitmap vector, telling which nodes use each copy of this
515 register. */
516 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
517 sbitmap_vector_zero (uses_of_defs, nreg_moves);
518 for (e = u->out; e; e = e->next_out)
519 if (e->type == TRUE_DEP && e->dest != e->src)
521 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
523 if (e->distance == 1)
524 dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
526 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
527 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
528 dest_copy--;
530 if (dest_copy)
531 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
534 /* Now generate the reg_moves, attaching relevant uses to them. */
535 SCHED_NREG_MOVES (u) = nreg_moves;
536 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
537 last_reg_move = u->insn;
539 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
541 int i_use;
542 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
543 rtx reg_move = gen_move_insn (new_reg, prev_reg);
545 add_insn_before (reg_move, last_reg_move);
546 last_reg_move = reg_move;
548 if (!SCHED_FIRST_REG_MOVE (u))
549 SCHED_FIRST_REG_MOVE (u) = reg_move;
551 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use,
553 struct undo_replace_buff_elem *rep;
555 rep = (struct undo_replace_buff_elem *)
556 xcalloc (1, sizeof (struct undo_replace_buff_elem));
557 rep->insn = g->nodes[i_use].insn;
558 rep->orig_reg = old_reg;
559 rep->new_reg = new_reg;
561 if (! reg_move_replaces)
562 reg_move_replaces = rep;
563 else
565 rep->next = reg_move_replaces;
566 reg_move_replaces = rep;
569 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
572 prev_reg = new_reg;
575 return reg_move_replaces;
578 /* We call this when we want to undo the SMS schedule for a given loop.
579 One of the things that we do is to delete the register moves generated
580 for the sake of SMS; this function deletes the register move instructions
581 recorded in the undo buffer. */
582 static void
583 undo_generate_reg_moves (partial_schedule_ptr ps,
584 struct undo_replace_buff_elem *reg_move_replaces)
586 int i,j;
588 for (i = 0; i < ps->g->num_nodes; i++)
590 ddg_node_ptr u = &ps->g->nodes[i];
591 rtx prev;
592 rtx crr = SCHED_FIRST_REG_MOVE (u);
594 for (j = 0; j < SCHED_NREG_MOVES (u); j++)
596 prev = PREV_INSN (crr);
597 delete_insn (crr);
598 crr = prev;
602 while (reg_move_replaces)
604 struct undo_replace_buff_elem *rep = reg_move_replaces;
606 reg_move_replaces = reg_move_replaces->next;
607 replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
611 /* Free memory allocated for the undo buffer. */
612 static void
613 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
616 while (reg_move_replaces)
618 struct undo_replace_buff_elem *rep = reg_move_replaces;
620 reg_move_replaces = reg_move_replaces->next;
621 free (rep);
625 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
626 of SCHED_ROW and SCHED_STAGE. */
627 static void
628 normalize_sched_times (partial_schedule_ptr ps)
630 int i;
631 ddg_ptr g = ps->g;
632 int amount = PS_MIN_CYCLE (ps);
633 int ii = ps->ii;
635 /* Don't include the closing branch assuming that it is the last node. */
636 for (i = 0; i < g->num_nodes - 1; i++)
638 ddg_node_ptr u = &g->nodes[i];
639 int normalized_time = SCHED_TIME (u) - amount;
641 gcc_assert (normalized_time >= 0);
643 SCHED_TIME (u) = normalized_time;
644 SCHED_ROW (u) = normalized_time % ii;
645 SCHED_STAGE (u) = normalized_time / ii;
649 /* Set SCHED_COLUMN of each node according to its position in PS. */
650 static void
651 set_columns_for_ps (partial_schedule_ptr ps)
653 int row;
655 for (row = 0; row < ps->ii; row++)
657 ps_insn_ptr cur_insn = ps->rows[row];
658 int column = 0;
660 for (; cur_insn; cur_insn = cur_insn->next_in_row)
661 SCHED_COLUMN (cur_insn->node) = column++;
665 /* Permute the insns according to their order in PS, from row 0 to
666 row ii-1, and position them right before LAST. This schedules
667 the insns of the loop kernel. */
668 static void
669 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
671 int ii = ps->ii;
672 int row;
673 ps_insn_ptr ps_ij;
675 for (row = 0; row < ii ; row++)
676 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
677 if (PREV_INSN (last) != ps_ij->node->insn)
678 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
679 PREV_INSN (last));
682 /* As part of undoing SMS we return to the original ordering of the
683 instructions inside the loop kernel. Given the partial schedule PS, this
684 function returns the ordering of the instruction according to their CUID
685 in the DDG (PS->G), which is the original order of the instruction before
686 performing SMS. */
687 static void
688 undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
690 int i;
/* Walk the nodes in CUID order (their original program order).  Each
   node not already immediately before LAST is moved there, together
   with its first_note.  The walk stops as soon as LAST itself is
   reached, since everything from that point on is already in original
   order.  */
692 for (i = 0 ; i < ps->g->num_nodes; i++)
693 if (last == ps->g->nodes[i].insn
694 || last == ps->g->nodes[i].first_note)
695 break;
696 else if (PREV_INSN (last) != ps->g->nodes[i].insn)
697 reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
698 PREV_INSN (last));
701 /* Used to generate the prologue & epilogue. Duplicate the subset of
702 nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
703 of both), together with a prefix/suffix of their reg_moves. */
/* Emit, into the current sequence, copies of the insns of PS whose
   stage lies in [FROM_STAGE, TO_STAGE], preceded by the appropriate
   window of each node's reg_moves.  FOR_PROLOG selects which window of
   moves is emitted (see the two branches below).  */
704 static void
705 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
706 int to_stage, int for_prolog)
708 int row;
709 ps_insn_ptr ps_ij;
711 for (row = 0; row < ps->ii; row++)
712 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
714 ddg_node_ptr u_node = ps_ij->node;
715 int j, i_reg_moves;
716 rtx reg_move = NULL_RTX;
718 if (for_prolog)
720 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
721 number of reg_moves starting with the second occurrence of
722 u_node, which is generated if its SCHED_STAGE <= to_stage. */
723 i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
724 i_reg_moves = MAX (i_reg_moves, 0);
725 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
727 /* The reg_moves start from the *first* reg_move backwards. */
/* Walk backwards so REG_MOVE ends up at the earliest move of the
   window; the emission loop below then advances with NEXT_INSN.  */
728 if (i_reg_moves)
730 reg_move = SCHED_FIRST_REG_MOVE (u_node);
731 for (j = 1; j < i_reg_moves; j++)
732 reg_move = PREV_INSN (reg_move);
735 else /* It's for the epilog. */
737 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
738 starting to decrease one stage after u_node no longer occurs;
739 that is, generate all reg_moves until
740 SCHED_STAGE (u_node) == from_stage - 1. */
741 i_reg_moves = SCHED_NREG_MOVES (u_node)
742 - (from_stage - SCHED_STAGE (u_node) - 1);
743 i_reg_moves = MAX (i_reg_moves, 0);
744 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
746 /* The reg_moves start from the *last* reg_move forwards. */
/* Position REG_MOVE at the very last (earliest-emitted) move.  */
747 if (i_reg_moves)
749 reg_move = SCHED_FIRST_REG_MOVE (u_node);
750 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
751 reg_move = PREV_INSN (reg_move);
/* Emit the selected window of reg_moves, then (if this node's stage is
   in range) a copy of the node's own insn chain.  */
755 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
756 emit_insn (copy_rtx (PATTERN (reg_move)));
757 if (SCHED_STAGE (u_node) >= from_stage
758 && SCHED_STAGE (u_node) <= to_stage)
759 duplicate_insn_chain (u_node->first_note, u_node->insn);
764 /* Generate the instructions (including reg_moves) for prolog & epilog. */
/* Build the prolog and epilog of the software-pipelined loop: the
   prolog is placed on the loop-entry (preheader) edge, the epilog on
   the single exit edge.  If COUNT_REG is non-null the loop counter is
   first decreased by the stage count, since the prolog/epilog execute
   that many iterations outside the kernel.  */
765 static void
766 generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
768 int i;
769 int last_stage = PS_STAGE_COUNT (ps) - 1;
770 edge e;
772 /* Generate the prolog, inserting its insns on the loop-entry edge. */
773 start_sequence ();
775 if (count_reg)
776 /* Generate a subtract instruction at the beginning of the prolog to
777 adjust the loop count by STAGE_COUNT. */
778 emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));
/* Stage i of the prolog contains stages 0..i of the kernel.  */
780 for (i = 0; i < last_stage; i++)
781 duplicate_insns_of_cycles (ps, 0, i, 1);
783 /* Put the prolog , on the one and only entry edge. */
784 e = loop_preheader_edge (loop);
785 loop_split_edge_with(e , get_insns());
787 end_sequence ();
789 /* Generate the epilog, inserting its insns on the loop-exit edge. */
790 start_sequence ();
/* Stage i of the epilog contains stages i+1..last of the kernel.  */
792 for (i = 0; i < last_stage; i++)
793 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
795 /* Put the epilogue on the one and only one exit edge. */
796 gcc_assert (loop->single_exit);
797 e = loop->single_exit;
798 loop_split_edge_with(e , get_insns());
799 end_sequence ();
802 /* Return the line note insn preceding INSN, for debugging. Taken from
803 emit-rtl.c. */
804 static rtx
805 find_line_note (rtx insn)
807 for (; insn; insn = PREV_INSN (insn))
808 if (NOTE_P (insn)
809 && NOTE_LINE_NUMBER (insn) >= 0)
810 break;
812 return insn;
815 /* Return true if all the BBs of the loop are empty except the
816 loop header. */
817 static bool
818 loop_single_full_bb_p (struct loop *loop)
820 unsigned i;
821 basic_block *bbs = get_loop_body (loop);
823 for (i = 0; i < loop->num_nodes ; i++)
825 rtx head, tail;
826 bool empty_bb = true;
828 if (bbs[i] == loop->header)
829 continue;
831 /* Make sure that basic blocks other than the header
832 have only notes labels or jumps. */
833 get_block_head_tail (bbs[i]->index, &head, &tail);
834 for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
836 if (NOTE_P (head) || LABEL_P (head)
837 || (INSN_P (head) && JUMP_P (head)))
838 continue;
839 empty_bb = false;
840 break;
843 if (! empty_bb)
845 free (bbs);
846 return false;
849 free (bbs);
850 return true;
853 /* A simple loop from SMS point of view; it is a loop that is composed of
854 either a single basic block or two BBs - a header and a latch. */
/* NOTE: LOOP is evaluated several times and is not parenthesized in the
   expansion; only pass a plain variable.  */
855 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \
856 && (EDGE_COUNT (loop->latch->preds) == 1) \
857 && (EDGE_COUNT (loop->latch->succs) == 1))
859 /* Return true if the loop is in its canonical form and false if not.
860 i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit. */
861 static bool
862 loop_canon_p (struct loop *loop, FILE *dump_file)
865 if (loop->inner || ! loop->outer)
866 return false;
868 if (!loop->single_exit)
870 if (dump_file)
872 rtx line_note = find_line_note (BB_END (loop->header));
874 fprintf (dump_file, "SMS loop many exits ");
875 if (line_note)
877 expanded_location xloc;
878 NOTE_EXPANDED_LOCATION (xloc, line_note);
879 fprintf (stats_file, " %s %d (file, line)\n",
880 xloc.file, xloc.line);
883 return false;
886 if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
888 if (dump_file)
890 rtx line_note = find_line_note (BB_END (loop->header));
892 fprintf (dump_file, "SMS loop many BBs. ");
893 if (line_note)
895 expanded_location xloc;
896 NOTE_EXPANDED_LOCATION (xloc, line_note);
897 fprintf (stats_file, " %s %d (file, line)\n",
898 xloc.file, xloc.line);
901 return false;
904 return true;
907 /* If there are more than one entry for the loop,
908 make it one by splitting the first entry edge and
909 redirecting the others to the new BB. */
/* Canonicalize LOOP's CFG: split awkward fallthrough edges into the
   exit block, and ensure the loop has a dedicated latch block with a
   single successor.  */
910 static void
911 canon_loop (struct loop *loop)
913 edge e;
914 edge_iterator i;
916 /* Avoid annoying special cases of edges going to exit
917 block. */
918 FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
919 if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
920 loop_split_edge_with (e, NULL_RTX);
/* If the latch coincides with the header (a one-block loop) or has
   multiple successors, split the latch->header back edge so the loop
   gets a proper latch.  */
922 if (loop->latch == loop->header
923 || EDGE_COUNT (loop->latch->succs) > 1)
925 FOR_EACH_EDGE (e, i, loop->header->preds)
926 if (e->src == loop->latch)
927 break;
928 loop_split_edge_with (e, NULL_RTX);
932 /* Build the loop information without loop
933 canonization, the loop canonization will
934 be performed if the loop is SMSable. */
935 static struct loops *
936 build_loops_structure (FILE *dumpfile)
938 struct loops *loops = xcalloc (1, sizeof (struct loops));
940 /* Find the loops. */
/* flow_loops_find returns the number of natural loops found; a value
   <= 1 means only the dummy root, i.e. nothing to schedule.  */
942 if (flow_loops_find (loops) <= 1)
944 /* No loops. */
945 flow_loops_free (loops);
946 free (loops);
948 return NULL;
951 /* Not going to update these. */
/* Drop the CFG orderings eagerly rather than keep them stale.  */
952 free (loops->cfg.rc_order);
953 loops->cfg.rc_order = NULL;
954 free (loops->cfg.dfs_order);
955 loops->cfg.dfs_order = NULL;
/* Give every loop a simple preheader and mark single-exit loops, as
   required by loop_canon_p later on.  */
957 create_preheaders (loops, CP_SIMPLE_PREHEADERS);
958 mark_single_exit_loops (loops);
959 /* Dump loops. */
960 flow_loops_dump (loops, dumpfile, NULL, 1);
962 #ifdef ENABLE_CHECKING
963 verify_dominators (CDI_DOMINATORS);
964 verify_loop_structure (loops);
965 #endif
967 return loops;
970 /* Main entry point, perform SMS scheduling on the loops of the function
971 that consist of single basic blocks. */
972 void
973 sms_schedule (FILE *dump_file)
975 static int passes = 0;
976 rtx insn;
977 ddg_ptr *g_arr, g;
978 int * node_order;
979 int maxii;
980 unsigned i,num_loops;
981 partial_schedule_ptr ps;
982 struct df *df;
983 struct loops *loops;
984 basic_block bb = NULL;
985 /* vars to the versioning only if needed*/
986 struct loop * nloop;
987 basic_block condition_bb = NULL;
988 edge latch_edge;
989 gcov_type trip_count = 0;
991 if (! (loops = build_loops_structure (dump_file)))
992 return; /* There is no loops to schedule. */
995 stats_file = dump_file;
997 /* Initialize issue_rate. */
998 if (targetm.sched.issue_rate)
1000 int temp = reload_completed;
1002 reload_completed = 1;
1003 issue_rate = targetm.sched.issue_rate ();
1004 reload_completed = temp;
1006 else
1007 issue_rate = 1;
1009 /* Initialize the scheduler. */
1010 current_sched_info = &sms_sched_info;
1011 sched_init (NULL);
1013 /* Init Data Flow analysis, to be used in interloop dep calculation. */
1014 df = df_init ();
1015 df_analyze (df, 0, DF_ALL);
1017 /* Allocate memory to hold the DDG array one entry for each loop.
1018 We use loop->num as index into this array. */
1019 g_arr = xcalloc (loops->num, sizeof (ddg_ptr));
1022 /* Build DDGs for all the relevant loops and hold them in G_ARR
1023 indexed by the loop index. */
1024 for (i = 0; i < loops->num; i++)
1026 rtx head, tail;
1027 rtx count_reg, comp;
1028 struct loop *loop = loops->parray[i];
1030 /* For debugging. */
1031 if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
1033 if (dump_file)
1034 fprintf (dump_file, "SMS reached MAX_PASSES... \n");
1036 break;
1039 if (! loop_canon_p (loop, dump_file))
1040 continue;
1042 if (! loop_single_full_bb_p (loop))
1043 continue;
1045 bb = loop->header;
1047 get_block_head_tail (bb->index, &head, &tail);
1048 latch_edge = loop_latch_edge (loop);
1049 gcc_assert (loop->single_exit);
1050 if (loop->single_exit->count)
1051 trip_count = latch_edge->count / loop->single_exit->count;
1053 /* Perfrom SMS only on loops that their average count is above threshold. */
1055 if ( latch_edge->count
1056 && (latch_edge->count < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
1058 if (stats_file)
1060 rtx line_note = find_line_note (tail);
1062 if (line_note)
1064 expanded_location xloc;
1065 NOTE_EXPANDED_LOCATION (xloc, line_note);
1066 fprintf (stats_file, "SMS bb %s %d (file, line)\n",
1067 xloc.file, xloc.line);
1069 fprintf (stats_file, "SMS single-bb-loop\n");
1070 if (profile_info && flag_branch_probabilities)
1072 fprintf (stats_file, "SMS loop-count ");
1073 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1074 (HOST_WIDEST_INT) bb->count);
1075 fprintf (stats_file, "\n");
1076 fprintf (stats_file, "SMS trip-count ");
1077 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1078 (HOST_WIDEST_INT) trip_count);
1079 fprintf (stats_file, "\n");
1080 fprintf (stats_file, "SMS profile-sum-max ");
1081 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1082 (HOST_WIDEST_INT) profile_info->sum_max);
1083 fprintf (stats_file, "\n");
1086 continue;
1089 /* Make sure this is a doloop. */
1090 if ( !(count_reg = doloop_register_get (tail, &comp)))
1091 continue;
1093 /* Don't handle BBs with calls or barriers, or !single_set insns. */
1094 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1095 if (CALL_P (insn)
1096 || BARRIER_P (insn)
1097 || (INSN_P (insn) && !JUMP_P (insn)
1098 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1099 break;
1101 if (insn != NEXT_INSN (tail))
1103 if (stats_file)
1105 if (CALL_P (insn))
1106 fprintf (stats_file, "SMS loop-with-call\n");
1107 else if (BARRIER_P (insn))
1108 fprintf (stats_file, "SMS loop-with-barrier\n");
1109 else
1110 fprintf (stats_file, "SMS loop-with-not-single-set\n");
1111 print_rtl_single (stats_file, insn);
1114 continue;
1117 if (! (g = create_ddg (bb, df, 0)))
1119 if (stats_file)
1120 fprintf (stats_file, "SMS doloop\n");
1121 continue;
1124 g_arr[i] = g;
1127 /* Release Data Flow analysis data structures. */
1128 df_finish (df);
1130 /* We don't want to perform SMS on new loops - created by versioning. */
1131 num_loops = loops->num;
1132 /* Go over the built DDGs and perfrom SMS for each one of them. */
1133 for (i = 0; i < num_loops; i++)
1135 rtx head, tail;
1136 rtx count_reg, count_init, comp;
1137 int mii, rec_mii;
1138 unsigned stage_count = 0;
1139 HOST_WIDEST_INT loop_count = 0;
1140 struct loop *loop = loops->parray[i];
1142 if (! (g = g_arr[i]))
1143 continue;
1145 if (dump_file)
1146 print_ddg (dump_file, g);
1148 get_block_head_tail (loop->header->index, &head, &tail);
1150 latch_edge = loop_latch_edge (loop);
1151 gcc_assert (loop->single_exit);
1152 if (loop->single_exit->count)
1153 trip_count = latch_edge->count / loop->single_exit->count;
1155 if (stats_file)
1157 rtx line_note = find_line_note (tail);
1159 if (line_note)
1161 expanded_location xloc;
1162 NOTE_EXPANDED_LOCATION (xloc, line_note);
1163 fprintf (stats_file, "SMS bb %s %d (file, line)\n",
1164 xloc.file, xloc.line);
1166 fprintf (stats_file, "SMS single-bb-loop\n");
1167 if (profile_info && flag_branch_probabilities)
1169 fprintf (stats_file, "SMS loop-count ");
1170 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1171 (HOST_WIDEST_INT) bb->count);
1172 fprintf (stats_file, "\n");
1173 fprintf (stats_file, "SMS profile-sum-max ");
1174 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1175 (HOST_WIDEST_INT) profile_info->sum_max);
1176 fprintf (stats_file, "\n");
1178 fprintf (stats_file, "SMS doloop\n");
1179 fprintf (stats_file, "SMS built-ddg %d\n", g->num_nodes);
1180 fprintf (stats_file, "SMS num-loads %d\n", g->num_loads);
1181 fprintf (stats_file, "SMS num-stores %d\n", g->num_stores);
/* If the loop has a doloop register, it gets special handling.  */
1187 count_init = NULL_RTX;
1188 if ((count_reg = doloop_register_get (tail, &comp)))
1190 basic_block pre_header;
1192 pre_header = loop_preheader_edge (loop)->src;
1193 count_init = const_iteration_count (count_reg, pre_header,
1194 &loop_count);
1196 gcc_assert (count_reg);
1198 if (stats_file && count_init)
1200 fprintf (stats_file, "SMS const-doloop ");
1201 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1202 loop_count);
1203 fprintf (stats_file, "\n");
1206 node_order = (int *) xmalloc (sizeof (int) * g->num_nodes);
1208 mii = 1; /* Need to pass some estimate of mii. */
1209 rec_mii = sms_order_nodes (g, mii, node_order);
1210 mii = MAX (res_MII (g), rec_mii);
1211 maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1213 if (stats_file)
1214 fprintf (stats_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1215 rec_mii, mii, maxii);
1217 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1218 ASAP. */
1219 set_node_sched_params (g);
1221 ps = sms_schedule_by_order (g, mii, maxii, node_order, dump_file);
1223 if (ps)
1224 stage_count = PS_STAGE_COUNT (ps);
1226 /* Stage count of 1 means that there is no interleaving between
1227 iterations, let the scheduling passes do the job. */
1228 if (stage_count < 1
1229 || (count_init && (loop_count <= stage_count))
1230 || (flag_branch_probabilities && (trip_count <= stage_count)))
1232 if (dump_file)
1234 fprintf (dump_file, "SMS failed... \n");
1235 fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1236 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1237 fprintf (dump_file, ", trip-count=");
1238 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1239 fprintf (dump_file, ")\n");
1241 continue;
1243 else
1245 int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1246 int new_cycles;
1247 struct undo_replace_buff_elem *reg_move_replaces;
1249 if (stats_file)
1251 fprintf (stats_file,
1252 "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1253 stage_count);
1254 print_partial_schedule (ps, stats_file);
1255 fprintf (stats_file,
1256 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1257 g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1260 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1261 the closing_branch was scheduled and should appear in the last (ii-1)
1262 row. Otherwise, we are free to schedule the branch, and we let nodes
1263 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1264 row; this should reduce stage_count to minimum. */
1265 normalize_sched_times (ps);
1266 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1267 set_columns_for_ps (ps);
1269 /* Generate the kernel just to be able to measure its cycles. */
1270 permute_partial_schedule (ps, g->closing_branch->first_note);
1271 reg_move_replaces = generate_reg_moves (ps);
1273 /* Get the number of cycles the new kernel expect to execute in. */
1274 new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1276 /* Get back to the original loop so we can do loop versioning. */
1277 undo_permute_partial_schedule (ps, g->closing_branch->first_note);
1278 if (reg_move_replaces)
1279 undo_generate_reg_moves (ps, reg_move_replaces);
1281 if ( new_cycles >= orig_cycles)
1283 /* SMS is not profitable so undo the permutation and reg move generation
1284 and return the kernel to its original state. */
1285 if (dump_file)
1286 fprintf (dump_file, "Undoing SMS becuase it is not profitable.\n");
1289 else
1291 canon_loop (loop);
/* In case the BCT count is not known, do loop-versioning.  */
1294 if (count_reg && ! count_init)
1296 rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1297 GEN_INT(stage_count));
1299 nloop = loop_version (loops, loop, comp_rtx, &condition_bb);
1302 /* Set new iteration count of loop kernel. */
1303 if (count_reg && count_init)
1304 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1305 - stage_count + 1);
1307 /* Now apply the scheduled kernel to the RTL of the loop. */
1308 permute_partial_schedule (ps, g->closing_branch->first_note);
1310 /* Mark this loop as software pipelined so the later
1311 scheduling passes doesn't touch it. */
1312 if (! flag_resched_modulo_sched)
1313 g->bb->flags |= BB_DISABLE_SCHEDULE;
1314 /* The life-info is not valid any more. */
1315 g->bb->flags |= BB_DIRTY;
1317 reg_move_replaces = generate_reg_moves (ps);
1318 if (dump_file)
1319 print_node_sched_params (dump_file, g->num_nodes);
1320 /* Generate prolog and epilog. */
1321 if (count_reg && !count_init)
1322 generate_prolog_epilog (ps, loop, count_reg);
1323 else
1324 generate_prolog_epilog (ps, loop, NULL_RTX);
1326 free_undo_replace_buff (reg_move_replaces);
1329 free_partial_schedule (ps);
1330 free (node_sched_params);
1331 free (node_order);
1332 free_ddg (g);
1335 /* Release scheduler data, needed until now because of DFA. */
1336 sched_finish ();
1337 loop_optimizer_finalize (loops, dump_file);
1340 /* The SMS scheduling algorithm itself
1341 -----------------------------------
1342 Input: 'O' an ordered list of insns of a loop.
1343 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1345 'Q' is the empty Set
1346 'PS' is the partial schedule; it holds the currently scheduled nodes with
1347 their cycle/slot.
1348 'PSP' previously scheduled predecessors.
1349 'PSS' previously scheduled successors.
1350 't(u)' the cycle where u is scheduled.
1351 'l(u)' is the latency of u.
1352 'd(v,u)' is the dependence distance from v to u.
1353 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1354 the node ordering phase.
1355 'check_hardware_resources_conflicts(u, PS, c)'
1356 run a trace around cycle/slot through DFA model
1357 to check resource conflicts involving instruction u
1358 at cycle c given the partial schedule PS.
1359 'add_to_partial_schedule_at_time(u, PS, c)'
1360 Add the node/instruction u to the partial schedule
1361 PS at time c.
1362 'calculate_register_pressure(PS)'
1363 Given a schedule of instructions, calculate the register
1364 pressure it implies. One implementation could be the
1365 maximum number of overlapping live ranges.
'maxRP' The maximum allowed register pressure, usually derived from the
number of registers available in the hardware.
1369 1. II = MII.
1370 2. PS = empty list
1371 3. for each node u in O in pre-computed order
1372 4. if (PSP(u) != Q && PSS(u) == Q) then
5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over every v in PSP(u).
1374 6. start = Early_start; end = Early_start + II - 1; step = 1
1375 11. else if (PSP(u) == Q && PSS(u) != Q) then
12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over every v in PSS(u).
1377 13. start = Late_start; end = Late_start - II + 1; step = -1
1378 14. else if (PSP(u) != Q && PSS(u) != Q) then
15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over every v in PSP(u).
16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over every v in PSS(u).
1381 17. start = Early_start;
1382 18. end = min(Early_start + II - 1 , Late_start);
1383 19. step = 1
1384 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1385 21. start = ASAP(u); end = start + II - 1; step = 1
1386 22. endif
1388 23. success = false
1389 24. for (c = start ; c != end ; c += step)
1390 25. if check_hardware_resources_conflicts(u, PS, c) then
1391 26. add_to_partial_schedule_at_time(u, PS, c)
1392 27. success = true
1393 28. break
1394 29. endif
1395 30. endfor
1396 31. if (success == false) then
1397 32. II = II + 1
1398 33. if (II > maxII) then
1399 34. finish - failed to schedule
1400 35. endif
1401 36. goto 2.
1402 37. endif
1403 38. endfor
1404 39. if (calculate_register_pressure(PS) > maxRP) then
1405 40. goto 32.
1406 41. endif
1407 42. compute epilogue & prologue
1408 43. finish - succeeded to schedule
1411 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1412 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1413 set to 0 to save compile time. */
1414 #define DFA_HISTORY SMS_DFA_HISTORY
/* Given the partial schedule PS, this function calculates and returns the
   cycles in which we can schedule the node with the given index I.
   NOTE: Here we do the backtracking in SMS, in some special cases. We have
   noticed that there are several cases in which we fail to SMS the loop
   because the sched window of a node is empty due to tight data-deps. In
   such cases we want to unschedule some of the predecessors/successors
   until we get non-empty scheduling window.  It returns -1 if the
   scheduling window is empty and zero otherwise.

   The window (START, END, STEP via the out-parameters START_P, STEP_P,
   END_P) is half-open: END is one past the last candidate cycle, and
   STEP is +1 (top-down) or -1 (bottom-up) depending on which neighbors
   of the node are already scheduled.  */
static int
get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
                  sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
{
  int start, step, end;
  ddg_edge_ptr e;
  int u = nodes_order [i];
  ddg_node_ptr u_node = &ps->g->nodes[u];
  /* PSP/PSS: previously scheduled predecessors/successors of U.  */
  sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
  sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
  sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
  sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
  int psp_not_empty;
  int pss_not_empty;

  /* 1. compute sched window for u (start, end, step).  */
  sbitmap_zero (psp);
  sbitmap_zero (pss);
  psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
  pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);

  if (psp_not_empty && !pss_not_empty)
    {
      /* Only predecessors are scheduled: schedule U as early as its
         scheduled predecessors allow, scanning forward (STEP = 1).  */
      int early_start = INT_MIN;

      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
        {
          ddg_node_ptr v_node = e->src;
          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              /* Earliest cycle at which the edge from V is honored,
                 accounting for II cycles per iteration of distance.  */
              int node_st = SCHED_TIME (v_node)
                            + e->latency - (e->distance * ii);

              early_start = MAX (early_start, node_st);

              /* Memory deps must not wrap around the kernel.  */
              if (e->data_type == MEM_DEP)
                end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
        }
      start = early_start;
      /* At most II candidate cycles need be tried.  */
      end = MIN (end, early_start + ii);
      step = 1;
    }

  else if (!psp_not_empty && pss_not_empty)
    {
      /* Only successors are scheduled: schedule U as late as its
         scheduled successors allow, scanning backward (STEP = -1).  */
      int late_start = INT_MAX;

      end = INT_MIN;
      for (e = u_node->out; e != 0; e = e->next_out)
        {
          ddg_node_ptr v_node = e->dest;
          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              late_start = MIN (late_start,
                                SCHED_TIME (v_node) - e->latency
                                + (e->distance * ii));
              if (e->data_type == MEM_DEP)
                end = MAX (end, SCHED_TIME (v_node) - ii + 1);
            }
        }
      start = late_start;
      end = MAX (end, late_start - ii);
      step = -1;
    }

  else if (psp_not_empty && pss_not_empty)
    {
      /* Both sides constrained: scan forward from the earliest start,
         but never past the latest start the successors permit.  */
      int early_start = INT_MIN;
      int late_start = INT_MAX;

      start = INT_MIN;
      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
        {
          ddg_node_ptr v_node = e->src;

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              early_start = MAX (early_start,
                                 SCHED_TIME (v_node) + e->latency
                                 - (e->distance * ii));
              if (e->data_type == MEM_DEP)
                end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
        }
      for (e = u_node->out; e != 0; e = e->next_out)
        {
          ddg_node_ptr v_node = e->dest;

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              late_start = MIN (late_start,
                                SCHED_TIME (v_node) - e->latency
                                + (e->distance * ii));
              if (e->data_type == MEM_DEP)
                start = MAX (start, SCHED_TIME (v_node) - ii + 1);
            }
        }
      start = MAX (start, early_start);
      /* LATE_START + 1 because END is exclusive.  */
      end = MIN (end, MIN (early_start + ii, late_start + 1));
      step = 1;
    }
  else /* psp is empty && pss is empty.  */
    {
      /* No scheduled neighbors: fall back to the ASAP computed during
         node ordering, trying II consecutive cycles.  */
      start = SCHED_ASAP (u_node);
      end = start + ii;
      step = 1;
    }

  *start_p = start;
  *step_p = step;
  *end_p = end;
  sbitmap_free (psp);
  sbitmap_free (pss);

  /* An inverted (or empty) range means no cycle can be tried.  */
  if ((start >= end && step == 1) || (start <= end && step == -1))
    return -1;
  else
    return 0;
}
1548 /* This function implements the scheduling algorithm for SMS according to the
1549 above algorithm. */
1550 static partial_schedule_ptr
1551 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order, FILE *dump_file)
1553 int ii = mii;
1554 int i, c, success;
1555 int try_again_with_larger_ii = true;
1556 int num_nodes = g->num_nodes;
1557 ddg_edge_ptr e;
1558 int start, end, step; /* Place together into one struct? */
1559 sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1560 sbitmap must_precede = sbitmap_alloc (num_nodes);
1561 sbitmap must_follow = sbitmap_alloc (num_nodes);
1562 sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);
1564 partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
1566 sbitmap_ones (tobe_scheduled);
1567 sbitmap_zero (sched_nodes);
1569 while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
1570 || try_again_with_larger_ii ) && ii < maxii)
1572 int j;
1573 bool unscheduled_nodes = false;
1575 if (dump_file)
1576 fprintf(dump_file, "Starting with ii=%d\n", ii);
1577 if (try_again_with_larger_ii)
1579 try_again_with_larger_ii = false;
1580 sbitmap_zero (sched_nodes);
1583 for (i = 0; i < num_nodes; i++)
1585 int u = nodes_order[i];
1586 ddg_node_ptr u_node = &ps->g->nodes[u];
1587 rtx insn = u_node->insn;
1589 if (!INSN_P (insn))
1591 RESET_BIT (tobe_scheduled, u);
1592 continue;
1595 if (JUMP_P (insn)) /* Closing branch handled later. */
1597 RESET_BIT (tobe_scheduled, u);
1598 continue;
1601 if (TEST_BIT (sched_nodes, u))
1602 continue;
1604 /* Try to get non-empty scheduling window. */
1605 j = i;
1606 while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
1607 && j > 0)
1609 unscheduled_nodes = true;
1610 if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
1611 || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
1613 ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
1614 RESET_BIT (sched_nodes, nodes_order [j - 1]);
1616 j--;
1618 if (j < 0)
1620 /* ??? Try backtracking instead of immediately ii++? */
1621 ii++;
1622 try_again_with_larger_ii = true;
1623 reset_partial_schedule (ps, ii);
1624 break;
1626 /* 2. Try scheduling u in window. */
1627 if (dump_file)
1628 fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
1629 u, start, end, step);
1631 /* use must_follow & must_precede bitmaps to determine order
1632 of nodes within the cycle. */
1633 sbitmap_zero (must_precede);
1634 sbitmap_zero (must_follow);
1635 for (e = u_node->in; e != 0; e = e->next_in)
1636 if (TEST_BIT (sched_nodes, e->src->cuid)
1637 && e->latency == (ii * e->distance)
1638 && start == SCHED_TIME (e->src))
1639 SET_BIT (must_precede, e->src->cuid);
1641 for (e = u_node->out; e != 0; e = e->next_out)
1642 if (TEST_BIT (sched_nodes, e->dest->cuid)
1643 && e->latency == (ii * e->distance)
1644 && end == SCHED_TIME (e->dest))
1645 SET_BIT (must_follow, e->dest->cuid);
1647 success = 0;
1648 if ((step > 0 && start < end) || (step < 0 && start > end))
1649 for (c = start; c != end; c += step)
1651 ps_insn_ptr psi;
1653 psi = ps_add_node_check_conflicts (ps, u_node, c,
1654 must_precede,
1655 must_follow);
1657 if (psi)
1659 SCHED_TIME (u_node) = c;
1660 SET_BIT (sched_nodes, u);
1661 success = 1;
1662 if (dump_file)
1663 fprintf(dump_file, "Schedule in %d\n", c);
1664 break;
1667 if (!success)
1669 /* ??? Try backtracking instead of immediately ii++? */
1670 ii++;
1671 try_again_with_larger_ii = true;
1672 reset_partial_schedule (ps, ii);
1673 break;
1675 if (unscheduled_nodes)
1676 break;
1678 /* ??? If (success), check register pressure estimates. */
1679 } /* Continue with next node. */
1680 } /* While try_again_with_larger_ii. */
1682 sbitmap_free (sched_nodes);
1684 if (ii >= maxii)
1686 free_partial_schedule (ps);
1687 ps = NULL;
1689 return ps;
/* This page implements the algorithm for ordering the nodes of a DDG
   for modulo scheduling, activated through the
   "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */

/* Accessors for the per-node ordering parameters that
   calculate_order_params stashes in each DDG node's aux.info.  */
#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
#define ASAP(x) (ORDER_PARAMS ((x))->asap)
#define ALAP(x) (ORDER_PARAMS ((x))->alap)
#define HEIGHT(x) (ORDER_PARAMS ((x))->height)
/* Mobility: the slack between a node's latest and earliest start.  */
#define MOB(x) (ALAP ((x)) - ASAP ((x)))
#define DEPTH(x) (ASAP ((x)))

typedef struct node_order_params * nopa;

static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
static nopa calculate_order_params (ddg_ptr, int mii);
static int find_max_asap (ddg_ptr, sbitmap);
static int find_max_hv_min_mob (ddg_ptr, sbitmap);
static int find_max_dv_min_mob (ddg_ptr, sbitmap);

/* Direction in which the workset of an (extended) SCC is traversed.  */
enum sms_direction {BOTTOMUP, TOPDOWN};

/* Ordering parameters attached to each DDG node via aux.info.  */
struct node_order_params
{
  int asap;   /* Earliest start (DAG view, backarcs ignored).  */
  int alap;   /* Latest start (DAG view, backarcs ignored).  */
  int height; /* Longest latency path from the node to a DAG leaf.  */
};
1722 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
1723 static void
1724 check_nodes_order (int *node_order, int num_nodes)
1726 int i;
1727 sbitmap tmp = sbitmap_alloc (num_nodes);
1729 sbitmap_zero (tmp);
1731 for (i = 0; i < num_nodes; i++)
1733 int u = node_order[i];
1735 gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
1737 SET_BIT (tmp, u);
1740 sbitmap_free (tmp);
1743 /* Order the nodes of G for scheduling and pass the result in
1744 NODE_ORDER. Also set aux.count of each node to ASAP.
1745 Return the recMII for the given DDG. */
1746 static int
1747 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1749 int i;
1750 int rec_mii = 0;
1751 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1753 nopa nops = calculate_order_params (g, mii);
1755 order_nodes_of_sccs (sccs, node_order);
1757 if (sccs->num_sccs > 0)
1758 /* First SCC has the largest recurrence_length. */
1759 rec_mii = sccs->sccs[0]->recurrence_length;
1761 /* Save ASAP before destroying node_order_params. */
1762 for (i = 0; i < g->num_nodes; i++)
1764 ddg_node_ptr v = &g->nodes[i];
1765 v->aux.count = ASAP (v);
1768 free (nops);
1769 free_ddg_all_sccs (sccs);
1770 check_nodes_order (node_order, g->num_nodes);
1772 return rec_mii;
1775 static void
1776 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1778 int i, pos = 0;
1779 ddg_ptr g = all_sccs->ddg;
1780 int num_nodes = g->num_nodes;
1781 sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1782 sbitmap on_path = sbitmap_alloc (num_nodes);
1783 sbitmap tmp = sbitmap_alloc (num_nodes);
1784 sbitmap ones = sbitmap_alloc (num_nodes);
1786 sbitmap_zero (prev_sccs);
1787 sbitmap_ones (ones);
1789 /* Perfrom the node ordering starting from the SCC with the highest recMII.
1790 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
1791 for (i = 0; i < all_sccs->num_sccs; i++)
1793 ddg_scc_ptr scc = all_sccs->sccs[i];
1795 /* Add nodes on paths from previous SCCs to the current SCC. */
1796 find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1797 sbitmap_a_or_b (tmp, scc->nodes, on_path);
1799 /* Add nodes on paths from the current SCC to previous SCCs. */
1800 find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1801 sbitmap_a_or_b (tmp, tmp, on_path);
1803 /* Remove nodes of previous SCCs from current extended SCC. */
1804 sbitmap_difference (tmp, tmp, prev_sccs);
1806 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1807 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
1810 /* Handle the remaining nodes that do not belong to any scc. Each call
1811 to order_nodes_in_scc handles a single connected component. */
1812 while (pos < g->num_nodes)
1814 sbitmap_difference (tmp, ones, prev_sccs);
1815 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1817 sbitmap_free (prev_sccs);
1818 sbitmap_free (on_path);
1819 sbitmap_free (tmp);
1820 sbitmap_free (ones);
/* MII is needed if we consider backarcs (that do not close recursive
   cycles).

   Compute, for every node of G, its ASAP, ALAP and HEIGHT over the DAG
   obtained by ignoring inter-iteration edges (distance > 0).  Returns a
   freshly xcalloc'd array of node_order_params, one per node; each
   node's aux.info is pointed at its entry.  The caller frees the
   array.  */
static struct node_order_params *
calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
{
  int u;
  int max_asap;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  /* Allocate a place to hold ordering params for each node in the DDG.  */
  nopa node_order_params_arr;

  /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
  node_order_params_arr = (nopa) xcalloc (num_nodes,
                                          sizeof (struct node_order_params));

  /* Set the aux pointer of each node to point to its order_params
     structure.  */
  for (u = 0; u < num_nodes; u++)
    g->nodes[u].aux.info = &node_order_params_arr[u];

  /* Disregarding a backarc from each recursive cycle to obtain a DAG,
     calculate ASAP, ALAP, mobility, distance, and height for each node
     in the dependence (direct acyclic) graph.  */

  /* We assume that the nodes in the array are in topological order.  */

  /* Forward pass: ASAP of a node is the longest latency path from any
     DAG root, considering only intra-iteration (distance == 0) edges.  */
  max_asap = 0;
  for (u = 0; u < num_nodes; u++)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      ASAP (u_node) = 0;
      for (e = u_node->in; e; e = e->next_in)
        if (e->distance == 0)
          ASAP (u_node) = MAX (ASAP (u_node),
                               ASAP (e->src) + e->latency);
      max_asap = MAX (max_asap, ASAP (u_node));
    }

  /* Backward pass: ALAP is bounded above by MAX_ASAP (the DAG's
     critical-path length); HEIGHT is the longest latency path down to
     a DAG leaf.  */
  for (u = num_nodes - 1; u > -1; u--)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      ALAP (u_node) = max_asap;
      HEIGHT (u_node) = 0;
      for (e = u_node->out; e; e = e->next_out)
        if (e->distance == 0)
          {
            ALAP (u_node) = MIN (ALAP (u_node),
                                 ALAP (e->dest) - e->latency);
            HEIGHT (u_node) = MAX (HEIGHT (u_node),
                                   HEIGHT (e->dest) + e->latency);
          }
    }

  return node_order_params_arr;
}
1880 static int
1881 find_max_asap (ddg_ptr g, sbitmap nodes)
1883 int u;
1884 int max_asap = -1;
1885 int result = -1;
1887 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1889 ddg_node_ptr u_node = &g->nodes[u];
1891 if (max_asap < ASAP (u_node))
1893 max_asap = ASAP (u_node);
1894 result = u;
1897 return result;
1900 static int
1901 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1903 int u;
1904 int max_hv = -1;
1905 int min_mob = INT_MAX;
1906 int result = -1;
1908 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1910 ddg_node_ptr u_node = &g->nodes[u];
1912 if (max_hv < HEIGHT (u_node))
1914 max_hv = HEIGHT (u_node);
1915 min_mob = MOB (u_node);
1916 result = u;
1918 else if ((max_hv == HEIGHT (u_node))
1919 && (min_mob > MOB (u_node)))
1921 min_mob = MOB (u_node);
1922 result = u;
1925 return result;
1928 static int
1929 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1931 int u;
1932 int max_dv = -1;
1933 int min_mob = INT_MAX;
1934 int result = -1;
1936 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1938 ddg_node_ptr u_node = &g->nodes[u];
1940 if (max_dv < DEPTH (u_node))
1942 max_dv = DEPTH (u_node);
1943 min_mob = MOB (u_node);
1944 result = u;
1946 else if ((max_dv == DEPTH (u_node))
1947 && (min_mob > MOB (u_node)))
1949 min_mob = MOB (u_node);
1950 result = u;
1953 return result;
/* Places the nodes of SCC into the NODE_ORDER array starting
   at position POS, according to the SMS ordering algorithm.
   NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
   the NODE_ORDER array, starting from position zero.

   The traversal alternates direction: in TOPDOWN sweeps nodes are
   picked by max HEIGHT (min MOB as tie-break) and their successors
   feed the workset; in BOTTOMUP sweeps nodes are picked by max DEPTH
   (min MOB) and their predecessors feed the workset.  Returns the
   updated position in NODE_ORDER.  */
static int
order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
                    int * node_order, int pos)
{
  enum sms_direction dir;
  int num_nodes = g->num_nodes;
  sbitmap workset = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
  sbitmap predecessors = sbitmap_alloc (num_nodes);
  sbitmap successors = sbitmap_alloc (num_nodes);

  sbitmap_zero (predecessors);
  find_predecessors (predecessors, g, nodes_ordered);

  sbitmap_zero (successors);
  find_successors (successors, g, nodes_ordered);

  /* Seed the workset: prefer SCC nodes that are predecessors (then
     successors) of what is already ordered; otherwise start from the
     SCC node with the largest ASAP.  */
  sbitmap_zero (tmp);
  if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = BOTTOMUP;
    }
  else if (sbitmap_a_and_b_cg (tmp, successors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = TOPDOWN;
    }
  else
    {
      int u;

      sbitmap_zero (workset);
      if ((u = find_max_asap (g, scc)) >= 0)
        SET_BIT (workset, u);
      dir = BOTTOMUP;
    }

  sbitmap_zero (zero_bitmap);
  /* Drain the workset, flipping direction whenever one sweep
     exhausts it.  */
  while (!sbitmap_equal (workset, zero_bitmap))
    {
      int v;
      ddg_node_ptr v_node;
      sbitmap v_node_preds;
      sbitmap v_node_succs;

      if (dir == TOPDOWN)
        {
          while (!sbitmap_equal (workset, zero_bitmap))
            {
              v = find_max_hv_min_mob (g, workset);
              v_node = &g->nodes[v];
              node_order[pos++] = v;
              v_node_succs = NODE_SUCCESSORS (v_node);
              sbitmap_a_and_b (tmp, v_node_succs, scc);

              /* Don't consider the already ordered successors again.  */
              sbitmap_difference (tmp, tmp, nodes_ordered);
              sbitmap_a_or_b (workset, workset, tmp);
              RESET_BIT (workset, v);
              SET_BIT (nodes_ordered, v);
            }
          /* Sweep exhausted; refill from predecessors and reverse.  */
          dir = BOTTOMUP;
          sbitmap_zero (predecessors);
          find_predecessors (predecessors, g, nodes_ordered);
          sbitmap_a_and_b (workset, predecessors, scc);
        }
      else
        {
          while (!sbitmap_equal (workset, zero_bitmap))
            {
              v = find_max_dv_min_mob (g, workset);
              v_node = &g->nodes[v];
              node_order[pos++] = v;
              v_node_preds = NODE_PREDECESSORS (v_node);
              sbitmap_a_and_b (tmp, v_node_preds, scc);

              /* Don't consider the already ordered predecessors again.  */
              sbitmap_difference (tmp, tmp, nodes_ordered);
              sbitmap_a_or_b (workset, workset, tmp);
              RESET_BIT (workset, v);
              SET_BIT (nodes_ordered, v);
            }
          /* Sweep exhausted; refill from successors and reverse.  */
          dir = TOPDOWN;
          sbitmap_zero (successors);
          find_successors (successors, g, nodes_ordered);
          sbitmap_a_and_b (workset, successors, scc);
        }
    }
  sbitmap_free (tmp);
  sbitmap_free (workset);
  sbitmap_free (zero_bitmap);
  sbitmap_free (predecessors);
  sbitmap_free (successors);
  return pos;
}
2059 /* This page contains functions for manipulating partial-schedules during
2060 modulo scheduling. */
2062 /* Create a partial schedule and allocate a memory to hold II rows. */
2063 partial_schedule_ptr
2064 create_partial_schedule (int ii, ddg_ptr g, int history)
2066 partial_schedule_ptr ps = (partial_schedule_ptr)
2067 xmalloc (sizeof (struct partial_schedule));
2068 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2069 ps->ii = ii;
2070 ps->history = history;
2071 ps->min_cycle = INT_MAX;
2072 ps->max_cycle = INT_MIN;
2073 ps->g = g;
2075 return ps;
2078 /* Free the PS_INSNs in rows array of the given partial schedule.
2079 ??? Consider caching the PS_INSN's. */
2080 static void
2081 free_ps_insns (partial_schedule_ptr ps)
2083 int i;
2085 for (i = 0; i < ps->ii; i++)
2087 while (ps->rows[i])
2089 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2091 free (ps->rows[i]);
2092 ps->rows[i] = ps_insn;
2094 ps->rows[i] = NULL;
2098 /* Free all the memory allocated to the partial schedule. */
2099 void
2100 free_partial_schedule (partial_schedule_ptr ps)
2102 if (!ps)
2103 return;
2104 free_ps_insns (ps);
2105 free (ps->rows);
2106 free (ps);
2109 /* Clear the rows array with its PS_INSNs, and create a new one with
2110 NEW_II rows. */
2111 void
2112 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2114 if (!ps)
2115 return;
2116 free_ps_insns (ps);
2117 if (new_ii == ps->ii)
2118 return;
2119 ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2120 * sizeof (ps_insn_ptr));
2121 memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2122 ps->ii = new_ii;
2123 ps->min_cycle = INT_MAX;
2124 ps->max_cycle = INT_MIN;
2127 /* Prints the partial schedule as an ii rows array, for each rows
2128 print the ids of the insns in it. */
2129 void
2130 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2132 int i;
2134 for (i = 0; i < ps->ii; i++)
2136 ps_insn_ptr ps_i = ps->rows[i];
2138 fprintf (dump, "\n[CYCLE %d ]: ", i);
2139 while (ps_i)
2141 fprintf (dump, "%d, ",
2142 INSN_UID (ps_i->node->insn));
2143 ps_i = ps_i->next_in_row;
2148 /* Creates an object of PS_INSN and initializes it to the given parameters. */
2149 static ps_insn_ptr
2150 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2152 ps_insn_ptr ps_i = xmalloc (sizeof (struct ps_insn));
2154 ps_i->node = node;
2155 ps_i->next_in_row = NULL;
2156 ps_i->prev_in_row = NULL;
2157 ps_i->row_rest_count = rest_count;
2158 ps_i->cycle = cycle;
2160 return ps_i;
2164 /* Removes the given PS_INSN from the partial schedule. Returns false if the
2165 node is not found in the partial schedule, else returns true. */
2166 static bool
2167 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2169 int row;
2171 if (!ps || !ps_i)
2172 return false;
2174 row = SMODULO (ps_i->cycle, ps->ii);
2175 if (! ps_i->prev_in_row)
2177 if (ps_i != ps->rows[row])
2178 return false;
2180 ps->rows[row] = ps_i->next_in_row;
2181 if (ps->rows[row])
2182 ps->rows[row]->prev_in_row = NULL;
2184 else
2186 ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2187 if (ps_i->next_in_row)
2188 ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2190 free (ps_i);
2191 return true;
/* Unlike what literature describes for modulo scheduling (which focuses
   on VLIW machines) the order of the instructions inside a cycle is
   important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
   where the current instruction should go relative to the already
   scheduled instructions in the given cycle.  Go over these
   instructions and find the first possible column to put it in.
   Returns false when the precede/follow constraints are contradictory
   for this row, true when PS_I has been linked in.  */
static bool
ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
                     sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr next_ps_i;
  ps_insn_ptr first_must_follow = NULL;
  ps_insn_ptr last_must_precede = NULL;
  int row;

  if (! ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Find the first must-follow and the last must-precede, and insert
     the node immediately after the must-precede, making sure no
     must-follow insn ends up before it.  */
  for (next_ps_i = ps->rows[row];
       next_ps_i;
       next_ps_i = next_ps_i->next_in_row)
    {
      if (TEST_BIT (must_follow, next_ps_i->node->cuid)
          && ! first_must_follow)
        first_must_follow = next_ps_i;
      if (TEST_BIT (must_precede, next_ps_i->node->cuid))
        {
          /* If we have already met a node that must follow, then
             there is no possible column.  */
          if (first_must_follow)
            return false;
          else
            last_must_precede = next_ps_i;
        }
    }

  /* Now insert the node after INSERT_AFTER_PSI.  */

  if (! last_must_precede)
    {
      /* No must-precede constraint: link PS_I at the row head.  */
      ps_i->next_in_row = ps->rows[row];
      ps_i->prev_in_row = NULL;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i;
      ps->rows[row] = ps_i;
    }
  else
    {
      /* Link PS_I right after the last must-precede insn.  */
      ps_i->next_in_row = last_must_precede->next_in_row;
      last_must_precede->next_in_row = ps_i;
      ps_i->prev_in_row = last_must_precede;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i;
    }

  return true;
}
2257 /* Advances the PS_INSN one column in its current row; returns false
2258 in failure and true in success. Bit N is set in MUST_FOLLOW if
2259 the node with cuid N must be come after the node pointed to by
2260 PS_I when scheduled in the same cycle. */
2261 static int
2262 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2263 sbitmap must_follow)
2265 ps_insn_ptr prev, next;
2266 int row;
2267 ddg_node_ptr next_node;
2269 if (!ps || !ps_i)
2270 return false;
2272 row = SMODULO (ps_i->cycle, ps->ii);
2274 if (! ps_i->next_in_row)
2275 return false;
2277 next_node = ps_i->next_in_row->node;
2279 /* Check if next_in_row is dependent on ps_i, both having same sched
2280 times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
2281 if (TEST_BIT (must_follow, next_node->cuid))
2282 return false;
2284 /* Advance PS_I over its next_in_row in the doubly linked list. */
2285 prev = ps_i->prev_in_row;
2286 next = ps_i->next_in_row;
2288 if (ps_i == ps->rows[row])
2289 ps->rows[row] = next;
2291 ps_i->next_in_row = next->next_in_row;
2293 if (next->next_in_row)
2294 next->next_in_row->prev_in_row = ps_i;
2296 next->next_in_row = ps_i;
2297 ps_i->prev_in_row = next;
2299 next->prev_in_row = prev;
2300 if (prev)
2301 prev->next_in_row = next;
2303 return true;
2306 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2307 Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
2308 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
2309 before/after (respectively) the node pointed to by PS_I when scheduled
2310 in the same cycle. */
2311 static ps_insn_ptr
2312 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2313 sbitmap must_precede, sbitmap must_follow)
2315 ps_insn_ptr ps_i;
2316 int rest_count = 1;
2317 int row = SMODULO (cycle, ps->ii);
2319 if (ps->rows[row]
2320 && ps->rows[row]->row_rest_count >= issue_rate)
2321 return NULL;
2323 if (ps->rows[row])
2324 rest_count += ps->rows[row]->row_rest_count;
2326 ps_i = create_ps_insn (node, rest_count, cycle);
2328 /* Finds and inserts PS_I according to MUST_FOLLOW and
2329 MUST_PRECEDE. */
2330 if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2332 free (ps_i);
2333 return NULL;
2336 return ps_i;
2339 /* Advance time one cycle. Assumes DFA is being used. */
2340 static void
2341 advance_one_cycle (void)
2343 if (targetm.sched.dfa_pre_cycle_insn)
2344 state_transition (curr_state,
2345 targetm.sched.dfa_pre_cycle_insn ());
2347 state_transition (curr_state, NULL);
2349 if (targetm.sched.dfa_post_cycle_insn)
2350 state_transition (curr_state,
2351 targetm.sched.dfa_post_cycle_insn ());
2354 /* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
2355 the number of cycles according to DFA that the kernel fits in,
2356 we use this to check if we done well with SMS after we add
2357 register moves. In some cases register moves overhead makes
2358 it even worse than the original loop. We want SMS to be performed
2359 when it gives less cycles after register moves are added. */
2360 static int
2361 kernel_number_of_cycles (rtx first_insn, rtx last_insn)
2363 int cycles = 0;
2364 rtx insn;
2365 int can_issue_more = issue_rate;
2367 state_reset (curr_state);
2369 for (insn = first_insn;
2370 insn != NULL_RTX && insn != last_insn;
2371 insn = NEXT_INSN (insn))
2373 if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
2374 continue;
2376 /* Check if there is room for the current insn. */
2377 if (!can_issue_more || state_dead_lock_p (curr_state))
2379 cycles ++;
2380 advance_one_cycle ();
2381 can_issue_more = issue_rate;
2384 /* Update the DFA state and return with failure if the DFA found
2385 recource conflicts. */
2386 if (state_transition (curr_state, insn) >= 0)
2388 cycles ++;
2389 advance_one_cycle ();
2390 can_issue_more = issue_rate;
2393 if (targetm.sched.variable_issue)
2394 can_issue_more =
2395 targetm.sched.variable_issue (sched_dump, sched_verbose,
2396 insn, can_issue_more);
2397 /* A naked CLOBBER or USE generates no instruction, so don't
2398 let them consume issue slots. */
2399 else if (GET_CODE (PATTERN (insn)) != USE
2400 && GET_CODE (PATTERN (insn)) != CLOBBER)
2401 can_issue_more--;
2403 return cycles;
2406 /* Checks if PS has resource conflicts according to DFA, starting from
2407 FROM cycle to TO cycle; returns true if there are conflicts and false
2408 if there are no conflicts. Assumes DFA is being used. */
2409 static int
2410 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2412 int cycle;
2414 state_reset (curr_state);
2416 for (cycle = from; cycle <= to; cycle++)
2418 ps_insn_ptr crr_insn;
2419 /* Holds the remaining issue slots in the current row. */
2420 int can_issue_more = issue_rate;
2422 /* Walk through the DFA for the current row. */
2423 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2424 crr_insn;
2425 crr_insn = crr_insn->next_in_row)
2427 rtx insn = crr_insn->node->insn;
2429 if (!INSN_P (insn))
2430 continue;
2432 /* Check if there is room for the current insn. */
2433 if (!can_issue_more || state_dead_lock_p (curr_state))
2434 return true;
2436 /* Update the DFA state and return with failure if the DFA found
2437 recource conflicts. */
2438 if (state_transition (curr_state, insn) >= 0)
2439 return true;
2441 if (targetm.sched.variable_issue)
2442 can_issue_more =
2443 targetm.sched.variable_issue (sched_dump, sched_verbose,
2444 insn, can_issue_more);
2445 /* A naked CLOBBER or USE generates no instruction, so don't
2446 let them consume issue slots. */
2447 else if (GET_CODE (PATTERN (insn)) != USE
2448 && GET_CODE (PATTERN (insn)) != CLOBBER)
2449 can_issue_more--;
2452 /* Advance the DFA to the next cycle. */
2453 advance_one_cycle ();
2455 return false;
2458 /* Checks if the given node causes resource conflicts when added to PS at
2459 cycle C. If not the node is added to PS and returned; otherwise zero
2460 is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2461 cuid N must be come before/after (respectively) the node pointed to by
2462 PS_I when scheduled in the same cycle. */
2463 ps_insn_ptr
2464 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2465 int c, sbitmap must_precede,
2466 sbitmap must_follow)
2468 int has_conflicts = 0;
2469 ps_insn_ptr ps_i;
2471 /* First add the node to the PS, if this succeeds check for
2472 conflicts, trying different issue slots in the same row. */
2473 if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2474 return NULL; /* Failed to insert the node at the given cycle. */
2476 has_conflicts = ps_has_conflicts (ps, c, c)
2477 || (ps->history > 0
2478 && ps_has_conflicts (ps,
2479 c - ps->history,
2480 c + ps->history));
2482 /* Try different issue slots to find one that the given node can be
2483 scheduled in without conflicts. */
2484 while (has_conflicts)
2486 if (! ps_insn_advance_column (ps, ps_i, must_follow))
2487 break;
2488 has_conflicts = ps_has_conflicts (ps, c, c)
2489 || (ps->history > 0
2490 && ps_has_conflicts (ps,
2491 c - ps->history,
2492 c + ps->history));
2495 if (has_conflicts)
2497 remove_node_from_ps (ps, ps_i);
2498 return NULL;
2501 ps->min_cycle = MIN (ps->min_cycle, c);
2502 ps->max_cycle = MAX (ps->max_cycle, c);
2503 return ps_i;
2506 /* Rotate the rows of PS such that insns scheduled at time
2507 START_CYCLE will appear in row 0. Updates max/min_cycles. */
2508 void
2509 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2511 int i, row, backward_rotates;
2512 int last_row = ps->ii - 1;
2514 if (start_cycle == 0)
2515 return;
2517 backward_rotates = SMODULO (start_cycle, ps->ii);
2519 /* Revisit later and optimize this into a single loop. */
2520 for (i = 0; i < backward_rotates; i++)
2522 ps_insn_ptr first_row = ps->rows[0];
2524 for (row = 0; row < last_row; row++)
2525 ps->rows[row] = ps->rows[row+1];
2527 ps->rows[last_row] = first_row;
2530 ps->max_cycle -= start_cycle;
2531 ps->min_cycle -= start_cycle;
2534 /* Remove the node N from the partial schedule PS; because we restart the DFA
2535 each time we want to check for resource conflicts; this is equivalent to
2536 unscheduling the node N. */
2537 static bool
2538 ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
2540 ps_insn_ptr ps_i;
2541 int row = SMODULO (SCHED_TIME (n), ps->ii);
2543 if (row < 0 || row > ps->ii)
2544 return false;
2546 for (ps_i = ps->rows[row];
2547 ps_i && ps_i->node != n;
2548 ps_i = ps_i->next_in_row);
2549 if (!ps_i)
2550 return false;
2552 return remove_node_from_ps (ps, ps_i);
2554 #endif /* INSN_SCHEDULING*/