1 /* Swing Modulo Scheduling implementation.
2 Copyright (C) 2004, 2005, 2006, 2007, 2008
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
25 #include "coretypes.h"
30 #include "hard-reg-set.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
39 #include "sched-int.h"
41 #include "cfglayout.h"
49 #include "tree-pass.h"
53 #ifdef INSN_SCHEDULING
55 /* This file contains the implementation of the Swing Modulo Scheduler,
56 described in the following references:
57 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
58 Lifetime--sensitive modulo scheduling in a production environment.
59 IEEE Trans. on Comps., 50(3), March 2001
60 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
61 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
62 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
64 The basic structure is:
65 1. Build a data-dependence graph (DDG) for each loop.
66 2. Use the DDG to order the insns of a loop (not in topological order
67 necessarily, but rather) trying to place each insn after all its
68 predecessors _or_ after all its successors.
69 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
70 4. Use the ordering to perform list-scheduling of the loop:
71 1. Set II = MII. We will try to schedule the loop within II cycles.
72 2. Try to schedule the insns one by one according to the ordering.
73 For each insn compute an interval of cycles by considering already-
74 scheduled preds and succs (and associated latencies); try to place
75 the insn in the cycles of this window checking for potential
76 resource conflicts (using the DFA interface).
77 Note: this is different from the cycle-scheduling of schedule_insns;
78 here the insns are not scheduled monotonically top-down (nor bottom-
80 3. If failed in scheduling all insns - bump II++ and try again, unless
81 II reaches an upper bound MaxII, in which case report failure.
82 5. If we succeeded in scheduling the loop within II cycles, we now
83 generate prolog and epilog, decrease the counter of the loop, and
84 perform modulo variable expansion for live ranges that span more than
85 II cycles (i.e. use register copies to prevent a def from overwriting
86 itself before reaching the use).
88 SMS works with countable loops (1) whose control part can be easily
89 decoupled from the rest of the loop and (2) whose loop count can
90 be easily adjusted. This is because we peel a constant number of
91 iterations into a prologue and epilogue for which we want to avoid
92 emitting the control part, and a kernel which is to iterate that
93 constant number of iterations less than the original loop. So the
94 control part should be a set of insns clearly identified and having
95 its own iv, not otherwise used in the loop (at-least for now), which
96 initializes a register before the loop to the number of iterations.
97 Currently SMS relies on the do-loop pattern to recognize such loops,
98 where (1) the control part comprises of all insns defining and/or
99 using a certain 'count' register and (2) the loop count can be
100 adjusted by modifying this register prior to the loop.
101 TODO: Rely on cfgloop analysis instead. */
103 /* This page defines partial-schedule structures and functions for
104 modulo scheduling. */
/* Pointer typedefs (GCC *_ptr convention) for the partial schedule and
   its per-insn entries; the structs themselves are defined below.  */
typedef struct partial_schedule *partial_schedule_ptr;
typedef struct ps_insn *ps_insn_ptr;

/* The minimum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)

/* The maximum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)

/* Perform signed modulo, always returning a non-negative value.
   NOTE: X and Y are each evaluated more than once -- do not pass
   expressions with side effects.  */
#define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))

/* The number of different iterations the nodes in ps span, assuming
   the stage boundaries are placed efficiently.  I.e. the cycle span
   (max - min + 1) divided by II, rounded up.  */
#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
			     + 1 + (ps)->ii - 1) / (ps)->ii)
123 /* A single instruction in the partial schedule. */
126 /* The corresponding DDG_NODE. */
129 /* The (absolute) cycle in which the PS instruction is scheduled.
130 Same as SCHED_TIME (node). */
133 /* The next/prev PS_INSN in the same row. */
134 ps_insn_ptr next_in_row
,
137 /* The number of nodes in the same row that come after this node. */
141 /* Holds the partial schedule as an array of II rows. Each entry of the
142 array points to a linked list of PS_INSNs, which represents the
143 instructions that are scheduled for that row. */
144 struct partial_schedule
146 int ii
; /* Number of rows in the partial schedule. */
147 int history
; /* Threshold for conflict checking using DFA. */
149 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
152 /* The earliest absolute cycle of an insn in the partial schedule. */
155 /* The latest absolute cycle of an insn in the partial schedule. */
158 ddg_ptr g
; /* The DDG of the insns in the partial schedule. */
161 /* We use this to record all the register replacements we do in
162 the kernel so we can undo SMS if it is not profitable. */
163 struct undo_replace_buff_elem
168 struct undo_replace_buff_elem
*next
;
/* Forward declarations: partial-schedule manipulation (defined later).  */
static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
static void free_partial_schedule (partial_schedule_ptr);
static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
void print_partial_schedule (partial_schedule_ptr, FILE *);
static void verify_partial_schedule (partial_schedule_ptr, sbitmap);
static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
						ddg_node_ptr node, int cycle,
						sbitmap must_precede,
						sbitmap must_follow);
static void rotate_partial_schedule (partial_schedule_ptr, int);
void set_row_column_for_ps (partial_schedule_ptr);
static void ps_insert_empty_row (partial_schedule_ptr, int, sbitmap);
static int compute_split_row (sbitmap, int, int, int, ddg_node_ptr);

/* This page defines constants and structures for the modulo scheduling
   driver.  */
static int sms_order_nodes (ddg_ptr, int, int *, int *);
static void set_node_sched_params (ddg_ptr);
static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
static void permute_partial_schedule (partial_schedule_ptr, rtx);
/* NOTE(review): the remaining parameter lines of the next two prototypes
   are missing from this excerpt of the file.  */
static void generate_prolog_epilog (partial_schedule_ptr, struct loop *,
static void duplicate_insns_of_cycles (partial_schedule_ptr,
/* Accessors for the scheduling parameters of a DDG node X; X->aux.info
   points at the node's node_sched_params entry, which is installed by
   set_node_sched_params below.  */
#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
#define SCHED_FIRST_REG_MOVE(x) \
	(((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
#define SCHED_NREG_MOVES(x) \
	(((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
#define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
#define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
#define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
210 /* The scheduling parameters held for each node. */
211 typedef struct node_sched_params
213 int asap
; /* A lower-bound on the absolute scheduling cycle. */
214 int time
; /* The absolute scheduling cycle (time >= asap). */
216 /* The following field (first_reg_move) is a pointer to the first
217 register-move instruction added to handle the modulo-variable-expansion
218 of the register defined by this node. This register-move copies the
219 original register defined by the node. */
222 /* The number of register-move instructions added, immediately preceding
226 int row
; /* Holds time % ii. */
227 int stage
; /* Holds time / ii. */
229 /* The column of a node inside the ps. If nodes u, v are on the same row,
230 u will precede v if column (u) < column (v). */
232 } *node_sched_params_ptr
;
/* The following three functions are copied from the current scheduler
   code in order to use sched_analyze() for computing the dependencies.
   They are used when initializing the sched_info structure.  */

/* Name INSN for scheduler dumps as "i" followed by its UID.
   NOTE(review): TMP is presumably a static buffer declared on a line
   elided from this excerpt -- confirm against the full file.  */
sms_print_insn (const_rtx insn, int aligned ATTRIBUTE_UNUSED)
  sprintf (tmp, "i%4d", INSN_UID (insn));

/* Scheduler hook; all parameters are unused -- SMS evidently records no
   jump register dependencies here.  */
compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
			       regset cond_exec ATTRIBUTE_UNUSED,
			       regset used ATTRIBUTE_UNUSED,
			       regset set ATTRIBUTE_UNUSED)
255 static struct common_sched_info_def sms_common_sched_info
;
257 static struct sched_deps_info_def sms_sched_deps_info
=
259 compute_jump_reg_dependencies
,
260 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
,
265 static struct haifa_sched_info sms_sched_info
=
274 NULL
, /* insn_finishes_block_p */
/* Given HEAD and TAIL which are the first and last insns in a loop;
   return the register which controls the loop.  Return zero if it has
   more than one occurrence in the loop besides the control part or the
   do-loop pattern is not of the form we expect.  */
doloop_register_get (rtx head ATTRIBUTE_UNUSED, rtx tail ATTRIBUTE_UNUSED)
#ifdef HAVE_doloop_end
  rtx reg, condition, insn, first_insn_not_to_check;

  /* TODO: Free SMS's dependence on doloop_condition_get.  */
  condition = doloop_condition_get (tail);

  /* The count register is either the first operand of the condition,
     or the register inside a (PLUS reg const) first operand.  */
  if (REG_P (XEXP (condition, 0)))
    reg = XEXP (condition, 0);
  else if (GET_CODE (XEXP (condition, 0)) == PLUS
	   && REG_P (XEXP (XEXP (condition, 0), 0)))
    reg = XEXP (XEXP (condition, 0), 0);

  /* Check that the COUNT_REG has no other occurrences in the loop
     until the decrement.  We assume the control part consists of
     either a single (parallel) branch-on-count or a (non-parallel)
     branch immediately preceded by a single (decrement) insn.  */
  /* NOTE(review): the non-PARALLEL arm of this conditional is elided
     in this excerpt.  */
  first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail

  for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
    if (reg_mentioned_p (reg, insn))
	/* Found a stray use of the count register: dump and reject.  */
	fprintf (dump_file, "SMS count_reg found ");
	print_rtl_single (dump_file, reg);
	fprintf (dump_file, " outside control in insn:\n");
	print_rtl_single (dump_file, insn);
/* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
   that the number of iterations is a compile-time constant.  If so,
   return the rtx that sets COUNT_REG to a constant, and set COUNT to
   this constant.  Otherwise return 0.  */
const_iteration_count (rtx count_reg, basic_block pre_header,
		       HOST_WIDEST_INT * count)
  get_ebb_head_tail (pre_header, pre_header, &head, &tail);

  /* Scan the pre-header backwards so the *last* def of COUNT_REG
     before the loop is the one examined.  */
  for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
    if (NONDEBUG_INSN_P (insn) && single_set (insn) &&
	rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
	rtx pat = single_set (insn);

	if (CONST_INT_P (SET_SRC (pat)))
	    *count = INTVAL (SET_SRC (pat));
/* A very simple resource-based lower bound on the initiation interval.
   ??? Improve the accuracy of this bound by considering the
   utilization of various units.  */
  /* Prefer the target's own estimate when the hook is provided.  */
  if (targetm.sched.sms_res_mii)
    return targetm.sched.sms_res_mii (g);

  /* Fallback: spread the real (non-debug) insns over the issue slots.  */
  return ((g->num_nodes - g->num_debug) / issue_rate);

/* Points to the array that contains the sched data for each node.  */
static node_sched_params_ptr node_sched_params;
/* Allocate sched_params for each node and initialize it.  Assumes that
   the aux field of each node contain the asap bound (computed earlier),
   and copies it into the sched_params field.  */
set_node_sched_params (ddg_ptr g)
  /* Allocate for each node in the DDG a place to hold the "sched_data".  */
  /* Initialize ASAP/ALAP/HIGHT to zero (xcalloc zero-fills).  */
  node_sched_params = (node_sched_params_ptr)
		       xcalloc (g->num_nodes,
				sizeof (struct node_sched_params));

  /* Set the pointer of the general data of the node to point to the
     appropriate sched_params structure.  */
  for (i = 0; i < g->num_nodes; i++)
      /* Watch out for aliasing problems?  */
      node_sched_params[i].asap = g->nodes[i].aux.count;
      g->nodes[i].aux.info = &node_sched_params[i];
/* Dump to FILE the scheduling parameters (asap, time, reg-moves) of the
   first NUM_NODES nodes of G, walking each node's chain of reg-move
   insns backwards via PREV_INSN.  */
print_node_sched_params (FILE *file, int num_nodes, ddg_ptr g)
  for (i = 0; i < num_nodes; i++)
      node_sched_params_ptr nsp = &node_sched_params[i];
      rtx reg_move = nsp->first_reg_move;

      fprintf (file, "Node = %d; INSN = %d\n", i,
	       (INSN_UID (g->nodes[i].insn)));
      fprintf (file, " asap = %d:\n", nsp->asap);
      fprintf (file, " time = %d:\n", nsp->time);
      fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
      for (j = 0; j < nsp->nreg_moves; j++)
	  fprintf (file, " reg_move = ");
	  print_rtl_single (file, reg_move);
	  /* Reg-moves were emitted in front of the insn, so walk back.  */
	  reg_move = PREV_INSN (reg_move);
   Breaking intra-loop register anti-dependences:
   Each intra-loop register anti-dependence implies a cross-iteration true
   dependence of distance 1.  Therefore, we can remove such false dependencies
   and figure out if the partial schedule broke them by checking if (for a
   true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
   if so generate a register move.  The number of such moves is equal to:
              SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
   nreg_moves = ----------------------------------- + 1 - {   dependence.

/* Generate the register moves needed for modulo variable expansion of
   the kernel in PS; return the list of replacements done so they can be
   undone if SMS turns out unprofitable.  If RESCAN, re-run DF analysis
   on each modified insn.  */
static struct undo_replace_buff_elem *
generate_reg_moves (partial_schedule_ptr ps, bool rescan)
  struct undo_replace_buff_elem *reg_move_replaces = NULL;

  for (i = 0; i < g->num_nodes; i++)
      ddg_node_ptr u = &g->nodes[i];
      int nreg_moves = 0, i_reg_move;
      sbitmap *uses_of_defs;
      rtx prev_reg, old_reg;

      /* Compute the number of reg_moves needed for u, by looking at life
	 ranges started at u (excluding self-loops).  */
      for (e = u->out; e; e = e->next_out)
	if (e->type == TRUE_DEP && e->dest != e->src)
	    int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;

	    /* Distance-1 deps cross an iteration boundary: add II.  */
	    if (e->distance == 1)
	      nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;

	    /* If dest precedes src in the schedule of the kernel, then dest
	       will read before src writes and we can save one reg_copy.  */
	    if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
		&& SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))

	    nreg_moves = MAX (nreg_moves, nreg_moves4e);

      /* Every use of the register defined by node may require a different
	 copy of this register, depending on the time the use is scheduled.
	 Set a bitmap vector, telling which nodes use each copy of this
	 register.  */
      uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
      sbitmap_vector_zero (uses_of_defs, nreg_moves);
      for (e = u->out; e; e = e->next_out)
	if (e->type == TRUE_DEP && e->dest != e->src)
	    int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;

	    if (e->distance == 1)
	      dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;

	    if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
		&& SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))

	      SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);

      /* Now generate the reg_moves, attaching relevant uses to them.  */
      SCHED_NREG_MOVES (u) = nreg_moves;
      old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
      /* Insert the reg-moves right before the notes which precede
	 the insn they relates to.  */
      last_reg_move = u->first_note;

      for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
	  unsigned int i_use = 0;
	  rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
	  rtx reg_move = gen_move_insn (new_reg, prev_reg);
	  sbitmap_iterator sbi;

	  add_insn_before (reg_move, last_reg_move, NULL);
	  last_reg_move = reg_move;

	  if (!SCHED_FIRST_REG_MOVE (u))
	    SCHED_FIRST_REG_MOVE (u) = reg_move;

	  /* Record every replacement so it can be undone later.  */
	  EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
	      struct undo_replace_buff_elem *rep;

	      rep = (struct undo_replace_buff_elem *)
		    xcalloc (1, sizeof (struct undo_replace_buff_elem));
	      rep->insn = g->nodes[i_use].insn;
	      rep->orig_reg = old_reg;
	      rep->new_reg = new_reg;

	      /* Push REP onto the front of the replacement list.  */
	      if (! reg_move_replaces)
		reg_move_replaces = rep;
		  rep->next = reg_move_replaces;
		  reg_move_replaces = rep;

	      replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
		df_insn_rescan (g->nodes[i_use].insn);

      sbitmap_vector_free (uses_of_defs);
  return reg_move_replaces;
/* Free memory allocated for the undo buffer.  */
free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
  while (reg_move_replaces)
      struct undo_replace_buff_elem *rep = reg_move_replaces;

      /* Advance before REP is released (its free is on a line elided
	 from this excerpt).  */
      reg_move_replaces = reg_move_replaces->next;
/* Bump the SCHED_TIMEs of all nodes to start from zero.  Set the values
   of SCHED_ROW and SCHED_STAGE.  */
normalize_sched_times (partial_schedule_ptr ps)
  int amount = PS_MIN_CYCLE (ps);
  ps_insn_ptr crr_insn;

  /* Visit every scheduled insn, row by row.  */
  for (row = 0; row < ii; row++)
    for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
	ddg_node_ptr u = crr_insn->node;
	int normalized_time = SCHED_TIME (u) - amount;

	  /* NOTE(review): the remaining arguments of this dump call are
	     elided in this excerpt.  */
	  fprintf (dump_file, "crr_insn->node=%d, crr_insn->cycle=%d,\
		   min_cycle=%d\n", crr_insn->node->cuid, SCHED_TIME
	gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
	gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
	SCHED_TIME (u) = normalized_time;
	/* Row is the position within an iteration, stage the iteration
	   index, for the zero-based time.  */
	SCHED_ROW (u) = normalized_time % ii;
	SCHED_STAGE (u) = normalized_time / ii;
/* Set SCHED_COLUMN of each node according to its position in PS
   (0, 1, 2, ... along each row's linked list).  */
set_columns_for_ps (partial_schedule_ptr ps)
  for (row = 0; row < ps->ii; row++)
      ps_insn_ptr cur_insn = ps->rows[row];

      for (; cur_insn; cur_insn = cur_insn->next_in_row)
	SCHED_COLUMN (cur_insn->node) = column++;
/* Permute the insns according to their order in PS, from row 0 to
   row ii-1, and position them right before LAST.  This schedules
   the insns of the loop kernel.  */
permute_partial_schedule (partial_schedule_ptr ps, rtx last)
  for (row = 0; row < ii; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      /* Skip an insn already in place just before LAST.  */
      if (PREV_INSN (last) != ps_ij->node->insn)
	/* NOTE(review): the trailing arguments of this call are elided
	   in this excerpt.  */
	reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
/* Emit (into the current sequence) copies of the PS insns whose stage is
   in [FROM_STAGE, TO_STAGE], together with the required reg-move copies;
   FOR_PROLOG selects prolog vs. epilog reg-move counting.  Insns using
   COUNT_REG (the control part) are not duplicated.  */
duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
			   int to_stage, int for_prolog, rtx count_reg)
  for (row = 0; row < ps->ii; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
	ddg_node_ptr u_node = ps_ij->node;
	rtx reg_move = NULL_RTX;

	/* Do not duplicate any insn which refers to count_reg as it
	   belongs to the control part.
	   TODO: This should be done by analyzing the control part of
	   the loop.  */
	if (reg_mentioned_p (count_reg, u_node->insn))

	    /* SCHED_STAGE (u_node) >= from_stage == 0.  Generate increasing
	       number of reg_moves starting with the second occurrence of
	       u_node, which is generated if its SCHED_STAGE <= to_stage.  */
	    i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;

	    /* Stay within [0, nreg_moves].  */
	    i_reg_moves = MAX (i_reg_moves, 0);
	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

	    /* The reg_moves start from the *first* reg_move backwards.  */
	      reg_move = SCHED_FIRST_REG_MOVE (u_node);
	      for (j = 1; j < i_reg_moves; j++)
		reg_move = PREV_INSN (reg_move);
	else /* It's for the epilog.  */
	    /* SCHED_STAGE (u_node) <= to_stage.  Generate all reg_moves,
	       starting to decrease one stage after u_node no longer occurs;
	       that is, generate all reg_moves until
	       SCHED_STAGE (u_node) == from_stage - 1.  */
	    i_reg_moves = SCHED_NREG_MOVES (u_node)
			  - (from_stage - SCHED_STAGE (u_node) - 1);
	    i_reg_moves = MAX (i_reg_moves, 0);
	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

	    /* The reg_moves start from the *last* reg_move forwards.  */
	      reg_move = SCHED_FIRST_REG_MOVE (u_node);
	      for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
		reg_move = PREV_INSN (reg_move);

	/* Emit the selected reg-move copies, then the insn itself when
	   its stage falls in the requested range.  */
	for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
	  emit_insn (copy_rtx (PATTERN (reg_move)));
	if (SCHED_STAGE (u_node) >= from_stage
	    && SCHED_STAGE (u_node) <= to_stage)
	  duplicate_insn_chain (u_node->first_note, u_node->insn);
/* Generate the instructions (including reg_moves) for prolog & epilog.  */
generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
			rtx count_reg, rtx count_init)
  int last_stage = PS_STAGE_COUNT (ps) - 1;

  /* Generate the prolog, inserting its insns on the loop-entry edge.  */

      /* Generate instructions at the beginning of the prolog to
         adjust the loop count by STAGE_COUNT.  If loop count is constant
         (count_init), this constant is adjusted by STAGE_COUNT in
         generate_prolog_epilog function.  */
      rtx sub_reg = NULL_RTX;

      /* count_reg -= last_stage, preferably in place.  */
      sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS,
				     count_reg, GEN_INT (last_stage),
				     count_reg, 1, OPTAB_DIRECT);
      gcc_assert (REG_P (sub_reg));
      if (REGNO (sub_reg) != REGNO (count_reg))
	emit_move_insn (count_reg, sub_reg);

  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, 0, i, 1, count_reg);

  /* Put the prolog on the entry edge.  */
  e = loop_preheader_edge (loop);
  split_edge_and_insert (e, get_insns ());

  /* Generate the epilog, inserting its insns on the loop-exit edge.  */

  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg);

  /* Put the epilogue on the exit edge.  */
  gcc_assert (single_exit (loop));
  e = single_exit (loop);
  split_edge_and_insert (e, get_insns ());
/* Return true if all the BBs of the loop are empty except the
   loop header.  */
loop_single_full_bb_p (struct loop *loop)
  basic_block *bbs = get_loop_body (loop);

  for (i = 0; i < loop->num_nodes; i++)
      bool empty_bb = true;

      /* The header itself is allowed to hold the loop body.  */
      if (bbs[i] == loop->header)

      /* Make sure that basic blocks other than the header
         have only notes labels or jumps.  */
      get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
      for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
	  if (NOTE_P (head) || LABEL_P (head)
	      || (INSN_P (head) && (DEBUG_INSN_P (head) || JUMP_P (head))))
/* A simple loop from SMS point of view; it is a loop that is composed of
   either a single basic block or two BBs - a header and a latch.
   NOTE: the LOOP argument is not parenthesized in the expansion; pass a
   plain loop pointer, not a compound expression.  */
#define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 )			    \
				  && (EDGE_COUNT (loop->latch->preds) == 1) \
				  && (EDGE_COUNT (loop->latch->succs) == 1))
/* Return true if the loop is in its canonical form and false if not.
   i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit.  */
loop_canon_p (struct loop *loop)
  /* Reject nested loops and the (unreal) outermost loop.  */
  if (loop->inner || !loop_outer (loop))
	fprintf (dump_file, "SMS loop inner or !loop_outer\n");

  if (!single_exit (loop))
	  rtx insn = BB_END (loop->header);

	  fprintf (dump_file, "SMS loop many exits ");
	  fprintf (dump_file, " %s %d (file, line)\n",
		   insn_file (insn), insn_line (insn));

  if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
	  rtx insn = BB_END (loop->header);

	  fprintf (dump_file, "SMS loop many BBs. ");
	  fprintf (dump_file, " %s %d (file, line)\n",
		   insn_file (insn), insn_line (insn));
/* If there are more than one entry for the loop,
   make it one by splitting the first entry edge and
   redirecting the others to the new BB.  */
canon_loop (struct loop *loop)
  /* Avoid annoying special cases of edges going to the exit
     block.  */
  FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
    if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))

  /* Ensure a distinct, single-successor latch block.  */
  if (loop->latch == loop->header
      || EDGE_COUNT (loop->latch->succs) > 1)
      FOR_EACH_EDGE (e, i, loop->header->preds)
	if (e->src == loop->latch)
/* Point the global scheduler hooks at the SMS-specific descriptors,
   starting from the Haifa defaults.  */
setup_sched_infos (void)
  memcpy (&sms_common_sched_info, &haifa_common_sched_info,
	  sizeof (sms_common_sched_info));
  sms_common_sched_info.sched_pass_id = SCHED_SMS_PASS;
  common_sched_info = &sms_common_sched_info;

  sched_deps_info = &sms_sched_deps_info;
  current_sched_info = &sms_sched_info;
/* Probability in % that the sms-ed loop rolls enough so that optimized
   version may be entered.  Just a guess.  */
#define PROB_SMS_ENOUGH_ITERATIONS 80

/* Used to calculate the upper bound of ii (maxii = MAXII_FACTOR * mii).  */
#define MAXII_FACTOR 2
882 /* Main entry point, perform SMS scheduling on the loops of the function
883 that consist of single basic blocks. */
892 partial_schedule_ptr ps
;
893 basic_block bb
= NULL
;
895 basic_block condition_bb
= NULL
;
897 gcov_type trip_count
= 0;
899 loop_optimizer_init (LOOPS_HAVE_PREHEADERS
900 | LOOPS_HAVE_RECORDED_EXITS
);
901 if (number_of_loops () <= 1)
903 loop_optimizer_finalize ();
904 return; /* There are no loops to schedule. */
907 /* Initialize issue_rate. */
908 if (targetm
.sched
.issue_rate
)
910 int temp
= reload_completed
;
912 reload_completed
= 1;
913 issue_rate
= targetm
.sched
.issue_rate ();
914 reload_completed
= temp
;
919 /* Initialize the scheduler. */
920 setup_sched_infos ();
923 /* Allocate memory to hold the DDG array one entry for each loop.
924 We use loop->num as index into this array. */
925 g_arr
= XCNEWVEC (ddg_ptr
, number_of_loops ());
929 fprintf (dump_file
, "\n\nSMS analysis phase\n");
930 fprintf (dump_file
, "===================\n\n");
933 /* Build DDGs for all the relevant loops and hold them in G_ARR
934 indexed by the loop index. */
935 FOR_EACH_LOOP (li
, loop
, 0)
941 if (dbg_cnt (sms_sched_loop
) == false)
944 fprintf (dump_file
, "SMS reached max limit... \n");
951 rtx insn
= BB_END (loop
->header
);
953 fprintf (dump_file
, "SMS loop num: %d, file: %s, line: %d\n",
954 loop
->num
, insn_file (insn
), insn_line (insn
));
958 if (! loop_canon_p (loop
))
961 if (! loop_single_full_bb_p (loop
))
964 fprintf (dump_file
, "SMS not loop_single_full_bb_p\n");
970 get_ebb_head_tail (bb
, bb
, &head
, &tail
);
971 latch_edge
= loop_latch_edge (loop
);
972 gcc_assert (single_exit (loop
));
973 if (single_exit (loop
)->count
)
974 trip_count
= latch_edge
->count
/ single_exit (loop
)->count
;
976 /* Perform SMS only on loops that their average count is above threshold. */
978 if ( latch_edge
->count
979 && (latch_edge
->count
< single_exit (loop
)->count
* SMS_LOOP_AVERAGE_COUNT_THRESHOLD
))
983 fprintf (dump_file
, " %s %d (file, line)\n",
984 insn_file (tail
), insn_line (tail
));
985 fprintf (dump_file
, "SMS single-bb-loop\n");
986 if (profile_info
&& flag_branch_probabilities
)
988 fprintf (dump_file
, "SMS loop-count ");
989 fprintf (dump_file
, HOST_WIDEST_INT_PRINT_DEC
,
990 (HOST_WIDEST_INT
) bb
->count
);
991 fprintf (dump_file
, "\n");
992 fprintf (dump_file
, "SMS trip-count ");
993 fprintf (dump_file
, HOST_WIDEST_INT_PRINT_DEC
,
994 (HOST_WIDEST_INT
) trip_count
);
995 fprintf (dump_file
, "\n");
996 fprintf (dump_file
, "SMS profile-sum-max ");
997 fprintf (dump_file
, HOST_WIDEST_INT_PRINT_DEC
,
998 (HOST_WIDEST_INT
) profile_info
->sum_max
);
999 fprintf (dump_file
, "\n");
1005 /* Make sure this is a doloop. */
1006 if ( !(count_reg
= doloop_register_get (head
, tail
)))
1009 fprintf (dump_file
, "SMS doloop_register_get failed\n");
1013 /* Don't handle BBs with calls or barriers, or !single_set insns,
1014 or auto-increment insns (to avoid creating invalid reg-moves
1015 for the auto-increment insns).
1016 ??? Should handle auto-increment insns.
1017 ??? Should handle insns defining subregs. */
1018 for (insn
= head
; insn
!= NEXT_INSN (tail
); insn
= NEXT_INSN (insn
))
1024 || (NONDEBUG_INSN_P (insn
) && !JUMP_P (insn
)
1025 && !single_set (insn
) && GET_CODE (PATTERN (insn
)) != USE
)
1026 || (FIND_REG_INC_NOTE (insn
, NULL_RTX
) != 0)
1027 || (INSN_P (insn
) && (set
= single_set (insn
))
1028 && GET_CODE (SET_DEST (set
)) == SUBREG
))
1032 if (insn
!= NEXT_INSN (tail
))
1037 fprintf (dump_file
, "SMS loop-with-call\n");
1038 else if (BARRIER_P (insn
))
1039 fprintf (dump_file
, "SMS loop-with-barrier\n");
1040 else if (FIND_REG_INC_NOTE (insn
, NULL_RTX
) != 0)
1041 fprintf (dump_file
, "SMS reg inc\n");
1042 else if ((NONDEBUG_INSN_P (insn
) && !JUMP_P (insn
)
1043 && !single_set (insn
) && GET_CODE (PATTERN (insn
)) != USE
))
1044 fprintf (dump_file
, "SMS loop-with-not-single-set\n");
1046 fprintf (dump_file
, "SMS loop with subreg in lhs\n");
1047 print_rtl_single (dump_file
, insn
);
1053 if (! (g
= create_ddg (bb
, 0)))
1056 fprintf (dump_file
, "SMS create_ddg failed\n");
1060 g_arr
[loop
->num
] = g
;
1062 fprintf (dump_file
, "...OK\n");
1067 fprintf (dump_file
, "\nSMS transformation phase\n");
1068 fprintf (dump_file
, "=========================\n\n");
1071 /* We don't want to perform SMS on new loops - created by versioning. */
1072 FOR_EACH_LOOP (li
, loop
, 0)
1075 rtx count_reg
, count_init
;
1077 unsigned stage_count
= 0;
1078 HOST_WIDEST_INT loop_count
= 0;
1080 if (! (g
= g_arr
[loop
->num
]))
1085 rtx insn
= BB_END (loop
->header
);
1087 fprintf (dump_file
, "SMS loop num: %d, file: %s, line: %d\n",
1088 loop
->num
, insn_file (insn
), insn_line (insn
));
1090 print_ddg (dump_file
, g
);
1093 get_ebb_head_tail (loop
->header
, loop
->header
, &head
, &tail
);
1095 latch_edge
= loop_latch_edge (loop
);
1096 gcc_assert (single_exit (loop
));
1097 if (single_exit (loop
)->count
)
1098 trip_count
= latch_edge
->count
/ single_exit (loop
)->count
;
1102 fprintf (dump_file
, " %s %d (file, line)\n",
1103 insn_file (tail
), insn_line (tail
));
1104 fprintf (dump_file
, "SMS single-bb-loop\n");
1105 if (profile_info
&& flag_branch_probabilities
)
1107 fprintf (dump_file
, "SMS loop-count ");
1108 fprintf (dump_file
, HOST_WIDEST_INT_PRINT_DEC
,
1109 (HOST_WIDEST_INT
) bb
->count
);
1110 fprintf (dump_file
, "\n");
1111 fprintf (dump_file
, "SMS profile-sum-max ");
1112 fprintf (dump_file
, HOST_WIDEST_INT_PRINT_DEC
,
1113 (HOST_WIDEST_INT
) profile_info
->sum_max
);
1114 fprintf (dump_file
, "\n");
1116 fprintf (dump_file
, "SMS doloop\n");
1117 fprintf (dump_file
, "SMS built-ddg %d\n", g
->num_nodes
);
1118 fprintf (dump_file
, "SMS num-loads %d\n", g
->num_loads
);
1119 fprintf (dump_file
, "SMS num-stores %d\n", g
->num_stores
);
1123 /* In case of th loop have doloop register it gets special
1125 count_init
= NULL_RTX
;
1126 if ((count_reg
= doloop_register_get (head
, tail
)))
1128 basic_block pre_header
;
1130 pre_header
= loop_preheader_edge (loop
)->src
;
1131 count_init
= const_iteration_count (count_reg
, pre_header
,
1134 gcc_assert (count_reg
);
1136 if (dump_file
&& count_init
)
1138 fprintf (dump_file
, "SMS const-doloop ");
1139 fprintf (dump_file
, HOST_WIDEST_INT_PRINT_DEC
,
1141 fprintf (dump_file
, "\n");
1144 node_order
= XNEWVEC (int, g
->num_nodes
);
1146 mii
= 1; /* Need to pass some estimate of mii. */
1147 rec_mii
= sms_order_nodes (g
, mii
, node_order
, &max_asap
);
1148 mii
= MAX (res_MII (g
), rec_mii
);
1149 maxii
= MAX (max_asap
, MAXII_FACTOR
* mii
);
1152 fprintf (dump_file
, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1153 rec_mii
, mii
, maxii
);
1155 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1157 set_node_sched_params (g
);
1159 ps
= sms_schedule_by_order (g
, mii
, maxii
, node_order
);
1162 stage_count
= PS_STAGE_COUNT (ps
);
1163 gcc_assert(stage_count
>= 1);
1166 /* Stage count of 1 means that there is no interleaving between
1167 iterations, let the scheduling passes do the job. */
1168 if (stage_count
<= 1
1169 || (count_init
&& (loop_count
<= stage_count
))
1170 || (flag_branch_probabilities
&& (trip_count
<= stage_count
)))
1174 fprintf (dump_file
, "SMS failed... \n");
1175 fprintf (dump_file
, "SMS sched-failed (stage-count=%d, loop-count=", stage_count
);
1176 fprintf (dump_file
, HOST_WIDEST_INT_PRINT_DEC
, loop_count
);
1177 fprintf (dump_file
, ", trip-count=");
1178 fprintf (dump_file
, HOST_WIDEST_INT_PRINT_DEC
, trip_count
);
1179 fprintf (dump_file
, ")\n");
1185 struct undo_replace_buff_elem
*reg_move_replaces
;
1190 "SMS succeeded %d %d (with ii, sc)\n", ps
->ii
,
1192 print_partial_schedule (ps
, dump_file
);
1194 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1195 g
->closing_branch
->cuid
, PS_MIN_CYCLE (ps
) - 1);
1198 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1199 the closing_branch was scheduled and should appear in the last (ii-1)
1200 row. Otherwise, we are free to schedule the branch, and we let nodes
1201 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1202 row; this should reduce stage_count to minimum.
1203 TODO: Revisit the issue of scheduling the insns of the
1204 control part relative to the branch when the control part
1205 has more than one insn. */
1206 normalize_sched_times (ps
);
1207 rotate_partial_schedule (ps
, PS_MIN_CYCLE (ps
));
1208 set_columns_for_ps (ps
);
1212 /* In case the BCT count is not known, do loop-versioning.  */
1213 if (count_reg
&& ! count_init
)
1215 rtx comp_rtx
= gen_rtx_fmt_ee (GT
, VOIDmode
, count_reg
,
1216 GEN_INT(stage_count
));
1217 unsigned prob
= (PROB_SMS_ENOUGH_ITERATIONS
1218 * REG_BR_PROB_BASE
) / 100;
1220 loop_version (loop
, comp_rtx
, &condition_bb
,
1221 prob
, prob
, REG_BR_PROB_BASE
- prob
,
1225 /* Set new iteration count of loop kernel. */
1226 if (count_reg
&& count_init
)
1227 SET_SRC (single_set (count_init
)) = GEN_INT (loop_count
1230 /* Now apply the scheduled kernel to the RTL of the loop. */
1231 permute_partial_schedule (ps
, g
->closing_branch
->first_note
);
1233 /* Mark this loop as software pipelined so the later
1234 scheduling passes doesn't touch it. */
1235 if (! flag_resched_modulo_sched
)
1236 g
->bb
->flags
|= BB_DISABLE_SCHEDULE
;
1237 /* The life-info is not valid any more. */
1238 df_set_bb_dirty (g
->bb
);
1240 reg_move_replaces
= generate_reg_moves (ps
, true);
1242 print_node_sched_params (dump_file
, g
->num_nodes
, g
);
1243 /* Generate prolog and epilog. */
1244 generate_prolog_epilog (ps
, loop
, count_reg
, count_init
);
1246 free_undo_replace_buff (reg_move_replaces
);
1249 free_partial_schedule (ps
);
1250 free (node_sched_params
);
1257 /* Release scheduler data, needed until now because of DFA. */
1258 haifa_sched_finish ();
1259 loop_optimizer_finalize ();
1262 /* The SMS scheduling algorithm itself
1263 -----------------------------------
1264 Input: 'O' an ordered list of insns of a loop.
1265 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1267 'Q' is the empty Set
1268 'PS' is the partial schedule; it holds the currently scheduled nodes with
1270 'PSP' previously scheduled predecessors.
1271 'PSS' previously scheduled successors.
1272 't(u)' the cycle where u is scheduled.
1273 'l(u)' is the latency of u.
1274 'd(v,u)' is the dependence distance from v to u.
1275 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1276 the node ordering phase.
1277 'check_hardware_resources_conflicts(u, PS, c)'
1278 run a trace around cycle/slot through DFA model
1279 to check resource conflicts involving instruction u
1280 at cycle c given the partial schedule PS.
1281 'add_to_partial_schedule_at_time(u, PS, c)'
1282 Add the node/instruction u to the partial schedule
1284 'calculate_register_pressure(PS)'
1285 Given a schedule of instructions, calculate the register
1286 pressure it implies. One implementation could be the
1287 maximum number of overlapping live ranges.
1288 'maxRP' The maximum allowed register pressure, it is usually derived from the number of
1289 registers available in the hardware.
1293 3. for each node u in O in pre-computed order
1294 4. if (PSP(u) != Q && PSS(u) == Q) then
1295 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over every v in PSP(u).
1296 6. start = Early_start; end = Early_start + II - 1; step = 1
1297 11. else if (PSP(u) == Q && PSS(u) != Q) then
1298 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over every v in PSS(u).
1299 13. start = Late_start; end = Late_start - II + 1; step = -1
1300 14. else if (PSP(u) != Q && PSS(u) != Q) then
1301 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over every v in PSP(u).
1302 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over every v in PSS(u).
1303 17. start = Early_start;
1304 18. end = min(Early_start + II - 1 , Late_start);
1306 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1307 21. start = ASAP(u); end = start + II - 1; step = 1
1311 24. for (c = start ; c != end ; c += step)
1312 25. if check_hardware_resources_conflicts(u, PS, c) then
1313 26. add_to_partial_schedule_at_time(u, PS, c)
1318 31. if (success == false) then
1320 33. if (II > maxII) then
1321 34. finish - failed to schedule
1326 39. if (calculate_register_pressure(PS) > maxRP) then
1329 42. compute epilogue & prologue
1330 43. finish - succeeded to schedule
1333 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1334 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1335 set to 0 to save compile time. */
1336 #define DFA_HISTORY SMS_DFA_HISTORY
1338 /* A threshold for the number of repeated unsuccessful attempts to insert
1339 an empty row, before we flush the partial schedule and start over. */
1340 #define MAX_SPLIT_NUM 10
1341 /* Given the partial schedule PS, this function calculates and returns the
1342 cycles in which we can schedule the node with the given index I.
1343 NOTE: Here we do the backtracking in SMS, in some special cases. We have
1344 noticed that there are several cases in which we fail to SMS the loop
1345 because the sched window of a node is empty due to tight data-deps. In
1346 such cases we want to unschedule some of the predecessors/successors
1347 until we get non-empty scheduling window. It returns -1 if the
1348 scheduling window is empty and zero otherwise. */
/* NOTE(review): this chunk is a fragmentary extraction -- the embedded
   original line numbers jump (e.g. 1361 -> 1365), so statements such as
   braces and dump_file guards are missing between fragments.  Comments
   below describe only the code visible here.  */
/* Compute a scheduling window (start, end, step) for the I-th node of
   NODES_ORDER, given the nodes already scheduled in SCHED_NODES and the
   current initiation interval II.  Results go through START_P, STEP_P
   and END_P; per the header comment, returns -1 on an empty window.  */
1351 get_sched_window (partial_schedule_ptr ps
, int *nodes_order
, int i
,
1352 sbitmap sched_nodes
, int ii
, int *start_p
, int *step_p
, int *end_p
)
1354 int start
, step
, end
;
/* U is the candidate node taken from the precomputed order; U_NODE is
   its DDG node.  */
1356 int u
= nodes_order
[i
];
1357 ddg_node_ptr u_node
= &ps
->g
->nodes
[u
];
1358 sbitmap psp
= sbitmap_alloc (ps
->g
->num_nodes
);
1359 sbitmap pss
= sbitmap_alloc (ps
->g
->num_nodes
);
1360 sbitmap u_node_preds
= NODE_PREDECESSORS (u_node
);
1361 sbitmap u_node_succs
= NODE_SUCCESSORS (u_node
);
1365 /* 1. compute sched window for u (start, end, step). */
/* PSP/PSS = predecessors/successors of U that are already scheduled.  */
1368 psp_not_empty
= sbitmap_a_and_b_cg (psp
, u_node_preds
, sched_nodes
);
1369 pss_not_empty
= sbitmap_a_and_b_cg (pss
, u_node_succs
, sched_nodes
);
/* Case 1: only predecessors are scheduled -- derive an early start from
   the incoming edges and scan the window forward.  */
1371 if (psp_not_empty
&& !pss_not_empty
)
1373 int early_start
= INT_MIN
;
1376 for (e
= u_node
->in
; e
!= 0; e
= e
->next_in
)
1378 ddg_node_ptr v_node
= e
->src
;
1382 fprintf (dump_file
, "\nProcessing edge: ");
1383 print_ddg_edge (dump_file
, e
);
1385 "\nScheduling %d (%d) in psp_not_empty,"
1386 " checking p %d (%d): ", u_node
->cuid
,
1387 INSN_UID (u_node
->insn
), v_node
->cuid
, INSN_UID
1391 if (TEST_BIT (sched_nodes
, v_node
->cuid
))
1393 int p_st
= SCHED_TIME (v_node
);
/* early_start = max over scheduled preds of t(v) + l(v) - d(v,u)*II.  */
1396 MAX (early_start
, p_st
+ e
->latency
- (e
->distance
* ii
));
1400 "pred st = %d; early_start = %d; latency: %d",
1401 p_st
, early_start
, e
->latency
);
/* Memory dependences additionally clip END.  */
1403 if (e
->data_type
== MEM_DEP
)
1404 end
= MIN (end
, SCHED_TIME (v_node
) + ii
- 1);
1407 fprintf (dump_file
, "the node is not scheduled\n");
1409 start
= early_start
;
1410 end
= MIN (end
, early_start
+ ii
);
1411 /* Schedule the node close to its predecessors. */
1416 "\nScheduling %d (%d) in a window (%d..%d) with step %d\n",
1417 u_node
->cuid
, INSN_UID (u_node
->insn
), start
, end
, step
);
/* Case 2: only successors are scheduled -- derive a late start from the
   outgoing edges and scan the window backward.  */
1420 else if (!psp_not_empty
&& pss_not_empty
)
1422 int late_start
= INT_MAX
;
1425 for (e
= u_node
->out
; e
!= 0; e
= e
->next_out
)
1427 ddg_node_ptr v_node
= e
->dest
;
1431 fprintf (dump_file
, "\nProcessing edge:");
1432 print_ddg_edge (dump_file
, e
);
1434 "\nScheduling %d (%d) in pss_not_empty,"
1435 " checking s %d (%d): ", u_node
->cuid
,
1436 INSN_UID (u_node
->insn
), v_node
->cuid
, INSN_UID
1440 if (TEST_BIT (sched_nodes
, v_node
->cuid
))
1442 int s_st
= SCHED_TIME (v_node
);
/* late_start = min over scheduled succs of t(v) - l(v) + d(v,u)*II.  */
1444 late_start
= MIN (late_start
,
1445 s_st
- e
->latency
+ (e
->distance
* ii
));
1449 "succ st = %d; late_start = %d; latency = %d",
1450 s_st
, late_start
, e
->latency
);
1452 if (e
->data_type
== MEM_DEP
)
1453 end
= MAX (end
, SCHED_TIME (v_node
) - ii
+ 1);
1455 fprintf (dump_file
, "end = %d\n", end
);
1459 fprintf (dump_file
, "the node is not scheduled\n");
1463 end
= MAX (end
, late_start
- ii
);
1464 /* Schedule the node close to its successors. */
1469 "\nScheduling %d (%d) in a window (%d..%d) with step %d\n",
1470 u_node
->cuid
, INSN_UID (u_node
->insn
), start
, end
, step
);
/* Case 3: both predecessors and successors are scheduled -- the window
   is bounded on both sides by early_start and late_start.  */
1474 else if (psp_not_empty
&& pss_not_empty
)
1476 int early_start
= INT_MIN
;
1477 int late_start
= INT_MAX
;
1478 int count_preds
= 0;
1479 int count_succs
= 0;
1483 for (e
= u_node
->in
; e
!= 0; e
= e
->next_in
)
1485 ddg_node_ptr v_node
= e
->src
;
1489 fprintf (dump_file
, "\nProcessing edge:");
1490 print_ddg_edge (dump_file
, e
);
1492 "\nScheduling %d (%d) in psp_pss_not_empty,"
1493 " checking p %d (%d): ", u_node
->cuid
, INSN_UID
1494 (u_node
->insn
), v_node
->cuid
, INSN_UID
1498 if (TEST_BIT (sched_nodes
, v_node
->cuid
))
1500 int p_st
= SCHED_TIME (v_node
);
1502 early_start
= MAX (early_start
,
1504 - (e
->distance
* ii
));
1508 "pred st = %d; early_start = %d; latency = %d",
1509 p_st
, early_start
, e
->latency
);
1511 if (e
->type
== TRUE_DEP
&& e
->data_type
== REG_DEP
)
1514 if (e
->data_type
== MEM_DEP
)
1515 end
= MIN (end
, SCHED_TIME (v_node
) + ii
- 1);
1518 fprintf (dump_file
, "the node is not scheduled\n");
1521 for (e
= u_node
->out
; e
!= 0; e
= e
->next_out
)
1523 ddg_node_ptr v_node
= e
->dest
;
1527 fprintf (dump_file
, "\nProcessing edge:");
1528 print_ddg_edge (dump_file
, e
);
1530 "\nScheduling %d (%d) in psp_pss_not_empty,"
1531 " checking s %d (%d): ", u_node
->cuid
, INSN_UID
1532 (u_node
->insn
), v_node
->cuid
, INSN_UID
1536 if (TEST_BIT (sched_nodes
, v_node
->cuid
))
1538 int s_st
= SCHED_TIME (v_node
);
1540 late_start
= MIN (late_start
,
1542 + (e
->distance
* ii
));
1546 "succ st = %d; late_start = %d; latency = %d",
1547 s_st
, late_start
, e
->latency
);
1549 if (e
->type
== TRUE_DEP
&& e
->data_type
== REG_DEP
)
1552 if (e
->data_type
== MEM_DEP
)
1553 start
= MAX (start
, SCHED_TIME (v_node
) - ii
+ 1);
1556 fprintf (dump_file
, "the node is not scheduled\n");
1559 start
= MAX (start
, early_start
);
1560 end
= MIN (end
, MIN (early_start
+ ii
, late_start
+ 1));
1562 /* If there are more successors than predecessors schedule the
1563 node close to its successors. */
1564 if (count_succs
>= count_preds
)
1566 int old_start
= start
;
1569 end
= old_start
- 1;
1573 else /* psp is empty && pss is empty. */
1575 start
= SCHED_ASAP (u_node
);
/* An empty window (bounds crossed for the chosen direction) means
   failure -- the dump message below reports it.  */
1586 if ((start
>= end
&& step
== 1) || (start
<= end
&& step
== -1))
1589 fprintf (dump_file
, "\nEmpty window: start=%d, end=%d, step=%d\n",
1597 /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the
1598 node currently been scheduled. At the end of the calculation
1599 MUST_PRECEDE/MUST_FOLLOW contains all predecessors/successors of
1600 U_NODE which are (1) already scheduled in the first/last row of
1601 U_NODE's scheduling window, (2) whose dependence inequality with U
1602 becomes an equality when U is scheduled in this same row, and (3)
1603 whose dependence latency is zero.
1605 The first and last rows are calculated using the following parameters:
1606 START/END rows - The cycles that begins/ends the traversal on the window;
1607 searching for an empty cycle to schedule U_NODE.
1608 STEP - The direction in which we traverse the window.
1609 II - The initiation interval. */
/* NOTE(review): fragmentary extraction -- interior lines are missing
   (the embedded original line numbers jump); comments describe only the
   code visible here.  */
/* Fill MUST_PRECEDE/MUST_FOLLOW with the scheduled predecessors
   (resp. successors) of U_NODE that pin the first (resp. last) row of
   its scheduling window, per the long comment preceding this
   function.  */
1612 calculate_must_precede_follow (ddg_node_ptr u_node
, int start
, int end
,
1613 int step
, int ii
, sbitmap sched_nodes
,
1614 sbitmap must_precede
, sbitmap must_follow
)
1617 int first_cycle_in_window
, last_cycle_in_window
;
1619 gcc_assert (must_precede
&& must_follow
);
1621 /* Consider the following scheduling window:
1622 {first_cycle_in_window, first_cycle_in_window+1, ...,
1623 last_cycle_in_window}. If step is 1 then the following will be
1624 the order we traverse the window: {start=first_cycle_in_window,
1625 first_cycle_in_window+1, ..., end=last_cycle_in_window+1},
1626 or {start=last_cycle_in_window, last_cycle_in_window-1, ...,
1627 end=first_cycle_in_window-1} if step is -1. */
1628 first_cycle_in_window
= (step
== 1) ? start
: end
- step
;
1629 last_cycle_in_window
= (step
== 1) ? end
- step
: start
;
1631 sbitmap_zero (must_precede
)
;
1632 sbitmap_zero (must_follow
);
1635 fprintf (dump_file
, "\nmust_precede: ");
1637 /* Instead of checking if:
1638 (SMODULO (SCHED_TIME (e->src), ii) == first_row_in_window)
1639 && ((SCHED_TIME (e->src) + e->latency - (e->distance * ii)) ==
1640 first_cycle_in_window)
1642 we use the fact that latency is non-negative:
1643 SCHED_TIME (e->src) - (e->distance * ii) <=
1644 SCHED_TIME (e->src) + e->latency - (e->distance * ii)) <=
1645 first_cycle_in_window
1647 SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window */
/* Collect the scheduled predecessors that hit the window's first
   cycle.  */
1648 for (e
= u_node
->in
; e
!= 0; e
= e
->next_in
)
1649 if (TEST_BIT (sched_nodes
, e
->src
->cuid
)
1650 && ((SCHED_TIME (e
->src
) - (e
->distance
* ii
)) ==
1651 first_cycle_in_window
))
1654 fprintf (dump_file
, "%d ", e
->src
->cuid
);
1656 SET_BIT (must_precede
, e
->src
->cuid
);
1660 fprintf (dump_file
, "\nmust_follow: ");
1662 /* Instead of checking if:
1663 (SMODULO (SCHED_TIME (e->dest), ii) == last_row_in_window)
1664 && ((SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) ==
1665 last_cycle_in_window)
1667 we use the fact that latency is non-negative:
1668 SCHED_TIME (e->dest) + (e->distance * ii) >=
1669 SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) >=
1670 last_cycle_in_window
1672 SCHED_TIME (e->dest) + (e->distance * ii) == last_cycle_in_window */
/* Collect the scheduled successors that hit the window's last cycle.  */
1673 for (e
= u_node
->out
; e
!= 0; e
= e
->next_out
)
1674 if (TEST_BIT (sched_nodes
, e
->dest
->cuid
)
1675 && ((SCHED_TIME (e
->dest
) + (e
->distance
* ii
)) ==
1676 last_cycle_in_window
))
1679 fprintf (dump_file
, "%d ", e
->dest
->cuid
);
1681 SET_BIT (must_follow
, e
->dest
->cuid
);
1685 fprintf (dump_file
, "\n");
1688 /* Return 1 if U_NODE can be scheduled in CYCLE. Use the following
1689 parameters to decide if that's possible:
1690 PS - The partial schedule.
1691 U - The serial number of U_NODE.
1692 NUM_SPLITS - The number of row splits made so far.
1693 MUST_PRECEDE - The nodes that must precede U_NODE. (only valid at
1694 the first row of the scheduling window)
1695 MUST_FOLLOW - The nodes that must follow U_NODE. (only valid at the
1696 last row of the scheduling window) */
/* NOTE(review): fragmentary extraction -- interior lines are missing
   (e.g. original lines 1703-1706, 1710-1711), so the success check on
   PSI is not visible here; comments describe only visible code.  */
/* Try to place U_NODE (serial number U) in CYCLE of PS, honouring
   MUST_PRECEDE/MUST_FOLLOW; on the visible success path the node's
   schedule time is committed and it is marked scheduled.  */
1699 try_scheduling_node_in_cycle (partial_schedule_ptr ps
, ddg_node_ptr u_node
,
1700 int u
, int cycle
, sbitmap sched_nodes
,
1701 int *num_splits
, sbitmap must_precede
,
1702 sbitmap must_follow
)
1707 verify_partial_schedule (ps
, sched_nodes
);
/* Ask the partial schedule to insert the node, checking DFA resource
   conflicts and the precede/follow constraints.  */
1708 psi
= ps_add_node_check_conflicts (ps
, u_node
, cycle
,
1709 must_precede
, must_follow
);
/* Commit the cycle and mark U as scheduled (guard on PSI is among the
   missing lines -- TODO confirm against full source).  */
1712 SCHED_TIME (u_node
) = cycle
;
1713 SET_BIT (sched_nodes
, u
);
1717 fprintf (dump_file
, "Scheduled w/o split in %d\n", cycle
);
1724 /* This function implements the scheduling algorithm for SMS according to the
/* NOTE(review): fragmentary extraction -- interior lines are missing
   throughout (embedded original line numbers jump); comments describe
   only visible code.  */
/* List-schedule the DDG G in the precomputed NODES_ORDER, starting from
   MII and never exceeding MAXII; returns the resulting partial
   schedule.  Performs row-splitting backtracking, flushing and
   restarting after MAX_SPLIT_NUM failed splits.  */
1726 static partial_schedule_ptr
1727 sms_schedule_by_order (ddg_ptr g
, int mii
, int maxii
, int *nodes_order
)
1730 int i
, c
, success
, num_splits
= 0;
1731 int flush_and_start_over
= true;
1732 int num_nodes
= g
->num_nodes
;
1733 int start
, end
, step
; /* Place together into one struct? */
1734 sbitmap sched_nodes
= sbitmap_alloc (num_nodes
);
1735 sbitmap must_precede
= sbitmap_alloc (num_nodes
);
1736 sbitmap must_follow
= sbitmap_alloc (num_nodes
);
1737 sbitmap tobe_scheduled
= sbitmap_alloc (num_nodes
);
1739 partial_schedule_ptr ps
= create_partial_schedule (ii
, g
, DFA_HISTORY
);
1741 sbitmap_ones (tobe_scheduled
);
1742 sbitmap_zero (sched_nodes
);
/* Outer loop: retry with a fresh schedule (and, per the missing lines,
   presumably a larger II -- TODO confirm) while a flush was requested.  */
1744 while (flush_and_start_over
&& (ii
< maxii
))
1748 fprintf (dump_file
, "Starting with ii=%d\n", ii
);
1749 flush_and_start_over
= false;
1750 sbitmap_zero (sched_nodes
);
1752 for (i
= 0; i
< num_nodes
; i
++)
1754 int u
= nodes_order
[i
];
1755 ddg_node_ptr u_node
= &ps
->g
->nodes
[u
];
1756 rtx insn
= u_node
->insn
;
/* Debug insns and the closing branch are excluded from scheduling.  */
1758 if (!NONDEBUG_INSN_P (insn
))
1760 RESET_BIT (tobe_scheduled
, u
);
1764 if (JUMP_P (insn
)) /* Closing branch handled later. */
1766 RESET_BIT (tobe_scheduled
, u
);
1770 if (TEST_BIT (sched_nodes
, u
))
1773 /* Try to get non-empty scheduling window. */
1775 if (get_sched_window (ps
, nodes_order
, i
, sched_nodes
, ii
, &start
,
1779 fprintf (dump_file
, "\nTrying to schedule node %d \
1780 INSN = %d in (%d .. %d) step %d\n", u
, (INSN_UID
1781 (g
->nodes
[u
].insn
)), start
, end
, step
);
1783 gcc_assert ((step
> 0 && start
< end
)
1784 || (step
< 0 && start
> end
));
1786 calculate_must_precede_follow (u_node
, start
, end
, step
, ii
,
1787 sched_nodes
, must_precede
,
/* Walk the window; MUST_PRECEDE applies only at its first row and
   MUST_FOLLOW only at its last row.  */
1790 for (c
= start
; c
!= end
; c
+= step
)
1792 sbitmap tmp_precede
= NULL
;
1793 sbitmap tmp_follow
= NULL
;
1798 tmp_precede
= must_precede
;
1799 else /* step == -1. */
1800 tmp_follow
= must_follow
;
1802 if (c
== end
- step
)
1805 tmp_follow
= must_follow
;
1806 else /* step == -1. */
1807 tmp_precede
= must_precede
;
1811 try_scheduling_node_in_cycle (ps
, u_node
, u
, c
,
1813 &num_splits
, tmp_precede
,
1819 verify_partial_schedule (ps
, sched_nodes
);
/* Too many failed splits: flush the schedule and start over.  */
1828 if (num_splits
>= MAX_SPLIT_NUM
)
1831 flush_and_start_over
= true;
1832 verify_partial_schedule (ps
, sched_nodes
);
1833 reset_partial_schedule (ps
, ii
);
1834 verify_partial_schedule (ps
, sched_nodes
);
1839 /* The scheduling window is exclusive of 'end'
1840 whereas compute_split_row () expects an inclusive,
/* Otherwise insert an empty row at the computed split point and retry
   the same node.  */
1843 split_row
= compute_split_row (sched_nodes
, start
, end
- 1,
1846 split_row
= compute_split_row (sched_nodes
, end
+ 1, start
,
1849 ps_insert_empty_row (ps
, split_row
, sched_nodes
);
1850 i
--; /* Go back and retry node i. */
1853 fprintf (dump_file
, "num_splits=%d\n", num_splits
);
1856 /* ??? If (success), check register pressure estimates. */
1857 } /* Continue with next node. */
1858 } /* While flush_and_start_over. */
1861 free_partial_schedule (ps
);
1865 gcc_assert (sbitmap_equal (tobe_scheduled
, sched_nodes
));
1867 sbitmap_free (sched_nodes
);
1868 sbitmap_free (must_precede
);
1869 sbitmap_free (must_follow
);
1870 sbitmap_free (tobe_scheduled
);
1875 /* This function inserts a new empty row into PS at the position
1876 according to SPLITROW, keeping all already scheduled instructions
1877 intact and updating their SCHED_TIME and cycle accordingly. */
/* NOTE(review): fragmentary extraction -- interior lines (braces,
   dump_file guards) are missing; comments describe only visible code.  */
/* Insert a new empty row into PS before row SPLIT_ROW, growing II by
   one and updating SCHED_TIME/cycle/row/stage of every already
   scheduled insn accordingly (see header comment above).  */
1879 ps_insert_empty_row (partial_schedule_ptr ps
, int split_row
,
1880 sbitmap sched_nodes
)
1882 ps_insn_ptr crr_insn
;
1883 ps_insn_ptr
*rows_new
;
1885 int new_ii
= ii
+ 1;
1888 verify_partial_schedule (ps
, sched_nodes
);
1890 /* We normalize sched_time and rotate ps to have only non-negative sched
1891 times, for simplicity of updating cycles after inserting new row. */
1892 split_row
-= ps
->min_cycle
;
1893 split_row
= SMODULO (split_row
, ii
);
1895 fprintf (dump_file
, "split_row=%d\n", split_row
);
1897 normalize_sched_times (ps
);
1898 rotate_partial_schedule (ps
, ps
->min_cycle
);
/* Rows before the split keep their index; each insn's time grows by the
   number of wrap-arounds it made (SCHED_TIME / ii).  */
1900 rows_new
= (ps_insn_ptr
*) xcalloc (new_ii
, sizeof (ps_insn_ptr
));
1901 for (row
= 0; row
< split_row
; row
++)
1903 rows_new
[row
] = ps
->rows
[row
];
1904 ps
->rows
[row
] = NULL
;
1905 for (crr_insn
= rows_new
[row
];
1906 crr_insn
; crr_insn
= crr_insn
->next_in_row
)
1908 ddg_node_ptr u
= crr_insn
->node
;
1909 int new_time
= SCHED_TIME (u
) + (SCHED_TIME (u
) / ii
);
1911 SCHED_TIME (u
) = new_time
;
1912 crr_insn
->cycle
= new_time
;
1913 SCHED_ROW (u
) = new_time
% new_ii
;
1914 SCHED_STAGE (u
) = new_time
/ new_ii
;
/* The inserted row itself stays empty.  */
1919 rows_new
[split_row
] = NULL
;
/* Rows at or after the split shift down by one; their insns get an
   extra +1 on top of the wrap-around adjustment.  */
1921 for (row
= split_row
; row
< ii
; row
++)
1923 rows_new
[row
+ 1] = ps
->rows
[row
];
1924 ps
->rows
[row
] = NULL
;
1925 for (crr_insn
= rows_new
[row
+ 1];
1926 crr_insn
; crr_insn
= crr_insn
->next_in_row
)
1928 ddg_node_ptr u
= crr_insn
->node
;
1929 int new_time
= SCHED_TIME (u
) + (SCHED_TIME (u
) / ii
) + 1;
1931 SCHED_TIME (u
) = new_time
;
1932 crr_insn
->cycle
= new_time
;
1933 SCHED_ROW (u
) = new_time
% new_ii
;
1934 SCHED_STAGE (u
) = new_time
/ new_ii
;
/* Adjust the schedule's cycle bounds by the same rule.  */
1939 ps
->min_cycle
= ps
->min_cycle
+ ps
->min_cycle
/ ii
1940 + (SMODULO (ps
->min_cycle
, ii
) >= split_row
? 1 : 0);
1941 ps
->max_cycle
= ps
->max_cycle
+ ps
->max_cycle
/ ii
1942 + (SMODULO (ps
->max_cycle
, ii
) >= split_row
? 1 : 0);
1944 ps
->rows
= rows_new
;
1946 gcc_assert (ps
->min_cycle
>= 0);
1948 verify_partial_schedule (ps
, sched_nodes
);
1951 fprintf (dump_file
, "min_cycle=%d, max_cycle=%d\n", ps
->min_cycle
,
1955 /* Given U_NODE which is the node that failed to be scheduled; LOW and
1956 UP which are the boundaries of its scheduling window; compute using
1957 SCHED_NODES and II a row in the partial schedule that can be split
1958 which will separate a critical predecessor from a critical successor
1959 thereby expanding the window, and return it. */
/* NOTE(review): fragmentary extraction -- some interior lines are
   missing; comments describe only visible code.  */
/* Pick a row of the schedule to split, separating a critical scheduled
   predecessor (or successor) of U_NODE from its window [LOW, UP] -- see
   the header comment above.  */
1961 compute_split_row (sbitmap sched_nodes
, int low
, int up
, int ii
,
1962 ddg_node_ptr u_node
)
1965 int lower
= INT_MIN
, upper
= INT_MAX
;
1966 ddg_node_ptr crit_pred
= NULL
;
1967 ddg_node_ptr crit_succ
= NULL
;
/* Find the latest-scheduled predecessor whose dependence pins LOW.  */
1970 for (e
= u_node
->in
; e
!= 0; e
= e
->next_in
)
1972 ddg_node_ptr v_node
= e
->src
;
1974 if (TEST_BIT (sched_nodes
, v_node
->cuid
)
1975 && (low
== SCHED_TIME (v_node
) + e
->latency
- (e
->distance
* ii
)))
1976 if (SCHED_TIME (v_node
) > lower
)
1979 lower
= SCHED_TIME (v_node
);
/* Split just after the critical predecessor, modulo II.  */
1983 if (crit_pred
!= NULL
)
1985 crit_cycle
= SCHED_TIME (crit_pred
) + 1;
1986 return SMODULO (crit_cycle
, ii
);
/* Otherwise find the earliest-scheduled successor pinning UP.  */
1989 for (e
= u_node
->out
; e
!= 0; e
= e
->next_out
)
1991 ddg_node_ptr v_node
= e
->dest
;
1992 if (TEST_BIT (sched_nodes
, v_node
->cuid
)
1993 && (up
== SCHED_TIME (v_node
) - e
->latency
+ (e
->distance
* ii
)))
1994 if (SCHED_TIME (v_node
) < upper
)
1997 upper
= SCHED_TIME (v_node
);
2001 if (crit_succ
!= NULL
)
2003 crit_cycle
= SCHED_TIME (crit_succ
);
2004 return SMODULO (crit_cycle
, ii
);
/* Neither side is critical: split in the middle of the window.  */
2008 fprintf (dump_file
, "Both crit_pred and crit_succ are NULL\n");
2010 return SMODULO ((low
+ up
+ 1) / 2, ii
);
/* NOTE(review): fragmentary extraction -- some lines are missing;
   comments describe only visible code.  */
/* Consistency check: every insn placed in PS belongs to SCHED_NODES and
   its SCHED_TIME lies within [min_cycle, max_cycle].  */
2014 verify_partial_schedule (partial_schedule_ptr ps
, sbitmap sched_nodes
)
2017 ps_insn_ptr crr_insn
;
/* Walk every row and every insn chained in that row.  */
2019 for (row
= 0; row
< ps
->ii
; row
++)
2020 for (crr_insn
= ps
->rows
[row
]; crr_insn
; crr_insn
= crr_insn
->next_in_row
)
2022 ddg_node_ptr u
= crr_insn
->node
;
2024 gcc_assert (TEST_BIT (sched_nodes
, u
->cuid
));
2025 /* ??? Test also that all nodes of sched_nodes are in ps, perhaps by
2026 popcount (sched_nodes) == number of insns in ps. */
2027 gcc_assert (SCHED_TIME (u
) >= ps
->min_cycle
);
2028 gcc_assert (SCHED_TIME (u
) <= ps
->max_cycle
);
2033 /* This page implements the algorithm for ordering the nodes of a DDG
2034 for modulo scheduling, activated through the
2035 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
2037 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
2038 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
2039 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
2040 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
2041 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
2042 #define DEPTH(x) (ASAP ((x)))
2044 typedef struct node_order_params
* nopa
;
2046 static void order_nodes_of_sccs (ddg_all_sccs_ptr
, int * result
);
2047 static int order_nodes_in_scc (ddg_ptr
, sbitmap
, sbitmap
, int*, int);
2048 static nopa
calculate_order_params (ddg_ptr
, int, int *);
2049 static int find_max_asap (ddg_ptr
, sbitmap
);
2050 static int find_max_hv_min_mob (ddg_ptr
, sbitmap
);
2051 static int find_max_dv_min_mob (ddg_ptr
, sbitmap
);
2053 enum sms_direction
{BOTTOMUP
, TOPDOWN
};
2055 struct node_order_params
2062 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
/* NOTE(review): fragmentary extraction -- some lines (e.g. the SET_BIT
   marking U in TMP) are missing; comments describe only visible code.  */
/* Assert that NODE_ORDER is a permutation of 0 .. NUM_NODES-1 (see
   header comment above): each entry is in range and not repeated.  */
2064 check_nodes_order (int *node_order
, int num_nodes
)
2067 sbitmap tmp
= sbitmap_alloc (num_nodes
);
2072 fprintf (dump_file
, "SMS final nodes order: \n");
2074 for (i
= 0; i
< num_nodes
; i
++)
2076 int u
= node_order
[i
];
2079 fprintf (dump_file
, "%d ", u
);
/* TMP tracks already-seen nodes, so a repeat trips the assert.  */
2080 gcc_assert (u
< num_nodes
&& u
>= 0 && !TEST_BIT (tmp
, u
));
2086 fprintf (dump_file
, "\n");
2091 /* Order the nodes of G for scheduling and pass the result in
2092 NODE_ORDER. Also set aux.count of each node to ASAP.
2093 Put maximal ASAP to PMAX_ASAP. Return the recMII for the given DDG. */
/* NOTE(review): fragmentary extraction -- some lines are missing;
   comments describe only visible code.  */
/* Order the nodes of G for scheduling into NODE_ORDER, store max ASAP
   in PMAX_ASAP and (per the visible code) take rec_mii from the first
   SCC's recurrence_length -- see header comment above.  */
2095 sms_order_nodes (ddg_ptr g
, int mii
, int * node_order
, int *pmax_asap
)
2099 ddg_all_sccs_ptr sccs
= create_ddg_all_sccs (g
);
/* Compute ASAP/ALAP/HEIGHT ordering parameters for every node.  */
2101 nopa nops
= calculate_order_params (g
, mii
, pmax_asap
);
2104 print_sccs (dump_file
, sccs
, g
);
2106 order_nodes_of_sccs (sccs
, node_order
);
2108 if (sccs
->num_sccs
> 0)
2109 /* First SCC has the largest recurrence_length. */
2110 rec_mii
= sccs
->sccs
[0]->recurrence_length
;
2112 /* Save ASAP before destroying node_order_params. */
2113 for (i
= 0; i
< g
->num_nodes
; i
++)
2115 ddg_node_ptr v
= &g
->nodes
[i
];
2116 v
->aux
.count
= ASAP (v
);
2120 free_ddg_all_sccs (sccs
);
2121 check_nodes_order (node_order
, g
->num_nodes
);
/* NOTE(review): fragmentary extraction -- some lines are missing;
   comments describe only visible code.  */
/* Order all nodes of ALL_SCCS into NODE_ORDER: process SCCs from the
   one with highest recMII, extending each SCC with nodes on paths
   to/from previously handled SCCs, then order leftover nodes.  */
2127 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs
, int * node_order
)
2130 ddg_ptr g
= all_sccs
->ddg
;
2131 int num_nodes
= g
->num_nodes
;
2132 sbitmap prev_sccs
= sbitmap_alloc (num_nodes
);
2133 sbitmap on_path
= sbitmap_alloc (num_nodes
);
2134 sbitmap tmp
= sbitmap_alloc (num_nodes
);
2135 sbitmap ones
= sbitmap_alloc (num_nodes
);
2137 sbitmap_zero (prev_sccs
);
2138 sbitmap_ones (ones
);
2140 /* Perform the node ordering starting from the SCC with the highest recMII.
2141 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
2142 for (i
= 0; i
< all_sccs
->num_sccs
; i
++)
2144 ddg_scc_ptr scc
= all_sccs
->sccs
[i
];
2146 /* Add nodes on paths from previous SCCs to the current SCC. */
2147 find_nodes_on_paths (on_path
, g
, prev_sccs
, scc
->nodes
);
2148 sbitmap_a_or_b (tmp
, scc
->nodes
, on_path
);
2150 /* Add nodes on paths from the current SCC to previous SCCs. */
2151 find_nodes_on_paths (on_path
, g
, scc
->nodes
, prev_sccs
);
2152 sbitmap_a_or_b (tmp
, tmp
, on_path
);
2154 /* Remove nodes of previous SCCs from current extended SCC. */
2155 sbitmap_difference (tmp
, tmp
, prev_sccs
);
2157 pos
= order_nodes_in_scc (g
, prev_sccs
, tmp
, node_order
, pos
);
2158 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
2161 /* Handle the remaining nodes that do not belong to any scc. Each call
2162 to order_nodes_in_scc handles a single connected component. */
2163 while (pos
< g
->num_nodes
)
2165 sbitmap_difference (tmp
, ones
, prev_sccs
);
2166 pos
= order_nodes_in_scc (g
, prev_sccs
, tmp
, node_order
, pos
);
2168 sbitmap_free (prev_sccs
);
2169 sbitmap_free (on_path
);
2171 sbitmap_free (ones
);
2174 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
/* NOTE(review): fragmentary extraction -- some lines (braces, a few
   declarations) are missing; comments describe only visible code.  */
/* Compute ASAP, ALAP and HEIGHT for every node of G (treated as a DAG
   by ignoring distance > 0 edges), store the maximal ASAP in PMAX_ASAP,
   and return the freshly allocated parameter array.  */
2175 static struct node_order_params
*
2176 calculate_order_params (ddg_ptr g
, int mii ATTRIBUTE_UNUSED
, int *pmax_asap
)
2180 int num_nodes
= g
->num_nodes
;
2182 /* Allocate a place to hold ordering params for each node in the DDG. */
2183 nopa node_order_params_arr
;
2185 /* Initialize ASAP/ALAP/HEIGHT to zero. */
2186 node_order_params_arr
= (nopa
) xcalloc (num_nodes
,
2187 sizeof (struct node_order_params
));
2189 /* Set the aux pointer of each node to point to its order_params structure. */
2190 for (u
= 0; u
< num_nodes
; u
++)
2191 g
->nodes
[u
].aux
.info
= &node_order_params_arr
[u
];
2193 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
2194 calculate ASAP, ALAP, mobility, distance, and height for each node
2195 in the dependence (direct acyclic) graph. */
2197 /* We assume that the nodes in the array are in topological order. */
/* Forward pass: ASAP from zero-distance incoming edges.  */
2200 for (u
= 0; u
< num_nodes
; u
++)
2202 ddg_node_ptr u_node
= &g
->nodes
[u
];
2205 for (e
= u_node
->in
; e
; e
= e
->next_in
)
2206 if (e
->distance
== 0)
2207 ASAP (u_node
) = MAX (ASAP (u_node
),
2208 ASAP (e
->src
) + e
->latency
);
2209 max_asap
= MAX (max_asap
, ASAP (u_node
));
/* Backward pass: ALAP and HEIGHT from zero-distance outgoing edges.  */
2212 for (u
= num_nodes
- 1; u
> -1; u
--)
2214 ddg_node_ptr u_node
= &g
->nodes
[u
];
2216 ALAP (u_node
) = max_asap
;
2217 HEIGHT (u_node
) = 0;
2218 for (e
= u_node
->out
; e
; e
= e
->next_out
)
2219 if (e
->distance
== 0)
2221 ALAP (u_node
) = MIN (ALAP (u_node
),
2222 ALAP (e
->dest
) - e
->latency
);
2223 HEIGHT (u_node
) = MAX (HEIGHT (u_node
),
2224 HEIGHT (e
->dest
) + e
->latency
);
2229 fprintf (dump_file
, "\nOrder params\n");
2230 for (u
= 0; u
< num_nodes
; u
++)
2232 ddg_node_ptr u_node
= &g
->nodes
[u
];
2234 fprintf (dump_file
, "node %d, ASAP: %d, ALAP: %d, HEIGHT: %d\n", u
,
2235 ASAP (u_node
), ALAP (u_node
), HEIGHT (u_node
));
2239 *pmax_asap
= max_asap
;
2240 return node_order_params_arr
;
/* NOTE(review): fragmentary extraction -- declarations of the result
   variables are among the missing lines; comments describe only visible
   code.  */
/* Return the node in NODES with the maximal ASAP value.  */
2244 find_max_asap (ddg_ptr g
, sbitmap nodes
)
2249 sbitmap_iterator sbi
;
2251 EXECUTE_IF_SET_IN_SBITMAP (nodes
, 0, u
, sbi
)
2253 ddg_node_ptr u_node
= &g
->nodes
[u
];
2255 if (max_asap
< ASAP (u_node
))
2257 max_asap
= ASAP (u_node
);
/* NOTE(review): fragmentary extraction -- some lines are missing;
   comments describe only visible code.  */
/* Among NODES, find the node with maximal HEIGHT, breaking ties by
   minimal mobility (MOB = ALAP - ASAP).  */
2265 find_max_hv_min_mob (ddg_ptr g
, sbitmap nodes
)
2269 int min_mob
= INT_MAX
;
2271 sbitmap_iterator sbi
;
2273 EXECUTE_IF_SET_IN_SBITMAP (nodes
, 0, u
, sbi
)
2275 ddg_node_ptr u_node
= &g
->nodes
[u
];
/* Strictly taller node wins outright.  */
2277 if (max_hv
< HEIGHT (u_node
))
2279 max_hv
= HEIGHT (u_node
);
2280 min_mob
= MOB (u_node
);
/* Equal height: prefer the less mobile node.  */
2283 else if ((max_hv
== HEIGHT (u_node
))
2284 && (min_mob
> MOB (u_node
)))
2286 min_mob
= MOB (u_node
);
/* NOTE(review): fragmentary extraction -- some lines are missing;
   comments describe only visible code.  */
/* Among NODES, find the node with maximal DEPTH (== ASAP), breaking
   ties by minimal mobility (MOB = ALAP - ASAP).  */
2294 find_max_dv_min_mob (ddg_ptr g
, sbitmap nodes
)
2298 int min_mob
= INT_MAX
;
2300 sbitmap_iterator sbi
;
2302 EXECUTE_IF_SET_IN_SBITMAP (nodes
, 0, u
, sbi
)
2304 ddg_node_ptr u_node
= &g
->nodes
[u
];
/* Strictly deeper node wins outright.  */
2306 if (max_dv
< DEPTH (u_node
))
2308 max_dv
= DEPTH (u_node
);
2309 min_mob
= MOB (u_node
);
/* Equal depth: prefer the less mobile node.  */
2312 else if ((max_dv
== DEPTH (u_node
))
2313 && (min_mob
> MOB (u_node
)))
2315 min_mob
= MOB (u_node
);
2322 /* Places the nodes of SCC into the NODE_ORDER array starting
2323 at position POS, according to the SMS ordering algorithm.
2324 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
2325 the NODE_ORDER array, starting from position zero. */
/* NOTE(review): fragmentary extraction -- the direction-switching logic
   and several braces are among the missing lines; comments describe
   only visible code.  */
/* Place the nodes of SCC into NODE_ORDER starting at POS, alternating
   bottom-up/top-down sweeps driven by max-height/max-depth with minimal
   mobility as tie-break; updates NODES_ORDERED and returns the new
   position (see header comment above).  */
2327 order_nodes_in_scc (ddg_ptr g
, sbitmap nodes_ordered
, sbitmap scc
,
2328 int * node_order
, int pos
)
2330 enum sms_direction dir
;
2331 int num_nodes
= g
->num_nodes
;
2332 sbitmap workset
= sbitmap_alloc (num_nodes
);
2333 sbitmap tmp
= sbitmap_alloc (num_nodes
);
2334 sbitmap zero_bitmap
= sbitmap_alloc (num_nodes
);
2335 sbitmap predecessors
= sbitmap_alloc (num_nodes
);
2336 sbitmap successors
= sbitmap_alloc (num_nodes
);
2338 sbitmap_zero (predecessors
);
2339 find_predecessors (predecessors
, g
, nodes_ordered
);
2341 sbitmap_zero (successors
);
2342 find_successors (successors
, g
, nodes_ordered
);
/* Seed the work set from already-ordered nodes' predecessors or
   successors intersected with the SCC, falling back to the max-ASAP
   node when both intersections are empty.  */
2345 if (sbitmap_a_and_b_cg (tmp
, predecessors
, scc
))
2347 sbitmap_copy (workset
, tmp
);
2350 else if (sbitmap_a_and_b_cg (tmp
, successors
, scc
))
2352 sbitmap_copy (workset
, tmp
);
2359 sbitmap_zero (workset
);
2360 if ((u
= find_max_asap (g
, scc
)) >= 0)
2361 SET_BIT (workset
, u
);
2365 sbitmap_zero (zero_bitmap
);
2366 while (!sbitmap_equal (workset
, zero_bitmap
))
2369 ddg_node_ptr v_node
;
2370 sbitmap v_node_preds
;
2371 sbitmap v_node_succs
;
/* Bottom-up sweep (visible branch): repeatedly take the max-height /
   min-mobility node, emit it, and feed its in-SCC successors.  */
2375 while (!sbitmap_equal (workset
, zero_bitmap
))
2377 v
= find_max_hv_min_mob (g
, workset
);
2378 v_node
= &g
->nodes
[v
];
2379 node_order
[pos
++] = v
;
2380 v_node_succs
= NODE_SUCCESSORS (v_node
);
2381 sbitmap_a_and_b (tmp
, v_node_succs
, scc
);
2383 /* Don't consider the already ordered successors again. */
2384 sbitmap_difference (tmp
, tmp
, nodes_ordered
);
2385 sbitmap_a_or_b (workset
, workset
, tmp
);
2386 RESET_BIT (workset
, v
);
2387 SET_BIT (nodes_ordered
, v
);
/* Switch direction: refill the work set with in-SCC predecessors of
   everything ordered so far.  */
2390 sbitmap_zero (predecessors
);
2391 find_predecessors (predecessors
, g
, nodes_ordered
);
2392 sbitmap_a_and_b (workset
, predecessors
, scc
);
/* Top-down sweep: max-depth / min-mobility node, feeding in-SCC
   predecessors.  */
2396 while (!sbitmap_equal (workset
, zero_bitmap
))
2398 v
= find_max_dv_min_mob (g
, workset
);
2399 v_node
= &g
->nodes
[v
];
2400 node_order
[pos
++] = v
;
2401 v_node_preds
= NODE_PREDECESSORS (v_node
);
2402 sbitmap_a_and_b (tmp
, v_node_preds
, scc
);
2404 /* Don't consider the already ordered predecessors again. */
2405 sbitmap_difference (tmp
, tmp
, nodes_ordered
);
2406 sbitmap_a_or_b (workset
, workset
, tmp
);
2407 RESET_BIT (workset
, v
);
2408 SET_BIT (nodes_ordered
, v
);
2411 sbitmap_zero (successors
);
2412 find_successors (successors
, g
, nodes_ordered
);
2413 sbitmap_a_and_b (workset
, successors
, scc
);
2417 sbitmap_free (workset
);
2418 sbitmap_free (zero_bitmap
);
2419 sbitmap_free (predecessors
);
2420 sbitmap_free (successors
);
2425 /* This page contains functions for manipulating partial-schedules during
2426 modulo scheduling. */
2428 /* Create a partial schedule and allocate a memory to hold II rows. */
2430 static partial_schedule_ptr
2431 create_partial_schedule (int ii
, ddg_ptr g
, int history
)
2433 partial_schedule_ptr ps
= XNEW (struct partial_schedule
);
2434 ps
->rows
= (ps_insn_ptr
*) xcalloc (ii
, sizeof (ps_insn_ptr
));
2436 ps
->history
= history
;
2437 ps
->min_cycle
= INT_MAX
;
2438 ps
->max_cycle
= INT_MIN
;
2444 /* Free the PS_INSNs in rows array of the given partial schedule.
2445 ??? Consider caching the PS_INSN's. */
2447 free_ps_insns (partial_schedule_ptr ps
)
2451 for (i
= 0; i
< ps
->ii
; i
++)
2455 ps_insn_ptr ps_insn
= ps
->rows
[i
]->next_in_row
;
2458 ps
->rows
[i
] = ps_insn
;
2464 /* Free all the memory allocated to the partial schedule. */
2467 free_partial_schedule (partial_schedule_ptr ps
)
2476 /* Clear the rows array with its PS_INSNs, and create a new one with
2480 reset_partial_schedule (partial_schedule_ptr ps
, int new_ii
)
2485 if (new_ii
== ps
->ii
)
2487 ps
->rows
= (ps_insn_ptr
*) xrealloc (ps
->rows
, new_ii
2488 * sizeof (ps_insn_ptr
));
2489 memset (ps
->rows
, 0, new_ii
* sizeof (ps_insn_ptr
));
2491 ps
->min_cycle
= INT_MAX
;
2492 ps
->max_cycle
= INT_MIN
;
2495 /* Prints the partial schedule as an ii rows array, for each rows
2496 print the ids of the insns in it. */
2498 print_partial_schedule (partial_schedule_ptr ps
, FILE *dump
)
2502 for (i
= 0; i
< ps
->ii
; i
++)
2504 ps_insn_ptr ps_i
= ps
->rows
[i
];
2506 fprintf (dump
, "\n[ROW %d ]: ", i
);
2509 fprintf (dump
, "%d, ",
2510 INSN_UID (ps_i
->node
->insn
));
2511 ps_i
= ps_i
->next_in_row
;
2516 /* Creates an object of PS_INSN and initializes it to the given parameters. */
2518 create_ps_insn (ddg_node_ptr node
, int rest_count
, int cycle
)
2520 ps_insn_ptr ps_i
= XNEW (struct ps_insn
);
2523 ps_i
->next_in_row
= NULL
;
2524 ps_i
->prev_in_row
= NULL
;
2525 ps_i
->row_rest_count
= rest_count
;
2526 ps_i
->cycle
= cycle
;
2532 /* Removes the given PS_INSN from the partial schedule. Returns false if the
2533 node is not found in the partial schedule, else returns true. */
2535 remove_node_from_ps (partial_schedule_ptr ps
, ps_insn_ptr ps_i
)
2542 row
= SMODULO (ps_i
->cycle
, ps
->ii
);
2543 if (! ps_i
->prev_in_row
)
2545 if (ps_i
!= ps
->rows
[row
])
2548 ps
->rows
[row
] = ps_i
->next_in_row
;
2550 ps
->rows
[row
]->prev_in_row
= NULL
;
2554 ps_i
->prev_in_row
->next_in_row
= ps_i
->next_in_row
;
2555 if (ps_i
->next_in_row
)
2556 ps_i
->next_in_row
->prev_in_row
= ps_i
->prev_in_row
;
2562 /* Unlike what literature describes for modulo scheduling (which focuses
2563 on VLIW machines) the order of the instructions inside a cycle is
2564 important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2565 where the current instruction should go relative to the already
2566 scheduled instructions in the given cycle. Go over these
2567 instructions and find the first possible column to put it in. */
2569 ps_insn_find_column (partial_schedule_ptr ps
, ps_insn_ptr ps_i
,
2570 sbitmap must_precede
, sbitmap must_follow
)
2572 ps_insn_ptr next_ps_i
;
2573 ps_insn_ptr first_must_follow
= NULL
;
2574 ps_insn_ptr last_must_precede
= NULL
;
2580 row
= SMODULO (ps_i
->cycle
, ps
->ii
);
2582 /* Find the first must follow and the last must precede
2583 and insert the node immediately after the must precede
2584 but make sure that it there is no must follow after it. */
2585 for (next_ps_i
= ps
->rows
[row
];
2587 next_ps_i
= next_ps_i
->next_in_row
)
2589 if (must_follow
&& TEST_BIT (must_follow
, next_ps_i
->node
->cuid
)
2590 && ! first_must_follow
)
2591 first_must_follow
= next_ps_i
;
2592 if (must_precede
&& TEST_BIT (must_precede
, next_ps_i
->node
->cuid
))
2594 /* If we have already met a node that must follow, then
2595 there is no possible column. */
2596 if (first_must_follow
)
2599 last_must_precede
= next_ps_i
;
2603 /* Now insert the node after INSERT_AFTER_PSI. */
2605 if (! last_must_precede
)
2607 ps_i
->next_in_row
= ps
->rows
[row
];
2608 ps_i
->prev_in_row
= NULL
;
2609 if (ps_i
->next_in_row
)
2610 ps_i
->next_in_row
->prev_in_row
= ps_i
;
2611 ps
->rows
[row
] = ps_i
;
2615 ps_i
->next_in_row
= last_must_precede
->next_in_row
;
2616 last_must_precede
->next_in_row
= ps_i
;
2617 ps_i
->prev_in_row
= last_must_precede
;
2618 if (ps_i
->next_in_row
)
2619 ps_i
->next_in_row
->prev_in_row
= ps_i
;
2625 /* Advances the PS_INSN one column in its current row; returns false
2626 in failure and true in success. Bit N is set in MUST_FOLLOW if
2627 the node with cuid N must be come after the node pointed to by
2628 PS_I when scheduled in the same cycle. */
2630 ps_insn_advance_column (partial_schedule_ptr ps
, ps_insn_ptr ps_i
,
2631 sbitmap must_follow
)
2633 ps_insn_ptr prev
, next
;
2635 ddg_node_ptr next_node
;
2640 row
= SMODULO (ps_i
->cycle
, ps
->ii
);
2642 if (! ps_i
->next_in_row
)
2645 next_node
= ps_i
->next_in_row
->node
;
2647 /* Check if next_in_row is dependent on ps_i, both having same sched
2648 times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
2649 if (must_follow
&& TEST_BIT (must_follow
, next_node
->cuid
))
2652 /* Advance PS_I over its next_in_row in the doubly linked list. */
2653 prev
= ps_i
->prev_in_row
;
2654 next
= ps_i
->next_in_row
;
2656 if (ps_i
== ps
->rows
[row
])
2657 ps
->rows
[row
] = next
;
2659 ps_i
->next_in_row
= next
->next_in_row
;
2661 if (next
->next_in_row
)
2662 next
->next_in_row
->prev_in_row
= ps_i
;
2664 next
->next_in_row
= ps_i
;
2665 ps_i
->prev_in_row
= next
;
2667 next
->prev_in_row
= prev
;
2669 prev
->next_in_row
= next
;
2674 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2675 Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
2676 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
2677 before/after (respectively) the node pointed to by PS_I when scheduled
2678 in the same cycle. */
2680 add_node_to_ps (partial_schedule_ptr ps
, ddg_node_ptr node
, int cycle
,
2681 sbitmap must_precede
, sbitmap must_follow
)
2685 int row
= SMODULO (cycle
, ps
->ii
);
2688 && ps
->rows
[row
]->row_rest_count
>= issue_rate
)
2692 rest_count
+= ps
->rows
[row
]->row_rest_count
;
2694 ps_i
= create_ps_insn (node
, rest_count
, cycle
);
2696 /* Finds and inserts PS_I according to MUST_FOLLOW and
2698 if (! ps_insn_find_column (ps
, ps_i
, must_precede
, must_follow
))
2707 /* Advance time one cycle. Assumes DFA is being used. */
2709 advance_one_cycle (void)
2711 if (targetm
.sched
.dfa_pre_cycle_insn
)
2712 state_transition (curr_state
,
2713 targetm
.sched
.dfa_pre_cycle_insn ());
2715 state_transition (curr_state
, NULL
);
2717 if (targetm
.sched
.dfa_post_cycle_insn
)
2718 state_transition (curr_state
,
2719 targetm
.sched
.dfa_post_cycle_insn ());
2724 /* Checks if PS has resource conflicts according to DFA, starting from
2725 FROM cycle to TO cycle; returns true if there are conflicts and false
2726 if there are no conflicts. Assumes DFA is being used. */
2728 ps_has_conflicts (partial_schedule_ptr ps
, int from
, int to
)
2732 state_reset (curr_state
);
2734 for (cycle
= from
; cycle
<= to
; cycle
++)
2736 ps_insn_ptr crr_insn
;
2737 /* Holds the remaining issue slots in the current row. */
2738 int can_issue_more
= issue_rate
;
2740 /* Walk through the DFA for the current row. */
2741 for (crr_insn
= ps
->rows
[SMODULO (cycle
, ps
->ii
)];
2743 crr_insn
= crr_insn
->next_in_row
)
2745 rtx insn
= crr_insn
->node
->insn
;
2747 if (!NONDEBUG_INSN_P (insn
))
2750 /* Check if there is room for the current insn. */
2751 if (!can_issue_more
|| state_dead_lock_p (curr_state
))
2754 /* Update the DFA state and return with failure if the DFA found
2755 resource conflicts. */
2756 if (state_transition (curr_state
, insn
) >= 0)
2759 if (targetm
.sched
.variable_issue
)
2761 targetm
.sched
.variable_issue (sched_dump
, sched_verbose
,
2762 insn
, can_issue_more
);
2763 /* A naked CLOBBER or USE generates no instruction, so don't
2764 let them consume issue slots. */
2765 else if (GET_CODE (PATTERN (insn
)) != USE
2766 && GET_CODE (PATTERN (insn
)) != CLOBBER
)
2770 /* Advance the DFA to the next cycle. */
2771 advance_one_cycle ();
2776 /* Checks if the given node causes resource conflicts when added to PS at
2777 cycle C. If not the node is added to PS and returned; otherwise zero
2778 is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2779 cuid N must be come before/after (respectively) the node pointed to by
2780 PS_I when scheduled in the same cycle. */
2782 ps_add_node_check_conflicts (partial_schedule_ptr ps
, ddg_node_ptr n
,
2783 int c
, sbitmap must_precede
,
2784 sbitmap must_follow
)
2786 int has_conflicts
= 0;
2789 /* First add the node to the PS, if this succeeds check for
2790 conflicts, trying different issue slots in the same row. */
2791 if (! (ps_i
= add_node_to_ps (ps
, n
, c
, must_precede
, must_follow
)))
2792 return NULL
; /* Failed to insert the node at the given cycle. */
2794 has_conflicts
= ps_has_conflicts (ps
, c
, c
)
2796 && ps_has_conflicts (ps
,
2800 /* Try different issue slots to find one that the given node can be
2801 scheduled in without conflicts. */
2802 while (has_conflicts
)
2804 if (! ps_insn_advance_column (ps
, ps_i
, must_follow
))
2806 has_conflicts
= ps_has_conflicts (ps
, c
, c
)
2808 && ps_has_conflicts (ps
,
2815 remove_node_from_ps (ps
, ps_i
);
2819 ps
->min_cycle
= MIN (ps
->min_cycle
, c
);
2820 ps
->max_cycle
= MAX (ps
->max_cycle
, c
);
2824 /* Rotate the rows of PS such that insns scheduled at time
2825 START_CYCLE will appear in row 0. Updates max/min_cycles. */
2827 rotate_partial_schedule (partial_schedule_ptr ps
, int start_cycle
)
2829 int i
, row
, backward_rotates
;
2830 int last_row
= ps
->ii
- 1;
2832 if (start_cycle
== 0)
2835 backward_rotates
= SMODULO (start_cycle
, ps
->ii
);
2837 /* Revisit later and optimize this into a single loop. */
2838 for (i
= 0; i
< backward_rotates
; i
++)
2840 ps_insn_ptr first_row
= ps
->rows
[0];
2842 for (row
= 0; row
< last_row
; row
++)
2843 ps
->rows
[row
] = ps
->rows
[row
+1];
2845 ps
->rows
[last_row
] = first_row
;
2848 ps
->max_cycle
-= start_cycle
;
2849 ps
->min_cycle
-= start_cycle
;
2852 #endif /* INSN_SCHEDULING */
2855 gate_handle_sms (void)
2857 return (optimize
> 0 && flag_modulo_sched
);
/* Run instruction scheduler.  */
/* Perform SMS module scheduling.  */
static unsigned int
rest_of_handle_sms (void)
{
#ifdef INSN_SCHEDULING
  basic_block bb;

  /* Collect loop information to be used in SMS.  */
  cfg_layout_initialize (0);
  /* NOTE(review): the actual scheduling entry call was lost in this
     region -- restored as sms_schedule (); confirm against upstream.  */
  sms_schedule ();

  /* Update the life information, because we add pseudos.  */
  max_regno = max_reg_num ();

  /* Finalize layout changes.  */
  FOR_EACH_BB (bb)
    if (bb->next_bb != EXIT_BLOCK_PTR)
      bb->aux = bb->next_bb;
  free_dominance_info (CDI_DOMINATORS);
  cfg_layout_finalize ();
#endif /* INSN_SCHEDULING */
  return 0;
}
2886 struct rtl_opt_pass pass_sms
=
2891 gate_handle_sms
, /* gate */
2892 rest_of_handle_sms
, /* execute */
2895 0, /* static_pass_number */
2897 0, /* properties_required */
2898 0, /* properties_provided */
2899 0, /* properties_destroyed */
2900 TODO_dump_func
, /* todo_flags_start */
2901 TODO_df_finish
| TODO_verify_rtl_sharing
|
2903 TODO_ggc_collect
/* todo_flags_finish */