arch/alpha/lib/ev6-copy_page.S

   1 /*
   2  * arch/alpha/lib/ev6-copy_page.S
   3  *
   4  * Copy an entire page.
   5  */
   6
   7 /* The following comparison of this routine vs the normal copy_page.S
   8    was written by an unnamed ev6 hardware designer and forwarded to me
   9    via Steven Hobbs <hobbs@steven.zko.dec.com>.
  10
  11    First Problem: STQ overflows.
  12    -----------------------------
  13
  14         It would be nice if EV6 handled every resource overflow efficiently,
  15         but for some it doesn't, including store queue overflows.  Such an
  16         overflow causes a trap and a restart of the pipe.
  17
  18         To get around this we sometimes use (to borrow a term from a VSSAD
  19         researcher) "aeration".  The idea is to slow the rate at which the
  20         processor receives valid instructions by inserting nops in the fetch
  21         path.  In doing so, you can prevent the overflow and actually make
  22         the code run faster.  You can, of course, take advantage of the fact
  23         that the processor can fetch at most 4 aligned instructions per cycle.
  24
  25         I inserted enough nops to force it to take 10 cycles to fetch the
  26         loop code.  In theory, EV6 should be able to execute this loop in
  27         9 cycles but I was not able to get it to run that fast -- the initial
  28         conditions were such that I could not reach this optimum rate on
  29         (chaotic) EV6.  I wrote the code such that everything would issue
  30         in order.
  31
  32    Second Problem: Dcache index matches.
  33    -------------------------------------
  34
  35         If you are going to use this routine on random aligned pages, there
  36         is a 25% chance that the pages will be at the same dcache indices.
  37         Without care, this results in many nasty memory traps.
  38
  39         The solution is to schedule the prefetches to avoid the memory
  40         conflicts.  I schedule the wh64 prefetches farther ahead of the
  41         read prefetches to avoid this problem.
  42
  43    Third Problem: Needs more prefetching.
  44    --------------------------------------
  45
  46         In order to improve the code I added deeper prefetching to take the
  47         most advantage of EV6's bandwidth.
  48
  49         I also prefetched the read stream. Note that adding the read prefetch
  50         forced me to add another cycle to the inner-most kernel - up to 11
  51         from the original 8 cycles per iteration.  We could improve performance
  52         further by unrolling the loop and doing multiple prefetches per cycle.
  53
  54    I think that the code below will be very robust and fast code for the
  55    purposes of copying aligned pages.  It is slower when both source and
  56    destination pages are in the dcache, but it is my guess that this is
  57    less important than the dcache miss case.  */
  58
  59
  60         .text
  61         .align 4
  62         .global copy_page
  63         .ent copy_page
  64 copy_page:
  65         .prologue 0
  66
             /*
              * copy_page: copy 128 cache lines of 64 bytes each (8192 bytes
              * in total) from the memory addressed by $17 to the memory
              * addressed by $16.
              *
              * Register usage, as seen in the code below:
              *   $16     destination pointer (base of every stq; first wh64)
              *   $17     source pointer (base of every ldq and ldl)
              *   $18     iteration counter: 118 for the main loop, then 10
              *           for the cleanup loop (118 + 10 = 128 lines)
              *   $19     write-hint pointer, kept 10 lines ahead of $16
              *   $0-$7   the eight quadwords of the line being copied
              *   $1, $2  additionally used as address scratch during setup
              *
              * All of the registers above are overwritten; nothing is saved
              * or restored and the stack is not touched.
              *
              * The instructions are laid out in groups of four separated by
              * blank lines, padded with nop/unop.  The padding is the
              * "aeration" described in the header comment, so instruction
              * order and filler placement should be treated as part of the
              * tuning and not rearranged casually.
              */
  67         /* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
  68         wh64    ($16)                   /* write-hint dest line 0 */
  69         ldl     $31,0($17)              /* ldl into $31 = read prefetch, src line 0 */
  70         ldl     $31,64($17)             /* read-prefetch src line 1 */
  71         lda     $1,1*64($16)            /* $1 = address of dest line 1 */
  72
  73         wh64    ($1)                    /* write-hint dest line 1 */
  74         ldl     $31,128($17)            /* read-prefetch src line 2 */
  75         ldl     $31,192($17)            /* read-prefetch src line 3 */
  76         lda     $1,2*64($16)            /* $1 = address of dest line 2 */
  77
  78         wh64    ($1)                    /* write-hint dest line 2 */
  79         ldl     $31,256($17)            /* read-prefetch src line 4 (fifth and last here) */
  80         lda     $18,118                 /* main loop count: 128 lines minus the 10 cleanup lines */
  81         lda     $1,3*64($16)            /* $1 = address of dest line 3 */
  82
  83         wh64    ($1)                    /* write-hint dest line 3 */
  84         nop
  85         lda     $1,4*64($16)            /* $1, $2 = addresses of dest lines 4 and 5 */
  86         lda     $2,5*64($16)
  87
  88         wh64    ($1)                    /* write-hint dest lines 4 and 5 */
  89         wh64    ($2)
  90         lda     $1,6*64($16)            /* $1, $2 = addresses of dest lines 6 and 7 */
  91         lda     $2,7*64($16)
  92
  93         wh64    ($1)                    /* write-hint dest lines 6 and 7 */
  94         wh64    ($2)
  95         lda     $1,8*64($16)            /* $1, $2 = addresses of dest lines 8 and 9 */
  96         lda     $2,9*64($16)
  97
  98         wh64    ($1)                    /* write-hint dest lines 8 and 9 */
  99         wh64    ($2)
 100         lda     $19,10*64($16)          /* $19 = first dest line not yet write-hinted */
 101         nop
 102
 103         /* Main prefetching/write-hinting loop.  */
             /* Each iteration copies one 64-byte line: eight ldq from $17,
                one read prefetch 5 lines ahead, one write hint 10 lines
                ahead, eight stq to $16, then all three pointers advance by
                64.  The body is eleven groups of four instructions, which
                appears to correspond to the 11 cycles per iteration quoted
                in the header comment.  */
 104 1:      ldq     $0,0($17)               /* quadwords 0-1 of the source line */
 105         ldq     $1,8($17)
 106         unop
 107         unop
 108
 109         unop
 110         unop
 111         ldq     $2,16($17)              /* quadwords 2-3 */
 112         ldq     $3,24($17)
 113
 114         ldq     $4,32($17)              /* quadwords 4-5 */
 115         ldq     $5,40($17)
 116         unop
 117         unop
 118
 119         unop
 120         unop
 121         ldq     $6,48($17)              /* quadwords 6-7 */
 122         ldq     $7,56($17)
 123
 124         ldl     $31,320($17)            /* read-prefetch the line 5 ahead (320 = 5*64) */
 125         unop
 126         unop
 127         unop
 128
 129         /* This gives the extra cycle of aeration above the minimum.  */
 130         unop
 131         unop
 132         unop
 133         unop
 134
 135         wh64    ($19)                   /* write-hint the dest line 10 ahead of $16 */
 136         unop
 137         unop
 138         unop
 139
 140         stq     $0,0($16)               /* store quadwords 0-1 */
 141         subq    $18,1,$18               /* one fewer line left for this loop */
 142         stq     $1,8($16)
 143         unop
 144
 145         unop
 146         stq     $2,16($16)              /* store quadwords 2-3 */
 147         addq    $17,64,$17              /* advance source to the next line */
 148         stq     $3,24($16)
 149
 150         stq     $4,32($16)              /* store quadwords 4-5 */
 151         stq     $5,40($16)
 152         addq    $19,64,$19              /* advance write-hint pointer, staying 10 lines ahead */
 153         unop
 154
 155         stq     $6,48($16)              /* store quadwords 6-7 */
 156         stq     $7,56($16)
 157         addq    $16,64,$16              /* advance destination to the next line */
 158         bne     $18, 1b                 /* repeat until the 118 lines are done */
 159
 160         /* Prefetch the final 5 cache lines of the read stream.  */
             /* $17 now points at line 118.  The loop above prefetched up to
                line 122 (5 ahead of its last iteration), so offsets 320-576
                from $17 cover the remaining lines 123-127.  */
 161         lda     $18,10                  /* cleanup loop count */
 162         ldl     $31,320($17)            /* read-prefetch src line 123 */
 163         ldl     $31,384($17)            /* read-prefetch src line 124 */
 164         ldl     $31,448($17)            /* read-prefetch src line 125 */
 165
 166         ldl     $31,512($17)            /* read-prefetch src line 126 */
 167         ldl     $31,576($17)            /* read-prefetch src line 127 */
 168         nop
 169         nop
 170
 171         /* Non-prefetching, non-write-hinting cleanup loop for the
 172            final 10 cache lines.  */
             /* No further hints are issued here: the main loop's wh64 already
                reached dest line 127 and every remaining source line has been
                prefetched above.  There is also no nop/unop filler in this
                loop, just five groups of four.  */
 173 2:      ldq     $0,0($17)               /* load all eight quadwords of the line */
 174         ldq     $1,8($17)
 175         ldq     $2,16($17)
 176         ldq     $3,24($17)
 177
 178         ldq     $4,32($17)
 179         ldq     $5,40($17)
 180         ldq     $6,48($17)
 181         ldq     $7,56($17)
 182
 183         stq     $0,0($16)               /* store them to the destination line */
 184         subq    $18,1,$18               /* one fewer cleanup line left */
 185         stq     $1,8($16)
 186         addq    $17,64,$17              /* advance source to the next line */
 187
 188         stq     $2,16($16)
 189         stq     $3,24($16)
 190         stq     $4,32($16)
 191         stq     $5,40($16)
 192
 193         stq     $6,48($16)
 194         stq     $7,56($16)
 195         addq    $16,64,$16              /* advance destination to the next line */
 196         bne     $18, 2b                 /* repeat until all 10 lines are done */
 197
 198         ret
 199         nop                             /* filler after ret; presumably pads out the last group of four */
 200         unop
 201         nop
 202
 203         .end copy_page