firmware/target/sh/memcpy-sh.S

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2004-2005 by Jens Arnold
  11  *
  12  * All files in this archive are subject to the GNU General Public License.
  13  * See the file COPYING in the source tree root for full license agreement.
  14  *
  15  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  16  * KIND, either express or implied.
  17  *
  18  ****************************************************************************/
  19 #include "config.h"
  20
  21     .section    .icode,"ax",@progbits
  22
  23     .align      2
  24     .global     _memcpy
  25     .global     ___memcpy_fwd_entry
  26     .type       _memcpy,@function
  27
  28 /* Copies <length> bytes of data in memory from <source> to <dest>
  29  * This version is optimized for speed
  30  *
  31  * arguments:
  32  *  r4 - destination address
  33  *  r5 - source address
  34  *  r6 - length (compared unsigned: cmp/hs, cmp/hi)
  35  *
  36  * return value:
  37  *  r0 - destination address (like ANSI version)
  38  *
  39  * register usage:
  40  *  r0 - data / scratch
  41  *  r1 - 2nd data / scratch
  42  *  r2 - scratch (only used by the long + 1 and long + 3 main loops)
  43  *  r3 - first long bound / adjusted end address (only if >= 11 bytes)
  44  *  r4 - current dest address, kept 8 bytes below the real position
  45  *  r5 - current source address
  46  *  r6 - source end address
  47  *  r7 - stored dest start address
  48  *
      * Of the registers, only r0..r7 (and the T bit) are modified; the stack
      * is not used.
      *
      * entry points:
      *  _memcpy             - normal entry; saves the destination in r7
      *  ___memcpy_fwd_entry - falls in after that save, so r7 is not written
      *                        when entering here. r7 is what gets returned in
      *                        r0, so code entering at this label has to supply
      *                        it itself (the callers are not visible in this
      *                        file).
      *
      * strategy:
      *  - length < 11: everything is copied by the trailing byte loop.
      *  - otherwise: 0..3 leading bytes are copied until the source is long
      *    aligned, then one of four main loops (selected by dest address & 3)
      *    copies 2 longs per pass, an optional single long follows, and the
      *    trailing byte loop copies what is left.
      *
      * r4 runs 8 bytes below the real destination pointer. The byte loops
      * increment r4 and then store through @(7,r4); the main loops add 16 and
      * then store downwards using pre-decrement.
      *
      * The main loops for destinations that are not long aligned place the
      * low-order byte of each loaded long at the highest address, i.e. they
      * rely on big endian byte order.
      *
  49  * The instruction order is devised in a way to utilize the pipelining
  50  * of the SH1 to the max. The routine also tries to utilize fast page mode.
  51  */
  52
  53 _memcpy:
  54     mov     r4,r7       /* store dest for returning */
  55 ___memcpy_fwd_entry:
  56     add     #-8,r4      /* offset for early increment (max. 2 longs) */
  57     mov     #11,r0
         /* 11 = up to 3 leading bytes + 8: the main loops run their first
          * pass without checking, so 2 full longs must remain after the
          * source has been aligned */
  58     cmp/hs  r0,r6       /* at least 11 bytes to copy? (ensures 2 aligned longs) */
  59     add     r5,r6       /* r6 = source_end (add leaves T untouched for the bf) */
  60     bf      .start_b2   /* no: jump directly to byte loop */
  61
  62     mov     #3,r0
  63     neg     r5,r3
  64     and     r0,r3       /* r3 = (4 - align_offset) % 4 */
  65     tst     r3,r3       /* already aligned? */
  66     bt      .end_b1     /* yes: skip leading byte loop */
  67
  68     add     r5,r3       /* r3 = first source long bound */
  69
  70     /* leading byte loop: copies 0..3 bytes */
  71 .loop_b1:
  72     mov.b   @r5+,r0     /* load byte & increment source addr */
  73     add     #1,r4       /* increment dest addr */
  74     mov.b   r0,@(7,r4)  /* store byte (r4 + 7 = real dest of this byte) */
  75     cmp/hi  r5,r3       /* runs r5 up to first long bound */
  76     bt      .loop_b1
  77     /* now r5 is always at a long boundary */
  78     /* -> memory reading is done in longs for all dest alignments */
  79
  80     /* selector for main copy loop */
         /* r4 is the real dest minus 8, so r4 & 3 equals the real dest & 3.
          * The table holds signed byte offsets relative to .jmptab. */
  81 .end_b1:
  82     mov     #3,r1
  83     and     r4,r1       /* r1 = dest alignment offset */
  84     mova    .jmptab,r0
  85     mov.b   @(r0,r1),r1 /* select appropriate main loop */
  86     add     r0,r1
  87     mov     r6,r3       /* move end address to r3 */
  88     jmp     @r1         /* and jump to it */
  89     add     #-7,r3      /* (delay slot) adjust end addr for main loops doing 2 longs/pass */
  90
  91     /** main loops, copying 2 longs per pass to profit from fast page mode **/
         /* In every main loop r3 = source_end - 7, so "r3 > r5" after the two
          * loads means at least 8 more source bytes are left. Afterwards r3 is
          * raised by 4 (= source_end - 3), turning the same compare into "at
          * least 4 bytes left". The instructions between each cmp/hi and its
          * bt do not modify T. */
  92
  93     /* long aligned destination (fastest) */
  94     .align  2
  95 .loop_do0:
  96     mov.l   @r5+,r1     /* load first long & increment source addr */
  97     add     #16,r4      /* increment dest addr & account for decrementing stores */
  98     mov.l   @r5+,r0     /* load second long & increment source addr */
  99     cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
 100     mov.l   r0,@-r4     /* store second long */
 101     mov.l   r1,@-r4     /* store first long; NOT ALIGNED - no speed loss here! */
 102     bt      .loop_do0
 103
 104     add     #4,r3       /* readjust end address */
 105     cmp/hi  r5,r3       /* one long left? */
 106     bf      .start_b2   /* no, jump to trailing byte loop */
 107
 108     mov.l   @r5+,r0     /* load last long & increment source addr */
 109     add     #4,r4       /* increment dest addr */
 110     bra     .start_b2   /* jump to trailing byte loop */
 111     mov.l   r0,@(4,r4)  /* (delay slot) store last long */
 112
 113     /* word aligned destination (long + 2) */
         /* each pass stores, from the top down: word, long, word */
 114     .align  2
 115 .loop_do2:
 116     mov.l   @r5+,r1     /* load first long & increment source addr */
 117     add     #16,r4      /* increment dest addr */
 118     mov.l   @r5+,r0     /* load second long & increment source addr */
 119     cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
 120     mov.w   r0,@-r4     /* store low word of second long */
 121     xtrct   r1,r0       /* extract low word of first long & high word of second long */
 122     mov.l   r0,@-r4     /* and store as long */
 123     swap.w  r1,r0       /* get high word of first long */
 124     mov.w   r0,@-r4     /* and store it */
 125     bt      .loop_do2
 126
 127     add     #4,r3       /* readjust end address */
 128     cmp/hi  r5,r3       /* one long left? */
 129     bf      .start_b2   /* no, jump to trailing byte loop */
 130
 131     mov.l   @r5+,r0     /* load last long & increment source addr */
 132     add     #4,r4       /* increment dest addr */
 133     mov.w   r0,@(6,r4)  /* store low word */
 134     shlr16  r0          /* get high word */
 135     bra     .start_b2   /* jump to trailing byte loop */
 136     mov.w   r0,@(4,r4)  /* (delay slot) and store it */
 137
 138     /* jumptable for loop selector */
         /* indexed by (dest & 3); each entry is the signed distance in bytes
          * from .jmptab to the matching main loop */
 139     .align  2
 140 .jmptab:
 141     .byte   .loop_do0 - .jmptab  /* placed in the middle because the SH1 */
 142     .byte   .loop_do1 - .jmptab  /* loads bytes sign-extended. Otherwise */
 143     .byte   .loop_do2 - .jmptab  /* the last loop would be out of reach */
 144     .byte   .loop_do3 - .jmptab  /* of the offset range. */
 145
 146     /* byte aligned destination (long + 1) */
         /* each pass stores, from the top down: byte, long, word, byte */
 147     .align  2
 148 .loop_do1:
 149     mov.l   @r5+,r1     /* load first long & increment source addr */
 150     add     #16,r4      /* increment dest addr */
 151     mov.l   @r5+,r0     /* load second long & increment source addr */
 152     cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
 153     mov.b   r0,@-r4     /* store low byte of second long */
 154     shlr8   r0          /* get upper 3 bytes */
 155     mov     r1,r2       /* copy first long */
 156     shll16  r2          /* move low byte of first long all the way up, .. */
 157     shll8   r2
 158     or      r2,r0       /* ..combine with the 3 bytes of second long.. */
 159     mov.l   r0,@-r4     /* ..and store as long */
 160     shlr8   r1          /* get middle 2 bytes */
 161     mov.w   r1,@-r4     /* store as word */
 162     shlr16  r1          /* get upper byte */
 163     mov.b   r1,@-r4     /* and store */
 164     bt      .loop_do1
 165
 166     add     #4,r3       /* readjust end address */
         /* shared tail for the (long + 1) and (long + 3) loops: stores the
          * last long as byte, word, byte from the top down; expects r3 to be
          * readjusted already */
 167 .last_do13:
 168     cmp/hi  r5,r3       /* one long left? */
 169     bf      .start_b2   /* no, jump to trailing byte loop */
 170
 171     mov.l   @r5+,r0     /* load last long & increment source addr */
 172     add     #12,r4      /* increment dest addr */
 173     mov.b   r0,@-r4     /* store low byte */
 174     shlr8   r0          /* get middle 2 bytes */
 175     mov.w   r0,@-r4     /* store as word */
 176     shlr16  r0          /* get upper byte */
 177     mov.b   r0,@-r4     /* and store */
 178     bra     .start_b2   /* jump to trailing byte loop */
 179     add     #-4,r4      /* (delay slot) readjust destination */
 180
 181     /* byte aligned destination (long + 3) */
         /* each pass stores, from the top down: byte, word, long, byte */
 182     .align  2
 183 .loop_do3:
 184     mov.l   @r5+,r1     /* load first long & increment source addr */
 185     add     #16,r4      /* increment dest addr */
 186     mov.l   @r5+,r0     /* load second long & increment source addr */
 187     mov     r1,r2       /* copy first long */
 188     mov.b   r0,@-r4     /* store low byte of second long */
 189     shlr8   r0          /* get middle 2 bytes */
 190     mov.w   r0,@-r4     /* store as word */
 191     shlr16  r0          /* get upper byte */
 192     shll8   r2          /* move lower 3 bytes of first long one up.. */
 193     or      r2,r0       /* ..combine with the 1 byte of second long.. */
 194     mov.l   r0,@-r4     /* ..and store as long */
 195     shlr16  r1          /* get upper byte of first long.. */
 196     shlr8   r1
 197     cmp/hi  r5,r3       /* runs r5 up to last or second last long bound */
 198     mov.b   r1,@-r4     /* ..and store */
 199     bt      .loop_do3
 200
 201     bra     .last_do13  /* handle last longword: reuse routine for (long + 1) */
 202     add     #4,r3       /* (delay slot) readjust end address */
 203
 204     /* trailing byte loop: copies 0..3 bytes (or all for < 11 in total) */
         /* entered at .start_b2, i.e. the end check runs before the first
          * byte is copied, so a remaining count of 0 copies nothing */
 205     .align  2
 206 .loop_b2:
 207     mov.b   @r5+,r0     /* load byte & increment source addr */
 208     add     #1,r4       /* increment dest addr */
 209     mov.b   r0,@(7,r4)  /* store byte (r4 + 7 = real dest of this byte) */
 210 .start_b2:
 211     cmp/hi  r5,r6       /* runs r5 up to end address */
 212     bt      .loop_b2
 213
 214     rts
 215     mov     r7,r0       /* (delay slot) return dest start address */
 216 .end:
 217     .size   _memcpy,.end-_memcpy