firmware/common/memcpy_a.S

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2004 by Jens Arnold
  11  *
  12  * All files in this archive are subject to the GNU General Public License.
  13  * See the file COPYING in the source tree root for full license agreement.
  14  *
  15  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  16  * KIND, either express or implied.
  17  *
  18  ****************************************************************************/
  19 #include "config.h"
  20
  21     .section    .icode,"ax",@progbits
  22
  23 #if CONFIG_CPU == SH7034
  24     .align      2
  25     .global     _memcpy
  26     .type       _memcpy,@function
  27
  28 /* Copies <length> bytes of data in memory from <source> to <dest>
  29  * This version is optimized for speed (source is read in longs if >= 12 bytes)
  30  *
  31  * arguments:
  32  *  r4 - destination address
  33  *  r5 - source address
  34  *  r6 - length
  35  *
  36  * return value:
  37  *  r0 - destination address (like ANSI version)
  38  *
  39  * register usage:
  40  *  r0 - data / temporary
  41  *  r1 - bit mask for rounding to long bounds / 2nd data
  42  *  r2 - first long bound (only if >= 12 bytes)
  43  *  r3 - last long bound (-4) (only if >= 12 bytes)
  44  *  r4 - current dest address, biased by -8 (1 or 2 less in byte/word main loops)
  45  *  r5 - current source address
  46  *  r6 - source end address
  47  *  r7 - stored dest start address
  48  *
  49  * The instruction order below is devised in a way to utilize the pipelining
  50  * of the SH1 to the max. The routine also tries to utilize fast page mode.
  51  */
  52
  53 _memcpy:
  54     add     r5,r6       /* r6 = source_end (source + length) */
  55     mov     r4,r7       /* keep the untouched dest for the return value */
  56     add     #-8,r4      /* bias dest by -8 for early increments (max. 2 longs) */
  57
  58     mov     r6,r0       /* NOTE(review): r6 - 12 wraps for source_end < 12 (test then passes) - confirm n/a */
  59     add     #-12,r0     /* r0 = r6 - 12; don't go below 12 here! */
  60     cmp/hs  r5,r0       /* unsigned r0 >= r5, i.e. >= 12 bytes to copy? */
  61     bf      .start_b2   /* no, let the trailing byte loop copy everything */
  62
  63     mov     #-4,r1      /* r1 = 0xFFFFFFFC, clears the low 2 address bits */
  64
  65     mov     r5,r2       /* round source up to a multiple of 4: */
  66     add     #3,r2
  67     and     r1,r2       /* r2 = first source long bound */
  68     mov     r6,r3       /* round (source_end - 4) down to a multiple of 4: */
  69     add     #-4,r3      /* end offset for copying 2 longs per pass */
  70     bra     .start_b1   /* jump into leading byte loop */
  71     and     r1,r3       /* (bra delay slot) r3 = last source long bound - 4 */
  72
  73     /* leading byte loop: copies 0..3 bytes until the source is long aligned */
  74     .align  2
  75 .loop_b1:
  76     mov.b   @r5+,r0     /* load byte & increment source addr */
  77     add     #1,r4       /* increment dest addr */
  78     mov.b   r0,@(7,r4)  /* store byte; the +1 and the 7 make up for the -8 bias */
  79 .start_b1:
  80     cmp/hi  r5,r2       /* T = r2 > r5: runs r5 up to first long bound */
  81     bt      .loop_b1
  82     /* now r5 is always at a long boundary */
  83     /* -> memory reading is done in longs for all dest alignments */
  84
  85     /* selector for main copy loop, by alignment of the current dest */
  86     mov     r4,r0       /* test a copy in r0; the -8 bias keeps the low 2 bits of dest */
  87     tst     #3,r0       /* dest now also at long bound? */
  88     bt      .loop2_l    /* yes, do long copy */
  89     tst     #1,r0       /* dest now at least at word bound? */
  90     bt      .start4_w   /* yes, do word copy */
  91
  92     /* main loop for byte aligned (odd) destination (fast) */
  93     /* copies 1 long per pass, stored as byte + word + byte */
  94     add     #4,r3       /* reset end offset: r3 = last source long bound */
  95     add     #-1,r4      /* 1 more bias so the middle word store below is word aligned */
  96
  97 .loop4_b:
  98     mov.l   @r5+,r0     /* load a long & increment source addr */
  99     add     #4,r4       /* increment dest addr */
 100     mov.b   r0,@(8,r4)  /* store low byte at the highest of the 4 dest addresses */
 101     shlr8   r0          /* get middle 2 bytes */
 102     mov.w   r0,@(6,r4)  /* store them as one word write */
 103     shlr16  r0          /* get upper byte */
 104     mov.b   r0,@(5,r4)  /* and store at the lowest of the 4 dest addresses */
 105     cmp/hi  r5,r3       /* T = r3 > r5: runs r5 up to last long bound */
 106     bt      .loop4_b
 107
 108     bra     .start_b2   /* jump to trailing byte loop */
 109     add     #1,r4       /* (bra delay slot) readjust: back to the plain -8 bias */
 110
 111     /* main loop for word aligned destination (faster) */
 112     /* copies 2 longs per pass as word + long + word, utilizing fast page mode */
 113 .start4_w:
 114     add     #-2,r4      /* 2 more bias so the middle long store below is long aligned */
 115
 116 .loop4_w:
 117     mov.l   @r5+,r1     /* load first long & increment source addr */
 118     add     #8,r4       /* increment dest addr */
 119     mov.l   @r5+,r0     /* load second long & increment source addr */
 120     cmp/hi  r5,r3       /* T = r3 > r5 (2 more longs left); tested by the bt below */
 121     mov.w   r0,@(8,r4)  /* store low word of second long */
 122     xtrct   r1,r0       /* extract low word of first long & high word of second long */
 123     mov.l   r0,@(4,r4)  /* and store both as one long write */
 124     swap.w  r1,r0       /* get high word of first long */
 125     mov.w   r0,@(2,r4)  /* and store it */
 126     bt      .loop4_w    /* branches on T from the cmp/hi above */
 127
 128     add     #2,r4       /* readjust destination: back to the plain -8 bias */
 129     add     #4,r3       /* reset end offset: r3 = last source long bound */
 130     cmp/hi  r5,r3       /* one long left? */
 131     bf      .start_b2   /* no, jump to trailing byte loop */
 132
 133     mov.l   @r5+,r0     /* load last long & increment source addr */
 134     add     #4,r4       /* increment dest addr */
 135     mov.w   r0,@(6,r4)  /* store low word */
 136     shlr16  r0          /* get high word */
 137     bra     .start_b2   /* jump to trailing byte loop */
 138     mov.w   r0,@(4,r4)  /* (bra delay slot) and store it */
 139
 140     /* main loop for long aligned destination (fastest) */
 141     /* copies 2 longs per pass, utilizing fast page mode */
 142 .loop2_l:
 143     mov.l   @r5+,r1     /* load first long & increment source addr */
 144     add     #8,r4       /* increment dest addr; r4 is the true dest address here */
 145     mov.l   @r5+,r0     /* load second long & increment source addr */
 146     cmp/hi  r5,r3       /* T = r3 > r5 (2 more longs left); tested by the bt below */
 147     mov.l   r1,@r4      /* store first long */
 148     mov.l   r0,@(4,r4)  /* store second long; NOT ALIGNED - no speed loss here! */
 149     bt      .loop2_l    /* branches on T from the cmp/hi above */
 150
 151     add     #4,r3       /* reset end offset: r3 = last source long bound */
 152     cmp/hi  r5,r3       /* one long left? */
 153     bf      .start_b2   /* no, jump to trailing byte loop */
 154
 155     mov.l   @r5+,r0     /* load last long & increment source addr */
 156     add     #4,r4       /* increment dest addr */
 157     bra     .start_b2   /* jump to trailing byte loop */
 158     mov.l   r0,@(4,r4)  /* (bra delay slot) store last long */
 159
 160     /* trailing byte loop: copies 0..3 bytes (or all for < 12 in total) */
 161 .loop_b2:
 162     mov.b   @r5+,r0     /* load byte & increment source addr */
 163     add     #1,r4       /* increment dest addr */
 164     mov.b   r0,@(7,r4)  /* store byte; the +1 and the 7 make up for the -8 bias */
 165 .start_b2:
 166     cmp/hi  r5,r6       /* T = r6 > r5: runs r5 up to end address */
 167     bt      .loop_b2
 168
 169     rts
 170     mov     r7,r0       /* (rts delay slot) return dest start address */
 171 .end:
 172     .size   _memcpy,.end-_memcpy
 173 #elif CONFIG_CPU == MCF5249
 174     .align      2
 175     .global     memcpy
 176     .type       memcpy,@function
 177
 178 /* Copies <length> bytes of data in memory from <source> to <dest>, one byte
 179  * per pass (this version is not optimized at all). Arguments are read from
 180  * the stack: (4,%sp) = dest, (8,%sp) = source, (12,%sp) = length.
 181  * Returns the dest start address in %d0; uses %a0, %a1 and %d1 as scratch. */
 182 memcpy:
 183         move.l  (4,%sp),%a1     /* %a1 = destination, advanced while copying */
 184         move.l  (8,%sp),%a0     /* %a0 = source, advanced while copying */
 185         move.l  (12,%sp),%d1    /* %d1 = number of bytes left to copy */
 186         move.l  %a1,%d0         /* return value, taken before %a1 moves on */
 187         tst.l   %d1             /* set Z for the first pass through the test */
 188         bra.b   .byteloopend    /* enter at the test: length 0 copies nothing */
 189 .byteloop:
 190         move.b  (%a0)+,(%a1)+   /* copy one byte, post-increment both pointers */
 191         subq.l  #1,%d1          /* one byte less; Z is set once none remain */
 192 .byteloopend:
 193         bne.b   .byteloop       /* loop while the count in %d1 is not zero */
 194         rts
 195         .size   memcpy,.-memcpy
 196 #endif