arch/alpha/lib/memcpy.c

   1 /*
   2  *  linux/arch/alpha/lib/memcpy.c
   3  *
   4  *  Copyright (C) 1995  Linus Torvalds
   5  */
   6
   7 /*
   8  * This is a reasonably optimized memcpy() routine.
   9  */
  10
/*
 * Note that the C code is written to be optimized into good assembly. However,
 * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
 * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
 * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually.
 */
  17
  18 #include <linux/types.h>
  19
  20 /*
  21  * This should be done in one go with ldq_u*2/mask/stq_u. Do it
  22  * with a macro so that we can fix it up later..
  23  */
  24 #define ALIGN_DEST_TO8_UP(d,s,n) \
  25         while (d & 7) { \
  26                 if (n <= 0) return; \
  27                 n--; \
  28                 *(char *) d = *(char *) s; \
  29                 d++; s++; \
  30         }
  31 #define ALIGN_DEST_TO8_DN(d,s,n) \
  32         while (d & 7) { \
  33                 if (n <= 0) return; \
  34                 n--; \
  35                 d--; s--; \
  36                 *(char *) d = *(char *) s; \
  37         }
  38
  39 /*
  40  * This should similarly be done with ldq_u*2/mask/stq. The destination
  41  * is aligned, but we don't fill in a full quad-word
  42  */
  43 #define DO_REST_UP(d,s,n) \
  44         while (n > 0) { \
  45                 n--; \
  46                 *(char *) d = *(char *) s; \
  47                 d++; s++; \
  48         }
  49 #define DO_REST_DN(d,s,n) \
  50         while (n > 0) { \
  51                 n--; \
  52                 d--; s--; \
  53                 *(char *) d = *(char *) s; \
  54         }
  55
  56 /*
  57  * This should be done with ldq/mask/stq. The source and destination are
  58  * aligned, but we don't fill in a full quad-word
  59  */
  60 #define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n)
  61 #define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n)
  62
  63 /*
  64  * This does unaligned memory copies. We want to avoid storing to
  65  * an unaligned address, as that would do a read-modify-write cycle.
  66  * We also want to avoid double-reading the unaligned reads.
  67  *
  68  * Note the ordering to try to avoid load (and address generation) latencies.
  69  */
  70 static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s,
  71                                           long n)
  72 {
  73         ALIGN_DEST_TO8_UP(d,s,n);
  74         n -= 8;                 /* to avoid compare against 8 in the loop */
  75         if (n >= 0) {
  76                 unsigned long low_word, high_word;
  77                 __asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
  78                 do {
  79                         unsigned long tmp;
  80                         __asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
  81                         n -= 8;
  82                         __asm__("extql %1,%2,%0"
  83                                 :"=r" (low_word)
  84                                 :"r" (low_word), "r" (s));
  85                         __asm__("extqh %1,%2,%0"
  86                                 :"=r" (tmp)
  87                                 :"r" (high_word), "r" (s));
  88                         s += 8;
  89                         *(unsigned long *) d = low_word | tmp;
  90                         d += 8;
  91                         low_word = high_word;
  92                 } while (n >= 0);
  93         }
  94         n += 8;
  95         DO_REST_UP(d,s,n);
  96 }
  97
  98 static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
  99                                           long n)
 100 {
 101         /* I don't understand AXP assembler well enough for this. -Tim */
 102         s += n;
 103         d += n;
 104         while (n--)
 105                 * (char *) --d = * (char *) --s;
 106 }
 107
/*
 * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
 * for the load-store. I don't know why, but using a floating point register
 * for the move seems to slow things down (very small difference, though).
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
 116 static inline void __memcpy_aligned_up (unsigned long d, unsigned long s,
 117                                         long n)
 118 {
 119         ALIGN_DEST_TO8_UP(d,s,n);
 120         n -= 8;
 121         while (n >= 0) {
 122                 unsigned long tmp;
 123                 __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
 124                 n -= 8;
 125                 s += 8;
 126                 *(unsigned long *) d = tmp;
 127                 d += 8;
 128         }
 129         n += 8;
 130         DO_REST_ALIGNED_UP(d,s,n);
 131 }
 132 static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s,
 133                                         long n)
 134 {
 135         s += n;
 136         d += n;
 137         ALIGN_DEST_TO8_DN(d,s,n);
 138         n -= 8;
 139         while (n >= 0) {
 140                 unsigned long tmp;
 141                 s -= 8;
 142                 __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
 143                 n -= 8;
 144                 d -= 8;
 145                 *(unsigned long *) d = tmp;
 146         }
 147         n += 8;
 148         DO_REST_ALIGNED_DN(d,s,n);
 149 }
 150
 151 void * memcpy(void * dest, const void *src, size_t n)
 152 {
 153         if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
 154                 __memcpy_aligned_up ((unsigned long) dest, (unsigned long) src,
 155                                      n);
 156                 return dest;
 157         }
 158         __memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n);
 159         return dest;
 160 }
 161
 162 /* For backward modules compatibility, define __memcpy.  */
 163 asm("__memcpy = memcpy; .globl __memcpy");