; NASM assembly language code for PAQ7.
; (C) 2005, Matt Mahoney.
; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt

; MINGW g++: nasm paq7asm.asm -f win32 --prefix _
; DJGPP g++:  nasm paq7asm.asm -f coff --prefix _
; Borland, Mars: nasm paq7asm.asm -f obj --prefix _
; Linux:      nasm paq7asm.asm -f elf

; For other Windows compilers try -f win32 or -f obj. Some old versions
; of Linux should use -f aout instead of -f elf.
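
; From C or C++ these routines are declared roughly as follows (a sketch;
; the exact prototypes in the PAQ7 sources may differ slightly, and the
; leading underscore expected by --prefix _ is added by the C compiler):
;
;   extern "C" int  dot_product(short* a, short* b, int n);
;   extern "C" void train(short* t, short* w, int n, int err);
;   extern "C" void do_emms();
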
; This code will only work on a Pentium-MMX or higher. It doesn't
; use extended (Katmai/SSE) instructions. It won't work
; in 64-bit mode.

section .text use32 class=CODE

; Reset after MMX
global do_emms
do_emms:
  emms
  ret

; Vector product a*b of n signed words, returning signed dword scaled
; down by 8 bits. n is rounded up to a multiple of 8.
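;
; Roughly equivalent C, for reference only (a sketch, not part of the
; build; assumes the arrays are padded to a multiple of 8 words):
;
;   int dot_product(short* a, short* b, int n) {
;     int sum = 0;
;     n = (n + 7) & -8;                          // round n up
;     for (int i = 0; i < n; i += 2)             // pmaddwd pairs products before the shift
;       sum += (a[i]*b[i] + a[i+1]*b[i+1]) >> 8;
;     return sum;
;   }
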
global dot_product              ; (short* a, short* b, int n)
align 16
dot_product:
  mov eax, [esp+4]              ; a
  mov edx, [esp+8]              ; b
  mov ecx, [esp+12]             ; n
  add ecx, 7                    ; round n up to a multiple of 8
  and ecx, -8
  jz .done
  sub eax, 8
  sub edx, 8
  pxor mm0, mm0                 ; sum = 0
.loop:                          ; each iteration sums 8 products
  movq mm1, [eax+ecx*2]         ; pairwise sums of products in mm1, mm2
  pmaddwd mm1, [edx+ecx*2]
  movq mm2, [eax+ecx*2-8]
  pmaddwd mm2, [edx+ecx*2-8]
  psrad mm1, 8
  psrad mm2, 8
  paddd mm0, mm1
  paddd mm0, mm2
  sub ecx, 8
  ja .loop
  movq mm1, mm0                 ; add 2 halves of mm0 and return in eax
  psrlq mm1, 32
  paddd mm0, mm1
  movd eax, mm0
  emms
.done:
  ret

; This should work on a Pentium 4 or higher in 32-bit mode,
; but it isn't much faster than the MMX version so I don't use it.
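;
; The same computation written with SSE2 intrinsics, for reference only
; (a sketch; assumes 16-byte aligned inputs, as movdqa below requires):
;
;   #include <emmintrin.h>
;   int dot_product_sse2_ref(const short* a, const short* b, int n) {
;     __m128i sum = _mm_setzero_si128();
;     n = (n + 7) & -8;
;     for (int i = 0; i < n; i += 8) {           // 8 words per iteration
;       __m128i p = _mm_madd_epi16(_mm_load_si128((const __m128i*)(a + i)),
;                                  _mm_load_si128((const __m128i*)(b + i)));
;       sum = _mm_add_epi32(sum, _mm_srai_epi32(p, 8));
;     }
;     sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));  // fold high qword into low
;     sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));  // fold remaining dword
;     return _mm_cvtsi128_si32(sum);
;   }
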
global dot_product_sse2         ; (short* a, short* b, int n)
align 16
dot_product_sse2:
  mov eax, [esp+4]              ; a
  mov edx, [esp+8]              ; b
  mov ecx, [esp+12]             ; n
  add ecx, 7                    ; round n up to a multiple of 8
  and ecx, -8
  jz .done
  sub eax, 16
  sub edx, 16
  pxor xmm0, xmm0               ; sum = 0
.loop:                          ; each iteration sums 8 products
  movdqa xmm1, [eax+ecx*2]      ; partial sums of vector product in xmm1
  pmaddwd xmm1, [edx+ecx*2]
  psrad xmm1, 8
  paddd xmm0, xmm1
  sub ecx, 8
  ja .loop
  movdqa xmm1, xmm0             ; add 4 parts of xmm0 and return in eax
  psrldq xmm1, 8
  paddd xmm0, xmm1
  movdqa xmm1, xmm0
  psrldq xmm1, 4
  paddd xmm0, xmm1
  movd eax, xmm0
.done:
  ret

; Train n neural network weights w[n] on inputs t[n] and err.
; w[i] += (t[i]*err*2+1) >> 17, bounded to +-32K.
; n is rounded up to a multiple of 8.
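;
; Roughly equivalent C, for reference only (a sketch; assumes err fits in
; a signed 16-bit word and ignores the intermediate saturation of t[i]*2):
;
;   void train(short* t, short* w, int n, int err) {
;     n = (n + 7) & -8;
;     for (int i = 0; i < n; ++i) {
;       int dw = (((t[i] * err * 2) >> 16) + 1) >> 1;   // pmulhw high word, then round
;       int x = w[i] + dw;
;       if (x > 32767) x = 32767;                       // saturate like paddsw
;       if (x < -32768) x = -32768;
;       w[i] = x;
;     }
;   }
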
global train                    ; (short* t, short* w, int n, int err)
align 16
train:
  mov eax, [esp+16]             ; err
  and eax, 0xffff               ; put 4 copies of err in mm0
  movd mm0, eax
  movd mm1, eax
  psllq mm1, 16
  por mm0, mm1
  movq mm1, mm0
  psllq mm1, 32
  por mm0, mm1
  pcmpeqb mm1, mm1              ; 4 copies of 1 in mm1
  psrlw mm1, 15
  mov eax, [esp+4]              ; t
  mov edx, [esp+8]              ; w
  mov ecx, [esp+12]             ; n
  add ecx, 7                    ; round n up to a multiple of 8
  and ecx, -8
  jz .done                      ; test n here, before the subs clobber the flags
  sub eax, 8
  sub edx, 8
.loop:                          ; each iteration adjusts 8 weights
  movq mm2, [edx+ecx*2]         ; w[i]
  movq mm3, [eax+ecx*2]         ; t[i]
  movq mm4, [edx+ecx*2-8]       ; w[i]
  movq mm5, [eax+ecx*2-8]       ; t[i]
  paddsw mm3, mm3               ; t[i]*2, saturated
  paddsw mm5, mm5
  pmulhw mm3, mm0               ; t[i]*2*err >> 16
  pmulhw mm5, mm0
  paddsw mm3, mm1               ; + 1 for rounding
  paddsw mm5, mm1
  psraw mm3, 1                  ; >> 1
  psraw mm5, 1
  paddsw mm2, mm3               ; w[i] += delta, saturated to +-32K
  paddsw mm4, mm5
  movq [edx+ecx*2], mm2
  movq [edx+ecx*2-8], mm4
  sub ecx, 8
  ja .loop
.done:
  emms
  ret