mpeg2enc/mblock_sub44_sads.c

   1 /*
   2  *
   3  * mblock_sub44_sads.c
   4  * Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
   5  *
   6  * Fast block sum-absolute difference computation for a rectangular area 4*x
   7  * by y where y > h against a 4 by h block.
   8  *
   9  * Used for 4*4 sub-sampled motion compensation calculations.
  10  *
  11  * This is actually just a shell that uses templates from the included
  12  * file "mblock_sub44_sads_x86_h.c".  I didn't trust the compiler to do a good
  13  * job on nested inlining.  One day I'll experiment.
  14  *
  15  *
  16  * This file is part of mpeg2enc, a free MPEG-2 video stream encoder
  17  * based on the original MSSG reference design
  18  *
  19  * mpeg2enc is free software; you can redistribute new parts
  20  * and/or modify under the terms of the GNU General Public License
  21  * as published by
  22  * the Free Software Foundation; either version 2 of the License, or
  23  * (at your option) any later version.
  24  *
  25  * mpeg2dec is distributed in the hope that it will be useful,
  26  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  27  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  28  * GNU General Public License for more details.
  29  *
  30  * See the files for those sections (c) MSSG
  31  *
  32  * You should have received a copy of the GNU General Public License
  33  * along with this program; if not, write to the Free Software
  34  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  35  */
  36
  37 #include "config.h"
  38 #include "global.h"
  39 #include "mpeg2enc.h"
  40 #include "simd.h"
  41 #include "attributes.h"
  42 #include "mmx.h"
  43 #include "fastintfns.h"
  44
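/*
  Register usage:
  mm0-mm3  Hold the current block, one row per register
  mm4      Used for accumulating partial SAD
  mm5,mm6  Scratch: working copies of one block row and one reference row
  mm7      Holds zero (qblock_sad_mmx also uses it as a temporary and
           clears it again for every row)
 */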
  51
  52 static inline void mmx_zero_reg (void)
  53 {
  54         /*  load 0 into mm7      */
  55         pxor_r2r (mm7, mm7);
  56 }
  57
  58 /*
  59  * Load a 4*4 block of 4*4 sub-sampled pels (qpels) into the MMX
  60  * registers
  61  *
  62  */
  63
  64 static __inline__ void load_blk(uint8_t *blk, uint32_t rowstride, int h)
  65 {
  66 // Required to get GCC 4.0 to use the right registers as the source argument to
  67 // movq
  68         uint8_t *blk2 = blk + rowstride * 2;
  69
  70         movq_m2r( *blk, mm0);
  71         blk += rowstride;
  72         movq_m2r( *blk, mm1);
  73
  74
  75         if( h == 2 )
  76                 return;
  77
  78
  79         movq_m2r( *blk2, mm2);
  80         blk2 += rowstride;
  81         movq_m2r( *blk2, mm3);
  82 }
  83
  84 /*
  85  * Do a shift right on the 4*4 block in the MMX registers
  86  *
  87  */
  88 static __inline__ void shift_blk(const uint32_t shift)
  89 {
  90         psrlq_i2r( shift,mm0);
  91         psrlq_i2r( shift,mm1);
  92         psrlq_i2r( shift,mm2);
  93         psrlq_i2r( shift,mm3);
  94 }
  95
  96 /*
  97  * Compute the Sum absolute differences between the 4*h block in
  98  * the MMX registers
  99  *
 100  * and the 4*h block pointed to by refblk
 101  *
 102  * h == 2 || h == 4
 103  *
 104  * TODO: Currently always loads and shifts 4*4 even if 4*2 is required.
 105  *
 106  */
 107
 108 static __inline__ int qblock_sad_mmxe(uint8_t *refblk,
 109                                                                   uint32_t h,
 110                                                                   uint32_t rowstride)
 111 {
 112         int res;
 113         pxor_r2r        (mm4,mm4);
 114
 115         movq_r2r        (mm0,mm5);              /* First row */
 116         movd_m2r        (*refblk, mm6);
 117         pxor_r2r    ( mm7, mm7);
 118         refblk += rowstride;
 119         punpcklbw_r2r   ( mm7, mm5);
 120         punpcklbw_r2r   ( mm7, mm6);
 121         psadbw_r2r      ( mm5, mm6);
 122         paddw_r2r     ( mm6, mm4 );
 123
 124
 125
 126         movq_r2r        (mm1,mm5);              /* Second row */
 127         movd_m2r        (*refblk, mm6);
 128         refblk += rowstride;
 129         punpcklbw_r2r   ( mm7, mm5);
 130         punpcklbw_r2r   ( mm7, mm6);
 131         psadbw_r2r      ( mm5, mm6);
 132         paddw_r2r     ( mm6, mm4 );
 133
 134         if( h == 4 )
 135         {
 136
 137                 movq_r2r        (mm2,mm5);              /* Third row */
 138                 movd_m2r        (*refblk, mm6);
 139                 refblk += rowstride;
 140                 punpcklbw_r2r   ( mm7, mm5);
 141                 punpcklbw_r2r   ( mm7, mm6);
 142                 psadbw_r2r      ( mm5, mm6);
 143                 paddw_r2r     ( mm6, mm4 );
 144
 145
 146                 movq_r2r        (mm3,mm5);              /* Fourth row */
 147                 movd_m2r        (*refblk, mm6);
 148                 punpcklbw_r2r   ( mm7, mm5);
 149                 punpcklbw_r2r   ( mm7, mm6);
 150                 psadbw_r2r      ( mm5, mm6);
 151                 paddw_r2r     ( mm6, mm4 );
 152
 153         }
 154         movd_r2m      ( mm4, res );
 155
 156         return res;
 157 }
 158
 159
 160
 161 static __inline__ int qblock_sad_mmx(uint8_t *refblk,
 162                                                                   uint32_t h,
 163                                                                   uint32_t rowstride)
 164 {
 165         int res;
 166         pxor_r2r        (mm4,mm4);
 167
 168         movq_r2r        (mm0,mm5);              /* First row */
 169         movd_m2r        (*refblk, mm6);
 170         pxor_r2r    ( mm7, mm7);
 171         refblk += rowstride;
 172         punpcklbw_r2r   ( mm7, mm5);
 173
 174         punpcklbw_r2r   ( mm7, mm6);
 175
 176         movq_r2r                ( mm5, mm7);
 177         psubusw_r2r     ( mm6, mm5);
 178
 179         psubusw_r2r   ( mm7, mm6);
 180
 181         paddw_r2r     ( mm5, mm4);
 182         paddw_r2r     ( mm6, mm4 );
 183
 184
 185
 186         movq_r2r        (mm1,mm5);              /* Second row */
 187         movd_m2r        (*refblk, mm6);
 188         pxor_r2r    ( mm7, mm7);
 189         refblk += rowstride;
 190         punpcklbw_r2r   ( mm7, mm5);
 191         punpcklbw_r2r   ( mm7, mm6);
 192         movq_r2r                ( mm5, mm7);
 193         psubusw_r2r     ( mm6, mm5);
 194         psubusw_r2r   ( mm7, mm6);
 195         paddw_r2r     ( mm5, mm4);
 196         paddw_r2r     ( mm6, mm4 );
 197
 198         if( h == 4 )
 199         {
 200
 201                 movq_r2r        (mm2,mm5);              /* Third row */
 202                 movd_m2r        (*refblk, mm6);
 203                 pxor_r2r    ( mm7, mm7);
 204                 refblk += rowstride;
 205                 punpcklbw_r2r   ( mm7, mm5);
 206                 punpcklbw_r2r   ( mm7, mm6);
 207                 movq_r2r                ( mm5, mm7);
 208                 psubusw_r2r     ( mm6, mm5);
 209                 psubusw_r2r   ( mm7, mm6);
 210                 paddw_r2r     ( mm5, mm4);
 211                 paddw_r2r     ( mm6, mm4 );
 212
 213                 movq_r2r        (mm3,mm5);              /* Fourth row */
 214                 movd_m2r        (*refblk, mm6);
 215                 pxor_r2r    ( mm7, mm7);
 216                 punpcklbw_r2r   ( mm7, mm5);
 217                 punpcklbw_r2r   ( mm7, mm6);
 218                 movq_r2r                ( mm5, mm7);
 219                 psubusw_r2r     ( mm6, mm5);
 220                 psubusw_r2r   ( mm7, mm6);
 221                 paddw_r2r     ( mm5, mm4);
 222                 paddw_r2r     ( mm6, mm4 );
 223         }
 224
 225
 226         movq_r2r      ( mm4, mm5 );
 227     psrlq_i2r     ( 32, mm5 );
 228     paddw_r2r     ( mm5, mm4 );
 229         movq_r2r      ( mm4, mm6 );
 230     psrlq_i2r     ( 16, mm6 );
 231     paddw_r2r     ( mm6, mm4 );
 232         movd_r2m      ( mm4, res );
 233
 234         return res & 0xffff;
 235 }
 236
 237
/*
 * Do the Extended MMX versions: include the template file with
 * SIMD_SUFFIX appending _mmxe to the names it is applied to.
 */
 241 #define SIMD_SUFFIX(x) x##_mmxe
 242 #include "mblock_sub44_sads_x86_h.c"
 243 #undef SIMD_SUFFIX
/*
 * Do the original MMX versions: include the same template file again,
 * this time with SIMD_SUFFIX appending _mmx.
 */
 247 #define SIMD_SUFFIX(x) x##_mmx
 248 #include "mblock_sub44_sads_x86_h.c"
 249 #undef SIMD_SUFFIX
 250
 251
 252
 253