gfx/2d/ssse3-scaler.c

   1 /*
   2  * Copyright © 2013 Soren Sandmann Pedersen
   3  * Copyright © 2013 Red Hat, Inc.
   4  * Copyright © 2016 Mozilla Foundation
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice (including the next
  14  * paragraph) shall be included in all copies or substantial portions of the
  15  * Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23  * DEALINGS IN THE SOFTWARE.
  24  *
  25  * Author: Soren Sandmann (soren.sandmann@gmail.com)
  26  *         Jeff Muizelaar (jmuizelaar@mozilla.com)
  27  */
  28
  29 /* This has been adapted from the ssse3 code from pixman. It's currently
  30  * a mess as I want to try it out in practice before finalizing the details.
  31  */
  32
  33 #include <stdlib.h>
  34 #include <mmintrin.h>
  35 #include <xmmintrin.h>
  36 #include <emmintrin.h>
  37 #include <tmmintrin.h>
  38 #include <stdint.h>
  39 #include <assert.h>
  40 #include "ssse3-scaler.h"
  41
  42 typedef int32_t pixman_fixed_16_16_t;
  43 typedef pixman_fixed_16_16_t pixman_fixed_t;
  44 #define pixman_fixed_1 (pixman_int_to_fixed(1))
  45 #define pixman_fixed_to_int(f) ((int)((f) >> 16))
  46 #define pixman_int_to_fixed(i) ((pixman_fixed_t)((i) << 16))
  47 #define pixman_double_to_fixed(d) ((pixman_fixed_t)((d)*65536.0))
  48 #define PIXMAN_FIXED_INT_MAX 32767
  49 #define PIXMAN_FIXED_INT_MIN -32768
  50 typedef struct pixman_vector pixman_vector_t;
  51
  52 typedef int pixman_bool_t;
  53 typedef int64_t pixman_fixed_32_32_t;
  54 typedef pixman_fixed_32_32_t pixman_fixed_48_16_t;
  55 typedef struct {
  56   pixman_fixed_48_16_t v[3];
  57 } pixman_vector_48_16_t;
  58
  59 struct pixman_vector {
  60   pixman_fixed_t vector[3];
  61 };
  62 typedef struct pixman_transform pixman_transform_t;
  63
  64 struct pixman_transform {
  65   pixman_fixed_t matrix[3][3];
  66 };
  67
  68 #ifdef _MSC_VER
  69 #  define force_inline __forceinline
  70 #else
  71 #  define force_inline __inline__ __attribute__((always_inline))
  72 #endif
  73
  74 #define BILINEAR_INTERPOLATION_BITS 6
  75
  76 static force_inline int pixman_fixed_to_bilinear_weight(pixman_fixed_t x) {
  77   return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
  78          ((1 << BILINEAR_INTERPOLATION_BITS) - 1);
  79 }
  80
  81 static void pixman_transform_point_31_16_3d(const pixman_transform_t* t,
  82                                             const pixman_vector_48_16_t* v,
  83                                             pixman_vector_48_16_t* result) {
  84   int i;
  85   int64_t tmp[3][2];
  86
  87   /* input vector values must have no more than 31 bits (including sign)
  88    * in the integer part */
  89   assert(v->v[0] < ((pixman_fixed_48_16_t)1 << (30 + 16)));
  90   assert(v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
  91   assert(v->v[1] < ((pixman_fixed_48_16_t)1 << (30 + 16)));
  92   assert(v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
  93   assert(v->v[2] < ((pixman_fixed_48_16_t)1 << (30 + 16)));
  94   assert(v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
  95
  96   for (i = 0; i < 3; i++) {
  97     tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
  98     tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
  99     tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
 100     tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
 101     tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
 102     tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
 103   }
 104
 105   result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
 106   result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
 107   result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16);
 108 }
 109
 110 static pixman_bool_t pixman_transform_point_3d(
 111     const struct pixman_transform* transform, struct pixman_vector* vector) {
 112   pixman_vector_48_16_t tmp;
 113   tmp.v[0] = vector->vector[0];
 114   tmp.v[1] = vector->vector[1];
 115   tmp.v[2] = vector->vector[2];
 116
 117   pixman_transform_point_31_16_3d(transform, &tmp, &tmp);
 118
 119   vector->vector[0] = tmp.v[0];
 120   vector->vector[1] = tmp.v[1];
 121   vector->vector[2] = tmp.v[2];
 122
 123   return vector->vector[0] == tmp.v[0] && vector->vector[1] == tmp.v[1] &&
 124          vector->vector[2] == tmp.v[2];
 125 }
 126
 127 struct bits_image_t {
 128   uint32_t* bits;
 129   int rowstride;
 130   pixman_transform_t* transform;
 131 };
 132
 133 typedef struct bits_image_t bits_image_t;
 134 typedef struct {
 135   int unused;
 136 } pixman_iter_info_t;
 137
 138 typedef struct pixman_iter_t pixman_iter_t;
 139 typedef void (*pixman_iter_fini_t)(pixman_iter_t* iter);
 140
 141 struct pixman_iter_t {
 142   int x, y;
 143   pixman_iter_fini_t fini;
 144   bits_image_t* image;
 145   uint32_t* buffer;
 146   int width;
 147   int height;
 148   void* data;
 149 };
 150
 151 typedef struct {
 152   int y;
 153   uint64_t* buffer;
 154 } line_t;
 155
 156 typedef struct {
 157   line_t lines[2];
 158   pixman_fixed_t y;
 159   pixman_fixed_t x;
 160   uint64_t data[1];
 161 } bilinear_info_t;
 162
 163 static void ssse3_fetch_horizontal(bits_image_t* image, line_t* line, int y,
 164                                    pixman_fixed_t x, pixman_fixed_t ux, int n) {
 165   uint32_t* bits = image->bits + y * image->rowstride;
 166   __m128i vx = _mm_set_epi16(-(x + 1), x, -(x + 1), x, -(x + ux + 1), x + ux,
 167                              -(x + ux + 1), x + ux);
 168   __m128i vux = _mm_set_epi16(-2 * ux, 2 * ux, -2 * ux, 2 * ux, -2 * ux, 2 * ux,
 169                               -2 * ux, 2 * ux);
 170   __m128i vaddc = _mm_set_epi16(1, 0, 1, 0, 1, 0, 1, 0);
 171   __m128i* b = (__m128i*)line->buffer;
 172   __m128i vrl0, vrl1;
 173
 174   while ((n -= 2) >= 0) {
 175     __m128i vw, vr, s;
 176 #ifdef HACKY_PADDING
 177     if (pixman_fixed_to_int(x + ux) >= image->rowstride) {
 178       vrl1 = _mm_setzero_si128();
 179       printf("overread 2loop\n");
 180     } else {
 181       if (pixman_fixed_to_int(x + ux) < 0) printf("underflow\n");
 182       vrl1 = _mm_loadl_epi64(
 183           (__m128i*)(bits + (pixman_fixed_to_int(x + ux) < 0
 184                                  ? 0
 185                                  : pixman_fixed_to_int(x + ux))));
 186     }
 187 #else
 188     vrl1 = _mm_loadl_epi64((__m128i*)(bits + pixman_fixed_to_int(x + ux)));
 189 #endif
 190     /* vrl1: R1, L1 */
 191
 192   final_pixel:
 193 #ifdef HACKY_PADDING
 194     vrl0 = _mm_loadl_epi64(
 195         (__m128i*)(bits +
 196                    (pixman_fixed_to_int(x) < 0 ? 0 : pixman_fixed_to_int(x))));
 197 #else
 198     vrl0 = _mm_loadl_epi64((__m128i*)(bits + pixman_fixed_to_int(x)));
 199 #endif
 200     /* vrl0: R0, L0 */
 201
 202     /* The weights are based on vx which is a vector of
 203      *
 204      *    - (x + 1), x, - (x + 1), x,
 205      *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
 206      *
 207      * so the 16 bit weights end up like this:
 208      *
 209      *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
 210      *
 211      * and after shifting and packing, we get these bytes:
 212      *
 213      *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
 214      *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
 215      *
 216      * which means the first and the second input pixel
 217      * have to be interleaved like this:
 218      *
 219      *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
 220      *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
 221      *
 222      * before maddubsw can be used.
 223      */
 224
 225     vw = _mm_add_epi16(vaddc,
 226                        _mm_srli_epi16(vx, 16 - BILINEAR_INTERPOLATION_BITS));
 227     /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
 228      */
 229
 230     vw = _mm_packus_epi16(vw, vw);
 231     /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
 232      *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
 233      */
 234     vx = _mm_add_epi16(vx, vux);
 235
 236     x += 2 * ux;
 237
 238     vr = _mm_unpacklo_epi16(vrl1, vrl0);
 239     /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
 240
 241     s = _mm_shuffle_epi32(vr, _MM_SHUFFLE(1, 0, 3, 2));
 242     /* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
 243
 244     vr = _mm_unpackhi_epi8(vr, s);
 245     /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
 246      *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
 247      */
 248
 249     vr = _mm_maddubs_epi16(vr, vw);
 250
 251     /* When the weight is 0, the inverse weight is
 252      * 128 which can't be represented in a signed byte.
 253      * As a result maddubsw computes the following:
 254      *
 255      *     r = l * -128 + r * 0
 256      *
 257      * rather than the desired
 258      *
 259      *     r = l * 128 + r * 0
 260      *
 261      * We fix this by taking the absolute value of the
 262      * result.
 263      */
 264     // we can drop this if we use lower precision
 265
 266     vr = _mm_shuffle_epi32(vr, _MM_SHUFFLE(2, 0, 3, 1));
 267     /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
 268     _mm_store_si128(b++, vr);
 269   }
 270
 271   if (n == -1) {
 272     vrl1 = _mm_setzero_si128();
 273     goto final_pixel;
 274   }
 275
 276   line->y = y;
 277 }
 278
 279 // scale a line of destination pixels
 280 static uint32_t* ssse3_fetch_bilinear_cover(pixman_iter_t* iter,
 281                                             const uint32_t* mask) {
 282   pixman_fixed_t fx, ux;
 283   bilinear_info_t* info = iter->data;
 284   line_t *line0, *line1;
 285   int y0, y1;
 286   int32_t dist_y;
 287   __m128i vw, uvw;
 288   int i;
 289
 290   fx = info->x;
 291   ux = iter->image->transform->matrix[0][0];
 292
 293   y0 = pixman_fixed_to_int(info->y);
 294   if (y0 < 0) *(volatile char*)0 = 9;
 295   y1 = y0 + 1;
 296
 297   // clamping in y direction
 298   if (y1 >= iter->height) {
 299     y1 = iter->height - 1;
 300   }
 301
 302   line0 = &info->lines[y0 & 0x01];
 303   line1 = &info->lines[y1 & 0x01];
 304
 305   if (line0->y != y0) {
 306     ssse3_fetch_horizontal(iter->image, line0, y0, fx, ux, iter->width);
 307   }
 308
 309   if (line1->y != y1) {
 310     ssse3_fetch_horizontal(iter->image, line1, y1, fx, ux, iter->width);
 311   }
 312
 313 #ifdef PIXMAN_STYLE_INTERPOLATION
 314   dist_y = pixman_fixed_to_bilinear_weight(info->y);
 315   dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
 316
 317   vw = _mm_set_epi16(dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y,
 318                      dist_y);
 319
 320 #else
 321   // setup the weights for the top (vw) and bottom (uvw) lines
 322   dist_y = pixman_fixed_to_bilinear_weight(info->y);
 323   // we use 15 instead of 16 because we need an extra bit to handle when the
 324   // weights are 0 and 1
 325   dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS);
 326
 327   vw = _mm_set_epi16(dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y,
 328                      dist_y);
 329
 330   dist_y = (1 << BILINEAR_INTERPOLATION_BITS) -
 331            pixman_fixed_to_bilinear_weight(info->y);
 332   dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS);
 333   uvw = _mm_set_epi16(dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y,
 334                       dist_y);
 335 #endif
 336
 337   for (i = 0; i + 3 < iter->width; i += 4) {
 338     __m128i top0 = _mm_load_si128((__m128i*)(line0->buffer + i));
 339     __m128i bot0 = _mm_load_si128((__m128i*)(line1->buffer + i));
 340     __m128i top1 = _mm_load_si128((__m128i*)(line0->buffer + i + 2));
 341     __m128i bot1 = _mm_load_si128((__m128i*)(line1->buffer + i + 2));
 342 #ifdef PIXMAN_STYLE_INTERPOLATION
 343     __m128i r0, r1, tmp, p;
 344
 345     r0 = _mm_mulhi_epu16(_mm_sub_epi16(bot0, top0), vw);
 346     tmp = _mm_cmplt_epi16(bot0, top0);
 347     tmp = _mm_and_si128(tmp, vw);
 348     r0 = _mm_sub_epi16(r0, tmp);
 349     r0 = _mm_add_epi16(r0, top0);
 350     r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS);
 351     /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
 352     // r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
 353     /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
 354
 355     // tmp = bot1 < top1 ? vw : 0;
 356     // r1 = (bot1 - top1)*vw + top1 - tmp
 357     // r1 = bot1*vw - vw*top1 + top1 - tmp
 358     // r1 = bot1*vw + top1 - vw*top1 - tmp
 359     // r1 = bot1*vw + top1*(1 - vw) - tmp
 360     r1 = _mm_mulhi_epu16(_mm_sub_epi16(bot1, top1), vw);
 361     tmp = _mm_cmplt_epi16(bot1, top1);
 362     tmp = _mm_and_si128(tmp, vw);
 363     r1 = _mm_sub_epi16(r1, tmp);
 364     r1 = _mm_add_epi16(r1, top1);
 365     r1 = _mm_srli_epi16(r1, BILINEAR_INTERPOLATION_BITS);
 366     // r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
 367     /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
 368 #else
 369     __m128i r0, r1, p;
 370     top0 = _mm_mulhi_epu16(top0, uvw);
 371     bot0 = _mm_mulhi_epu16(bot0, vw);
 372     r0 = _mm_add_epi16(top0, bot0);
 373     r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS - 1);
 374
 375     top1 = _mm_mulhi_epu16(top1, uvw);
 376     bot1 = _mm_mulhi_epu16(bot1, vw);
 377     r1 = _mm_add_epi16(top1, bot1);
 378     r1 = _mm_srli_epi16(r1, BILINEAR_INTERPOLATION_BITS - 1);
 379 #endif
 380
 381     p = _mm_packus_epi16(r0, r1);
 382     _mm_storeu_si128((__m128i*)(iter->buffer + i), p);
 383   }
 384
 385   while (i < iter->width) {
 386     __m128i top0 = _mm_load_si128((__m128i*)(line0->buffer + i));
 387     __m128i bot0 = _mm_load_si128((__m128i*)(line1->buffer + i));
 388
 389 #ifdef PIXMAN_STYLE_INTERPOLATION
 390     __m128i r0, tmp, p;
 391     r0 = _mm_mulhi_epu16(_mm_sub_epi16(bot0, top0), vw);
 392     tmp = _mm_cmplt_epi16(bot0, top0);
 393     tmp = _mm_and_si128(tmp, vw);
 394     r0 = _mm_sub_epi16(r0, tmp);
 395     r0 = _mm_add_epi16(r0, top0);
 396     r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS);
 397     /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
 398     r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2, 0, 3, 1));
 399     /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
 400 #else
 401     __m128i r0, p;
 402     top0 = _mm_mulhi_epu16(top0, uvw);
 403     bot0 = _mm_mulhi_epu16(bot0, vw);
 404     r0 = _mm_add_epi16(top0, bot0);
 405     r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS - 1);
 406 #endif
 407
 408     p = _mm_packus_epi16(r0, r0);
 409
 410     if (iter->width - i == 1) {
 411       *(uint32_t*)(iter->buffer + i) = _mm_cvtsi128_si32(p);
 412       i++;
 413     } else {
 414       _mm_storel_epi64((__m128i*)(iter->buffer + i), p);
 415       i += 2;
 416     }
 417   }
 418
 419   info->y += iter->image->transform->matrix[1][1];
 420
 421   return iter->buffer;
 422 }
 423
 424 static void ssse3_bilinear_cover_iter_fini(pixman_iter_t* iter) {
 425   free(iter->data);
 426 }
 427
 428 static void ssse3_bilinear_cover_iter_init(pixman_iter_t* iter) {
 429   int width = iter->width;
 430   bilinear_info_t* info;
 431   pixman_vector_t v;
 432
 433   if (iter->x > PIXMAN_FIXED_INT_MAX || iter->x < PIXMAN_FIXED_INT_MIN ||
 434       iter->y > PIXMAN_FIXED_INT_MAX || iter->y < PIXMAN_FIXED_INT_MIN)
 435     goto fail;
 436
 437   /* Reference point is the center of the pixel */
 438   v.vector[0] = pixman_int_to_fixed(iter->x) + pixman_fixed_1 / 2;
 439   v.vector[1] = pixman_int_to_fixed(iter->y) + pixman_fixed_1 / 2;
 440   v.vector[2] = pixman_fixed_1;
 441
 442   if (!pixman_transform_point_3d(iter->image->transform, &v)) goto fail;
 443
 444   info = malloc(sizeof(*info) + (2 * width - 1) * sizeof(uint64_t) + 64);
 445   if (!info) goto fail;
 446
 447   info->x = v.vector[0] - pixman_fixed_1 / 2;
 448   info->y = v.vector[1] - pixman_fixed_1 / 2;
 449
 450 #define ALIGN(addr) ((void*)((((uintptr_t)(addr)) + 15) & (~15)))
 451
 452   /* It is safe to set the y coordinates to -1 initially
 453    * because COVER_CLIP_BILINEAR ensures that we will only
 454    * be asked to fetch lines in the [0, height) interval
 455    */
 456   info->lines[0].y = -1;
 457   info->lines[0].buffer = ALIGN(&(info->data[0]));
 458   info->lines[1].y = -1;
 459   info->lines[1].buffer = ALIGN(info->lines[0].buffer + width);
 460
 461   iter->fini = ssse3_bilinear_cover_iter_fini;
 462
 463   iter->data = info;
 464   return;
 465
 466 fail:
 467   /* Something went wrong, either a bad matrix or OOM; in such cases,
 468    * we don't guarantee any particular rendering.
 469    */
 470   iter->fini = NULL;
 471 }
 472
 473 /* scale the src from src_width/height to dest_width/height drawn
 474  * into the rectangle x,y width,height
 475  * src_stride and dst_stride are 4 byte units */
 476 bool ssse3_scale_data(uint32_t* src, int src_width, int src_height,
 477                       int src_stride, uint32_t* dest, int dest_width,
 478                       int dest_height, int dest_stride, int x, int y, int width,
 479                       int height) {
 480   // XXX: assert(src_width > 1)
 481   pixman_transform_t transform = {
 482       {{pixman_fixed_1, 0, 0}, {0, pixman_fixed_1, 0}, {0, 0, pixman_fixed_1}}};
 483   double width_scale = ((double)src_width) / dest_width;
 484   double height_scale = ((double)src_height) / dest_height;
 485 #define AVOID_PADDING
 486 #ifdef AVOID_PADDING
 487   // scale up by enough that we don't read outside of the bounds of the source
 488   // surface currently this is required to avoid reading out of bounds.
 489   if (width_scale < 1) {
 490     width_scale = (double)(src_width - 1) / dest_width;
 491     transform.matrix[0][2] = pixman_fixed_1 / 2;
 492   }
 493   if (height_scale < 1) {
 494     height_scale = (double)(src_height - 1) / dest_height;
 495     transform.matrix[1][2] = pixman_fixed_1 / 2;
 496   }
 497 #endif
 498   transform.matrix[0][0] = pixman_double_to_fixed(width_scale);
 499   transform.matrix[1][1] = pixman_double_to_fixed(height_scale);
 500   transform.matrix[2][2] = pixman_fixed_1;
 501
 502   bits_image_t image;
 503   image.bits = src;
 504   image.transform = &transform;
 505   image.rowstride = src_stride;
 506
 507   pixman_iter_t iter;
 508   iter.image = &image;
 509   iter.x = x;
 510   iter.y = y;
 511   iter.width = width;
 512   iter.height = src_height;
 513   iter.buffer = dest;
 514   iter.data = NULL;
 515
 516   ssse3_bilinear_cover_iter_init(&iter);
 517
 518   if (!iter.fini) return false;
 519
 520   if (iter.data) {
 521     for (int iy = 0; iy < height; iy++) {
 522       ssse3_fetch_bilinear_cover(&iter, NULL);
 523       iter.buffer += dest_stride;
 524     }
 525     ssse3_bilinear_cover_iter_fini(&iter);
 526   }
 527   return true;
 528 }