1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
11 #ifdef _MIPS_ARCH_LOONGSON3A
13 # include "MMIHelpers.h"
24 __m128i
loadUnaligned128(__m128i
* p
) {
29 ".set arch=loongson3a \n\t"
30 "gsldlc1 %[vh], 0xf(%[p]) \n\t"
31 "gsldrc1 %[vh], 0x8(%[p]) \n\t"
32 "gsldlc1 %[vl], 0x7(%[p]) \n\t"
33 "gsldrc1 %[vl], 0x0(%[p]) \n\t"
35 : [vh
] "=f"(v
.h
), [vl
] "=f"(v
.l
)
43 __m128i
Divide(__m128i aValues
, __m128i aDivisor
) {
46 __m128i mask
, ra
, p4321
, t1
, t2
;
50 ".set arch=loongson3a \n\t"
51 "li %[tmp], 0x80000000 \n\t"
52 "mtc1 %[tmp], %[ral] \n\t"
53 "xor %[maskl], %[maskl], %[maskl] \n\t"
54 "mov.d %[rah], %[ral] \n\t"
55 "li %[tmp], 0xffffffff \n\t"
56 "mthc1 %[tmp], %[maskl] \n\t"
57 "mov.d %[maskh], %[maskl] \n\t"
59 : [rah
] "=f"(ra
.h
), [ral
] "=f"(ra
.l
), [maskh
] "=f"(mask
.h
),
60 [maskl
] "=f"(mask
.l
), [tmp
] "=&r"(tmp
));
64 ".set arch=loongson3a \n\t"
65 "ori %[tmp], $0, 32 \n\t"
66 "mtc1 %[tmp], %[srl32] \n\t" _mm_pmuluw(t1
, av
, ad
)
67 _mm_psrld(t2
, av
, srl32
) _mm_pmuluw(t2
, t2
, ad
)
68 // Add 1 << 31 before shifting or masking the lower 32 bits away, so that
69 // the result is rounded.
70 _mm_paddd(t1
, t1
, ra
) _mm_psrld(t1
, t1
, srl32
) _mm_paddd(t2
, t2
, ra
)
71 _mm_and(t2
, t2
, mask
) _mm_or(p4321
, t1
, t2
) ".set pop \n\t"
72 : [p4321h
] "=&f"(p4321
.h
), [p4321l
] "=&f"(p4321
.l
), [t1h
] "=&f"(t1
.h
),
73 [t1l
] "=&f"(t1
.l
), [t2h
] "=&f"(t2
.h
), [t2l
] "=&f"(t2
.l
),
74 [srl32
] "=&f"(srl32
), [tmp
] "=&r"(tmp
)
75 : [rah
] "f"(ra
.h
), [ral
] "f"(ra
.l
), [maskh
] "f"(mask
.h
),
76 [maskl
] "f"(mask
.l
), [avh
] "f"(aValues
.h
), [avl
] "f"(aValues
.l
),
77 [adh
] "f"(aDivisor
.h
), [adl
] "f"(aDivisor
.l
));
83 __m128i
BlurFourPixels(const __m128i
& aTopLeft
, const __m128i
& aTopRight
,
84 const __m128i
& aBottomRight
, const __m128i
& aBottomLeft
,
85 const __m128i
& aDivisor
) {
90 ".set arch=loongson3a \n\t" _mm_psubw(val
, abr
, atr
)
91 _mm_psubw(val
, val
, abl
) _mm_paddw(val
, val
, atl
) ".set pop \n\t"
92 : [valh
] "=&f"(values
.h
), [vall
] "=&f"(values
.l
)
93 : [abrh
] "f"(aBottomRight
.h
), [abrl
] "f"(aBottomRight
.l
),
94 [atrh
] "f"(aTopRight
.h
), [atrl
] "f"(aTopRight
.l
),
95 [ablh
] "f"(aBottomLeft
.h
), [abll
] "f"(aBottomLeft
.l
),
96 [atlh
] "f"(aTopLeft
.h
), [atll
] "f"(aTopLeft
.l
));
98 return Divide(values
, aDivisor
);
// Fills one row of an integral image with the running (prefix) sum of a
// single row of 8-bit source pixels, padding both sides by repeating the
// edge pixels.
//
// Exactly aLeftInflation + aSourceWidth + aRightInflation entries of aDest
// are written, in three consecutive segments:
//   1. aLeftInflation entries, each adding aSource[0] again;
//   2. aSourceWidth entries, each adding the next source pixel in order;
//   3. aRightInflation entries, each adding aSource[aSourceWidth - 1] again.
//
// aDest            - output row; must have room for all entries listed above.
// aSource          - input row of aSourceWidth bytes; it is only read.
// aSourceWidth     - number of source pixels in the row.
// aLeftInflation   - number of padding entries written before the pixels.
// aRightInflation  - number of padding entries written after the pixels.
void LoadIntegralRowFromRow(uint32_t* aDest, const uint8_t* aSource,
                            int32_t aSourceWidth, int32_t aLeftInflation,
                            int32_t aRightInflation) {
  // Accumulate in the destination's own unsigned type. A signed accumulator
  // would make overflow undefined behaviour, whereas unsigned arithmetic
  // wraps in a well-defined way and yields the same stored values otherwise.
  uint32_t currentRowSum = 0;

  const int32_t sourceEnd = aLeftInflation + aSourceWidth;
  const int32_t rowEnd = sourceEnd + aRightInflation;

  // Left padding: keep accumulating the first source pixel. The edge pixel is
  // read inside the loop so that nothing is read when there is no padding.
  for (int32_t x = 0; x < aLeftInflation; x++) {
    currentRowSum += aSource[0];
    aDest[x] = currentRowSum;
  }

  // The source pixels themselves.
  for (int32_t x = aLeftInflation; x < sourceEnd; x++) {
    currentRowSum += aSource[x - aLeftInflation];
    aDest[x] = currentRowSum;
  }

  // Right padding: keep accumulating the last source pixel.
  for (int32_t x = sourceEnd; x < rowEnd; x++) {
    currentRowSum += aSource[aSourceWidth - 1];
    aDest[x] = currentRowSum;
  }
}
122 // This function calculates an integral of four pixels stored in the 4
123 // 32-bit integers in aPixels, i.e. for { 30, 50, 80, 100 } this returns
124 // { 30, 80, 160, 260 }. This seems to be the fastest way to do this after
127 __m128i
AccumulatePixelSums(__m128i aPixels
) {
130 __m128i sumPixels
, currentPixels
, zero
;
134 ".set arch=loongson3a \n\t"
137 "mtc1 %[tr], %[s64] \n\t"
139 "mtc1 %[tr], %[s4] \n\t"
140 _mm_psllq(cp
, ap
, s4
, s64
, t
)
141 _mm_paddw(sp
, ap
, cp
)
142 _mm_punpckldq(cp
, z
, sp
)
143 _mm_paddw(sp
, sp
, cp
)
145 :[sph
]"=&f"(sumPixels
.h
), [spl
]"=&f"(sumPixels
.l
),
146 [cph
]"=&f"(currentPixels
.h
), [cpl
]"=&f"(currentPixels
.l
),
147 [zh
]"=&f"(zero
.h
), [zl
]"=&f"(zero
.l
),
148 [s4
]"=&f"(s4
), [s64
]"=&f"(s64
), [t
]"=&f"(tmp
), [tr
]"=&r"(tr
)
149 :[aph
]"f"(aPixels
.h
), [apl
]"f"(aPixels
.l
)
156 void GenerateIntegralImage_LS3(int32_t aLeftInflation
, int32_t aRightInflation
,
157 int32_t aTopInflation
, int32_t aBottomInflation
,
158 uint32_t* aIntegralImage
,
159 size_t aIntegralImageStride
, uint8_t* aSource
,
160 int32_t aSourceStride
, const IntSize
& aSize
) {
161 MOZ_ASSERT(!(aLeftInflation
& 3));
163 uint32_t stride32bit
= aIntegralImageStride
/ 4;
165 IntSize
integralImageSize(aSize
.width
+ aLeftInflation
+ aRightInflation
,
166 aSize
.height
+ aTopInflation
+ aBottomInflation
);
168 LoadIntegralRowFromRow(aIntegralImage
, aSource
, aSize
.width
, aLeftInflation
,
171 for (int y
= 1; y
< aTopInflation
+ 1; y
++) {
172 uint32_t* intRow
= aIntegralImage
+ (y
* stride32bit
);
173 uint32_t* intPrevRow
= aIntegralImage
+ (y
- 1) * stride32bit
;
174 uint32_t* intFirstRow
= aIntegralImage
;
176 for (int x
= 0; x
< integralImageSize
.width
; x
+= 4) {
177 __m128i firstRow
, previousRow
;
181 ".set arch=loongson3a \n\t"
182 "gslqc1 %[frh], %[frl], (%[fr]) \n\t"
183 "gslqc1 %[prh], %[prl], (%[pr]) \n\t"
184 _mm_paddw(fr
, fr
, pr
)
185 "gssqc1 %[frh], %[frl], (%[r]) \n\t"
187 :[frh
]"=&f"(firstRow
.h
), [frl
]"=&f"(firstRow
.l
),
188 [prh
]"=&f"(previousRow
.h
), [prl
]"=&f"(previousRow
.l
)
189 :[fr
]"r"(intFirstRow
+ x
), [pr
]"r"(intPrevRow
+ x
),
201 ".set arch=loongson3a \n\t"
202 "li %[tmp], 0xee \n\t"
203 "mtc1 %[tmp], %[see] \n\t"
204 "li %[tmp], 0x44 \n\t"
205 "mtc1 %[tmp], %[s44] \n\t" _mm_xor(zero
, zero
, zero
) ".set pop \n\t"
206 : [tmp
] "=&r"(tmp
), [s44
] "=f"(s44
), [see
] "=f"(see
),
207 [zeroh
] "=f"(zero
.h
), [zerol
] "=f"(zero
.l
));
208 for (int y
= aTopInflation
+ 1; y
< (aSize
.height
+ aTopInflation
); y
++) {
209 __m128i currentRowSum
;
210 uint32_t* intRow
= aIntegralImage
+ (y
* stride32bit
);
211 uint32_t* intPrevRow
= aIntegralImage
+ (y
- 1) * stride32bit
;
212 uint8_t* sourceRow
= aSource
+ aSourceStride
* (y
- aTopInflation
);
213 uint32_t pixel
= sourceRow
[0];
217 ".set arch=loongson3a \n\t" _mm_xor(cr
, cr
, cr
) ".set pop \n\t"
218 : [crh
] "=f"(currentRowSum
.h
), [crl
] "=f"(currentRowSum
.l
));
219 for (int x
= 0; x
< aLeftInflation
; x
+= 4) {
220 __m128i sumPixels
, t
;
223 ".set arch=loongson3a \n\t"
224 "mtc1 %[pix], %[spl] \n\t"
225 "punpcklwd %[spl], %[spl], %[spl] \n\t"
226 "mov.d %[sph], %[spl] \n\t"
227 "pshufh %[sph], %[spl], %[s44] \n\t"
228 "pshufh %[spl], %[spl], %[s44] \n\t"
230 : [sph
] "=&f"(sumPixels
.h
), [spl
] "=&f"(sumPixels
.l
)
231 : [pix
] "r"(pixel
), [s44
] "f"(s44
));
232 sumPixels
= AccumulatePixelSums(sumPixels
);
235 ".set arch=loongson3a \n\t"
236 _mm_paddw(sp
, sp
, cr
)
237 "pshufh %[crh], %[sph], %[see] \n\t"
238 "pshufh %[crl], %[sph], %[see] \n\t"
239 "gslqc1 %[th], %[tl], (%[pr]) \n\t"
241 "gssqc1 %[th], %[tl], (%[r]) \n\t"
243 :[th
]"=&f"(t
.h
), [tl
]"=&f"(t
.l
),
244 [sph
]"+f"(sumPixels
.h
), [spl
]"+f"(sumPixels
.l
),
245 [crh
]"+f"(currentRowSum
.h
), [crl
]"+f"(currentRowSum
.l
)
246 :[r
]"r"(intRow
+ x
), [pr
]"r"(intPrevRow
+ x
), [see
]"f"(see
)
250 for (int x
= aLeftInflation
; x
< (aSize
.width
+ aLeftInflation
); x
+= 4) {
251 uint32_t pixels
= *(uint32_t*)(sourceRow
+ (x
- aLeftInflation
));
252 __m128i sumPixels
, t
;
254 // It's important to shuffle here. When we exit this loop currentRowSum
255 // has to be set to sumPixels, so that the following loop can get the
256 // correct pixel for the currentRowSum. The highest order pixel in
257 // currentRowSum could've originated from accumulation in the stride.
260 ".set arch=loongson3a \n\t"
261 "pshufh %[crl], %[crh], %[see] \n\t"
262 "pshufh %[crh], %[crh], %[see] \n\t"
263 "mtc1 %[pix], %[spl] \n\t"
264 "punpcklwd %[spl], %[spl], %[spl] \n\t"
265 "mov.d %[sph], %[spl] \n\t" _mm_punpcklbh(sp
, sp
, zero
)
266 _mm_punpcklhw(sp
, sp
, zero
) ".set pop \n\t"
267 : [sph
] "=&f"(sumPixels
.h
), [spl
] "=&f"(sumPixels
.l
),
268 [crh
] "+f"(currentRowSum
.h
), [crl
] "+f"(currentRowSum
.l
)
269 : [pix
] "r"(pixels
), [see
] "f"(see
), [zeroh
] "f"(zero
.h
),
270 [zerol
] "f"(zero
.l
));
271 sumPixels
= AccumulatePixelSums(sumPixels
);
274 ".set arch=loongson3a \n\t"
275 _mm_paddw(sp
, sp
, cr
)
276 "mov.d %[crh], %[sph] \n\t"
277 "mov.d %[crl], %[spl] \n\t"
278 "gslqc1 %[th], %[tl], (%[pr]) \n\t"
280 "gssqc1 %[th], %[tl], (%[r]) \n\t"
282 :[th
]"=&f"(t
.h
), [tl
]"=&f"(t
.l
),
283 [sph
]"+f"(sumPixels
.h
), [spl
]"+f"(sumPixels
.l
),
284 [crh
]"+f"(currentRowSum
.h
), [crl
]"+f"(currentRowSum
.l
)
285 :[r
]"r"(intRow
+ x
), [pr
]"r"(intPrevRow
+ x
)
290 pixel
= sourceRow
[aSize
.width
- 1];
291 int x
= (aSize
.width
+ aLeftInflation
);
292 if ((aSize
.width
& 3)) {
293 // Deal with unaligned portion. Get the correct pixel from currentRowSum,
294 // see explanation above.
295 uint32_t intCurrentRowSum
=
296 ((uint32_t*)¤tRowSum
)[(aSize
.width
% 4) - 1];
297 for (; x
< integralImageSize
.width
; x
++) {
298 // We could be unaligned here!
303 ".set arch=loongson3a \n\t"
304 "mtc1 %[cr], %[crl] \n\t"
305 "punpcklwd %[crl], %[crl], %[crl] \n\t"
306 "mov.d %[crh], %[crl] \n\t"
308 : [crh
] "=f"(currentRowSum
.h
), [crl
] "=f"(currentRowSum
.l
)
309 : [cr
] "r"(intCurrentRowSum
));
312 intCurrentRowSum
+= pixel
;
313 intRow
[x
] = intPrevRow
[x
] + intCurrentRowSum
;
318 ".set arch=loongson3a \n\t"
319 "pshufh %[crl], %[crh], %[see] \n\t"
320 "pshufh %[crh], %[crh], %[see] \n\t"
322 : [crh
] "+f"(currentRowSum
.h
), [crl
] "+f"(currentRowSum
.l
)
325 for (; x
< integralImageSize
.width
; x
+= 4) {
326 __m128i sumPixels
, t
;
329 ".set arch=loongson3a \n\t"
330 "mtc1 %[pix], %[spl] \n\t"
331 "punpcklwd %[spl], %[spl], %[spl] \n\t"
332 "mov.d %[sph], %[spl] \n\t"
334 : [sph
] "=f"(sumPixels
.h
), [spl
] "=f"(sumPixels
.l
)
336 sumPixels
= AccumulatePixelSums(sumPixels
);
339 ".set arch=loongson3a \n\t"
340 _mm_paddw(sp
, sp
, cr
)
341 "pshufh %[crh], %[sph], %[see] \n\t"
342 "pshufh %[crl], %[sph], %[see] \n\t"
343 "gslqc1 %[th], %[tl], (%[pr]) \n\t"
345 "gssqc1 %[th], %[tl], (%[r]) \n\t"
347 :[th
]"=&f"(t
.h
), [tl
]"=&f"(t
.l
),
348 [sph
]"+f"(sumPixels
.h
), [spl
]"+f"(sumPixels
.l
),
349 [crh
]"+f"(currentRowSum
.h
), [crl
]"+f"(currentRowSum
.l
)
350 :[r
]"r"(intRow
+ x
), [pr
]"r"(intPrevRow
+ x
), [see
]"f"(see
)
356 if (aBottomInflation
) {
357 // Store the last valid row of our source image in the last row of
358 // our integral image. This will be overwritten with the correct values
359 // in the upcoming loop.
360 LoadIntegralRowFromRow(
361 aIntegralImage
+ (integralImageSize
.height
- 1) * stride32bit
,
362 aSource
+ (aSize
.height
- 1) * aSourceStride
, aSize
.width
,
363 aLeftInflation
, aRightInflation
);
365 for (int y
= aSize
.height
+ aTopInflation
; y
< integralImageSize
.height
;
367 __m128i
* intRow
= (__m128i
*)(aIntegralImage
+ (y
* stride32bit
));
368 __m128i
* intPrevRow
= (__m128i
*)(aIntegralImage
+ (y
- 1) * stride32bit
);
369 __m128i
* intLastRow
=
370 (__m128i
*)(aIntegralImage
+
371 (integralImageSize
.height
- 1) * stride32bit
);
373 for (int x
= 0; x
< integralImageSize
.width
; x
+= 4) {
377 ".set arch=loongson3a \n\t"
378 "gslqc1 %[t1h], %[t1l], (%[lr]) \n\t"
379 "gslqc1 %[t2h], %[t2l], (%[pr]) \n\t"
380 _mm_paddw(t1
, t1
, t2
)
381 "gssqc1 %[t1h], %[t1l], (%[r]) \n\t"
383 :[t1h
]"=&f"(t1
.h
), [t1l
]"=&f"(t1
.l
),
384 [t2h
]"=&f"(t2
.h
), [t2l
]"=&f"(t2
.l
)
385 :[r
]"r"(intRow
+ (x
/ 4)),
386 [lr
]"r"(intLastRow
+ (x
/ 4)),
387 [pr
]"r"(intPrevRow
+ (x
/ 4))
396 * Attempt to do an in-place box blur using an integral image.
398 void AlphaBoxBlur::BoxBlur_LS3(uint8_t* aData
, int32_t aLeftLobe
,
399 int32_t aRightLobe
, int32_t aTopLobe
,
400 int32_t aBottomLobe
, uint32_t* aIntegralImage
,
401 size_t aIntegralImageStride
) const {
402 IntSize size
= GetSize();
404 MOZ_ASSERT(size
.height
> 0);
406 // Our 'left' or 'top' lobe will include the current pixel. i.e. when
407 // looking at an integral image the value of a pixel at 'x,y' is calculated
408 // using the value of the integral image values above/below that.
411 int32_t boxSize
= (aLeftLobe
+ aRightLobe
) * (aTopLobe
+ aBottomLobe
);
413 MOZ_ASSERT(boxSize
> 0);
419 uint32_t reciprocal
= uint32_t((uint64_t(1) << 32) / boxSize
);
421 uint32_t stride32bit
= aIntegralImageStride
/ 4;
422 int32_t leftInflation
= RoundUpToMultipleOf4(aLeftLobe
).value();
424 GenerateIntegralImage_LS3(leftInflation
, aRightLobe
, aTopLobe
, aBottomLobe
,
425 aIntegralImage
, aIntegralImageStride
, aData
,
428 __m128i divisor
, zero
;
431 ".set arch=loongson3a \n\t"
432 "mtc1 %[rec], %[divl] \n\t"
433 "punpcklwd %[divl], %[divl], %[divl] \n\t"
434 "mov.d %[divh], %[divl] \n\t" _mm_xor(zero
, zero
, zero
) ".set pop \n\t"
435 : [divh
] "=f"(divisor
.h
), [divl
] "=f"(divisor
.l
), [zeroh
] "=f"(zero
.h
),
437 : [rec
] "r"(reciprocal
));
439 // This points to the start of the rectangle within the IntegralImage that
440 // overlaps the surface being blurred.
441 uint32_t* innerIntegral
=
442 aIntegralImage
+ (aTopLobe
* stride32bit
) + leftInflation
;
444 IntRect skipRect
= mSkipRect
;
445 int32_t stride
= mStride
;
446 uint8_t* data
= aData
;
447 for (int32_t y
= 0; y
< size
.height
; y
++) {
448 bool inSkipRectY
= y
> skipRect
.y
&& y
< skipRect
.YMost();
450 uint32_t* topLeftBase
=
451 innerIntegral
+ ((y
- aTopLobe
) * ptrdiff_t(stride32bit
) - aLeftLobe
);
452 uint32_t* topRightBase
=
453 innerIntegral
+ ((y
- aTopLobe
) * ptrdiff_t(stride32bit
) + aRightLobe
);
454 uint32_t* bottomRightBase
=
456 ((y
+ aBottomLobe
) * ptrdiff_t(stride32bit
) + aRightLobe
);
457 uint32_t* bottomLeftBase
=
459 ((y
+ aBottomLobe
) * ptrdiff_t(stride32bit
) - aLeftLobe
);
462 // Process 16 pixels at a time for as long as possible.
463 for (; x
<= size
.width
- 16; x
+= 16) {
464 if (inSkipRectY
&& x
> skipRect
.x
&& x
< skipRect
.XMost()) {
465 x
= skipRect
.XMost() - 16;
466 // Trigger early jump on coming loop iterations, this will be reset
477 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
));
478 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
));
479 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
));
480 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
));
482 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
484 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
+ 4));
485 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
+ 4));
486 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
+ 4));
487 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
+ 4));
489 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
491 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
+ 8));
492 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
+ 8));
493 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
+ 8));
494 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
+ 8));
496 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
498 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
+ 12));
499 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
+ 12));
500 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
+ 12));
501 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
+ 12));
503 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
509 ".set arch=loongson3a \n\t"
510 _mm_packsswh(r3
, r3
, r4
, t
)
511 _mm_packsswh(f
, r1
, r2
, t
)
512 _mm_packushb(f
, f
, r3
, t
)
513 "gssdlc1 %[fh], 0xf(%[d]) \n\t"
514 "gssdrc1 %[fh], 0x8(%[d]) \n\t"
515 "gssdlc1 %[fl], 0x7(%[d]) \n\t"
516 "gssdrc1 %[fl], 0x0(%[d]) \n\t"
518 :[fh
]"=&f"(final
.h
), [fl
]"=&f"(final
.l
),
519 [r3h
]"+f"(result3
.h
), [r3l
]"+f"(result3
.l
),
521 :[r1h
]"f"(result1
.h
), [r1l
]"f"(result1
.l
),
522 [r2h
]"f"(result2
.h
), [r2l
]"f"(result2
.l
),
523 [r4h
]"f"(result4
.h
), [r4l
]"f"(result4
.l
),
524 [d
]"r"(data
+ stride
* y
+ x
)
529 // Process the remaining pixels 4 bytes at a time.
530 for (; x
< size
.width
; x
+= 4) {
531 if (inSkipRectY
&& x
> skipRect
.x
&& x
< skipRect
.XMost()) {
532 x
= skipRect
.XMost() - 4;
533 // Trigger early jump on coming loop iterations, this will be reset
538 __m128i topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
));
539 __m128i topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
));
540 __m128i bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
));
541 __m128i bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
));
544 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
550 ".set arch=loongson3a \n\t"
551 _mm_packsswh(f
, r
, zero
, t
)
552 _mm_packushb(f
, f
, zero
, t
)
553 "swc1 %[fl], (%[d]) \n\t"
555 :[fh
]"=&f"(final
.h
), [fl
]"=&f"(final
.l
),
557 :[d
]"r"(data
+ stride
* y
+ x
),
558 [rh
]"f"(result
.h
), [rl
]"f"(result
.l
),
559 [zeroh
]"f"(zero
.h
), [zerol
]"f"(zero
.l
)
567 } // namespace mozilla
569 #endif /* _MIPS_ARCH_LOONGSON3A */