/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
 *
 * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
 *
 * Based on the x86 SIMD extension for IJG JPEG library
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdmerge-mmi.c */
/*
 * Merged h2v1 (2:1 horizontal, 1:1 vertical) chroma upsampling and
 * YCbCr -> RGB conversion, processing eight column pairs per loop
 * iteration with Loongson MMI (64-bit SIMD) intrinsics, and using inline
 * MIPS assembly to store the final partial group of output bytes.
 *
 * This file is included by jdmerge-mmi.c; RGB_PIXELSIZE and the mmA..mmH
 * names used below are presumably macros supplied by the including file --
 * confirm against jdmerge-mmi.c.
 *
 * NOTE(review): this copy is a garbled extraction.  Spans marked
 * "[gap ...]" below were lost and must be restored from the pristine
 * source before this file can compile; do not build from this text as-is.
 */
void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
                                    /* [gap: "JSAMPIMAGE input_buf," parameter lost here --
                                       the body below reads input_buf] */
                                    JDIMENSION in_row_group_ctr,
                                    JSAMPARRAY output_buf)
  /* [gap: opening "{" and the declarations of num_cols / col lost] */
  JSAMPROW outptr, inptr0, inptr1, inptr2;
  __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
  __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
  __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
  __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
  __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
  __m64 decenter, mask, zero = 0.0;
#if RGB_PIXELSIZE == 4
  /* [gap: declarations for the 4-byte-pixel case (mm8 and mm9 are used in
     the RGB_PIXELSIZE == 4 branch below) and the matching #endif lost] */

  /* Per-component input rows for this row group: Y, Cb, Cr. */
  inptr0 = input_buf[0][in_row_group_ctr];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr = output_buf[0];

  /* Each iteration consumes 16 luma samples and 8 chroma samples
     (i.e. 8 column pairs); num_cols counts remaining column pairs. */
  for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
       inptr0 += 16, inptr1 += 8, inptr2 += 8) {

    cb = _mm_load_si64((__m64 *)inptr1);
    cr = _mm_load_si64((__m64 *)inptr2);
    ythis = _mm_load_si64((__m64 *)inptr0);
    ynext = _mm_load_si64((__m64 *)inptr0 + 1);

    /* Build constants in registers: mask selects the even-indexed bytes,
       decenter (-128 per word) removes the chroma bias when added. */
    mask = decenter = 0.0;
    mask = _mm_cmpeq_pi16(mask, mask);
    decenter = _mm_cmpeq_pi16(decenter, decenter);
    mask = _mm_srli_pi16(mask, BYTE_BIT);  /* {0xFF 0x00 0xFF 0x00 ..} */
    decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */

    /* Widen chroma bytes to 16-bit words and center them around zero. */
    cbl = _mm_unpacklo_pi8(cb, zero);  /* Cb(0123) */
    cbh = _mm_unpackhi_pi8(cb, zero);  /* Cb(4567) */
    crl = _mm_unpacklo_pi8(cr, zero);  /* Cr(0123) */
    crh = _mm_unpackhi_pi8(cr, zero);  /* Cr(4567) */
    cbl = _mm_add_pi16(cbl, decenter);
    cbh = _mm_add_pi16(cbh, decenter);
    crl = _mm_add_pi16(crl, decenter);
    crh = _mm_add_pi16(crh, decenter);

    /* (Original)
     * R = Y + 1.40200 * Cr
     * G = Y - 0.34414 * Cb - 0.71414 * Cr
     * B = Y + 1.77200 * Cb
     *
     * (This implementation)
     * R = Y + 0.40200 * Cr + Cr
     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
     * B = Y - 0.22800 * Cb + Cb + Cb
     */

    cbl2 = _mm_add_pi16(cbl, cbl);  /* 2*CbL */
    cbh2 = _mm_add_pi16(cbh, cbh);  /* 2*CbH */
    crl2 = _mm_add_pi16(crl, crl);  /* 2*CrL */
    crh2 = _mm_add_pi16(crh, crh);  /* 2*CrH */

    bl = _mm_mulhi_pi16(cbl2, PW_MF0228);  /* (2*CbL * -FIX(0.22800) */
    bh = _mm_mulhi_pi16(cbh2, PW_MF0228);  /* (2*CbH * -FIX(0.22800) */
    rl = _mm_mulhi_pi16(crl2, PW_F0402);  /* (2*CrL * FIX(0.40200)) */
    rh = _mm_mulhi_pi16(crh2, PW_F0402);  /* (2*CrH * FIX(0.40200)) */

    /* Round (add 1) and halve to undo the doubling above. */
    bl = _mm_add_pi16(bl, PW_ONE);
    bh = _mm_add_pi16(bh, PW_ONE);
    bl = _mm_srai_pi16(bl, 1);  /* (CbL * -FIX(0.22800)) */
    bh = _mm_srai_pi16(bh, 1);  /* (CbH * -FIX(0.22800)) */
    rl = _mm_add_pi16(rl, PW_ONE);
    rh = _mm_add_pi16(rh, PW_ONE);
    rl = _mm_srai_pi16(rl, 1);  /* (CrL * FIX(0.40200)) */
    rh = _mm_srai_pi16(rh, 1);  /* (CrH * FIX(0.40200)) */

    bl = _mm_add_pi16(bl, cbl);
    bh = _mm_add_pi16(bh, cbh);
    bl = _mm_add_pi16(bl, cbl);  /* (CbL * FIX(1.77200))=(B-Y)L */
    bh = _mm_add_pi16(bh, cbh);  /* (CbH * FIX(1.77200))=(B-Y)H */
    rl = _mm_add_pi16(rl, crl);  /* (CrL * FIX(1.40200))=(R-Y)L */
    rh = _mm_add_pi16(rh, crh);  /* (CrH * FIX(1.40200))=(R-Y)H */

    /* Green: interleave Cb/Cr words and use a multiply-add against the
       paired constants, then round and descale. */
    ga = _mm_unpacklo_pi16(cbl, crl);
    gb = _mm_unpackhi_pi16(cbl, crl);
    ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
    gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
    gc = _mm_unpacklo_pi16(cbh, crh);
    gd = _mm_unpackhi_pi16(cbh, crh);
    gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
    gd = _mm_madd_pi16(gd, PW_MF0344_F0285);

    ga = _mm_add_pi32(ga, PD_ONEHALF);
    gb = _mm_add_pi32(gb, PD_ONEHALF);
    ga = _mm_srai_pi32(ga, SCALEBITS);
    gb = _mm_srai_pi32(gb, SCALEBITS);
    gc = _mm_add_pi32(gc, PD_ONEHALF);
    gd = _mm_add_pi32(gd, PD_ONEHALF);
    gc = _mm_srai_pi32(gc, SCALEBITS);
    gd = _mm_srai_pi32(gd, SCALEBITS);

    gl = _mm_packs_pi32(ga, gb);  /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
    gh = _mm_packs_pi32(gc, gd);  /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
    gl = _mm_sub_pi16(gl, crl);  /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
    gh = _mm_sub_pi16(gh, crh);  /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */

    /* Split the 16 luma samples into even/odd columns; each (B-Y)/(G-Y)/
       (R-Y) word is shared by the two columns of its pair. */
    ythise = _mm_and_si64(mask, ythis);  /* Y(0246) */
    ythiso = _mm_srli_pi16(ythis, BYTE_BIT);  /* Y(1357) */
    ynexte = _mm_and_si64(mask, ynext);  /* Y(8ACE) */
    ynexto = _mm_srli_pi16(ynext, BYTE_BIT);  /* Y(9BDF) */

    rle = _mm_add_pi16(rl, ythise);  /* (R0 R2 R4 R6) */
    rlo = _mm_add_pi16(rl, ythiso);  /* (R1 R3 R5 R7) */
    rhe = _mm_add_pi16(rh, ynexte);  /* (R8 RA RC RE) */
    rho = _mm_add_pi16(rh, ynexto);  /* (R9 RB RD RF) */
    re = _mm_packs_pu16(rle, rhe);  /* (R0 R2 R4 R6 R8 RA RC RE) */
    ro = _mm_packs_pu16(rlo, rho);  /* (R1 R3 R5 R7 R9 RB RD RF) */

    gle = _mm_add_pi16(gl, ythise);  /* (G0 G2 G4 G6) */
    glo = _mm_add_pi16(gl, ythiso);  /* (G1 G3 G5 G7) */
    ghe = _mm_add_pi16(gh, ynexte);  /* (G8 GA GC GE) */
    gho = _mm_add_pi16(gh, ynexto);  /* (G9 GB GD GF) */
    ge = _mm_packs_pu16(gle, ghe);  /* (G0 G2 G4 G6 G8 GA GC GE) */
    go = _mm_packs_pu16(glo, gho);  /* (G1 G3 G5 G7 G9 GB GD GF) */

    ble = _mm_add_pi16(bl, ythise);  /* (B0 B2 B4 B6) */
    blo = _mm_add_pi16(bl, ythiso);  /* (B1 B3 B5 B7) */
    bhe = _mm_add_pi16(bh, ynexte);  /* (B8 BA BC BE) */
    bho = _mm_add_pi16(bh, ynexto);  /* (B9 BB BD BF) */
    be = _mm_packs_pu16(ble, bhe);  /* (B0 B2 B4 B6 B8 BA BC BE) */
    bo = _mm_packs_pu16(blo, bho);  /* (B1 B3 B5 B7 B9 BB BD BF) */

#if RGB_PIXELSIZE == 3

    /* Byte-level transpose of the six component registers into
       interleaved 3-byte pixels. */
    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
    mmG = _mm_unpacklo_pi8(mmA, mmC);  /* (00 10 02 12 04 14 06 16) */
    mmA = _mm_unpackhi_pi8(mmA, mmC);  /* (08 18 0A 1A 0C 1C 0E 1E) */
    mmH = _mm_unpacklo_pi8(mmE, mmB);  /* (20 01 22 03 24 05 26 07) */
    mmE = _mm_unpackhi_pi8(mmE, mmB);  /* (28 09 2A 0B 2C 0D 2E 0F) */
    mmC = _mm_unpacklo_pi8(mmD, mmF);  /* (11 21 13 23 15 25 17 27) */
    mmD = _mm_unpackhi_pi8(mmD, mmF);  /* (19 29 1B 2B 1D 2D 1F 2F) */

    mmB = _mm_unpacklo_pi16(mmG, mmA);  /* (00 10 08 18 02 12 0A 1A) */
    mmA = _mm_unpackhi_pi16(mmG, mmA);  /* (04 14 0C 1C 06 16 0E 1E) */
    mmF = _mm_unpacklo_pi16(mmH, mmE);  /* (20 01 28 09 22 03 2A 0B) */
    mmE = _mm_unpackhi_pi16(mmH, mmE);  /* (24 05 2C 0D 26 07 2E 0F) */
    mmH = _mm_unpacklo_pi16(mmC, mmD);  /* (11 21 19 29 13 23 1B 2B) */
    mmG = _mm_unpackhi_pi16(mmC, mmD);  /* (15 25 1D 2D 17 27 1F 2F) */

    mmC = _mm_unpacklo_pi16(mmB, mmF);  /* (00 10 20 01 08 18 28 09) */
    mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
    mmB = _mm_unpacklo_pi16(mmH, mmB);  /* (11 21 02 12 19 29 0A 1A) */
    mmD = _mm_unpackhi_pi16(mmF, mmH);  /* (22 03 13 23 2A 0B 1B 2B) */
    mmF = _mm_unpacklo_pi16(mmA, mmE);  /* (04 14 24 05 0C 1C 2C 0D) */
    mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
    mmH = _mm_unpacklo_pi16(mmG, mmA);  /* (15 25 06 16 1D 2D 0E 1E) */
    mmG = _mm_unpackhi_pi16(mmE, mmG);  /* (26 07 17 27 2E 0F 1F 2F) */

    mmA = _mm_unpacklo_pi32(mmC, mmB);  /* (00 10 20 01 11 21 02 12) */
    mmE = _mm_unpackhi_pi32(mmC, mmB);  /* (08 18 28 09 19 29 0A 1A) */
    mmB = _mm_unpacklo_pi32(mmD, mmF);  /* (22 03 13 23 04 14 24 05) */
    mmF = _mm_unpackhi_pi32(mmD, mmF);  /* (2A 0B 1B 2B 0C 1C 2C 0D) */
    mmC = _mm_unpacklo_pi32(mmH, mmG);  /* (15 25 06 16 26 07 17 27) */
    mmG = _mm_unpackhi_pi32(mmH, mmG);  /* (1D 2D 0E 1E 2E 0F 1F 2F) */

    /* [gap: line(s) lost here -- presumably "if (num_cols >= 8) {" so the
       full-group store path below only runs for complete groups; confirm
       against the pristine source] */
    if (!(((long)outptr) & 7)) {
      /* 8-byte-aligned destination: use aligned stores. */
      _mm_store_si64((__m64 *)outptr, mmA);
      _mm_store_si64((__m64 *)(outptr + 8), mmB);
      _mm_store_si64((__m64 *)(outptr + 16), mmC);
      _mm_store_si64((__m64 *)(outptr + 24), mmE);
      _mm_store_si64((__m64 *)(outptr + 32), mmF);
      _mm_store_si64((__m64 *)(outptr + 40), mmG);
      /* [gap: "} else {" lost] */
      _mm_storeu_si64((__m64 *)outptr, mmA);
      _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
      _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
      _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
      _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
      _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
      /* [gap: closing "}" lost] */
    outptr += RGB_PIXELSIZE * 16;
    /* [gap: line(s) lost -- presumably the "} else {" beginning the
       partial-group path] */
    /* col = number of remaining output BYTES: 6 per column pair, plus 3
       for a trailing odd column. */
    if (output_width & 1)
      col = num_cols * 6 + 3;
    /* [gap: "else col = num_cols * 6;" (or equivalent) lost] */
    /* Store exactly col bytes through a chain of shrinking unaligned
       stores (st24/st16/st8/st4/st2/st1).  $9 tracks the remaining byte
       count and $10 the output pointer (per the operand lists below). */
    asm(".set noreorder\r\n" /* st24 */
        /* [gap: register setup (loading col/outptr/pixel data into
           $8/$9/$10/$f4/$f6/$f8) lost] */
        "bltu $9, $8, 1f\r\n"
        /* [gap: branch delay slot (nop) lost] */
        "gssdlc1 $f4, 7($10)\r\n"
        "gssdrc1 $f4, 0($10)\r\n"
        "gssdlc1 $f6, 7+8($10)\r\n"
        "gssdrc1 $f6, 8($10)\r\n"
        "gssdlc1 $f8, 7+16($10)\r\n"
        "gssdrc1 $f8, 16($10)\r\n"
        /* [gap: register shuffling before the count update lost] */
        "subu $9, $9, 24\r\n"
        PTR_ADDU "$10, $10, 24\r\n"
        /* [gap: "1:" label lost] */
        "li $8, 16\r\n" /* st16 */
        "bltu $9, $8, 2f\r\n"
        /* [gap: delay slot lost] */
        "gssdlc1 $f4, 7($10)\r\n"
        "gssdrc1 $f4, 0($10)\r\n"
        "gssdlc1 $f6, 7+8($10)\r\n"
        "gssdrc1 $f6, 8($10)\r\n"
        /* [gap: register shuffle lost] */
        "subu $9, $9, 16\r\n"
        PTR_ADDU "$10, $10, 16\r\n"
        /* [gap: "2:" label lost] */
        "li $8, 8\r\n" /* st8 */
        "bltu $9, $8, 3f\r\n"
        /* [gap: delay slot lost] */
        "gssdlc1 $f4, 7($10)\r\n"
        "gssdrc1 $f4, 0($10)\r\n"
        /* [gap: register shuffle / count update lost] */
        PTR_ADDU "$10, $10, 8\r\n"
        /* [gap: "3:" label lost] */
        "li $8, 4\r\n" /* st4 */
        /* [gap: line lost (likely moving the low word into $11)] */
        "bltu $9, $8, 4f\r\n"
        /* [gap: delay slot lost] */
        "swl $11, 3($10)\r\n"
        "swr $11, 0($10)\r\n"
        /* [gap: shift-amount setup lost] */
        "dsrl $f4, $f4, $f6\r\n"
        /* [gap: register/count update lost] */
        PTR_ADDU "$10, $10, 4\r\n"
        /* [gap: "4:" label lost] */
        "li $8, 2\r\n" /* st2 */
        "bltu $9, $8, 5f\r\n"
        /* [gap: delay slot lost] */
        "ush $11, 0($10)\r\n"
        /* [gap: register/count update lost] */
        PTR_ADDU "$10, $10, 2\r\n"
        /* [gap: "5:" label lost] */
        "li $8, 1\r\n" /* st1 */
        "bltu $9, $8, 6f\r\n"
        /* [gap: delay slot, final byte store, "6:" label, ".set reorder",
           and the output-operand list lost] */
        : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
          "f" (mmG), "r" (col), "r" (outptr)
        : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
        /* [gap: closing ");" and end of partial-group branch lost] */

#else /* RGB_PIXELSIZE == 4 */

#ifdef RGBX_FILLER_0XFF
    /* Filler bytes are all-ones (0xFF). */
    xe = _mm_cmpeq_pi8(xe, xe);
    xo = _mm_cmpeq_pi8(xo, xo);
    /* [gap: "#else" lost] */
    /* Filler bytes are zero. */
    xe = _mm_xor_si64(xe, xe);
    xo = _mm_xor_si64(xo, xo);
    /* [gap: "#endif" lost] */

    /* Byte-level transpose of eight registers into 4-byte pixels. */
    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
    /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */

    mm8 = _mm_unpacklo_pi8(mmA, mmC);  /* (00 10 02 12 04 14 06 16) */
    mm9 = _mm_unpackhi_pi8(mmA, mmC);  /* (08 18 0A 1A 0C 1C 0E 1E) */
    mmA = _mm_unpacklo_pi8(mmE, mmG);  /* (20 30 22 32 24 34 26 36) */
    mmE = _mm_unpackhi_pi8(mmE, mmG);  /* (28 38 2A 3A 2C 3C 2E 3E) */

    mmG = _mm_unpacklo_pi8(mmB, mmD);  /* (01 11 03 13 05 15 07 17) */
    mmB = _mm_unpackhi_pi8(mmB, mmD);  /* (09 19 0B 1B 0D 1D 0F 1F) */
    mmD = _mm_unpacklo_pi8(mmF, mmH);  /* (21 31 23 33 25 35 27 37) */
    mmF = _mm_unpackhi_pi8(mmF, mmH);  /* (29 39 2B 3B 2D 3D 2F 3F) */

    mmH = _mm_unpacklo_pi16(mm8, mmA);  /* (00 10 20 30 02 12 22 32) */
    mm8 = _mm_unpackhi_pi16(mm8, mmA);  /* (04 14 24 34 06 16 26 36) */
    mmA = _mm_unpacklo_pi16(mmG, mmD);  /* (01 11 21 31 03 13 23 33) */
    mmD = _mm_unpackhi_pi16(mmG, mmD);  /* (05 15 25 35 07 17 27 37) */

    mmG = _mm_unpackhi_pi16(mm9, mmE);  /* (0C 1C 2C 3C 0E 1E 2E 3E) */
    mm9 = _mm_unpacklo_pi16(mm9, mmE);  /* (08 18 28 38 0A 1A 2A 3A) */
    mmE = _mm_unpacklo_pi16(mmB, mmF);  /* (09 19 29 39 0B 1B 2B 3B) */
    mmF = _mm_unpackhi_pi16(mmB, mmF);  /* (0D 1D 2D 3D 0F 1F 2F 3F) */

    mmB = _mm_unpackhi_pi32(mmH, mmA);  /* (02 12 22 32 03 13 23 33) */
    mmA = _mm_unpacklo_pi32(mmH, mmA);  /* (00 10 20 30 01 11 21 31) */
    mmC = _mm_unpacklo_pi32(mm8, mmD);  /* (04 14 24 34 05 15 25 35) */
    mmD = _mm_unpackhi_pi32(mm8, mmD);  /* (06 16 26 36 07 17 27 37) */

    mmH = _mm_unpackhi_pi32(mmG, mmF);  /* (0E 1E 2E 3E 0F 1F 2F 3F) */
    mmG = _mm_unpacklo_pi32(mmG, mmF);  /* (0C 1C 2C 3C 0D 1D 2D 3D) */
    mmF = _mm_unpackhi_pi32(mm9, mmE);  /* (0A 1A 2A 3A 0B 1B 2B 3B) */
    mmE = _mm_unpacklo_pi32(mm9, mmE);  /* (08 18 28 38 09 19 29 39) */

    /* [gap: line(s) lost -- presumably "if (num_cols >= 8) {"] */
    if (!(((long)outptr) & 7)) {
      /* 8-byte-aligned destination: use aligned stores. */
      _mm_store_si64((__m64 *)outptr, mmA);
      _mm_store_si64((__m64 *)(outptr + 8), mmB);
      _mm_store_si64((__m64 *)(outptr + 16), mmC);
      _mm_store_si64((__m64 *)(outptr + 24), mmD);
      _mm_store_si64((__m64 *)(outptr + 32), mmE);
      _mm_store_si64((__m64 *)(outptr + 40), mmF);
      _mm_store_si64((__m64 *)(outptr + 48), mmG);
      _mm_store_si64((__m64 *)(outptr + 56), mmH);
      /* [gap: "} else {" lost] */
      _mm_storeu_si64((__m64 *)outptr, mmA);
      _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
      _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
      _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
      _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
      _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
      _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
      _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
      /* [gap: closing "}" lost] */
    outptr += RGB_PIXELSIZE * 16;
    /* [gap: line(s) lost -- presumably "} else {" for the partial path] */
    /* col = remaining output PIXELS here: 2 per column pair, plus 1 for a
       trailing odd column (4 bytes each, per the store chain below). */
    if (output_width & 1)
      col = num_cols * 2 + 1;
    /* [gap: "else col = num_cols * 2;" (or equivalent) lost] */
    asm(".set noreorder\r\n" /* st32 */
        /* [gap: register setup (col/outptr/pixel data into
           $8/$9/$10/$f4..$f10) lost] */
        "bltu $9, $8, 1f\r\n"
        /* [gap: delay slot lost] */
        "gssdlc1 $f4, 7($10)\r\n"
        "gssdrc1 $f4, 0($10)\r\n"
        "gssdlc1 $f6, 7+8($10)\r\n"
        "gssdrc1 $f6, 8($10)\r\n"
        "gssdlc1 $f8, 7+16($10)\r\n"
        "gssdrc1 $f8, 16($10)\r\n"
        "gssdlc1 $f10, 7+24($10)\r\n"
        "gssdrc1 $f10, 24($10)\r\n"
        /* [gap: register shuffle / count update lost] */
        PTR_ADDU "$10, $10, 32\r\n"
        /* [gap: "1:" label lost] */
        "li $8, 4\r\n" /* st16 */
        "bltu $9, $8, 2f\r\n"
        /* [gap: delay slot lost] */
        "gssdlc1 $f4, 7($10)\r\n"
        "gssdrc1 $f4, 0($10)\r\n"
        "gssdlc1 $f6, 7+8($10)\r\n"
        "gssdrc1 $f6, 8($10)\r\n"
        /* [gap: register shuffle line lost] */
        "mov.s $f6, $f10\r\n"
        /* [gap: count update lost] */
        PTR_ADDU "$10, $10, 16\r\n"
        /* [gap: "2:" label lost] */
        "li $8, 2\r\n" /* st8 */
        "bltu $9, $8, 3f\r\n"
        /* [gap: delay slot lost] */
        "gssdlc1 $f4, 7($10)\r\n"
        "gssdrc1 $f4, 0($10)\r\n"
        /* [gap: register shuffle / count update lost] */
        PTR_ADDU "$10, $10, 8\r\n"
        /* [gap: "3:" label lost] */
        "li $8, 1\r\n" /* st4 */
        "bltu $9, $8, 4f\r\n"
        /* [gap: delay slot lost] */
        "gsswlc1 $f4, 3($10)\r\n"
        "gsswrc1 $f4, 0($10)\r\n"
        /* [gap: "4:" label lost] */
        "li %1, 0\r\n" /* end */
        : "=m" (*outptr), "=r" (col)
        : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
          "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
        : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
        /* [gap: closing ");", "#endif", and the for-loop's closing "}"
           lost] */

  /* When the column-pair count is an exact multiple of 8, the loop never
     took the partial-store path, so a trailing odd column (if any)
     appears to be handled here -- confirm against the pristine source. */
  if (!((output_width >> 1) & 7)) {
    if (output_width & 1) {
      cb = _mm_load_si64((__m64 *)inptr1);
      cr = _mm_load_si64((__m64 *)inptr2);
      y = _mm_load_si64((__m64 *)inptr0);

      /* [gap: line(s) lost (likely re-zeroing decenter)] */
      decenter = _mm_cmpeq_pi16(decenter, decenter);
      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */

      /* Same conversion as the main loop, low half only. */
      cbl = _mm_unpacklo_pi8(cb, zero);  /* Cb(0123) */
      crl = _mm_unpacklo_pi8(cr, zero);  /* Cr(0123) */
      cbl = _mm_add_pi16(cbl, decenter);
      crl = _mm_add_pi16(crl, decenter);

      cbl2 = _mm_add_pi16(cbl, cbl);  /* 2*CbL */
      crl2 = _mm_add_pi16(crl, crl);  /* 2*CrL */
      bl = _mm_mulhi_pi16(cbl2, PW_MF0228);  /* (2*CbL * -FIX(0.22800) */
      rl = _mm_mulhi_pi16(crl2, PW_F0402);  /* (2*CrL * FIX(0.40200)) */

      bl = _mm_add_pi16(bl, PW_ONE);
      bl = _mm_srai_pi16(bl, 1);  /* (CbL * -FIX(0.22800)) */
      rl = _mm_add_pi16(rl, PW_ONE);
      rl = _mm_srai_pi16(rl, 1);  /* (CrL * FIX(0.40200)) */

      bl = _mm_add_pi16(bl, cbl);
      bl = _mm_add_pi16(bl, cbl);  /* (CbL * FIX(1.77200))=(B-Y)L */
      rl = _mm_add_pi16(rl, crl);  /* (CrL * FIX(1.40200))=(R-Y)L */

      gl = _mm_unpacklo_pi16(cbl, crl);
      gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
      gl = _mm_add_pi32(gl, PD_ONEHALF);
      gl = _mm_srai_pi32(gl, SCALEBITS);
      gl = _mm_packs_pi32(gl, zero);  /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
      gl = _mm_sub_pi16(gl, crl);  /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */

      yl = _mm_unpacklo_pi8(y, zero);  /* Y(0123) */
      rl = _mm_add_pi16(rl, yl);  /* (R0 R1 R2 R3) */
      gl = _mm_add_pi16(gl, yl);  /* (G0 G1 G2 G3) */
      bl = _mm_add_pi16(bl, yl);  /* (B0 B1 B2 B3) */
      re = _mm_packs_pu16(rl, rl);
      ge = _mm_packs_pu16(gl, gl);
      be = _mm_packs_pu16(bl, bl);
#if RGB_PIXELSIZE == 3
      /* Interleave one R/G/B triple into the low bytes of mmA. */
      mmA = _mm_unpacklo_pi8(mmA, mmC);
      mmA = _mm_unpacklo_pi16(mmA, mmE);
      asm(".set noreorder\r\n"
          /* [gap: asm body (3-byte store of the final pixel) lost] */
          : "f" (mmA), "r" (outptr)
          : "$f4", "$8", "$9", "memory"
          /* [gap: closing ");" lost] */
#else /* RGB_PIXELSIZE == 4 */
#ifdef RGBX_FILLER_0XFF
      xe = _mm_cmpeq_pi8(xe, xe);
      /* [gap: "#else" lost] */
      xe = _mm_xor_si64(xe, xe);
      /* [gap: "#endif" lost] */
      /* Interleave one R/G/B/X quad into the low bytes of mmA. */
      mmA = _mm_unpacklo_pi8(mmA, mmC);
      mmE = _mm_unpacklo_pi8(mmE, mmG);
      mmA = _mm_unpacklo_pi16(mmA, mmE);
      asm(".set noreorder\r\n"
          /* [gap: asm setup lines lost] */
          "gsswlc1 $f4, 3($8)\r\n"
          "gsswrc1 $f4, 0($8)\r\n"
          /* [gap: trailing asm line(s) lost] */
          : "f" (mmA), "r" (outptr)
          : "$f4", "$8", "memory"
          /* [gap: closing ");", "#endif", and the function's closing
             braces lost] */
584 void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width
,
585 JSAMPIMAGE input_buf
,
586 JDIMENSION in_row_group_ctr
,
587 JSAMPARRAY output_buf
)
589 JSAMPROW inptr
, outptr
;
591 inptr
= input_buf
[0][in_row_group_ctr
];
592 outptr
= output_buf
[0];
594 input_buf
[0][in_row_group_ctr
] = input_buf
[0][in_row_group_ctr
* 2];
595 jsimd_h2v1_merged_upsample_mmi(output_width
, input_buf
, in_row_group_ctr
,
598 input_buf
[0][in_row_group_ctr
] = input_buf
[0][in_row_group_ctr
* 2 + 1];
599 output_buf
[0] = output_buf
[1];
600 jsimd_h2v1_merged_upsample_mmi(output_width
, input_buf
, in_row_group_ctr
,
603 input_buf
[0][in_row_group_ctr
] = inptr
;
604 output_buf
[0] = outptr
;