2 * Loongson MMI optimizations for libjpeg-turbo
4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
6 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
8 * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
10 * Based on the x86 SIMD extension for IJG JPEG library
11 * Copyright (C) 1999-2006, MIYASAKA Masaru.
13 * This software is provided 'as-is', without any express or implied
14 * warranty. In no event will the authors be held liable for any damages
15 * arising from the use of this software.
17 * Permission is granted to anyone to use this software for any purpose,
18 * including commercial applications, and to alter it and redistribute it
19 * freely, subject to the following restrictions:
21 * 1. The origin of this software must not be misrepresented; you must not
22 * claim that you wrote the original software. If you use this software
23 * in a product, an acknowledgment in the product documentation would be
24 * appreciated but is not required.
25 * 2. Altered source versions must be plainly marked as such, and must not be
26 * misrepresented as being the original software.
27 * 3. This notice may not be removed or altered from any source distribution.
30 /* This file is included by jcgray-mmi.c */
90 void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width
, JSAMPARRAY input_buf
,
91 JSAMPIMAGE output_buf
, JDIMENSION output_row
,
94 JSAMPROW inptr
, outptr
;
96 __m64 re
, ro
, ge
, go
, be
, bo
, xe
;
97 #if RGB_PIXELSIZE == 4
100 __m64 rgle
, rghe
, rglo
, rgho
, bgle
, bghe
, bglo
, bgho
;
101 __m64 yle_rg
, yhe_rg
, yle_bg
, yhe_bg
, yle
, yhe
, ye
;
102 __m64 ylo_rg
, yho_rg
, ylo_bg
, yho_bg
, ylo
, yho
, yo
, y
;
104 while (--num_rows
>= 0) {
105 inptr
= *input_buf
++;
106 outptr
= output_buf
[0][output_row
];
109 for (num_cols
= image_width
; num_cols
> 0; num_cols
-= 8,
112 #if RGB_PIXELSIZE == 3
116 asm(".set noreorder\r\n"
120 "and $10, $9, $8\r\n"
124 "xor $12, $12, $12\r\n"
126 PTR_ADDU
"$13, $13, $9\r\n"
127 "lbu $12, 0($13)\r\n"
131 "and $10, $9, $8\r\n"
135 "xor $11, $11, $11\r\n"
137 PTR_ADDU
"$13, $13, $9\r\n"
138 "lhu $11, 0($13)\r\n"
139 "sll $12, $12, 16\r\n"
140 "or $12, $12, $11\r\n"
145 "and $10, $9, $8\r\n"
150 PTR_ADDU
"$13, $13, $9\r\n"
151 "lwu $14, 0($13)\r\n"
153 "dsll32 $12, $12, 0\r\n"
154 "or $12, $12, $14\r\n"
159 "and $10, $9, $8\r\n"
170 "and $10, $9, $8\r\n"
181 : "=f" (mmA
), "=f" (mmG
), "=f" (mmF
)
182 : "r" (col
), "r" (num_rows
), "r" (inptr
)
183 : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
187 if (!(((long)inptr
) & 7)) {
188 mmA
= _mm_load_si64((__m64
*)&inptr
[0]);
189 mmG
= _mm_load_si64((__m64
*)&inptr
[8]);
190 mmF
= _mm_load_si64((__m64
*)&inptr
[16]);
192 mmA
= _mm_loadu_si64((__m64
*)&inptr
[0]);
193 mmG
= _mm_loadu_si64((__m64
*)&inptr
[8]);
194 mmF
= _mm_loadu_si64((__m64
*)&inptr
[16]);
196 inptr
+= RGB_PIXELSIZE
* 8;
198 mmD
= _mm_srli_si64(mmA
, 4 * BYTE_BIT
);
199 mmA
= _mm_slli_si64(mmA
, 4 * BYTE_BIT
);
201 mmA
= _mm_unpackhi_pi8(mmA
, mmG
);
202 mmG
= _mm_slli_si64(mmG
, 4 * BYTE_BIT
);
204 mmD
= _mm_unpacklo_pi8(mmD
, mmF
);
205 mmG
= _mm_unpackhi_pi8(mmG
, mmF
);
207 mmE
= _mm_srli_si64(mmA
, 4 * BYTE_BIT
);
208 mmA
= _mm_slli_si64(mmA
, 4 * BYTE_BIT
);
210 mmA
= _mm_unpackhi_pi8(mmA
, mmD
);
211 mmD
= _mm_slli_si64(mmD
, 4 * BYTE_BIT
);
213 mmE
= _mm_unpacklo_pi8(mmE
, mmG
);
214 mmD
= _mm_unpackhi_pi8(mmD
, mmG
);
215 mmC
= _mm_loadhi_pi8_f(mmA
);
216 mmA
= _mm_loadlo_pi8_f(mmA
);
218 mmB
= _mm_loadhi_pi8_f(mmE
);
219 mmE
= _mm_loadlo_pi8_f(mmE
);
221 mmF
= _mm_loadhi_pi8_f(mmD
);
222 mmD
= _mm_loadlo_pi8_f(mmD
);
224 #else /* RGB_PIXELSIZE == 4 */
228 asm(".set noreorder\r\n"
232 "and $10, $9, $8\r\n"
236 PTR_SLL
"$11, $9, 2\r\n"
238 PTR_ADDU
"$13, $13, $11\r\n"
239 "lwc1 %0, 0($13)\r\n"
243 "and $10, $9, $8\r\n"
247 PTR_SLL
"$11, $9, 2\r\n"
249 PTR_ADDU
"$13, $13, $11\r\n"
251 "ldc1 %0, 0($13)\r\n"
255 "and $10, $9, $8\r\n"
267 : "=f" (mmA
), "=f" (mmF
), "=f" (mmD
), "=f" (mmC
)
268 : "r" (col
), "r" (inptr
)
269 : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
272 if (!(((long)inptr
) & 7)) {
273 mmA
= _mm_load_si64((__m64
*)&inptr
[0]);
274 mmF
= _mm_load_si64((__m64
*)&inptr
[8]);
275 mmD
= _mm_load_si64((__m64
*)&inptr
[16]);
276 mmC
= _mm_load_si64((__m64
*)&inptr
[24]);
278 mmA
= _mm_loadu_si64((__m64
*)&inptr
[0]);
279 mmF
= _mm_loadu_si64((__m64
*)&inptr
[8]);
280 mmD
= _mm_loadu_si64((__m64
*)&inptr
[16]);
281 mmC
= _mm_loadu_si64((__m64
*)&inptr
[24]);
283 inptr
+= RGB_PIXELSIZE
* 8;
285 mmB
= _mm_unpackhi_pi8(mmA
, mmF
);
286 mmA
= _mm_unpacklo_pi8(mmA
, mmF
);
288 mmG
= _mm_unpackhi_pi8(mmD
, mmC
);
289 mmD
= _mm_unpacklo_pi8(mmD
, mmC
);
291 mmE
= _mm_unpackhi_pi16(mmA
, mmD
);
292 mmA
= _mm_unpacklo_pi16(mmA
, mmD
);
294 mmH
= _mm_unpackhi_pi16(mmB
, mmG
);
295 mmB
= _mm_unpacklo_pi16(mmB
, mmG
);
297 mmC
= _mm_loadhi_pi8_f(mmA
);
298 mmA
= _mm_loadlo_pi8_f(mmA
);
300 mmD
= _mm_loadhi_pi8_f(mmB
);
301 mmB
= _mm_loadlo_pi8_f(mmB
);
303 mmG
= _mm_loadhi_pi8_f(mmE
);
304 mmE
= _mm_loadlo_pi8_f(mmE
);
306 mmF
= _mm_unpacklo_pi8(mmH
, mmH
);
307 mmH
= _mm_unpackhi_pi8(mmH
, mmH
);
308 mmF
= _mm_srli_pi16(mmF
, BYTE_BIT
);
309 mmH
= _mm_srli_pi16(mmH
, BYTE_BIT
);
313 /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
314 * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
317 * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
319 * (This implementation)
320 * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
323 rglo
= _mm_unpacklo_pi16(ro
, go
);
324 rgho
= _mm_unpackhi_pi16(ro
, go
);
325 ylo_rg
= _mm_madd_pi16(rglo
, PW_F0299_F0337
);
326 yho_rg
= _mm_madd_pi16(rgho
, PW_F0299_F0337
);
328 rgle
= _mm_unpacklo_pi16(re
, ge
);
329 rghe
= _mm_unpackhi_pi16(re
, ge
);
330 yle_rg
= _mm_madd_pi16(rgle
, PW_F0299_F0337
);
331 yhe_rg
= _mm_madd_pi16(rghe
, PW_F0299_F0337
);
333 bglo
= _mm_unpacklo_pi16(bo
, go
);
334 bgho
= _mm_unpackhi_pi16(bo
, go
);
335 ylo_bg
= _mm_madd_pi16(bglo
, PW_F0114_F0250
);
336 yho_bg
= _mm_madd_pi16(bgho
, PW_F0114_F0250
);
338 ylo
= _mm_add_pi32(ylo_bg
, ylo_rg
);
339 yho
= _mm_add_pi32(yho_bg
, yho_rg
);
340 ylo
= _mm_add_pi32(ylo
, PD_ONEHALF
);
341 yho
= _mm_add_pi32(yho
, PD_ONEHALF
);
342 ylo
= _mm_srli_pi32(ylo
, SCALEBITS
);
343 yho
= _mm_srli_pi32(yho
, SCALEBITS
);
344 yo
= _mm_packs_pi32(ylo
, yho
);
346 bgle
= _mm_unpacklo_pi16(be
, ge
);
347 bghe
= _mm_unpackhi_pi16(be
, ge
);
348 yle_bg
= _mm_madd_pi16(bgle
, PW_F0114_F0250
);
349 yhe_bg
= _mm_madd_pi16(bghe
, PW_F0114_F0250
);
351 yle
= _mm_add_pi32(yle_bg
, yle_rg
);
352 yhe
= _mm_add_pi32(yhe_bg
, yhe_rg
);
353 yle
= _mm_add_pi32(yle
, PD_ONEHALF
);
354 yhe
= _mm_add_pi32(yhe
, PD_ONEHALF
);
355 yle
= _mm_srli_pi32(yle
, SCALEBITS
);
356 yhe
= _mm_srli_pi32(yhe
, SCALEBITS
);
357 ye
= _mm_packs_pi32(yle
, yhe
);
359 yo
= _mm_slli_pi16(yo
, BYTE_BIT
);
360 y
= _mm_or_si64(ye
, yo
);
362 _mm_store_si64((__m64
*)&outptr
[0], y
);