Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / skia / ext / convolver_mips_dspr2.cc
blob955abef7a5259a92b0f3a66ca9fc30c7017a4bb7
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include <algorithm>
6 #include "skia/ext/convolver.h"
7 #include "skia/ext/convolver_mips_dspr2.h"
8 #include "third_party/skia/include/core/SkTypes.h"
10 namespace skia {
11 // Convolves horizontally along a single row. The row data is given in
12 // |src_data| and continues for the num_values() of the filter.
13 void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data,
14 const ConvolutionFilter1D& filter,
15 unsigned char* out_row,
16 bool has_alpha) {
17 #if SIMD_MIPS_DSPR2
18 int row_to_filter = 0;
19 int num_values = filter.num_values();
20 if (has_alpha) {
21 for (int out_x = 0; out_x < num_values; out_x++) {
22 // Get the filter that determines the current output pixel.
23 int filter_offset, filter_length;
24 const ConvolutionFilter1D::Fixed* filter_values =
25 filter.FilterForValue(out_x, &filter_offset, &filter_length);
26 int filter_x = 0;
28 __asm__ __volatile__ (
29 ".set push \n"
30 ".set noreorder \n"
32 "beqz %[filter_len], 3f \n"
33 " sll $t0, %[filter_offset], 2 \n"
34 "addu %[rtf], %[src_data], $t0 \n"
35 "mtlo $0, $ac0 \n"
36 "mtlo $0, $ac1 \n"
37 "mtlo $0, $ac2 \n"
38 "mtlo $0, $ac3 \n"
39 "srl $t7, %[filter_len], 2 \n"
40 "beqz $t7, 2f \n"
41 " li %[fx], 0 \n"
43 "11: \n"
44 "addu $t4, %[filter_val], %[fx] \n"
45 "sll $t5, %[fx], 1 \n"
46 "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]|
47 "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]|
48 "addu $t0, %[rtf], $t5 \n"
49 "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0|
50 "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1|
51 "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2|
52 "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3|
53 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0|
54 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0|
55 "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a1|0|a0|
56 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g1|0|g0|
57 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0|
58 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0|
59 "dpa.w.ph $ac0, $t1, $t6 \n" // ac0+(cur*a1)+(cur*a0)
60 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0)
61 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0)
62 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0)
63 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2|
64 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2|
65 "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a3|0|a2|
66 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2|
67 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2|
68 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2|
69 "dpa.w.ph $ac0, $t1, $t8 \n" // ac0+(cur*a3)+(cur*a2)
70 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2)
71 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2)
72 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2)
73 "addiu $t7, $t7, -1 \n"
74 "bgtz $t7, 11b \n"
75 " addiu %[fx], %[fx], 8 \n"
77 "2: \n"
78 "andi $t7, %[filter_len], 0x3 \n" // residual
79 "beqz $t7, 3f \n"
80 " nop \n"
82 "21: \n"
83 "sll $t1, %[fx], 1 \n"
84 "addu $t2, %[filter_val], %[fx] \n"
85 "addu $t0, %[rtf], $t1 \n"
86 "lh $t6, 0($t2) \n" // t6 = filter_val[fx]
87 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0]
88 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1]
89 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2]
90 "lbu $t4, 3($t0) \n" // t4 = row[fx * 4 + 2]
91 "maddu $ac3, $t6, $t1 \n"
92 "maddu $ac2, $t6, $t2 \n"
93 "maddu $ac1, $t6, $t3 \n"
94 "maddu $ac0, $t6, $t4 \n"
95 "addiu $t7, $t7, -1 \n"
96 "bgtz $t7, 21b \n"
97 " addiu %[fx], %[fx], 2 \n"
99 "3: \n"
100 "extrv.w $t0, $ac0, %[kShiftBits] \n" // a >> kShiftBits
101 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits
102 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits
103 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits
104 "sll $t5, %[out_x], 2 \n"
105 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 |
106 "addu $t5, %[out_row], $t5 \n"
107 "append $t2, $t3, 16 \n"
108 "append $t0, $t1, 16 \n"
109 "subu.ph $t1, $t0, $t6 \n"
110 "shll_s.ph $t1, $t1, 8 \n"
111 "shra.ph $t1, $t1, 8 \n"
112 "addu.ph $t1, $t1, $t6 \n"
113 "subu.ph $t3, $t2, $t6 \n"
114 "shll_s.ph $t3, $t3, 8 \n"
115 "shra.ph $t3, $t3, 8 \n"
116 "addu.ph $t3, $t3, $t6 \n"
117 "precr.qb.ph $t0, $t1, $t3 \n"
118 "usw $t0, 0($t5) \n"
120 ".set pop \n"
121 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),
122 [rtf] "+r" (row_to_filter)
123 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),
124 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),
125 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)
126 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
127 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"
130 } else {
131 for (int out_x = 0; out_x < num_values; out_x++) {
132 // Get the filter that determines the current output pixel.
133 int filter_offset, filter_length;
134 const ConvolutionFilter1D::Fixed* filter_values =
135 filter.FilterForValue(out_x, &filter_offset, &filter_length);
136 int filter_x = 0;
137 __asm__ __volatile__ (
138 ".set push \n"
139 ".set noreorder \n"
141 "beqz %[filter_len], 3f \n"
142 " sll $t0, %[filter_offset], 2 \n"
143 "addu %[rtf], %[src_data], $t0 \n"
144 "mtlo $0, $ac1 \n"
145 "mtlo $0, $ac2 \n"
146 "mtlo $0, $ac3 \n"
147 "srl $t7, %[filter_len], 2 \n"
148 "beqz $t7, 2f \n"
149 " li %[fx], 0 \n"
151 "11: \n"
152 "addu $t4, %[filter_val], %[fx] \n"
153 "sll $t5, %[fx], 1 \n"
154 "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]|
155 "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]|
156 "addu $t0, %[rtf], $t5 \n"
157 "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0|
158 "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1|
159 "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2|
160 "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3|
161 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0|
162 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0|
163 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g1|0|g0|
164 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0|
165 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0|
166 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0)
167 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0)
168 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0)
169 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2|
170 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2|
171 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2|
172 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2|
173 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2|
174 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2)
175 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2)
176 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2)
177 "addiu $t7, $t7, -1 \n"
178 "bgtz $t7, 11b \n"
179 " addiu %[fx], %[fx], 8 \n"
181 "2: \n"
182 "andi $t7, %[filter_len], 0x3 \n" // residual
183 "beqz $t7, 3f \n"
184 " nop \n"
186 "21: \n"
187 "sll $t1, %[fx], 1 \n"
188 "addu $t2, %[filter_val], %[fx] \n"
189 "addu $t0, %[rtf], $t1 \n"
190 "lh $t6, 0($t2) \n" // t6 = filter_val[fx]
191 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0]
192 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1]
193 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2]
194 "maddu $ac3, $t6, $t1 \n"
195 "maddu $ac2, $t6, $t2 \n"
196 "maddu $ac1, $t6, $t3 \n"
197 "addiu $t7, $t7, -1 \n"
198 "bgtz $t7, 21b \n"
199 " addiu %[fx], %[fx], 2 \n"
201 "3: \n"
202 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits
203 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits
204 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits
205 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 |
206 "sll $t8, %[out_x], 2 \n"
207 "addu $t8, %[out_row], $t8 \n"
208 "append $t2, $t3, 16 \n"
209 "andi $t1, 0xFFFF \n"
210 "subu.ph $t5, $t1, $t6 \n"
211 "shll_s.ph $t5, $t5, 8 \n"
212 "shra.ph $t5, $t5, 8 \n"
213 "addu.ph $t5, $t5, $t6 \n"
214 "subu.ph $t4, $t2, $t6 \n"
215 "shll_s.ph $t4, $t4, 8 \n"
216 "shra.ph $t4, $t4, 8 \n"
217 "addu.ph $t4, $t4, $t6 \n"
218 "precr.qb.ph $t0, $t5, $t4 \n"
219 "usw $t0, 0($t8) \n"
221 ".set pop \n"
222 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),
223 [rtf] "+r" (row_to_filter)
224 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),
225 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),
226 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)
227 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
228 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"
232 #endif
234 void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val,
235 int filter_length,
236 unsigned char* const* source_data_rows,
237 int pixel_width,
238 unsigned char* out_row,
239 bool has_alpha) {
240 #if SIMD_MIPS_DSPR2
241 // We go through each column in the output and do a vertical convolution,
242 // generating one output pixel each time.
243 int byte_offset;
244 int cnt;
245 int filter_y;
246 if (has_alpha) {
247 for (int out_x = 0; out_x < pixel_width; out_x++) {
248 __asm__ __volatile__ (
249 ".set push \n"
250 ".set noreorder \n"
252 "beqz %[filter_len], 3f \n"
253 " sll %[offset], %[out_x], 2 \n"
254 "mtlo $0, $ac0 \n"
255 "mtlo $0, $ac1 \n"
256 "mtlo $0, $ac2 \n"
257 "mtlo $0, $ac3 \n"
258 "srl %[cnt], %[filter_len], 2 \n"
259 "beqz %[cnt], 2f \n"
260 " li %[fy], 0 \n"
262 "11: \n"
263 "sll $t1, %[fy], 1 \n"
264 "addu $t0, %[src_data_rows], $t1 \n"
265 "lw $t1, 0($t0) \n"
266 "lw $t2, 4($t0) \n"
267 "lw $t3, 8($t0) \n"
268 "lw $t4, 12($t0) \n"
269 "addu $t1, $t1, %[offset] \n"
270 "addu $t2, $t2, %[offset] \n"
271 "addu $t3, $t3, %[offset] \n"
272 "addu $t4, $t4, %[offset] \n"
273 "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0|
274 "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1|
275 "lw $t3, 0($t3) \n" // t3 = |a0|b0|g0|r0|
276 "lw $t4, 0($t4) \n" // t4 = |a1|b1|g1|r1|
277 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0|
278 "precr.qb.ph $t6, $t2, $t1 \n" // t6 = |b1|r1|b0|r0|
279 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a1|0|a0|
280 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0|
281 "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0|
282 "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0|
283 "addu $t6, %[filter_val], %[fy] \n"
284 "ulw $t7, 0($t6) \n" // t7 = |cur_1|cur_0|
285 "ulw $t6, 4($t6) \n" // t6 = |cur_3|cur_2|
286 "dpa.w.ph $ac0, $t5, $t7 \n" // (cur*r1)+(cur*r0)
287 "dpa.w.ph $ac1, $t1, $t7 \n" // (cur*g1)+(cur*g0)
288 "dpa.w.ph $ac2, $t2, $t7 \n" // (cur*b1)+(cur*b0)
289 "dpa.w.ph $ac3, $t0, $t7 \n" // (cur*a1)+(cur*a0)
290 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2|
291 "precr.qb.ph $t7, $t4, $t3 \n" // t7 = |b3|r3|b2|r2|
292 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a3|0|a2|
293 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2|
294 "preceu.ph.qbla $t2, $t7 \n" // t2 = |0|b3|0|b2|
295 "preceu.ph.qbra $t5, $t7 \n" // t5 = |0|r3|0|r2|
296 "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r3)+(cur*r2)
297 "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g3)+(cur*g2)
298 "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b3)+(cur*b2)
299 "dpa.w.ph $ac3, $t0, $t6 \n" // (cur*a3)+(cur*a2)
300 "addiu %[cnt], %[cnt], -1 \n"
301 "bgtz %[cnt], 11b \n"
302 " addiu %[fy], %[fy], 8 \n"
304 "2: \n"
305 "andi %[cnt], %[filter_len], 0x3 \n" // residual
306 "beqz %[cnt], 3f \n"
307 " nop \n"
309 "21: \n"
310 "addu $t0, %[filter_val], %[fy] \n"
311 "lh $t4, 0($t0) \n" // t4=filter_val[fx]
312 "sll $t1, %[fy], 1 \n"
313 "addu $t0, %[src_data_rows], $t1 \n"
314 "lw $t1, 0($t0) \n"
315 "addu $t0, $t1, %[offset] \n"
316 "lbu $t1, 0($t0) \n" // t1 = row[fx*4 + 0]
317 "lbu $t2, 1($t0) \n" // t2 = row[fx*4 + 1]
318 "lbu $t3, 2($t0) \n" // t3 = row[fx*4 + 2]
319 "lbu $t0, 3($t0) \n" // t4 = row[fx*4 + 2]
320 "maddu $ac0, $t4, $t1 \n"
321 "maddu $ac1, $t4, $t2 \n"
322 "maddu $ac2, $t4, $t3 \n"
323 "maddu $ac3, $t4, $t0 \n"
324 "addiu %[cnt], %[cnt], -1 \n"
325 "bgtz %[cnt], 21b \n"
326 " addiu %[fy], %[fy], 2 \n"
328 "3: \n"
329 "extrv.w $t3, $ac0, %[kShiftBits] \n" // a >> kShiftBits
330 "extrv.w $t2, $ac1, %[kShiftBits] \n" // b >> kShiftBits
331 "extrv.w $t1, $ac2, %[kShiftBits] \n" // g >> kShiftBits
332 "extrv.w $t0, $ac3, %[kShiftBits] \n" // r >> kShiftBits
333 "repl.ph $t4, 128 \n" // t4 = | 128 | 128 |
334 "addu $t5, %[out_row], %[offset] \n"
335 "append $t2, $t3, 16 \n" // t2 = |0|g|0|r|
336 "append $t0, $t1, 16 \n" // t0 = |0|a|0|b|
337 "subu.ph $t1, $t0, $t4 \n"
338 "shll_s.ph $t1, $t1, 8 \n"
339 "shra.ph $t1, $t1, 8 \n"
340 "addu.ph $t1, $t1, $t4 \n" // Clamp(a)|Clamp(b)
341 "subu.ph $t2, $t2, $t4 \n"
342 "shll_s.ph $t2, $t2, 8 \n"
343 "shra.ph $t2, $t2, 8 \n"
344 "addu.ph $t2, $t2, $t4 \n" // Clamp(g)|Clamp(r)
345 "andi $t3, $t1, 0xFF \n" // t3 = ClampTo8(b)
346 "cmp.lt.ph $t3, $t2 \n" // cmp b, g, r
347 "pick.ph $t0, $t2, $t3 \n"
348 "andi $t3, $t0, 0xFF \n"
349 "srl $t4, $t0, 16 \n"
350 "cmp.lt.ph $t3, $t4 \n"
351 "pick.ph $t0, $t4, $t3 \n" // t0 = max_color_ch
352 "srl $t3, $t1, 16 \n" // t1 = ClampTo8(a)
353 "cmp.lt.ph $t3, $t0 \n"
354 "pick.ph $t0, $t0, $t3 \n"
355 "ins $t1, $t0, 16, 8 \n"
356 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |a|b|g|r|
357 "usw $t0, 0($t5) \n"
359 ".set pop \n"
360 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),
361 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),
362 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)
363 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),
364 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)
365 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
366 "t0", "t1", "t2", "t3", "t4", "t5", "t6","t7", "memory"
369 } else {
370 for (int out_x = 0; out_x < pixel_width; out_x++) {
371 __asm__ __volatile__ (
372 ".set push \n"
373 ".set noreorder \n"
375 "beqz %[filter_len], 3f \n"
376 " sll %[offset], %[out_x], 2 \n"
377 "mtlo $0, $ac0 \n"
378 "mtlo $0, $ac1 \n"
379 "mtlo $0, $ac2 \n"
380 "srl %[cnt], %[filter_len], 2 \n"
381 "beqz %[cnt], 2f \n"
382 " li %[fy], 0 \n"
384 "11: \n"
385 "sll $t1, %[fy], 1 \n"
386 "addu $t0, %[src_data_rows], $t1 \n"
387 "lw $t1, 0($t0) \n"
388 "lw $t2, 4($t0) \n"
389 "lw $t3, 8($t0) \n"
390 "lw $t4, 12($t0) \n"
391 "addu $t1, $t1, %[offset] \n"
392 "addu $t2, $t2, %[offset] \n"
393 "addu $t3, $t3, %[offset] \n"
394 "addu $t4, $t4, %[offset] \n"
395 "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0|
396 "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1|
397 "lw $t3, 0($t3) \n" // t3 = |a0|b0|g0|r0|
398 "lw $t4, 0($t4) \n" // t4 = |a1|b1|g1|r1|
399 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0|
400 "precr.qb.ph $t6, $t2, $t1 \n" // t6 = |b1|r1|b0|r0|
401 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0|
402 "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0|
403 "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0|
404 "addu $t6, %[filter_val], %[fy] \n"
405 "ulw $t0, 0($t6) \n" // t0 = |cur_1|cur_0|
406 "ulw $t6, 4($t6) \n" // t6 = |cur_1|cur_0|
407 "dpa.w.ph $ac0, $t5, $t0 \n" // (cur*r1)+(cur*r0)
408 "dpa.w.ph $ac1, $t1, $t0 \n" // (cur*g1)+(cur*g0)
409 "dpa.w.ph $ac2, $t2, $t0 \n" // (cur*b1)+(cur*b0)
410 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2|
411 "precr.qb.ph $t0, $t4, $t3 \n" // t0 = |b3|r3|b2|r2|
412 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2|
413 "preceu.ph.qbla $t2, $t0 \n" // t2 = |0|b3|0|b2|
414 "preceu.ph.qbra $t5, $t0 \n" // t5 = |0|r3|0|r2|
415 "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r1)+(cur*r0)
416 "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g1)+(cur*g0)
417 "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b1)+(cur*b0)
418 "addiu %[cnt], %[cnt], -1 \n"
419 "bgtz %[cnt], 11b \n"
420 " addiu %[fy], %[fy], 8 \n"
422 "2: \n"
423 "andi %[cnt], %[filter_len], 0x3 \n" // residual
424 "beqz %[cnt], 3f \n"
425 " nop \n"
427 "21: \n"
428 "addu $t0, %[filter_val], %[fy] \n"
429 "lh $t4, 0($t0) \n" // filter_val[fx]
430 "sll $t1, %[fy], 1 \n"
431 "addu $t0, %[src_data_rows], $t1 \n"
432 "lw $t1, 0($t0) \n"
433 "addu $t0, $t1, %[offset] \n"
434 "lbu $t1, 0($t0) \n" // t1 = row[fx*4 + 0]
435 "lbu $t2, 1($t0) \n" // t2 = row[fx*4 + 1]
436 "lbu $t3, 2($t0) \n" // t3 = row[fx*4 + 2]
437 "maddu $ac0, $t4, $t1 \n"
438 "maddu $ac1, $t4, $t2 \n"
439 "maddu $ac2, $t4, $t3 \n"
440 "addiu %[cnt], %[cnt], -1 \n"
441 "bgtz %[cnt], 21b \n"
442 " addiu %[fy], %[fy], 2 \n"
444 "3: \n"
445 "extrv.w $t3, $ac0, %[kShiftBits] \n" // r >> kShiftBits
446 "extrv.w $t2, $ac1, %[kShiftBits] \n" // g >> kShiftBits
447 "extrv.w $t1, $ac2, %[kShiftBits] \n" // b >> kShiftBits
448 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 |
449 "addu $t5, %[out_row], %[offset] \n"
450 "append $t2, $t3, 16 \n" // t2 = |0|g|0|r|
451 "andi $t1, $t1, 0xFFFF \n"
452 "subu.ph $t1, $t1, $t6 \n"
453 "shll_s.ph $t1, $t1, 8 \n"
454 "shra.ph $t1, $t1, 8 \n"
455 "addu.ph $t1, $t1, $t6 \n" // Clamp(a)|Clamp(b)
456 "subu.ph $t2, $t2, $t6 \n"
457 "shll_s.ph $t2, $t2, 8 \n"
458 "shra.ph $t2, $t2, 8 \n"
459 "addu.ph $t2, $t2, $t6 \n" // Clamp(g)|Clamp(r)
460 "li $t0, 0xFF \n"
461 "ins $t1, $t0, 16, 8 \n"
462 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |a|b|g|r|
463 "usw $t0, 0($t5) \n"
465 ".set pop \n"
466 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),
467 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),
468 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)
469 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),
470 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)
471 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
472 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "memory"
476 #endif
478 } // namespace skia