3 Copyright (c) 2013 Dirk Farin
4 Copyright (c) 2003-2015 HandBrake Team
5 This file is part of the HandBrake source code
6 Homepage: <http://handbrake.fr/>.
7 It may be used under the terms of the GNU General Public License v2.
8 For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
11 #include "hb.h" // needed for ARCH_X86
15 #include <emmintrin.h>
17 #include "libavutil/cpu.h"
/*
 * build_integral_sse2: SSE2 routine that fills `integral` with running sums
 * of squared differences between pixels read through `src_pre` and pixels
 * read through `compare_pre` at an offset of (dx, dy).
 *
 * For each row y in [0, dst_h):
 *   1. Horizontal pass: 16 pixels per iteration are loaded from each buffer,
 *      widened to 16 bits, subtracted and squared.  The squares are widened
 *      to 32 bits, prefix-summed four lanes at a time, chained through
 *      `prevadd`, and stored to integral + y*integral_stride.
 *   2. Vertical pass: the row at (out - integral_stride) is added to the
 *      row that was just written.
 *
 * Both pixel buffers are addressed with a row pitch of bw = w + 2*border
 * bytes.  Both loops step x by 16, so work is done in whole 16-pixel groups.
 *
 * Pixel loads are unaligned (_mm_loadu_si128).  Writes to `integral` use
 * _mm_store_si128 and direct __m128i dereferences, so rows of `integral`
 * are presumably expected to be 16-byte aligned -- TODO confirm with caller.
 *
 * NOTE(review): parts of this definition (some parameters, local
 * declarations, pointer advances, any guard around the vertical pass) are
 * not visible in this excerpt; the comments describe only what is shown.
 */
20 static void build_integral_sse2(uint32_t *integral
,
23 const uint8_t *src_pre
,
24 const uint8_t *compare
,
25 const uint8_t *compare_pre
,
33 const __m128i zero
= _mm_set1_epi8(0); // All-zero vector, used below to zero-extend lanes when unpacking
34 const int bw
= w
+ 2 * border
; // Row pitch applied to both src_pre and compare_pre
36 for (int y
= 0; y
< dst_h
; y
++)
38 __m128i prevadd
= _mm_set1_epi32(0); // Running total carried between 4-lane groups; reset for every row
40 const uint8_t *p1
= src_pre
+ y
*bw
; // Row y of src_pre
41 const uint8_t *p2
= compare_pre
+ (y
+dy
)*bw
+ dx
; // Row y+dy of compare_pre, shifted by dx bytes
42 uint32_t *out
= integral
+ (y
*integral_stride
); // Output row y; integral_stride is counted in uint32_t elements
// Horizontal pass: 16 pixels per iteration
44 for (int x
= 0; x
< dst_w
; x
+= 16)
48 __m128i ldiff
, lldiff
, lhdiff
;
52 __m128i hdiff
,hldiff
,hhdiff
;
55 pa
= _mm_loadu_si128((__m128i
*)p1
); // Unaligned load of 16 bytes from the src_pre row
56 pb
= _mm_loadu_si128((__m128i
*)p2
); // Unaligned load of 16 bytes from the offset compare_pre row
// Pixels 0-7
59 pla
= _mm_unpacklo_epi8(pa
,zero
); // Zero-extend the low 8 source bytes to 16-bit lanes
60 plb
= _mm_unpacklo_epi8(pb
,zero
); // Zero-extend the low 8 compare bytes to 16-bit lanes
62 ldiff
= _mm_sub_epi16(pla
,plb
); // Per-lane difference, in [-255, 255]
63 ldiff
= _mm_mullo_epi16(ldiff
,ldiff
); // Square each difference; _mm_mullo_epi16 keeps the low 16 bits of each product, and 255*255 = 65025 still fits in 16 bits
65 lldiff
= _mm_unpacklo_epi16(ldiff
,zero
); // Zero-extend squares 0-3 to 32-bit lanes
66 lhdiff
= _mm_unpackhi_epi16(ldiff
,zero
); // Zero-extend squares 4-7 to 32-bit lanes
68 ltmp
= _mm_slli_si128(lldiff
, 4); // Byte-shift left by one 32-bit lane
69 lldiff
= _mm_add_epi32(lldiff
, ltmp
); // Each lane now holds itself plus its lower neighbour
70 ltmp
= _mm_slli_si128(lldiff
, 8); // Byte-shift left by two 32-bit lanes
71 lldiff
= _mm_add_epi32(lldiff
, ltmp
); // Lanes now hold the inclusive prefix sum of squares 0-3
72 lldiff
= _mm_add_epi32(lldiff
, prevadd
); // Add the running total carried in from earlier pixels of this row
74 ladd
= _mm_shuffle_epi32(lldiff
, 0xff); // Broadcast lane 3 (total through pixel 3) to all four lanes
76 htmp
= _mm_slli_si128(lhdiff
, 4); // Byte-shift left by one 32-bit lane
77 lhdiff
= _mm_add_epi32(lhdiff
, htmp
); // Each lane now holds itself plus its lower neighbour
78 htmp
= _mm_slli_si128(lhdiff
, 8); // Byte-shift left by two 32-bit lanes
79 lhdiff
= _mm_add_epi32(lhdiff
, htmp
); // Lanes now hold the inclusive prefix sum of squares 4-7
80 lhdiff
= _mm_add_epi32(lhdiff
, ladd
); // Add the total through pixel 3
82 prevadd
= _mm_shuffle_epi32(lhdiff
, 0xff); // Broadcast lane 3 (total through pixel 7) as the new running total
// Pixels 8-15
85 pha
= _mm_unpackhi_epi8(pa
,zero
); // Zero-extend the high 8 source bytes to 16-bit lanes
86 phb
= _mm_unpackhi_epi8(pb
,zero
); // Zero-extend the high 8 compare bytes to 16-bit lanes
88 hdiff
= _mm_sub_epi16(pha
,phb
); // Per-lane difference, in [-255, 255]
89 hdiff
= _mm_mullo_epi16(hdiff
,hdiff
); // Square each difference (16-bit multiply; the result fits in 16 bits, see above)
91 hldiff
= _mm_unpacklo_epi16(hdiff
,zero
); // Zero-extend squares 8-11 to 32-bit lanes
92 hhdiff
= _mm_unpackhi_epi16(hdiff
,zero
); // Zero-extend squares 12-15 to 32-bit lanes
94 l2tmp
= _mm_slli_si128(hldiff
, 4); // Byte-shift left by one 32-bit lane
95 hldiff
= _mm_add_epi32(hldiff
, l2tmp
); // Each lane now holds itself plus its lower neighbour
96 l2tmp
= _mm_slli_si128(hldiff
, 8); // Byte-shift left by two 32-bit lanes
97 hldiff
= _mm_add_epi32(hldiff
, l2tmp
); // Lanes now hold the inclusive prefix sum of squares 8-11
98 hldiff
= _mm_add_epi32(hldiff
, prevadd
); // Add the total through pixel 7
100 hadd
= _mm_shuffle_epi32(hldiff
, 0xff); // Broadcast lane 3 (total through pixel 11) to all four lanes
102 h2tmp
= _mm_slli_si128(hhdiff
, 4); // Byte-shift left by one 32-bit lane
103 hhdiff
= _mm_add_epi32(hhdiff
, h2tmp
); // Each lane now holds itself plus its lower neighbour
104 h2tmp
= _mm_slli_si128(hhdiff
, 8); // Byte-shift left by two 32-bit lanes
105 hhdiff
= _mm_add_epi32(hhdiff
, h2tmp
); // Lanes now hold the inclusive prefix sum of squares 12-15
106 hhdiff
= _mm_add_epi32(hhdiff
, hadd
); // Add the total through pixel 11
108 prevadd
= _mm_shuffle_epi32(hhdiff
, 0xff); // Broadcast lane 3 (total through pixel 15) as the carry for the next iteration
// Write the 16 running sums (aligned stores)
111 _mm_store_si128((__m128i
*)(out
), lldiff
); // Sums for pixels 0-3
112 _mm_store_si128((__m128i
*)(out
+4), lhdiff
); // Sums for pixels 4-7
113 _mm_store_si128((__m128i
*)(out
+8), hldiff
); // Sums for pixels 8-11
114 _mm_store_si128((__m128i
*)(out
+12), hhdiff
); // Sums for pixels 12-15
// Vertical pass: accumulate the row above into the row just written.
// NOTE(review): the condition guarding this pass is not visible in this excerpt.
124 out
= integral
+ y
*integral_stride
; // Rewind to the start of output row y
126 for (int x
= 0; x
< dst_w
; x
+= 16)
128 *((__m128i
*)out
) = _mm_add_epi32(*(__m128i
*)(out
-integral_stride
), // Lanes 0-3 of the row above
131 *((__m128i
*)(out
+4)) = _mm_add_epi32(*(__m128i
*)(out
+4-integral_stride
), // Lanes 4-7 of the row above
134 *((__m128i
*)(out
+8)) = _mm_add_epi32(*(__m128i
*)(out
+8-integral_stride
), // Lanes 8-11 of the row above
137 *((__m128i
*)(out
+12)) = _mm_add_epi32(*(__m128i
*)(out
+12-integral_stride
), // Lanes 12-15 of the row above...
138 *(__m128i
*)(out
+12)); // ...plus lanes 12-15 of the current row
146 void nlmeans_init_x86(NLMeansFunctions
*functions
)
148 if (av_get_cpu_flags() & AV_CPU_FLAG_SSE2
)
150 functions
->build_integral
= build_integral_sse2
;
151 hb_log("NLMeans using SSE2 optimizations");