WinGui: Fix another instance of the Caliburn vs Json.net sillyness where objects...
[HandBrake.git] / libhb / nlmeans_x86.c
blob564668e0a3bc30ebb7aa2d6393f0ee1333028a0b
/* nlmeans_x86.c

   Copyright (c) 2013 Dirk Farin
   Copyright (c) 2003-2015 HandBrake Team
   This file is part of the HandBrake source code
   Homepage: <http://handbrake.fr/>.
   It may be used under the terms of the GNU General Public License v2.
   For full terms see the file COPYING or visit http://www.gnu.org/licenses/gpl-2.0.html
 */
11 #include "hb.h" // needed for ARCH_X86
13 #if defined(ARCH_X86)
15 #include <emmintrin.h>
17 #include "libavutil/cpu.h"
18 #include "nlmeans.h"
/* Squared difference of eight 16-bit lanes: (a - b)^2 per lane.
 *
 * The inputs are zero-extended 8-bit pixels, so |a - b| <= 255 and the
 * square (<= 65025) fits in an unsigned 16-bit lane; the low 16 bits
 * returned by _mm_mullo_epi16 therefore hold the exact result.
 */
static inline __m128i nlmeans_sse2_sqdiff_epi16(__m128i a, __m128i b)
{
    const __m128i d = _mm_sub_epi16(a, b);
    return _mm_mullo_epi16(d, d);
}

/* Inclusive prefix sum across the four 32-bit lanes of v, with carry added
 * to every lane.  For v = [a, b, c, d] the result is
 * [a, a+b, a+b+c, a+b+c+d] + carry.
 */
static inline __m128i nlmeans_sse2_prefix_sum_epi32(__m128i v, __m128i carry)
{
    v = _mm_add_epi32(v, _mm_slli_si128(v, 4)); // [a, a+b, b+c, c+d]
    v = _mm_add_epi32(v, _mm_slli_si128(v, 8)); // [a, a+b, a+b+c, a+b+c+d]
    return _mm_add_epi32(v, carry);
}

/* Broadcast the highest 32-bit lane (the running total) to all four lanes. */
static inline __m128i nlmeans_sse2_broadcast_last_epi32(__m128i v)
{
    return _mm_shuffle_epi32(v, 0xff);
}

/* Build the integral image of squared differences between two pixel planes.
 *
 * For every output row y and column x, the value written is the sum of
 * (src_pre[r][c] - compare_pre[r+dy][c+dx])^2 over all r <= y, c <= x,
 * accumulated in 32-bit unsigned arithmetic.
 *
 * integral         Output buffer; row y starts at integral + y*integral_stride.
 *                  Written with aligned 16-byte stores, so every row start
 *                  must be 16-byte aligned.
 * integral_stride  Distance between output rows, in uint32_t elements.
 * src, compare     Unused by this implementation.
 * src_pre          Source plane; row y starts at src_pre + y*(w + 2*border).
 * compare_pre      Comparison plane with the same row pitch as src_pre.
 * w, border        Only used to derive the input row pitch (w + 2*border).
 * dst_w, dst_h     Size of the integral image.  Columns are processed 16 at
 *                  a time, so reads and writes cover dst_w rounded up to a
 *                  multiple of 16 on every row.
 * dx, dy           Offset applied to the comparison plane.
 */
static void build_integral_sse2(uint32_t *integral,
                                int integral_stride,
                                const uint8_t *src,
                                const uint8_t *src_pre,
                                const uint8_t *compare,
                                const uint8_t *compare_pre,
                                int w,
                                int border,
                                int dst_w,
                                int dst_h,
                                int dx,
                                int dy)
{
    const __m128i zero = _mm_setzero_si128();
    const int bw = w + 2 * border;

    // Part of the shared function signature but not needed here.
    (void)src;
    (void)compare;

    for (int y = 0; y < dst_h; y++)
    {
        // Running horizontal total carried from one 4-lane group to the next
        __m128i carry = _mm_setzero_si128();

        const uint8_t *p1 = src_pre + y*bw;
        const uint8_t *p2 = compare_pre + (y+dy)*bw + dx;
        uint32_t *out = integral + (y*integral_stride);

        // Pass 1: horizontal prefix sums of squared differences for this row
        for (int x = 0; x < dst_w; x += 16)
        {
            // 16 source and 16 compare pixels (inputs may be unaligned)
            const __m128i pa = _mm_loadu_si128((const __m128i *)p1);
            const __m128i pb = _mm_loadu_si128((const __m128i *)p2);

            // Widen pixels 0-7 and 8-15 to 16 bits and square the differences
            const __m128i sq_lo = nlmeans_sse2_sqdiff_epi16(_mm_unpacklo_epi8(pa, zero),
                                                            _mm_unpacklo_epi8(pb, zero));
            const __m128i sq_hi = nlmeans_sse2_sqdiff_epi16(_mm_unpackhi_epi8(pa, zero),
                                                            _mm_unpackhi_epi8(pb, zero));

            // Widen to 32 bits and chain the prefix sums through the four
            // groups of four pixels, each seeded with the previous total
            const __m128i s0 = nlmeans_sse2_prefix_sum_epi32(_mm_unpacklo_epi16(sq_lo, zero), carry);
            carry = nlmeans_sse2_broadcast_last_epi32(s0);

            const __m128i s1 = nlmeans_sse2_prefix_sum_epi32(_mm_unpackhi_epi16(sq_lo, zero), carry);
            carry = nlmeans_sse2_broadcast_last_epi32(s1);

            const __m128i s2 = nlmeans_sse2_prefix_sum_epi32(_mm_unpacklo_epi16(sq_hi, zero), carry);
            carry = nlmeans_sse2_broadcast_last_epi32(s2);

            const __m128i s3 = nlmeans_sse2_prefix_sum_epi32(_mm_unpackhi_epi16(sq_hi, zero), carry);
            carry = nlmeans_sse2_broadcast_last_epi32(s3);

            // Aligned stores of the 16 running sums
            _mm_store_si128((__m128i *)(out),      s0);
            _mm_store_si128((__m128i *)(out + 4),  s1);
            _mm_store_si128((__m128i *)(out + 8),  s2);
            _mm_store_si128((__m128i *)(out + 12), s3);

            out += 16;
            p1  += 16;
            p2  += 16;
        }

        // Pass 2: add the finished row above to turn row sums into 2D sums
        if (y > 0)
        {
            uint32_t *row = integral + y*integral_stride;

            for (int x = 0; x < dst_w; x += 16)
            {
                for (int k = 0; k < 16; k += 4)
                {
                    __m128i *cur = (__m128i *)(row + k);
                    const __m128i *above = (const __m128i *)(row + k - integral_stride);

                    _mm_store_si128(cur, _mm_add_epi32(_mm_load_si128(above),
                                                       _mm_load_si128(cur)));
                }
                row += 16;
            }
        }
    }
}
146 void nlmeans_init_x86(NLMeansFunctions *functions)
148 if (av_get_cpu_flags() & AV_CPU_FLAG_SSE2)
150 functions->build_integral = build_integral_sse2;
151 hb_log("NLMeans using SSE2 optimizations");
155 #endif // ARCH_X86