1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
8 #include "nsAlgorithm.h"
10 #include <nsUTF8Utils.h>
// LossyConvertEncoding16to8::write_sse2 narrows aSourceLength char16_t code
// units read from aSource into the char buffer that mDestination points at,
// keeping only the low-order byte of each code unit (so any value above 0xff
// is truncated).
//
// The visible code has three phases: a scalar prologue that runs until
// aSource reaches a 16-byte boundary, an SSE2 loop that converts 32 code
// units per iteration, and a scalar epilogue for whatever is left.
//
// NOTE(review): this view of the file appears to be missing some lines (the
// function braces, the declaration of the index `i`, and the left-hand side
// of the alignment-length assignment). The comments below describe only the
// statements that are visible here.
13 LossyConvertEncoding16to8::write_sse2(const char16_t
* aSource
,
14 uint32_t aSourceLength
)
16 char* dest
= mDestination
;
18 // Align source to a 16-byte boundary.
// (-address & 0xf) is the number of bytes up to the next 16-byte boundary
// (NS_PTR_TO_INT32 presumably yields the pointer's address as an integer --
// confirm against its definition). Dividing by sizeof(char16_t) converts that
// byte count into a count of code units, and XPCOM_MIN caps it at
// aSourceLength so the prologue never reads past the end of the input.
21 XPCOM_MIN
<uint32_t>(aSourceLength
,
22 uint32_t(-NS_PTR_TO_INT32(aSource
) & 0xf) / sizeof(char16_t
));
// Scalar prologue: convert one code unit at a time until the source pointer
// is aligned. The cast keeps only the low 8 bits of each code unit.
23 for (; i
< alignLen
; ++i
) {
24 dest
[i
] = static_cast<unsigned char>(aSource
[i
]);
27 // Walk 64 bytes (four XMM registers) at a time.
// vectmask holds 0x00ff in every 16-bit lane; ANDing with it clears the high
// byte of each code unit. The loop only runs while at least 32 code units
// (64 source bytes) remain.
28 __m128i vectmask
= _mm_set1_epi16(0x00ff);
29 for (; aSourceLength
- i
> 31; i
+= 32) {
// Each aligned load below fetches 8 code units (16 bytes); the offsets
// 0, 8, 16 and 24 are in char16_t elements.
30 __m128i source1
= _mm_load_si128(reinterpret_cast<const __m128i
*>(aSource
+ i
));
31 source1
= _mm_and_si128(source1
, vectmask
);
33 __m128i source2
= _mm_load_si128(reinterpret_cast<const __m128i
*>(aSource
+ i
+ 8));
34 source2
= _mm_and_si128(source2
, vectmask
);
36 __m128i source3
= _mm_load_si128(reinterpret_cast<const __m128i
*>(aSource
+ i
+ 16));
37 source3
= _mm_and_si128(source3
, vectmask
);
39 __m128i source4
= _mm_load_si128(reinterpret_cast<const __m128i
*>(aSource
+ i
+ 24));
40 source4
= _mm_and_si128(source4
, vectmask
);
43 // Pack the source data. SSE2 views this as a saturating uint16_t to
44 // uint8_t conversion, but since we masked off the high-order byte of every
45 // uint16_t, we're really just grabbing the low-order bytes of source1 and
// source2 (and, for packed2, of source3 and source4).
47 __m128i packed1
= _mm_packus_epi16(source1
, source2
);
48 __m128i packed2
= _mm_packus_epi16(source3
, source4
);
50 // This store needs to be unaligned since there's no guarantee that the
51 // alignment we did above for the source will align the destination.
// Each store writes 16 output bytes; dest is a char pointer, so the second
// store's "+ 16" is a byte offset.
52 _mm_storeu_si128(reinterpret_cast<__m128i
*>(dest
+ i
), packed1
);
53 _mm_storeu_si128(reinterpret_cast<__m128i
*>(dest
+ i
+ 16), packed2
);
56 // Finish up the rest.
// Scalar epilogue: at most 31 code units remain once the vector loop exits.
57 for (; i
< aSourceLength
; ++i
) {
58 dest
[i
] = static_cast<unsigned char>(aSource
[i
]);
// LossyConvertEncoding8to16::write_sse2 widens aSourceLength bytes read from
// aSource into the char16_t buffer that mDestination points at. Each byte is
// zero-extended to 16 bits.
//
// The visible code has three phases: a scalar prologue that runs until
// aSource reaches a 16-byte boundary, an SSE2 loop that converts 32 bytes per
// iteration, and a scalar epilogue for whatever is left.
//
// NOTE(review): this view of the file appears to be missing some lines (the
// function braces and the declaration of the index `i`), and the end of the
// function is not visible. The comments below describe only the statements
// that are visible here.
65 LossyConvertEncoding8to16::write_sse2(const char* aSource
,
66 uint32_t aSourceLength
)
68 char16_t
* dest
= mDestination
;
70 // Align source to a 16-byte boundary. We choose to align source rather than
71 // dest because we'd rather have our loads than our stores be fast. You have
72 // to wait for a load to complete, but you can keep on moving after issuing a
// store.
//
// (-address & 0xf) is the number of bytes up to the next 16-byte boundary
// (NS_PTR_TO_INT32 presumably yields the pointer's address as an integer --
// confirm against its definition). The source elements are single bytes, so
// this is also the element count. XPCOM_MIN caps it at aSourceLength so the
// prologue never reads past the end of the input.
75 uint32_t alignLen
= XPCOM_MIN(aSourceLength
,
76 uint32_t(-NS_PTR_TO_INT32(aSource
) & 0xf));
// Scalar prologue. Going through unsigned char first means a byte with its
// top bit set is zero-extended rather than sign-extended when stored as a
// char16_t, whatever the signedness of plain char.
77 for (; i
< alignLen
; ++i
) {
78 dest
[i
] = static_cast<unsigned char>(aSource
[i
]);
81 // Walk 32 bytes (two XMM registers) at a time.
// The loop only runs while at least 32 source bytes remain.
82 for (; aSourceLength
- i
> 31; i
+= 32) {
// Two aligned 16-byte loads; the "+ 16" offset is in bytes because aSource
// is a char pointer.
83 __m128i source1
= _mm_load_si128(reinterpret_cast<const __m128i
*>(aSource
+ i
));
84 __m128i source2
= _mm_load_si128(reinterpret_cast<const __m128i
*>(aSource
+ i
+ 16));
86 // Interleave 0s in with the bytes of source to create lo and hi.
// unpacklo handles the low 8 bytes of a register and unpackhi the high 8, so
// each result holds 8 zero-extended 16-bit values.
87 __m128i lo1
= _mm_unpacklo_epi8(source1
, _mm_setzero_si128());
88 __m128i hi1
= _mm_unpackhi_epi8(source1
, _mm_setzero_si128());
89 __m128i lo2
= _mm_unpacklo_epi8(source2
, _mm_setzero_si128());
90 __m128i hi2
= _mm_unpackhi_epi8(source2
, _mm_setzero_si128());
92 // store lo and hi into dest.
// The stores are unaligned because only the source was aligned above. dest
// is a char16_t pointer, so the offsets 8, 16 and 24 are in 16-bit elements
// and each store covers 8 of them.
93 _mm_storeu_si128(reinterpret_cast<__m128i
*>(dest
+ i
), lo1
);
94 _mm_storeu_si128(reinterpret_cast<__m128i
*>(dest
+ i
+ 8), hi1
);
95 _mm_storeu_si128(reinterpret_cast<__m128i
*>(dest
+ i
+ 16), lo2
);
96 _mm_storeu_si128(reinterpret_cast<__m128i
*>(dest
+ i
+ 24), hi2
);
99 // Finish up whatever's left.
// Scalar epilogue: at most 31 bytes remain once the vector loop exits.
100 for (; i
< aSourceLength
; ++i
) {
101 dest
[i
] = static_cast<unsigned char>(aSource
[i
]);