1
// TortoiseGitMerge - a Diff/Patch program
3 // Copyright (C) 2016, 2019, 2021, 2023 - TortoiseGit
4 // Copyright (C) 2007-2016, 2019 - TortoiseSVN
6 // This program is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU General Public License
8 // as published by the Free Software Foundation; either version 2
9 // of the License, or (at your option) any later version.
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this program; if not, write to the Free Software Foundation,
18 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 #include "UnicodeUtils.h"
24 #include "FileTextLines.h"
25 #include "FormatMessageWrapper.h"
26 #include "SmartHandle.h"
/**
 * Exchanges the high and the low byte of a 16-bit UTF-16 code unit, i.e.
 * converts it between big-endian and little-endian byte order.
 *
 * Only the low 16 bits of \p nValue take part and the result is narrowed
 * explicitly. The function therefore does not rely on an implicit
 * int -> wchar_t truncation, and the shifted-out high byte cannot leak
 * into the result when wchar_t is wider than 16 bits.
 *
 * \param nValue the code unit to convert
 * \return the code unit with its two bytes exchanged
 */
constexpr wchar_t inline WideCharSwap(wchar_t nValue) noexcept
{
	return static_cast<wchar_t>(((static_cast<unsigned int>(nValue) >> 8) & 0x00ffu) | ((static_cast<unsigned int>(nValue) << 8) & 0xff00u));
	//return _byteswap_ushort(nValue);
}
35 constexpr UINT64
inline WordSwapBytes(UINT64 nValue
) noexcept
37 return ((nValue
&0xff00ff00ff00ff)<<8) | ((nValue
>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
40 constexpr UINT32
inline DwordSwapBytes(UINT32 nValue
) noexcept
42 UINT32 nRet
= (nValue
<<16) | (nValue
>>16); // swap WORDs
43 nRet
= ((nRet
&0xff00ff)<<8) | ((nRet
>>8)&0xff00ff); // swap BYTESs in WORDs
45 //return _byteswap_ulong(nValue);
48 constexpr UINT64
inline DwordSwapBytes(UINT64 nValue
) noexcept
50 UINT64 nRet
= ((nValue
&0xffff0000ffffL
)<<16) | ((nValue
>>16)&0xffff0000ffffL
); // swap WORDs in DWORDs
51 nRet
= ((nRet
&0xff00ff00ff00ff)<<8) | ((nRet
>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
55 CFileTextLines::CFileTextLines()
59 CFileTextLines::~CFileTextLines()
63 CFileTextLines::UnicodeType
CFileTextLines::CheckUnicodeType(LPCVOID pBuffer
, int cb
)
66 return CFileTextLines::UnicodeType::ASCII
;
67 auto const pVal32
= static_cast<const UINT32
*>(pBuffer
);
68 auto const pVal16
= static_cast<const UINT16
*>(pBuffer
);
69 auto const pVal8
= static_cast<const UINT8
*>(pBuffer
);
70 // scan the whole buffer for a 0x00000000 sequence
71 // if found, we assume a binary file
73 for (int j
=0; j
<nDwords
; ++j
)
75 if (0x00000000 == pVal32
[j
])
76 return CFileTextLines::UnicodeType::BINARY
;
80 if (*pVal32
== 0x0000FEFF)
82 return CFileTextLines::UnicodeType::UTF32_LE
;
84 if (*pVal32
== 0xFFFE0000)
86 return CFileTextLines::UnicodeType::UTF32_BE
;
89 if (*pVal16
== 0xFEFF)
91 return CFileTextLines::UnicodeType::UTF16_LEBOM
;
93 if (*pVal16
== 0xFFFE)
95 return CFileTextLines::UnicodeType::UTF16_BEBOM
;
98 return CFileTextLines::UnicodeType::ASCII
;
99 if (*pVal16
== 0xBBEF)
101 if (pVal8
[2] == 0xBF)
102 return CFileTextLines::UnicodeType::UTF8BOM
;
104 // check for illegal UTF8 sequences
105 bool bNonANSI
= false;
114 // count the null chars, we do not want to treat an ASCII/UTF8 file
115 // as UTF16 just because of some null chars that might be accidentally
117 // Use an arbitrary value of one fiftieth of the file length as
118 // the limit after which a file is considered UTF16.
119 if (nullcount
>(cb
/ 50))
121 // null-chars are not allowed for ASCII or UTF8, that means
122 // this file is most likely UTF16 encoded
124 return CFileTextLines::UnicodeType::UTF16_LE
;
126 return CFileTextLines::UnicodeType::UTF16_BE
;
129 if ((pVal8
[i
] & 0x80) != 0) // non ASCII
135 // check remaining text for UTF-8 validity
138 UINT8 zChar
= pVal8
[i
];
139 if ((zChar
& 0x80)==0) // Ascii
144 // count the null chars, we do not want to treat an ASCII/UTF8 file
145 // as UTF16 just because of some null chars that might be accidentally
147 // Use an arbitrary value of one fiftieth of the file length as
148 // the limit after which a file is considered UTF16.
149 if (nullcount
> (cb
/ 50))
151 // null-chars are not allowed for ASCII or UTF8, that means
152 // this file is most likely UTF16 encoded
154 return CFileTextLines::UnicodeType::UTF16_LE
;
156 return CFileTextLines::UnicodeType::UTF16_BE
;
162 return CFileTextLines::UnicodeType::ASCII
;
166 if ((zChar
& 0x40)==0) // top bit
169 return CFileTextLines::UnicodeType::ASCII
;
174 return CFileTextLines::UnicodeType::ASCII
;
176 else if ((zChar
& 0x20)==0) // top two bits
179 return CFileTextLines::UnicodeType::ASCII
;
182 else if ((zChar
& 0x10)==0) // top three bits
186 else if ((zChar
& 0x08)==0) // top four bits
189 return CFileTextLines::UnicodeType::ASCII
;
193 return CFileTextLines::UnicodeType::ASCII
;
195 if (bNonANSI
&& nNeedData
==0)
196 // if get here thru nonAscii and no missing data left then its valid UTF8
197 return CFileTextLines::UnicodeType::UTF8
;
198 if (!bNonANSI
&& (DWORD(CRegDWORD(L
"Software\\TortoiseGitMerge\\UseUTF8", FALSE
))))
199 return CFileTextLines::UnicodeType::UTF8
;
200 return CFileTextLines::UnicodeType::ASCII
;
204 BOOL
CFileTextLines::Load(const CString
& sFilePath
, int /*lengthHint*/ /* = 0*/)
206 m_SaveParams
.m_LineEndings
= EOL::AutoLine
;
207 if (!m_bKeepEncoding
)
208 m_SaveParams
.m_UnicodeType
= CFileTextLines::UnicodeType::AUTOTYPE
;
211 if (PathIsDirectory(sFilePath
))
213 m_sErrorString
.Format(IDS_ERR_FILE_NOTAFILE
, static_cast<LPCWSTR
>(sFilePath
));
217 if (!PathFileExists(sFilePath
))
219 //file does not exist, so just return SUCCESS
223 CAutoFile hFile
= CreateFile(sFilePath
, GENERIC_READ
, FILE_SHARE_READ
| FILE_SHARE_DELETE
| FILE_SHARE_WRITE
, nullptr, OPEN_EXISTING
, 0, nullptr);
231 if (!GetFileSizeEx(hFile
, &fsize
))
236 if (fsize
.QuadPart
>= INT_MAX
)
238 // file is way too big for us
239 m_sErrorString
.LoadString(IDS_ERR_FILE_TOOBIG
);
244 std::unique_ptr
<BYTE
[]> fileBuffer
;
247 fileBuffer
= std::unique_ptr
<BYTE
[]>(new BYTE
[fsize
.LowPart
]); // prevent default initialization
249 catch (CMemoryException
* e
)
251 e
->GetErrorMessage(CStrBuf(m_sErrorString
, 1000), 1000);
256 DWORD dwReadBytes
= 0;
257 if (!ReadFile(hFile
, static_cast<void*>(fileBuffer
.get()), fsize
.LowPart
, &dwReadBytes
, nullptr))
265 if (m_SaveParams
.m_UnicodeType
== CFileTextLines::UnicodeType::AUTOTYPE
)
267 m_SaveParams
.m_UnicodeType
= this->CheckUnicodeType(fileBuffer
.get(), dwReadBytes
);
269 // enforce conversion for all but ASCII and UTF8 type
270 m_bNeedsConversion
= (m_SaveParams
.m_UnicodeType
!= CFileTextLines::UnicodeType::UTF8
) && (m_SaveParams
.m_UnicodeType
!= CFileTextLines::UnicodeType::ASCII
);
272 // no need to decode empty file
273 if (dwReadBytes
== 0)
276 // we may have to convert the file content - CString is UTF16LE
277 std::unique_ptr
<CDecodeFilter
> pFilter
;
280 switch (m_SaveParams
.m_UnicodeType
)
282 case UnicodeType::BINARY
:
283 m_sErrorString
.Format(IDS_ERR_FILE_BINARY
, static_cast<LPCWSTR
>(sFilePath
));
285 case UnicodeType::UTF8
:
286 case UnicodeType::UTF8BOM
:
287 pFilter
= std::make_unique
<CUtf8Filter
>(nullptr);
290 case UnicodeType::ASCII
:
291 pFilter
= std::make_unique
<CAsciiFilter
>(nullptr);
293 case UnicodeType::UTF16_BE
:
294 case UnicodeType::UTF16_BEBOM
:
295 pFilter
= std::make_unique
<CUtf16beFilter
>(nullptr);
297 case UnicodeType::UTF16_LE
:
298 case UnicodeType::UTF16_LEBOM
:
299 pFilter
= std::make_unique
<CUtf16leFilter
>(nullptr);
301 case UnicodeType::UTF32_BE
:
302 pFilter
= std::make_unique
<CUtf32beFilter
>(nullptr);
304 case UnicodeType::UTF32_LE
:
305 pFilter
= std::make_unique
<CUtf32leFilter
>(nullptr);
308 if (!pFilter
->Decode(std::move(fileBuffer
), dwReadBytes
))
314 catch (CMemoryException
* e
)
316 e
->GetErrorMessage(CStrBuf(m_sErrorString
, 1000), 1000);
320 std::wstring_view converted
= pFilter
.get()->GetStringView();
321 int nReadChars
= static_cast<int>(converted
.size()); // see above, we have a INT_MAX limitation
322 auto pTextBuf
= converted
.data();
323 const wchar_t* pLineStart
= pTextBuf
;
324 if (!converted
.empty() && ((m_SaveParams
.m_UnicodeType
== UnicodeType::UTF8BOM
)
325 || (m_SaveParams
.m_UnicodeType
== UnicodeType::UTF16_LEBOM
)
326 || (m_SaveParams
.m_UnicodeType
== UnicodeType::UTF16_BEBOM
)
327 || (m_SaveParams
.m_UnicodeType
== UnicodeType::UTF32_LE
)
328 || (m_SaveParams
.m_UnicodeType
== UnicodeType::UTF32_BE
)))
336 // fill in the lines into the array
337 size_t countEOLs
[static_cast<int>(EOL::_COUNT
)] = { 0 };
338 CFileTextLine oTextLine
;
339 for (int i
= nReadChars
; i
; --i
)
345 // crlf line ending or cr line ending
346 eEol
= ((i
> 1) && *(pTextBuf
) == '\n') ? EOL::CRLF
: EOL::CR
;
349 // lfcr line ending or lf line ending
350 eEol
= ((i
> 1) && *(pTextBuf
) == '\r') ? EOL::LFCR
: EOL::LF
;
351 if (eEol
== EOL::LFCR
)
353 // LFCR is very rare on Windows, so we have to double check
354 // that this is not just a LF followed by CRLF
355 if (((countEOLs
[static_cast<int>(EOL::CRLF
)] > 1) || (countEOLs
[static_cast<int>(EOL::LF
)] > 1) || (GetCount() < 2)) &&
356 ((i
> 2) && (*(pTextBuf
+1) == '\n')))
358 // change the EOL back to a simple LF
381 oTextLine
.sLine
= CString(pLineStart
, static_cast<int>(pTextBuf
-pLineStart
) - 1);
382 oTextLine
.eEnding
= eEol
;
383 CStdFileLineArray::Add(oTextLine
);
384 ++countEOLs
[static_cast<int>(eEol
)];
385 if (eEol
== EOL::CRLF
|| eEol
== EOL::LFCR
)
390 pLineStart
= pTextBuf
;
392 CString
line(pLineStart
, static_cast<int>(pTextBuf
- pLineStart
));
393 Add(line
, EOL::NoEnding
);
395 // some EOLs are not supported by the svn diff lib.
396 m_bNeedsConversion
|= (countEOLs
[static_cast<int>(EOL::CRLF
)] != 0);
397 m_bNeedsConversion
|= (countEOLs
[static_cast<int>(EOL::FF
)] != 0);
398 m_bNeedsConversion
|= (countEOLs
[static_cast<int>(EOL::VT
)] != 0);
399 m_bNeedsConversion
|= (countEOLs
[static_cast<int>(EOL::NEL
)] != 0);
400 m_bNeedsConversion
|= (countEOLs
[static_cast<int>(EOL::LS
)] != 0);
401 m_bNeedsConversion
|= (countEOLs
[static_cast<int>(EOL::PS
)] != 0);
404 for (int nEol
= 0; nEol
< static_cast<int>(EOL::_COUNT
); nEol
++)
406 if (eolmax
< countEOLs
[nEol
])
408 eolmax
= countEOLs
[nEol
];
409 m_SaveParams
.m_LineEndings
= static_cast<EOL
>(nEol
);
416 void CFileTextLines::StripWhiteSpace(CString
& sLine
, DWORD dwIgnoreWhitespaces
, bool blame
)
420 if (sLine
.GetLength() > 66)
421 sLine
= sLine
.Mid(66);
423 switch (dwIgnoreWhitespaces
)
426 // Compare whitespaces
430 // Ignore all whitespaces
431 sLine
.TrimLeft(L
" \t");
432 sLine
.TrimRight(L
" \t");
435 // Ignore leading whitespace
436 sLine
.TrimLeft(L
" \t");
439 // Ignore ending whitespace
440 sLine
.TrimRight(L
" \t");
449 - modify line - whitespaces, lowercase
451 - get cached encoded eol
454 BOOL
CFileTextLines::Save( const CString
& sFilePath
455 , bool bSaveAsUTF8
/*= false */
456 , bool bUseSVNCompatibleEOLs
/*= false */
457 , DWORD dwIgnoreWhitespaces
/*= 0 */
458 , BOOL bIgnoreCase
/*= FALSE */
459 , bool bBlame
/*= false*/
460 , bool bIgnoreComments
/*= false*/
461 , const CString
& linestart
/*= CString()*/
462 , const CString
& blockstart
/*= CString()*/
463 , const CString
& blockend
/*= CString()*/
464 , const std::wregex
& rx
/*= std::wregex()*/
465 , const std::wstring
& replacement
/*=L""*/)
467 m_sCommentLine
= linestart
;
468 m_sCommentBlockStart
= blockstart
;
469 m_sCommentBlockEnd
= blockend
;
473 CString destPath
= sFilePath
;
474 // now make sure that the destination directory exists
476 while (destPath
.Find('\\', ind
)>=2)
478 if (!PathIsDirectory(destPath
.Left(destPath
.Find('\\', ind
))))
480 if (!CreateDirectory(destPath
.Left(destPath
.Find('\\', ind
)), nullptr))
483 ind
= destPath
.Find('\\', ind
)+1;
486 CStdioFile file
; // Hugely faster than CFile for big file writes - because it uses buffering
487 if (!file
.Open(sFilePath
, CFile::modeCreate
| CFile::modeWrite
| CFile::typeBinary
| CFile::shareDenyNone
))
489 m_sErrorString
.Format(IDS_ERR_FILE_OPEN
, static_cast<LPCWSTR
>(sFilePath
));
493 std::unique_ptr
<CEncodeFilter
> pFilter
;
494 bool bSaveBom
= true;
495 CFileTextLines::UnicodeType eUnicodeType
= bSaveAsUTF8
? CFileTextLines::UnicodeType::UTF8
: m_SaveParams
.m_UnicodeType
;
496 switch (eUnicodeType
)
499 case CFileTextLines::UnicodeType::ASCII
:
501 pFilter
= std::make_unique
<CAsciiFilter
>(&file
);
503 case CFileTextLines::UnicodeType::UTF8
:
506 case CFileTextLines::UnicodeType::UTF8BOM
:
507 pFilter
= std::make_unique
<CUtf8Filter
>(&file
);
509 case CFileTextLines::UnicodeType::UTF16_BE
:
511 pFilter
= std::make_unique
<CUtf16beFilter
>(&file
);
513 case CFileTextLines::UnicodeType::UTF16_BEBOM
:
514 pFilter
= std::make_unique
<CUtf16beFilter
>(&file
);
516 case CFileTextLines::UnicodeType::UTF16_LE
:
518 pFilter
= std::make_unique
<CUtf16leFilter
>(&file
);
520 case CFileTextLines::UnicodeType::UTF16_LEBOM
:
521 pFilter
= std::make_unique
<CUtf16leFilter
>(&file
);
523 case CFileTextLines::UnicodeType::UTF32_BE
:
524 pFilter
= std::make_unique
<CUtf32beFilter
>(&file
);
526 case CFileTextLines::UnicodeType::UTF32_LE
:
527 pFilter
= std::make_unique
<CUtf32leFilter
>(&file
);
533 //first write the BOM
534 pFilter
->Write(L
"\xfeff");
537 CBuffer oEncodedEol
[static_cast<int>(EOL::_COUNT
)];
538 oEncodedEol
[static_cast<int>(EOL::LF
)] = pFilter
->Encode(L
"\n"); // x0a
539 oEncodedEol
[static_cast<int>(EOL::CR
)] = pFilter
->Encode(L
"\r"); // x0d
540 oEncodedEol
[static_cast<int>(EOL::CRLF
)] = pFilter
->Encode(L
"\r\n"); // x0d x0a
541 if (bUseSVNCompatibleEOLs
)
543 // when using EOLs that are supported by the svn lib,
544 // we have to use the same EOLs as the file has in case
545 // they're already supported, but a different supported one
546 // in case the original one isn't supported.
547 // Only this way the option "ignore EOLs (recommended)" unchecked
548 // actually shows the lines as different.
549 // However, the diff won't find any differences in EOLs
550 // for these special EOLs if they differ between those special ones
552 // But it will work properly for the most common EOLs LF/CR/CRLF.
553 oEncodedEol
[static_cast<int>(EOL::LFCR
)] = oEncodedEol
[static_cast<int>(EOL::CR
)];
554 for (int nEol
= 0; nEol
< static_cast<int>(EOL::NoEnding
); nEol
++)
556 if (oEncodedEol
[nEol
].IsEmpty())
557 oEncodedEol
[nEol
] = oEncodedEol
[static_cast<int>(EOL::LF
)];
562 oEncodedEol
[static_cast<int>(EOL::LFCR
)] = pFilter
->Encode(L
"\n\r");
563 oEncodedEol
[static_cast<int>(EOL::VT
)] = pFilter
->Encode(L
"\v"); // x0b
564 oEncodedEol
[static_cast<int>(EOL::FF
)] = pFilter
->Encode(L
"\f"); // x0c
565 oEncodedEol
[static_cast<int>(EOL::NEL
)] = pFilter
->Encode(L
"\x85");
566 oEncodedEol
[static_cast<int>(EOL::LS
)] = pFilter
->Encode(L
"\x2028");
567 oEncodedEol
[static_cast<int>(EOL::PS
)] = pFilter
->Encode(L
"\x2029");
569 oEncodedEol
[static_cast<int>(EOL::AutoLine
)] = oEncodedEol
[static_cast<int>(m_SaveParams
.m_LineEndings
== EOL::AutoLine
? EOL::CRLF
: m_SaveParams
.m_LineEndings
)];
571 bool bInBlockComment
= false;
572 for (int i
=0; i
<GetCount(); i
++)
574 CString sLineT
= GetAt(i
);
576 bInBlockComment
= StripComments(sLineT
, bInBlockComment
);
578 LineRegex(sLineT
, rx
, replacement
);
579 StripWhiteSpace(sLineT
, dwIgnoreWhitespaces
, bBlame
);
581 sLineT
= sLineT
.MakeLower();
582 pFilter
->Write(sLineT
);
583 EOL eEol
= GetLineEnding(i
);
584 pFilter
->Write(oEncodedEol
[static_cast<int>(eEol
)]);
588 catch (CException
* e
)
590 e
->GetErrorMessage(CStrBuf(m_sErrorString
, 4096), 4096);
597 void CFileTextLines::SetErrorString()
599 m_sErrorString
= static_cast<LPCWSTR
>(CFormatMessageWrapper());
602 void CFileTextLines::CopySettings(CFileTextLines
* pFileToCopySettingsTo
) const
604 if (pFileToCopySettingsTo
)
606 pFileToCopySettingsTo
->m_SaveParams
= m_SaveParams
;
610 const wchar_t * CFileTextLines::GetEncodingName(UnicodeType eEncoding
)
614 case UnicodeType::ASCII
:
616 case UnicodeType::BINARY
:
618 case UnicodeType::UTF16_LE
:
620 case UnicodeType::UTF16_LEBOM
:
621 return L
"UTF-16LE BOM";
622 case UnicodeType::UTF16_BE
:
624 case UnicodeType::UTF16_BEBOM
:
625 return L
"UTF-16BE BOM";
626 case UnicodeType::UTF32_LE
:
628 case UnicodeType::UTF32_BE
:
630 case UnicodeType::UTF8
:
632 case UnicodeType::UTF8BOM
:
638 bool CFileTextLines::IsInsideString(const CString
& sLine
, int pos
)
642 auto spos
= sLine
.Find('"');
643 while (spos
>= 0 && spos
< pos
)
646 spos
= sLine
.Find('"', spos
+ 1);
648 auto cpos
= sLine
.Find('\'');
649 while (cpos
>= 0 && cpos
< pos
)
652 cpos
= sLine
.Find('"', cpos
+ 1);
654 return (scount
% 2 != 0 || ccount
% 2 != 0);
657 bool CFileTextLines::StripComments( CString
& sLine
, bool bInBlockComment
)
660 int oldStartPos
= -1;
665 int endpos
= sLine
.Find(m_sCommentBlockEnd
);
666 if (IsInsideString(sLine
, endpos
))
668 if (endpos
>= 0 && (endpos
> startpos
|| endpos
== 0))
670 sLine
= sLine
.Left(startpos
) + sLine
.Mid(endpos
+ m_sCommentBlockEnd
.GetLength());
671 bInBlockComment
= false;
676 sLine
= sLine
.Left(startpos
);
680 if (!bInBlockComment
)
682 startpos
= m_sCommentBlockStart
.IsEmpty() ? -1 : sLine
.Find(m_sCommentBlockStart
, startpos
);
683 int startpos2
= m_sCommentLine
.IsEmpty() ? -1 : sLine
.Find(m_sCommentLine
);
684 if ((startpos2
< startpos
&& startpos2
>= 0) || (startpos2
>= 0 && startpos
< 0))
687 // look if there's a string marker (" or ') before that
688 // note: this check is not fully correct. For example, it
689 // does not account for escaped chars or even multiline strings.
690 // but it has to be fast, so this has to do...
691 if (!IsInsideString(sLine
, startpos2
))
693 // line comment, erase the rest of the line
694 sLine
= sLine
.Left(startpos2
);
697 if (startpos
== oldStartPos
)
699 oldStartPos
= startpos
;
701 else if (startpos
>= 0)
703 // starting block comment
704 if (!IsInsideString(sLine
, startpos
))
705 bInBlockComment
= true;
710 } while (startpos
>= 0);
712 return bInBlockComment
;
715 void CFileTextLines::LineRegex( CString
& sLine
, const std::wregex
& rx
, const std::wstring
& replacement
) const
717 std::wstring str
= static_cast<LPCWSTR
>(sLine
);
718 std::wstring str2
= std::regex_replace(str
, rx
, replacement
);
719 sLine
= str2
.c_str();
723 void CBuffer::ExpandToAtLeast(int nNewSize
)
725 ASSERT(nNewSize
>= 0);
726 if (nNewSize
>m_nAllocated
)
728 Free(); // we don't preserve buffer content intentionally
729 if (INT_MAX
- (2048 - 1) >= nNewSize
)
731 nNewSize
+= 2048 - 1;
732 nNewSize
&= ~(1024 - 1);
736 m_pBuffer
=new BYTE
[nNewSize
];
737 m_nAllocated
=nNewSize
;
741 void CBuffer::SetLength(int nUsed
)
744 ExpandToAtLeast(nUsed
);
748 void CBuffer::Swap(CBuffer
& Src
) noexcept
750 std::swap(Src
.m_nAllocated
, m_nAllocated
);
751 std::swap(Src
.m_pBuffer
, m_pBuffer
);
752 std::swap(Src
.m_nUsed
, m_nUsed
);
755 void CBuffer::Copy(const CBuffer
& Src
)
759 SetLength(Src
.m_nUsed
);
760 memcpy(m_pBuffer
, Src
.m_pBuffer
, m_nUsed
);
765 bool CAsciiFilter::Decode(std::unique_ptr
<BYTE
[]> data
, int len
)
768 int nFlags
= (m_nCodePage
==CP_ACP
) ? MB_PRECOMPOSED
: 0;
769 // dry decode is around 8 times faster than the real one, alternatively we can set buffer to max length
770 int nReadChars
= MultiByteToWideChar(m_nCodePage
, nFlags
, reinterpret_cast<LPCSTR
>(data
.get()), len
, nullptr, 0);
773 m_pBuffer
= new wchar_t[nReadChars
];
774 int ret2
= MultiByteToWideChar(m_nCodePage
, nFlags
, reinterpret_cast<LPCSTR
>(data
.get()), len
, m_pBuffer
, nReadChars
);
775 if (ret2
!= nReadChars
)
778 m_iBufferLength
= nReadChars
;
783 const CBuffer
& CAsciiFilter::Encode(const CString
& s
)
785 if (int bufferSize
; IntMult(s
.GetLength(), 3, &bufferSize
) != S_OK
|| IntAdd(bufferSize
, 1, &bufferSize
) != S_OK
)
786 AtlThrow(E_OUTOFMEMORY
);
788 m_oBuffer
.SetLength(bufferSize
); // set buffer to guessed max size
789 int nConvertedLen
= WideCharToMultiByte(m_nCodePage
, 0, static_cast<LPCWSTR
>(s
), s
.GetLength(), static_cast<LPSTR
>(m_oBuffer
), m_oBuffer
.GetLength(), nullptr, nullptr);
790 m_oBuffer
.SetLength(nConvertedLen
); // set buffer to used size
795 bool CUtf16leFilter::Decode(std::unique_ptr
<BYTE
[]> data
, int len
)
798 // we believe data is ok for use
799 m_deleter
= [](void* ptr
) { delete[] static_cast<BYTE
*>(ptr
); };
800 m_pBuffer
= reinterpret_cast<wchar_t*>(data
.release());
801 m_iBufferLength
= len
/ sizeof(wchar_t);
805 const CBuffer
& CUtf16leFilter::Encode(const CString
& s
)
808 if (IntMult(s
.GetLength(), sizeof(wchar_t), &nNeedBytes
) != S_OK
)
809 AtlThrow(E_OUTOFMEMORY
);
810 m_oBuffer
.SetLength(nNeedBytes
);
811 memcpy(static_cast<void*>(m_oBuffer
), static_cast<LPCWSTR
>(s
), nNeedBytes
);
816 bool CUtf16beFilter::Decode(std::unique_ptr
<BYTE
[]> data
, int len
)
819 // make in place WORD BYTEs swap
820 auto p_qw
= static_cast<UINT64
*>(static_cast<void*>(data
.get()));
821 int nQwords
= len
/ 8;
822 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
824 p_qw
[nQword
] = WordSwapBytes(p_qw
[nQword
]);
826 auto p_w
= reinterpret_cast<wchar_t*>(p_qw
);
827 int nWords
= len
/ 2;
828 for (int nWord
= nQwords
*4; nWord
<nWords
; nWord
++)
830 p_w
[nWord
] = WideCharSwap(p_w
[nWord
]);
832 return CUtf16leFilter::Decode(std::move(data
), len
);
835 const CBuffer
& CUtf16beFilter::Encode(const CString
& s
)
838 if (IntMult(s
.GetLength(), sizeof(wchar_t), &nNeedBytes
) != S_OK
)
839 AtlThrow(E_OUTOFMEMORY
);
840 m_oBuffer
.SetLength(nNeedBytes
);
841 // copy swapping BYTE order in WORDs
842 auto p_qwIn
= reinterpret_cast<const UINT64
*>(static_cast<LPCWSTR
>(s
));
843 auto p_qwOut
= static_cast<UINT64
*>(static_cast<void*>(m_oBuffer
));
844 int nQwords
= nNeedBytes
/8;
845 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
847 p_qwOut
[nQword
] = WordSwapBytes(p_qwIn
[nQword
]);
849 auto p_wIn
= reinterpret_cast<const wchar_t*>(p_qwIn
);
850 auto p_wOut
= reinterpret_cast<wchar_t*>(p_qwOut
);
851 int nWords
= nNeedBytes
/2;
852 for (int nWord
= nQwords
*4; nWord
<nWords
; nWord
++)
854 p_wOut
[nWord
] = WideCharSwap(p_wIn
[nWord
]);
860 bool CUtf32leFilter::Decode(std::unique_ptr
<BYTE
[]> data
, int len
)
863 // UTF32 have four bytes per char
864 int nReadChars
= len
/ 4;
865 auto p32
= static_cast<UINT32
*>(static_cast<void*>(data
.get()));
867 // count chars which needs surrogate pair
868 int nSurrogatePairCount
= 0;
869 for (int i
= 0; i
<nReadChars
; ++i
)
871 if (p32
[i
]<0x110000 && p32
[i
]>=0x10000)
873 ++nSurrogatePairCount
;
878 if (int bufferSize
; IntAdd(nReadChars
, nSurrogatePairCount
, &bufferSize
) != S_OK
)
879 AtlThrow(E_OUTOFMEMORY
);
881 m_pBuffer
= new wchar_t[bufferSize
]; // set buffer to guessed max size
882 auto pOut
= m_pBuffer
;
883 for (int i
= 0; i
<nReadChars
; ++i
, ++pOut
)
885 UINT32 zChar
= p32
[i
];
888 *pOut
=0xfffd; // ? mark
890 else if (zChar
>=0x10000)
893 pOut
[0] = ((zChar
>>10)&0x3ff) | 0xd800; // lead surrogate
894 pOut
[1] = (zChar
&0x7ff) | 0xdc00; // trail surrogate
899 *pOut
= static_cast<wchar_t>(zChar
);
902 m_iBufferLength
= nReadChars
;
906 const CBuffer
& CUtf32leFilter::Encode(const CString
& s
)
908 int nInWords
= s
.GetLength();
909 if (int bufferSize
; IntMult(nInWords
, 2, &bufferSize
) != S_OK
)
910 AtlThrow(E_OUTOFMEMORY
);
912 m_oBuffer
.SetLength(bufferSize
);
914 auto p_In
= static_cast<LPCWSTR
>(s
);
915 auto p_Out
= static_cast<UINT32
*>(static_cast<void*>(m_oBuffer
));
917 for (int nInWord
= 0; nInWord
<nInWords
; nInWord
++, nOutDword
++)
919 UINT32 zChar
= p_In
[nInWord
];
920 if ((zChar
&0xfc00) == 0xd800) // lead surrogate
922 if (nInWord
+1<nInWords
&& (p_In
[nInWord
+1]&0xfc00) == 0xdc00) // trail surrogate follows
924 zChar
= 0x10000 + ((zChar
&0x3ff)<<10) + (p_In
[++nInWord
]&0x3ff);
928 zChar
= 0xfffd; // ? mark
931 else if ((zChar
&0xfc00) == 0xdc00) // trail surrogate without lead
933 zChar
= 0xfffd; // ? mark
935 p_Out
[nOutDword
] = zChar
;
937 if (int bufferSize
; IntMult(nOutDword
, 4, &bufferSize
) != S_OK
)
938 AtlThrow(E_OUTOFMEMORY
);
940 m_oBuffer
.SetLength(bufferSize
); // store length reduced by surrogates
945 bool CUtf32beFilter::Decode(std::unique_ptr
<BYTE
[]> data
, int len
)
947 // swap BYTEs order in DWORDs
948 auto p64
= static_cast<UINT64
*>(static_cast<void*>(data
.get()));
949 int nQwords
= len
/ 8;
950 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
952 p64
[nQword
] = DwordSwapBytes(p64
[nQword
]);
955 auto p32
= reinterpret_cast<UINT32
*>(p64
);
956 int nDwords
= len
/ 4;
957 for (int nDword
= nQwords
*2; nDword
<nDwords
; nDword
++)
959 p32
[nDword
] = DwordSwapBytes(p32
[nDword
]);
961 return CUtf32leFilter::Decode(std::move(data
), len
);
964 const CBuffer
& CUtf32beFilter::Encode(const CString
& s
)
966 CUtf32leFilter::Encode(s
);
968 // swap BYTEs order in DWORDs
969 auto p64
= static_cast<UINT64
*>(static_cast<void*>(m_oBuffer
));
970 int nQwords
= m_oBuffer
.GetLength()/8;
971 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
973 p64
[nQword
] = DwordSwapBytes(p64
[nQword
]);
976 auto p32
= reinterpret_cast<UINT32
*>(p64
);
977 int nDwords
= m_oBuffer
.GetLength()/4;
978 for (int nDword
= nQwords
*2; nDword
<nDwords
; nDword
++)
980 p32
[nDword
] = DwordSwapBytes(p32
[nDword
]);