1 // TortoiseGitMerge - a Diff/Patch program
3 // Copyright (C) 2016 - TortoiseGit
4 // Copyright (C) 2007-2016 - TortoiseSVN
6 // This program is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU General Public License
8 // as published by the Free Software Foundation; either version 2
9 // of the License, or (at your option) any later version.
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this program; if not, write to the Free Software Foundation,
18 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 #include "UnicodeUtils.h"
24 #include "FileTextLines.h"
25 #include "FormatMessageWrapper.h"
26 #include "SmartHandle.h"
/**
 * Swaps the two bytes of a UTF-16 code unit (little endian <-> big endian).
 *
 * Only the low 16 bits of \a nValue take part in the swap; the result always
 * fits into 16 bits.
 *
 * \param nValue  the code unit to convert
 * \return the code unit with its high and low byte exchanged
 */
wchar_t inline WideCharSwap(wchar_t nValue)
{
	// Work on an explicitly masked unsigned value: the result then does not
	// depend on the width or signedness of wchar_t, nor on an implicit
	// int -> wchar_t narrowing conversion.
	const unsigned int nUnit = static_cast<unsigned int>(nValue) & 0xffffu;
	return static_cast<wchar_t>(((nUnit >> 8) | (nUnit << 8)) & 0xffffu);
	//return _byteswap_ushort(nValue);
}
34 UINT64
inline WordSwapBytes(UINT64 nValue
)
36 return ((nValue
&0xff00ff00ff00ff)<<8) | ((nValue
>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
39 UINT32
inline DwordSwapBytes(UINT32 nValue
)
41 UINT32 nRet
= (nValue
<<16) | (nValue
>>16); // swap WORDs
42 nRet
= ((nRet
&0xff00ff)<<8) | ((nRet
>>8)&0xff00ff); // swap BYTESs in WORDs
44 //return _byteswap_ulong(nValue);
47 UINT64
inline DwordSwapBytes(UINT64 nValue
)
49 UINT64 nRet
= ((nValue
&0xffff0000ffffL
)<<16) | ((nValue
>>16)&0xffff0000ffffL
); // swap WORDs in DWORDs
50 nRet
= ((nRet
&0xff00ff00ff00ff)<<8) | ((nRet
>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
54 CFileTextLines::CFileTextLines(void)
55 : m_bNeedsConversion(false)
56 , m_bKeepEncoding(false)
58 m_SaveParams
.m_UnicodeType
= CFileTextLines::AUTOTYPE
;
59 m_SaveParams
.m_LineEndings
= EOL_AUTOLINE
;
62 CFileTextLines::~CFileTextLines(void)
/**
 * Guesses the encoding of the raw file content in \a pBuffer (\a cb bytes).
 *
 * Checks that are visible in this function:
 *  - a 32-bit value of 0x00000000 anywhere in the buffer -> BINARY
 *  - a UTF-32 byte order mark -> UTF32_LE / UTF32_BE
 *  - a UTF-16 byte order mark -> UTF16_LEBOM / UTF16_BEBOM
 *  - the UTF-8 byte order mark -> UTF8BOM
 *  - more null chars than one fiftieth of the buffer length -> UTF16_LE / UTF16_BE
 *  - non-ASCII bytes forming complete UTF-8 sequences -> UTF8
 *  - pure ASCII content -> UTF8 if the "UseUTF8" registry value is set, else ASCII
 *
 * NOTE(review): several guarding conditions and declarations (e.g. for nDwords,
 * nullcount, nNeedData and the loop index i) are not visible in this listing;
 * confirm them before relying on the exact order of the checks.
 */
66 CFileTextLines::UnicodeType
CFileTextLines::CheckUnicodeType(LPVOID pBuffer
, int cb
)
69 return CFileTextLines::ASCII
;
// three views onto the same buffer: as 32-bit, 16-bit and 8-bit units
70 const UINT32
* const pVal32
= (UINT32
*)pBuffer
;
71 const UINT16
* const pVal16
= (UINT16
*)pBuffer
;
72 const UINT8
* const pVal8
= (UINT8
*)pBuffer
;
73 // scan the whole buffer for a 0x00000000 sequence
74 // if found, we assume a binary file
76 for (int j
=0; j
<nDwords
; ++j
)
78 if (0x00000000 == pVal32
[j
])
79 return CFileTextLines::BINARY
;
// a UTF-32 byte order mark at the start decides the type immediately
83 if (*pVal32
== 0x0000FEFF)
85 return CFileTextLines::UTF32_LE
;
87 if (*pVal32
== 0xFFFE0000)
89 return CFileTextLines::UTF32_BE
;
// same for a UTF-16 byte order mark
92 if (*pVal16
== 0xFEFF)
94 return CFileTextLines::UTF16_LEBOM
;
96 if (*pVal16
== 0xFFFE)
98 return CFileTextLines::UTF16_BEBOM
;
101 return CFileTextLines::ASCII
;
// the 16-bit value 0xBBEF followed by the byte 0xBF is compared against the
// UTF-8 byte order mark (bytes EF BB BF)
102 if (*pVal16
== 0xBBEF)
104 if (pVal8
[2] == 0xBF)
105 return CFileTextLines::UTF8BOM
;
107 // check for illegal UTF8 sequences
108 bool bNonANSI
= false;
117 // count the null chars, we do not want to treat an ASCII/UTF8 file
118 // as UTF16 just because of some null chars that might be accidentally
120 // Use an arbitrary value of one fiftieth of the file length as
121 // the limit after which a file is considered UTF16.
122 if (nullcount
>(cb
/ 50))
124 // null-chars are not allowed for ASCII or UTF8, that means
125 // this file is most likely UTF16 encoded
127 return CFileTextLines::UTF16_LE
;
129 return CFileTextLines::UTF16_BE
;
132 if ((pVal8
[i
] & 0x80) != 0) // non ASCII
138 // check remaining text for UTF-8 validity
141 UINT8 zChar
= pVal8
[i
];
142 if ((zChar
& 0x80)==0) // Ascii
147 // count the null chars, we do not want to treat an ASCII/UTF8 file
148 // as UTF16 just because of some null chars that might be accidentally
150 // Use an arbitrary value of one fiftieth of the file length as
151 // the limit after which a file is considered UTF16.
152 if (nullcount
> (cb
/ 50))
154 // null-chars are not allowed for ASCII or UTF8, that means
155 // this file is most likely UTF16 encoded
157 return CFileTextLines::UTF16_LE
;
159 return CFileTextLines::UTF16_BE
;
165 return CFileTextLines::ASCII
;
// the tests below look at the leading bits of a non-ASCII byte; every
// visible failure path falls back to ASCII
169 if ((zChar
& 0x40)==0) // top bit
172 return CFileTextLines::ASCII
;
177 return CFileTextLines::ASCII
;
179 else if ((zChar
& 0x20)==0) // top two bits
182 return CFileTextLines::ASCII
;
185 else if ((zChar
& 0x10)==0) // top three bits
189 else if ((zChar
& 0x08)==0) // top four bits
192 return CFileTextLines::ASCII
;
196 return CFileTextLines::ASCII
;
198 if (bNonANSI
&& nNeedData
==0)
199 // if get here thru nonAscii and no missing data left then its valid UTF8
200 return CFileTextLines::UTF8
;
// pure ASCII content is reported as UTF8 only when the user enabled it in the registry
201 if (!bNonANSI
&& (DWORD(CRegDWORD(L
"Software\\TortoiseGitMerge\\UseUTF8", FALSE
))))
202 return CFileTextLines::UTF8
;
203 return CFileTextLines::ASCII
;
/**
 * Loads the file \a sFilePath and splits its content into lines.
 *
 * Steps visible in this function:
 *  - reset the line ending to EOL_AUTOLINE and, unless m_bKeepEncoding is set,
 *    the encoding to AUTOTYPE
 *  - a directory path sets the IDS_ERR_FILE_NOTAFILE error; a path that does
 *    not exist is not treated as an error
 *  - read the file content into a CBuffer with ReadFile
 *  - detect the encoding with CheckUnicodeType() when it is still AUTOTYPE
 *  - decode the buffer with the filter class matching the encoding
 *  - walk the decoded text, adding one entry per line together with its
 *    line ending, and count how often each line ending occurs
 *  - store the most frequent line ending in m_SaveParams.m_LineEndings
 *
 * Errors are reported through m_sErrorString.
 *
 * NOTE(review): the return statements and the use of \a lengthHint are not
 * visible in this listing; confirm them against the complete file.
 */
207 BOOL
CFileTextLines::Load(const CString
& sFilePath
, int lengthHint
/* = 0*/)
209 WCHAR exceptionError
[1000] = {0};
210 m_SaveParams
.m_LineEndings
= EOL_AUTOLINE
;
211 if (!m_bKeepEncoding
)
212 m_SaveParams
.m_UnicodeType
= CFileTextLines::AUTOTYPE
;
219 if (PathIsDirectory(sFilePath
))
221 m_sErrorString
.Format(IDS_ERR_FILE_NOTAFILE
, (LPCTSTR
)sFilePath
);
225 if (!PathFileExists(sFilePath
))
227 //file does not exist, so just return SUCCESS
// open for reading while still allowing others to read, write and delete the file
231 CAutoFile hFile
= CreateFile(sFilePath
, GENERIC_READ
, FILE_SHARE_READ
| FILE_SHARE_DELETE
| FILE_SHARE_WRITE
, nullptr, OPEN_EXISTING
, 0, nullptr);
239 if (!GetFileSizeEx(hFile
, &fsize
))
246 // file is way too big for us
247 m_sErrorString
.LoadString(IDS_ERR_FILE_TOOBIG
);
252 // If new[] was done for type T delete[] must be called on a pointer of type T*,
253 // otherwise the behavior is undefined.
254 // +1 is to address possible truncation when integer division is done
// size the buffer to the file length (low 32 bits of fsize)
258 oFile
.SetLength(fsize
.LowPart
);
260 catch (CMemoryException
* e
)
262 e
->GetErrorMessage(exceptionError
, _countof(exceptionError
));
263 m_sErrorString
= exceptionError
;
268 DWORD dwReadBytes
= 0;
269 if (!ReadFile(hFile
, (void*)oFile
, fsize
.LowPart
, &dwReadBytes
, nullptr))
// only guess the encoding when it has not been fixed already
277 if (m_SaveParams
.m_UnicodeType
== CFileTextLines::AUTOTYPE
)
279 m_SaveParams
.m_UnicodeType
= this->CheckUnicodeType((LPVOID
)oFile
, dwReadBytes
);
281 // enforce conversion for all but ASCII and UTF8 type
282 m_bNeedsConversion
= (m_SaveParams
.m_UnicodeType
!= CFileTextLines::UTF8
) && (m_SaveParams
.m_UnicodeType
!= CFileTextLines::ASCII
);
284 // we may have to convert the file content - CString is UTF16LE
// pick the decoder matching the encoding; a binary file is reported as an error
287 CBaseFilter
* pFilter
= nullptr;
288 switch (m_SaveParams
.m_UnicodeType
)
291 m_sErrorString
.Format(IDS_ERR_FILE_BINARY
, (LPCTSTR
)sFilePath
);
295 pFilter
= new CUtf8Filter(nullptr);
299 pFilter
= new CAsciiFilter(nullptr);
303 pFilter
= new CUtf16beFilter(nullptr);
307 pFilter
= new CUtf16leFilter(nullptr);
310 pFilter
= new CUtf32beFilter(nullptr);
313 pFilter
= new CUtf32leFilter(nullptr);
316 pFilter
->Decode(oFile
);
319 catch (CMemoryException
* e
)
321 e
->GetErrorMessage(exceptionError
, _countof(exceptionError
));
322 m_sErrorString
= exceptionError
;
// from here on the buffer is treated as an array of wchar_t
326 int nReadChars
=oFile
.GetLength()/sizeof(wchar_t);
327 wchar_t * pTextBuf
= (wchar_t *)oFile
;
328 wchar_t * pLineStart
= pTextBuf
;
// NOTE(review): the body of this check is not visible here; these are the
// encodings with a byte order mark, so presumably the mark is skipped - confirm
329 if ((m_SaveParams
.m_UnicodeType
== UTF8BOM
)
330 || (m_SaveParams
.m_UnicodeType
== UTF16_LEBOM
)
331 || (m_SaveParams
.m_UnicodeType
== UTF16_BEBOM
)
332 || (m_SaveParams
.m_UnicodeType
== UTF32_LE
)
333 || (m_SaveParams
.m_UnicodeType
== UTF32_BE
))
341 // fill in the lines into the array
342 size_t countEOLs
[EOL__COUNT
] = { 0 };
343 CFileTextLine oTextLine
;
// i counts the characters that are still left to examine
344 for (int i
= nReadChars
; i
; --i
)
350 // crlf line ending or cr line ending
351 eEol
= ((i
> 1) && *(pTextBuf
) == '\n') ? EOL_CRLF
: EOL_CR
;
354 // lfcr line ending or lf line ending
355 eEol
= ((i
> 1) && *(pTextBuf
) == '\r') ? EOL_LFCR
: EOL_LF
;
356 if (eEol
== EOL_LFCR
)
358 // LFCR is very rare on Windows, so we have to double check
359 // that this is not just a LF followed by CRLF
360 if (((countEOLs
[EOL_CRLF
] > 1) || (countEOLs
[EOL_LF
] > 1) || (GetCount() < 2)) &&
361 ((i
> 2) && (*(pTextBuf
+1) == '\n')))
363 // change the EOL back to a simple LF
// store the text between the line start and the EOL character together with the EOL type
386 oTextLine
.sLine
= CString(pLineStart
, (int)(pTextBuf
-pLineStart
)-1);
387 oTextLine
.eEnding
= eEol
;
388 CStdFileLineArray::Add(oTextLine
);
390 if (eEol
==EOL_CRLF
|| eEol
==EOL_LFCR
)
395 pLineStart
= pTextBuf
;
// whatever follows the last EOL is added as a line without ending
397 CString
line(pLineStart
, (int)(pTextBuf
-pLineStart
));
398 Add(line
, EOL_NOENDING
);
400 // some EOLs are not supported by the svn diff lib.
401 m_bNeedsConversion
|= (countEOLs
[EOL_CRLF
]!=0);
402 m_bNeedsConversion
|= (countEOLs
[EOL_FF
]!=0);
403 m_bNeedsConversion
|= (countEOLs
[EOL_VT
]!=0);
404 m_bNeedsConversion
|= (countEOLs
[EOL_NEL
]!=0);
405 m_bNeedsConversion
|= (countEOLs
[EOL_LS
]!=0);
406 m_bNeedsConversion
|= (countEOLs
[EOL_PS
]!=0);
// the most frequent line ending becomes the line ending of the file
409 for (int nEol
= 0; nEol
<EOL__COUNT
; nEol
++)
411 if (eolmax
< countEOLs
[nEol
])
413 eolmax
= countEOLs
[nEol
];
414 m_SaveParams
.m_LineEndings
= (EOL
)nEol
;
/**
 * Removes whitespace (spaces and tabs) from \a sLine, in place.
 *
 * \a dwIgnoreWhitespaces selects one of four modes: compare whitespaces
 * (nothing is trimmed), ignore all whitespaces (trim both ends), ignore
 * leading whitespace (trim left) or ignore ending whitespace (trim right).
 *
 * A line longer than 66 characters can additionally lose its first 66
 * characters.
 * NOTE(review): the condition guarding that cut (presumably \a blame) and the
 * case labels of the switch are not visible in this listing; confirm them.
 */
421 void CFileTextLines::StripWhiteSpace(CString
& sLine
, DWORD dwIgnoreWhitespaces
, bool blame
)
425 if (sLine
.GetLength() > 66)
426 sLine
= sLine
.Mid(66);
428 switch (dwIgnoreWhitespaces
)
431 // Compare whitespaces
435 // Ignore all whitespaces
436 sLine
.TrimLeft(L
" \t");
437 sLine
.TrimRight(L
" \t");
440 // Ignore leading whitespace
441 sLine
.TrimLeft(L
" \t");
444 // Ignore ending whitespace
445 sLine
.TrimRight(L
" \t");
454 - modify line - whitespaces, lowercase
456 - get cached encoded eol
/**
 * Writes all lines to the file \a sFilePath.
 *
 * Steps visible in this function:
 *  - remember the comment markers (\a linestart, \a blockstart, \a blockend)
 *    in the corresponding members
 *  - create the missing directories of the destination path
 *  - open the file for writing in binary mode; a failure sets the
 *    IDS_ERR_FILE_OPEN error
 *  - create the encoding filter for the target encoding; \a bSaveAsUTF8
 *    overrides the encoding stored in m_SaveParams with UTF8
 *  - encode every line ending once up front; with \a bUseSVNCompatibleEOLs
 *    LFCR is written as CR and every ending without an encoding as LF
 *  - per line: StripComments(), LineRegex() with \a rx / \a replacement,
 *    StripWhiteSpace() with \a dwIgnoreWhitespaces / \a bBlame, MakeLower(),
 *    then write the text followed by its encoded line ending
 *  - a CException raised while writing puts its message into m_sErrorString
 *
 * NOTE(review): the conditions guarding the per-line steps (presumably
 * \a bIgnoreComments and \a bIgnoreCase), the handling of bSaveBom, the return
 * statements and the cleanup of pFilter are not visible in this listing.
 */
459 BOOL
CFileTextLines::Save( const CString
& sFilePath
460 , bool bSaveAsUTF8
/*= false */
461 , bool bUseSVNCompatibleEOLs
/*= false */
462 , DWORD dwIgnoreWhitespaces
/*= 0 */
463 , BOOL bIgnoreCase
/*= FALSE */
464 , bool bBlame
/*= false*/
465 , bool bIgnoreComments
/*= false*/
466 , const CString
& linestart
/*= CString()*/
467 , const CString
& blockstart
/*= CString()*/
468 , const CString
& blockend
/*= CString()*/
469 , const std::wregex
& rx
/*= std::wregex(L"")*/
470 , const std::wstring
& replacement
/*=L""*/)
// keep the comment markers in the members read by StripComments()
472 m_sCommentLine
= linestart
;
473 m_sCommentBlockStart
= blockstart
;
474 m_sCommentBlockEnd
= blockend
;
478 CString destPath
= sFilePath
;
479 // now make sure that the destination directory exists
// walk the path one backslash at a time and create each missing directory
481 while (destPath
.Find('\\', ind
)>=2)
483 if (!PathIsDirectory(destPath
.Left(destPath
.Find('\\', ind
))))
485 if (!CreateDirectory(destPath
.Left(destPath
.Find('\\', ind
)), nullptr))
488 ind
= destPath
.Find('\\', ind
)+1;
491 CStdioFile file
; // Hugely faster than CFile for big file writes - because it uses buffering
492 if (!file
.Open(sFilePath
, CFile::modeCreate
| CFile::modeWrite
| CFile::typeBinary
| CFile::shareDenyNone
))
494 const_cast<CString
*>(&m_sErrorString
)->Format(IDS_ERR_FILE_OPEN
, (LPCTSTR
)sFilePath
);
498 CBaseFilter
* pFilter
= nullptr;
499 bool bSaveBom
= true;
// bSaveAsUTF8 overrides the encoding stored in m_SaveParams
500 CFileTextLines::UnicodeType eUnicodeType
= bSaveAsUTF8
? CFileTextLines::UTF8
: m_SaveParams
.m_UnicodeType
;
501 switch (eUnicodeType
)
504 case CFileTextLines::ASCII
:
506 pFilter
= new CAsciiFilter(&file
);
508 case CFileTextLines::UTF8
:
510 case CFileTextLines::UTF8BOM
:
511 pFilter
= new CUtf8Filter(&file
);
513 case CFileTextLines::UTF16_BE
:
515 pFilter
= new CUtf16beFilter(&file
);
517 case CFileTextLines::UTF16_BEBOM
:
518 pFilter
= new CUtf16beFilter(&file
);
520 case CFileTextLines::UTF16_LE
:
522 pFilter
= new CUtf16leFilter(&file
);
524 case CFileTextLines::UTF16_LEBOM
:
525 pFilter
= new CUtf16leFilter(&file
);
527 case CFileTextLines::UTF32_BE
:
528 pFilter
= new CUtf32beFilter(&file
);
530 case CFileTextLines::UTF32_LE
:
531 pFilter
= new CUtf32leFilter(&file
);
537 //first write the BOM
538 pFilter
->Write(L
"\xfeff");
// encode every line ending once, so the line loop below only writes cached bytes
541 CBuffer oEncodedEol
[EOL__COUNT
];
542 oEncodedEol
[EOL_LF
] = pFilter
->Encode(L
"\n"); // x0a
543 oEncodedEol
[EOL_CR
] = pFilter
->Encode(L
"\r"); // x0d
544 oEncodedEol
[EOL_CRLF
] = pFilter
->Encode(L
"\r\n"); // x0d x0a
545 if (bUseSVNCompatibleEOLs
)
547 // when using EOLs that are supported by the svn lib,
548 // we have to use the same EOLs as the file has in case
549 // they're already supported, but a different supported one
550 // in case the original one isn't supported.
551 // Only this way the option "ignore EOLs (recommended)" unchecked
552 // actually shows the lines as different.
553 // However, the diff won't find and differences in EOLs
554 // for these special EOLs if they differ between those special ones
556 // But it will work properly for the most common EOLs LF/CR/CRLF.
557 oEncodedEol
[EOL_LFCR
] = oEncodedEol
[EOL_CR
];
558 for (int nEol
= 0; nEol
<EOL_NOENDING
; nEol
++)
560 if (oEncodedEol
[nEol
].IsEmpty())
561 oEncodedEol
[nEol
] = oEncodedEol
[EOL_LF
];
566 oEncodedEol
[EOL_LFCR
] = pFilter
->Encode(L
"\n\r");
567 oEncodedEol
[EOL_VT
] = pFilter
->Encode(L
"\v"); // x0b
568 oEncodedEol
[EOL_FF
] = pFilter
->Encode(L
"\f"); // x0c
569 oEncodedEol
[EOL_NEL
] = pFilter
->Encode(L
"\x85");
570 oEncodedEol
[EOL_LS
] = pFilter
->Encode(L
"\x2028");
571 oEncodedEol
[EOL_PS
] = pFilter
->Encode(L
"\x2029");
573 oEncodedEol
[EOL_AUTOLINE
] = oEncodedEol
[m_SaveParams
.m_LineEndings
==EOL_AUTOLINE
575 : m_SaveParams
.m_LineEndings
];
// write the lines; bInBlockComment carries an open block comment over to the next line
577 bool bInBlockComment
= false;
578 for (int i
=0; i
<GetCount(); i
++)
580 CString sLineT
= GetAt(i
);
582 bInBlockComment
= StripComments(sLineT
, bInBlockComment
);
584 LineRegex(sLineT
, rx
, replacement
);
585 StripWhiteSpace(sLineT
, dwIgnoreWhitespaces
, bBlame
);
587 sLineT
= sLineT
.MakeLower();
588 pFilter
->Write(sLineT
);
589 EOL eEol
= GetLineEnding(i
);
590 pFilter
->Write(oEncodedEol
[eEol
]);
595 catch (CException
* e
)
// let the exception write its message straight into m_sErrorString
597 CString
* psErrorString
= const_cast<CString
*>(&m_sErrorString
);
598 e
->GetErrorMessage(psErrorString
->GetBuffer(4096), 4096);
599 psErrorString
->ReleaseBuffer();
606 void CFileTextLines::SetErrorString()
608 m_sErrorString
= CFormatMessageWrapper();
611 void CFileTextLines::CopySettings(CFileTextLines
* pFileToCopySettingsTo
) const
613 if (pFileToCopySettingsTo
)
615 pFileToCopySettingsTo
->m_SaveParams
= m_SaveParams
;
/**
 * Returns a display name for the encoding \a eEncoding as a string literal.
 *
 * NOTE(review): only the names of the two UTF-16 variants with byte order mark
 * are visible in this listing; the remaining names and the selecting
 * conditions are not shown.
 */
619 const wchar_t * CFileTextLines::GetEncodingName(UnicodeType eEncoding
)
630 return L
"UTF-16LE BOM";
634 return L
"UTF-16BE BOM";
647 bool CFileTextLines::StripComments( CString
& sLine
, bool bInBlockComment
)
650 int oldStartPos
= -1;
655 int endpos
= sLine
.Find(m_sCommentBlockEnd
);
658 sLine
= sLine
.Left(startpos
) + sLine
.Mid(endpos
+m_sCommentBlockEnd
.GetLength());
659 bInBlockComment
= false;
663 sLine
= sLine
.Left(startpos
);
667 if (!bInBlockComment
)
669 startpos
= m_sCommentBlockStart
.IsEmpty() ? -1 : sLine
.Find(m_sCommentBlockStart
);
670 int startpos2
= m_sCommentLine
.IsEmpty() ? -1 : sLine
.Find(m_sCommentLine
);
671 if ( ((startpos2
< startpos
) && (startpos2
>= 0)) ||
672 ((startpos2
>= 0) && (startpos
< 0)) )
675 // look if there's a string marker (" or ') before that
676 // note: this check is not fully correct. For example, it
677 // does not account for escaped chars or even multiline strings.
678 // but it has to be fast, so this has to do...
681 auto spos
= sLine
.Find('"');
682 while ((spos
>= 0) && (spos
< startpos2
))
685 spos
= sLine
.Find('"', spos
+ 1);
687 auto cpos
= sLine
.Find('\'');
688 while ((cpos
>= 0) && (cpos
< startpos2
))
691 cpos
= sLine
.Find('"', cpos
+ 1);
693 if ((scount
% 2 == 0) && (ccount
% 2 == 0))
695 // line comment, erase the rest of the line
696 sLine
= sLine
.Left(startpos2
);
699 if (startpos
== oldStartPos
)
701 oldStartPos
= startpos
;
703 else if (startpos
>= 0)
705 // starting block comment
706 bInBlockComment
= true;
709 } while (startpos
>= 0);
711 return bInBlockComment
;
714 void CFileTextLines::LineRegex( CString
& sLine
, const std::wregex
& rx
, const std::wstring
& replacement
) const
716 std::wstring str
= (LPCTSTR
)sLine
;
717 std::wstring str2
= std::regex_replace(str
, rx
, replacement
);
718 sLine
= str2
.c_str();
/**
 * Makes sure the buffer has room for at least \a nNewSize bytes.
 *
 * Nothing happens if the current allocation is already big enough. Otherwise
 * the old storage is released and a new one is allocated; the previous
 * content is NOT carried over.
 *
 * NOTE(review): statements between the delete and the new allocation are not
 * visible in this listing; confirm whether nNewSize is adjusted there.
 */
722 void CBuffer::ExpandToAtLeast(int nNewSize
)
724 if (nNewSize
>m_nAllocated
)
726 delete [] m_pBuffer
; // we don't preserve buffer content intentionally
729 m_pBuffer
=new BYTE
[nNewSize
];
730 m_nAllocated
=nNewSize
;
/**
 * Prepares the buffer for \a nUsed bytes of content by making sure the
 * allocation is big enough (see ExpandToAtLeast()).
 *
 * NOTE(review): presumably \a nUsed is also stored as the used length; that
 * statement is not visible in this listing - confirm.
 */
734 void CBuffer::SetLength(int nUsed
)
736 ExpandToAtLeast(nUsed
);
740 void CBuffer::Swap(CBuffer
& Src
)
742 std::swap(Src
.m_nAllocated
, m_nAllocated
);
743 std::swap(Src
.m_pBuffer
, m_pBuffer
);
744 std::swap(Src
.m_nUsed
, m_nUsed
);
/**
 * Copies the used content of \a Src into this buffer: the length is set to
 * Src.m_nUsed and m_nUsed bytes are then copied with memcpy.
 *
 * NOTE(review): the lines in front of the SetLength() call are not visible in
 * this listing; presumably they guard the copy - confirm.
 */
747 void CBuffer::Copy(const CBuffer
& Src
)
751 SetLength(Src
.m_nUsed
);
752 memcpy(m_pBuffer
, Src
.m_pBuffer
, m_nUsed
);
758 bool CBaseFilter::Decode(/*in out*/ CBuffer
& data
)
760 int nFlags
= (m_nCodePage
==CP_ACP
) ? MB_PRECOMPOSED
: 0;
761 // dry decode is around 8 times faster then real one, alternatively we can set buffer to max length
762 int nReadChars
= MultiByteToWideChar(m_nCodePage
, nFlags
, (LPCSTR
)data
, data
.GetLength(), nullptr, 0);
763 m_oBuffer
.SetLength(nReadChars
*sizeof(wchar_t));
764 int ret2
= MultiByteToWideChar(m_nCodePage
, nFlags
, (LPCSTR
)data
, data
.GetLength(), (LPWSTR
)(void *)m_oBuffer
, nReadChars
);
765 if (ret2
!= nReadChars
)
769 data
.Swap(m_oBuffer
);
773 const CBuffer
& CBaseFilter::Encode(const CString
& s
)
775 m_oBuffer
.SetLength(s
.GetLength()*3+1); // set buffer to guessed max size
776 int nConvertedLen
= WideCharToMultiByte(m_nCodePage
, 0, (LPCTSTR
)s
, s
.GetLength(), (LPSTR
)m_oBuffer
, m_oBuffer
.GetLength(), nullptr, nullptr);
777 m_oBuffer
.SetLength(nConvertedLen
); // set buffer to used size
783 bool CUtf16leFilter::Decode(/*in out*/ CBuffer
& /*data*/)
785 // we believe data is ok for use
789 const CBuffer
& CUtf16leFilter::Encode(const CString
& s
)
791 int nNeedBytes
= s
.GetLength()*sizeof(TCHAR
);
792 m_oBuffer
.SetLength(nNeedBytes
);
793 memcpy((void *)m_oBuffer
, (LPCTSTR
)s
, nNeedBytes
);
799 bool CUtf16beFilter::Decode(/*in out*/ CBuffer
& data
)
801 int nNeedBytes
= data
.GetLength();
802 // make in place WORD BYTEs swap
803 UINT64
* p_qw
= (UINT64
*)(void *)data
;
804 int nQwords
= nNeedBytes
/8;
805 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
807 p_qw
[nQword
] = WordSwapBytes(p_qw
[nQword
]);
809 wchar_t * p_w
= (wchar_t *)p_qw
;
810 int nWords
= nNeedBytes
/2;
811 for (int nWord
= nQwords
*4; nWord
<nWords
; nWord
++)
813 p_w
[nWord
] = WideCharSwap(p_w
[nWord
]);
815 return CUtf16leFilter::Decode(data
);
818 const CBuffer
& CUtf16beFilter::Encode(const CString
& s
)
820 int nNeedBytes
= s
.GetLength()*sizeof(TCHAR
);
821 m_oBuffer
.SetLength(nNeedBytes
);
822 // copy swaping BYTE order in WORDs
823 const UINT64
* p_qwIn
= (const UINT64
*)(LPCTSTR
)s
;
824 UINT64
* p_qwOut
= (UINT64
*)(void *)m_oBuffer
;
825 int nQwords
= nNeedBytes
/8;
826 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
828 p_qwOut
[nQword
] = WordSwapBytes(p_qwIn
[nQword
]);
830 wchar_t * p_wIn
= (wchar_t *)p_qwIn
;
831 wchar_t * p_wOut
= (wchar_t *)p_qwOut
;
832 int nWords
= nNeedBytes
/2;
833 for (int nWord
= nQwords
*4; nWord
<nWords
; nWord
++)
835 p_wOut
[nWord
] = WideCharSwap(p_wIn
[nWord
]);
842 bool CUtf32leFilter::Decode(/*in out*/ CBuffer
& data
)
844 // UTF32 have four bytes per char
845 int nReadChars
= data
.GetLength()/4;
846 UINT32
* p32
= (UINT32
*)(void *)data
;
848 // count chars which needs surrogate pair
849 int nSurrogatePairCount
= 0;
850 for (int i
= 0; i
<nReadChars
; ++i
)
852 if (p32
[i
]<0x110000 && p32
[i
]>=0x10000)
854 ++nSurrogatePairCount
;
859 m_oBuffer
.SetLength((nReadChars
+nSurrogatePairCount
)*sizeof(wchar_t));
860 wchar_t * pOut
= (wchar_t *)m_oBuffer
;
861 for (int i
= 0; i
<nReadChars
; ++i
, ++pOut
)
863 UINT32 zChar
= p32
[i
];
866 *pOut
=0xfffd; // ? mark
868 else if (zChar
>=0x10000)
871 pOut
[0] = ((zChar
>>10)&0x3ff) | 0xd800; // lead surrogate
872 pOut
[1] = (zChar
&0x7ff) | 0xdc00; // trail surrogate
877 *pOut
= (wchar_t)zChar
;
880 data
.Swap(m_oBuffer
);
884 const CBuffer
& CUtf32leFilter::Encode(const CString
& s
)
886 int nInWords
= s
.GetLength();
887 m_oBuffer
.SetLength(nInWords
*2);
889 LPCTSTR p_In
= (LPCTSTR
)s
;
890 UINT32
* p_Out
= (UINT32
*)(void *)m_oBuffer
;
892 for (int nInWord
= 0; nInWord
<nInWords
; nInWord
++, nOutDword
++)
894 UINT32 zChar
= p_In
[nInWord
];
895 if ((zChar
&0xfc00) == 0xd800) // lead surrogate
897 if (nInWord
+1<nInWords
&& (p_In
[nInWord
+1]&0xfc00) == 0xdc00) // trail surrogate follows
899 zChar
= 0x10000 + ((zChar
&0x3ff)<<10) + (p_In
[++nInWord
]&0x3ff);
903 zChar
= 0xfffd; // ? mark
906 else if ((zChar
&0xfc00) == 0xdc00) // trail surrogate without lead
908 zChar
= 0xfffd; // ? mark
910 p_Out
[nOutDword
] = zChar
;
912 m_oBuffer
.SetLength(nOutDword
*4); // store length reduced by surrogates
918 bool CUtf32beFilter::Decode(/*in out*/ CBuffer
& data
)
921 // swap BYTEs order in DWORDs
922 UINT64
* p64
= (UINT64
*)(void *)data
;
923 int nQwords
= data
.GetLength()/8;
924 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
926 p64
[nQword
] = DwordSwapBytes(p64
[nQword
]);
929 UINT32
* p32
= (UINT32
*)p64
;
930 int nDwords
= data
.GetLength()/4;
931 for (int nDword
= nQwords
*2; nDword
<nDwords
; nDword
++)
933 p32
[nDword
] = DwordSwapBytes(p32
[nDword
]);
935 return CUtf32leFilter::Decode(data
);
938 const CBuffer
& CUtf32beFilter::Encode(const CString
& s
)
940 CUtf32leFilter::Encode(s
);
942 // swap BYTEs order in DWORDs
943 UINT64
* p64
= (UINT64
*)(void *)m_oBuffer
;
944 int nQwords
= m_oBuffer
.GetLength()/8;
945 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
947 p64
[nQword
] = DwordSwapBytes(p64
[nQword
]);
950 UINT32
* p32
= (UINT32
*)p64
;
951 int nDwords
= m_oBuffer
.GetLength()/4;
952 for (int nDword
= nQwords
*2; nDword
<nDwords
; nDword
++)
954 p32
[nDword
] = DwordSwapBytes(p32
[nDword
]);