Use nullptr instead of NULL
[TortoiseGit.git] / src / TortoiseMerge / FileTextLines.cpp
blob501956700069fee3182080ec466b112becb7ade5
// TortoiseGitMerge - a Diff/Patch program

// Copyright (C) 2016 - TortoiseGit
// Copyright (C) 2007-2016 - TortoiseSVN

// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software Foundation,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
//
20 #include "stdafx.h"
21 #include "resource.h"
22 #include "UnicodeUtils.h"
23 #include "registry.h"
24 #include "FileTextLines.h"
25 #include "FormatMessageWrapper.h"
26 #include "SmartHandle.h"
// Exchanges the low and the high byte of a wide character.
// The combined value is converted back to wchar_t on return.
// (_byteswap_ushort(nValue) would be an alternative on 16-bit wchar_t.)
wchar_t inline WideCharSwap(wchar_t nValue)
{
	const auto movedDown = nValue >> 8;
	const auto movedUp = nValue << 8;
	return static_cast<wchar_t>(movedDown | movedUp);
}
34 UINT64 inline WordSwapBytes(UINT64 nValue)
36 return ((nValue&0xff00ff00ff00ff)<<8) | ((nValue>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
39 UINT32 inline DwordSwapBytes(UINT32 nValue)
41 UINT32 nRet = (nValue<<16) | (nValue>>16); // swap WORDs
42 nRet = ((nRet&0xff00ff)<<8) | ((nRet>>8)&0xff00ff); // swap BYTESs in WORDs
43 return nRet;
44 //return _byteswap_ulong(nValue);
47 UINT64 inline DwordSwapBytes(UINT64 nValue)
49 UINT64 nRet = ((nValue&0xffff0000ffffL)<<16) | ((nValue>>16)&0xffff0000ffffL); // swap WORDs in DWORDs
50 nRet = ((nRet&0xff00ff00ff00ff)<<8) | ((nRet>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
51 return nRet;
54 CFileTextLines::CFileTextLines(void)
55 : m_bNeedsConversion(false)
56 , m_bKeepEncoding(false)
58 m_SaveParams.m_UnicodeType = CFileTextLines::AUTOTYPE;
59 m_SaveParams.m_LineEndings = EOL_AUTOLINE;
62 CFileTextLines::~CFileTextLines(void)
66 CFileTextLines::UnicodeType CFileTextLines::CheckUnicodeType(LPVOID pBuffer, int cb)
68 if (cb < 2)
69 return CFileTextLines::ASCII;
70 const UINT32 * const pVal32 = (UINT32 *)pBuffer;
71 const UINT16 * const pVal16 = (UINT16 *)pBuffer;
72 const UINT8 * const pVal8 = (UINT8 *)pBuffer;
73 // scan the whole buffer for a 0x00000000 sequence
74 // if found, we assume a binary file
75 int nDwords = cb/4;
76 for (int j=0; j<nDwords; ++j)
78 if (0x00000000 == pVal32[j])
79 return CFileTextLines::BINARY;
81 if (cb >=4 )
83 if (*pVal32 == 0x0000FEFF)
85 return CFileTextLines::UTF32_LE;
87 if (*pVal32 == 0xFFFE0000)
89 return CFileTextLines::UTF32_BE;
92 if (*pVal16 == 0xFEFF)
94 return CFileTextLines::UTF16_LEBOM;
96 if (*pVal16 == 0xFFFE)
98 return CFileTextLines::UTF16_BEBOM;
100 if (cb < 3)
101 return CFileTextLines::ASCII;
102 if (*pVal16 == 0xBBEF)
104 if (pVal8[2] == 0xBF)
105 return CFileTextLines::UTF8BOM;
107 // check for illegal UTF8 sequences
108 bool bNonANSI = false;
109 int nNeedData = 0;
110 int i=0;
111 int nullcount = 0;
112 for (; i < cb; ++i)
114 if (pVal8[i] == 0)
116 ++nullcount;
117 // count the null chars, we do not want to treat an ASCII/UTF8 file
118 // as UTF16 just because of some null chars that might be accidentally
119 // in the file.
120 // Use an arbitrary value of one fiftieth of the file length as
121 // the limit after which a file is considered UTF16.
122 if (nullcount >(cb / 50))
124 // null-chars are not allowed for ASCII or UTF8, that means
125 // this file is most likely UTF16 encoded
126 if (i % 2)
127 return CFileTextLines::UTF16_LE;
128 else
129 return CFileTextLines::UTF16_BE;
132 if ((pVal8[i] & 0x80) != 0) // non ASCII
134 bNonANSI = true;
135 break;
138 // check remaining text for UTF-8 validity
139 for (; i<cb; ++i)
141 UINT8 zChar = pVal8[i];
142 if ((zChar & 0x80)==0) // Ascii
144 if (zChar == 0)
146 ++nullcount;
147 // count the null chars, we do not want to treat an ASCII/UTF8 file
148 // as UTF16 just because of some null chars that might be accidentally
149 // in the file.
150 // Use an arbitrary value of one fiftieth of the file length as
151 // the limit after which a file is considered UTF16.
152 if (nullcount > (cb / 50))
154 // null-chars are not allowed for ASCII or UTF8, that means
155 // this file is most likely UTF16 encoded
156 if (i%2)
157 return CFileTextLines::UTF16_LE;
158 else
159 return CFileTextLines::UTF16_BE;
161 nNeedData = 0;
163 else if (nNeedData)
165 return CFileTextLines::ASCII;
167 continue;
169 if ((zChar & 0x40)==0) // top bit
171 if (!nNeedData)
172 return CFileTextLines::ASCII;
173 --nNeedData;
175 else if (nNeedData)
177 return CFileTextLines::ASCII;
179 else if ((zChar & 0x20)==0) // top two bits
181 if (zChar<=0xC1)
182 return CFileTextLines::ASCII;
183 nNeedData = 1;
185 else if ((zChar & 0x10)==0) // top three bits
187 nNeedData = 2;
189 else if ((zChar & 0x08)==0) // top four bits
191 if (zChar>=0xf5)
192 return CFileTextLines::ASCII;
193 nNeedData = 3;
195 else
196 return CFileTextLines::ASCII;
198 if (bNonANSI && nNeedData==0)
199 // if get here thru nonAscii and no missing data left then its valid UTF8
200 return CFileTextLines::UTF8;
201 if (!bNonANSI && (DWORD(CRegDWORD(L"Software\\TortoiseGitMerge\\UseUTF8", FALSE))))
202 return CFileTextLines::UTF8;
203 return CFileTextLines::ASCII;
207 BOOL CFileTextLines::Load(const CString& sFilePath, int lengthHint /* = 0*/)
209 WCHAR exceptionError[1000] = {0};
210 m_SaveParams.m_LineEndings = EOL_AUTOLINE;
211 if (!m_bKeepEncoding)
212 m_SaveParams.m_UnicodeType = CFileTextLines::AUTOTYPE;
213 RemoveAll();
214 if(lengthHint != 0)
216 Reserve(lengthHint);
219 if (PathIsDirectory(sFilePath))
221 m_sErrorString.Format(IDS_ERR_FILE_NOTAFILE, (LPCTSTR)sFilePath);
222 return FALSE;
225 if (!PathFileExists(sFilePath))
227 //file does not exist, so just return SUCCESS
228 return TRUE;
231 CAutoFile hFile = CreateFile(sFilePath, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_DELETE | FILE_SHARE_WRITE, nullptr, OPEN_EXISTING, 0, nullptr);
232 if (!hFile)
234 SetErrorString();
235 return FALSE;
238 LARGE_INTEGER fsize;
239 if (!GetFileSizeEx(hFile, &fsize))
241 SetErrorString();
242 return FALSE;
244 if (fsize.HighPart)
246 // file is way too big for us
247 m_sErrorString.LoadString(IDS_ERR_FILE_TOOBIG);
248 return FALSE;
251 // create buffer
252 // If new[] was done for type T delete[] must be called on a pointer of type T*,
253 // otherwise the behavior is undefined.
254 // +1 is to address possible truncation when integer division is done
255 CBuffer oFile;
258 oFile.SetLength(fsize.LowPart);
260 catch (CMemoryException* e)
262 e->GetErrorMessage(exceptionError, _countof(exceptionError));
263 m_sErrorString = exceptionError;
264 return FALSE;
267 // load file
268 DWORD dwReadBytes = 0;
269 if (!ReadFile(hFile, (void*)oFile, fsize.LowPart, &dwReadBytes, nullptr))
271 SetErrorString();
272 return FALSE;
274 hFile.CloseHandle();
276 // detect type
277 if (m_SaveParams.m_UnicodeType == CFileTextLines::AUTOTYPE)
279 m_SaveParams.m_UnicodeType = this->CheckUnicodeType((LPVOID)oFile, dwReadBytes);
281 // enforce conversion for all but ASCII and UTF8 type
282 m_bNeedsConversion = (m_SaveParams.m_UnicodeType != CFileTextLines::UTF8) && (m_SaveParams.m_UnicodeType != CFileTextLines::ASCII);
284 // we may have to convert the file content - CString is UTF16LE
287 CBaseFilter* pFilter = nullptr;
288 switch (m_SaveParams.m_UnicodeType)
290 case BINARY:
291 m_sErrorString.Format(IDS_ERR_FILE_BINARY, (LPCTSTR)sFilePath);
292 return FALSE;
293 case UTF8:
294 case UTF8BOM:
295 pFilter = new CUtf8Filter(nullptr);
296 break;
297 default:
298 case ASCII:
299 pFilter = new CAsciiFilter(nullptr);
300 break;
301 case UTF16_BE:
302 case UTF16_BEBOM:
303 pFilter = new CUtf16beFilter(nullptr);
304 break;
305 case UTF16_LE:
306 case UTF16_LEBOM:
307 pFilter = new CUtf16leFilter(nullptr);
308 break;
309 case UTF32_BE:
310 pFilter = new CUtf32beFilter(nullptr);
311 break;
312 case UTF32_LE:
313 pFilter = new CUtf32leFilter(nullptr);
314 break;
316 pFilter->Decode(oFile);
317 delete pFilter;
319 catch (CMemoryException* e)
321 e->GetErrorMessage(exceptionError, _countof(exceptionError));
322 m_sErrorString = exceptionError;
323 return FALSE;
326 int nReadChars=oFile.GetLength()/sizeof(wchar_t);
327 wchar_t * pTextBuf = (wchar_t *)oFile;
328 wchar_t * pLineStart = pTextBuf;
329 if ((m_SaveParams.m_UnicodeType == UTF8BOM)
330 || (m_SaveParams.m_UnicodeType == UTF16_LEBOM)
331 || (m_SaveParams.m_UnicodeType == UTF16_BEBOM)
332 || (m_SaveParams.m_UnicodeType == UTF32_LE)
333 || (m_SaveParams.m_UnicodeType == UTF32_BE))
335 // ignore the BOM
336 ++pTextBuf;
337 ++pLineStart;
338 --nReadChars;
341 // fill in the lines into the array
342 size_t countEOLs[EOL__COUNT] = { 0 };
343 CFileTextLine oTextLine;
344 for (int i = nReadChars; i; --i)
346 EOL eEol;
347 switch (*pTextBuf++)
349 case '\r':
350 // crlf line ending or cr line ending
351 eEol = ((i > 1) && *(pTextBuf) == '\n') ? EOL_CRLF : EOL_CR;
352 break;
353 case '\n':
354 // lfcr line ending or lf line ending
355 eEol = ((i > 1) && *(pTextBuf) == '\r') ? EOL_LFCR : EOL_LF;
356 if (eEol == EOL_LFCR)
358 // LFCR is very rare on Windows, so we have to double check
359 // that this is not just a LF followed by CRLF
360 if (((countEOLs[EOL_CRLF] > 1) || (countEOLs[EOL_LF] > 1) || (GetCount() < 2)) &&
361 ((i > 2) && (*(pTextBuf+1) == '\n')))
363 // change the EOL back to a simple LF
364 eEol = EOL_LF;
367 break;
368 case 0x000b:
369 eEol = EOL_VT;
370 break;
371 case 0x000c:
372 eEol = EOL_FF;
373 break;
374 case 0x0085:
375 eEol = EOL_NEL;
376 break;
377 case 0x2028:
378 eEol = EOL_LS;
379 break;
380 case 0x2029:
381 eEol = EOL_PS;
382 break;
383 default:
384 continue;
386 oTextLine.sLine = CString(pLineStart, (int)(pTextBuf-pLineStart)-1);
387 oTextLine.eEnding = eEol;
388 CStdFileLineArray::Add(oTextLine);
389 ++countEOLs[eEol];
390 if (eEol==EOL_CRLF || eEol==EOL_LFCR)
392 ++pTextBuf;
393 --i;
395 pLineStart = pTextBuf;
397 CString line(pLineStart, (int)(pTextBuf-pLineStart));
398 Add(line, EOL_NOENDING);
400 // some EOLs are not supported by the svn diff lib.
401 m_bNeedsConversion |= (countEOLs[EOL_CRLF]!=0);
402 m_bNeedsConversion |= (countEOLs[EOL_FF]!=0);
403 m_bNeedsConversion |= (countEOLs[EOL_VT]!=0);
404 m_bNeedsConversion |= (countEOLs[EOL_NEL]!=0);
405 m_bNeedsConversion |= (countEOLs[EOL_LS]!=0);
406 m_bNeedsConversion |= (countEOLs[EOL_PS]!=0);
408 size_t eolmax = 0;
409 for (int nEol = 0; nEol<EOL__COUNT; nEol++)
411 if (eolmax < countEOLs[nEol])
413 eolmax = countEOLs[nEol];
414 m_SaveParams.m_LineEndings = (EOL)nEol;
418 return TRUE;
421 void CFileTextLines::StripWhiteSpace(CString& sLine, DWORD dwIgnoreWhitespaces, bool blame)
423 if (blame)
425 if (sLine.GetLength() > 66)
426 sLine = sLine.Mid(66);
428 switch (dwIgnoreWhitespaces)
430 case 0:
431 // Compare whitespaces
432 // do nothing
433 break;
434 case 1:
435 // Ignore all whitespaces
436 sLine.TrimLeft(L" \t");
437 sLine.TrimRight(L" \t");
438 break;
439 case 2:
440 // Ignore leading whitespace
441 sLine.TrimLeft(L" \t");
442 break;
443 case 3:
444 // Ignore ending whitespace
445 sLine.TrimRight(L" \t");
446 break;
451 Encoding pattern:
452 - encode & save BOM
453 - Get Line
454 - modify line - whitespaces, lowercase
455 - encode & save line
456 - get cached encoded eol
457 - save eol
459 BOOL CFileTextLines::Save( const CString& sFilePath
460 , bool bSaveAsUTF8 /*= false */
461 , bool bUseSVNCompatibleEOLs /*= false */
462 , DWORD dwIgnoreWhitespaces /*= 0 */
463 , BOOL bIgnoreCase /*= FALSE */
464 , bool bBlame /*= false*/
465 , bool bIgnoreComments /*= false*/
466 , const CString& linestart /*= CString()*/
467 , const CString& blockstart /*= CString()*/
468 , const CString& blockend /*= CString()*/
469 , const std::wregex& rx /*= std::wregex(L"")*/
470 , const std::wstring& replacement /*=L""*/)
472 m_sCommentLine = linestart;
473 m_sCommentBlockStart = blockstart;
474 m_sCommentBlockEnd = blockend;
478 CString destPath = sFilePath;
479 // now make sure that the destination directory exists
480 int ind = 0;
481 while (destPath.Find('\\', ind)>=2)
483 if (!PathIsDirectory(destPath.Left(destPath.Find('\\', ind))))
485 if (!CreateDirectory(destPath.Left(destPath.Find('\\', ind)), nullptr))
486 return FALSE;
488 ind = destPath.Find('\\', ind)+1;
491 CStdioFile file; // Hugely faster than CFile for big file writes - because it uses buffering
492 if (!file.Open(sFilePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary | CFile::shareDenyNone))
494 const_cast<CString *>(&m_sErrorString)->Format(IDS_ERR_FILE_OPEN, (LPCTSTR)sFilePath);
495 return FALSE;
498 CBaseFilter* pFilter = nullptr;
499 bool bSaveBom = true;
500 CFileTextLines::UnicodeType eUnicodeType = bSaveAsUTF8 ? CFileTextLines::UTF8 : m_SaveParams.m_UnicodeType;
501 switch (eUnicodeType)
503 default:
504 case CFileTextLines::ASCII:
505 bSaveBom = false;
506 pFilter = new CAsciiFilter(&file);
507 break;
508 case CFileTextLines::UTF8:
509 bSaveBom = false;
510 case CFileTextLines::UTF8BOM:
511 pFilter = new CUtf8Filter(&file);
512 break;
513 case CFileTextLines::UTF16_BE:
514 bSaveBom = false;
515 pFilter = new CUtf16beFilter(&file);
516 break;
517 case CFileTextLines::UTF16_BEBOM:
518 pFilter = new CUtf16beFilter(&file);
519 break;
520 case CFileTextLines::UTF16_LE:
521 bSaveBom = false;
522 pFilter = new CUtf16leFilter(&file);
523 break;
524 case CFileTextLines::UTF16_LEBOM:
525 pFilter = new CUtf16leFilter(&file);
526 break;
527 case CFileTextLines::UTF32_BE:
528 pFilter = new CUtf32beFilter(&file);
529 break;
530 case CFileTextLines::UTF32_LE:
531 pFilter = new CUtf32leFilter(&file);
532 break;
535 if (bSaveBom)
537 //first write the BOM
538 pFilter->Write(L"\xfeff");
540 // cache EOLs
541 CBuffer oEncodedEol[EOL__COUNT];
542 oEncodedEol[EOL_LF] = pFilter->Encode(L"\n"); // x0a
543 oEncodedEol[EOL_CR] = pFilter->Encode(L"\r"); // x0d
544 oEncodedEol[EOL_CRLF] = pFilter->Encode(L"\r\n"); // x0d x0a
545 if (bUseSVNCompatibleEOLs)
547 // when using EOLs that are supported by the svn lib,
548 // we have to use the same EOLs as the file has in case
549 // they're already supported, but a different supported one
550 // in case the original one isn't supported.
551 // Only this way the option "ignore EOLs (recommended)" unchecked
552 // actually shows the lines as different.
553 // However, the diff won't find and differences in EOLs
554 // for these special EOLs if they differ between those special ones
555 // listed below.
556 // But it will work properly for the most common EOLs LF/CR/CRLF.
557 oEncodedEol[EOL_LFCR] = oEncodedEol[EOL_CR];
558 for (int nEol = 0; nEol<EOL_NOENDING; nEol++)
560 if (oEncodedEol[nEol].IsEmpty())
561 oEncodedEol[nEol] = oEncodedEol[EOL_LF];
564 else
566 oEncodedEol[EOL_LFCR] = pFilter->Encode(L"\n\r");
567 oEncodedEol[EOL_VT] = pFilter->Encode(L"\v"); // x0b
568 oEncodedEol[EOL_FF] = pFilter->Encode(L"\f"); // x0c
569 oEncodedEol[EOL_NEL] = pFilter->Encode(L"\x85");
570 oEncodedEol[EOL_LS] = pFilter->Encode(L"\x2028");
571 oEncodedEol[EOL_PS] = pFilter->Encode(L"\x2029");
573 oEncodedEol[EOL_AUTOLINE] = oEncodedEol[m_SaveParams.m_LineEndings==EOL_AUTOLINE
574 ? EOL_CRLF
575 : m_SaveParams.m_LineEndings];
577 bool bInBlockComment = false;
578 for (int i=0; i<GetCount(); i++)
580 CString sLineT = GetAt(i);
581 if (bIgnoreComments)
582 bInBlockComment = StripComments(sLineT, bInBlockComment);
583 if (!rx._Empty())
584 LineRegex(sLineT, rx, replacement);
585 StripWhiteSpace(sLineT, dwIgnoreWhitespaces, bBlame);
586 if (bIgnoreCase)
587 sLineT = sLineT.MakeLower();
588 pFilter->Write(sLineT);
589 EOL eEol = GetLineEnding(i);
590 pFilter->Write(oEncodedEol[eEol]);
592 delete pFilter;
593 file.Close();
595 catch (CException * e)
597 CString * psErrorString = const_cast<CString *>(&m_sErrorString);
598 e->GetErrorMessage(psErrorString->GetBuffer(4096), 4096);
599 psErrorString->ReleaseBuffer();
600 e->Delete();
601 return FALSE;
603 return TRUE;
606 void CFileTextLines::SetErrorString()
608 m_sErrorString = CFormatMessageWrapper();
611 void CFileTextLines::CopySettings(CFileTextLines * pFileToCopySettingsTo) const
613 if (pFileToCopySettingsTo)
615 pFileToCopySettingsTo->m_SaveParams = m_SaveParams;
619 const wchar_t * CFileTextLines::GetEncodingName(UnicodeType eEncoding)
621 switch (eEncoding)
623 case ASCII:
624 return L"ASCII";
625 case BINARY:
626 return L"BINARY";
627 case UTF16_LE:
628 return L"UTF-16LE";
629 case UTF16_LEBOM:
630 return L"UTF-16LE BOM";
631 case UTF16_BE:
632 return L"UTF-16BE";
633 case UTF16_BEBOM:
634 return L"UTF-16BE BOM";
635 case UTF32_LE:
636 return L"UTF-32LE";
637 case UTF32_BE:
638 return L"UTF-32BE";
639 case UTF8:
640 return L"UTF-8";
641 case UTF8BOM:
642 return L"UTF-8 BOM";
644 return L"";
647 bool CFileTextLines::StripComments( CString& sLine, bool bInBlockComment )
649 int startpos = 0;
650 int oldStartPos = -1;
653 if (bInBlockComment)
655 int endpos = sLine.Find(m_sCommentBlockEnd);
656 if (endpos >= 0)
658 sLine = sLine.Left(startpos) + sLine.Mid(endpos+m_sCommentBlockEnd.GetLength());
659 bInBlockComment = false;
661 else
663 sLine = sLine.Left(startpos);
664 startpos = -1;
667 if (!bInBlockComment)
669 startpos = m_sCommentBlockStart.IsEmpty() ? -1 : sLine.Find(m_sCommentBlockStart);
670 int startpos2 = m_sCommentLine.IsEmpty() ? -1 : sLine.Find(m_sCommentLine);
671 if ( ((startpos2 < startpos) && (startpos2 >= 0)) ||
672 ((startpos2 >= 0) && (startpos < 0)) )
674 // line comment
675 // look if there's a string marker (" or ') before that
676 // note: this check is not fully correct. For example, it
677 // does not account for escaped chars or even multiline strings.
678 // but it has to be fast, so this has to do...
679 int scount = 0;
680 int ccount = 0;
681 auto spos = sLine.Find('"');
682 while ((spos >= 0) && (spos < startpos2))
684 ++scount;
685 spos = sLine.Find('"', spos + 1);
687 auto cpos = sLine.Find('\'');
688 while ((cpos >= 0) && (cpos < startpos2))
690 ++ccount;
691 cpos = sLine.Find('"', cpos + 1);
693 if ((scount % 2 == 0) && (ccount % 2 == 0))
695 // line comment, erase the rest of the line
696 sLine = sLine.Left(startpos2);
697 startpos = -1;
699 if (startpos == oldStartPos)
700 return false;
701 oldStartPos = startpos;
703 else if (startpos >= 0)
705 // starting block comment
706 bInBlockComment = true;
709 } while (startpos >= 0);
711 return bInBlockComment;
714 void CFileTextLines::LineRegex( CString& sLine, const std::wregex& rx, const std::wstring& replacement ) const
716 std::wstring str = (LPCTSTR)sLine;
717 std::wstring str2 = std::regex_replace(str, rx, replacement);
718 sLine = str2.c_str();
722 void CBuffer::ExpandToAtLeast(int nNewSize)
724 if (nNewSize>m_nAllocated)
726 delete [] m_pBuffer; // we don't preserve buffer content intentionally
727 nNewSize+=2048-1;
728 nNewSize&=~(1024-1);
729 m_pBuffer=new BYTE[nNewSize];
730 m_nAllocated=nNewSize;
734 void CBuffer::SetLength(int nUsed)
736 ExpandToAtLeast(nUsed);
737 m_nUsed = nUsed;
740 void CBuffer::Swap(CBuffer & Src)
742 std::swap(Src.m_nAllocated, m_nAllocated);
743 std::swap(Src.m_pBuffer, m_pBuffer);
744 std::swap(Src.m_nUsed, m_nUsed);
747 void CBuffer::Copy(const CBuffer & Src)
749 if (&Src != this)
751 SetLength(Src.m_nUsed);
752 memcpy(m_pBuffer, Src.m_pBuffer, m_nUsed);
758 bool CBaseFilter::Decode(/*in out*/ CBuffer & data)
760 int nFlags = (m_nCodePage==CP_ACP) ? MB_PRECOMPOSED : 0;
761 // dry decode is around 8 times faster then real one, alternatively we can set buffer to max length
762 int nReadChars = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), nullptr, 0);
763 m_oBuffer.SetLength(nReadChars*sizeof(wchar_t));
764 int ret2 = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), (LPWSTR)(void *)m_oBuffer, nReadChars);
765 if (ret2 != nReadChars)
767 return FALSE;
769 data.Swap(m_oBuffer);
770 return TRUE;
773 const CBuffer& CBaseFilter::Encode(const CString& s)
775 m_oBuffer.SetLength(s.GetLength()*3+1); // set buffer to guessed max size
776 int nConvertedLen = WideCharToMultiByte(m_nCodePage, 0, (LPCTSTR)s, s.GetLength(), (LPSTR)m_oBuffer, m_oBuffer.GetLength(), nullptr, nullptr);
777 m_oBuffer.SetLength(nConvertedLen); // set buffer to used size
778 return m_oBuffer;
783 bool CUtf16leFilter::Decode(/*in out*/ CBuffer & /*data*/)
785 // we believe data is ok for use
786 return TRUE;
789 const CBuffer& CUtf16leFilter::Encode(const CString& s)
791 int nNeedBytes = s.GetLength()*sizeof(TCHAR);
792 m_oBuffer.SetLength(nNeedBytes);
793 memcpy((void *)m_oBuffer, (LPCTSTR)s, nNeedBytes);
794 return m_oBuffer;
799 bool CUtf16beFilter::Decode(/*in out*/ CBuffer & data)
801 int nNeedBytes = data.GetLength();
802 // make in place WORD BYTEs swap
803 UINT64 * p_qw = (UINT64 *)(void *)data;
804 int nQwords = nNeedBytes/8;
805 for (int nQword = 0; nQword<nQwords; nQword++)
807 p_qw[nQword] = WordSwapBytes(p_qw[nQword]);
809 wchar_t * p_w = (wchar_t *)p_qw;
810 int nWords = nNeedBytes/2;
811 for (int nWord = nQwords*4; nWord<nWords; nWord++)
813 p_w[nWord] = WideCharSwap(p_w[nWord]);
815 return CUtf16leFilter::Decode(data);
818 const CBuffer& CUtf16beFilter::Encode(const CString& s)
820 int nNeedBytes = s.GetLength()*sizeof(TCHAR);
821 m_oBuffer.SetLength(nNeedBytes);
822 // copy swaping BYTE order in WORDs
823 const UINT64 * p_qwIn = (const UINT64 *)(LPCTSTR)s;
824 UINT64 * p_qwOut = (UINT64 *)(void *)m_oBuffer;
825 int nQwords = nNeedBytes/8;
826 for (int nQword = 0; nQword<nQwords; nQword++)
828 p_qwOut[nQword] = WordSwapBytes(p_qwIn[nQword]);
830 wchar_t * p_wIn = (wchar_t *)p_qwIn;
831 wchar_t * p_wOut = (wchar_t *)p_qwOut;
832 int nWords = nNeedBytes/2;
833 for (int nWord = nQwords*4; nWord<nWords; nWord++)
835 p_wOut[nWord] = WideCharSwap(p_wIn[nWord]);
837 return m_oBuffer;
842 bool CUtf32leFilter::Decode(/*in out*/ CBuffer & data)
844 // UTF32 have four bytes per char
845 int nReadChars = data.GetLength()/4;
846 UINT32 * p32 = (UINT32 *)(void *)data;
848 // count chars which needs surrogate pair
849 int nSurrogatePairCount = 0;
850 for (int i = 0; i<nReadChars; ++i)
852 if (p32[i]<0x110000 && p32[i]>=0x10000)
854 ++nSurrogatePairCount;
858 // fill buffer
859 m_oBuffer.SetLength((nReadChars+nSurrogatePairCount)*sizeof(wchar_t));
860 wchar_t * pOut = (wchar_t *)m_oBuffer;
861 for (int i = 0; i<nReadChars; ++i, ++pOut)
863 UINT32 zChar = p32[i];
864 if (zChar>=0x110000)
866 *pOut=0xfffd; // ? mark
868 else if (zChar>=0x10000)
870 zChar-=0x10000;
871 pOut[0] = ((zChar>>10)&0x3ff) | 0xd800; // lead surrogate
872 pOut[1] = (zChar&0x7ff) | 0xdc00; // trail surrogate
873 pOut++;
875 else
877 *pOut = (wchar_t)zChar;
880 data.Swap(m_oBuffer);
881 return TRUE;
884 const CBuffer& CUtf32leFilter::Encode(const CString& s)
886 int nInWords = s.GetLength();
887 m_oBuffer.SetLength(nInWords*2);
889 LPCTSTR p_In = (LPCTSTR)s;
890 UINT32 * p_Out = (UINT32 *)(void *)m_oBuffer;
891 int nOutDword = 0;
892 for (int nInWord = 0; nInWord<nInWords; nInWord++, nOutDword++)
894 UINT32 zChar = p_In[nInWord];
895 if ((zChar&0xfc00) == 0xd800) // lead surrogate
897 if (nInWord+1<nInWords && (p_In[nInWord+1]&0xfc00) == 0xdc00) // trail surrogate follows
899 zChar = 0x10000 + ((zChar&0x3ff)<<10) + (p_In[++nInWord]&0x3ff);
901 else
903 zChar = 0xfffd; // ? mark
906 else if ((zChar&0xfc00) == 0xdc00) // trail surrogate without lead
908 zChar = 0xfffd; // ? mark
910 p_Out[nOutDword] = zChar;
912 m_oBuffer.SetLength(nOutDword*4); // store length reduced by surrogates
913 return m_oBuffer;
918 bool CUtf32beFilter::Decode(/*in out*/ CBuffer & data)
921 // swap BYTEs order in DWORDs
922 UINT64 * p64 = (UINT64 *)(void *)data;
923 int nQwords = data.GetLength()/8;
924 for (int nQword = 0; nQword<nQwords; nQword++)
926 p64[nQword] = DwordSwapBytes(p64[nQword]);
929 UINT32 * p32 = (UINT32 *)p64;
930 int nDwords = data.GetLength()/4;
931 for (int nDword = nQwords*2; nDword<nDwords; nDword++)
933 p32[nDword] = DwordSwapBytes(p32[nDword]);
935 return CUtf32leFilter::Decode(data);
938 const CBuffer& CUtf32beFilter::Encode(const CString& s)
940 CUtf32leFilter::Encode(s);
942 // swap BYTEs order in DWORDs
943 UINT64 * p64 = (UINT64 *)(void *)m_oBuffer;
944 int nQwords = m_oBuffer.GetLength()/8;
945 for (int nQword = 0; nQword<nQwords; nQword++)
947 p64[nQword] = DwordSwapBytes(p64[nQword]);
950 UINT32 * p32 = (UINT32 *)p64;
951 int nDwords = m_oBuffer.GetLength()/4;
952 for (int nDword = nQwords*2; nDword<nDwords; nDword++)
954 p32[nDword] = DwordSwapBytes(p32[nDword]);
956 return m_oBuffer;