Use better words for "git diff-tree --cc"
[TortoiseGit.git] / src / TortoiseMerge / FileTextLines.cpp
blob553b10b07cfcca8f5f22993e105fd9dbfe3e748d
1 // TortoiseGitMerge - a Diff/Patch program
3 // Copyright (C) 2007-2014 - TortoiseSVN
5 // This program is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU General Public License
7 // as published by the Free Software Foundation; either version 2
8 // of the License, or (at your option) any later version.
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License
16 // along with this program; if not, write to the Free Software Foundation,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 #include "stdafx.h"
20 #include "resource.h"
21 #include "UnicodeUtils.h"
22 #include "registry.h"
23 #include "FileTextLines.h"
24 #include "FormatMessageWrapper.h"
25 #include "SmartHandle.h"
/**
 * Swaps the two bytes of a 16-bit UTF-16 code unit (little <-> big endian).
 *
 * Only the low 16 bits of \a nValue take part in the swap; the masks make
 * that explicit instead of relying on an implicit int -> wchar_t narrowing,
 * so the result is also correct where wchar_t is wider than 16 bits.
 *
 * \param nValue code unit to convert
 * \return the code unit with its low and high byte exchanged
 */
wchar_t inline WideCharSwap(wchar_t nValue)
{
	return static_cast<wchar_t>(((nValue >> 8) & 0x00ff) | ((nValue & 0x00ff) << 8));
	//return _byteswap_ushort(nValue);
}
33 UINT64 inline WordSwapBytes(UINT64 nValue)
35 return ((nValue&0xff00ff00ff00ff)<<8) | ((nValue>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
38 UINT32 inline DwordSwapBytes(UINT32 nValue)
40 UINT32 nRet = (nValue<<16) | (nValue>>16); // swap WORDs
41 nRet = ((nRet&0xff00ff)<<8) | ((nRet>>8)&0xff00ff); // swap BYTESs in WORDs
42 return nRet;
43 //return _byteswap_ulong(nValue);
46 UINT64 inline DwordSwapBytes(UINT64 nValue)
48 UINT64 nRet = ((nValue&0xffff0000ffffL)<<16) | ((nValue>>16)&0xffff0000ffffL); // swap WORDs in DWORDs
49 nRet = ((nRet&0xff00ff00ff00ff)<<8) | ((nRet>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
50 return nRet;
53 CFileTextLines::CFileTextLines(void)
54 : m_bNeedsConversion(false)
56 m_SaveParams.m_UnicodeType = CFileTextLines::AUTOTYPE;
57 m_SaveParams.m_LineEndings = EOL_AUTOLINE;
60 CFileTextLines::~CFileTextLines(void)
64 CFileTextLines::UnicodeType CFileTextLines::CheckUnicodeType(LPVOID pBuffer, int cb)
66 if (cb < 2)
67 return CFileTextLines::ASCII;
68 const UINT32 * const pVal32 = (UINT32 *)pBuffer;
69 const UINT16 * const pVal16 = (UINT16 *)pBuffer;
70 const UINT8 * const pVal8 = (UINT8 *)pBuffer;
71 // scan the whole buffer for a 0x00000000 sequence
72 // if found, we assume a binary file
73 int nDwords = cb/4;
74 for (int j=0; j<nDwords; ++j)
76 if (0x00000000 == pVal32[j])
77 return CFileTextLines::BINARY;
79 if (cb >=4 )
81 if (*pVal32 == 0x0000FEFF)
83 return CFileTextLines::UTF32_LE;
85 if (*pVal32 == 0xFFFE0000)
87 return CFileTextLines::UTF32_BE;
90 if (*pVal16 == 0xFEFF)
92 return CFileTextLines::UTF16_LEBOM;
94 if (*pVal16 == 0xFFFE)
96 return CFileTextLines::UTF16_BEBOM;
98 if (cb < 3)
99 return CFileTextLines::ASCII;
100 if (*pVal16 == 0xBBEF)
102 if (pVal8[2] == 0xBF)
103 return CFileTextLines::UTF8BOM;
105 // check for illegal UTF8 sequences
106 bool bNonANSI = false;
107 int nNeedData = 0;
108 int i=0;
109 int nullcount = 0;
110 if (!bNonANSI)
112 for (; i<cb; ++i)
114 if (pVal8[i] == 0)
116 ++nullcount;
117 // count the null chars, we do not want to treat an ASCII/UTF8 file
118 // as UTF16 just because of some null chars that might be accidentally
119 // in the file.
120 // Use an arbitrary value of one fiftieth of the file length as
121 // the limit after which a file is considered UTF16.
122 if (nullcount > (cb / 50))
124 // null-chars are not allowed for ASCII or UTF8, that means
125 // this file is most likely UTF16 encoded
126 if (i%2)
127 return CFileTextLines::UTF16_LE;
128 else
129 return CFileTextLines::UTF16_BE;
132 if ((pVal8[i] & 0x80)!=0) // non ASCII
134 bNonANSI = true;
135 break;
139 // check remaining text for UTF-8 validity
140 for (; i<cb; ++i)
142 UINT8 zChar = pVal8[i];
143 if ((zChar & 0x80)==0) // Ascii
145 if (zChar == 0)
147 ++nullcount;
148 // count the null chars, we do not want to treat an ASCII/UTF8 file
149 // as UTF16 just because of some null chars that might be accidentally
150 // in the file.
151 // Use an arbitrary value of one fiftieth of the file length as
152 // the limit after which a file is considered UTF16.
153 if (nullcount > (cb / 50))
155 // null-chars are not allowed for ASCII or UTF8, that means
156 // this file is most likely UTF16 encoded
157 if (i%2)
158 return CFileTextLines::UTF16_LE;
159 else
160 return CFileTextLines::UTF16_BE;
162 nNeedData = 0;
164 else if (nNeedData)
166 return CFileTextLines::ASCII;
168 continue;
170 if ((zChar & 0x40)==0) // top bit
172 if (!nNeedData)
173 return CFileTextLines::ASCII;
174 --nNeedData;
176 else if (nNeedData)
178 return CFileTextLines::ASCII;
180 else if ((zChar & 0x20)==0) // top two bits
182 if (zChar<=0xC1)
183 return CFileTextLines::ASCII;
184 nNeedData = 1;
186 else if ((zChar & 0x10)==0) // top three bits
188 nNeedData = 2;
190 else if ((zChar & 0x08)==0) // top four bits
192 if (zChar>=0xf5)
193 return CFileTextLines::ASCII;
194 nNeedData = 3;
196 else
197 return CFileTextLines::ASCII;
199 if (bNonANSI && nNeedData==0)
200 // if get here thru nonAscii and no missing data left then its valid UTF8
201 return CFileTextLines::UTF8;
202 if ((!bNonANSI)&&(DWORD(CRegDWORD(_T("Software\\TortoiseGitMerge\\UseUTF8"), FALSE))))
203 return CFileTextLines::UTF8;
204 return CFileTextLines::ASCII;
208 BOOL CFileTextLines::Load(const CString& sFilePath, int lengthHint /* = 0*/)
210 WCHAR exceptionError[1000] = {0};
211 m_SaveParams.m_LineEndings = EOL_AUTOLINE;
212 m_SaveParams.m_UnicodeType = CFileTextLines::AUTOTYPE;
213 RemoveAll();
214 if(lengthHint != 0)
216 Reserve(lengthHint);
219 if (PathIsDirectory(sFilePath))
221 m_sErrorString.Format(IDS_ERR_FILE_NOTAFILE, (LPCTSTR)sFilePath);
222 return FALSE;
225 if (!PathFileExists(sFilePath))
227 //file does not exist, so just return SUCCESS
228 return TRUE;
231 CAutoFile hFile = CreateFile(sFilePath, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_DELETE|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, NULL, NULL);
232 if (!hFile)
234 SetErrorString();
235 return FALSE;
238 LARGE_INTEGER fsize;
239 if (!GetFileSizeEx(hFile, &fsize))
241 SetErrorString();
242 return FALSE;
244 if (fsize.HighPart)
246 // file is way too big for us
247 m_sErrorString.LoadString(IDS_ERR_FILE_TOOBIG);
248 return FALSE;
251 // create buffer
252 // If new[] was done for type T delete[] must be called on a pointer of type T*,
253 // otherwise the behavior is undefined.
254 // +1 is to address possible truncation when integer division is done
255 CBuffer oFile;
258 oFile.SetLength(fsize.LowPart);
260 catch (CMemoryException* e)
262 e->GetErrorMessage(exceptionError, _countof(exceptionError));
263 m_sErrorString = exceptionError;
264 return FALSE;
267 // load file
268 DWORD dwReadBytes = 0;
269 if (!ReadFile(hFile, (void *)oFile, fsize.LowPart, &dwReadBytes, NULL))
271 SetErrorString();
272 return FALSE;
274 hFile.CloseHandle();
276 // detect type
277 if (m_SaveParams.m_UnicodeType == CFileTextLines::AUTOTYPE)
279 m_SaveParams.m_UnicodeType = this->CheckUnicodeType((LPVOID)oFile, dwReadBytes);
280 // enforce conversion for all but ASCII and UTF8 type
281 m_bNeedsConversion = (m_SaveParams.m_UnicodeType!=CFileTextLines::UTF8)&&(m_SaveParams.m_UnicodeType!=CFileTextLines::ASCII);
284 // we may have to convert the file content - CString is UTF16LE
287 CBaseFilter * pFilter = NULL;
288 switch (m_SaveParams.m_UnicodeType)
290 case BINARY:
291 m_sErrorString.Format(IDS_ERR_FILE_BINARY, (LPCTSTR)sFilePath);
292 return FALSE;
293 case UTF8:
294 case UTF8BOM:
295 pFilter = new CUtf8Filter(NULL);
296 break;
297 default:
298 case ASCII:
299 pFilter = new CAsciiFilter(NULL);
300 break;
301 case UTF16_BE:
302 case UTF16_BEBOM:
303 pFilter = new CUtf16beFilter(NULL);
304 break;
305 case UTF16_LE:
306 case UTF16_LEBOM:
307 pFilter = new CUtf16leFilter(NULL);
308 break;
309 case UTF32_BE:
310 pFilter = new CUtf32beFilter(NULL);
311 break;
312 case UTF32_LE:
313 pFilter = new CUtf32leFilter(NULL);
314 break;
316 pFilter->Decode(oFile);
317 delete pFilter;
319 catch (CMemoryException* e)
321 e->GetErrorMessage(exceptionError, _countof(exceptionError));
322 m_sErrorString = exceptionError;
323 return FALSE;
326 int nReadChars=oFile.GetLength()/sizeof(wchar_t);
327 wchar_t * pTextBuf = (wchar_t *)oFile;
328 wchar_t * pLineStart = pTextBuf;
329 if ((m_SaveParams.m_UnicodeType == UTF8BOM)
330 || (m_SaveParams.m_UnicodeType == UTF16_LEBOM)
331 || (m_SaveParams.m_UnicodeType == UTF16_BEBOM)
332 || (m_SaveParams.m_UnicodeType == UTF32_LE)
333 || (m_SaveParams.m_UnicodeType == UTF32_BE))
335 // ignore the BOM
336 ++pTextBuf;
337 ++pLineStart;
338 --nReadChars;
341 // fill in the lines into the array
342 size_t countEOLs[EOL__COUNT];
343 memset(countEOLs, 0, sizeof(countEOLs));
344 CFileTextLine oTextLine;
345 for (int i = nReadChars; i; --i)
347 EOL eEol;
348 switch (*pTextBuf++)
350 case '\r':
351 // crlf line ending or cr line ending
352 eEol = ((i > 1) && *(pTextBuf) == '\n') ? EOL_CRLF : EOL_CR;
353 break;
354 case '\n':
355 // lfcr line ending or lf line ending
356 eEol = ((i > 1) && *(pTextBuf) == '\r') ? EOL_LFCR : EOL_LF;
357 if (eEol == EOL_LFCR)
359 // LFCR is very rare on Windows, so we have to double check
360 // that this is not just a LF followed by CRLF
361 if (((countEOLs[EOL_CRLF] > 1) || (countEOLs[EOL_LF]>1)) &&
362 ((i > 2) && (*(pTextBuf+1) == '\n')))
364 // change the EOL back to a simple LF
365 eEol = EOL_LF;
368 break;
369 case 0x000b:
370 eEol = EOL_VT;
371 break;
372 case 0x000c:
373 eEol = EOL_FF;
374 break;
375 case 0x0085:
376 eEol = EOL_NEL;
377 break;
378 case 0x2028:
379 eEol = EOL_LS;
380 break;
381 case 0x2029:
382 eEol = EOL_PS;
383 break;
384 default:
385 continue;
387 oTextLine.sLine = CString(pLineStart, (int)(pTextBuf-pLineStart)-1);
388 oTextLine.eEnding = eEol;
389 CStdFileLineArray::Add(oTextLine);
390 ++countEOLs[eEol];
391 if (eEol==EOL_CRLF || eEol==EOL_LFCR)
393 ++pTextBuf;
394 --i;
396 pLineStart = pTextBuf;
398 CString line(pLineStart, (int)(pTextBuf-pLineStart));
399 Add(line, EOL_NOENDING);
401 // some EOLs are not supported by the svn diff lib.
402 m_bNeedsConversion |= (countEOLs[EOL_CRLF]!=0);
403 m_bNeedsConversion |= (countEOLs[EOL_FF]!=0);
404 m_bNeedsConversion |= (countEOLs[EOL_VT]!=0);
405 m_bNeedsConversion |= (countEOLs[EOL_NEL]!=0);
406 m_bNeedsConversion |= (countEOLs[EOL_LS]!=0);
407 m_bNeedsConversion |= (countEOLs[EOL_PS]!=0);
409 size_t eolmax = 0;
410 for (int nEol = 0; nEol<EOL__COUNT; nEol++)
412 if (eolmax < countEOLs[nEol])
414 eolmax = countEOLs[nEol];
415 m_SaveParams.m_LineEndings = (EOL)nEol;
419 return TRUE;
422 void CFileTextLines::StripWhiteSpace(CString& sLine, DWORD dwIgnoreWhitespaces, bool blame)
424 if (blame)
426 if (sLine.GetLength() > 66)
427 sLine = sLine.Mid(66);
429 switch (dwIgnoreWhitespaces)
431 case 0:
432 // Compare whitespaces
433 // do nothing
434 break;
435 case 1:
436 // Ignore all whitespaces
437 sLine.TrimLeft(_T(" \t"));
438 sLine.TrimRight(_T(" \t"));
439 break;
440 case 2:
441 // Ignore leading whitespace
442 sLine.TrimLeft(_T(" \t"));
443 break;
444 case 3:
445 // Ignore ending whitespace
446 sLine.TrimRight(_T(" \t"));
447 break;
/**
	Encoding pattern:
		- encode & save BOM
		- Get Line
		- modify line - whitespaces, lowercase
		- encode & save line
		- get cached encoded eol
		- save eol
*/
460 BOOL CFileTextLines::Save( const CString& sFilePath
461 , bool bSaveAsUTF8 /*= false */
462 , bool bUseSVNCompatibleEOLs /*= false */
463 , DWORD dwIgnoreWhitespaces /*= 0 */
464 , BOOL bIgnoreCase /*= FALSE */
465 , bool bBlame /*= false*/
466 , bool bIgnoreComments /*= false*/
467 , const CString& linestart /*= CString()*/
468 , const CString& blockstart /*= CString()*/
469 , const CString& blockend /*= CString()*/
470 , const std::wregex& rx /*= std::wregex(L"")*/
471 , const std::wstring& replacement /*=L""*/)
473 m_sCommentLine = linestart;
474 m_sCommentBlockStart = blockstart;
475 m_sCommentBlockEnd = blockend;
479 CString destPath = sFilePath;
480 // now make sure that the destination directory exists
481 int ind = 0;
482 while (destPath.Find('\\', ind)>=2)
484 if (!PathIsDirectory(destPath.Left(destPath.Find('\\', ind))))
486 if (!CreateDirectory(destPath.Left(destPath.Find('\\', ind)), NULL))
487 return FALSE;
489 ind = destPath.Find('\\', ind)+1;
492 CStdioFile file; // Hugely faster than CFile for big file writes - because it uses buffering
493 if (!file.Open(sFilePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary))
495 const_cast<CString *>(&m_sErrorString)->Format(IDS_ERR_FILE_OPEN, (LPCTSTR)sFilePath);
496 return FALSE;
499 CBaseFilter * pFilter = NULL;
500 bool bSaveBom = true;
501 CFileTextLines::UnicodeType eUnicodeType = bSaveAsUTF8 ? CFileTextLines::UTF8 : m_SaveParams.m_UnicodeType;
502 switch (eUnicodeType)
504 default:
505 case CFileTextLines::ASCII:
506 bSaveBom = false;
507 pFilter = new CAsciiFilter(&file);
508 break;
509 case CFileTextLines::UTF8:
510 bSaveBom = false;
511 case CFileTextLines::UTF8BOM:
512 pFilter = new CUtf8Filter(&file);
513 break;
514 case CFileTextLines::UTF16_BE:
515 bSaveBom = false;
516 pFilter = new CUtf16beFilter(&file);
517 break;
518 case CFileTextLines::UTF16_BEBOM:
519 pFilter = new CUtf16beFilter(&file);
520 break;
521 case CFileTextLines::UTF16_LE:
522 bSaveBom = false;
523 pFilter = new CUtf16leFilter(&file);
524 break;
525 case CFileTextLines::UTF16_LEBOM:
526 pFilter = new CUtf16leFilter(&file);
527 break;
528 case CFileTextLines::UTF32_BE:
529 pFilter = new CUtf32beFilter(&file);
530 break;
531 case CFileTextLines::UTF32_LE:
532 pFilter = new CUtf32leFilter(&file);
533 break;
536 if (bSaveBom)
538 //first write the BOM
539 pFilter->Write(L"\xfeff");
541 // cache EOLs
542 CBuffer oEncodedEol[EOL__COUNT];
543 oEncodedEol[EOL_LF] = pFilter->Encode(_T("\n")); // x0a
544 oEncodedEol[EOL_CR] = pFilter->Encode(_T("\r")); // x0d
545 oEncodedEol[EOL_CRLF] = pFilter->Encode(_T("\r\n")); // x0d x0a
546 if (bUseSVNCompatibleEOLs)
548 // when using EOLs that are supported by the svn lib,
549 // we have to use the same EOLs as the file has in case
550 // they're already supported, but a different supported one
551 // in case the original one isn't supported.
552 // Only this way the option "ignore EOLs (recommended)" unchecked
553 // actually shows the lines as different.
554 // However, the diff won't find and differences in EOLs
555 // for these special EOLs if they differ between those special ones
556 // listed below.
557 // But it will work properly for the most common EOLs LF/CR/CRLF.
558 oEncodedEol[EOL_LFCR] = oEncodedEol[EOL_CR];
559 for (int nEol = 0; nEol<EOL_NOENDING; nEol++)
561 if (oEncodedEol[nEol].IsEmpty())
562 oEncodedEol[nEol] = oEncodedEol[EOL_LF];
565 else
567 oEncodedEol[EOL_LFCR] = pFilter->Encode(_T("\n\r"));
568 oEncodedEol[EOL_VT] = pFilter->Encode(_T("\v")); // x0b
569 oEncodedEol[EOL_FF] = pFilter->Encode(_T("\f")); // x0c
570 oEncodedEol[EOL_NEL] = pFilter->Encode(_T("\x85"));
571 oEncodedEol[EOL_LS] = pFilter->Encode(_T("\x2028"));
572 oEncodedEol[EOL_PS] = pFilter->Encode(_T("\x2029"));
574 oEncodedEol[EOL_AUTOLINE] = oEncodedEol[m_SaveParams.m_LineEndings==EOL_AUTOLINE
575 ? EOL_CRLF
576 : m_SaveParams.m_LineEndings];
578 bool bInBlockComment = false;
579 for (int i=0; i<GetCount(); i++)
581 CString sLineT = GetAt(i);
582 if (bIgnoreComments)
583 bInBlockComment = StripComments(sLineT, bInBlockComment);
584 if (!rx._Empty())
585 LineRegex(sLineT, rx, replacement);
586 StripWhiteSpace(sLineT, dwIgnoreWhitespaces, bBlame);
587 if (bIgnoreCase)
588 sLineT = sLineT.MakeLower();
589 pFilter->Write(sLineT);
590 EOL eEol = GetLineEnding(i);
591 pFilter->Write(oEncodedEol[eEol]);
593 delete pFilter;
594 file.Close();
596 catch (CException * e)
598 CString * psErrorString = const_cast<CString *>(&m_sErrorString);
599 e->GetErrorMessage(psErrorString->GetBuffer(4096), 4096);
600 psErrorString->ReleaseBuffer();
601 e->Delete();
602 return FALSE;
604 return TRUE;
607 void CFileTextLines::SetErrorString()
609 m_sErrorString = CFormatMessageWrapper();
612 void CFileTextLines::CopySettings(CFileTextLines * pFileToCopySettingsTo) const
614 if (pFileToCopySettingsTo)
616 pFileToCopySettingsTo->m_SaveParams = m_SaveParams;
620 const wchar_t * CFileTextLines::GetEncodingName(UnicodeType eEncoding)
622 switch (eEncoding)
624 case ASCII:
625 return L"ASCII";
626 case BINARY:
627 return L"BINARY";
628 case UTF16_LE:
629 return L"UTF-16LE";
630 case UTF16_LEBOM:
631 return L"UTF-16LE BOM";
632 case UTF16_BE:
633 return L"UTF-16BE";
634 case UTF16_BEBOM:
635 return L"UTF-16BE BOM";
636 case UTF32_LE:
637 return L"UTF-32LE";
638 case UTF32_BE:
639 return L"UTF-32BE";
640 case UTF8:
641 return L"UTF-8";
642 case UTF8BOM:
643 return L"UTF-8 BOM";
645 return L"";
648 bool CFileTextLines::StripComments( CString& sLine, bool bInBlockComment )
650 int startpos = 0;
654 if (bInBlockComment)
656 int endpos = sLine.Find(m_sCommentBlockEnd);
657 if (endpos >= 0)
659 sLine = sLine.Left(startpos) + sLine.Mid(endpos+m_sCommentBlockEnd.GetLength());
660 bInBlockComment = false;
662 else
664 sLine = sLine.Left(startpos);
665 startpos = -1;
668 if (!bInBlockComment)
670 startpos = m_sCommentBlockStart.IsEmpty() ? -1 : sLine.Find(m_sCommentBlockStart);
671 int startpos2 = m_sCommentLine.IsEmpty() ? -1 : sLine.Find(m_sCommentLine);
672 if ( ((startpos2 < startpos) && (startpos2 >= 0)) ||
673 ((startpos2 >= 0) && (startpos < 0)) )
675 // line comment, erase the rest of the line
676 sLine = sLine.Left(startpos2);
677 startpos = -1;
679 else if (startpos >= 0)
681 // starting block comment
682 bInBlockComment = true;
685 } while (startpos >= 0);
687 return bInBlockComment;
690 void CFileTextLines::LineRegex( CString& sLine, const std::wregex& rx, const std::wstring& replacement ) const
692 std::wstring str = (LPCTSTR)sLine;
693 std::wstring str2 = std::regex_replace(str, rx, replacement);
694 sLine = str2.c_str();
698 void CBuffer::ExpandToAtLeast(int nNewSize)
700 if (nNewSize>m_nAllocated)
702 delete [] m_pBuffer; // we don't preserve buffer content intentionally
703 nNewSize+=2048-1;
704 nNewSize&=~(1024-1);
705 m_pBuffer=new BYTE[nNewSize];
706 m_nAllocated=nNewSize;
710 void CBuffer::SetLength(int nUsed)
712 ExpandToAtLeast(nUsed);
713 m_nUsed = nUsed;
716 void CBuffer::Swap(CBuffer & Src)
718 std::swap(Src.m_nAllocated, m_nAllocated);
719 std::swap(Src.m_pBuffer, m_pBuffer);
720 std::swap(Src.m_nUsed, m_nUsed);
723 void CBuffer::Copy(const CBuffer & Src)
725 if (&Src != this)
727 SetLength(Src.m_nUsed);
728 memcpy(m_pBuffer, Src.m_pBuffer, m_nUsed);
734 bool CBaseFilter::Decode(/*in out*/ CBuffer & data)
736 int nFlags = (m_nCodePage==CP_ACP) ? MB_PRECOMPOSED : 0;
737 // dry decode is around 8 times faster then real one, alternatively we can set buffer to max length
738 int nReadChars = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), NULL, 0);
739 m_oBuffer.SetLength(nReadChars*sizeof(wchar_t));
740 int ret2 = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), (LPWSTR)(void *)m_oBuffer, nReadChars);
741 if (ret2 != nReadChars)
743 return FALSE;
745 data.Swap(m_oBuffer);
746 return TRUE;
749 const CBuffer & CBaseFilter::Encode(const CString s)
751 m_oBuffer.SetLength(s.GetLength()*3+1); // set buffer to guessed max size
752 int nConvertedLen = WideCharToMultiByte(m_nCodePage, 0, (LPCTSTR)s, s.GetLength(), (LPSTR)m_oBuffer, m_oBuffer.GetLength(), NULL, NULL);
753 m_oBuffer.SetLength(nConvertedLen); // set buffer to used size
754 return m_oBuffer;
759 bool CUtf16leFilter::Decode(/*in out*/ CBuffer & /*data*/)
761 // we believe data is ok for use
762 return TRUE;
765 const CBuffer & CUtf16leFilter::Encode(const CString s)
767 int nNeedBytes = s.GetLength()*sizeof(TCHAR);
768 m_oBuffer.SetLength(nNeedBytes);
769 memcpy((void *)m_oBuffer, (LPCTSTR)s, nNeedBytes);
770 return m_oBuffer;
775 bool CUtf16beFilter::Decode(/*in out*/ CBuffer & data)
777 int nNeedBytes = data.GetLength();
778 // make in place WORD BYTEs swap
779 UINT64 * p_qw = (UINT64 *)(void *)data;
780 int nQwords = nNeedBytes/8;
781 for (int nQword = 0; nQword<nQwords; nQword++)
783 p_qw[nQword] = WordSwapBytes(p_qw[nQword]);
785 wchar_t * p_w = (wchar_t *)p_qw;
786 int nWords = nNeedBytes/2;
787 for (int nWord = nQwords*4; nWord<nWords; nWord++)
789 p_w[nWord] = WideCharSwap(p_w[nWord]);
791 return CUtf16leFilter::Decode(data);
794 const CBuffer & CUtf16beFilter::Encode(const CString s)
796 int nNeedBytes = s.GetLength()*sizeof(TCHAR);
797 m_oBuffer.SetLength(nNeedBytes);
798 // copy swaping BYTE order in WORDs
799 const UINT64 * p_qwIn = (const UINT64 *)(LPCTSTR)s;
800 UINT64 * p_qwOut = (UINT64 *)(void *)m_oBuffer;
801 int nQwords = nNeedBytes/8;
802 for (int nQword = 0; nQword<nQwords; nQword++)
804 p_qwOut[nQword] = WordSwapBytes(p_qwIn[nQword]);
806 wchar_t * p_wIn = (wchar_t *)p_qwIn;
807 wchar_t * p_wOut = (wchar_t *)p_qwOut;
808 int nWords = nNeedBytes/2;
809 for (int nWord = nQwords*4; nWord<nWords; nWord++)
811 p_wOut[nWord] = WideCharSwap(p_wIn[nWord]);
813 return m_oBuffer;
818 bool CUtf32leFilter::Decode(/*in out*/ CBuffer & data)
820 // UTF32 have four bytes per char
821 int nReadChars = data.GetLength()/4;
822 UINT32 * p32 = (UINT32 *)(void *)data;
824 // count chars which needs surrogate pair
825 int nSurrogatePairCount = 0;
826 for (int i = 0; i<nReadChars; ++i)
828 if (p32[i]<0x110000 && p32[i]>=0x10000)
830 ++nSurrogatePairCount;
834 // fill buffer
835 m_oBuffer.SetLength((nReadChars+nSurrogatePairCount)*sizeof(wchar_t));
836 wchar_t * pOut = (wchar_t *)m_oBuffer;
837 for (int i = 0; i<nReadChars; ++i, ++pOut)
839 UINT32 zChar = p32[i];
840 if (zChar>=0x110000)
842 *pOut=0xfffd; // ? mark
844 else if (zChar>=0x10000)
846 zChar-=0x10000;
847 pOut[0] = ((zChar>>10)&0x3ff) | 0xd800; // lead surrogate
848 pOut[1] = (zChar&0x7ff) | 0xdc00; // trail surrogate
849 pOut++;
851 else
853 *pOut = (wchar_t)zChar;
856 data.Swap(m_oBuffer);
857 return TRUE;
860 const CBuffer & CUtf32leFilter::Encode(const CString s)
862 int nInWords = s.GetLength();
863 m_oBuffer.SetLength(nInWords*2);
865 LPCTSTR p_In = (LPCTSTR)s;
866 UINT32 * p_Out = (UINT32 *)(void *)m_oBuffer;
867 int nOutDword = 0;
868 for (int nInWord = 0; nInWord<nInWords; nInWord++, nOutDword++)
870 UINT32 zChar = p_In[nInWord];
871 if ((zChar&0xfc00) == 0xd800) // lead surrogate
873 if (nInWord+1<nInWords && (p_In[nInWord+1]&0xfc00) == 0xdc00) // trail surrogate follows
875 zChar = 0x10000 + ((zChar&0x3ff)<<10) + (p_In[++nInWord]&0x3ff);
877 else
879 zChar = 0xfffd; // ? mark
882 else if ((zChar&0xfc00) == 0xdc00) // trail surrogate without lead
884 zChar = 0xfffd; // ? mark
886 p_Out[nOutDword] = zChar;
888 m_oBuffer.SetLength(nOutDword*4); // store length reduced by surrogates
889 return m_oBuffer;
894 bool CUtf32beFilter::Decode(/*in out*/ CBuffer & data)
897 // swap BYTEs order in DWORDs
898 UINT64 * p64 = (UINT64 *)(void *)data;
899 int nQwords = data.GetLength()/8;
900 for (int nQword = 0; nQword<nQwords; nQword++)
902 p64[nQword] = DwordSwapBytes(p64[nQword]);
905 UINT32 * p32 = (UINT32 *)p64;
906 int nDwords = data.GetLength()/4;
907 for (int nDword = nQwords*2; nDword<nDwords; nDword++)
909 p32[nDword] = DwordSwapBytes(p32[nDword]);
911 return CUtf32leFilter::Decode(data);
914 const CBuffer & CUtf32beFilter::Encode(const CString s)
916 CUtf32leFilter::Encode(s);
918 // swap BYTEs order in DWORDs
919 UINT64 * p64 = (UINT64 *)(void *)m_oBuffer;
920 int nQwords = m_oBuffer.GetLength()/8;
921 for (int nQword = 0; nQword<nQwords; nQword++)
923 p64[nQword] = DwordSwapBytes(p64[nQword]);
926 UINT32 * p32 = (UINT32 *)p64;
927 int nDwords = m_oBuffer.GetLength()/4;
928 for (int nDword = nQwords*2; nDword<nDwords; nDword++)
930 p32[nDword] = DwordSwapBytes(p32[nDword]);
932 return m_oBuffer;