Apply backgroundcolors.patch
[TortoiseGit.git] / src / TortoiseMerge / FileTextLines.cpp
blob44efc061413f888702f4235feed3a74efec0e31e
1 // TortoiseGitMerge - a Diff/Patch program
3 // Copyright (C) 2016, 2019, 2021, 2023 - TortoiseGit
4 // Copyright (C) 2007-2016, 2019 - TortoiseSVN
6 // This program is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU General Public License
8 // as published by the Free Software Foundation; either version 2
9 // of the License, or (at your option) any later version.
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this program; if not, write to the Free Software Foundation,
18 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 #include "stdafx.h"
21 #include "resource.h"
22 #include "UnicodeUtils.h"
23 #include "registry.h"
24 #include "FileTextLines.h"
25 #include "FormatMessageWrapper.h"
26 #include "SmartHandle.h"
27 #include <intsafe.h>
// Exchanges the low and high byte of a wide character
// (UTF-16 big endian <-> little endian conversion of a single code unit).
constexpr inline wchar_t WideCharSwap(wchar_t nValue) noexcept
{
	const auto highByteDown = nValue >> 8;
	const auto lowByteUp = nValue << 8;
	return static_cast<wchar_t>(highByteDown | lowByteUp);
	//return _byteswap_ushort(nValue);
}
35 constexpr UINT64 inline WordSwapBytes(UINT64 nValue) noexcept
37 return ((nValue&0xff00ff00ff00ff)<<8) | ((nValue>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
40 constexpr UINT32 inline DwordSwapBytes(UINT32 nValue) noexcept
42 UINT32 nRet = (nValue<<16) | (nValue>>16); // swap WORDs
43 nRet = ((nRet&0xff00ff)<<8) | ((nRet>>8)&0xff00ff); // swap BYTESs in WORDs
44 return nRet;
45 //return _byteswap_ulong(nValue);
48 constexpr UINT64 inline DwordSwapBytes(UINT64 nValue) noexcept
50 UINT64 nRet = ((nValue&0xffff0000ffffL)<<16) | ((nValue>>16)&0xffff0000ffffL); // swap WORDs in DWORDs
51 nRet = ((nRet&0xff00ff00ff00ff)<<8) | ((nRet>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
52 return nRet;
55 CFileTextLines::CFileTextLines()
59 CFileTextLines::~CFileTextLines()
63 CFileTextLines::UnicodeType CFileTextLines::CheckUnicodeType(LPCVOID pBuffer, int cb)
65 if (cb < 2)
66 return CFileTextLines::UnicodeType::ASCII;
67 auto const pVal32 = static_cast<const UINT32*>(pBuffer);
68 auto const pVal16 = static_cast<const UINT16*>(pBuffer);
69 auto const pVal8 = static_cast<const UINT8*>(pBuffer);
70 // scan the whole buffer for a 0x00000000 sequence
71 // if found, we assume a binary file
72 int nDwords = cb/4;
73 for (int j=0; j<nDwords; ++j)
75 if (0x00000000 == pVal32[j])
76 return CFileTextLines::UnicodeType::BINARY;
78 if (cb >=4 )
80 if (*pVal32 == 0x0000FEFF)
82 return CFileTextLines::UnicodeType::UTF32_LE;
84 if (*pVal32 == 0xFFFE0000)
86 return CFileTextLines::UnicodeType::UTF32_BE;
89 if (*pVal16 == 0xFEFF)
91 return CFileTextLines::UnicodeType::UTF16_LEBOM;
93 if (*pVal16 == 0xFFFE)
95 return CFileTextLines::UnicodeType::UTF16_BEBOM;
97 if (cb < 3)
98 return CFileTextLines::UnicodeType::ASCII;
99 if (*pVal16 == 0xBBEF)
101 if (pVal8[2] == 0xBF)
102 return CFileTextLines::UnicodeType::UTF8BOM;
104 // check for illegal UTF8 sequences
105 bool bNonANSI = false;
106 int nNeedData = 0;
107 int i=0;
108 int nullcount = 0;
109 for (; i < cb; ++i)
111 if (pVal8[i] == 0)
113 ++nullcount;
114 // count the null chars, we do not want to treat an ASCII/UTF8 file
115 // as UTF16 just because of some null chars that might be accidentally
116 // in the file.
117 // Use an arbitrary value of one fiftieth of the file length as
118 // the limit after which a file is considered UTF16.
119 if (nullcount >(cb / 50))
121 // null-chars are not allowed for ASCII or UTF8, that means
122 // this file is most likely UTF16 encoded
123 if (i % 2)
124 return CFileTextLines::UnicodeType::UTF16_LE;
125 else
126 return CFileTextLines::UnicodeType::UTF16_BE;
129 if ((pVal8[i] & 0x80) != 0) // non ASCII
131 bNonANSI = true;
132 break;
135 // check remaining text for UTF-8 validity
136 for (; i<cb; ++i)
138 UINT8 zChar = pVal8[i];
139 if ((zChar & 0x80)==0) // Ascii
141 if (zChar == 0)
143 ++nullcount;
144 // count the null chars, we do not want to treat an ASCII/UTF8 file
145 // as UTF16 just because of some null chars that might be accidentally
146 // in the file.
147 // Use an arbitrary value of one fiftieth of the file length as
148 // the limit after which a file is considered UTF16.
149 if (nullcount > (cb / 50))
151 // null-chars are not allowed for ASCII or UTF8, that means
152 // this file is most likely UTF16 encoded
153 if (i%2)
154 return CFileTextLines::UnicodeType::UTF16_LE;
155 else
156 return CFileTextLines::UnicodeType::UTF16_BE;
158 nNeedData = 0;
160 else if (nNeedData)
162 return CFileTextLines::UnicodeType::ASCII;
164 continue;
166 if ((zChar & 0x40)==0) // top bit
168 if (!nNeedData)
169 return CFileTextLines::UnicodeType::ASCII;
170 --nNeedData;
172 else if (nNeedData)
174 return CFileTextLines::UnicodeType::ASCII;
176 else if ((zChar & 0x20)==0) // top two bits
178 if (zChar<=0xC1)
179 return CFileTextLines::UnicodeType::ASCII;
180 nNeedData = 1;
182 else if ((zChar & 0x10)==0) // top three bits
184 nNeedData = 2;
186 else if ((zChar & 0x08)==0) // top four bits
188 if (zChar>=0xf5)
189 return CFileTextLines::UnicodeType::ASCII;
190 nNeedData = 3;
192 else
193 return CFileTextLines::UnicodeType::ASCII;
195 if (bNonANSI && nNeedData==0)
196 // if get here thru nonAscii and no missing data left then its valid UTF8
197 return CFileTextLines::UnicodeType::UTF8;
198 if (!bNonANSI && (DWORD(CRegDWORD(L"Software\\TortoiseGitMerge\\UseUTF8", FALSE))))
199 return CFileTextLines::UnicodeType::UTF8;
200 return CFileTextLines::UnicodeType::ASCII;
204 BOOL CFileTextLines::Load(const CString& sFilePath, int /*lengthHint*/ /* = 0*/)
206 m_SaveParams.m_LineEndings = EOL::AutoLine;
207 if (!m_bKeepEncoding)
208 m_SaveParams.m_UnicodeType = CFileTextLines::UnicodeType::AUTOTYPE;
209 RemoveAll();
211 if (PathIsDirectory(sFilePath))
213 m_sErrorString.Format(IDS_ERR_FILE_NOTAFILE, static_cast<LPCWSTR>(sFilePath));
214 return FALSE;
217 if (!PathFileExists(sFilePath))
219 //file does not exist, so just return SUCCESS
220 return TRUE;
223 CAutoFile hFile = CreateFile(sFilePath, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_DELETE | FILE_SHARE_WRITE, nullptr, OPEN_EXISTING, 0, nullptr);
224 if (!hFile)
226 SetErrorString();
227 return FALSE;
230 LARGE_INTEGER fsize;
231 if (!GetFileSizeEx(hFile, &fsize))
233 SetErrorString();
234 return FALSE;
236 if (fsize.QuadPart >= INT_MAX)
238 // file is way too big for us
239 m_sErrorString.LoadString(IDS_ERR_FILE_TOOBIG);
240 return FALSE;
243 // create buffer
244 std::unique_ptr<BYTE[]> fileBuffer;
247 fileBuffer = std::unique_ptr<BYTE[]>(new BYTE[fsize.LowPart]); // prevent default initialization
249 catch (CMemoryException* e)
251 e->GetErrorMessage(CStrBuf(m_sErrorString, 1000), 1000);
252 return FALSE;
255 // load file
256 DWORD dwReadBytes = 0;
257 if (!ReadFile(hFile, static_cast<void*>(fileBuffer.get()), fsize.LowPart, &dwReadBytes, nullptr))
259 SetErrorString();
260 return FALSE;
262 hFile.CloseHandle();
264 // detect type
265 if (m_SaveParams.m_UnicodeType == CFileTextLines::UnicodeType::AUTOTYPE)
267 m_SaveParams.m_UnicodeType = this->CheckUnicodeType(fileBuffer.get(), dwReadBytes);
269 // enforce conversion for all but ASCII and UTF8 type
270 m_bNeedsConversion = (m_SaveParams.m_UnicodeType != CFileTextLines::UnicodeType::UTF8) && (m_SaveParams.m_UnicodeType != CFileTextLines::UnicodeType::ASCII);
272 // no need to decode empty file
273 if (dwReadBytes == 0)
274 return TRUE;
276 // we may have to convert the file content - CString is UTF16LE
277 std::unique_ptr<CDecodeFilter> pFilter;
280 switch (m_SaveParams.m_UnicodeType)
282 case UnicodeType::BINARY:
283 m_sErrorString.Format(IDS_ERR_FILE_BINARY, static_cast<LPCWSTR>(sFilePath));
284 return FALSE;
285 case UnicodeType::UTF8:
286 case UnicodeType::UTF8BOM:
287 pFilter = std::make_unique<CUtf8Filter>(nullptr);
288 break;
289 default:
290 case UnicodeType::ASCII:
291 pFilter = std::make_unique<CAsciiFilter>(nullptr);
292 break;
293 case UnicodeType::UTF16_BE:
294 case UnicodeType::UTF16_BEBOM:
295 pFilter = std::make_unique<CUtf16beFilter>(nullptr);
296 break;
297 case UnicodeType::UTF16_LE:
298 case UnicodeType::UTF16_LEBOM:
299 pFilter = std::make_unique<CUtf16leFilter>(nullptr);
300 break;
301 case UnicodeType::UTF32_BE:
302 pFilter = std::make_unique<CUtf32beFilter>(nullptr);
303 break;
304 case UnicodeType::UTF32_LE:
305 pFilter = std::make_unique<CUtf32leFilter>(nullptr);
306 break;
308 if (!pFilter->Decode(std::move(fileBuffer), dwReadBytes))
310 SetErrorString();
311 return FALSE;
314 catch (CMemoryException* e)
316 e->GetErrorMessage(CStrBuf(m_sErrorString, 1000), 1000);
317 return FALSE;
320 std::wstring_view converted = pFilter.get()->GetStringView();
321 int nReadChars = static_cast<int>(converted.size()); // see above, we have a INT_MAX limitation
322 auto pTextBuf = converted.data();
323 const wchar_t* pLineStart = pTextBuf;
324 if (!converted.empty() && ((m_SaveParams.m_UnicodeType == UnicodeType::UTF8BOM)
325 || (m_SaveParams.m_UnicodeType == UnicodeType::UTF16_LEBOM)
326 || (m_SaveParams.m_UnicodeType == UnicodeType::UTF16_BEBOM)
327 || (m_SaveParams.m_UnicodeType == UnicodeType::UTF32_LE)
328 || (m_SaveParams.m_UnicodeType == UnicodeType::UTF32_BE)))
330 // ignore the BOM
331 ++pTextBuf;
332 ++pLineStart;
333 --nReadChars;
336 // fill in the lines into the array
337 size_t countEOLs[static_cast<int>(EOL::_COUNT)] = { 0 };
338 CFileTextLine oTextLine;
339 for (int i = nReadChars; i; --i)
341 EOL eEol;
342 switch (*pTextBuf++)
344 case '\r':
345 // crlf line ending or cr line ending
346 eEol = ((i > 1) && *(pTextBuf) == '\n') ? EOL::CRLF : EOL::CR;
347 break;
348 case '\n':
349 // lfcr line ending or lf line ending
350 eEol = ((i > 1) && *(pTextBuf) == '\r') ? EOL::LFCR : EOL::LF;
351 if (eEol == EOL::LFCR)
353 // LFCR is very rare on Windows, so we have to double check
354 // that this is not just a LF followed by CRLF
355 if (((countEOLs[static_cast<int>(EOL::CRLF)] > 1) || (countEOLs[static_cast<int>(EOL::LF)] > 1) || (GetCount() < 2)) &&
356 ((i > 2) && (*(pTextBuf+1) == '\n')))
358 // change the EOL back to a simple LF
359 eEol = EOL::LF;
362 break;
363 case 0x000b:
364 eEol = EOL::VT;
365 break;
366 case 0x000c:
367 eEol = EOL::FF;
368 break;
369 case 0x0085:
370 eEol = EOL::NEL;
371 break;
372 case 0x2028:
373 eEol = EOL::LS;
374 break;
375 case 0x2029:
376 eEol = EOL::PS;
377 break;
378 default:
379 continue;
381 oTextLine.sLine = CString(pLineStart, static_cast<int>(pTextBuf-pLineStart) - 1);
382 oTextLine.eEnding = eEol;
383 CStdFileLineArray::Add(oTextLine);
384 ++countEOLs[static_cast<int>(eEol)];
385 if (eEol == EOL::CRLF || eEol == EOL::LFCR)
387 ++pTextBuf;
388 --i;
390 pLineStart = pTextBuf;
392 CString line(pLineStart, static_cast<int>(pTextBuf - pLineStart));
393 Add(line, EOL::NoEnding);
395 // some EOLs are not supported by the svn diff lib.
396 m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::CRLF)] != 0);
397 m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::FF)] != 0);
398 m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::VT)] != 0);
399 m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::NEL)] != 0);
400 m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::LS)] != 0);
401 m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::PS)] != 0);
403 size_t eolmax = 0;
404 for (int nEol = 0; nEol < static_cast<int>(EOL::_COUNT); nEol++)
406 if (eolmax < countEOLs[nEol])
408 eolmax = countEOLs[nEol];
409 m_SaveParams.m_LineEndings = static_cast<EOL>(nEol);
413 return TRUE;
416 void CFileTextLines::StripWhiteSpace(CString& sLine, DWORD dwIgnoreWhitespaces, bool blame)
418 if (blame)
420 if (sLine.GetLength() > 66)
421 sLine = sLine.Mid(66);
423 switch (dwIgnoreWhitespaces)
425 case 0:
426 // Compare whitespaces
427 // do nothing
428 break;
429 case 1:
430 // Ignore all whitespaces
431 sLine.TrimLeft(L" \t");
432 sLine.TrimRight(L" \t");
433 break;
434 case 2:
435 // Ignore leading whitespace
436 sLine.TrimLeft(L" \t");
437 break;
438 case 3:
439 // Ignore ending whitespace
440 sLine.TrimRight(L" \t");
441 break;
/**
	Encoding pattern:
		- encode & save BOM
		- Get Line
		- modify line - whitespaces, lowercase
		- encode & save line
		- get cached encoded eol
		- save eol
*/
454 BOOL CFileTextLines::Save( const CString& sFilePath
455 , bool bSaveAsUTF8 /*= false */
456 , bool bUseSVNCompatibleEOLs /*= false */
457 , DWORD dwIgnoreWhitespaces /*= 0 */
458 , BOOL bIgnoreCase /*= FALSE */
459 , bool bBlame /*= false*/
460 , bool bIgnoreComments /*= false*/
461 , const CString& linestart /*= CString()*/
462 , const CString& blockstart /*= CString()*/
463 , const CString& blockend /*= CString()*/
464 , const std::wregex& rx /*= std::wregex()*/
465 , const std::wstring& replacement /*=L""*/)
467 m_sCommentLine = linestart;
468 m_sCommentBlockStart = blockstart;
469 m_sCommentBlockEnd = blockend;
473 CString destPath = sFilePath;
474 // now make sure that the destination directory exists
475 int ind = 0;
476 while (destPath.Find('\\', ind)>=2)
478 if (!PathIsDirectory(destPath.Left(destPath.Find('\\', ind))))
480 if (!CreateDirectory(destPath.Left(destPath.Find('\\', ind)), nullptr))
481 return FALSE;
483 ind = destPath.Find('\\', ind)+1;
486 CStdioFile file; // Hugely faster than CFile for big file writes - because it uses buffering
487 if (!file.Open(sFilePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary | CFile::shareDenyNone))
489 m_sErrorString.Format(IDS_ERR_FILE_OPEN, static_cast<LPCWSTR>(sFilePath));
490 return FALSE;
493 std::unique_ptr<CEncodeFilter> pFilter;
494 bool bSaveBom = true;
495 CFileTextLines::UnicodeType eUnicodeType = bSaveAsUTF8 ? CFileTextLines::UnicodeType::UTF8 : m_SaveParams.m_UnicodeType;
496 switch (eUnicodeType)
498 default:
499 case CFileTextLines::UnicodeType::ASCII:
500 bSaveBom = false;
501 pFilter = std::make_unique<CAsciiFilter>(&file);
502 break;
503 case CFileTextLines::UnicodeType::UTF8:
504 bSaveBom = false;
505 [[fallthrough]];
506 case CFileTextLines::UnicodeType::UTF8BOM:
507 pFilter = std::make_unique<CUtf8Filter>(&file);
508 break;
509 case CFileTextLines::UnicodeType::UTF16_BE:
510 bSaveBom = false;
511 pFilter = std::make_unique<CUtf16beFilter>(&file);
512 break;
513 case CFileTextLines::UnicodeType::UTF16_BEBOM:
514 pFilter = std::make_unique<CUtf16beFilter>(&file);
515 break;
516 case CFileTextLines::UnicodeType::UTF16_LE:
517 bSaveBom = false;
518 pFilter = std::make_unique<CUtf16leFilter>(&file);
519 break;
520 case CFileTextLines::UnicodeType::UTF16_LEBOM:
521 pFilter = std::make_unique<CUtf16leFilter>(&file);
522 break;
523 case CFileTextLines::UnicodeType::UTF32_BE:
524 pFilter = std::make_unique<CUtf32beFilter>(&file);
525 break;
526 case CFileTextLines::UnicodeType::UTF32_LE:
527 pFilter = std::make_unique<CUtf32leFilter>(&file);
528 break;
531 if (bSaveBom)
533 //first write the BOM
534 pFilter->Write(L"\xfeff");
536 // cache EOLs
537 CBuffer oEncodedEol[static_cast<int>(EOL::_COUNT)];
538 oEncodedEol[static_cast<int>(EOL::LF)] = pFilter->Encode(L"\n"); // x0a
539 oEncodedEol[static_cast<int>(EOL::CR)] = pFilter->Encode(L"\r"); // x0d
540 oEncodedEol[static_cast<int>(EOL::CRLF)] = pFilter->Encode(L"\r\n"); // x0d x0a
541 if (bUseSVNCompatibleEOLs)
543 // when using EOLs that are supported by the svn lib,
544 // we have to use the same EOLs as the file has in case
545 // they're already supported, but a different supported one
546 // in case the original one isn't supported.
547 // Only this way the option "ignore EOLs (recommended)" unchecked
548 // actually shows the lines as different.
549 // However, the diff won't find and differences in EOLs
550 // for these special EOLs if they differ between those special ones
551 // listed below.
552 // But it will work properly for the most common EOLs LF/CR/CRLF.
553 oEncodedEol[static_cast<int>(EOL::LFCR)] = oEncodedEol[static_cast<int>(EOL::CR)];
554 for (int nEol = 0; nEol < static_cast<int>(EOL::NoEnding); nEol++)
556 if (oEncodedEol[nEol].IsEmpty())
557 oEncodedEol[nEol] = oEncodedEol[static_cast<int>(EOL::LF)];
560 else
562 oEncodedEol[static_cast<int>(EOL::LFCR)] = pFilter->Encode(L"\n\r");
563 oEncodedEol[static_cast<int>(EOL::VT)] = pFilter->Encode(L"\v"); // x0b
564 oEncodedEol[static_cast<int>(EOL::FF)] = pFilter->Encode(L"\f"); // x0c
565 oEncodedEol[static_cast<int>(EOL::NEL)] = pFilter->Encode(L"\x85");
566 oEncodedEol[static_cast<int>(EOL::LS)] = pFilter->Encode(L"\x2028");
567 oEncodedEol[static_cast<int>(EOL::PS)] = pFilter->Encode(L"\x2029");
569 oEncodedEol[static_cast<int>(EOL::AutoLine)] = oEncodedEol[static_cast<int>(m_SaveParams.m_LineEndings == EOL::AutoLine ? EOL::CRLF : m_SaveParams.m_LineEndings)];
571 bool bInBlockComment = false;
572 for (int i=0; i<GetCount(); i++)
574 CString sLineT = GetAt(i);
575 if (bIgnoreComments)
576 bInBlockComment = StripComments(sLineT, bInBlockComment);
577 if (!rx._Empty())
578 LineRegex(sLineT, rx, replacement);
579 StripWhiteSpace(sLineT, dwIgnoreWhitespaces, bBlame);
580 if (bIgnoreCase)
581 sLineT = sLineT.MakeLower();
582 pFilter->Write(sLineT);
583 EOL eEol = GetLineEnding(i);
584 pFilter->Write(oEncodedEol[static_cast<int>(eEol)]);
586 file.Close();
588 catch (CException * e)
590 e->GetErrorMessage(CStrBuf(m_sErrorString, 4096), 4096);
591 e->Delete();
592 return FALSE;
594 return TRUE;
597 void CFileTextLines::SetErrorString()
599 m_sErrorString = static_cast<LPCWSTR>(CFormatMessageWrapper());
602 void CFileTextLines::CopySettings(CFileTextLines * pFileToCopySettingsTo) const
604 if (pFileToCopySettingsTo)
606 pFileToCopySettingsTo->m_SaveParams = m_SaveParams;
610 const wchar_t * CFileTextLines::GetEncodingName(UnicodeType eEncoding)
612 switch (eEncoding)
614 case UnicodeType::ASCII:
615 return L"ASCII";
616 case UnicodeType::BINARY:
617 return L"BINARY";
618 case UnicodeType::UTF16_LE:
619 return L"UTF-16LE";
620 case UnicodeType::UTF16_LEBOM:
621 return L"UTF-16LE BOM";
622 case UnicodeType::UTF16_BE:
623 return L"UTF-16BE";
624 case UnicodeType::UTF16_BEBOM:
625 return L"UTF-16BE BOM";
626 case UnicodeType::UTF32_LE:
627 return L"UTF-32LE";
628 case UnicodeType::UTF32_BE:
629 return L"UTF-32BE";
630 case UnicodeType::UTF8:
631 return L"UTF-8";
632 case UnicodeType::UTF8BOM:
633 return L"UTF-8 BOM";
635 return L"";
638 bool CFileTextLines::IsInsideString(const CString& sLine, int pos)
640 int scount = 0;
641 int ccount = 0;
642 auto spos = sLine.Find('"');
643 while (spos >= 0 && spos < pos)
645 ++scount;
646 spos = sLine.Find('"', spos + 1);
648 auto cpos = sLine.Find('\'');
649 while (cpos >= 0 && cpos < pos)
651 ++ccount;
652 cpos = sLine.Find('"', cpos + 1);
654 return (scount % 2 != 0 || ccount % 2 != 0);
657 bool CFileTextLines::StripComments( CString& sLine, bool bInBlockComment )
659 int startpos = 0;
660 int oldStartPos = -1;
663 if (bInBlockComment)
665 int endpos = sLine.Find(m_sCommentBlockEnd);
666 if (IsInsideString(sLine, endpos))
667 endpos = -1;
668 if (endpos >= 0 && (endpos > startpos || endpos == 0))
670 sLine = sLine.Left(startpos) + sLine.Mid(endpos + m_sCommentBlockEnd.GetLength());
671 bInBlockComment = false;
672 startpos = endpos;
674 else
676 sLine = sLine.Left(startpos);
677 startpos = -1;
680 if (!bInBlockComment)
682 startpos = m_sCommentBlockStart.IsEmpty() ? -1 : sLine.Find(m_sCommentBlockStart, startpos);
683 int startpos2 = m_sCommentLine.IsEmpty() ? -1 : sLine.Find(m_sCommentLine);
684 if ((startpos2 < startpos && startpos2 >= 0) || (startpos2 >= 0 && startpos < 0))
686 // line comment
687 // look if there's a string marker (" or ') before that
688 // note: this check is not fully correct. For example, it
689 // does not account for escaped chars or even multiline strings.
690 // but it has to be fast, so this has to do...
691 if (!IsInsideString(sLine, startpos2))
693 // line comment, erase the rest of the line
694 sLine = sLine.Left(startpos2);
695 startpos = -1;
697 if (startpos == oldStartPos)
698 return false;
699 oldStartPos = startpos;
701 else if (startpos >= 0)
703 // starting block comment
704 if (!IsInsideString(sLine, startpos))
705 bInBlockComment = true;
706 else
707 ++startpos;
710 } while (startpos >= 0);
712 return bInBlockComment;
715 void CFileTextLines::LineRegex( CString& sLine, const std::wregex& rx, const std::wstring& replacement ) const
717 std::wstring str = static_cast<LPCWSTR>(sLine);
718 std::wstring str2 = std::regex_replace(str, rx, replacement);
719 sLine = str2.c_str();
723 void CBuffer::ExpandToAtLeast(int nNewSize)
725 ASSERT(nNewSize >= 0);
726 if (nNewSize>m_nAllocated)
728 Free(); // we don't preserve buffer content intentionally
729 if (INT_MAX - (2048 - 1) >= nNewSize)
731 nNewSize += 2048 - 1;
732 nNewSize &= ~(1024 - 1);
734 else
735 nNewSize = INT_MAX;
736 m_pBuffer=new BYTE[nNewSize];
737 m_nAllocated=nNewSize;
741 void CBuffer::SetLength(int nUsed)
743 ASSERT(nUsed >= 0);
744 ExpandToAtLeast(nUsed);
745 m_nUsed = nUsed;
748 void CBuffer::Swap(CBuffer& Src) noexcept
750 std::swap(Src.m_nAllocated, m_nAllocated);
751 std::swap(Src.m_pBuffer, m_pBuffer);
752 std::swap(Src.m_nUsed, m_nUsed);
755 void CBuffer::Copy(const CBuffer & Src)
757 if (&Src != this)
759 SetLength(Src.m_nUsed);
760 memcpy(m_pBuffer, Src.m_pBuffer, m_nUsed);
765 bool CAsciiFilter::Decode(std::unique_ptr<BYTE[]> data, int len)
767 ASSERT(!m_pBuffer);
768 int nFlags = (m_nCodePage==CP_ACP) ? MB_PRECOMPOSED : 0;
769 // dry decode is around 8 times faster then real one, alternatively we can set buffer to max length
770 int nReadChars = MultiByteToWideChar(m_nCodePage, nFlags, reinterpret_cast<LPCSTR>(data.get()), len, nullptr, 0);
771 if (!nReadChars)
772 return false;
773 m_pBuffer = new wchar_t[nReadChars];
774 int ret2 = MultiByteToWideChar(m_nCodePage, nFlags, reinterpret_cast<LPCSTR>(data.get()), len, m_pBuffer, nReadChars);
775 if (ret2 != nReadChars)
776 return false;
778 m_iBufferLength = nReadChars;
780 return true;
783 const CBuffer& CAsciiFilter::Encode(const CString& s)
785 if (int bufferSize; IntMult(s.GetLength(), 3, &bufferSize) != S_OK || IntAdd(bufferSize, 1, &bufferSize) != S_OK)
786 AtlThrow(E_OUTOFMEMORY);
787 else
788 m_oBuffer.SetLength(bufferSize); // set buffer to guessed max size
789 int nConvertedLen = WideCharToMultiByte(m_nCodePage, 0, static_cast<LPCWSTR>(s), s.GetLength(), static_cast<LPSTR>(m_oBuffer), m_oBuffer.GetLength(), nullptr, nullptr);
790 m_oBuffer.SetLength(nConvertedLen); // set buffer to used size
791 return m_oBuffer;
795 bool CUtf16leFilter::Decode(std::unique_ptr<BYTE[]> data, int len)
797 ASSERT(!m_pBuffer);
798 // we believe data is ok for use
799 m_deleter = [](void* ptr) { delete[] static_cast<BYTE*>(ptr); };
800 m_pBuffer = reinterpret_cast<wchar_t*>(data.release());
801 m_iBufferLength = len / sizeof(wchar_t);
802 return true;
805 const CBuffer& CUtf16leFilter::Encode(const CString& s)
807 int nNeedBytes;
808 if (IntMult(s.GetLength(), sizeof(wchar_t), &nNeedBytes) != S_OK)
809 AtlThrow(E_OUTOFMEMORY);
810 m_oBuffer.SetLength(nNeedBytes);
811 memcpy(static_cast<void*>(m_oBuffer), static_cast<LPCWSTR>(s), nNeedBytes);
812 return m_oBuffer;
816 bool CUtf16beFilter::Decode(std::unique_ptr<BYTE[]> data, int len)
818 ASSERT(!m_pBuffer);
819 // make in place WORD BYTEs swap
820 auto p_qw = static_cast<UINT64*>(static_cast<void*>(data.get()));
821 int nQwords = len / 8;
822 for (int nQword = 0; nQword<nQwords; nQword++)
824 p_qw[nQword] = WordSwapBytes(p_qw[nQword]);
826 auto p_w = reinterpret_cast<wchar_t*>(p_qw);
827 int nWords = len / 2;
828 for (int nWord = nQwords*4; nWord<nWords; nWord++)
830 p_w[nWord] = WideCharSwap(p_w[nWord]);
832 return CUtf16leFilter::Decode(std::move(data), len);
835 const CBuffer& CUtf16beFilter::Encode(const CString& s)
837 int nNeedBytes;
838 if (IntMult(s.GetLength(), sizeof(wchar_t), &nNeedBytes) != S_OK)
839 AtlThrow(E_OUTOFMEMORY);
840 m_oBuffer.SetLength(nNeedBytes);
841 // copy swaping BYTE order in WORDs
842 auto p_qwIn = reinterpret_cast<const UINT64*>(static_cast<LPCWSTR>(s));
843 auto p_qwOut = static_cast<UINT64*>(static_cast<void*>(m_oBuffer));
844 int nQwords = nNeedBytes/8;
845 for (int nQword = 0; nQword<nQwords; nQword++)
847 p_qwOut[nQword] = WordSwapBytes(p_qwIn[nQword]);
849 auto p_wIn = reinterpret_cast<const wchar_t*>(p_qwIn);
850 auto p_wOut = reinterpret_cast<wchar_t*>(p_qwOut);
851 int nWords = nNeedBytes/2;
852 for (int nWord = nQwords*4; nWord<nWords; nWord++)
854 p_wOut[nWord] = WideCharSwap(p_wIn[nWord]);
856 return m_oBuffer;
860 bool CUtf32leFilter::Decode(std::unique_ptr<BYTE[]> data, int len)
862 ASSERT(!m_pBuffer);
863 // UTF32 have four bytes per char
864 int nReadChars = len / 4;
865 auto p32 = static_cast<UINT32*>(static_cast<void*>(data.get()));
867 // count chars which needs surrogate pair
868 int nSurrogatePairCount = 0;
869 for (int i = 0; i<nReadChars; ++i)
871 if (p32[i]<0x110000 && p32[i]>=0x10000)
873 ++nSurrogatePairCount;
877 // fill buffer
878 if (int bufferSize; IntAdd(nReadChars, nSurrogatePairCount, &bufferSize) != S_OK)
879 AtlThrow(E_OUTOFMEMORY);
880 else
881 m_pBuffer = new wchar_t[bufferSize]; // set buffer to guessed max size
882 auto pOut = m_pBuffer;
883 for (int i = 0; i<nReadChars; ++i, ++pOut)
885 UINT32 zChar = p32[i];
886 if (zChar>=0x110000)
888 *pOut=0xfffd; // ? mark
890 else if (zChar>=0x10000)
892 zChar-=0x10000;
893 pOut[0] = ((zChar>>10)&0x3ff) | 0xd800; // lead surrogate
894 pOut[1] = (zChar&0x7ff) | 0xdc00; // trail surrogate
895 pOut++;
897 else
899 *pOut = static_cast<wchar_t>(zChar);
902 m_iBufferLength = nReadChars;
903 return true;
906 const CBuffer& CUtf32leFilter::Encode(const CString& s)
908 int nInWords = s.GetLength();
909 if (int bufferSize; IntMult(nInWords, 2, &bufferSize) != S_OK)
910 AtlThrow(E_OUTOFMEMORY);
911 else
912 m_oBuffer.SetLength(bufferSize);
914 auto p_In = static_cast<LPCWSTR>(s);
915 auto p_Out = static_cast<UINT32*>(static_cast<void*>(m_oBuffer));
916 int nOutDword = 0;
917 for (int nInWord = 0; nInWord<nInWords; nInWord++, nOutDword++)
919 UINT32 zChar = p_In[nInWord];
920 if ((zChar&0xfc00) == 0xd800) // lead surrogate
922 if (nInWord+1<nInWords && (p_In[nInWord+1]&0xfc00) == 0xdc00) // trail surrogate follows
924 zChar = 0x10000 + ((zChar&0x3ff)<<10) + (p_In[++nInWord]&0x3ff);
926 else
928 zChar = 0xfffd; // ? mark
931 else if ((zChar&0xfc00) == 0xdc00) // trail surrogate without lead
933 zChar = 0xfffd; // ? mark
935 p_Out[nOutDword] = zChar;
937 if (int bufferSize; IntMult(nOutDword, 4, &bufferSize) != S_OK)
938 AtlThrow(E_OUTOFMEMORY);
939 else
940 m_oBuffer.SetLength(bufferSize); // store length reduced by surrogates
941 return m_oBuffer;
945 bool CUtf32beFilter::Decode(std::unique_ptr<BYTE[]> data, int len)
947 // swap BYTEs order in DWORDs
948 auto p64 = static_cast<UINT64*>(static_cast<void*>(data.get()));
949 int nQwords = len / 8;
950 for (int nQword = 0; nQword<nQwords; nQword++)
952 p64[nQword] = DwordSwapBytes(p64[nQword]);
955 auto p32 = reinterpret_cast<UINT32*>(p64);
956 int nDwords = len / 4;
957 for (int nDword = nQwords*2; nDword<nDwords; nDword++)
959 p32[nDword] = DwordSwapBytes(p32[nDword]);
961 return CUtf32leFilter::Decode(std::move(data), len);
964 const CBuffer& CUtf32beFilter::Encode(const CString& s)
966 CUtf32leFilter::Encode(s);
968 // swap BYTEs order in DWORDs
969 auto p64 = static_cast<UINT64*>(static_cast<void*>(m_oBuffer));
970 int nQwords = m_oBuffer.GetLength()/8;
971 for (int nQword = 0; nQword<nQwords; nQword++)
973 p64[nQword] = DwordSwapBytes(p64[nQword]);
976 auto p32 = reinterpret_cast<UINT32*>(p64);
977 int nDwords = m_oBuffer.GetLength()/4;
978 for (int nDword = nQwords*2; nDword<nDwords; nDword++)
980 p32[nDword] = DwordSwapBytes(p32[nDword]);
982 return m_oBuffer;