src/TortoiseMerge/FileTextLines.cpp

   1 // TortoiseGitMerge - a Diff/Patch program
   2
   3 // Copyright (C) 2016, 2019, 2021, 2023 - TortoiseGit
   4 // Copyright (C) 2007-2016, 2019 - TortoiseSVN
   5
   6 // This program is free software; you can redistribute it and/or
   7 // modify it under the terms of the GNU General Public License
   8 // as published by the Free Software Foundation; either version 2
   9 // of the License, or (at your option) any later version.
  10
  11 // This program is distributed in the hope that it will be useful,
  12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 // GNU General Public License for more details.
  15
  16 // You should have received a copy of the GNU General Public License
  17 // along with this program; if not, write to the Free Software Foundation,
  18 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19 //
  20 #include "stdafx.h"
  21 #include "resource.h"
  22 #include "UnicodeUtils.h"
  23 #include "registry.h"
  24 #include "FileTextLines.h"
  25 #include "FormatMessageWrapper.h"
  26 #include "SmartHandle.h"
  27 #include <intsafe.h>
  28
  29 constexpr wchar_t inline WideCharSwap(wchar_t nValue) noexcept
  30 {
  31         return (((nValue>> 8)) | (nValue << 8));
  32         //return _byteswap_ushort(nValue);
  33 }
  34
  35 constexpr UINT64 inline WordSwapBytes(UINT64 nValue) noexcept
  36 {
  37         return ((nValue&0xff00ff00ff00ff)<<8) | ((nValue>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  38 }
  39
  40 constexpr UINT32 inline DwordSwapBytes(UINT32 nValue) noexcept
  41 {
  42         UINT32 nRet = (nValue<<16) | (nValue>>16); // swap WORDs
  43         nRet = ((nRet&0xff00ff)<<8) | ((nRet>>8)&0xff00ff); // swap BYTESs in WORDs
  44         return nRet;
  45         //return _byteswap_ulong(nValue);
  46 }
  47
  48 constexpr UINT64 inline DwordSwapBytes(UINT64 nValue) noexcept
  49 {
  50         UINT64 nRet = ((nValue&0xffff0000ffffL)<<16) | ((nValue>>16)&0xffff0000ffffL); // swap WORDs in DWORDs
  51         nRet = ((nRet&0xff00ff00ff00ff)<<8) | ((nRet>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  52         return nRet;
  53 }
  54
  55 CFileTextLines::CFileTextLines()
  56 {
  57 }
  58
  59 CFileTextLines::~CFileTextLines()
  60 {
  61 }
  62
  63 CFileTextLines::UnicodeType CFileTextLines::CheckUnicodeType(LPCVOID pBuffer, int cb)
  64 {
  65         if (cb < 2)
  66                 return CFileTextLines::UnicodeType::ASCII;
  67         auto const pVal32 = static_cast<const UINT32*>(pBuffer);
  68         auto const pVal16 = static_cast<const UINT16*>(pBuffer);
  69         auto const pVal8 = static_cast<const UINT8*>(pBuffer);
  70         // scan the whole buffer for a 0x00000000 sequence
  71         // if found, we assume a binary file
  72         int nDwords = cb/4;
  73         for (int j=0; j<nDwords; ++j)
  74         {
  75                 if (0x00000000 == pVal32[j])
  76                         return CFileTextLines::UnicodeType::BINARY;
  77         }
  78         if (cb >=4 )
  79         {
  80                 if (*pVal32 == 0x0000FEFF)
  81                 {
  82                         return CFileTextLines::UnicodeType::UTF32_LE;
  83                 }
  84                 if (*pVal32 == 0xFFFE0000)
  85                 {
  86                         return CFileTextLines::UnicodeType::UTF32_BE;
  87                 }
  88         }
  89         if (*pVal16 == 0xFEFF)
  90         {
  91                 return CFileTextLines::UnicodeType::UTF16_LEBOM;
  92         }
  93         if (*pVal16 == 0xFFFE)
  94         {
  95                 return CFileTextLines::UnicodeType::UTF16_BEBOM;
  96         }
  97         if (cb < 3)
  98                 return CFileTextLines::UnicodeType::ASCII;
  99         if (*pVal16 == 0xBBEF)
 100         {
 101                 if (pVal8[2] == 0xBF)
 102                         return CFileTextLines::UnicodeType::UTF8BOM;
 103         }
 104         // check for illegal UTF8 sequences
 105         bool bNonANSI = false;
 106         int nNeedData = 0;
 107         int i=0;
 108         int nullcount = 0;
 109         for (; i < cb; ++i)
 110         {
 111                 if (pVal8[i] == 0)
 112                 {
 113                         ++nullcount;
 114                         // count the null chars, we do not want to treat an ASCII/UTF8 file
 115                         // as UTF16 just because of some null chars that might be accidentally
 116                         // in the file.
 117                         // Use an arbitrary value of one fiftieth of the file length as
 118                         // the limit after which a file is considered UTF16.
 119                         if (nullcount >(cb / 50))
 120                         {
 121                                 // null-chars are not allowed for ASCII or UTF8, that means
 122                                 // this file is most likely UTF16 encoded
 123                                 if (i % 2)
 124                                         return CFileTextLines::UnicodeType::UTF16_LE;
 125                                 else
 126                                         return CFileTextLines::UnicodeType::UTF16_BE;
 127                         }
 128                 }
 129                 if ((pVal8[i] & 0x80) != 0) // non ASCII
 130                 {
 131                         bNonANSI = true;
 132                         break;
 133                 }
 134         }
 135         // check remaining text for UTF-8 validity
 136         for (; i<cb; ++i)
 137         {
 138                 UINT8 zChar = pVal8[i];
 139                 if ((zChar & 0x80)==0) // Ascii
 140                 {
 141                         if (zChar == 0)
 142                         {
 143                                 ++nullcount;
 144                                 // count the null chars, we do not want to treat an ASCII/UTF8 file
 145                                 // as UTF16 just because of some null chars that might be accidentally
 146                                 // in the file.
 147                                 // Use an arbitrary value of one fiftieth of the file length as
 148                                 // the limit after which a file is considered UTF16.
 149                                 if (nullcount > (cb / 50))
 150                                 {
 151                                         // null-chars are not allowed for ASCII or UTF8, that means
 152                                         // this file is most likely UTF16 encoded
 153                                         if (i%2)
 154                                                 return CFileTextLines::UnicodeType::UTF16_LE;
 155                                         else
 156                                                 return CFileTextLines::UnicodeType::UTF16_BE;
 157                                 }
 158                                 nNeedData = 0;
 159                         }
 160                         else if (nNeedData)
 161                         {
 162                                 return CFileTextLines::UnicodeType::ASCII;
 163                         }
 164                         continue;
 165                 }
 166                 if ((zChar & 0x40)==0) // top bit
 167                 {
 168                         if (!nNeedData)
 169                                 return CFileTextLines::UnicodeType::ASCII;
 170                         --nNeedData;
 171                 }
 172                 else if (nNeedData)
 173                 {
 174                         return CFileTextLines::UnicodeType::ASCII;
 175                 }
 176                 else if ((zChar & 0x20)==0) // top two bits
 177                 {
 178                         if (zChar<=0xC1)
 179                                 return CFileTextLines::UnicodeType::ASCII;
 180                         nNeedData = 1;
 181                 }
 182                 else if ((zChar & 0x10)==0) // top three bits
 183                 {
 184                         nNeedData = 2;
 185                 }
 186                 else if ((zChar & 0x08)==0) // top four bits
 187                 {
 188                         if (zChar>=0xf5)
 189                                 return CFileTextLines::UnicodeType::ASCII;
 190                         nNeedData = 3;
 191                 }
 192                 else
 193                         return CFileTextLines::UnicodeType::ASCII;
 194         }
 195         if (bNonANSI && nNeedData==0)
 196                 // if get here thru nonAscii and no missing data left then its valid UTF8
 197                 return CFileTextLines::UnicodeType::UTF8;
 198         if (!bNonANSI && (DWORD(CRegDWORD(L"Software\\TortoiseGitMerge\\UseUTF8", FALSE))))
 199                 return CFileTextLines::UnicodeType::UTF8;
 200         return CFileTextLines::UnicodeType::ASCII;
 201 }
 202
 203
 204 BOOL CFileTextLines::Load(const CString& sFilePath, int /*lengthHint*/ /* = 0*/)
 205 {
 206         m_SaveParams.m_LineEndings = EOL::AutoLine;
 207         if (!m_bKeepEncoding)
 208                 m_SaveParams.m_UnicodeType = CFileTextLines::UnicodeType::AUTOTYPE;
 209         RemoveAll();
 210
 211         if (PathIsDirectory(sFilePath))
 212         {
 213                 m_sErrorString.Format(IDS_ERR_FILE_NOTAFILE, static_cast<LPCWSTR>(sFilePath));
 214                 return FALSE;
 215         }
 216
 217         if (!PathFileExists(sFilePath))
 218         {
 219                 //file does not exist, so just return SUCCESS
 220                 return TRUE;
 221         }
 222
 223         CAutoFile hFile = CreateFile(sFilePath, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_DELETE | FILE_SHARE_WRITE, nullptr, OPEN_EXISTING, 0, nullptr);
 224         if (!hFile)
 225         {
 226                 SetErrorString();
 227                 return FALSE;
 228         }
 229
 230         LARGE_INTEGER fsize;
 231         if (!GetFileSizeEx(hFile, &fsize))
 232         {
 233                 SetErrorString();
 234                 return FALSE;
 235         }
 236         if (fsize.QuadPart >= INT_MAX)
 237         {
 238                 // file is way too big for us
 239                 m_sErrorString.LoadString(IDS_ERR_FILE_TOOBIG);
 240                 return FALSE;
 241         }
 242
 243         // create buffer
 244         std::unique_ptr<BYTE[]> fileBuffer;
 245         try
 246         {
 247                 fileBuffer = std::unique_ptr<BYTE[]>(new BYTE[fsize.LowPart]); // prevent default initialization
 248         }
 249         catch (CMemoryException* e)
 250         {
 251                 e->GetErrorMessage(CStrBuf(m_sErrorString, 1000), 1000);
 252                 return FALSE;
 253         }
 254
 255         // load file
 256         DWORD dwReadBytes = 0;
 257         if (!ReadFile(hFile, static_cast<void*>(fileBuffer.get()), fsize.LowPart, &dwReadBytes, nullptr))
 258         {
 259                 SetErrorString();
 260                 return FALSE;
 261         }
 262         hFile.CloseHandle();
 263
 264         // detect type
 265         if (m_SaveParams.m_UnicodeType == CFileTextLines::UnicodeType::AUTOTYPE)
 266         {
 267                 m_SaveParams.m_UnicodeType = this->CheckUnicodeType(fileBuffer.get(), dwReadBytes);
 268         }
 269         // enforce conversion for all but ASCII and UTF8 type
 270         m_bNeedsConversion = (m_SaveParams.m_UnicodeType != CFileTextLines::UnicodeType::UTF8) && (m_SaveParams.m_UnicodeType != CFileTextLines::UnicodeType::ASCII);
 271
 272         // no need to decode empty file
 273         if (dwReadBytes == 0)
 274                 return TRUE;
 275
 276         // we may have to convert the file content - CString is UTF16LE
 277         std::unique_ptr<CDecodeFilter> pFilter;
 278         try
 279         {
 280                 switch (m_SaveParams.m_UnicodeType)
 281                 {
 282                 case UnicodeType::BINARY:
 283                         m_sErrorString.Format(IDS_ERR_FILE_BINARY, static_cast<LPCWSTR>(sFilePath));
 284                         return FALSE;
 285                 case UnicodeType::UTF8:
 286                 case UnicodeType::UTF8BOM:
 287                         pFilter = std::make_unique<CUtf8Filter>(nullptr);
 288                         break;
 289                 default:
 290                 case UnicodeType::ASCII:
 291                         pFilter = std::make_unique<CAsciiFilter>(nullptr);
 292                         break;
 293                 case UnicodeType::UTF16_BE:
 294                 case UnicodeType::UTF16_BEBOM:
 295                         pFilter = std::make_unique<CUtf16beFilter>(nullptr);
 296                         break;
 297                 case UnicodeType::UTF16_LE:
 298                 case UnicodeType::UTF16_LEBOM:
 299                         pFilter = std::make_unique<CUtf16leFilter>(nullptr);
 300                         break;
 301                 case UnicodeType::UTF32_BE:
 302                         pFilter = std::make_unique<CUtf32beFilter>(nullptr);
 303                         break;
 304                 case UnicodeType::UTF32_LE:
 305                         pFilter = std::make_unique<CUtf32leFilter>(nullptr);
 306                         break;
 307                 }
 308                 if (!pFilter->Decode(std::move(fileBuffer), dwReadBytes))
 309                 {
 310                         SetErrorString();
 311                         return FALSE;
 312                 }
 313         }
 314         catch (CMemoryException* e)
 315         {
 316                 e->GetErrorMessage(CStrBuf(m_sErrorString, 1000), 1000);
 317                 return FALSE;
 318         }
 319
 320         std::wstring_view converted = pFilter.get()->GetStringView();
 321         int nReadChars = static_cast<int>(converted.size()); // see above, we have a INT_MAX limitation
 322         auto pTextBuf = converted.data();
 323         const wchar_t* pLineStart = pTextBuf;
 324         if (!converted.empty() && ((m_SaveParams.m_UnicodeType == UnicodeType::UTF8BOM)
 325                 || (m_SaveParams.m_UnicodeType == UnicodeType::UTF16_LEBOM)
 326                 || (m_SaveParams.m_UnicodeType == UnicodeType::UTF16_BEBOM)
 327                 || (m_SaveParams.m_UnicodeType == UnicodeType::UTF32_LE)
 328                 || (m_SaveParams.m_UnicodeType == UnicodeType::UTF32_BE)))
 329         {
 330                 // ignore the BOM
 331                 ++pTextBuf;
 332                 ++pLineStart;
 333                 --nReadChars;
 334         }
 335
 336         // fill in the lines into the array
 337         size_t countEOLs[static_cast<int>(EOL::_COUNT)] = { 0 };
 338         CFileTextLine oTextLine;
 339         for (int i = nReadChars; i; --i)
 340         {
 341                 EOL eEol;
 342                 switch (*pTextBuf++)
 343                 {
 344                 case '\r':
 345                         // crlf line ending or cr line ending
 346                         eEol = ((i > 1) && *(pTextBuf) == '\n') ? EOL::CRLF : EOL::CR;
 347                         break;
 348                 case '\n':
 349                         // lfcr line ending or lf line ending
 350                         eEol = ((i > 1) && *(pTextBuf) == '\r') ? EOL::LFCR : EOL::LF;
 351                         if (eEol == EOL::LFCR)
 352                         {
 353                                 // LFCR is very rare on Windows, so we have to double check
 354                                 // that this is not just a LF followed by CRLF
 355                                 if (((countEOLs[static_cast<int>(EOL::CRLF)] > 1) || (countEOLs[static_cast<int>(EOL::LF)] > 1) || (GetCount() < 2)) &&
 356                                         ((i > 2) && (*(pTextBuf+1) == '\n')))
 357                                 {
 358                                         // change the EOL back to a simple LF
 359                                         eEol = EOL::LF;
 360                                 }
 361                         }
 362                         break;
 363                 case 0x000b:
 364                         eEol = EOL::VT;
 365                         break;
 366                 case 0x000c:
 367                         eEol = EOL::FF;
 368                         break;
 369                 case 0x0085:
 370                         eEol = EOL::NEL;
 371                         break;
 372                 case 0x2028:
 373                         eEol = EOL::LS;
 374                         break;
 375                 case 0x2029:
 376                         eEol = EOL::PS;
 377                         break;
 378                 default:
 379                         continue;
 380                 }
 381                 oTextLine.sLine = CString(pLineStart, static_cast<int>(pTextBuf-pLineStart) - 1);
 382                 oTextLine.eEnding = eEol;
 383                 CStdFileLineArray::Add(oTextLine);
 384                 ++countEOLs[static_cast<int>(eEol)];
 385                 if (eEol == EOL::CRLF || eEol == EOL::LFCR)
 386                 {
 387                         ++pTextBuf;
 388                         --i;
 389                 }
 390                 pLineStart = pTextBuf;
 391         }
 392         CString line(pLineStart, static_cast<int>(pTextBuf - pLineStart));
 393         Add(line, EOL::NoEnding);
 394
 395         // some EOLs are not supported by the svn diff lib.
 396         m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::CRLF)] != 0);
 397         m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::FF)] != 0);
 398         m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::VT)] != 0);
 399         m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::NEL)] != 0);
 400         m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::LS)] != 0);
 401         m_bNeedsConversion |= (countEOLs[static_cast<int>(EOL::PS)] != 0);
 402
 403         size_t eolmax = 0;
 404         for (int nEol = 0; nEol < static_cast<int>(EOL::_COUNT); nEol++)
 405         {
 406                 if (eolmax < countEOLs[nEol])
 407                 {
 408                         eolmax = countEOLs[nEol];
 409                         m_SaveParams.m_LineEndings = static_cast<EOL>(nEol);
 410                 }
 411         }
 412
 413         return TRUE;
 414 }
 415
 416 void CFileTextLines::StripWhiteSpace(CString& sLine, DWORD dwIgnoreWhitespaces, bool blame)
 417 {
 418         if (blame)
 419         {
 420                 if (sLine.GetLength() > 66)
 421                         sLine = sLine.Mid(66);
 422         }
 423         switch (dwIgnoreWhitespaces)
 424         {
 425         case 0:
 426                 // Compare whitespaces
 427                 // do nothing
 428                 break;
 429         case 1:
 430                 // Ignore all whitespaces
 431                 sLine.TrimLeft(L" \t");
 432                 sLine.TrimRight(L" \t");
 433                 break;
 434         case 2:
 435                 // Ignore leading whitespace
 436                 sLine.TrimLeft(L" \t");
 437                 break;
 438         case 3:
 439                 // Ignore ending whitespace
 440                 sLine.TrimRight(L" \t");
 441                 break;
 442         }
 443 }
 444
 445 /**
 446         Encoding pattern:
 447                 - encode & save BOM
 448                 - Get Line
 449                 - modify line - whitespaces, lowercase
 450                 - encode & save line
 451                 - get cached encoded eol
 452                 - save eol
 453 */
 454 BOOL CFileTextLines::Save( const CString& sFilePath
 455                                                 , bool bSaveAsUTF8 /*= false */
 456                                                 , bool bUseSVNCompatibleEOLs /*= false */
 457                                                 , DWORD dwIgnoreWhitespaces /*= 0 */
 458                                                 , BOOL bIgnoreCase /*= FALSE */
 459                                                 , bool bBlame /*= false*/
 460                                                 , bool bIgnoreComments /*= false*/
 461                                                 , const CString& linestart /*= CString()*/
 462                                                 , const CString& blockstart /*= CString()*/
 463                                                 , const CString& blockend /*= CString()*/
 464                                                 , const std::wregex& rx /*= std::wregex()*/
 465                                                 , const std::wstring& replacement /*=L""*/)
 466 {
 467         m_sCommentLine = linestart;
 468         m_sCommentBlockStart = blockstart;
 469         m_sCommentBlockEnd = blockend;
 470
 471         try
 472         {
 473                 CString destPath = sFilePath;
 474                 // now make sure that the destination directory exists
 475                 int ind = 0;
 476                 while (destPath.Find('\\', ind)>=2)
 477                 {
 478                         if (!PathIsDirectory(destPath.Left(destPath.Find('\\', ind))))
 479                         {
 480                                 if (!CreateDirectory(destPath.Left(destPath.Find('\\', ind)), nullptr))
 481                                         return FALSE;
 482                         }
 483                         ind = destPath.Find('\\', ind)+1;
 484                 }
 485
 486                 CStdioFile file;                        // Hugely faster than CFile for big file writes - because it uses buffering
 487                 if (!file.Open(sFilePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary | CFile::shareDenyNone))
 488                 {
 489                         m_sErrorString.Format(IDS_ERR_FILE_OPEN, static_cast<LPCWSTR>(sFilePath));
 490                         return FALSE;
 491                 }
 492
 493                 std::unique_ptr<CEncodeFilter> pFilter;
 494                 bool bSaveBom = true;
 495                 CFileTextLines::UnicodeType eUnicodeType = bSaveAsUTF8 ? CFileTextLines::UnicodeType::UTF8 : m_SaveParams.m_UnicodeType;
 496                 switch (eUnicodeType)
 497                 {
 498                 default:
 499                 case CFileTextLines::UnicodeType::ASCII:
 500                         bSaveBom = false;
 501                         pFilter = std::make_unique<CAsciiFilter>(&file);
 502                         break;
 503                 case CFileTextLines::UnicodeType::UTF8:
 504                         bSaveBom = false;
 505                         [[fallthrough]];
 506                 case CFileTextLines::UnicodeType::UTF8BOM:
 507                         pFilter = std::make_unique<CUtf8Filter>(&file);
 508                         break;
 509                 case CFileTextLines::UnicodeType::UTF16_BE:
 510                         bSaveBom = false;
 511                         pFilter = std::make_unique<CUtf16beFilter>(&file);
 512                         break;
 513                 case CFileTextLines::UnicodeType::UTF16_BEBOM:
 514                         pFilter = std::make_unique<CUtf16beFilter>(&file);
 515                         break;
 516                 case CFileTextLines::UnicodeType::UTF16_LE:
 517                         bSaveBom = false;
 518                         pFilter = std::make_unique<CUtf16leFilter>(&file);
 519                         break;
 520                 case CFileTextLines::UnicodeType::UTF16_LEBOM:
 521                         pFilter = std::make_unique<CUtf16leFilter>(&file);
 522                         break;
 523                 case CFileTextLines::UnicodeType::UTF32_BE:
 524                         pFilter = std::make_unique<CUtf32beFilter>(&file);
 525                         break;
 526                 case CFileTextLines::UnicodeType::UTF32_LE:
 527                         pFilter = std::make_unique<CUtf32leFilter>(&file);
 528                         break;
 529                 }
 530
 531                 if (bSaveBom)
 532                 {
 533                         //first write the BOM
 534                         pFilter->Write(L"\xfeff");
 535                 }
 536                 // cache EOLs
 537                 CBuffer oEncodedEol[static_cast<int>(EOL::_COUNT)];
 538                 oEncodedEol[static_cast<int>(EOL::LF)] = pFilter->Encode(L"\n"); // x0a
 539                 oEncodedEol[static_cast<int>(EOL::CR)] = pFilter->Encode(L"\r"); // x0d
 540                 oEncodedEol[static_cast<int>(EOL::CRLF)] = pFilter->Encode(L"\r\n"); // x0d x0a
 541                 if (bUseSVNCompatibleEOLs)
 542                 {
 543                         // when using EOLs that are supported by the svn lib,
 544                         // we have to use the same EOLs as the file has in case
 545                         // they're already supported, but a different supported one
 546                         // in case the original one isn't supported.
 547                         // Only this way the option "ignore EOLs (recommended)" unchecked
 548                         // actually shows the lines as different.
 549                         // However, the diff won't find and differences in EOLs
 550                         // for these special EOLs if they differ between those special ones
 551                         // listed below.
 552                         // But it will work properly for the most common EOLs LF/CR/CRLF.
 553                         oEncodedEol[static_cast<int>(EOL::LFCR)] = oEncodedEol[static_cast<int>(EOL::CR)];
 554                         for (int nEol = 0; nEol < static_cast<int>(EOL::NoEnding); nEol++)
 555                         {
 556                                 if (oEncodedEol[nEol].IsEmpty())
 557                                         oEncodedEol[nEol] = oEncodedEol[static_cast<int>(EOL::LF)];
 558                         }
 559                 }
 560                 else
 561                 {
 562                         oEncodedEol[static_cast<int>(EOL::LFCR)] = pFilter->Encode(L"\n\r");
 563                         oEncodedEol[static_cast<int>(EOL::VT)] = pFilter->Encode(L"\v"); // x0b
 564                         oEncodedEol[static_cast<int>(EOL::FF)] = pFilter->Encode(L"\f"); // x0c
 565                         oEncodedEol[static_cast<int>(EOL::NEL)] = pFilter->Encode(L"\x85");
 566                         oEncodedEol[static_cast<int>(EOL::LS)] = pFilter->Encode(L"\x2028");
 567                         oEncodedEol[static_cast<int>(EOL::PS)] = pFilter->Encode(L"\x2029");
 568                 }
 569                 oEncodedEol[static_cast<int>(EOL::AutoLine)] = oEncodedEol[static_cast<int>(m_SaveParams.m_LineEndings == EOL::AutoLine ? EOL::CRLF : m_SaveParams.m_LineEndings)];
 570
 571                 bool bInBlockComment = false;
 572                 for (int i=0; i<GetCount(); i++)
 573                 {
 574                         CString sLineT = GetAt(i);
 575                         if (bIgnoreComments)
 576                                 bInBlockComment = StripComments(sLineT, bInBlockComment);
 577                         if (!rx._Empty())
 578                                 LineRegex(sLineT, rx, replacement);
 579                         StripWhiteSpace(sLineT, dwIgnoreWhitespaces, bBlame);
 580                         if (bIgnoreCase)
 581                                 sLineT = sLineT.MakeLower();
 582                         pFilter->Write(sLineT);
 583                         EOL eEol = GetLineEnding(i);
 584                         pFilter->Write(oEncodedEol[static_cast<int>(eEol)]);
 585                 }
 586                 file.Close();
 587         }
 588         catch (CException * e)
 589         {
 590                 e->GetErrorMessage(CStrBuf(m_sErrorString, 4096), 4096);
 591                 e->Delete();
 592                 return FALSE;
 593         }
 594         return TRUE;
 595 }
 596
 597 void CFileTextLines::SetErrorString()
 598 {
 599         m_sErrorString = static_cast<LPCWSTR>(CFormatMessageWrapper());
 600 }
 601
 602 void CFileTextLines::CopySettings(CFileTextLines * pFileToCopySettingsTo) const
 603 {
 604         if (pFileToCopySettingsTo)
 605         {
 606                 pFileToCopySettingsTo->m_SaveParams = m_SaveParams;
 607         }
 608 }
 609
 610 const wchar_t * CFileTextLines::GetEncodingName(UnicodeType eEncoding)
 611 {
 612         switch (eEncoding)
 613         {
 614         case UnicodeType::ASCII:
 615                 return L"ASCII";
 616         case UnicodeType::BINARY:
 617                 return L"BINARY";
 618         case UnicodeType::UTF16_LE:
 619                 return L"UTF-16LE";
 620         case UnicodeType::UTF16_LEBOM:
 621                 return L"UTF-16LE BOM";
 622         case UnicodeType::UTF16_BE:
 623                 return L"UTF-16BE";
 624         case UnicodeType::UTF16_BEBOM:
 625                 return L"UTF-16BE BOM";
 626         case UnicodeType::UTF32_LE:
 627                 return L"UTF-32LE";
 628         case UnicodeType::UTF32_BE:
 629                 return L"UTF-32BE";
 630         case UnicodeType::UTF8:
 631                 return L"UTF-8";
 632         case UnicodeType::UTF8BOM:
 633                 return L"UTF-8 BOM";
 634         }
 635         return L"";
 636 }
 637
 638 bool CFileTextLines::IsInsideString(const CString& sLine, int pos)
 639 {
 640         int scount = 0;
 641         int ccount = 0;
 642         auto spos = sLine.Find('"');
 643         while (spos >= 0 && spos < pos)
 644         {
 645                 ++scount;
 646                 spos = sLine.Find('"', spos + 1);
 647         }
 648         auto cpos = sLine.Find('\'');
 649         while (cpos >= 0 && cpos < pos)
 650         {
 651                 ++ccount;
 652                 cpos = sLine.Find('"', cpos + 1);
 653         }
 654         return (scount % 2 != 0 || ccount % 2 != 0);
 655 }
 656
 657 bool CFileTextLines::StripComments( CString& sLine, bool bInBlockComment )
 658 {
 659         int startpos = 0;
 660         int oldStartPos = -1;
 661         do
 662         {
 663                 if (bInBlockComment)
 664                 {
 665                         int endpos = sLine.Find(m_sCommentBlockEnd);
 666                         if (IsInsideString(sLine, endpos))
 667                                 endpos = -1;
 668                         if (endpos >= 0 && (endpos > startpos || endpos == 0))
 669                         {
 670                                 sLine = sLine.Left(startpos) + sLine.Mid(endpos + m_sCommentBlockEnd.GetLength());
 671                                 bInBlockComment = false;
 672                                 startpos = endpos;
 673                         }
 674                         else
 675                         {
 676                                 sLine = sLine.Left(startpos);
 677                                 startpos = -1;
 678                         }
 679                 }
 680                 if (!bInBlockComment)
 681                 {
 682                         startpos = m_sCommentBlockStart.IsEmpty() ? -1 : sLine.Find(m_sCommentBlockStart, startpos);
 683                         int startpos2 = m_sCommentLine.IsEmpty() ? -1 : sLine.Find(m_sCommentLine);
 684                         if ((startpos2 < startpos && startpos2 >= 0) || (startpos2 >= 0 && startpos < 0))
 685                         {
 686                                 // line comment
 687                                 // look if there's a string marker (" or ') before that
 688                                 // note: this check is not fully correct. For example, it
 689                                 // does not account for escaped chars or even multiline strings.
 690                                 // but it has to be fast, so this has to do...
 691                                 if (!IsInsideString(sLine, startpos2))
 692                                 {
 693                                         // line comment, erase the rest of the line
 694                                         sLine = sLine.Left(startpos2);
 695                                         startpos = -1;
 696                                 }
 697                                 if (startpos == oldStartPos)
 698                                         return false;
 699                                 oldStartPos = startpos;
 700                         }
 701                         else if (startpos >= 0)
 702                         {
 703                                 // starting block comment
 704                                 if (!IsInsideString(sLine, startpos))
 705                                         bInBlockComment = true;
 706                                 else
 707                                         ++startpos;
 708                         }
 709                 }
 710         } while (startpos >= 0);
 711
 712         return bInBlockComment;
 713 }
 714
 715 void CFileTextLines::LineRegex( CString& sLine, const std::wregex& rx, const std::wstring& replacement ) const
 716 {
 717         std::wstring str = static_cast<LPCWSTR>(sLine);
 718         std::wstring str2 = std::regex_replace(str, rx, replacement);
 719         sLine = str2.c_str();
 720 }
 721
 722
 723 void CBuffer::ExpandToAtLeast(int nNewSize)
 724 {
 725         ASSERT(nNewSize >= 0);
 726         if (nNewSize>m_nAllocated)
 727         {
 728                 Free(); // we don't preserve buffer content intentionally
 729                 if (INT_MAX - (2048 - 1) >= nNewSize)
 730                 {
 731                         nNewSize += 2048 - 1;
 732                         nNewSize &= ~(1024 - 1);
 733                 }
 734                 else
 735                         nNewSize = INT_MAX;
 736                 m_pBuffer=new BYTE[nNewSize];
 737                 m_nAllocated=nNewSize;
 738         }
 739 }
 740
 741 void CBuffer::SetLength(int nUsed)
 742 {
 743         ASSERT(nUsed >= 0);
 744         ExpandToAtLeast(nUsed);
 745         m_nUsed = nUsed;
 746 }
 747
 748 void CBuffer::Swap(CBuffer& Src) noexcept
 749 {
 750         std::swap(Src.m_nAllocated, m_nAllocated);
 751         std::swap(Src.m_pBuffer, m_pBuffer);
 752         std::swap(Src.m_nUsed, m_nUsed);
 753 }
 754
 755 void CBuffer::Copy(const CBuffer & Src)
 756 {
 757         if (&Src != this)
 758         {
 759                 SetLength(Src.m_nUsed);
 760                 memcpy(m_pBuffer, Src.m_pBuffer, m_nUsed);
 761         }
 762 }
 763
 764
 765 bool CAsciiFilter::Decode(std::unique_ptr<BYTE[]> data, int len)
 766 {
 767         ASSERT(!m_pBuffer);
 768         int nFlags = (m_nCodePage==CP_ACP) ? MB_PRECOMPOSED : 0;
 769         // dry decode is around 8 times faster then real one, alternatively we can set buffer to max length
 770         int nReadChars = MultiByteToWideChar(m_nCodePage, nFlags, reinterpret_cast<LPCSTR>(data.get()), len, nullptr, 0);
 771         if (!nReadChars)
 772                 return false;
 773         m_pBuffer = new wchar_t[nReadChars];
 774         int ret2 = MultiByteToWideChar(m_nCodePage, nFlags, reinterpret_cast<LPCSTR>(data.get()), len, m_pBuffer, nReadChars);
 775         if (ret2 != nReadChars)
 776                 return false;
 777
 778         m_iBufferLength = nReadChars;
 779
 780         return true;
 781 }
 782
 783 const CBuffer& CAsciiFilter::Encode(const CString& s)
 784 {
 785         if (int bufferSize; IntMult(s.GetLength(), 3, &bufferSize) != S_OK || IntAdd(bufferSize, 1, &bufferSize) != S_OK)
 786                 AtlThrow(E_OUTOFMEMORY);
 787         else
 788                 m_oBuffer.SetLength(bufferSize); // set buffer to guessed max size
 789         int nConvertedLen = WideCharToMultiByte(m_nCodePage, 0, static_cast<LPCWSTR>(s), s.GetLength(), static_cast<LPSTR>(m_oBuffer), m_oBuffer.GetLength(), nullptr, nullptr);
 790         m_oBuffer.SetLength(nConvertedLen); // set buffer to used size
 791         return m_oBuffer;
 792 }
 793
 794
 795 bool CUtf16leFilter::Decode(std::unique_ptr<BYTE[]> data, int len)
 796 {
 797         ASSERT(!m_pBuffer);
 798         // we believe data is ok for use
 799         m_deleter = [](void* ptr) { delete[] static_cast<BYTE*>(ptr); };
 800         m_pBuffer = reinterpret_cast<wchar_t*>(data.release());
 801         m_iBufferLength = len / sizeof(wchar_t);
 802         return true;
 803 }
 804
 805 const CBuffer& CUtf16leFilter::Encode(const CString& s)
 806 {
 807         int nNeedBytes;
 808         if (IntMult(s.GetLength(), sizeof(wchar_t), &nNeedBytes) != S_OK)
 809                 AtlThrow(E_OUTOFMEMORY);
 810         m_oBuffer.SetLength(nNeedBytes);
 811         memcpy(static_cast<void*>(m_oBuffer), static_cast<LPCWSTR>(s), nNeedBytes);
 812         return m_oBuffer;
 813 }
 814
 815
 816 bool CUtf16beFilter::Decode(std::unique_ptr<BYTE[]> data, int len)
 817 {
 818         ASSERT(!m_pBuffer);
 819         // make in place WORD BYTEs swap
 820         auto p_qw = static_cast<UINT64*>(static_cast<void*>(data.get()));
 821         int nQwords = len / 8;
 822         for (int nQword = 0; nQword<nQwords; nQword++)
 823         {
 824                 p_qw[nQword] = WordSwapBytes(p_qw[nQword]);
 825         }
 826         auto p_w = reinterpret_cast<wchar_t*>(p_qw);
 827         int nWords = len / 2;
 828         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 829         {
 830                 p_w[nWord] = WideCharSwap(p_w[nWord]);
 831         }
 832         return CUtf16leFilter::Decode(std::move(data), len);
 833 }
 834
 835 const CBuffer& CUtf16beFilter::Encode(const CString& s)
 836 {
 837         int nNeedBytes;
 838         if (IntMult(s.GetLength(), sizeof(wchar_t), &nNeedBytes) != S_OK)
 839                 AtlThrow(E_OUTOFMEMORY);
 840         m_oBuffer.SetLength(nNeedBytes);
 841         // copy swaping BYTE order in WORDs
 842         auto p_qwIn = reinterpret_cast<const UINT64*>(static_cast<LPCWSTR>(s));
 843         auto p_qwOut = static_cast<UINT64*>(static_cast<void*>(m_oBuffer));
 844         int nQwords = nNeedBytes/8;
 845         for (int nQword = 0; nQword<nQwords; nQword++)
 846         {
 847                 p_qwOut[nQword] = WordSwapBytes(p_qwIn[nQword]);
 848         }
 849         auto p_wIn = reinterpret_cast<const wchar_t*>(p_qwIn);
 850         auto p_wOut = reinterpret_cast<wchar_t*>(p_qwOut);
 851         int nWords = nNeedBytes/2;
 852         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 853         {
 854                 p_wOut[nWord] = WideCharSwap(p_wIn[nWord]);
 855         }
 856         return m_oBuffer;
 857 }
 858
 859
 860 bool CUtf32leFilter::Decode(std::unique_ptr<BYTE[]> data, int len)
 861 {
 862         ASSERT(!m_pBuffer);
 863         // UTF32 have four bytes per char
 864         int nReadChars = len / 4;
 865         auto p32 = static_cast<UINT32*>(static_cast<void*>(data.get()));
 866
 867         // count chars which needs surrogate pair
 868         int nSurrogatePairCount = 0;
 869         for (int i = 0; i<nReadChars; ++i)
 870         {
 871                 if (p32[i]<0x110000 && p32[i]>=0x10000)
 872                 {
 873                         ++nSurrogatePairCount;
 874                 }
 875         }
 876
 877         // fill buffer
 878         if (int bufferSize; IntAdd(nReadChars, nSurrogatePairCount, &bufferSize) != S_OK)
 879                 AtlThrow(E_OUTOFMEMORY);
 880         else
 881                 m_pBuffer = new wchar_t[bufferSize]; // set buffer to guessed max size
 882         auto pOut = m_pBuffer;
 883         for (int i = 0; i<nReadChars; ++i, ++pOut)
 884         {
 885                 UINT32 zChar = p32[i];
 886                 if (zChar>=0x110000)
 887                 {
 888                         *pOut=0xfffd; // ? mark
 889                 }
 890                 else if (zChar>=0x10000)
 891                 {
 892                         zChar-=0x10000;
 893                         pOut[0] = ((zChar>>10)&0x3ff) | 0xd800; // lead surrogate
 894                         pOut[1] = (zChar&0x7ff) | 0xdc00; // trail surrogate
 895                         pOut++;
 896                 }
 897                 else
 898                 {
 899                         *pOut = static_cast<wchar_t>(zChar);
 900                 }
 901         }
 902         m_iBufferLength = nReadChars;
 903         return true;
 904 }
 905
 906 const CBuffer& CUtf32leFilter::Encode(const CString& s)
 907 {
 908         int nInWords = s.GetLength();
 909         if (int bufferSize; IntMult(nInWords, 2, &bufferSize) != S_OK)
 910                 AtlThrow(E_OUTOFMEMORY);
 911         else
 912                 m_oBuffer.SetLength(bufferSize);
 913
 914         auto p_In = static_cast<LPCWSTR>(s);
 915         auto p_Out = static_cast<UINT32*>(static_cast<void*>(m_oBuffer));
 916         int nOutDword = 0;
 917         for (int nInWord = 0; nInWord<nInWords; nInWord++, nOutDword++)
 918         {
 919                 UINT32 zChar = p_In[nInWord];
 920                 if ((zChar&0xfc00) == 0xd800) // lead surrogate
 921                 {
 922                         if (nInWord+1<nInWords && (p_In[nInWord+1]&0xfc00) == 0xdc00) // trail surrogate follows
 923                         {
 924                                 zChar = 0x10000 + ((zChar&0x3ff)<<10) + (p_In[++nInWord]&0x3ff);
 925                         }
 926                         else
 927                         {
 928                                 zChar = 0xfffd; // ? mark
 929                         }
 930                 }
 931                 else if ((zChar&0xfc00) == 0xdc00) // trail surrogate without lead
 932                 {
 933                         zChar = 0xfffd; // ? mark
 934                 }
 935                 p_Out[nOutDword] = zChar;
 936         }
 937         if (int bufferSize; IntMult(nOutDword, 4, &bufferSize) != S_OK)
 938                 AtlThrow(E_OUTOFMEMORY);
 939         else
 940                 m_oBuffer.SetLength(bufferSize); // store length reduced by surrogates
 941         return m_oBuffer;
 942 }
 943
 944
 945 bool CUtf32beFilter::Decode(std::unique_ptr<BYTE[]> data, int len)
 946 {
 947         // swap BYTEs order in DWORDs
 948         auto p64 = static_cast<UINT64*>(static_cast<void*>(data.get()));
 949         int nQwords = len / 8;
 950         for (int nQword = 0; nQword<nQwords; nQword++)
 951         {
 952                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 953         }
 954
 955         auto p32 = reinterpret_cast<UINT32*>(p64);
 956         int nDwords = len / 4;
 957         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 958         {
 959                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 960         }
 961         return CUtf32leFilter::Decode(std::move(data), len);
 962 }
 963
 964 const CBuffer& CUtf32beFilter::Encode(const CString& s)
 965 {
 966         CUtf32leFilter::Encode(s);
 967
 968         // swap BYTEs order in DWORDs
 969         auto p64 = static_cast<UINT64*>(static_cast<void*>(m_oBuffer));
 970         int nQwords = m_oBuffer.GetLength()/8;
 971         for (int nQword = 0; nQword<nQwords; nQword++)
 972         {
 973                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 974         }
 975
 976         auto p32 = reinterpret_cast<UINT32*>(p64);
 977         int nDwords = m_oBuffer.GetLength()/4;
 978         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 979         {
 980                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 981         }
 982         return m_oBuffer;
 983 }