src/TortoiseMerge/FileTextLines.cpp

   1 // TortoiseGitMerge - a Diff/Patch program
   2
   3 // Copyright (C) 2016 - TortoiseGit
   4 // Copyright (C) 2007-2016 - TortoiseSVN
   5
   6 // This program is free software; you can redistribute it and/or
   7 // modify it under the terms of the GNU General Public License
   8 // as published by the Free Software Foundation; either version 2
   9 // of the License, or (at your option) any later version.
  10
  11 // This program is distributed in the hope that it will be useful,
  12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 // GNU General Public License for more details.
  15
  16 // You should have received a copy of the GNU General Public License
  17 // along with this program; if not, write to the Free Software Foundation,
  18 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19 //
  20 #include "stdafx.h"
  21 #include "resource.h"
  22 #include "UnicodeUtils.h"
  23 #include "registry.h"
  24 #include "FileTextLines.h"
  25 #include "FormatMessageWrapper.h"
  26 #include "SmartHandle.h"
  27
  28 wchar_t inline WideCharSwap(wchar_t nValue)
  29 {
  30         return (((nValue>> 8)) | (nValue << 8));
  31         //return _byteswap_ushort(nValue);
  32 }
  33
  34 UINT64 inline WordSwapBytes(UINT64 nValue)
  35 {
  36         return ((nValue&0xff00ff00ff00ff)<<8) | ((nValue>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  37 }
  38
  39 UINT32 inline DwordSwapBytes(UINT32 nValue)
  40 {
  41         UINT32 nRet = (nValue<<16) | (nValue>>16); // swap WORDs
  42         nRet = ((nRet&0xff00ff)<<8) | ((nRet>>8)&0xff00ff); // swap BYTESs in WORDs
  43         return nRet;
  44         //return _byteswap_ulong(nValue);
  45 }
  46
  47 UINT64 inline DwordSwapBytes(UINT64 nValue)
  48 {
  49         UINT64 nRet = ((nValue&0xffff0000ffffL)<<16) | ((nValue>>16)&0xffff0000ffffL); // swap WORDs in DWORDs
  50         nRet = ((nRet&0xff00ff00ff00ff)<<8) | ((nRet>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  51         return nRet;
  52 }
  53
  54 CFileTextLines::CFileTextLines(void)
  55         : m_bNeedsConversion(false)
  56         , m_bKeepEncoding(false)
  57 {
  58         m_SaveParams.m_UnicodeType = CFileTextLines::AUTOTYPE;
  59         m_SaveParams.m_LineEndings = EOL_AUTOLINE;
  60 }
  61
  62 CFileTextLines::~CFileTextLines(void)
  63 {
  64 }
  65
  66 CFileTextLines::UnicodeType CFileTextLines::CheckUnicodeType(LPVOID pBuffer, int cb)
  67 {
  68         if (cb < 2)
  69                 return CFileTextLines::ASCII;
  70         const UINT32 * const pVal32 = (UINT32 *)pBuffer;
  71         const UINT16 * const pVal16 = (UINT16 *)pBuffer;
  72         const UINT8 * const pVal8 = (UINT8 *)pBuffer;
  73         // scan the whole buffer for a 0x00000000 sequence
  74         // if found, we assume a binary file
  75         int nDwords = cb/4;
  76         for (int j=0; j<nDwords; ++j)
  77         {
  78                 if (0x00000000 == pVal32[j])
  79                         return CFileTextLines::BINARY;
  80         }
  81         if (cb >=4 )
  82         {
  83                 if (*pVal32 == 0x0000FEFF)
  84                 {
  85                         return CFileTextLines::UTF32_LE;
  86                 }
  87                 if (*pVal32 == 0xFFFE0000)
  88                 {
  89                         return CFileTextLines::UTF32_BE;
  90                 }
  91         }
  92         if (*pVal16 == 0xFEFF)
  93         {
  94                 return CFileTextLines::UTF16_LEBOM;
  95         }
  96         if (*pVal16 == 0xFFFE)
  97         {
  98                 return CFileTextLines::UTF16_BEBOM;
  99         }
 100         if (cb < 3)
 101                 return CFileTextLines::ASCII;
 102         if (*pVal16 == 0xBBEF)
 103         {
 104                 if (pVal8[2] == 0xBF)
 105                         return CFileTextLines::UTF8BOM;
 106         }
 107         // check for illegal UTF8 sequences
 108         bool bNonANSI = false;
 109         int nNeedData = 0;
 110         int i=0;
 111         int nullcount = 0;
 112         for (; i < cb; ++i)
 113         {
 114                 if (pVal8[i] == 0)
 115                 {
 116                         ++nullcount;
 117                         // count the null chars, we do not want to treat an ASCII/UTF8 file
 118                         // as UTF16 just because of some null chars that might be accidentally
 119                         // in the file.
 120                         // Use an arbitrary value of one fiftieth of the file length as
 121                         // the limit after which a file is considered UTF16.
 122                         if (nullcount >(cb / 50))
 123                         {
 124                                 // null-chars are not allowed for ASCII or UTF8, that means
 125                                 // this file is most likely UTF16 encoded
 126                                 if (i % 2)
 127                                         return CFileTextLines::UTF16_LE;
 128                                 else
 129                                         return CFileTextLines::UTF16_BE;
 130                         }
 131                 }
 132                 if ((pVal8[i] & 0x80) != 0) // non ASCII
 133                 {
 134                         bNonANSI = true;
 135                         break;
 136                 }
 137         }
 138         // check remaining text for UTF-8 validity
 139         for (; i<cb; ++i)
 140         {
 141                 UINT8 zChar = pVal8[i];
 142                 if ((zChar & 0x80)==0) // Ascii
 143                 {
 144                         if (zChar == 0)
 145                         {
 146                                 ++nullcount;
 147                                 // count the null chars, we do not want to treat an ASCII/UTF8 file
 148                                 // as UTF16 just because of some null chars that might be accidentally
 149                                 // in the file.
 150                                 // Use an arbitrary value of one fiftieth of the file length as
 151                                 // the limit after which a file is considered UTF16.
 152                                 if (nullcount > (cb / 50))
 153                                 {
 154                                         // null-chars are not allowed for ASCII or UTF8, that means
 155                                         // this file is most likely UTF16 encoded
 156                                         if (i%2)
 157                                                 return CFileTextLines::UTF16_LE;
 158                                         else
 159                                                 return CFileTextLines::UTF16_BE;
 160                                 }
 161                                 nNeedData = 0;
 162                         }
 163                         else if (nNeedData)
 164                         {
 165                                 return CFileTextLines::ASCII;
 166                         }
 167                         continue;
 168                 }
 169                 if ((zChar & 0x40)==0) // top bit
 170                 {
 171                         if (!nNeedData)
 172                                 return CFileTextLines::ASCII;
 173                         --nNeedData;
 174                 }
 175                 else if (nNeedData)
 176                 {
 177                         return CFileTextLines::ASCII;
 178                 }
 179                 else if ((zChar & 0x20)==0) // top two bits
 180                 {
 181                         if (zChar<=0xC1)
 182                                 return CFileTextLines::ASCII;
 183                         nNeedData = 1;
 184                 }
 185                 else if ((zChar & 0x10)==0) // top three bits
 186                 {
 187                         nNeedData = 2;
 188                 }
 189                 else if ((zChar & 0x08)==0) // top four bits
 190                 {
 191                         if (zChar>=0xf5)
 192                                 return CFileTextLines::ASCII;
 193                         nNeedData = 3;
 194                 }
 195                 else
 196                         return CFileTextLines::ASCII;
 197         }
 198         if (bNonANSI && nNeedData==0)
 199                 // if get here thru nonAscii and no missing data left then its valid UTF8
 200                 return CFileTextLines::UTF8;
 201         if (!bNonANSI && (DWORD(CRegDWORD(L"Software\\TortoiseGitMerge\\UseUTF8", FALSE))))
 202                 return CFileTextLines::UTF8;
 203         return CFileTextLines::ASCII;
 204 }
 205
 206
 207 BOOL CFileTextLines::Load(const CString& sFilePath, int lengthHint /* = 0*/)
 208 {
 209         WCHAR exceptionError[1000] = {0};
 210         m_SaveParams.m_LineEndings = EOL_AUTOLINE;
 211         if (!m_bKeepEncoding)
 212                 m_SaveParams.m_UnicodeType = CFileTextLines::AUTOTYPE;
 213         RemoveAll();
 214         if(lengthHint != 0)
 215         {
 216                 Reserve(lengthHint);
 217         }
 218
 219         if (PathIsDirectory(sFilePath))
 220         {
 221                 m_sErrorString.Format(IDS_ERR_FILE_NOTAFILE, (LPCTSTR)sFilePath);
 222                 return FALSE;
 223         }
 224
 225         if (!PathFileExists(sFilePath))
 226         {
 227                 //file does not exist, so just return SUCCESS
 228                 return TRUE;
 229         }
 230
 231         CAutoFile hFile = CreateFile(sFilePath, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_DELETE | FILE_SHARE_WRITE, nullptr, OPEN_EXISTING, 0, nullptr);
 232         if (!hFile)
 233         {
 234                 SetErrorString();
 235                 return FALSE;
 236         }
 237
 238         LARGE_INTEGER fsize;
 239         if (!GetFileSizeEx(hFile, &fsize))
 240         {
 241                 SetErrorString();
 242                 return FALSE;
 243         }
 244         if (fsize.HighPart)
 245         {
 246                 // file is way too big for us
 247                 m_sErrorString.LoadString(IDS_ERR_FILE_TOOBIG);
 248                 return FALSE;
 249         }
 250
 251         // create buffer
 252         // If new[] was done for type T delete[] must be called on a pointer of type T*,
 253         // otherwise the behavior is undefined.
 254         // +1 is to address possible truncation when integer division is done
 255         CBuffer oFile;
 256         try
 257         {
 258                 oFile.SetLength(fsize.LowPart);
 259         }
 260         catch (CMemoryException* e)
 261         {
 262                 e->GetErrorMessage(exceptionError, _countof(exceptionError));
 263                 m_sErrorString = exceptionError;
 264                 return FALSE;
 265         }
 266
 267         // load file
 268         DWORD dwReadBytes = 0;
 269         if (!ReadFile(hFile, (void*)oFile, fsize.LowPart, &dwReadBytes, nullptr))
 270         {
 271                 SetErrorString();
 272                 return FALSE;
 273         }
 274         hFile.CloseHandle();
 275
 276         // detect type
 277         if (m_SaveParams.m_UnicodeType == CFileTextLines::AUTOTYPE)
 278         {
 279                 m_SaveParams.m_UnicodeType = this->CheckUnicodeType((LPVOID)oFile, dwReadBytes);
 280         }
 281         // enforce conversion for all but ASCII and UTF8 type
 282         m_bNeedsConversion = (m_SaveParams.m_UnicodeType != CFileTextLines::UTF8) && (m_SaveParams.m_UnicodeType != CFileTextLines::ASCII);
 283
 284         // we may have to convert the file content - CString is UTF16LE
 285         try
 286         {
 287                 CBaseFilter* pFilter = nullptr;
 288                 switch (m_SaveParams.m_UnicodeType)
 289                 {
 290                 case BINARY:
 291                         m_sErrorString.Format(IDS_ERR_FILE_BINARY, (LPCTSTR)sFilePath);
 292                         return FALSE;
 293                 case UTF8:
 294                 case UTF8BOM:
 295                         pFilter = new CUtf8Filter(nullptr);
 296                         break;
 297                 default:
 298                 case ASCII:
 299                         pFilter = new CAsciiFilter(nullptr);
 300                         break;
 301                 case UTF16_BE:
 302                 case UTF16_BEBOM:
 303                         pFilter = new CUtf16beFilter(nullptr);
 304                         break;
 305                 case UTF16_LE:
 306                 case UTF16_LEBOM:
 307                         pFilter = new CUtf16leFilter(nullptr);
 308                         break;
 309                 case UTF32_BE:
 310                         pFilter = new CUtf32beFilter(nullptr);
 311                         break;
 312                 case UTF32_LE:
 313                         pFilter = new CUtf32leFilter(nullptr);
 314                         break;
 315                 }
 316                 pFilter->Decode(oFile);
 317                 delete pFilter;
 318         }
 319         catch (CMemoryException* e)
 320         {
 321                 e->GetErrorMessage(exceptionError, _countof(exceptionError));
 322                 m_sErrorString = exceptionError;
 323                 return FALSE;
 324         }
 325
 326         int nReadChars=oFile.GetLength()/sizeof(wchar_t);
 327         wchar_t * pTextBuf = (wchar_t *)oFile;
 328         wchar_t * pLineStart = pTextBuf;
 329         if ((m_SaveParams.m_UnicodeType == UTF8BOM)
 330                 || (m_SaveParams.m_UnicodeType == UTF16_LEBOM)
 331                 || (m_SaveParams.m_UnicodeType == UTF16_BEBOM)
 332                 || (m_SaveParams.m_UnicodeType == UTF32_LE)
 333                 || (m_SaveParams.m_UnicodeType == UTF32_BE))
 334         {
 335                 // ignore the BOM
 336                 ++pTextBuf;
 337                 ++pLineStart;
 338                 --nReadChars;
 339         }
 340
 341         // fill in the lines into the array
 342         size_t countEOLs[EOL__COUNT] = { 0 };
 343         CFileTextLine oTextLine;
 344         for (int i = nReadChars; i; --i)
 345         {
 346                 EOL eEol;
 347                 switch (*pTextBuf++)
 348                 {
 349                 case '\r':
 350                         // crlf line ending or cr line ending
 351                         eEol = ((i > 1) && *(pTextBuf) == '\n') ? EOL_CRLF : EOL_CR;
 352                         break;
 353                 case '\n':
 354                         // lfcr line ending or lf line ending
 355                         eEol = ((i > 1) && *(pTextBuf) == '\r') ? EOL_LFCR : EOL_LF;
 356                         if (eEol == EOL_LFCR)
 357                         {
 358                                 // LFCR is very rare on Windows, so we have to double check
 359                                 // that this is not just a LF followed by CRLF
 360                                 if (((countEOLs[EOL_CRLF] > 1) || (countEOLs[EOL_LF] > 1) || (GetCount() < 2)) &&
 361                                         ((i > 2) && (*(pTextBuf+1) == '\n')))
 362                                 {
 363                                         // change the EOL back to a simple LF
 364                                         eEol = EOL_LF;
 365                                 }
 366                         }
 367                         break;
 368                 case 0x000b:
 369                         eEol = EOL_VT;
 370                         break;
 371                 case 0x000c:
 372                         eEol = EOL_FF;
 373                         break;
 374                 case 0x0085:
 375                         eEol = EOL_NEL;
 376                         break;
 377                 case 0x2028:
 378                         eEol = EOL_LS;
 379                         break;
 380                 case 0x2029:
 381                         eEol = EOL_PS;
 382                         break;
 383                 default:
 384                         continue;
 385                 }
 386                 oTextLine.sLine = CString(pLineStart, (int)(pTextBuf-pLineStart)-1);
 387                 oTextLine.eEnding = eEol;
 388                 CStdFileLineArray::Add(oTextLine);
 389                 ++countEOLs[eEol];
 390                 if (eEol==EOL_CRLF || eEol==EOL_LFCR)
 391                 {
 392                         ++pTextBuf;
 393                         --i;
 394                 }
 395                 pLineStart = pTextBuf;
 396         }
 397         CString line(pLineStart, (int)(pTextBuf-pLineStart));
 398         Add(line, EOL_NOENDING);
 399
 400         // some EOLs are not supported by the svn diff lib.
 401         m_bNeedsConversion |= (countEOLs[EOL_CRLF]!=0);
 402         m_bNeedsConversion |= (countEOLs[EOL_FF]!=0);
 403         m_bNeedsConversion |= (countEOLs[EOL_VT]!=0);
 404         m_bNeedsConversion |= (countEOLs[EOL_NEL]!=0);
 405         m_bNeedsConversion |= (countEOLs[EOL_LS]!=0);
 406         m_bNeedsConversion |= (countEOLs[EOL_PS]!=0);
 407
 408         size_t eolmax = 0;
 409         for (int nEol = 0; nEol<EOL__COUNT; nEol++)
 410         {
 411                 if (eolmax < countEOLs[nEol])
 412                 {
 413                         eolmax = countEOLs[nEol];
 414                         m_SaveParams.m_LineEndings = (EOL)nEol;
 415                 }
 416         }
 417
 418         return TRUE;
 419 }
 420
 421 void CFileTextLines::StripWhiteSpace(CString& sLine, DWORD dwIgnoreWhitespaces, bool blame)
 422 {
 423         if (blame)
 424         {
 425                 if (sLine.GetLength() > 66)
 426                         sLine = sLine.Mid(66);
 427         }
 428         switch (dwIgnoreWhitespaces)
 429         {
 430         case 0:
 431                 // Compare whitespaces
 432                 // do nothing
 433                 break;
 434         case 1:
 435                 // Ignore all whitespaces
 436                 sLine.TrimLeft(L" \t");
 437                 sLine.TrimRight(L" \t");
 438                 break;
 439         case 2:
 440                 // Ignore leading whitespace
 441                 sLine.TrimLeft(L" \t");
 442                 break;
 443         case 3:
 444                 // Ignore ending whitespace
 445                 sLine.TrimRight(L" \t");
 446                 break;
 447         }
 448 }
 449
 450 /**
 451         Encoding pattern:
 452                 - encode & save BOM
 453                 - Get Line
 454                 - modify line - whitespaces, lowercase
 455                 - encode & save line
 456                 - get cached encoded eol
 457                 - save eol
 458 */
 459 BOOL CFileTextLines::Save( const CString& sFilePath
 460                                                 , bool bSaveAsUTF8 /*= false */
 461                                                 , bool bUseSVNCompatibleEOLs /*= false */
 462                                                 , DWORD dwIgnoreWhitespaces /*= 0 */
 463                                                 , BOOL bIgnoreCase /*= FALSE */
 464                                                 , bool bBlame /*= false*/
 465                                                 , bool bIgnoreComments /*= false*/
 466                                                 , const CString& linestart /*= CString()*/
 467                                                 , const CString& blockstart /*= CString()*/
 468                                                 , const CString& blockend /*= CString()*/
 469                                                 , const std::wregex& rx /*= std::wregex(L"")*/
 470                                                 , const std::wstring& replacement /*=L""*/)
 471 {
 472         m_sCommentLine = linestart;
 473         m_sCommentBlockStart = blockstart;
 474         m_sCommentBlockEnd = blockend;
 475
 476         try
 477         {
 478                 CString destPath = sFilePath;
 479                 // now make sure that the destination directory exists
 480                 int ind = 0;
 481                 while (destPath.Find('\\', ind)>=2)
 482                 {
 483                         if (!PathIsDirectory(destPath.Left(destPath.Find('\\', ind))))
 484                         {
 485                                 if (!CreateDirectory(destPath.Left(destPath.Find('\\', ind)), nullptr))
 486                                         return FALSE;
 487                         }
 488                         ind = destPath.Find('\\', ind)+1;
 489                 }
 490
 491                 CStdioFile file;                        // Hugely faster than CFile for big file writes - because it uses buffering
 492                 if (!file.Open(sFilePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary | CFile::shareDenyNone))
 493                 {
 494                         const_cast<CString *>(&m_sErrorString)->Format(IDS_ERR_FILE_OPEN, (LPCTSTR)sFilePath);
 495                         return FALSE;
 496                 }
 497
 498                 CBaseFilter* pFilter = nullptr;
 499                 bool bSaveBom = true;
 500                 CFileTextLines::UnicodeType eUnicodeType = bSaveAsUTF8 ? CFileTextLines::UTF8 : m_SaveParams.m_UnicodeType;
 501                 switch (eUnicodeType)
 502                 {
 503                 default:
 504                 case CFileTextLines::ASCII:
 505                         bSaveBom = false;
 506                         pFilter = new CAsciiFilter(&file);
 507                         break;
 508                 case CFileTextLines::UTF8:
 509                         bSaveBom = false;
 510                 case CFileTextLines::UTF8BOM:
 511                         pFilter = new CUtf8Filter(&file);
 512                         break;
 513                 case CFileTextLines::UTF16_BE:
 514                         bSaveBom = false;
 515                         pFilter = new CUtf16beFilter(&file);
 516                         break;
 517                 case CFileTextLines::UTF16_BEBOM:
 518                         pFilter = new CUtf16beFilter(&file);
 519                         break;
 520                 case CFileTextLines::UTF16_LE:
 521                         bSaveBom = false;
 522                         pFilter = new CUtf16leFilter(&file);
 523                         break;
 524                 case CFileTextLines::UTF16_LEBOM:
 525                         pFilter = new CUtf16leFilter(&file);
 526                         break;
 527                 case CFileTextLines::UTF32_BE:
 528                         pFilter = new CUtf32beFilter(&file);
 529                         break;
 530                 case CFileTextLines::UTF32_LE:
 531                         pFilter = new CUtf32leFilter(&file);
 532                         break;
 533                 }
 534
 535                 if (bSaveBom)
 536                 {
 537                         //first write the BOM
 538                         pFilter->Write(L"\xfeff");
 539                 }
 540                 // cache EOLs
 541                 CBuffer oEncodedEol[EOL__COUNT];
 542                 oEncodedEol[EOL_LF] = pFilter->Encode(L"\n"); // x0a
 543                 oEncodedEol[EOL_CR] = pFilter->Encode(L"\r"); // x0d
 544                 oEncodedEol[EOL_CRLF] = pFilter->Encode(L"\r\n"); // x0d x0a
 545                 if (bUseSVNCompatibleEOLs)
 546                 {
 547                         // when using EOLs that are supported by the svn lib,
 548                         // we have to use the same EOLs as the file has in case
 549                         // they're already supported, but a different supported one
 550                         // in case the original one isn't supported.
 551                         // Only this way the option "ignore EOLs (recommended)" unchecked
 552                         // actually shows the lines as different.
 553                         // However, the diff won't find and differences in EOLs
 554                         // for these special EOLs if they differ between those special ones
 555                         // listed below.
 556                         // But it will work properly for the most common EOLs LF/CR/CRLF.
 557                         oEncodedEol[EOL_LFCR] = oEncodedEol[EOL_CR];
 558                         for (int nEol = 0; nEol<EOL_NOENDING; nEol++)
 559                         {
 560                                 if (oEncodedEol[nEol].IsEmpty())
 561                                         oEncodedEol[nEol] = oEncodedEol[EOL_LF];
 562                         }
 563                 }
 564                 else
 565                 {
 566                         oEncodedEol[EOL_LFCR] = pFilter->Encode(L"\n\r");
 567                         oEncodedEol[EOL_VT] = pFilter->Encode(L"\v"); // x0b
 568                         oEncodedEol[EOL_FF] = pFilter->Encode(L"\f"); // x0c
 569                         oEncodedEol[EOL_NEL] = pFilter->Encode(L"\x85");
 570                         oEncodedEol[EOL_LS] = pFilter->Encode(L"\x2028");
 571                         oEncodedEol[EOL_PS] = pFilter->Encode(L"\x2029");
 572                 }
 573                 oEncodedEol[EOL_AUTOLINE] = oEncodedEol[m_SaveParams.m_LineEndings==EOL_AUTOLINE
 574                                 ? EOL_CRLF
 575                                 : m_SaveParams.m_LineEndings];
 576
 577                 bool bInBlockComment = false;
 578                 for (int i=0; i<GetCount(); i++)
 579                 {
 580                         CString sLineT = GetAt(i);
 581                         if (bIgnoreComments)
 582                                 bInBlockComment = StripComments(sLineT, bInBlockComment);
 583                         if (!rx._Empty())
 584                                 LineRegex(sLineT, rx, replacement);
 585                         StripWhiteSpace(sLineT, dwIgnoreWhitespaces, bBlame);
 586                         if (bIgnoreCase)
 587                                 sLineT = sLineT.MakeLower();
 588                         pFilter->Write(sLineT);
 589                         EOL eEol = GetLineEnding(i);
 590                         pFilter->Write(oEncodedEol[eEol]);
 591                 }
 592                 delete pFilter;
 593                 file.Close();
 594         }
 595         catch (CException * e)
 596         {
 597                 CString * psErrorString = const_cast<CString *>(&m_sErrorString);
 598                 e->GetErrorMessage(psErrorString->GetBuffer(4096), 4096);
 599                 psErrorString->ReleaseBuffer();
 600                 e->Delete();
 601                 return FALSE;
 602         }
 603         return TRUE;
 604 }
 605
 606 void CFileTextLines::SetErrorString()
 607 {
 608         m_sErrorString = CFormatMessageWrapper();
 609 }
 610
 611 void CFileTextLines::CopySettings(CFileTextLines * pFileToCopySettingsTo) const
 612 {
 613         if (pFileToCopySettingsTo)
 614         {
 615                 pFileToCopySettingsTo->m_SaveParams = m_SaveParams;
 616         }
 617 }
 618
 619 const wchar_t * CFileTextLines::GetEncodingName(UnicodeType eEncoding)
 620 {
 621         switch (eEncoding)
 622         {
 623         case ASCII:
 624                 return L"ASCII";
 625         case BINARY:
 626                 return L"BINARY";
 627         case UTF16_LE:
 628                 return L"UTF-16LE";
 629         case UTF16_LEBOM:
 630                 return L"UTF-16LE BOM";
 631         case UTF16_BE:
 632                 return L"UTF-16BE";
 633         case UTF16_BEBOM:
 634                 return L"UTF-16BE BOM";
 635         case UTF32_LE:
 636                 return L"UTF-32LE";
 637         case UTF32_BE:
 638                 return L"UTF-32BE";
 639         case UTF8:
 640                 return L"UTF-8";
 641         case UTF8BOM:
 642                 return L"UTF-8 BOM";
 643         }
 644         return L"";
 645 }
 646
 647 bool CFileTextLines::StripComments( CString& sLine, bool bInBlockComment )
 648 {
 649         int startpos = 0;
 650         int oldStartPos = -1;
 651         do
 652         {
 653                 if (bInBlockComment)
 654                 {
 655                         int endpos = sLine.Find(m_sCommentBlockEnd);
 656                         if (endpos >= 0)
 657                         {
 658                                 sLine = sLine.Left(startpos) + sLine.Mid(endpos+m_sCommentBlockEnd.GetLength());
 659                                 bInBlockComment = false;
 660                         }
 661                         else
 662                         {
 663                                 sLine = sLine.Left(startpos);
 664                                 startpos = -1;
 665                         }
 666                 }
 667                 if (!bInBlockComment)
 668                 {
 669                         startpos = m_sCommentBlockStart.IsEmpty() ? -1 : sLine.Find(m_sCommentBlockStart);
 670                         int startpos2 = m_sCommentLine.IsEmpty() ? -1 : sLine.Find(m_sCommentLine);
 671                         if ( ((startpos2 < startpos) && (startpos2 >= 0)) ||
 672                                  ((startpos2 >= 0) && (startpos < 0)) )
 673                         {
 674                                 // line comment
 675                                 // look if there's a string marker (" or ') before that
 676                                 // note: this check is not fully correct. For example, it
 677                                 // does not account for escaped chars or even multiline strings.
 678                                 // but it has to be fast, so this has to do...
 679                                 int scount = 0;
 680                                 int ccount = 0;
 681                                 auto spos = sLine.Find('"');
 682                                 while ((spos >= 0) && (spos < startpos2))
 683                                 {
 684                                         ++scount;
 685                                         spos = sLine.Find('"', spos + 1);
 686                                 }
 687                                 auto cpos = sLine.Find('\'');
 688                                 while ((cpos >= 0) && (cpos < startpos2))
 689                                 {
 690                                         ++ccount;
 691                                         cpos = sLine.Find('"', cpos + 1);
 692                                 }
 693                                 if ((scount % 2 == 0) && (ccount % 2 == 0))
 694                                 {
 695                                         // line comment, erase the rest of the line
 696                                         sLine = sLine.Left(startpos2);
 697                                         startpos = -1;
 698                                 }
 699                                 if (startpos == oldStartPos)
 700                                         return false;
 701                                 oldStartPos = startpos;
 702                         }
 703                         else if (startpos >= 0)
 704                         {
 705                                 // starting block comment
 706                                 bInBlockComment = true;
 707                         }
 708                 }
 709         } while (startpos >= 0);
 710
 711         return bInBlockComment;
 712 }
 713
 714 void CFileTextLines::LineRegex( CString& sLine, const std::wregex& rx, const std::wstring& replacement ) const
 715 {
 716         std::wstring str = (LPCTSTR)sLine;
 717         std::wstring str2 = std::regex_replace(str, rx, replacement);
 718         sLine = str2.c_str();
 719 }
 720
 721
 722 void CBuffer::ExpandToAtLeast(int nNewSize)
 723 {
 724         if (nNewSize>m_nAllocated)
 725         {
 726                 delete [] m_pBuffer; // we don't preserve buffer content intentionally
 727                 nNewSize+=2048-1;
 728                 nNewSize&=~(1024-1);
 729                 m_pBuffer=new BYTE[nNewSize];
 730                 m_nAllocated=nNewSize;
 731         }
 732 }
 733
 734 void CBuffer::SetLength(int nUsed)
 735 {
 736         ExpandToAtLeast(nUsed);
 737         m_nUsed = nUsed;
 738 }
 739
 740 void CBuffer::Swap(CBuffer & Src)
 741 {
 742         std::swap(Src.m_nAllocated, m_nAllocated);
 743         std::swap(Src.m_pBuffer, m_pBuffer);
 744         std::swap(Src.m_nUsed, m_nUsed);
 745 }
 746
 747 void CBuffer::Copy(const CBuffer & Src)
 748 {
 749         if (&Src != this)
 750         {
 751                 SetLength(Src.m_nUsed);
 752                 memcpy(m_pBuffer, Src.m_pBuffer, m_nUsed);
 753         }
 754 }
 755
 756
 757
 758 bool CBaseFilter::Decode(/*in out*/ CBuffer & data)
 759 {
 760         int nFlags = (m_nCodePage==CP_ACP) ? MB_PRECOMPOSED : 0;
 761         // dry decode is around 8 times faster then real one, alternatively we can set buffer to max length
 762         int nReadChars = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), nullptr, 0);
 763         m_oBuffer.SetLength(nReadChars*sizeof(wchar_t));
 764         int ret2 = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), (LPWSTR)(void *)m_oBuffer, nReadChars);
 765         if (ret2 != nReadChars)
 766         {
 767                 return FALSE;
 768         }
 769         data.Swap(m_oBuffer);
 770         return TRUE;
 771 }
 772
 773 const CBuffer& CBaseFilter::Encode(const CString& s)
 774 {
 775         m_oBuffer.SetLength(s.GetLength()*3+1); // set buffer to guessed max size
 776         int nConvertedLen = WideCharToMultiByte(m_nCodePage, 0, (LPCTSTR)s, s.GetLength(), (LPSTR)m_oBuffer, m_oBuffer.GetLength(), nullptr, nullptr);
 777         m_oBuffer.SetLength(nConvertedLen); // set buffer to used size
 778         return m_oBuffer;
 779 }
 780
 781
 782
 783 bool CUtf16leFilter::Decode(/*in out*/ CBuffer & /*data*/)
 784 {
 785         // we believe data is ok for use
 786         return TRUE;
 787 }
 788
 789 const CBuffer& CUtf16leFilter::Encode(const CString& s)
 790 {
 791         int nNeedBytes = s.GetLength()*sizeof(TCHAR);
 792         m_oBuffer.SetLength(nNeedBytes);
 793         memcpy((void *)m_oBuffer, (LPCTSTR)s, nNeedBytes);
 794         return m_oBuffer;
 795 }
 796
 797
 798
 799 bool CUtf16beFilter::Decode(/*in out*/ CBuffer & data)
 800 {
 801         int nNeedBytes = data.GetLength();
 802         // make in place WORD BYTEs swap
 803         UINT64 * p_qw = (UINT64 *)(void *)data;
 804         int nQwords = nNeedBytes/8;
 805         for (int nQword = 0; nQword<nQwords; nQword++)
 806         {
 807                 p_qw[nQword] = WordSwapBytes(p_qw[nQword]);
 808         }
 809         wchar_t * p_w = (wchar_t *)p_qw;
 810         int nWords = nNeedBytes/2;
 811         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 812         {
 813                 p_w[nWord] = WideCharSwap(p_w[nWord]);
 814         }
 815         return CUtf16leFilter::Decode(data);
 816 }
 817
 818 const CBuffer& CUtf16beFilter::Encode(const CString& s)
 819 {
 820         int nNeedBytes = s.GetLength()*sizeof(TCHAR);
 821         m_oBuffer.SetLength(nNeedBytes);
 822         // copy swaping BYTE order in WORDs
 823         const UINT64 * p_qwIn = (const UINT64 *)(LPCTSTR)s;
 824         UINT64 * p_qwOut = (UINT64 *)(void *)m_oBuffer;
 825         int nQwords = nNeedBytes/8;
 826         for (int nQword = 0; nQword<nQwords; nQword++)
 827         {
 828                 p_qwOut[nQword] = WordSwapBytes(p_qwIn[nQword]);
 829         }
 830         wchar_t * p_wIn = (wchar_t *)p_qwIn;
 831         wchar_t * p_wOut = (wchar_t *)p_qwOut;
 832         int nWords = nNeedBytes/2;
 833         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 834         {
 835                 p_wOut[nWord] = WideCharSwap(p_wIn[nWord]);
 836         }
 837         return m_oBuffer;
 838 }
 839
 840
 841
 842 bool CUtf32leFilter::Decode(/*in out*/ CBuffer & data)
 843 {
 844         // UTF32 have four bytes per char
 845         int nReadChars = data.GetLength()/4;
 846         UINT32 * p32 = (UINT32 *)(void *)data;
 847
 848         // count chars which needs surrogate pair
 849         int nSurrogatePairCount = 0;
 850         for (int i = 0; i<nReadChars; ++i)
 851         {
 852                 if (p32[i]<0x110000 && p32[i]>=0x10000)
 853                 {
 854                         ++nSurrogatePairCount;
 855                 }
 856         }
 857
 858         // fill buffer
 859         m_oBuffer.SetLength((nReadChars+nSurrogatePairCount)*sizeof(wchar_t));
 860         wchar_t * pOut = (wchar_t *)m_oBuffer;
 861         for (int i = 0; i<nReadChars; ++i, ++pOut)
 862         {
 863                 UINT32 zChar = p32[i];
 864                 if (zChar>=0x110000)
 865                 {
 866                         *pOut=0xfffd; // ? mark
 867                 }
 868                 else if (zChar>=0x10000)
 869                 {
 870                         zChar-=0x10000;
 871                         pOut[0] = ((zChar>>10)&0x3ff) | 0xd800; // lead surrogate
 872                         pOut[1] = (zChar&0x7ff) | 0xdc00; // trail surrogate
 873                         pOut++;
 874                 }
 875                 else
 876                 {
 877                         *pOut = (wchar_t)zChar;
 878                 }
 879         }
 880         data.Swap(m_oBuffer);
 881         return TRUE;
 882 }
 883
 884 const CBuffer& CUtf32leFilter::Encode(const CString& s)
 885 {
 886         int nInWords = s.GetLength();
 887         m_oBuffer.SetLength(nInWords*2);
 888
 889         LPCTSTR p_In = (LPCTSTR)s;
 890         UINT32 * p_Out = (UINT32 *)(void *)m_oBuffer;
 891         int nOutDword = 0;
 892         for (int nInWord = 0; nInWord<nInWords; nInWord++, nOutDword++)
 893         {
 894                 UINT32 zChar = p_In[nInWord];
 895                 if ((zChar&0xfc00) == 0xd800) // lead surrogate
 896                 {
 897                         if (nInWord+1<nInWords && (p_In[nInWord+1]&0xfc00) == 0xdc00) // trail surrogate follows
 898                         {
 899                                 zChar = 0x10000 + ((zChar&0x3ff)<<10) + (p_In[++nInWord]&0x3ff);
 900                         }
 901                         else
 902                         {
 903                                 zChar = 0xfffd; // ? mark
 904                         }
 905                 }
 906                 else if ((zChar&0xfc00) == 0xdc00) // trail surrogate without lead
 907                 {
 908                         zChar = 0xfffd; // ? mark
 909                 }
 910                 p_Out[nOutDword] = zChar;
 911         }
 912         m_oBuffer.SetLength(nOutDword*4); // store length reduced by surrogates
 913         return m_oBuffer;
 914 }
 915
 916
 917
 918 bool CUtf32beFilter::Decode(/*in out*/ CBuffer & data)
 919 {
 920
 921         // swap BYTEs order in DWORDs
 922         UINT64 * p64 = (UINT64 *)(void *)data;
 923         int nQwords = data.GetLength()/8;
 924         for (int nQword = 0; nQword<nQwords; nQword++)
 925         {
 926                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 927         }
 928
 929         UINT32 * p32 = (UINT32 *)p64;
 930         int nDwords = data.GetLength()/4;
 931         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 932         {
 933                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 934         }
 935         return CUtf32leFilter::Decode(data);
 936 }
 937
 938 const CBuffer& CUtf32beFilter::Encode(const CString& s)
 939 {
 940         CUtf32leFilter::Encode(s);
 941
 942         // swap BYTEs order in DWORDs
 943         UINT64 * p64 = (UINT64 *)(void *)m_oBuffer;
 944         int nQwords = m_oBuffer.GetLength()/8;
 945         for (int nQword = 0; nQword<nQwords; nQword++)
 946         {
 947                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 948         }
 949
 950         UINT32 * p32 = (UINT32 *)p64;
 951         int nDwords = m_oBuffer.GetLength()/4;
 952         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 953         {
 954                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 955         }
 956         return m_oBuffer;
 957 }