src/TortoiseMerge/FileTextLines.cpp

   1 // TortoiseGitMerge - a Diff/Patch program
   2
   3 // Copyright (C) 2007-2016 - TortoiseSVN
   4
   5 // This program is free software; you can redistribute it and/or
   6 // modify it under the terms of the GNU General Public License
   7 // as published by the Free Software Foundation; either version 2
   8 // of the License, or (at your option) any later version.
   9
  10 // This program is distributed in the hope that it will be useful,
  11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 // GNU General Public License for more details.
  14
  15 // You should have received a copy of the GNU General Public License
  16 // along with this program; if not, write to the Free Software Foundation,
  17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 #include "stdafx.h"
  20 #include "resource.h"
  21 #include "UnicodeUtils.h"
  22 #include "registry.h"
  23 #include "FileTextLines.h"
  24 #include "FormatMessageWrapper.h"
  25 #include "SmartHandle.h"
  26
  27 wchar_t inline WideCharSwap(wchar_t nValue)
  28 {
  29         return (((nValue>> 8)) | (nValue << 8));
  30         //return _byteswap_ushort(nValue);
  31 }
  32
  33 UINT64 inline WordSwapBytes(UINT64 nValue)
  34 {
  35         return ((nValue&0xff00ff00ff00ff)<<8) | ((nValue>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  36 }
  37
  38 UINT32 inline DwordSwapBytes(UINT32 nValue)
  39 {
  40         UINT32 nRet = (nValue<<16) | (nValue>>16); // swap WORDs
  41         nRet = ((nRet&0xff00ff)<<8) | ((nRet>>8)&0xff00ff); // swap BYTESs in WORDs
  42         return nRet;
  43         //return _byteswap_ulong(nValue);
  44 }
  45
  46 UINT64 inline DwordSwapBytes(UINT64 nValue)
  47 {
  48         UINT64 nRet = ((nValue&0xffff0000ffffL)<<16) | ((nValue>>16)&0xffff0000ffffL); // swap WORDs in DWORDs
  49         nRet = ((nRet&0xff00ff00ff00ff)<<8) | ((nRet>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  50         return nRet;
  51 }
  52
  53 CFileTextLines::CFileTextLines(void)
  54         : m_bNeedsConversion(false)
  55         , m_bKeepEncoding(false)
  56 {
  57         m_SaveParams.m_UnicodeType = CFileTextLines::AUTOTYPE;
  58         m_SaveParams.m_LineEndings = EOL_AUTOLINE;
  59 }
  60
  61 CFileTextLines::~CFileTextLines(void)
  62 {
  63 }
  64
  65 CFileTextLines::UnicodeType CFileTextLines::CheckUnicodeType(LPVOID pBuffer, int cb)
  66 {
  67         if (cb < 2)
  68                 return CFileTextLines::ASCII;
  69         const UINT32 * const pVal32 = (UINT32 *)pBuffer;
  70         const UINT16 * const pVal16 = (UINT16 *)pBuffer;
  71         const UINT8 * const pVal8 = (UINT8 *)pBuffer;
  72         // scan the whole buffer for a 0x00000000 sequence
  73         // if found, we assume a binary file
  74         int nDwords = cb/4;
  75         for (int j=0; j<nDwords; ++j)
  76         {
  77                 if (0x00000000 == pVal32[j])
  78                         return CFileTextLines::BINARY;
  79         }
  80         if (cb >=4 )
  81         {
  82                 if (*pVal32 == 0x0000FEFF)
  83                 {
  84                         return CFileTextLines::UTF32_LE;
  85                 }
  86                 if (*pVal32 == 0xFFFE0000)
  87                 {
  88                         return CFileTextLines::UTF32_BE;
  89                 }
  90         }
  91         if (*pVal16 == 0xFEFF)
  92         {
  93                 return CFileTextLines::UTF16_LEBOM;
  94         }
  95         if (*pVal16 == 0xFFFE)
  96         {
  97                 return CFileTextLines::UTF16_BEBOM;
  98         }
  99         if (cb < 3)
 100                 return CFileTextLines::ASCII;
 101         if (*pVal16 == 0xBBEF)
 102         {
 103                 if (pVal8[2] == 0xBF)
 104                         return CFileTextLines::UTF8BOM;
 105         }
 106         // check for illegal UTF8 sequences
 107         bool bNonANSI = false;
 108         int nNeedData = 0;
 109         int i=0;
 110         int nullcount = 0;
 111         for (; i < cb; ++i)
 112         {
 113                 if (pVal8[i] == 0)
 114                 {
 115                         ++nullcount;
 116                         // count the null chars, we do not want to treat an ASCII/UTF8 file
 117                         // as UTF16 just because of some null chars that might be accidentally
 118                         // in the file.
 119                         // Use an arbitrary value of one fiftieth of the file length as
 120                         // the limit after which a file is considered UTF16.
 121                         if (nullcount >(cb / 50))
 122                         {
 123                                 // null-chars are not allowed for ASCII or UTF8, that means
 124                                 // this file is most likely UTF16 encoded
 125                                 if (i % 2)
 126                                         return CFileTextLines::UTF16_LE;
 127                                 else
 128                                         return CFileTextLines::UTF16_BE;
 129                         }
 130                 }
 131                 if ((pVal8[i] & 0x80) != 0) // non ASCII
 132                 {
 133                         bNonANSI = true;
 134                         break;
 135                 }
 136         }
 137         // check remaining text for UTF-8 validity
 138         for (; i<cb; ++i)
 139         {
 140                 UINT8 zChar = pVal8[i];
 141                 if ((zChar & 0x80)==0) // Ascii
 142                 {
 143                         if (zChar == 0)
 144                         {
 145                                 ++nullcount;
 146                                 // count the null chars, we do not want to treat an ASCII/UTF8 file
 147                                 // as UTF16 just because of some null chars that might be accidentally
 148                                 // in the file.
 149                                 // Use an arbitrary value of one fiftieth of the file length as
 150                                 // the limit after which a file is considered UTF16.
 151                                 if (nullcount > (cb / 50))
 152                                 {
 153                                         // null-chars are not allowed for ASCII or UTF8, that means
 154                                         // this file is most likely UTF16 encoded
 155                                         if (i%2)
 156                                                 return CFileTextLines::UTF16_LE;
 157                                         else
 158                                                 return CFileTextLines::UTF16_BE;
 159                                 }
 160                                 nNeedData = 0;
 161                         }
 162                         else if (nNeedData)
 163                         {
 164                                 return CFileTextLines::ASCII;
 165                         }
 166                         continue;
 167                 }
 168                 if ((zChar & 0x40)==0) // top bit
 169                 {
 170                         if (!nNeedData)
 171                                 return CFileTextLines::ASCII;
 172                         --nNeedData;
 173                 }
 174                 else if (nNeedData)
 175                 {
 176                         return CFileTextLines::ASCII;
 177                 }
 178                 else if ((zChar & 0x20)==0) // top two bits
 179                 {
 180                         if (zChar<=0xC1)
 181                                 return CFileTextLines::ASCII;
 182                         nNeedData = 1;
 183                 }
 184                 else if ((zChar & 0x10)==0) // top three bits
 185                 {
 186                         nNeedData = 2;
 187                 }
 188                 else if ((zChar & 0x08)==0) // top four bits
 189                 {
 190                         if (zChar>=0xf5)
 191                                 return CFileTextLines::ASCII;
 192                         nNeedData = 3;
 193                 }
 194                 else
 195                         return CFileTextLines::ASCII;
 196         }
 197         if (bNonANSI && nNeedData==0)
 198                 // if get here thru nonAscii and no missing data left then its valid UTF8
 199                 return CFileTextLines::UTF8;
 200         if ((!bNonANSI)&&(DWORD(CRegDWORD(_T("Software\\TortoiseGitMerge\\UseUTF8"), FALSE))))
 201                 return CFileTextLines::UTF8;
 202         return CFileTextLines::ASCII;
 203 }
 204
 205
 206 BOOL CFileTextLines::Load(const CString& sFilePath, int lengthHint /* = 0*/)
 207 {
 208         WCHAR exceptionError[1000] = {0};
 209         m_SaveParams.m_LineEndings = EOL_AUTOLINE;
 210         if (!m_bKeepEncoding)
 211                 m_SaveParams.m_UnicodeType = CFileTextLines::AUTOTYPE;
 212         RemoveAll();
 213         if(lengthHint != 0)
 214         {
 215                 Reserve(lengthHint);
 216         }
 217
 218         if (PathIsDirectory(sFilePath))
 219         {
 220                 m_sErrorString.Format(IDS_ERR_FILE_NOTAFILE, (LPCTSTR)sFilePath);
 221                 return FALSE;
 222         }
 223
 224         if (!PathFileExists(sFilePath))
 225         {
 226                 //file does not exist, so just return SUCCESS
 227                 return TRUE;
 228         }
 229
 230         CAutoFile hFile = CreateFile(sFilePath, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_DELETE | FILE_SHARE_WRITE, nullptr, OPEN_EXISTING, 0, nullptr);
 231         if (!hFile)
 232         {
 233                 SetErrorString();
 234                 return FALSE;
 235         }
 236
 237         LARGE_INTEGER fsize;
 238         if (!GetFileSizeEx(hFile, &fsize))
 239         {
 240                 SetErrorString();
 241                 return FALSE;
 242         }
 243         if (fsize.HighPart)
 244         {
 245                 // file is way too big for us
 246                 m_sErrorString.LoadString(IDS_ERR_FILE_TOOBIG);
 247                 return FALSE;
 248         }
 249
 250         // create buffer
 251         // If new[] was done for type T delete[] must be called on a pointer of type T*,
 252         // otherwise the behavior is undefined.
 253         // +1 is to address possible truncation when integer division is done
 254         CBuffer oFile;
 255         try
 256         {
 257                 oFile.SetLength(fsize.LowPart);
 258         }
 259         catch (CMemoryException* e)
 260         {
 261                 e->GetErrorMessage(exceptionError, _countof(exceptionError));
 262                 m_sErrorString = exceptionError;
 263                 return FALSE;
 264         }
 265
 266         // load file
 267         DWORD dwReadBytes = 0;
 268         if (!ReadFile(hFile, (void *)oFile, fsize.LowPart, &dwReadBytes, NULL))
 269         {
 270                 SetErrorString();
 271                 return FALSE;
 272         }
 273         hFile.CloseHandle();
 274
 275         // detect type
 276         if (m_SaveParams.m_UnicodeType == CFileTextLines::AUTOTYPE)
 277         {
 278                 m_SaveParams.m_UnicodeType = this->CheckUnicodeType((LPVOID)oFile, dwReadBytes);
 279         }
 280         // enforce conversion for all but ASCII and UTF8 type
 281         m_bNeedsConversion = (m_SaveParams.m_UnicodeType != CFileTextLines::UTF8) && (m_SaveParams.m_UnicodeType != CFileTextLines::ASCII);
 282
 283         // we may have to convert the file content - CString is UTF16LE
 284         try
 285         {
 286                 CBaseFilter * pFilter = NULL;
 287                 switch (m_SaveParams.m_UnicodeType)
 288                 {
 289                 case BINARY:
 290                         m_sErrorString.Format(IDS_ERR_FILE_BINARY, (LPCTSTR)sFilePath);
 291                         return FALSE;
 292                 case UTF8:
 293                 case UTF8BOM:
 294                         pFilter = new CUtf8Filter(NULL);
 295                         break;
 296                 default:
 297                 case ASCII:
 298                         pFilter = new CAsciiFilter(NULL);
 299                         break;
 300                 case UTF16_BE:
 301                 case UTF16_BEBOM:
 302                         pFilter = new CUtf16beFilter(NULL);
 303                         break;
 304                 case UTF16_LE:
 305                 case UTF16_LEBOM:
 306                         pFilter = new CUtf16leFilter(NULL);
 307                         break;
 308                 case UTF32_BE:
 309                         pFilter = new CUtf32beFilter(NULL);
 310                         break;
 311                 case UTF32_LE:
 312                         pFilter = new CUtf32leFilter(NULL);
 313                         break;
 314                 }
 315                 pFilter->Decode(oFile);
 316                 delete pFilter;
 317         }
 318         catch (CMemoryException* e)
 319         {
 320                 e->GetErrorMessage(exceptionError, _countof(exceptionError));
 321                 m_sErrorString = exceptionError;
 322                 return FALSE;
 323         }
 324
 325         int nReadChars=oFile.GetLength()/sizeof(wchar_t);
 326         wchar_t * pTextBuf = (wchar_t *)oFile;
 327         wchar_t * pLineStart = pTextBuf;
 328         if ((m_SaveParams.m_UnicodeType == UTF8BOM)
 329                 || (m_SaveParams.m_UnicodeType == UTF16_LEBOM)
 330                 || (m_SaveParams.m_UnicodeType == UTF16_BEBOM)
 331                 || (m_SaveParams.m_UnicodeType == UTF32_LE)
 332                 || (m_SaveParams.m_UnicodeType == UTF32_BE))
 333         {
 334                 // ignore the BOM
 335                 ++pTextBuf;
 336                 ++pLineStart;
 337                 --nReadChars;
 338         }
 339
 340         // fill in the lines into the array
 341         size_t countEOLs[EOL__COUNT] = { 0 };
 342         CFileTextLine oTextLine;
 343         for (int i = nReadChars; i; --i)
 344         {
 345                 EOL eEol;
 346                 switch (*pTextBuf++)
 347                 {
 348                 case '\r':
 349                         // crlf line ending or cr line ending
 350                         eEol = ((i > 1) && *(pTextBuf) == '\n') ? EOL_CRLF : EOL_CR;
 351                         break;
 352                 case '\n':
 353                         // lfcr line ending or lf line ending
 354                         eEol = ((i > 1) && *(pTextBuf) == '\r') ? EOL_LFCR : EOL_LF;
 355                         if (eEol == EOL_LFCR)
 356                         {
 357                                 // LFCR is very rare on Windows, so we have to double check
 358                                 // that this is not just a LF followed by CRLF
 359                                 if (((countEOLs[EOL_CRLF] > 1) || (countEOLs[EOL_LF] > 1) || (GetCount() < 2)) &&
 360                                         ((i > 2) && (*(pTextBuf+1) == '\n')))
 361                                 {
 362                                         // change the EOL back to a simple LF
 363                                         eEol = EOL_LF;
 364                                 }
 365                         }
 366                         break;
 367                 case 0x000b:
 368                         eEol = EOL_VT;
 369                         break;
 370                 case 0x000c:
 371                         eEol = EOL_FF;
 372                         break;
 373                 case 0x0085:
 374                         eEol = EOL_NEL;
 375                         break;
 376                 case 0x2028:
 377                         eEol = EOL_LS;
 378                         break;
 379                 case 0x2029:
 380                         eEol = EOL_PS;
 381                         break;
 382                 default:
 383                         continue;
 384                 }
 385                 oTextLine.sLine = CString(pLineStart, (int)(pTextBuf-pLineStart)-1);
 386                 oTextLine.eEnding = eEol;
 387                 CStdFileLineArray::Add(oTextLine);
 388                 ++countEOLs[eEol];
 389                 if (eEol==EOL_CRLF || eEol==EOL_LFCR)
 390                 {
 391                         ++pTextBuf;
 392                         --i;
 393                 }
 394                 pLineStart = pTextBuf;
 395         }
 396         CString line(pLineStart, (int)(pTextBuf-pLineStart));
 397         Add(line, EOL_NOENDING);
 398
 399         // some EOLs are not supported by the svn diff lib.
 400         m_bNeedsConversion |= (countEOLs[EOL_CRLF]!=0);
 401         m_bNeedsConversion |= (countEOLs[EOL_FF]!=0);
 402         m_bNeedsConversion |= (countEOLs[EOL_VT]!=0);
 403         m_bNeedsConversion |= (countEOLs[EOL_NEL]!=0);
 404         m_bNeedsConversion |= (countEOLs[EOL_LS]!=0);
 405         m_bNeedsConversion |= (countEOLs[EOL_PS]!=0);
 406
 407         size_t eolmax = 0;
 408         for (int nEol = 0; nEol<EOL__COUNT; nEol++)
 409         {
 410                 if (eolmax < countEOLs[nEol])
 411                 {
 412                         eolmax = countEOLs[nEol];
 413                         m_SaveParams.m_LineEndings = (EOL)nEol;
 414                 }
 415         }
 416
 417         return TRUE;
 418 }
 419
 420 void CFileTextLines::StripWhiteSpace(CString& sLine, DWORD dwIgnoreWhitespaces, bool blame)
 421 {
 422         if (blame)
 423         {
 424                 if (sLine.GetLength() > 66)
 425                         sLine = sLine.Mid(66);
 426         }
 427         switch (dwIgnoreWhitespaces)
 428         {
 429         case 0:
 430                 // Compare whitespaces
 431                 // do nothing
 432                 break;
 433         case 1:
 434                 // Ignore all whitespaces
 435                 sLine.TrimLeft(_T(" \t"));
 436                 sLine.TrimRight(_T(" \t"));
 437                 break;
 438         case 2:
 439                 // Ignore leading whitespace
 440                 sLine.TrimLeft(_T(" \t"));
 441                 break;
 442         case 3:
 443                 // Ignore ending whitespace
 444                 sLine.TrimRight(_T(" \t"));
 445                 break;
 446         }
 447 }
 448
 449 /**
 450         Encoding pattern:
 451                 - encode & save BOM
 452                 - Get Line
 453                 - modify line - whitespaces, lowercase
 454                 - encode & save line
 455                 - get cached encoded eol
 456                 - save eol
 457 */
 458 BOOL CFileTextLines::Save( const CString& sFilePath
 459                                                 , bool bSaveAsUTF8 /*= false */
 460                                                 , bool bUseSVNCompatibleEOLs /*= false */
 461                                                 , DWORD dwIgnoreWhitespaces /*= 0 */
 462                                                 , BOOL bIgnoreCase /*= FALSE */
 463                                                 , bool bBlame /*= false*/
 464                                                 , bool bIgnoreComments /*= false*/
 465                                                 , const CString& linestart /*= CString()*/
 466                                                 , const CString& blockstart /*= CString()*/
 467                                                 , const CString& blockend /*= CString()*/
 468                                                 , const std::wregex& rx /*= std::wregex(L"")*/
 469                                                 , const std::wstring& replacement /*=L""*/)
 470 {
 471         m_sCommentLine = linestart;
 472         m_sCommentBlockStart = blockstart;
 473         m_sCommentBlockEnd = blockend;
 474
 475         try
 476         {
 477                 CString destPath = sFilePath;
 478                 // now make sure that the destination directory exists
 479                 int ind = 0;
 480                 while (destPath.Find('\\', ind)>=2)
 481                 {
 482                         if (!PathIsDirectory(destPath.Left(destPath.Find('\\', ind))))
 483                         {
 484                                 if (!CreateDirectory(destPath.Left(destPath.Find('\\', ind)), NULL))
 485                                         return FALSE;
 486                         }
 487                         ind = destPath.Find('\\', ind)+1;
 488                 }
 489
 490                 CStdioFile file;                        // Hugely faster than CFile for big file writes - because it uses buffering
 491                 if (!file.Open(sFilePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary | CFile::shareDenyNone))
 492                 {
 493                         const_cast<CString *>(&m_sErrorString)->Format(IDS_ERR_FILE_OPEN, (LPCTSTR)sFilePath);
 494                         return FALSE;
 495                 }
 496
 497                 CBaseFilter * pFilter = NULL;
 498                 bool bSaveBom = true;
 499                 CFileTextLines::UnicodeType eUnicodeType = bSaveAsUTF8 ? CFileTextLines::UTF8 : m_SaveParams.m_UnicodeType;
 500                 switch (eUnicodeType)
 501                 {
 502                 default:
 503                 case CFileTextLines::ASCII:
 504                         bSaveBom = false;
 505                         pFilter = new CAsciiFilter(&file);
 506                         break;
 507                 case CFileTextLines::UTF8:
 508                         bSaveBom = false;
 509                 case CFileTextLines::UTF8BOM:
 510                         pFilter = new CUtf8Filter(&file);
 511                         break;
 512                 case CFileTextLines::UTF16_BE:
 513                         bSaveBom = false;
 514                         pFilter = new CUtf16beFilter(&file);
 515                         break;
 516                 case CFileTextLines::UTF16_BEBOM:
 517                         pFilter = new CUtf16beFilter(&file);
 518                         break;
 519                 case CFileTextLines::UTF16_LE:
 520                         bSaveBom = false;
 521                         pFilter = new CUtf16leFilter(&file);
 522                         break;
 523                 case CFileTextLines::UTF16_LEBOM:
 524                         pFilter = new CUtf16leFilter(&file);
 525                         break;
 526                 case CFileTextLines::UTF32_BE:
 527                         pFilter = new CUtf32beFilter(&file);
 528                         break;
 529                 case CFileTextLines::UTF32_LE:
 530                         pFilter = new CUtf32leFilter(&file);
 531                         break;
 532                 }
 533
 534                 if (bSaveBom)
 535                 {
 536                         //first write the BOM
 537                         pFilter->Write(L"\xfeff");
 538                 }
 539                 // cache EOLs
 540                 CBuffer oEncodedEol[EOL__COUNT];
 541                 oEncodedEol[EOL_LF] = pFilter->Encode(_T("\n")); // x0a
 542                 oEncodedEol[EOL_CR] = pFilter->Encode(_T("\r")); // x0d
 543                 oEncodedEol[EOL_CRLF] = pFilter->Encode(_T("\r\n")); // x0d x0a
 544                 if (bUseSVNCompatibleEOLs)
 545                 {
 546                         // when using EOLs that are supported by the svn lib,
 547                         // we have to use the same EOLs as the file has in case
 548                         // they're already supported, but a different supported one
 549                         // in case the original one isn't supported.
 550                         // Only this way the option "ignore EOLs (recommended)" unchecked
 551                         // actually shows the lines as different.
 552                         // However, the diff won't find and differences in EOLs
 553                         // for these special EOLs if they differ between those special ones
 554                         // listed below.
 555                         // But it will work properly for the most common EOLs LF/CR/CRLF.
 556                         oEncodedEol[EOL_LFCR] = oEncodedEol[EOL_CR];
 557                         for (int nEol = 0; nEol<EOL_NOENDING; nEol++)
 558                         {
 559                                 if (oEncodedEol[nEol].IsEmpty())
 560                                         oEncodedEol[nEol] = oEncodedEol[EOL_LF];
 561                         }
 562                 }
 563                 else
 564                 {
 565                         oEncodedEol[EOL_LFCR] = pFilter->Encode(_T("\n\r"));
 566                         oEncodedEol[EOL_VT] = pFilter->Encode(_T("\v")); // x0b
 567                         oEncodedEol[EOL_FF] = pFilter->Encode(_T("\f")); // x0c
 568                         oEncodedEol[EOL_NEL] = pFilter->Encode(_T("\x85"));
 569                         oEncodedEol[EOL_LS] = pFilter->Encode(_T("\x2028"));
 570                         oEncodedEol[EOL_PS] = pFilter->Encode(_T("\x2029"));
 571                 }
 572                 oEncodedEol[EOL_AUTOLINE] = oEncodedEol[m_SaveParams.m_LineEndings==EOL_AUTOLINE
 573                                 ? EOL_CRLF
 574                                 : m_SaveParams.m_LineEndings];
 575
 576                 bool bInBlockComment = false;
 577                 for (int i=0; i<GetCount(); i++)
 578                 {
 579                         CString sLineT = GetAt(i);
 580                         if (bIgnoreComments)
 581                                 bInBlockComment = StripComments(sLineT, bInBlockComment);
 582                         if (!rx._Empty())
 583                                 LineRegex(sLineT, rx, replacement);
 584                         StripWhiteSpace(sLineT, dwIgnoreWhitespaces, bBlame);
 585                         if (bIgnoreCase)
 586                                 sLineT = sLineT.MakeLower();
 587                         pFilter->Write(sLineT);
 588                         EOL eEol = GetLineEnding(i);
 589                         pFilter->Write(oEncodedEol[eEol]);
 590                 }
 591                 delete pFilter;
 592                 file.Close();
 593         }
 594         catch (CException * e)
 595         {
 596                 CString * psErrorString = const_cast<CString *>(&m_sErrorString);
 597                 e->GetErrorMessage(psErrorString->GetBuffer(4096), 4096);
 598                 psErrorString->ReleaseBuffer();
 599                 e->Delete();
 600                 return FALSE;
 601         }
 602         return TRUE;
 603 }
 604
 605 void CFileTextLines::SetErrorString()
 606 {
 607         m_sErrorString = CFormatMessageWrapper();
 608 }
 609
 610 void CFileTextLines::CopySettings(CFileTextLines * pFileToCopySettingsTo) const
 611 {
 612         if (pFileToCopySettingsTo)
 613         {
 614                 pFileToCopySettingsTo->m_SaveParams = m_SaveParams;
 615         }
 616 }
 617
 618 const wchar_t * CFileTextLines::GetEncodingName(UnicodeType eEncoding)
 619 {
 620         switch (eEncoding)
 621         {
 622         case ASCII:
 623                 return L"ASCII";
 624         case BINARY:
 625                 return L"BINARY";
 626         case UTF16_LE:
 627                 return L"UTF-16LE";
 628         case UTF16_LEBOM:
 629                 return L"UTF-16LE BOM";
 630         case UTF16_BE:
 631                 return L"UTF-16BE";
 632         case UTF16_BEBOM:
 633                 return L"UTF-16BE BOM";
 634         case UTF32_LE:
 635                 return L"UTF-32LE";
 636         case UTF32_BE:
 637                 return L"UTF-32BE";
 638         case UTF8:
 639                 return L"UTF-8";
 640         case UTF8BOM:
 641                 return L"UTF-8 BOM";
 642         }
 643         return L"";
 644 }
 645
 646 bool CFileTextLines::StripComments( CString& sLine, bool bInBlockComment )
 647 {
 648         int startpos = 0;
 649
 650         do
 651         {
 652                 if (bInBlockComment)
 653                 {
 654                         int endpos = sLine.Find(m_sCommentBlockEnd);
 655                         if (endpos >= 0)
 656                         {
 657                                 sLine = sLine.Left(startpos) + sLine.Mid(endpos+m_sCommentBlockEnd.GetLength());
 658                                 bInBlockComment = false;
 659                         }
 660                         else
 661                         {
 662                                 sLine = sLine.Left(startpos);
 663                                 startpos = -1;
 664                         }
 665                 }
 666                 if (!bInBlockComment)
 667                 {
 668                         startpos = m_sCommentBlockStart.IsEmpty() ? -1 : sLine.Find(m_sCommentBlockStart);
 669                         int startpos2 = m_sCommentLine.IsEmpty() ? -1 : sLine.Find(m_sCommentLine);
 670                         if ( ((startpos2 < startpos) && (startpos2 >= 0)) ||
 671                                  ((startpos2 >= 0) && (startpos < 0)) )
 672                         {
 673                                 // line comment
 674                                 // look if there's a string marker (" or ') before that
 675                                 // note: this check is not fully correct. For example, it
 676                                 // does not account for escaped chars or even multiline strings.
 677                                 // but it has to be fast, so this has to do...
 678                                 int scount = 0;
 679                                 int ccount = 0;
 680                                 auto spos = sLine.Find('"');
 681                                 while ((spos >= 0) && (spos < startpos2))
 682                                 {
 683                                         ++scount;
 684                                         spos = sLine.Find('"', spos + 1);
 685                                 }
 686                                 auto cpos = sLine.Find('\'');
 687                                 while ((cpos >= 0) && (cpos < startpos2))
 688                                 {
 689                                         ++ccount;
 690                                         cpos = sLine.Find('"', cpos + 1);
 691                                 }
 692                                 if ((scount % 2 == 0) && (ccount % 2 == 0))
 693                                 {
 694                                         // line comment, erase the rest of the line
 695                                         sLine = sLine.Left(startpos2);
 696                                         startpos = -1;
 697                                 }
 698                         }
 699                         else if (startpos >= 0)
 700                         {
 701                                 // starting block comment
 702                                 bInBlockComment = true;
 703                         }
 704                 }
 705         } while (startpos >= 0);
 706
 707         return bInBlockComment;
 708 }
 709
 710 void CFileTextLines::LineRegex( CString& sLine, const std::wregex& rx, const std::wstring& replacement ) const
 711 {
 712         std::wstring str = (LPCTSTR)sLine;
 713         std::wstring str2 = std::regex_replace(str, rx, replacement);
 714         sLine = str2.c_str();
 715 }
 716
 717
 718 void CBuffer::ExpandToAtLeast(int nNewSize)
 719 {
 720         if (nNewSize>m_nAllocated)
 721         {
 722                 delete [] m_pBuffer; // we don't preserve buffer content intentionally
 723                 nNewSize+=2048-1;
 724                 nNewSize&=~(1024-1);
 725                 m_pBuffer=new BYTE[nNewSize];
 726                 m_nAllocated=nNewSize;
 727         }
 728 }
 729
 730 void CBuffer::SetLength(int nUsed)
 731 {
 732         ExpandToAtLeast(nUsed);
 733         m_nUsed = nUsed;
 734 }
 735
 736 void CBuffer::Swap(CBuffer & Src)
 737 {
 738         std::swap(Src.m_nAllocated, m_nAllocated);
 739         std::swap(Src.m_pBuffer, m_pBuffer);
 740         std::swap(Src.m_nUsed, m_nUsed);
 741 }
 742
 743 void CBuffer::Copy(const CBuffer & Src)
 744 {
 745         if (&Src != this)
 746         {
 747                 SetLength(Src.m_nUsed);
 748                 memcpy(m_pBuffer, Src.m_pBuffer, m_nUsed);
 749         }
 750 }
 751
 752
 753
 754 bool CBaseFilter::Decode(/*in out*/ CBuffer & data)
 755 {
 756         int nFlags = (m_nCodePage==CP_ACP) ? MB_PRECOMPOSED : 0;
 757         // dry decode is around 8 times faster then real one, alternatively we can set buffer to max length
 758         int nReadChars = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), NULL, 0);
 759         m_oBuffer.SetLength(nReadChars*sizeof(wchar_t));
 760         int ret2 = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), (LPWSTR)(void *)m_oBuffer, nReadChars);
 761         if (ret2 != nReadChars)
 762         {
 763                 return FALSE;
 764         }
 765         data.Swap(m_oBuffer);
 766         return TRUE;
 767 }
 768
 769 const CBuffer & CBaseFilter::Encode(const CString s)
 770 {
 771         m_oBuffer.SetLength(s.GetLength()*3+1); // set buffer to guessed max size
 772         int nConvertedLen = WideCharToMultiByte(m_nCodePage, 0, (LPCTSTR)s, s.GetLength(), (LPSTR)m_oBuffer, m_oBuffer.GetLength(), NULL, NULL);
 773         m_oBuffer.SetLength(nConvertedLen); // set buffer to used size
 774         return m_oBuffer;
 775 }
 776
 777
 778
 779 bool CUtf16leFilter::Decode(/*in out*/ CBuffer & /*data*/)
 780 {
 781         // we believe data is ok for use
 782         return TRUE;
 783 }
 784
 785 const CBuffer & CUtf16leFilter::Encode(const CString s)
 786 {
 787         int nNeedBytes = s.GetLength()*sizeof(TCHAR);
 788         m_oBuffer.SetLength(nNeedBytes);
 789         memcpy((void *)m_oBuffer, (LPCTSTR)s, nNeedBytes);
 790         return m_oBuffer;
 791 }
 792
 793
 794
 795 bool CUtf16beFilter::Decode(/*in out*/ CBuffer & data)
 796 {
 797         int nNeedBytes = data.GetLength();
 798         // make in place WORD BYTEs swap
 799         UINT64 * p_qw = (UINT64 *)(void *)data;
 800         int nQwords = nNeedBytes/8;
 801         for (int nQword = 0; nQword<nQwords; nQword++)
 802         {
 803                 p_qw[nQword] = WordSwapBytes(p_qw[nQword]);
 804         }
 805         wchar_t * p_w = (wchar_t *)p_qw;
 806         int nWords = nNeedBytes/2;
 807         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 808         {
 809                 p_w[nWord] = WideCharSwap(p_w[nWord]);
 810         }
 811         return CUtf16leFilter::Decode(data);
 812 }
 813
 814 const CBuffer & CUtf16beFilter::Encode(const CString s)
 815 {
 816         int nNeedBytes = s.GetLength()*sizeof(TCHAR);
 817         m_oBuffer.SetLength(nNeedBytes);
 818         // copy swaping BYTE order in WORDs
 819         const UINT64 * p_qwIn = (const UINT64 *)(LPCTSTR)s;
 820         UINT64 * p_qwOut = (UINT64 *)(void *)m_oBuffer;
 821         int nQwords = nNeedBytes/8;
 822         for (int nQword = 0; nQword<nQwords; nQword++)
 823         {
 824                 p_qwOut[nQword] = WordSwapBytes(p_qwIn[nQword]);
 825         }
 826         wchar_t * p_wIn = (wchar_t *)p_qwIn;
 827         wchar_t * p_wOut = (wchar_t *)p_qwOut;
 828         int nWords = nNeedBytes/2;
 829         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 830         {
 831                 p_wOut[nWord] = WideCharSwap(p_wIn[nWord]);
 832         }
 833         return m_oBuffer;
 834 }
 835
 836
 837
 838 bool CUtf32leFilter::Decode(/*in out*/ CBuffer & data)
 839 {
 840         // UTF32 have four bytes per char
 841         int nReadChars = data.GetLength()/4;
 842         UINT32 * p32 = (UINT32 *)(void *)data;
 843
 844         // count chars which needs surrogate pair
 845         int nSurrogatePairCount = 0;
 846         for (int i = 0; i<nReadChars; ++i)
 847         {
 848                 if (p32[i]<0x110000 && p32[i]>=0x10000)
 849                 {
 850                         ++nSurrogatePairCount;
 851                 }
 852         }
 853
 854         // fill buffer
 855         m_oBuffer.SetLength((nReadChars+nSurrogatePairCount)*sizeof(wchar_t));
 856         wchar_t * pOut = (wchar_t *)m_oBuffer;
 857         for (int i = 0; i<nReadChars; ++i, ++pOut)
 858         {
 859                 UINT32 zChar = p32[i];
 860                 if (zChar>=0x110000)
 861                 {
 862                         *pOut=0xfffd; // ? mark
 863                 }
 864                 else if (zChar>=0x10000)
 865                 {
 866                         zChar-=0x10000;
 867                         pOut[0] = ((zChar>>10)&0x3ff) | 0xd800; // lead surrogate
 868                         pOut[1] = (zChar&0x7ff) | 0xdc00; // trail surrogate
 869                         pOut++;
 870                 }
 871                 else
 872                 {
 873                         *pOut = (wchar_t)zChar;
 874                 }
 875         }
 876         data.Swap(m_oBuffer);
 877         return TRUE;
 878 }
 879
 880 const CBuffer & CUtf32leFilter::Encode(const CString s)
 881 {
 882         int nInWords = s.GetLength();
 883         m_oBuffer.SetLength(nInWords*2);
 884
 885         LPCTSTR p_In = (LPCTSTR)s;
 886         UINT32 * p_Out = (UINT32 *)(void *)m_oBuffer;
 887         int nOutDword = 0;
 888         for (int nInWord = 0; nInWord<nInWords; nInWord++, nOutDword++)
 889         {
 890                 UINT32 zChar = p_In[nInWord];
 891                 if ((zChar&0xfc00) == 0xd800) // lead surrogate
 892                 {
 893                         if (nInWord+1<nInWords && (p_In[nInWord+1]&0xfc00) == 0xdc00) // trail surrogate follows
 894                         {
 895                                 zChar = 0x10000 + ((zChar&0x3ff)<<10) + (p_In[++nInWord]&0x3ff);
 896                         }
 897                         else
 898                         {
 899                                 zChar = 0xfffd; // ? mark
 900                         }
 901                 }
 902                 else if ((zChar&0xfc00) == 0xdc00) // trail surrogate without lead
 903                 {
 904                         zChar = 0xfffd; // ? mark
 905                 }
 906                 p_Out[nOutDword] = zChar;
 907         }
 908         m_oBuffer.SetLength(nOutDword*4); // store length reduced by surrogates
 909         return m_oBuffer;
 910 }
 911
 912
 913
 914 bool CUtf32beFilter::Decode(/*in out*/ CBuffer & data)
 915 {
 916
 917         // swap BYTEs order in DWORDs
 918         UINT64 * p64 = (UINT64 *)(void *)data;
 919         int nQwords = data.GetLength()/8;
 920         for (int nQword = 0; nQword<nQwords; nQword++)
 921         {
 922                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 923         }
 924
 925         UINT32 * p32 = (UINT32 *)p64;
 926         int nDwords = data.GetLength()/4;
 927         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 928         {
 929                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 930         }
 931         return CUtf32leFilter::Decode(data);
 932 }
 933
 934 const CBuffer & CUtf32beFilter::Encode(const CString s)
 935 {
 936         CUtf32leFilter::Encode(s);
 937
 938         // swap BYTEs order in DWORDs
 939         UINT64 * p64 = (UINT64 *)(void *)m_oBuffer;
 940         int nQwords = m_oBuffer.GetLength()/8;
 941         for (int nQword = 0; nQword<nQwords; nQword++)
 942         {
 943                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 944         }
 945
 946         UINT32 * p32 = (UINT32 *)p64;
 947         int nDwords = m_oBuffer.GetLength()/4;
 948         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 949         {
 950                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 951         }
 952         return m_oBuffer;
 953 }