src/TortoiseMerge/FileTextLines.cpp

   1 // TortoiseGitMerge - a Diff/Patch program
   2
   3 // Copyright (C) 2007-2014 - TortoiseSVN
   4
   5 // This program is free software; you can redistribute it and/or
   6 // modify it under the terms of the GNU General Public License
   7 // as published by the Free Software Foundation; either version 2
   8 // of the License, or (at your option) any later version.
   9
  10 // This program is distributed in the hope that it will be useful,
  11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 // GNU General Public License for more details.
  14
  15 // You should have received a copy of the GNU General Public License
  16 // along with this program; if not, write to the Free Software Foundation,
  17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 #include "stdafx.h"
  20 #include "resource.h"
  21 #include "UnicodeUtils.h"
  22 #include "registry.h"
  23 #include "FileTextLines.h"
  24 #include "FormatMessageWrapper.h"
  25 #include "SmartHandle.h"
  26
  /// Exchanges the low and high byte of a wide character.
  /// The shifts are evaluated in int and the result is converted back to
  /// wchar_t (assumes a 16-bit wchar_t, as on Windows, for a clean byte swap).
  inline wchar_t WideCharSwap(wchar_t nValue)
  {
      const int shiftedDown = nValue >> 8; // former high byte, now in the low position
      const int shiftedUp = nValue << 8;   // former low byte, now in the high position
      return static_cast<wchar_t>(shiftedDown | shiftedUp);
      // alternative: return _byteswap_ushort(nValue);
  }
  32
  33 UINT64 inline WordSwapBytes(UINT64 nValue)
  34 {
  35         return ((nValue&0xff00ff00ff00ff)<<8) | ((nValue>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  36 }
  37
  38 UINT32 inline DwordSwapBytes(UINT32 nValue)
  39 {
  40         UINT32 nRet = (nValue<<16) | (nValue>>16); // swap WORDs
  41         nRet = ((nRet&0xff00ff)<<8) | ((nRet>>8)&0xff00ff); // swap BYTESs in WORDs
  42         return nRet;
  43         //return _byteswap_ulong(nValue);
  44 }
  45
  46 UINT64 inline DwordSwapBytes(UINT64 nValue)
  47 {
  48         UINT64 nRet = ((nValue&0xffff0000ffffL)<<16) | ((nValue>>16)&0xffff0000ffffL); // swap WORDs in DWORDs
  49         nRet = ((nRet&0xff00ff00ff00ff)<<8) | ((nRet>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  50         return nRet;
  51 }
  52
  53 CFileTextLines::CFileTextLines(void)
  54         : m_bNeedsConversion(false)
  55 {
  56         m_SaveParams.m_UnicodeType = CFileTextLines::AUTOTYPE;
  57         m_SaveParams.m_LineEndings = EOL_AUTOLINE;
  58 }
  59
  60 CFileTextLines::~CFileTextLines(void)
  61 {
  62 }
  63
  64 CFileTextLines::UnicodeType CFileTextLines::CheckUnicodeType(LPVOID pBuffer, int cb)
  65 {
  66         if (cb < 2)
  67                 return CFileTextLines::ASCII;
  68         const UINT32 * const pVal32 = (UINT32 *)pBuffer;
  69         const UINT16 * const pVal16 = (UINT16 *)pBuffer;
  70         const UINT8 * const pVal8 = (UINT8 *)pBuffer;
  71         // scan the whole buffer for a 0x00000000 sequence
  72         // if found, we assume a binary file
  73         int nDwords = cb/4;
  74         for (int j=0; j<nDwords; ++j)
  75         {
  76                 if (0x00000000 == pVal32[j])
  77                         return CFileTextLines::BINARY;
  78         }
  79         if (cb >=4 )
  80         {
  81                 if (*pVal32 == 0x0000FEFF)
  82                 {
  83                         return CFileTextLines::UTF32_LE;
  84                 }
  85                 if (*pVal32 == 0xFFFE0000)
  86                 {
  87                         return CFileTextLines::UTF32_BE;
  88                 }
  89         }
  90         if (*pVal16 == 0xFEFF)
  91         {
  92                 return CFileTextLines::UTF16_LEBOM;
  93         }
  94         if (*pVal16 == 0xFFFE)
  95         {
  96                 return CFileTextLines::UTF16_BEBOM;
  97         }
  98         if (cb < 3)
  99                 return CFileTextLines::ASCII;
 100         if (*pVal16 == 0xBBEF)
 101         {
 102                 if (pVal8[2] == 0xBF)
 103                         return CFileTextLines::UTF8BOM;
 104         }
 105         // check for illegal UTF8 sequences
 106         bool bNonANSI = false;
 107         int nNeedData = 0;
 108         int i=0;
 109         int nullcount = 0;
 110         if (!bNonANSI)
 111         {
 112                 for (; i<cb; ++i)
 113                 {
 114                         if (pVal8[i] == 0)
 115                         {
 116                                 ++nullcount;
 117                                 // count the null chars, we do not want to treat an ASCII/UTF8 file
 118                                 // as UTF16 just because of some null chars that might be accidentally
 119                                 // in the file.
 120                                 // Use an arbitrary value of one fiftieth of the file length as
 121                                 // the limit after which a file is considered UTF16.
 122                                 if (nullcount > (cb / 50))
 123                                 {
 124                                         // null-chars are not allowed for ASCII or UTF8, that means
 125                                         // this file is most likely UTF16 encoded
 126                                         if (i%2)
 127                                                 return CFileTextLines::UTF16_LE;
 128                                         else
 129                                                 return CFileTextLines::UTF16_BE;
 130                                 }
 131                         }
 132                         if ((pVal8[i] & 0x80)!=0) // non ASCII
 133                         {
 134                                 bNonANSI = true;
 135                                 break;
 136                         }
 137                 }
 138         }
 139         // check remaining text for UTF-8 validity
 140         for (; i<cb; ++i)
 141         {
 142                 UINT8 zChar = pVal8[i];
 143                 if ((zChar & 0x80)==0) // Ascii
 144                 {
 145                         if (zChar == 0)
 146                         {
 147                                 ++nullcount;
 148                                 // count the null chars, we do not want to treat an ASCII/UTF8 file
 149                                 // as UTF16 just because of some null chars that might be accidentally
 150                                 // in the file.
 151                                 // Use an arbitrary value of one fiftieth of the file length as
 152                                 // the limit after which a file is considered UTF16.
 153                                 if (nullcount > (cb / 50))
 154                                 {
 155                                         // null-chars are not allowed for ASCII or UTF8, that means
 156                                         // this file is most likely UTF16 encoded
 157                                         if (i%2)
 158                                                 return CFileTextLines::UTF16_LE;
 159                                         else
 160                                                 return CFileTextLines::UTF16_BE;
 161                                 }
 162                                 nNeedData = 0;
 163                         }
 164                         else if (nNeedData)
 165                         {
 166                                 return CFileTextLines::ASCII;
 167                         }
 168                         continue;
 169                 }
 170                 if ((zChar & 0x40)==0) // top bit
 171                 {
 172                         if (!nNeedData)
 173                                 return CFileTextLines::ASCII;
 174                         --nNeedData;
 175                 }
 176                 else if (nNeedData)
 177                 {
 178                         return CFileTextLines::ASCII;
 179                 }
 180                 else if ((zChar & 0x20)==0) // top two bits
 181                 {
 182                         if (zChar<=0xC1)
 183                                 return CFileTextLines::ASCII;
 184                         nNeedData = 1;
 185                 }
 186                 else if ((zChar & 0x10)==0) // top three bits
 187                 {
 188                         nNeedData = 2;
 189                 }
 190                 else if ((zChar & 0x08)==0) // top four bits
 191                 {
 192                         if (zChar>=0xf5)
 193                                 return CFileTextLines::ASCII;
 194                         nNeedData = 3;
 195                 }
 196                 else
 197                         return CFileTextLines::ASCII;
 198         }
 199         if (bNonANSI && nNeedData==0)
 200                 // if get here thru nonAscii and no missing data left then its valid UTF8
 201                 return CFileTextLines::UTF8;
 202         if ((!bNonANSI)&&(DWORD(CRegDWORD(_T("Software\\TortoiseGitMerge\\UseUTF8"), FALSE))))
 203                 return CFileTextLines::UTF8;
 204         return CFileTextLines::ASCII;
 205 }
 206
 207
 208 BOOL CFileTextLines::Load(const CString& sFilePath, int lengthHint /* = 0*/)
 209 {
 210         WCHAR exceptionError[1000] = {0};
 211         m_SaveParams.m_LineEndings = EOL_AUTOLINE;
 212         m_SaveParams.m_UnicodeType = CFileTextLines::AUTOTYPE;
 213         RemoveAll();
 214         if(lengthHint != 0)
 215         {
 216                 Reserve(lengthHint);
 217         }
 218
 219         if (PathIsDirectory(sFilePath))
 220         {
 221                 m_sErrorString.Format(IDS_ERR_FILE_NOTAFILE, (LPCTSTR)sFilePath);
 222                 return FALSE;
 223         }
 224
 225         if (!PathFileExists(sFilePath))
 226         {
 227                 //file does not exist, so just return SUCCESS
 228                 return TRUE;
 229         }
 230
 231         CAutoFile hFile = CreateFile(sFilePath, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_DELETE|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, NULL, NULL);
 232         if (!hFile)
 233         {
 234                 SetErrorString();
 235                 return FALSE;
 236         }
 237
 238         LARGE_INTEGER fsize;
 239         if (!GetFileSizeEx(hFile, &fsize))
 240         {
 241                 SetErrorString();
 242                 return FALSE;
 243         }
 244         if (fsize.HighPart)
 245         {
 246                 // file is way too big for us
 247                 m_sErrorString.LoadString(IDS_ERR_FILE_TOOBIG);
 248                 return FALSE;
 249         }
 250
 251         // create buffer
 252         // If new[] was done for type T delete[] must be called on a pointer of type T*,
 253         // otherwise the behavior is undefined.
 254         // +1 is to address possible truncation when integer division is done
 255         CBuffer oFile;
 256         try
 257         {
 258                 oFile.SetLength(fsize.LowPart);
 259         }
 260         catch (CMemoryException* e)
 261         {
 262                 e->GetErrorMessage(exceptionError, _countof(exceptionError));
 263                 m_sErrorString = exceptionError;
 264                 return FALSE;
 265         }
 266
 267         // load file
 268         DWORD dwReadBytes = 0;
 269         if (!ReadFile(hFile, (void *)oFile, fsize.LowPart, &dwReadBytes, NULL))
 270         {
 271                 SetErrorString();
 272                 return FALSE;
 273         }
 274         hFile.CloseHandle();
 275
 276         // detect type
 277         if (m_SaveParams.m_UnicodeType == CFileTextLines::AUTOTYPE)
 278         {
 279                 m_SaveParams.m_UnicodeType = this->CheckUnicodeType((LPVOID)oFile, dwReadBytes);
 280                 // enforce conversion for all but ASCII and UTF8 type
 281                 m_bNeedsConversion = (m_SaveParams.m_UnicodeType!=CFileTextLines::UTF8)&&(m_SaveParams.m_UnicodeType!=CFileTextLines::ASCII);
 282         }
 283
 284         // we may have to convert the file content - CString is UTF16LE
 285         try
 286         {
 287                 CBaseFilter * pFilter = NULL;
 288                 switch (m_SaveParams.m_UnicodeType)
 289                 {
 290                 case BINARY:
 291                         m_sErrorString.Format(IDS_ERR_FILE_BINARY, (LPCTSTR)sFilePath);
 292                         return FALSE;
 293                 case UTF8:
 294                 case UTF8BOM:
 295                         pFilter = new CUtf8Filter(NULL);
 296                         break;
 297                 default:
 298                 case ASCII:
 299                         pFilter = new CAsciiFilter(NULL);
 300                         break;
 301                 case UTF16_BE:
 302                 case UTF16_BEBOM:
 303                         pFilter = new CUtf16beFilter(NULL);
 304                         break;
 305                 case UTF16_LE:
 306                 case UTF16_LEBOM:
 307                         pFilter = new CUtf16leFilter(NULL);
 308                         break;
 309                 case UTF32_BE:
 310                         pFilter = new CUtf32beFilter(NULL);
 311                         break;
 312                 case UTF32_LE:
 313                         pFilter = new CUtf32leFilter(NULL);
 314                         break;
 315                 }
 316                 pFilter->Decode(oFile);
 317                 delete pFilter;
 318         }
 319         catch (CMemoryException* e)
 320         {
 321                 e->GetErrorMessage(exceptionError, _countof(exceptionError));
 322                 m_sErrorString = exceptionError;
 323                 return FALSE;
 324         }
 325
 326         int nReadChars=oFile.GetLength()/sizeof(wchar_t);
 327         wchar_t * pTextBuf = (wchar_t *)oFile;
 328         wchar_t * pLineStart = pTextBuf;
 329         if ((m_SaveParams.m_UnicodeType == UTF8BOM)
 330                 || (m_SaveParams.m_UnicodeType == UTF16_LEBOM)
 331                 || (m_SaveParams.m_UnicodeType == UTF16_BEBOM)
 332                 || (m_SaveParams.m_UnicodeType == UTF32_LE)
 333                 || (m_SaveParams.m_UnicodeType == UTF32_BE))
 334         {
 335                 // ignore the BOM
 336                 ++pTextBuf;
 337                 ++pLineStart;
 338                 --nReadChars;
 339         }
 340
 341         // fill in the lines into the array
 342         size_t countEOLs[EOL__COUNT];
 343         memset(countEOLs, 0, sizeof(countEOLs));
 344         CFileTextLine oTextLine;
 345         for (int i = nReadChars; i; --i)
 346         {
 347                 EOL eEol;
 348                 switch (*pTextBuf++)
 349                 {
 350                 case '\r':
 351                         // crlf line ending or cr line ending
 352                         eEol = ((i > 1) && *(pTextBuf) == '\n') ? EOL_CRLF : EOL_CR;
 353                         break;
 354                 case '\n':
 355                         // lfcr line ending or lf line ending
 356                         eEol = ((i > 1) && *(pTextBuf) == '\r') ? EOL_LFCR : EOL_LF;
 357                         if (eEol == EOL_LFCR)
 358                         {
 359                                 // LFCR is very rare on Windows, so we have to double check
 360                                 // that this is not just a LF followed by CRLF
 361                                 if (((countEOLs[EOL_CRLF] > 1) || (countEOLs[EOL_LF]>1)) &&
 362                                         ((i > 2) && (*(pTextBuf+1) == '\n')))
 363                                 {
 364                                         // change the EOL back to a simple LF
 365                                         eEol = EOL_LF;
 366                                 }
 367                         }
 368                         break;
 369                 case 0x000b:
 370                         eEol = EOL_VT;
 371                         break;
 372                 case 0x000c:
 373                         eEol = EOL_FF;
 374                         break;
 375                 case 0x0085:
 376                         eEol = EOL_NEL;
 377                         break;
 378                 case 0x2028:
 379                         eEol = EOL_LS;
 380                         break;
 381                 case 0x2029:
 382                         eEol = EOL_PS;
 383                         break;
 384                 default:
 385                         continue;
 386                 }
 387                 oTextLine.sLine = CString(pLineStart, (int)(pTextBuf-pLineStart)-1);
 388                 oTextLine.eEnding = eEol;
 389                 CStdFileLineArray::Add(oTextLine);
 390                 ++countEOLs[eEol];
 391                 if (eEol==EOL_CRLF || eEol==EOL_LFCR)
 392                 {
 393                         ++pTextBuf;
 394                         --i;
 395                 }
 396                 pLineStart = pTextBuf;
 397         }
 398         CString line(pLineStart, (int)(pTextBuf-pLineStart));
 399         Add(line, EOL_NOENDING);
 400
 401         // some EOLs are not supported by the svn diff lib.
 402         m_bNeedsConversion |= (countEOLs[EOL_CRLF]!=0);
 403         m_bNeedsConversion |= (countEOLs[EOL_FF]!=0);
 404         m_bNeedsConversion |= (countEOLs[EOL_VT]!=0);
 405         m_bNeedsConversion |= (countEOLs[EOL_NEL]!=0);
 406         m_bNeedsConversion |= (countEOLs[EOL_LS]!=0);
 407         m_bNeedsConversion |= (countEOLs[EOL_PS]!=0);
 408
 409         size_t eolmax = 0;
 410         for (int nEol = 0; nEol<EOL__COUNT; nEol++)
 411         {
 412                 if (eolmax < countEOLs[nEol])
 413                 {
 414                         eolmax = countEOLs[nEol];
 415                         m_SaveParams.m_LineEndings = (EOL)nEol;
 416                 }
 417         }
 418
 419         return TRUE;
 420 }
 421
 422 void CFileTextLines::StripWhiteSpace(CString& sLine, DWORD dwIgnoreWhitespaces, bool blame)
 423 {
 424         if (blame)
 425         {
 426                 if (sLine.GetLength() > 66)
 427                         sLine = sLine.Mid(66);
 428         }
 429         switch (dwIgnoreWhitespaces)
 430         {
 431         case 0:
 432                 // Compare whitespaces
 433                 // do nothing
 434                 break;
 435         case 1:
 436                 // Ignore all whitespaces
 437                 sLine.TrimLeft(_T(" \t"));
 438                 sLine.TrimRight(_T(" \t"));
 439                 break;
 440         case 2:
 441                 // Ignore leading whitespace
 442                 sLine.TrimLeft(_T(" \t"));
 443                 break;
 444         case 3:
 445                 // Ignore ending whitespace
 446                 sLine.TrimRight(_T(" \t"));
 447                 break;
 448         }
 449 }
 450
 451 /**
 452         Encoding pattern:
 453                 - encode & save BOM
 454                 - Get Line
 455                 - modify line - whitespaces, lowercase
 456                 - encode & save line
 457                 - get cached encoded eol
 458                 - save eol
 459 */
 460 BOOL CFileTextLines::Save( const CString& sFilePath
 461                                                 , bool bSaveAsUTF8 /*= false */
 462                                                 , bool bUseSVNCompatibleEOLs /*= false */
 463                                                 , DWORD dwIgnoreWhitespaces /*= 0 */
 464                                                 , BOOL bIgnoreCase /*= FALSE */
 465                                                 , bool bBlame /*= false*/
 466                                                 , bool bIgnoreComments /*= false*/
 467                                                 , const CString& linestart /*= CString()*/
 468                                                 , const CString& blockstart /*= CString()*/
 469                                                 , const CString& blockend /*= CString()*/
 470                                                 , const std::wregex& rx /*= std::wregex(L"")*/
 471                                                 , const std::wstring& replacement /*=L""*/)
 472 {
 473         m_sCommentLine = linestart;
 474         m_sCommentBlockStart = blockstart;
 475         m_sCommentBlockEnd = blockend;
 476
 477         try
 478         {
 479                 CString destPath = sFilePath;
 480                 // now make sure that the destination directory exists
 481                 int ind = 0;
 482                 while (destPath.Find('\\', ind)>=2)
 483                 {
 484                         if (!PathIsDirectory(destPath.Left(destPath.Find('\\', ind))))
 485                         {
 486                                 if (!CreateDirectory(destPath.Left(destPath.Find('\\', ind)), NULL))
 487                                         return FALSE;
 488                         }
 489                         ind = destPath.Find('\\', ind)+1;
 490                 }
 491
 492                 CStdioFile file;                        // Hugely faster than CFile for big file writes - because it uses buffering
 493                 if (!file.Open(sFilePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary))
 494                 {
 495                         const_cast<CString *>(&m_sErrorString)->Format(IDS_ERR_FILE_OPEN, (LPCTSTR)sFilePath);
 496                         return FALSE;
 497                 }
 498
 499                 CBaseFilter * pFilter = NULL;
 500                 bool bSaveBom = true;
 501                 CFileTextLines::UnicodeType eUnicodeType = bSaveAsUTF8 ? CFileTextLines::UTF8 : m_SaveParams.m_UnicodeType;
 502                 switch (eUnicodeType)
 503                 {
 504                 default:
 505                 case CFileTextLines::ASCII:
 506                         bSaveBom = false;
 507                         pFilter = new CAsciiFilter(&file);
 508                         break;
 509                 case CFileTextLines::UTF8:
 510                         bSaveBom = false;
 511                 case CFileTextLines::UTF8BOM:
 512                         pFilter = new CUtf8Filter(&file);
 513                         break;
 514                 case CFileTextLines::UTF16_BE:
 515                         bSaveBom = false;
 516                         pFilter = new CUtf16beFilter(&file);
 517                         break;
 518                 case CFileTextLines::UTF16_BEBOM:
 519                         pFilter = new CUtf16beFilter(&file);
 520                         break;
 521                 case CFileTextLines::UTF16_LE:
 522                         bSaveBom = false;
 523                         pFilter = new CUtf16leFilter(&file);
 524                         break;
 525                 case CFileTextLines::UTF16_LEBOM:
 526                         pFilter = new CUtf16leFilter(&file);
 527                         break;
 528                 case CFileTextLines::UTF32_BE:
 529                         pFilter = new CUtf32beFilter(&file);
 530                         break;
 531                 case CFileTextLines::UTF32_LE:
 532                         pFilter = new CUtf32leFilter(&file);
 533                         break;
 534                 }
 535
 536                 if (bSaveBom)
 537                 {
 538                         //first write the BOM
 539                         pFilter->Write(L"\xfeff");
 540                 }
 541                 // cache EOLs
 542                 CBuffer oEncodedEol[EOL__COUNT];
 543                 oEncodedEol[EOL_LF] = pFilter->Encode(_T("\n")); // x0a
 544                 oEncodedEol[EOL_CR] = pFilter->Encode(_T("\r")); // x0d
 545                 oEncodedEol[EOL_CRLF] = pFilter->Encode(_T("\r\n")); // x0d x0a
 546                 if (bUseSVNCompatibleEOLs)
 547                 {
 548                         // when using EOLs that are supported by the svn lib,
 549                         // we have to use the same EOLs as the file has in case
 550                         // they're already supported, but a different supported one
 551                         // in case the original one isn't supported.
 552                         // Only this way the option "ignore EOLs (recommended)" unchecked
 553                         // actually shows the lines as different.
 554                         // However, the diff won't find and differences in EOLs
 555                         // for these special EOLs if they differ between those special ones
 556                         // listed below.
 557                         // But it will work properly for the most common EOLs LF/CR/CRLF.
 558                         oEncodedEol[EOL_LFCR] = oEncodedEol[EOL_CR];
 559                         for (int nEol = 0; nEol<EOL_NOENDING; nEol++)
 560                         {
 561                                 if (oEncodedEol[nEol].IsEmpty())
 562                                         oEncodedEol[nEol] = oEncodedEol[EOL_LF];
 563                         }
 564                 }
 565                 else
 566                 {
 567                         oEncodedEol[EOL_LFCR] = pFilter->Encode(_T("\n\r"));
 568                         oEncodedEol[EOL_VT] = pFilter->Encode(_T("\v")); // x0b
 569                         oEncodedEol[EOL_FF] = pFilter->Encode(_T("\f")); // x0c
 570                         oEncodedEol[EOL_NEL] = pFilter->Encode(_T("\x85"));
 571                         oEncodedEol[EOL_LS] = pFilter->Encode(_T("\x2028"));
 572                         oEncodedEol[EOL_PS] = pFilter->Encode(_T("\x2029"));
 573                 }
 574                 oEncodedEol[EOL_AUTOLINE] = oEncodedEol[m_SaveParams.m_LineEndings==EOL_AUTOLINE
 575                                 ? EOL_CRLF
 576                                 : m_SaveParams.m_LineEndings];
 577
 578                 bool bInBlockComment = false;
 579                 for (int i=0; i<GetCount(); i++)
 580                 {
 581                         CString sLineT = GetAt(i);
 582                         if (bIgnoreComments)
 583                                 bInBlockComment = StripComments(sLineT, bInBlockComment);
 584                         if (!rx._Empty())
 585                                 LineRegex(sLineT, rx, replacement);
 586                         StripWhiteSpace(sLineT, dwIgnoreWhitespaces, bBlame);
 587                         if (bIgnoreCase)
 588                                 sLineT = sLineT.MakeLower();
 589                         pFilter->Write(sLineT);
 590                         EOL eEol = GetLineEnding(i);
 591                         pFilter->Write(oEncodedEol[eEol]);
 592                 }
 593                 delete pFilter;
 594                 file.Close();
 595         }
 596         catch (CException * e)
 597         {
 598                 CString * psErrorString = const_cast<CString *>(&m_sErrorString);
 599                 e->GetErrorMessage(psErrorString->GetBuffer(4096), 4096);
 600                 psErrorString->ReleaseBuffer();
 601                 e->Delete();
 602                 return FALSE;
 603         }
 604         return TRUE;
 605 }
 606
 607 void CFileTextLines::SetErrorString()
 608 {
 609         m_sErrorString = CFormatMessageWrapper();
 610 }
 611
 612 void CFileTextLines::CopySettings(CFileTextLines * pFileToCopySettingsTo) const
 613 {
 614         if (pFileToCopySettingsTo)
 615         {
 616                 pFileToCopySettingsTo->m_SaveParams = m_SaveParams;
 617         }
 618 }
 619
 620 const wchar_t * CFileTextLines::GetEncodingName(UnicodeType eEncoding)
 621 {
 622         switch (eEncoding)
 623         {
 624         case ASCII:
 625                 return L"ASCII";
 626         case BINARY:
 627                 return L"BINARY";
 628         case UTF16_LE:
 629                 return L"UTF-16LE";
 630         case UTF16_LEBOM:
 631                 return L"UTF-16LE BOM";
 632         case UTF16_BE:
 633                 return L"UTF-16BE";
 634         case UTF16_BEBOM:
 635                 return L"UTF-16BE BOM";
 636         case UTF32_LE:
 637                 return L"UTF-32LE";
 638         case UTF32_BE:
 639                 return L"UTF-32BE";
 640         case UTF8:
 641                 return L"UTF-8";
 642         case UTF8BOM:
 643                 return L"UTF-8 BOM";
 644         }
 645         return L"";
 646 }
 647
 648 bool CFileTextLines::StripComments( CString& sLine, bool bInBlockComment )
 649 {
 650         int startpos = 0;
 651
 652         do
 653         {
 654                 if (bInBlockComment)
 655                 {
 656                         int endpos = sLine.Find(m_sCommentBlockEnd);
 657                         if (endpos >= 0)
 658                         {
 659                                 sLine = sLine.Left(startpos) + sLine.Mid(endpos+m_sCommentBlockEnd.GetLength());
 660                                 bInBlockComment = false;
 661                         }
 662                         else
 663                         {
 664                                 sLine = sLine.Left(startpos);
 665                                 startpos = -1;
 666                         }
 667                 }
 668                 if (!bInBlockComment)
 669                 {
 670                         startpos = m_sCommentBlockStart.IsEmpty() ? -1 : sLine.Find(m_sCommentBlockStart);
 671                         int startpos2 = m_sCommentLine.IsEmpty() ? -1 : sLine.Find(m_sCommentLine);
 672                         if ( ((startpos2 < startpos) && (startpos2 >= 0)) ||
 673                                  ((startpos2 >= 0) && (startpos < 0)) )
 674                         {
 675                                 // line comment, erase the rest of the line
 676                                 sLine = sLine.Left(startpos2);
 677                                 startpos = -1;
 678                         }
 679                         else if (startpos >= 0)
 680                         {
 681                                 // starting block comment
 682                                 bInBlockComment = true;
 683                         }
 684                 }
 685         } while (startpos >= 0);
 686
 687         return bInBlockComment;
 688 }
 689
 690 void CFileTextLines::LineRegex( CString& sLine, const std::wregex& rx, const std::wstring& replacement ) const
 691 {
 692         std::wstring str = (LPCTSTR)sLine;
 693         std::wstring str2 = std::regex_replace(str, rx, replacement);
 694         sLine = str2.c_str();
 695 }
 696
 697
 698 void CBuffer::ExpandToAtLeast(int nNewSize)
 699 {
 700         if (nNewSize>m_nAllocated)
 701         {
 702                 delete [] m_pBuffer; // we don't preserve buffer content intentionally
 703                 nNewSize+=2048-1;
 704                 nNewSize&=~(1024-1);
 705                 m_pBuffer=new BYTE[nNewSize];
 706                 m_nAllocated=nNewSize;
 707         }
 708 }
 709
 710 void CBuffer::SetLength(int nUsed)
 711 {
 712         ExpandToAtLeast(nUsed);
 713         m_nUsed = nUsed;
 714 }
 715
 716 void CBuffer::Swap(CBuffer & Src)
 717 {
 718         std::swap(Src.m_nAllocated, m_nAllocated);
 719         std::swap(Src.m_pBuffer, m_pBuffer);
 720         std::swap(Src.m_nUsed, m_nUsed);
 721 }
 722
 723 void CBuffer::Copy(const CBuffer & Src)
 724 {
 725         if (&Src != this)
 726         {
 727                 SetLength(Src.m_nUsed);
 728                 memcpy(m_pBuffer, Src.m_pBuffer, m_nUsed);
 729         }
 730 }
 731
 732
 733
 734 bool CBaseFilter::Decode(/*in out*/ CBuffer & data)
 735 {
 736         int nFlags = (m_nCodePage==CP_ACP) ? MB_PRECOMPOSED : 0;
 737         // dry decode is around 8 times faster then real one, alternatively we can set buffer to max length
 738         int nReadChars = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), NULL, 0);
 739         m_oBuffer.SetLength(nReadChars*sizeof(wchar_t));
 740         int ret2 = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), (LPWSTR)(void *)m_oBuffer, nReadChars);
 741         if (ret2 != nReadChars)
 742         {
 743                 return FALSE;
 744         }
 745         data.Swap(m_oBuffer);
 746         return TRUE;
 747 }
 748
 749 const CBuffer & CBaseFilter::Encode(const CString s)
 750 {
 751         m_oBuffer.SetLength(s.GetLength()*3+1); // set buffer to guessed max size
 752         int nConvertedLen = WideCharToMultiByte(m_nCodePage, 0, (LPCTSTR)s, s.GetLength(), (LPSTR)m_oBuffer, m_oBuffer.GetLength(), NULL, NULL);
 753         m_oBuffer.SetLength(nConvertedLen); // set buffer to used size
 754         return m_oBuffer;
 755 }
 756
 757
 758
 759 bool CUtf16leFilter::Decode(/*in out*/ CBuffer & /*data*/)
 760 {
 761         // we believe data is ok for use
 762         return TRUE;
 763 }
 764
 765 const CBuffer & CUtf16leFilter::Encode(const CString s)
 766 {
 767         int nNeedBytes = s.GetLength()*sizeof(TCHAR);
 768         m_oBuffer.SetLength(nNeedBytes);
 769         memcpy((void *)m_oBuffer, (LPCTSTR)s, nNeedBytes);
 770         return m_oBuffer;
 771 }
 772
 773
 774
 775 bool CUtf16beFilter::Decode(/*in out*/ CBuffer & data)
 776 {
 777         int nNeedBytes = data.GetLength();
 778         // make in place WORD BYTEs swap
 779         UINT64 * p_qw = (UINT64 *)(void *)data;
 780         int nQwords = nNeedBytes/8;
 781         for (int nQword = 0; nQword<nQwords; nQword++)
 782         {
 783                 p_qw[nQword] = WordSwapBytes(p_qw[nQword]);
 784         }
 785         wchar_t * p_w = (wchar_t *)p_qw;
 786         int nWords = nNeedBytes/2;
 787         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 788         {
 789                 p_w[nWord] = WideCharSwap(p_w[nWord]);
 790         }
 791         return CUtf16leFilter::Decode(data);
 792 }
 793
 794 const CBuffer & CUtf16beFilter::Encode(const CString s)
 795 {
 796         int nNeedBytes = s.GetLength()*sizeof(TCHAR);
 797         m_oBuffer.SetLength(nNeedBytes);
 798         // copy swaping BYTE order in WORDs
 799         const UINT64 * p_qwIn = (const UINT64 *)(LPCTSTR)s;
 800         UINT64 * p_qwOut = (UINT64 *)(void *)m_oBuffer;
 801         int nQwords = nNeedBytes/8;
 802         for (int nQword = 0; nQword<nQwords; nQword++)
 803         {
 804                 p_qwOut[nQword] = WordSwapBytes(p_qwIn[nQword]);
 805         }
 806         wchar_t * p_wIn = (wchar_t *)p_qwIn;
 807         wchar_t * p_wOut = (wchar_t *)p_qwOut;
 808         int nWords = nNeedBytes/2;
 809         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 810         {
 811                 p_wOut[nWord] = WideCharSwap(p_wIn[nWord]);
 812         }
 813         return m_oBuffer;
 814 }
 815
 816
 817
 818 bool CUtf32leFilter::Decode(/*in out*/ CBuffer & data)
 819 {
 820         // UTF32 have four bytes per char
 821         int nReadChars = data.GetLength()/4;
 822         UINT32 * p32 = (UINT32 *)(void *)data;
 823
 824         // count chars which needs surrogate pair
 825         int nSurrogatePairCount = 0;
 826         for (int i = 0; i<nReadChars; ++i)
 827         {
 828                 if (p32[i]<0x110000 && p32[i]>=0x10000)
 829                 {
 830                         ++nSurrogatePairCount;
 831                 }
 832         }
 833
 834         // fill buffer
 835         m_oBuffer.SetLength((nReadChars+nSurrogatePairCount)*sizeof(wchar_t));
 836         wchar_t * pOut = (wchar_t *)m_oBuffer;
 837         for (int i = 0; i<nReadChars; ++i, ++pOut)
 838         {
 839                 UINT32 zChar = p32[i];
 840                 if (zChar>=0x110000)
 841                 {
 842                         *pOut=0xfffd; // ? mark
 843                 }
 844                 else if (zChar>=0x10000)
 845                 {
 846                         zChar-=0x10000;
 847                         pOut[0] = ((zChar>>10)&0x3ff) | 0xd800; // lead surrogate
 848                         pOut[1] = (zChar&0x7ff) | 0xdc00; // trail surrogate
 849                         pOut++;
 850                 }
 851                 else
 852                 {
 853                         *pOut = (wchar_t)zChar;
 854                 }
 855         }
 856         data.Swap(m_oBuffer);
 857         return TRUE;
 858 }
 859
 860 const CBuffer & CUtf32leFilter::Encode(const CString s)
 861 {
 862         int nInWords = s.GetLength();
 863         m_oBuffer.SetLength(nInWords*2);
 864
 865         LPCTSTR p_In = (LPCTSTR)s;
 866         UINT32 * p_Out = (UINT32 *)(void *)m_oBuffer;
 867         int nOutDword = 0;
 868         for (int nInWord = 0; nInWord<nInWords; nInWord++, nOutDword++)
 869         {
 870                 UINT32 zChar = p_In[nInWord];
 871                 if ((zChar&0xfc00) == 0xd800) // lead surrogate
 872                 {
 873                         if (nInWord+1<nInWords && (p_In[nInWord+1]&0xfc00) == 0xdc00) // trail surrogate follows
 874                         {
 875                                 zChar = 0x10000 + ((zChar&0x3ff)<<10) + (p_In[++nInWord]&0x3ff);
 876                         }
 877                         else
 878                         {
 879                                 zChar = 0xfffd; // ? mark
 880                         }
 881                 }
 882                 else if ((zChar&0xfc00) == 0xdc00) // trail surrogate without lead
 883                 {
 884                         zChar = 0xfffd; // ? mark
 885                 }
 886                 p_Out[nOutDword] = zChar;
 887         }
 888         m_oBuffer.SetLength(nOutDword*4); // store length reduced by surrogates
 889         return m_oBuffer;
 890 }
 891
 892
 893
 894 bool CUtf32beFilter::Decode(/*in out*/ CBuffer & data)
 895 {
 896
 897         // swap BYTEs order in DWORDs
 898         UINT64 * p64 = (UINT64 *)(void *)data;
 899         int nQwords = data.GetLength()/8;
 900         for (int nQword = 0; nQword<nQwords; nQword++)
 901         {
 902                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 903         }
 904
 905         UINT32 * p32 = (UINT32 *)p64;
 906         int nDwords = data.GetLength()/4;
 907         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 908         {
 909                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 910         }
 911         return CUtf32leFilter::Decode(data);
 912 }
 913
 914 const CBuffer & CUtf32beFilter::Encode(const CString s)
 915 {
 916         CUtf32leFilter::Encode(s);
 917
 918         // swap BYTEs order in DWORDs
 919         UINT64 * p64 = (UINT64 *)(void *)m_oBuffer;
 920         int nQwords = m_oBuffer.GetLength()/8;
 921         for (int nQword = 0; nQword<nQwords; nQword++)
 922         {
 923                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 924         }
 925
 926         UINT32 * p32 = (UINT32 *)p64;
 927         int nDwords = m_oBuffer.GetLength()/4;
 928         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 929         {
 930                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 931         }
 932         return m_oBuffer;
 933 }