src/TortoiseMerge/FileTextLines.cpp

   1 // TortoiseMerge - a Diff/Patch program
   2
   3 // Copyright (C) 2007-2011 - TortoiseSVN
   4
   5 // This program is free software; you can redistribute it and/or
   6 // modify it under the terms of the GNU General Public License
   7 // as published by the Free Software Foundation; either version 2
   8 // of the License, or (at your option) any later version.
   9
  10 // This program is distributed in the hope that it will be useful,
  11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 // GNU General Public License for more details.
  14
  15 // You should have received a copy of the GNU General Public License
  16 // along with this program; if not, write to the Free Software Foundation,
  17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 #include "StdAfx.h"
  20 #include "Resource.h"
  21 #include "UnicodeUtils.h"
  22 #include "registry.h"
  23 #include ".\filetextlines.h"
  24 #include "FormatMessageWrapper.h"
  25 #include "SmartHandle.h"
  26
  27 CFileTextLines::CFileTextLines(void)
  28         : m_UnicodeType(CFileTextLines::AUTOTYPE)
  29         , m_LineEndings(EOL_AUTOLINE)
  30         , m_bReturnAtEnd(false)
  31 {
  32 }
  33
  34 CFileTextLines::~CFileTextLines(void)
  35 {
  36 }
  37
  38 CFileTextLines::UnicodeType CFileTextLines::CheckUnicodeType(LPVOID pBuffer, int cb)
  39 {
  40         if (cb < 2)
  41                 return CFileTextLines::ASCII;
  42         UINT16 * pVal16 = (UINT16 *)pBuffer;
  43         UINT8 * pVal8 = (UINT8 *)(pVal16+1);
  44         // scan the whole buffer for a 0x0000 sequence
  45         // if found, we assume a binary file
  46         for (int i=0; i<(cb-2); i=i+2)
  47         {
  48                 if (0x0000 == *pVal16++)
  49                         return CFileTextLines::BINARY;
  50         }
  51         pVal16 = (UINT16 *)pBuffer;
  52         if (*pVal16 == 0xFEFF)
  53                 return CFileTextLines::UNICODE_LE;
  54         if (cb < 3)
  55                 return ASCII;
  56         if (*pVal16 == 0xBBEF)
  57         {
  58                 if (*pVal8 == 0xBF)
  59                         return CFileTextLines::UTF8BOM;
  60         }
  61         // check for illegal UTF8 chars
  62         pVal8 = (UINT8 *)pBuffer;
  63         for (int i=0; i<cb; ++i)
  64         {
  65                 if ((*pVal8 == 0xC0)||(*pVal8 == 0xC1)||(*pVal8 >= 0xF5))
  66                         return CFileTextLines::ASCII;
  67                 pVal8++;
  68         }
  69         pVal8 = (UINT8 *)pBuffer;
  70         bool bUTF8 = false;
  71         bool bNonANSI = false;
  72         for (int i=0; i<(cb-3); ++i)
  73         {
  74                 if (*pVal8 > 127)
  75                         bNonANSI = true;
  76                 if ((*pVal8 & 0xE0)==0xC0)
  77                 {
  78                         pVal8++;i++;
  79                         if ((*pVal8 & 0xC0)!=0x80)
  80                                 return CFileTextLines::ASCII;
  81                         bUTF8 = true;
  82                 }
  83                 if ((*pVal8 & 0xF0)==0xE0)
  84                 {
  85                         pVal8++;i++;
  86                         if ((*pVal8 & 0xC0)!=0x80)
  87                                 return CFileTextLines::ASCII;
  88                         pVal8++;i++;
  89                         if ((*pVal8 & 0xC0)!=0x80)
  90                                 return CFileTextLines::ASCII;
  91                         bUTF8 = true;
  92                 }
  93                 if ((*pVal8 & 0xF8)==0xF0)
  94                 {
  95                         pVal8++;i++;
  96                         if ((*pVal8 & 0xC0)!=0x80)
  97                                 return CFileTextLines::ASCII;
  98                         pVal8++;i++;
  99                         if ((*pVal8 & 0xC0)!=0x80)
 100                                 return CFileTextLines::ASCII;
 101                         pVal8++;i++;
 102                         if ((*pVal8 & 0xC0)!=0x80)
 103                                 return CFileTextLines::ASCII;
 104                         bUTF8 = true;
 105                 }
 106                 pVal8++;
 107         }
 108         if (bUTF8)
 109                 return CFileTextLines::UTF8;
 110         if ((!bNonANSI)&&(DWORD(CRegDWORD(_T("Software\\TortoiseMerge\\UseUTF8"), FALSE))))
 111                 return CFileTextLines::UTF8;
 112         return CFileTextLines::ASCII;
 113 }
 114
 115
 116 EOL CFileTextLines::CheckLineEndings(LPVOID pBuffer, int cb)
 117 {
 118         EOL retval = EOL_AUTOLINE;
 119         char * buf = (char *)pBuffer;
 120         for (int i=0; i<cb; i++)
 121         {
 122                 //now search the buffer for line endings
 123                 if (buf[i] == 0x0a)
 124                 {
 125                         if ((i+1)<cb)
 126                         {
 127                                 if (buf[i+1] == 0)
 128                                 {
 129                                         //UNICODE
 130                                         if ((i+2)<cb)
 131                                         {
 132                                                 if (buf[i+2] == 0x0d)
 133                                                 {
 134                                                         retval = EOL_LFCR;
 135                                                         break;
 136                                                 }
 137                                                 else
 138                                                 {
 139                                                         retval = EOL_LF;
 140                                                         break;
 141                                                 }
 142                                         }
 143                                 }
 144                                 else if (buf[i+1] == 0x0d)
 145                                 {
 146                                         retval = EOL_LFCR;
 147                                         break;
 148                                 }
 149                         }
 150                         retval = EOL_LF;
 151                         break;
 152                 }
 153                 else if (buf[i] == 0x0d)
 154                 {
 155                         if ((i+1)<cb)
 156                         {
 157                                 if (buf[i+1] == 0)
 158                                 {
 159                                         //UNICODE
 160                                         if ((i+2)<cb)
 161                                         {
 162                                                 if (buf[i+2] == 0x0a)
 163                                                 {
 164                                                         retval = EOL_CRLF;
 165                                                         break;
 166                                                 }
 167                                                 else
 168                                                 {
 169                                                         retval = EOL_CR;
 170                                                         break;
 171                                                 }
 172                                         }
 173                                 }
 174                                 else if (buf[i+1] == 0x0a)
 175                                 {
 176                                         retval = EOL_CRLF;
 177                                         break;
 178                                 }
 179                         }
 180                         retval = EOL_CR;
 181                         break;
 182                 }
 183         }
 184         return retval;
 185 }
 186
 187 BOOL CFileTextLines::Load(const CString& sFilePath, int lengthHint /* = 0*/)
 188 {
 189         m_LineEndings = EOL_AUTOLINE;
 190         m_UnicodeType = CFileTextLines::AUTOTYPE;
 191         RemoveAll();
 192         m_endings.clear();
 193         if(lengthHint != 0)
 194         {
 195                 Reserve(lengthHint);
 196         }
 197
 198         if (PathIsDirectory(sFilePath))
 199         {
 200                 m_sErrorString.Format(IDS_ERR_FILE_NOTAFILE, (LPCTSTR)sFilePath);
 201                 return FALSE;
 202         }
 203
 204         if (!PathFileExists(sFilePath))
 205         {
 206                 //file does not exist, so just return SUCCESS
 207                 return TRUE;
 208         }
 209
 210         CAutoFile hFile = CreateFile(sFilePath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, NULL, NULL);
 211         if (!hFile)
 212         {
 213                 SetErrorString();
 214                 return FALSE;
 215         }
 216
 217         LARGE_INTEGER fsize;
 218         if (!GetFileSizeEx(hFile, &fsize))
 219         {
 220                 SetErrorString();
 221                 return false;
 222         }
 223         if (fsize.HighPart)
 224         {
 225                 // file is way too big for us
 226                 m_sErrorString.LoadString(IDS_ERR_FILE_TOOBIG);
 227                 return FALSE;
 228         }
 229
 230         // If new[] was done for type T delete[] must be called on a pointer of type T*,
 231         // otherwise the behavior is undefined.
 232         // +1 is to address possible truncation when integer division is done
 233         wchar_t* pFileBuf = new wchar_t[fsize.LowPart/sizeof(wchar_t) + 1];
 234         DWORD dwReadBytes = 0;
 235         if (!ReadFile(hFile, pFileBuf, fsize.LowPart, &dwReadBytes, NULL))
 236         {
 237                 delete [] pFileBuf;
 238                 SetErrorString();
 239                 return FALSE;
 240         }
 241         if (m_UnicodeType == CFileTextLines::AUTOTYPE)
 242         {
 243                 m_UnicodeType = this->CheckUnicodeType(pFileBuf, dwReadBytes);
 244         }
 245         if (m_LineEndings == EOL_AUTOLINE)
 246         {
 247                 m_LineEndings = CheckLineEndings(pFileBuf, min(10000, dwReadBytes));
 248         }
 249         hFile.CloseHandle();
 250
 251         if (m_UnicodeType == CFileTextLines::BINARY)
 252         {
 253                 m_sErrorString.Format(IDS_ERR_FILE_BINARY, (LPCTSTR)sFilePath);
 254                 delete [] pFileBuf;
 255                 return FALSE;
 256         }
 257
 258         // we may have to convert the file content
 259         if ((m_UnicodeType == UTF8)||(m_UnicodeType == UTF8BOM))
 260         {
 261                 int ret = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)pFileBuf, dwReadBytes, NULL, 0);
 262                 wchar_t * pWideBuf = new wchar_t[ret];
 263                 int ret2 = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)pFileBuf, dwReadBytes, pWideBuf, ret);
 264                 if (ret2 == ret)
 265                 {
 266                         delete [] pFileBuf;
 267                         pFileBuf = pWideBuf;
 268                         dwReadBytes = ret2;
 269                 } else
 270                         delete [] pWideBuf;
 271         }
 272         else if (m_UnicodeType == ASCII)
 273         {
 274                 int ret = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, (LPCSTR)pFileBuf, dwReadBytes, NULL, 0);
 275                 wchar_t * pWideBuf = new wchar_t[ret];
 276                 int ret2 = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, (LPCSTR)pFileBuf, dwReadBytes, pWideBuf, ret);
 277                 if (ret2 == ret)
 278                 {
 279                         delete [] pFileBuf;
 280                         pFileBuf = pWideBuf;
 281                         dwReadBytes = ret2;
 282                 }
 283                 else
 284                         delete [] pWideBuf;
 285         }
 286         // fill in the lines into the array
 287         wchar_t * pTextBuf = pFileBuf;
 288         wchar_t * pLineStart = pFileBuf;
 289         if (m_UnicodeType == UNICODE_LE)
 290         {
 291                 // UTF16 have two bytes per char
 292                 dwReadBytes/=2;
 293         }
 294         if ((m_UnicodeType == UTF8BOM)||(m_UnicodeType == UNICODE_LE))
 295         {
 296                 // ignore the BOM
 297                 ++pTextBuf;
 298                 ++pLineStart;
 299                 --dwReadBytes;
 300         }
 301
 302         for (DWORD i = 0; i<dwReadBytes; ++i)
 303         {
 304                 if (*pTextBuf == '\r')
 305                 {
 306                         if ((i + 1) < dwReadBytes)
 307                         {
 308                                 if (*(pTextBuf+1) == '\n')
 309                                 {
 310                                         // crlf line ending
 311                                         CString line(pLineStart, (int)(pTextBuf-pLineStart));
 312                                         Add(line, EOL_CRLF);
 313                                         pLineStart = pTextBuf+2;
 314                                         ++pTextBuf;
 315                                         ++i;
 316                                 }
 317                                 else
 318                                 {
 319                                         // cr line ending
 320                                         CString line(pLineStart, (int)(pTextBuf-pLineStart));
 321                                         Add(line, EOL_CR);
 322                                         pLineStart =pTextBuf+1;
 323                                 }
 324                         }
 325                 }
 326                 else if (*pTextBuf == '\n')
 327                 {
 328                         // lf line ending
 329                         CString line(pLineStart, (int)(pTextBuf-pLineStart));
 330                         Add(line, EOL_LF);
 331                         pLineStart =pTextBuf+1;
 332                 }
 333                 ++pTextBuf;
 334         }
 335         if (pLineStart < pTextBuf)
 336         {
 337                 CString line(pLineStart, (int)(pTextBuf-pLineStart));
 338                 Add(line, EOL_NOENDING);
 339                 m_bReturnAtEnd = false;
 340         }
 341         else
 342                 m_bReturnAtEnd = true;
 343
 344         delete [] pFileBuf;
 345
 346         return TRUE;
 347 }
 348
 349 void CFileTextLines::StripWhiteSpace(CString& sLine,DWORD dwIgnoreWhitespaces, bool blame)
 350 {
 351         if (blame)
 352         {
 353                 if (sLine.GetLength() > 66)
 354                         sLine = sLine.Mid(66);
 355         }
 356         switch (dwIgnoreWhitespaces)
 357         {
 358         case 0:
 359                 // Compare whitespaces
 360                 // do nothing
 361                 break;
 362         case 1:
 363                 // Ignore all whitespaces
 364                 sLine.TrimLeft(_T(" \t"));
 365                 sLine.TrimRight(_T(" \t"));
 366                 break;
 367         case 2:
 368                 // Ignore leading whitespace
 369                 sLine.TrimLeft(_T(" \t"));
 370                 break;
 371         case 3:
 372                 // Ignore ending whitespace
 373                 sLine.TrimRight(_T(" \t"));
 374                 break;
 375         }
 376 }
 377
 378 void CFileTextLines::StripAsciiWhiteSpace(CStringA& sLine,DWORD dwIgnoreWhitespaces, bool blame)
 379 {
 380         if (blame)
 381         {
 382                 if (sLine.GetLength() > 66)
 383                         sLine = sLine.Mid(66);
 384         }
 385         switch (dwIgnoreWhitespaces)
 386         {
 387         case 0: // Compare whitespaces
 388                 // do nothing
 389                 break;
 390         case 1:
 391                 // Ignore all whitespaces
 392                 StripAsciiWhiteSpace(sLine);
 393                 break;
 394         case 2:
 395                 // Ignore leading whitespace
 396                 sLine.TrimLeft(" \t");
 397                 break;
 398         case 3:
 399                 // Ignore leading whitespace
 400                 sLine.TrimRight(" \t");
 401                 break;
 402         }
 403 }
 404
 405 //
 406 // Fast in-place removal of spaces and tabs from CStringA line
 407 //
 408 void CFileTextLines::StripAsciiWhiteSpace(CStringA& sLine)
 409 {
 410         int outputLen = 0;
 411         char* pWriteChr = sLine.GetBuffer(sLine.GetLength());
 412         const char* pReadChr = pWriteChr;
 413         while(*pReadChr)
 414         {
 415                 if(*pReadChr != ' ' && *pReadChr != '\t')
 416                 {
 417                         *pWriteChr++ = *pReadChr;
 418                         outputLen++;
 419                 }
 420                 ++pReadChr;
 421         }
 422         *pWriteChr = '\0';
 423         sLine.ReleaseBuffer(outputLen);
 424 }
 425
 426 BOOL CFileTextLines::Save(const CString& sFilePath, bool bSaveAsUTF8, DWORD dwIgnoreWhitespaces /*=0*/, BOOL bIgnoreCase /*= FALSE*/, bool bBlame /*= false*/)
 427 {
 428         try
 429         {
 430                 CString destPath = sFilePath;
 431                 // now make sure that the destination directory exists
 432                 int ind = 0;
 433                 while (destPath.Find('\\', ind)>=2)
 434                 {
 435                         if (!PathIsDirectory(destPath.Left(destPath.Find('\\', ind))))
 436                         {
 437                                 if (!CreateDirectory(destPath.Left(destPath.Find('\\', ind)), NULL))
 438                                         return FALSE;
 439                         }
 440                         ind = destPath.Find('\\', ind)+1;
 441                 }
 442
 443                 CStdioFile file;                        // Hugely faster than CFile for big file writes - because it uses buffering
 444                 if (!file.Open(sFilePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary))
 445                 {
 446                         m_sErrorString.Format(IDS_ERR_FILE_OPEN, (LPCTSTR)sFilePath);
 447                         return FALSE;
 448                 }
 449                 if ((!bSaveAsUTF8)&&(m_UnicodeType == CFileTextLines::UNICODE_LE))
 450                 {
 451                         //first write the BOM
 452                         UINT16 wBOM = 0xFEFF;
 453                         file.Write(&wBOM, 2);
 454                         for (int i=0; i<GetCount(); i++)
 455                         {
 456                                 CString sLine = GetAt(i);
 457                                 EOL ending = GetLineEnding(i);
 458                                 StripWhiteSpace(sLine,dwIgnoreWhitespaces, bBlame);
 459                                 if (bIgnoreCase)
 460                                         sLine = sLine.MakeLower();
 461                                 file.Write((LPCTSTR)sLine, sLine.GetLength()*sizeof(TCHAR));
 462                                 if (ending == EOL_AUTOLINE)
 463                                         ending = m_LineEndings;
 464                                 switch (ending)
 465                                 {
 466                                 case EOL_CR:
 467                                         sLine = _T("\x0d");
 468                                         break;
 469                                 case EOL_CRLF:
 470                                 case EOL_AUTOLINE:
 471                                         sLine = _T("\x0d\x0a");
 472                                         break;
 473                                 case EOL_LF:
 474                                         sLine = _T("\x0a");
 475                                         break;
 476                                 case EOL_LFCR:
 477                                         sLine = _T("\x0a\x0d");
 478                                         break;
 479                                 default:
 480                                         sLine.Empty();
 481                                         break;
 482                                 }
 483                                 if ((m_bReturnAtEnd)||(i != GetCount()-1))
 484                                         file.Write((LPCTSTR)sLine, sLine.GetLength()*sizeof(TCHAR));
 485                         }
 486                 }
 487                 else if ((!bSaveAsUTF8)&&((m_UnicodeType == CFileTextLines::ASCII)||(m_UnicodeType == CFileTextLines::AUTOTYPE)))
 488                 {
 489                         for (int i=0; i< GetCount(); i++)
 490                         {
 491                                 // Copy CString to 8 bit without conversion
 492                                 CString sLineT = GetAt(i);
 493                                 CStringA sLine = CStringA(sLineT);
 494                                 EOL ending = GetLineEnding(i);
 495
 496                                 StripAsciiWhiteSpace(sLine,dwIgnoreWhitespaces, bBlame);
 497                                 if (bIgnoreCase)
 498                                         sLine = sLine.MakeLower();
 499                                 if ((m_bReturnAtEnd)||(i != GetCount()-1))
 500                                 {
 501                                         if (ending == EOL_AUTOLINE)
 502                                                 ending = m_LineEndings;
 503                                         switch (ending)
 504                                         {
 505                                         case EOL_CR:
 506                                                 sLine += '\x0d';
 507                                                 break;
 508                                         case EOL_CRLF:
 509                                         case EOL_AUTOLINE:
 510                                                 sLine.Append("\x0d\x0a", 2);
 511                                                 break;
 512                                         case EOL_LF:
 513                                                 sLine += '\x0a';
 514                                                 break;
 515                                         case EOL_LFCR:
 516                                                 sLine.Append("\x0a\x0d", 2);
 517                                                 break;
 518                                         }
 519                                 }
 520                                 file.Write((LPCSTR)sLine, sLine.GetLength());
 521                         }
 522                 }
 523                 else if ((bSaveAsUTF8)||((m_UnicodeType == CFileTextLines::UTF8BOM)||(m_UnicodeType == CFileTextLines::UTF8)))
 524                 {
 525                         if (m_UnicodeType == CFileTextLines::UTF8BOM)
 526                         {
 527                                 //first write the BOM
 528                                 UINT16 wBOM = 0xBBEF;
 529                                 file.Write(&wBOM, 2);
 530                                 UINT8 uBOM = 0xBF;
 531                                 file.Write(&uBOM, 1);
 532                         }
 533                         for (int i=0; i<GetCount(); i++)
 534                         {
 535                                 CStringA sLine = CUnicodeUtils::GetUTF8(GetAt(i));
 536                                 EOL ending = GetLineEnding(i);
 537                                 StripAsciiWhiteSpace(sLine,dwIgnoreWhitespaces, bBlame);
 538                                 if (bIgnoreCase)
 539                                         sLine = sLine.MakeLower();
 540
 541                                 if ((m_bReturnAtEnd)||(i != GetCount()-1))
 542                                 {
 543                                         if (ending == EOL_AUTOLINE)
 544                                                 ending = m_LineEndings;
 545                                         switch (ending)
 546                                         {
 547                                         case EOL_CR:
 548                                                 sLine += '\x0d';
 549                                                 break;
 550                                         case EOL_CRLF:
 551                                         case EOL_AUTOLINE:
 552                                                 sLine.Append("\x0d\x0a",2);
 553                                                 break;
 554                                         case EOL_LF:
 555                                                 sLine += '\x0a';
 556                                                 break;
 557                                         case EOL_LFCR:
 558                                                 sLine.Append("\x0a\x0d",2);
 559                                                 break;
 560                                         }
 561                                 }
 562                                 file.Write((LPCSTR)sLine, sLine.GetLength());
 563                         }
 564                 }
 565                 file.Close();
 566         }
 567         catch (CException * e)
 568         {
 569                 e->GetErrorMessage(m_sErrorString.GetBuffer(4096), 4096);
 570                 m_sErrorString.ReleaseBuffer();
 571                 e->Delete();
 572                 return FALSE;
 573         }
 574         return TRUE;
 575 }
 576
 577 void CFileTextLines::SetErrorString()
 578 {
 579         m_sErrorString = CFormatMessageWrapper();
 580 }
 581
 582 void CFileTextLines::CopySettings(CFileTextLines * pFileToCopySettingsTo)
 583 {
 584         if (pFileToCopySettingsTo)
 585         {
 586                 pFileToCopySettingsTo->m_UnicodeType = m_UnicodeType;
 587                 pFileToCopySettingsTo->m_LineEndings = m_LineEndings;
 588                 pFileToCopySettingsTo->m_bReturnAtEnd = m_bReturnAtEnd;
 589         }
 590 }