src/TortoiseMerge/FileTextLines.cpp

   1 // TortoiseGitMerge - a Diff/Patch program
   2
   3 // Copyright (C) 2007-2012 - TortoiseSVN
   4
   5 // This program is free software; you can redistribute it and/or
   6 // modify it under the terms of the GNU General Public License
   7 // as published by the Free Software Foundation; either version 2
   8 // of the License, or (at your option) any later version.
   9
  10 // This program is distributed in the hope that it will be useful,
  11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 // GNU General Public License for more details.
  14
  15 // You should have received a copy of the GNU General Public License
  16 // along with this program; if not, write to the Free Software Foundation,
  17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 #include "stdafx.h"
  20 #include "resource.h"
  21 #include "UnicodeUtils.h"
  22 #include "registry.h"
  23 #include "filetextlines.h"
  24 #include "FormatMessageWrapper.h"
  25 #include "SmartHandle.h"
  26
  27 wchar_t inline WideCharSwap(wchar_t nValue)
  28 {
  29         return (((nValue>> 8)) | (nValue << 8));
  30         //return _byteswap_ushort(nValue);
  31 }
  32
  33 UINT64 inline WordSwapBytes(UINT64 nValue)
  34 {
  35         return ((nValue&0xff00ff00ff00ff)<<8) | ((nValue>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  36 }
  37
  38 UINT32 inline DwordSwapBytes(UINT32 nValue)
  39 {
  40         UINT32 nRet = (nValue<<16) | (nValue>>16); // swap WORDs
  41         nRet = ((nRet&0xff00ff)<<8) | ((nRet>>8)&0xff00ff); // swap BYTESs in WORDs
  42         return nRet;
  43         //return _byteswap_ulong(nValue);
  44 }
  45
  46 UINT64 inline DwordSwapBytes(UINT64 nValue)
  47 {
  48         UINT64 nRet = ((nValue&0xffff0000ffffL)<<16) | ((nValue>>16)&0xffff0000ffffL); // swap WORDs in DWORDs
  49         nRet = ((nRet&0xff00ff00ff00ff)<<8) | ((nRet>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  50         return nRet;
  51 }
  52
  53 CFileTextLines::CFileTextLines(void)
  54         : m_UnicodeType(CFileTextLines::AUTOTYPE)
  55         , m_LineEndings(EOL_AUTOLINE)
  56         , m_bNeedsConversion(false)
  57 {
  58 }
  59
  60 CFileTextLines::~CFileTextLines(void)
  61 {
  62 }
  63
  64 CFileTextLines::UnicodeType CFileTextLines::CheckUnicodeType(LPVOID pBuffer, int cb)
  65 {
  66         if (cb < 2)
  67                 return CFileTextLines::ASCII;
  68         const UINT32 * const pVal32 = (UINT32 *)pBuffer;
  69         const UINT16 * const pVal16 = (UINT16 *)pBuffer;
  70         const UINT8 * const pVal8 = (UINT8 *)pBuffer;
  71         // scan the whole buffer for a 0x00000000 sequence
  72         // if found, we assume a binary file
  73         int nDwords = cb/4;
  74         for (int i=0; i<nDwords; ++i)
  75         {
  76                 if (0x00000000 == pVal32[i])
  77                         return CFileTextLines::BINARY;
  78         }
  79         if (cb >=4 )
  80         {
  81                 if (*pVal32 == 0x0000FEFF)
  82                 {
  83                         return CFileTextLines::UTF32_LE;
  84                 }
  85                 if (*pVal32 == 0xFFFE0000)
  86                 {
  87                         return CFileTextLines::UTF32_BE;
  88                 }
  89         }
  90         if (*pVal16 == 0xFEFF)
  91         {
  92                 return CFileTextLines::UTF16_LE;
  93         }
  94         if (*pVal16 == 0xFFFE)
  95         {
  96                 return CFileTextLines::UTF16_BE;
  97         }
  98         if (cb < 3)
  99                 return CFileTextLines::ASCII;
 100         if (*pVal16 == 0xBBEF)
 101         {
 102                 if (pVal8[2] == 0xBF)
 103                         return CFileTextLines::UTF8BOM;
 104         }
 105         // check for illegal UTF8 sequences
 106         bool bNonANSI = false;
 107         int nNeedData = 0;
 108         int i=0;
 109         // run fast for ascii
 110         for (; i<cb; i+=8)
 111         {
 112                 if ((*(UINT64 *)&pVal8[i] & 0x8080808080808080)!=0) // all Ascii?
 113                 {
 114                         bNonANSI = true;
 115                         break;
 116                 }
 117         }
 118         // continue slow
 119         for (; i<cb; ++i)
 120         {
 121                 UINT8 zChar = pVal8[i];
 122                 if ((zChar & 0x80)==0) // Ascii
 123                 {
 124                         if (nNeedData)
 125                         {
 126                                 return CFileTextLines::ASCII;
 127                         }
 128                         continue;
 129                 }
 130                 if ((zChar & 0x40)==0) // top bit
 131                 {
 132                         if (!nNeedData)
 133                                 return CFileTextLines::ASCII;
 134                         --nNeedData;
 135                 }
 136                 else if (nNeedData)
 137                 {
 138                         return CFileTextLines::ASCII;
 139                 }
 140                 else if ((zChar & 0x20)==0) // top two bits
 141                 {
 142                         if (zChar<=0xC1)
 143                                 return CFileTextLines::ASCII;
 144                         nNeedData = 1;
 145                 }
 146                 else if ((zChar & 0x10)==0) // top three bits
 147                 {
 148                         nNeedData = 2;
 149                 }
 150                 else if ((zChar & 0x08)==0) // top four bits
 151                 {
 152                         if (zChar>=0xf5)
 153                                 return CFileTextLines::ASCII;
 154                         nNeedData = 3;
 155                 }
 156                 else
 157                         return CFileTextLines::ASCII;
 158         }
 159         if (bNonANSI && nNeedData==0)
 160                 // if get here thru nonAscii and no missing data left then its valid UTF8
 161                 return CFileTextLines::UTF8;
 162         if ((!bNonANSI)&&(DWORD(CRegDWORD(_T("Software\\TortoiseGitMerge\\UseUTF8"), FALSE))))
 163                 return CFileTextLines::UTF8;
 164         return CFileTextLines::ASCII;
 165 }
 166
 167
 168 BOOL CFileTextLines::Load(const CString& sFilePath, int lengthHint /* = 0*/)
 169 {
 170         WCHAR exceptionError[1000] = {0};
 171         m_LineEndings = EOL_AUTOLINE;
 172         m_UnicodeType = CFileTextLines::AUTOTYPE;
 173         RemoveAll();
 174         if(lengthHint != 0)
 175         {
 176                 Reserve(lengthHint);
 177         }
 178
 179         if (PathIsDirectory(sFilePath))
 180         {
 181                 m_sErrorString.Format(IDS_ERR_FILE_NOTAFILE, (LPCTSTR)sFilePath);
 182                 return FALSE;
 183         }
 184
 185         if (!PathFileExists(sFilePath))
 186         {
 187                 //file does not exist, so just return SUCCESS
 188                 return TRUE;
 189         }
 190
 191         CAutoFile hFile = CreateFile(sFilePath, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_DELETE|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, NULL, NULL);
 192         if (!hFile)
 193         {
 194                 SetErrorString();
 195                 return FALSE;
 196         }
 197
 198         LARGE_INTEGER fsize;
 199         if (!GetFileSizeEx(hFile, &fsize))
 200         {
 201                 SetErrorString();
 202                 return FALSE;
 203         }
 204         if (fsize.HighPart)
 205         {
 206                 // file is way too big for us
 207                 m_sErrorString.LoadString(IDS_ERR_FILE_TOOBIG);
 208                 return FALSE;
 209         }
 210
 211         // create buffer
 212         // If new[] was done for type T delete[] must be called on a pointer of type T*,
 213         // otherwise the behavior is undefined.
 214         // +1 is to address possible truncation when integer division is done
 215         CBuffer oFile;
 216         try
 217         {
 218                 oFile.SetLength(fsize.LowPart);
 219         }
 220         catch (CMemoryException* e)
 221         {
 222                 e->GetErrorMessage(exceptionError, _countof(exceptionError));
 223                 m_sErrorString = exceptionError;
 224                 return FALSE;
 225         }
 226
 227         // load file
 228         DWORD dwReadBytes = 0;
 229         if (!ReadFile(hFile, (void *)oFile, fsize.LowPart, &dwReadBytes, NULL))
 230         {
 231                 SetErrorString();
 232                 return FALSE;
 233         }
 234         hFile.CloseHandle();
 235
 236         // detect type
 237         if (m_UnicodeType == CFileTextLines::AUTOTYPE)
 238         {
 239                 m_UnicodeType = this->CheckUnicodeType((LPVOID)oFile, dwReadBytes);
 240                 // enforce conversion for all but ASCII and UTF8 type
 241                 m_bNeedsConversion = (m_UnicodeType!=CFileTextLines::UTF8)&&(m_UnicodeType!=CFileTextLines::ASCII);
 242         }
 243
 244         // we may have to convert the file content - CString is UTF16LE
 245         try
 246         {
 247                 CBaseFilter * pFilter = NULL;
 248                 switch (m_UnicodeType)
 249                 {
 250                 case BINARY:
 251                         m_sErrorString.Format(IDS_ERR_FILE_BINARY, (LPCTSTR)sFilePath);
 252                         return FALSE;
 253                 case UTF8:
 254                 case UTF8BOM:
 255                         pFilter = new CUtf8Filter(NULL);
 256                         break;
 257                 default:
 258                 case ASCII:
 259                         pFilter = new CAsciiFilter(NULL);
 260                         break;
 261                 case UTF16_BE:
 262                         pFilter = new CUtf16beFilter(NULL);
 263                         break;
 264                 case UTF16_LE:
 265                         pFilter = new CUtf16leFilter(NULL);
 266                         break;
 267                 case UTF32_BE:
 268                         pFilter = new CUtf32beFilter(NULL);
 269                         break;
 270                 case UTF32_LE:
 271                         pFilter = new CUtf32leFilter(NULL);
 272                         break;
 273                 }
 274                 pFilter->Decode(oFile);
 275                 delete pFilter;
 276         }
 277         catch (CMemoryException* e)
 278         {
 279                 e->GetErrorMessage(exceptionError, _countof(exceptionError));
 280                 m_sErrorString = exceptionError;
 281                 return FALSE;
 282         }
 283
 284         int nReadChars=oFile.GetLength()/sizeof(wchar_t);
 285         wchar_t * pTextBuf = (wchar_t *)oFile;
 286         wchar_t * pLineStart = pTextBuf;
 287         if ((m_UnicodeType == UTF8BOM)
 288                 || (m_UnicodeType == UTF16_LE)
 289                 || (m_UnicodeType == UTF16_BE)
 290                 || (m_UnicodeType == UTF32_LE)
 291                 || (m_UnicodeType == UTF32_BE))
 292         {
 293                 // ignore the BOM
 294                 ++pTextBuf;
 295                 ++pLineStart;
 296                 --nReadChars;
 297         }
 298
 299         // fill in the lines into the array
 300         size_t countEOLs[EOL__COUNT];
 301         memset(countEOLs, 0, sizeof(countEOLs));
 302         CFileTextLine oTextLine;
 303         for (int i = nReadChars; i; --i)
 304         {
 305                 EOL eEol;
 306                 switch (*pTextBuf++)
 307                 {
 308                 case '\r':
 309                         // crlf line ending or cr line ending
 310                         eEol = ((i > 1) && *(pTextBuf) == '\n') ? EOL_CRLF : EOL_CR;
 311                         break;
 312                 case '\n':
 313                         // lfcr line ending or lf line ending
 314                         eEol = ((i > 1) && *(pTextBuf) == '\r') ? EOL_LFCR : EOL_LF;
 315                         break;
 316                 case 0x000b:
 317                         eEol = EOL_VT;
 318                         break;
 319                 case 0x000c:
 320                         eEol = EOL_FF;
 321                         break;
 322                 case 0x0085:
 323                         eEol = EOL_NEL;
 324                         break;
 325                 case 0x2028:
 326                         eEol = EOL_LS;
 327                         break;
 328                 case 0x2029:
 329                         eEol = EOL_PS;
 330                         break;
 331                 default:
 332                         continue;
 333                 }
 334                 oTextLine.sLine = CString(pLineStart, (int)(pTextBuf-pLineStart)-1);
 335                 oTextLine.eEnding = eEol;
 336                 Add(oTextLine);
 337                 ++countEOLs[eEol];
 338                 if (eEol==EOL_CRLF || eEol==EOL_LFCR)
 339                 {
 340                         ++pTextBuf;
 341                         --i;
 342                 }
 343                 pLineStart = pTextBuf;
 344         }
 345         CString line(pLineStart, (int)(pTextBuf-pLineStart));
 346         Add(line, EOL_NOENDING);
 347
 348         // some EOLs are not supported by the svn diff lib.
 349         m_bNeedsConversion |= (countEOLs[EOL_CRLF]!=0);
 350         m_bNeedsConversion |= (countEOLs[EOL_FF]!=0);
 351         m_bNeedsConversion |= (countEOLs[EOL_VT]!=0);
 352         m_bNeedsConversion |= (countEOLs[EOL_NEL]!=0);
 353         m_bNeedsConversion |= (countEOLs[EOL_LS]!=0);
 354         m_bNeedsConversion |= (countEOLs[EOL_PS]!=0);
 355
 356         size_t eolmax = 0;
 357         for (int nEol = 0; nEol<EOL__COUNT; nEol++)
 358         {
 359                 if (eolmax < countEOLs[nEol])
 360                 {
 361                         eolmax = countEOLs[nEol];
 362                         m_LineEndings = (EOL)nEol;
 363                 }
 364         }
 365
 366         return TRUE;
 367 }
 368
 369 void CFileTextLines::StripWhiteSpace(CString& sLine, DWORD dwIgnoreWhitespaces, bool blame)
 370 {
 371         if (blame)
 372         {
 373                 if (sLine.GetLength() > 66)
 374                         sLine = sLine.Mid(66);
 375         }
 376         switch (dwIgnoreWhitespaces)
 377         {
 378         case 0:
 379                 // Compare whitespaces
 380                 // do nothing
 381                 break;
 382         case 1:
 383                 // Ignore all whitespaces
 384                 sLine.TrimLeft(_T(" \t"));
 385                 sLine.TrimRight(_T(" \t"));
 386                 break;
 387         case 2:
 388                 // Ignore leading whitespace
 389                 sLine.TrimLeft(_T(" \t"));
 390                 break;
 391         case 3:
 392                 // Ignore ending whitespace
 393                 sLine.TrimRight(_T(" \t"));
 394                 break;
 395         }
 396 }
 397
 398 /**
 399         Encoding pattern:
 400                 - encode & save BOM
 401                 - Get Line
 402                 - modify line - whitespaces, lowercase
 403                 - encode & save line
 404                 - get cached encoded eol
 405                 - save eol
 406 */
 407 BOOL CFileTextLines::Save(const CString& sFilePath
 408                                                  , bool bSaveAsUTF8 /*= false*/
 409                                                  , bool bUseSVNCompatibleEOLs /*= false*/
 410                                                  , DWORD dwIgnoreWhitespaces /*=0*/
 411                                                  , BOOL bIgnoreCase /*= FALSE*/
 412                                                  , bool bBlame /*= false*/) const
 413 {
 414         try
 415         {
 416                 CString destPath = sFilePath;
 417                 // now make sure that the destination directory exists
 418                 int ind = 0;
 419                 while (destPath.Find('\\', ind)>=2)
 420                 {
 421                         if (!PathIsDirectory(destPath.Left(destPath.Find('\\', ind))))
 422                         {
 423                                 if (!CreateDirectory(destPath.Left(destPath.Find('\\', ind)), NULL))
 424                                         return FALSE;
 425                         }
 426                         ind = destPath.Find('\\', ind)+1;
 427                 }
 428
 429                 CStdioFile file;                        // Hugely faster than CFile for big file writes - because it uses buffering
 430                 if (!file.Open(sFilePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary))
 431                 {
 432                         const_cast<CString *>(&m_sErrorString)->Format(IDS_ERR_FILE_OPEN, (LPCTSTR)sFilePath);
 433                         return FALSE;
 434                 }
 435
 436                 CBaseFilter * pFilter = NULL;
 437                 bool bSaveBom = true;
 438                 CFileTextLines::UnicodeType eUnicodeType = bSaveAsUTF8 ? CFileTextLines::UTF8 : m_UnicodeType;
 439                 switch (eUnicodeType)
 440                 {
 441                 default:
 442                 case CFileTextLines::ASCII:
 443                         bSaveBom = false;
 444                         pFilter = new CAsciiFilter(&file);
 445                         break;
 446                 case CFileTextLines::UTF8:
 447                         bSaveBom = false;
 448                 case CFileTextLines::UTF8BOM:
 449                         pFilter = new CUtf8Filter(&file);
 450                         break;
 451                 case CFileTextLines::UTF16_BE:
 452                         pFilter = new CUtf16beFilter(&file);
 453                         break;
 454                 case CFileTextLines::UTF16_LE:
 455                         pFilter = new CUtf16leFilter(&file);
 456                         break;
 457                 case CFileTextLines::UTF32_BE:
 458                         pFilter = new CUtf32beFilter(&file);
 459                         break;
 460                 case CFileTextLines::UTF32_LE:
 461                         pFilter = new CUtf32leFilter(&file);
 462                         break;
 463                 }
 464
 465                 if (bSaveBom)
 466                 {
 467                         //first write the BOM
 468                         pFilter->Write(L"\xfeff");
 469                 }
 470                 // cache EOLs
 471                 CBuffer oEncodedEol[EOL__COUNT];
 472                 oEncodedEol[EOL_LF] = pFilter->Encode(_T("\n")); // x0a
 473                 oEncodedEol[EOL_CR] = pFilter->Encode(_T("\r")); // x0d
 474                 oEncodedEol[EOL_CRLF] = pFilter->Encode(_T("\r\n")); // x0d x0a
 475                 if (bUseSVNCompatibleEOLs)
 476                 {
 477                         // when using EOLs that are supported by the svn lib,
 478                         // we have to use the same EOLs as the file has in case
 479                         // they're already supported, but a different supported one
 480                         // in case the original one isn't supported.
 481                         // Only this way the option "ignore EOLs (recommended)" unchecked
 482                         // actually shows the lines as different.
 483                         // However, the diff won't find and differences in EOLs
 484                         // for these special EOLs if they differ between those special ones
 485                         // listed below.
 486                         // But it will work properly for the most common EOLs LF/CR/CRLF.
 487                         oEncodedEol[EOL_LFCR] = oEncodedEol[EOL_CR];
 488                         for (int nEol = 0; nEol<EOL_NOENDING; nEol++)
 489                         {
 490                                 if (oEncodedEol[nEol].IsEmpty())
 491                                         oEncodedEol[nEol] = oEncodedEol[EOL_LF];
 492                         }
 493                 }
 494                 else
 495                 {
 496                         oEncodedEol[EOL_LFCR] = pFilter->Encode(_T("\n\r"));
 497                         oEncodedEol[EOL_VT] = pFilter->Encode(_T("\v")); // x0b
 498                         oEncodedEol[EOL_FF] = pFilter->Encode(_T("\f")); // x0c
 499                         oEncodedEol[EOL_NEL] = pFilter->Encode(_T("\x85"));
 500                         oEncodedEol[EOL_LS] = pFilter->Encode(_T("\x2028"));
 501                         oEncodedEol[EOL_PS] = pFilter->Encode(_T("\x2029"));
 502                 }
 503                 oEncodedEol[EOL_AUTOLINE] = oEncodedEol[m_LineEndings==EOL_AUTOLINE ? EOL_CRLF : m_LineEndings];
 504
 505                 for (int i=0; i<GetCount(); i++)
 506                 {
 507                         CString sLineT = GetAt(i);
 508                         StripWhiteSpace(sLineT, dwIgnoreWhitespaces, bBlame);
 509                         if (bIgnoreCase)
 510                                 sLineT = sLineT.MakeLower();
 511                         pFilter->Write(sLineT);
 512                         EOL eEol = GetLineEnding(i);
 513                         pFilter->Write(oEncodedEol[eEol]);
 514                 }
 515                 delete pFilter;
 516                 file.Close();
 517         }
 518         catch (CException * e)
 519         {
 520                 CString * psErrorString = const_cast<CString *>(&m_sErrorString);
 521                 e->GetErrorMessage(psErrorString->GetBuffer(4096), 4096);
 522                 psErrorString->ReleaseBuffer();
 523                 e->Delete();
 524                 return FALSE;
 525         }
 526         return TRUE;
 527 }
 528
 529 void CFileTextLines::SetErrorString()
 530 {
 531         m_sErrorString = CFormatMessageWrapper();
 532 }
 533
 534 void CFileTextLines::CopySettings(CFileTextLines * pFileToCopySettingsTo)
 535 {
 536         if (pFileToCopySettingsTo)
 537         {
 538                 pFileToCopySettingsTo->m_UnicodeType = m_UnicodeType;
 539                 pFileToCopySettingsTo->m_LineEndings = m_LineEndings;
 540         }
 541 }
 542
 543
 544
 545 void CBuffer::ExpandToAtLeast(int nNewSize)
 546 {
 547         if (nNewSize>m_nAllocated)
 548         {
 549                 delete [] m_pBuffer; // we don't preserve buffer content intentionally
 550                 nNewSize+=2048-1;
 551                 nNewSize&=~(1024-1);
 552                 m_pBuffer=new BYTE[nNewSize];
 553                 m_nAllocated=nNewSize;
 554         }
 555 }
 556
 557 void CBuffer::SetLength(int nUsed)
 558 {
 559         ExpandToAtLeast(nUsed);
 560         m_nUsed = nUsed;
 561 }
 562
 563 void CBuffer::Swap(CBuffer & Src)
 564 {
 565         std::swap(Src.m_nAllocated, m_nAllocated);
 566         std::swap(Src.m_pBuffer, m_pBuffer);
 567         std::swap(Src.m_nUsed, m_nUsed);
 568 }
 569
 570 void CBuffer::Copy(const CBuffer & Src)
 571 {
 572         if (&Src != this)
 573         {
 574                 SetLength(Src.m_nUsed);
 575                 memcpy(m_pBuffer, Src.m_pBuffer, m_nUsed);
 576         }
 577 }
 578
 579
 580
 581 bool CBaseFilter::Decode(/*in out*/ CBuffer & data)
 582 {
 583         int nFlags = (m_nCodePage==CP_ACP) ? MB_PRECOMPOSED : 0;
 584         // dry decode is around 8 times faster then real one, alternatively we can set buffer to max length
 585         int nReadChars = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), NULL, 0);
 586         m_oBuffer.SetLength(nReadChars*sizeof(wchar_t));
 587         int ret2 = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), (LPWSTR)(void *)m_oBuffer, nReadChars);
 588         if (ret2 != nReadChars)
 589         {
 590                 return FALSE;
 591         }
 592         data.Swap(m_oBuffer);
 593         return TRUE;
 594 }
 595
 596 const CBuffer & CBaseFilter::Encode(const CString s)
 597 {
 598         m_oBuffer.SetLength(s.GetLength()*3+1); // set buffer to guessed max size
 599         int nConvertedLen = WideCharToMultiByte(m_nCodePage, 0, (LPCTSTR)s, s.GetLength(), (LPSTR)m_oBuffer, m_oBuffer.GetLength(), NULL, NULL);
 600         m_oBuffer.SetLength(nConvertedLen); // set buffer to used size
 601         return m_oBuffer;
 602 }
 603
 604
 605
 606 bool CUtf16leFilter::Decode(/*in out*/ CBuffer & /*data*/)
 607 {
 608         // we believe data is ok for use
 609         return TRUE;
 610 }
 611
 612 const CBuffer & CUtf16leFilter::Encode(const CString s)
 613 {
 614         int nNeedBytes = s.GetLength()*sizeof(TCHAR);
 615         m_oBuffer.SetLength(nNeedBytes);
 616         memcpy((void *)m_oBuffer, (LPCTSTR)s, nNeedBytes);
 617         return m_oBuffer;
 618 }
 619
 620
 621
 622 bool CUtf16beFilter::Decode(/*in out*/ CBuffer & data)
 623 {
 624         int nNeedBytes = data.GetLength();
 625         // make in place WORD BYTEs swap
 626         UINT64 * p_qw = (UINT64 *)(void *)data;
 627         int nQwords = nNeedBytes/8;
 628         for (int nQword = 0; nQword<nQwords; nQword++)
 629         {
 630                 p_qw[nQword] = WordSwapBytes(p_qw[nQword]);
 631         }
 632         wchar_t * p_w = (wchar_t *)p_qw;
 633         int nWords = nNeedBytes/2;
 634         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 635         {
 636                 p_w[nWord] = WideCharSwap(p_w[nWord]);
 637         }
 638         return CUtf16leFilter::Decode(data);
 639 }
 640
 641 const CBuffer & CUtf16beFilter::Encode(const CString s)
 642 {
 643         int nNeedBytes = s.GetLength()*sizeof(TCHAR);
 644         m_oBuffer.SetLength(nNeedBytes);
 645         // copy swaping BYTE order in WORDs
 646         const UINT64 * p_qwIn = (const UINT64 *)(LPCTSTR)s;
 647         UINT64 * p_qwOut = (UINT64 *)(void *)m_oBuffer;
 648         int nQwords = nNeedBytes/8;
 649         for (int nQword = 0; nQword<nQwords; nQword++)
 650         {
 651                 p_qwOut[nQword] = WordSwapBytes(p_qwIn[nQword]);
 652         }
 653         wchar_t * p_wIn = (wchar_t *)p_qwIn;
 654         wchar_t * p_wOut = (wchar_t *)p_qwOut;
 655         int nWords = nNeedBytes/2;
 656         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 657         {
 658                 p_wOut[nWord] = WideCharSwap(p_wIn[nWord]);
 659         }
 660         return m_oBuffer;
 661 }
 662
 663
 664
 665 bool CUtf32leFilter::Decode(/*in out*/ CBuffer & data)
 666 {
 667         // UTF32 have four bytes per char
 668         int nReadChars = data.GetLength()/4;
 669         UINT32 * p32 = (UINT32 *)(void *)data;
 670
 671         // count chars which needs surrogate pair
 672         int nSurrogatePairCount = 0;
 673         for (int i = 0; i<nReadChars; ++i)
 674         {
 675                 if (p32[i]<0x110000 && p32[i]>=0x10000)
 676                 {
 677                         ++nSurrogatePairCount;
 678                 }
 679         }
 680
 681         // fill buffer
 682         m_oBuffer.SetLength((nReadChars+nSurrogatePairCount)*sizeof(wchar_t));
 683         wchar_t * pOut = (wchar_t *)m_oBuffer;
 684         for (int i = 0; i<nReadChars; ++i, ++pOut)
 685         {
 686                 UINT32 zChar = p32[i];
 687                 if (zChar>=0x110000)
 688                 {
 689                         *pOut=0xfffd; // ? mark
 690                 }
 691                 else if (zChar>=0x10000)
 692                 {
 693                         zChar-=0x10000;
 694                         pOut[0] = ((zChar>>10)&0x3ff) | 0xd800; // lead surrogate
 695                         pOut[1] = (zChar&0x7ff) | 0xdc00; // trail surrogate
 696                         pOut++;
 697                 }
 698                 else
 699                 {
 700                         *pOut = (wchar_t)zChar;
 701                 }
 702         }
 703         data.Swap(m_oBuffer);
 704         return TRUE;
 705 }
 706
 707 const CBuffer & CUtf32leFilter::Encode(const CString s)
 708 {
 709         int nInWords = s.GetLength();
 710         m_oBuffer.SetLength(nInWords*2);
 711
 712         LPCTSTR p_In = (LPCTSTR)s;
 713         UINT32 * p_Out = (UINT32 *)(void *)m_oBuffer;
 714         int nOutDword = 0;
 715         for (int nInWord = 0; nInWord<nInWords; nInWord++, nOutDword++)
 716         {
 717                 UINT32 zChar = p_In[nInWord];
 718                 if ((zChar&0xfc00) == 0xd800) // lead surrogate
 719                 {
 720                         if (nInWord+1<nInWords && (p_In[nInWord+1]&0xfc00) == 0xdc00) // trail surrogate follows
 721                         {
 722                                 zChar = 0x10000 + ((zChar&0x3ff)<<10) + (p_In[++nInWord]&0x3ff);
 723                         }
 724                         else
 725                         {
 726                                 zChar = 0xfffd; // ? mark
 727                         }
 728                 }
 729                 else if ((zChar&0xfc00) == 0xdc00) // trail surrogate without lead
 730                 {
 731                         zChar = 0xfffd; // ? mark
 732                 }
 733                 p_Out[nOutDword] = zChar;
 734         }
 735         m_oBuffer.SetLength(nOutDword*4); // store length reduced by surrogates
 736         return m_oBuffer;
 737 }
 738
 739
 740
 741 bool CUtf32beFilter::Decode(/*in out*/ CBuffer & data)
 742 {
 743
 744         // swap BYTEs order in DWORDs
 745         UINT64 * p64 = (UINT64 *)(void *)data;
 746         int nQwords = data.GetLength()/8;
 747         for (int nQword = 0; nQword<nQwords; nQword++)
 748         {
 749                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 750         }
 751
 752         UINT32 * p32 = (UINT32 *)p64;
 753         int nDwords = data.GetLength()/4;
 754         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 755         {
 756                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 757         }
 758         return CUtf32leFilter::Decode(data);
 759 }
 760
 761 const CBuffer & CUtf32beFilter::Encode(const CString s)
 762 {
 763         CUtf32leFilter::Encode(s);
 764
 765         // swap BYTEs order in DWORDs
 766         UINT64 * p64 = (UINT64 *)(void *)m_oBuffer;
 767         int nQwords = m_oBuffer.GetLength()/8;
 768         for (int nQword = 0; nQword<nQwords; nQword++)
 769         {
 770                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 771         }
 772
 773         UINT32 * p32 = (UINT32 *)p64;
 774         int nDwords = m_oBuffer.GetLength()/4;
 775         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 776         {
 777                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 778         }
 779         return m_oBuffer;
 780 }
 781