ext/scintilla/src/Document.cxx

   1 // Scintilla source code edit control
   2 /** @file Document.cxx
   3  ** Text document that handles notifications, DBCS, styling, words and end of line.
   4  **/
   5 // Copyright 1998-2011 by Neil Hodgson <neilh@scintilla.org>
   6 // The License.txt file describes the conditions under which this software may be distributed.
   7
   8 #include <stdlib.h>
   9 #include <string.h>
  10 #include <stdio.h>
  11 #include <assert.h>
  12 #include <ctype.h>
  13
  14 #include <stdexcept>
  15 #include <string>
  16 #include <vector>
  17 #include <algorithm>
  18
  19 #ifdef CXX11_REGEX
  20 #include <regex>
  21 #endif
  22
  23 #include "Platform.h"
  24
  25 #include "ILexer.h"
  26 #include "Scintilla.h"
  27
  28 #include "CharacterSet.h"
  29 #include "SplitVector.h"
  30 #include "Partitioning.h"
  31 #include "RunStyles.h"
  32 #include "CellBuffer.h"
  33 #include "PerLine.h"
  34 #include "CharClassify.h"
  35 #include "Decoration.h"
  36 #include "CaseFolder.h"
  37 #include "Document.h"
  38 #include "RESearch.h"
  39 #include "UniConversion.h"
  40
  41 #ifdef SCI_NAMESPACE
  42 using namespace Scintilla;
  43 #endif
  44
  45 static inline bool IsPunctuation(char ch) {
             // True only when both IsASCII(ch) and ispunct(ch) hold. IsASCII() is tested
             // first, so ispunct() is never reached for a value that IsASCII() rejects.
  46         return IsASCII(ch) && ispunct(ch);
  47 }
  48
  49 void LexInterface::Colourise(int start, int end) {
             // Runs the attached lexer (Lex) and then its folder (Fold) over the range
             // starting at start and running up to end. An end of -1 means "to the end of
             // the document". Does nothing when there is no document, no lexer instance,
             // or a styling pass is already in progress.
  50         if (pdoc && instance && !performingStyle) {
  51                 // Protect against reentrance, which may occur, for example, when
  52                 // fold points are discovered while performing styling and the folding
  53                 // code looks for child lines which may trigger styling.
  54                 performingStyle = true;
  55
  56                 int lengthDoc = pdoc->Length();
  57                 if (end == -1)
  58                         end = lengthDoc;
  59                 int len = end - start;
  60
  61                 PLATFORM_ASSERT(len >= 0);
  62                 PLATFORM_ASSERT(start + len <= lengthDoc);
  63
                     // The initial style handed to the lexer is the style of the byte just
                     // before start, or 0 when the range begins at the document start.
  64                 int styleStart = 0;
  65                 if (start > 0)
  66                         styleStart = pdoc->StyleAt(start - 1);
  67
                     // An empty range is skipped entirely.
  68                 if (len > 0) {
  69                         instance->Lex(start, len, styleStart, pdoc);
  70                         instance->Fold(start, len, styleStart, pdoc);
  71                 }
  72
                     // NOTE(review): the flag is only cleared on the normal path; if Lex or
                     // Fold can throw, performingStyle would stay true - confirm.
  73                 performingStyle = false;
  74         }
  75 }
  76
  77 int LexInterface::LineEndTypesSupported() {
             // Returns the line end types reported by the attached lexer, or 0 when there
             // is no lexer or its interface version is older than lvSubStyles.
  78         if (instance) {
  79                 int interfaceVersion = instance->Version();
  80                 if (interfaceVersion >= lvSubStyles) {
                             // NOTE(review): the downcast assumes that every lexer reporting
                             // lvSubStyles or later really is an ILexerWithSubStyles - confirm.
  81                         ILexerWithSubStyles *ssinstance = static_cast<ILexerWithSubStyles *>(instance);
  82                         return ssinstance->LineEndTypesSupported();
  83                 }
  84         }
  85         return 0;
  86 }
  87
  88 Document::Document() {
             // Gives every member its starting value and allocates the per-line data
             // stores. The reference count starts at 0, so the creator is expected to call
             // AddRef() - presumably; confirm against callers.
  89         refCount = 0;
  90         pcf = NULL;
             // The default end of line mode depends on the build platform: CR+LF when
             // _WIN32 is defined, LF otherwise.
  91 #ifdef _WIN32
  92         eolMode = SC_EOL_CRLF;
  93 #else
  94         eolMode = SC_EOL_LF;
  95 #endif
             // A code page of 0 selects the single byte paths in the functions below.
  96         dbcsCodePage = 0;
  97         lineEndBitSet = SC_LINE_END_TYPE_DEFAULT;
  98         endStyled = 0;
  99         styleClock = 0;
 100         enteredModification = 0;
 101         enteredStyling = 0;
 102         enteredReadOnlyCount = 0;
 103         insertionSet = false;
 104         tabInChars = 8;
 105         indentInChars = 0;
 106         actualIndentInChars = 8;
 107         useTabs = true;
 108         tabIndents = true;
 109         backspaceUnindents = false;
 110
 111         matchesValid = false;
 112         regex = 0;
 113
             // Presumably fills the UTF8BytesOfLead table that LenChar and InGoodUTF8
             // index by lead byte - confirm in UniConversion.
 114         UTF8BytesOfLeadInitialise();
 115
             // One heap allocated store per ld* slot; the destructor deletes them. The
             // margin and annotation slots both use LineAnnotation.
 116         perLineData[ldMarkers] = new LineMarkers();
 117         perLineData[ldLevels] = new LineLevels();
 118         perLineData[ldState] = new LineState();
 119         perLineData[ldMargin] = new LineAnnotation();
 120         perLineData[ldAnnotation] = new LineAnnotation();
 121
             // Presumably registers this document so that the cell buffer reports line
             // insertions and removals through InsertLine/RemoveLine - confirm.
 122         cb.SetPerLine(this);
 123
             // No lexer interface is attached initially.
 124         pli = 0;
 125 }
 126
 127 Document::~Document() {
             // Tells every registered watcher that the document is being deleted, then
             // frees the objects the document owns. Each pointer is zeroed after deletion.
 128         for (std::vector<WatcherWithUserData>::iterator it = watchers.begin(); it != watchers.end(); ++it) {
 129                 it->watcher->NotifyDeleted(this, it->userData);
 130         }
 131         for (int j=0; j<ldSize; j++) {
 132                 delete perLineData[j];
 133                 perLineData[j] = 0;
 134         }
 135         delete regex;
 136         regex = 0;
 137         delete pli;
 138         pli = 0;
 139         delete pcf;
 140         pcf = 0;
 141 }
 142
 143 void Document::Init() {
             // Calls Init() on every per-line data store that is currently allocated.
 144         for (int j=0; j<ldSize; j++) {
 145                 if (perLineData[j])
 146                         perLineData[j]->Init();
 147         }
 148 }
 149
 150 int Document::LineEndTypesSupported() const {
             // The lexer interface is only consulted when the code page is UTF-8 and a
             // lexer interface (pli) is attached; in every other case the result is 0.
 151         if ((SC_CP_UTF8 == dbcsCodePage) && pli)
 152                 return pli->LineEndTypesSupported();
 153         else
 154                 return 0;
 155 }
 156
 157 bool Document::SetDBCSCodePage(int dbcsCodePage_) {
             // Stores a new code page. Returns true when the value changed and false when
             // it was already set to dbcsCodePage_.
 158         if (dbcsCodePage != dbcsCodePage_) {
 159                 dbcsCodePage = dbcsCodePage_;
                     // The case folder is reset, and the active line end types are recomputed
                     // because LineEndTypesSupported() depends on the code page.
 160                 SetCaseFolder(NULL);
 161                 cb.SetLineEndTypes(lineEndBitSet & LineEndTypesSupported());
 162                 return true;
 163         } else {
 164                 return false;
 165         }
 166 }
 167
 168 bool Document::SetLineEndTypesAllowed(int lineEndBitSet_) {
             // Records the set of line end types the caller allows. Returns true only when
             // this changes the set that is active in the cell buffer, where active means
             // allowed AND supported. The allowed set itself is stored even when the
             // function returns false.
 169         if (lineEndBitSet != lineEndBitSet_) {
 170                 lineEndBitSet = lineEndBitSet_;
 171                 int lineEndBitSetActive = lineEndBitSet & LineEndTypesSupported();
 172                 if (lineEndBitSetActive != cb.GetLineEndTypes()) {
                             // ModifiedAt(0) is called before the cell buffer is switched over.
 173                         ModifiedAt(0);
 174                         cb.SetLineEndTypes(lineEndBitSetActive);
 175                         return true;
 176                 } else {
 177                         return false;
 178                 }
 179         } else {
 180                 return false;
 181         }
 182 }
 183
 184 void Document::InsertLine(int line) {
             // Forwards the insertion of a line to every allocated per-line data store.
 185         for (int j=0; j<ldSize; j++) {
 186                 if (perLineData[j])
 187                         perLineData[j]->InsertLine(line);
 188         }
 189 }
 190
 191 void Document::RemoveLine(int line) {
             // Forwards the removal of a line to every allocated per-line data store.
 192         for (int j=0; j<ldSize; j++) {
 193                 if (perLineData[j])
 194                         perLineData[j]->RemoveLine(line);
 195         }
 196 }
 197
 198 // Increase reference count and return its previous value.
 199 int Document::AddRef() {
             // Post-increment: the returned value is the count before this call.
 200         return refCount++;
 201 }
 202
 203 // Decrease reference count and return its new value.
 204 // Delete the document if reference count reaches zero.
 205 int SCI_METHOD Document::Release() {
             // The decremented count is copied into a local first because no member may
             // be read after "delete this".
 206         int curRefCount = --refCount;
 207         if (curRefCount == 0)
 208                 delete this;
 209         return curRefCount;
 210 }
 211
 212 void Document::SetSavePoint() {
             // Marks the current state in the cell buffer as the save point and then
             // calls NotifySavePoint(true).
 213         cb.SetSavePoint();
 214         NotifySavePoint(true);
 215 }
 216
 217 void Document::TentativeUndo() {
             // Undoes the number of steps reported by cb.TentativeSteps(), sending a
             // "before" and an "after" modification notification for each step, then
             // calls cb.TentativeCommit(). The work is skipped when a modification is
             // already in progress (enteredModification != 0) or the buffer is read-only.
 218         CheckReadOnly();
 219         if (enteredModification == 0) {
 220                 enteredModification++;
 221                 if (!cb.IsReadOnly()) {
 222                         bool startSavePoint = cb.IsSavePoint();
 223                         bool multiLine = false;
 224                         int steps = cb.TentativeSteps();
 225                         //Platform::DebugPrintf("Steps=%d\n", steps);
 226                         for (int step = 0; step < steps; step++) {
 227                                 const int prevLinesTotal = LinesTotal();
                                     // NOTE(review): action is a reference that is still read after
                                     // PerformUndoStep() below - assumes it stays valid; confirm.
 228                                 const Action &action = cb.GetUndoStep();
                                     // "Before" notification: undoing a removal re-inserts text,
                                     // a container action only carries its token, and anything
                                     // else is announced as a pending deletion.
 229                                 if (action.at == removeAction) {
 230                                         NotifyModified(DocModification(
 231                                                                         SC_MOD_BEFOREINSERT | SC_PERFORMED_UNDO, action));
 232                                 } else if (action.at == containerAction) {
 233                                         DocModification dm(SC_MOD_CONTAINER | SC_PERFORMED_UNDO);
 234                                         dm.token = action.position;
 235                                         NotifyModified(dm);
 236                                 } else {
 237                                         NotifyModified(DocModification(
 238                                                                         SC_MOD_BEFOREDELETE | SC_PERFORMED_UNDO, action));
 239                                 }
 240                                 cb.PerformUndoStep();
 241                                 if (action.at != containerAction) {
 242                                         ModifiedAt(action.position);
 243                                 }
 244
                                     // "After" notification flags.
 245                                 int modFlags = SC_PERFORMED_UNDO;
 246                                 // With undo, an insertion action becomes a deletion notification
 247                                 if (action.at == removeAction) {
 248                                         modFlags |= SC_MOD_INSERTTEXT;
 249                                 } else if (action.at == insertAction) {
 250                                         modFlags |= SC_MOD_DELETETEXT;
 251                                 }
 252                                 if (steps > 1)
 253                                         modFlags |= SC_MULTISTEPUNDOREDO;
                                     // multiLine is sticky across steps: once any step changes the
                                     // line count, the final step is flagged as multi-line.
 254                                 const int linesAdded = LinesTotal() - prevLinesTotal;
 255                                 if (linesAdded != 0)
 256                                         multiLine = true;
 257                                 if (step == steps - 1) {
 258                                         modFlags |= SC_LASTSTEPINUNDOREDO;
 259                                         if (multiLine)
 260                                                 modFlags |= SC_MULTILINEUNDOREDO;
 261                                 }
 262                                 NotifyModified(DocModification(modFlags, action.position, action.lenData,
 263                                                                                            linesAdded, action.data));
 264                         }
 265
                             // Watchers are only told about the save point when its state changed
                             // across the whole undo sequence.
 266                         bool endSavePoint = cb.IsSavePoint();
 267                         if (startSavePoint != endSavePoint)
 268                                 NotifySavePoint(endSavePoint);
 269
 270                         cb.TentativeCommit();
 271                 }
 272                 enteredModification--;
 273         }
 274 }
 275
 276 int Document::GetMark(int line) {
             // Returns LineMarkers::MarkValue(line) from the marker store - presumably a
             // bit set with one bit per marker number on the line; confirm in PerLine.
 277         return static_cast<LineMarkers *>(perLineData[ldMarkers])->MarkValue(line);
 278 }
 279
 280 int Document::MarkerNext(int lineStart, int mask) const {
             // Thin wrapper: forwards lineStart and mask to LineMarkers::MarkerNext and
             // returns its result unchanged.
 281         return static_cast<LineMarkers *>(perLineData[ldMarkers])->MarkerNext(lineStart, mask);
 282 }
 283
 284 int Document::AddMark(int line, int markerNum) {
             // Adds marker markerNum to line and sends an SC_MOD_CHANGEMARKER
             // notification for that line. Returns whatever LineMarkers::AddMark returns,
             // or 0 when line is outside the accepted range. Note that the accepted range
             // is 0 to LinesTotal() inclusive.
 285         if (line >= 0 && line <= LinesTotal()) {
 286                 int prev = static_cast<LineMarkers *>(perLineData[ldMarkers])->
 287                         AddMark(line, markerNum, LinesTotal());
 288                 DocModification mh(SC_MOD_CHANGEMARKER, LineStart(line), 0, 0, 0, line);
 289                 NotifyModified(mh);
 290                 return prev;
 291         } else {
 292                 return 0;
 293         }
 294 }
 295
 296 void Document::AddMarkSet(int line, int valueSet) {
             // Adds one marker per set bit of valueSet: bit i adds marker number i. A
             // single SC_MOD_CHANGEMARKER notification is sent afterwards. Lines outside
             // 0 to LinesTotal() inclusive are ignored.
 297         if (line < 0 || line > LinesTotal()) {
 298                 return;
 299         }
             // The mask is shifted as an unsigned value so the loop ends once no set
             // bits remain.
 300         unsigned int m = valueSet;
 301         for (int i = 0; m; i++, m >>= 1)
 302                 if (m & 1)
 303                         static_cast<LineMarkers *>(perLineData[ldMarkers])->
 304                                 AddMark(line, i, LinesTotal());
 305         DocModification mh(SC_MOD_CHANGEMARKER, LineStart(line), 0, 0, 0, line);
 306         NotifyModified(mh);
 307 }
 308
 309 void Document::DeleteMark(int line, int markerNum) {
             // Removes marker markerNum from line and always sends an SC_MOD_CHANGEMARKER
             // notification for that line.
             // NOTE(review): unlike AddMark there is no range check on line here;
             // presumably LineMarkers::DeleteMark tolerates out-of-range lines - confirm.
 310         static_cast<LineMarkers *>(perLineData[ldMarkers])->DeleteMark(line, markerNum, false);
 311         DocModification mh(SC_MOD_CHANGEMARKER, LineStart(line), 0, 0, 0, line);
 312         NotifyModified(mh);
 313 }
 314
 315 void Document::DeleteMarkFromHandle(int markerHandle) {
             // Removes the marker identified by markerHandle. The notification carries
             // line -1 because the affected line is not looked up here.
 316         static_cast<LineMarkers *>(perLineData[ldMarkers])->DeleteMarkFromHandle(markerHandle);
 317         DocModification mh(SC_MOD_CHANGEMARKER, 0, 0, 0, 0);
 318         mh.line = -1;
 319         NotifyModified(mh);
 320 }
 321
 322 void Document::DeleteAllMarks(int markerNum) {
             // Asks the marker store to delete markerNum on every line. One
             // SC_MOD_CHANGEMARKER notification with line -1 is sent, and only when at
             // least one DeleteMark call reported a change.
 323         bool someChanges = false;
 324         for (int line = 0; line < LinesTotal(); line++) {
 325                 if (static_cast<LineMarkers *>(perLineData[ldMarkers])->DeleteMark(line, markerNum, true))
 326                         someChanges = true;
 327         }
 328         if (someChanges) {
 329                 DocModification mh(SC_MOD_CHANGEMARKER, 0, 0, 0, 0);
 330                 mh.line = -1;
 331                 NotifyModified(mh);
 332         }
 333 }
 334
 335 int Document::LineFromHandle(int markerHandle) {
             // Thin wrapper: returns LineMarkers::LineFromHandle(markerHandle) unchanged.
 336         return static_cast<LineMarkers *>(perLineData[ldMarkers])->LineFromHandle(markerHandle);
 337 }
 338
 339 int SCI_METHOD Document::LineStart(int line) const {
             // Position of the first byte of line, as reported by the cell buffer.
 340         return cb.LineStart(line);
 341 }
 342
 343 bool Document::IsLineStartPosition(int position) const {
             // True when position equals the start of the line that contains it.
 344         return LineStart(LineFromPosition(position)) == position;
 345 }
 346
 347 int SCI_METHOD Document::LineEnd(int line) const {
             // Returns the position just after the last character of line that is not
             // part of the line terminator.
 348         if (line >= LinesTotal() - 1) {
                     // Last line (or beyond): no terminator is stripped; the result is the
                     // start of the following line index - presumably the document length.
 349                 return LineStart(line + 1);
 350         } else {
 351                 int position = LineStart(line + 1);
 352                 if (SC_CP_UTF8 == dbcsCodePage) {
                             // In UTF-8 the last three bytes of the line are examined for the
                             // multi-byte terminators recognised by UTF8IsSeparator (all three
                             // bytes) and UTF8IsNEL (the last two bytes).
                             // NOTE(review): position-3 can lie before the start of the line, or
                             // before 0; relies on cb.CharAt tolerating that - confirm.
 353                         unsigned char bytes[] = {
 354                                 static_cast<unsigned char>(cb.CharAt(position-3)),
 355                                 static_cast<unsigned char>(cb.CharAt(position-2)),
 356                                 static_cast<unsigned char>(cb.CharAt(position-1)),
 357                         };
 358                         if (UTF8IsSeparator(bytes)) {
 359                                 return position - UTF8SeparatorLength;
 360                         }
 361                         if (UTF8IsNEL(bytes+1)) {
 362                                 return position - UTF8NELLength;
 363                         }
 364                 }
 365                 position--; // Back over CR or LF
 366                 // When line terminator is CR+LF, may need to go back one more
 367                 if ((position > LineStart(line)) && (cb.CharAt(position - 1) == '\r')) {
 368                         position--;
 369                 }
 370                 return position;
 371         }
 372 }
 373
 374 void SCI_METHOD Document::SetErrorStatus(int status) {
             // The status is not stored in the document; it is only passed on.
 375         // Tell the watchers an error has occurred.
 376         for (std::vector<WatcherWithUserData>::iterator it = watchers.begin(); it != watchers.end(); ++it) {
 377                 it->watcher->NotifyErrorOccurred(this, it->userData, status);
 378         }
 379 }
 380
 381 int SCI_METHOD Document::LineFromPosition(int pos) const {
             // Line that contains pos, as reported by the cell buffer.
 382         return cb.LineFromPosition(pos);
 383 }
 384
 385 int Document::LineEndPosition(int position) const {
             // LineEnd() of the line that contains position.
 386         return LineEnd(LineFromPosition(position));
 387 }
 388
 389 bool Document::IsLineEndPosition(int position) const {
             // True when position is exactly the LineEnd() of its own line.
 390         return LineEnd(LineFromPosition(position)) == position;
 391 }
 392
 393 bool Document::IsPositionInLineEnd(int position) const {
             // True when position is at or after the LineEnd() of its own line, that is,
             // at the end of the text or inside the line terminator.
 394         return position >= LineEnd(LineFromPosition(position));
 395 }
 396
 397 int Document::VCHomePosition(int position) const {
             // Returns the position of the first character on the line that is not a
             // space or tab. When position is already there, returns the start of the
             // line instead, so repeated calls alternate between the two.
 398         int line = LineFromPosition(position);
 399         int startPosition = LineStart(line);
 400         int endLine = LineEnd(line);
 401         int startText = startPosition;
             // Skip leading spaces and tabs without running past the end of the line.
 402         while (startText < endLine && (cb.CharAt(startText) == ' ' || cb.CharAt(startText) == '\t'))
 403                 startText++;
 404         if (position == startText)
 405                 return startPosition;
 406         else
 407                 return startText;
 408 }
 409
 410 int SCI_METHOD Document::SetLevel(int line, int level) {
             // Stores a fold level for line and returns the value LineLevels::SetLevel
             // reports as the previous level. Watchers are only notified when the level
             // changed; the notification carries the new and the previous level.
 411         int prev = static_cast<LineLevels *>(perLineData[ldLevels])->SetLevel(line, level, LinesTotal());
 412         if (prev != level) {
 413                 DocModification mh(SC_MOD_CHANGEFOLD | SC_MOD_CHANGEMARKER,
 414                                    LineStart(line), 0, 0, 0, line);
 415                 mh.foldLevelNow = level;
 416                 mh.foldLevelPrev = prev;
 417                 NotifyModified(mh);
 418         }
 419         return prev;
 420 }
 421
 422 int SCI_METHOD Document::GetLevel(int line) const {
             // Fold level stored for line, flags included, as returned by LineLevels.
 423         return static_cast<LineLevels *>(perLineData[ldLevels])->GetLevel(line);
 424 }
 425
 426 void Document::ClearLevels() {
             // Forwards to LineLevels::ClearLevels(); no notification is sent from here.
 427         static_cast<LineLevels *>(perLineData[ldLevels])->ClearLevels();
 428 }
 429
 430 static bool IsSubordinate(int levelStart, int levelTry) {
             // Decides whether a line with level levelTry belongs under a line with level
             // levelStart. Lines flagged SC_FOLDLEVELWHITEFLAG always count as
             // subordinate; otherwise the level number of levelTry must be strictly
             // greater than that of levelStart.
 431         if (levelTry & SC_FOLDLEVELWHITEFLAG)
 432                 return true;
 433         else
 434                 return (levelStart & SC_FOLDLEVELNUMBERMASK) < (levelTry & SC_FOLDLEVELNUMBERMASK);
 435 }
 436
 437 int Document::GetLastChild(int lineParent, int level, int lastLine) {
             // Returns the last line that is subordinate to lineParent, or lineParent
             // itself when it has no children. A level of -1 means "use the level number
             // of lineParent". A lastLine other than -1 limits how far the search runs.
 438         if (level == -1)
 439                 level = GetLevel(lineParent) & SC_FOLDLEVELNUMBERMASK;
 440         int maxLine = LinesTotal();
 441         int lookLastLine = (lastLine != -1) ? Platform::Minimum(LinesTotal() - 1, lastLine) : -1;
 442         int lineMaxSubord = lineParent;
 443         while (lineMaxSubord < maxLine - 1) {
                     // Styling is requested up to the start of the line after the candidate,
                     // presumably so that the candidate's fold level is up to date - confirm.
 444                 EnsureStyledTo(LineStart(lineMaxSubord + 2));
 445                 if (!IsSubordinate(level, GetLevel(lineMaxSubord + 1)))
 446                         break;
                     // With a limit set, stop once it has been reached, unless the current
                     // line is flagged as whitespace.
 447                 if ((lookLastLine != -1) && (lineMaxSubord >= lookLastLine) && !(GetLevel(lineMaxSubord) & SC_FOLDLEVELWHITEFLAG))
 448                         break;
 449                 lineMaxSubord++;
 450         }
 451         if (lineMaxSubord > lineParent) {
 452                 if (level > (GetLevel(lineMaxSubord + 1) & SC_FOLDLEVELNUMBERMASK)) {
 453                         // Have chewed up some whitespace that belongs to a parent so seek back
                             // Only a single whitespace line is given back here.
 454                         if (GetLevel(lineMaxSubord) & SC_FOLDLEVELWHITEFLAG) {
 455                                 lineMaxSubord--;
 456                         }
 457                 }
 458         }
 459         return lineMaxSubord;
 460 }
 461
 462 int Document::GetFoldParent(int line) const {
             // Searches upwards from the line before line for the nearest fold header
             // whose level number is lower than that of line. Returns that line, or -1
             // when there is none.
 463         int level = GetLevel(line) & SC_FOLDLEVELNUMBERMASK;
 464         int lineLook = line - 1;
             // Keep moving up while the candidate is not a header or is not shallower.
             // The loop never goes above line 0.
 465         while ((lineLook > 0) && (
 466                     (!(GetLevel(lineLook) & SC_FOLDLEVELHEADERFLAG)) ||
 467                     ((GetLevel(lineLook) & SC_FOLDLEVELNUMBERMASK) >= level))
 468               ) {
 469                 lineLook--;
 470         }
             // The loop can also stop because the top was reached, so the candidate is
             // tested again.
             // NOTE(review): when line is 0, lineLook is -1 here; relies on GetLevel
             // tolerating -1 - confirm.
 471         if ((GetLevel(lineLook) & SC_FOLDLEVELHEADERFLAG) &&
 472                 ((GetLevel(lineLook) & SC_FOLDLEVELNUMBERMASK) < level)) {
 473                 return lineLook;
 474         } else {
 475                 return -1;
 476         }
 477 }
 478
 479 void Document::GetHighlightDelimiters(HighlightDelimiter &highlightDelimiter, int line, int lastLine) {
             // Computes the fold block around line and writes four line numbers into
             // highlightDelimiter: beginFoldBlock, endFoldBlock,
             // firstChangeableLineBefore and firstChangeableLineAfter. When no enclosing
             // fold block is found, highlightDelimiter is cleared instead.
 480         int level = GetLevel(line);
 481         int lookLastLine = Platform::Maximum(line, lastLine) + 1;
 482
             // Step 1: move upwards past whitespace-flagged lines, and past headers whose
             // level number is not lower than that of the line following them.
 483         int lookLine = line;
 484         int lookLineLevel = level;
 485         int lookLineLevelNum = lookLineLevel & SC_FOLDLEVELNUMBERMASK;
 486         while ((lookLine > 0) && ((lookLineLevel & SC_FOLDLEVELWHITEFLAG) ||
 487                 ((lookLineLevel & SC_FOLDLEVELHEADERFLAG) && (lookLineLevelNum >= (GetLevel(lookLine + 1) & SC_FOLDLEVELNUMBERMASK))))) {
 488                 lookLineLevel = GetLevel(--lookLine);
 489                 lookLineLevelNum = lookLineLevel & SC_FOLDLEVELNUMBERMASK;
 490         }
 491
             // Step 2: the block starts at that line when it is a header, otherwise at
             // its fold parent. Without either there is nothing to highlight.
 492         int beginFoldBlock = (lookLineLevel & SC_FOLDLEVELHEADERFLAG) ? lookLine : GetFoldParent(lookLine);
 493         if (beginFoldBlock == -1) {
 494                 highlightDelimiter.Clear();
 495                 return;
 496         }
 497
 498         int endFoldBlock = GetLastChild(beginFoldBlock, -1, lookLastLine);
 499         int firstChangeableLineBefore = -1;
             // Step 3: when the block found ends before line, look further up for a
             // header whose last child is exactly line and use that block instead.
 500         if (endFoldBlock < line) {
 501                 lookLine = beginFoldBlock - 1;
 502                 lookLineLevel = GetLevel(lookLine);
 503                 lookLineLevelNum = lookLineLevel & SC_FOLDLEVELNUMBERMASK;
 504                 while ((lookLine >= 0) && (lookLineLevelNum >= SC_FOLDLEVELBASE)) {
 505                         if (lookLineLevel & SC_FOLDLEVELHEADERFLAG) {
 506                                 if (GetLastChild(lookLine, -1, lookLastLine) == line) {
 507                                         beginFoldBlock = lookLine;
 508                                         endFoldBlock = line;
 509                                         firstChangeableLineBefore = line - 1;
 510                                 }
 511                         }
 512                         if ((lookLine > 0) && (lookLineLevelNum == SC_FOLDLEVELBASE) && ((GetLevel(lookLine - 1) & SC_FOLDLEVELNUMBERMASK) > lookLineLevelNum))
 513                                 break;
 514                         lookLineLevel = GetLevel(--lookLine);
 515                         lookLineLevelNum = lookLineLevel & SC_FOLDLEVELNUMBERMASK;
 516                 }
 517         }
             // Step 4: scan from line - 1 up to the block start for the nearest line that
             // is whitespace-flagged or has a higher level number than line.
 518         if (firstChangeableLineBefore == -1) {
 519                 for (lookLine = line - 1, lookLineLevel = GetLevel(lookLine), lookLineLevelNum = lookLineLevel & SC_FOLDLEVELNUMBERMASK;
 520                         lookLine >= beginFoldBlock;
 521                         lookLineLevel = GetLevel(--lookLine), lookLineLevelNum = lookLineLevel & SC_FOLDLEVELNUMBERMASK) {
 522                         if ((lookLineLevel & SC_FOLDLEVELWHITEFLAG) || (lookLineLevelNum > (level & SC_FOLDLEVELNUMBERMASK))) {
 523                                 firstChangeableLineBefore = lookLine;
 524                                 break;
 525                         }
 526                 }
 527         }
             // Fallback: the line just above the block.
 528         if (firstChangeableLineBefore == -1)
 529                 firstChangeableLineBefore = beginFoldBlock - 1;
 530
             // Step 5: scan from line + 1 down to the block end for the first header
             // whose level number is lower than that of the line following it.
 531         int firstChangeableLineAfter = -1;
 532         for (lookLine = line + 1, lookLineLevel = GetLevel(lookLine), lookLineLevelNum = lookLineLevel & SC_FOLDLEVELNUMBERMASK;
 533                 lookLine <= endFoldBlock;
 534                 lookLineLevel = GetLevel(++lookLine), lookLineLevelNum = lookLineLevel & SC_FOLDLEVELNUMBERMASK) {
 535                 if ((lookLineLevel & SC_FOLDLEVELHEADERFLAG) && (lookLineLevelNum < (GetLevel(lookLine + 1) & SC_FOLDLEVELNUMBERMASK))) {
 536                         firstChangeableLineAfter = lookLine;
 537                         break;
 538                 }
 539         }
             // Fallback: the line just below the block.
 540         if (firstChangeableLineAfter == -1)
 541                 firstChangeableLineAfter = endFoldBlock + 1;
 542
 543         highlightDelimiter.beginFoldBlock = beginFoldBlock;
 544         highlightDelimiter.endFoldBlock = endFoldBlock;
 545         highlightDelimiter.firstChangeableLineBefore = firstChangeableLineBefore;
 546         highlightDelimiter.firstChangeableLineAfter = firstChangeableLineAfter;
 547 }
 548
 549 int Document::ClampPositionIntoDocument(int pos) const {
             // Limits pos to the range 0 to Length() inclusive.
 550         return Platform::Clamp(pos, 0, Length());
 551 }
 552
 553 bool Document::IsCrLf(int pos) const {
             // True when the byte at pos is '\r' and the byte after it is '\n'. Positions
             // that are negative, or that leave no room for a second byte, give false.
 554         if (pos < 0)
 555                 return false;
 556         if (pos >= (Length() - 1))
 557                 return false;
 558         return (cb.CharAt(pos) == '\r') && (cb.CharAt(pos + 1) == '\n');
 559 }
 560
 561 int Document::LenChar(int pos) {
             // Returns the number of bytes taken by the character that starts at pos.
             // A "\r\n" pair counts as one character of length 2.
 562         if (pos < 0) {
 563                 return 1;
 564         } else if (IsCrLf(pos)) {
 565                 return 2;
 566         } else if (SC_CP_UTF8 == dbcsCodePage) {
                     // The width comes from the lead byte table and is cut short when the
                     // sequence would run past the end of the document.
                     // NOTE(review): for pos >= Length() the clamped result is not positive
                     // (assuming positive table entries) - confirm callers never pass that.
 567                 const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos));
 568                 const int widthCharBytes = UTF8BytesOfLead[leadByte];
 569                 int lengthDoc = Length();
 570                 if ((pos + widthCharBytes) > lengthDoc)
 571                         return lengthDoc - pos;
 572                 else
 573                         return widthCharBytes;
 574         } else if (dbcsCodePage) {
                     // Any other non-zero code page: a lead byte starts a two byte character.
 575                 return IsDBCSLeadByte(cb.CharAt(pos)) ? 2 : 1;
 576         } else {
 577                 return 1;
 578         }
 579 }
 580
 581 bool Document::InGoodUTF8(int pos, int &start, int &end) const {
             // Tests whether pos lies inside a UTF-8 sequence that UTF8Classify does not
             // flag as invalid. On success returns true, with start set to the position
             // of the lead byte and end set to the position just past the sequence.
             // start is written on every call; end is only written on success.
             //
             // Walk back over trail bytes, at most UTF8MaxBytes of them, to find the
             // byte that should be the lead byte.
 582         int trail = pos;
 583         while ((trail>0) && (pos-trail < UTF8MaxBytes) && UTF8IsTrailByte(static_cast<unsigned char>(cb.CharAt(trail-1))))
 584                 trail--;
 585         start = (trail > 0) ? trail-1 : trail;
 586
 587         const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(start));
 588         const int widthCharBytes = UTF8BytesOfLead[leadByte];
 589         if (widthCharBytes == 1) {
                     // The candidate starts a one byte sequence, so pos cannot be inside it.
 590                 return false;
 591         } else {
 592                 int trailBytes = widthCharBytes - 1;
 593                 int len = pos - start;
 594                 if (len > trailBytes)
 595                         // pos too far from lead
 596                         return false;
                     // Copy the sequence into a zero filled buffer; bytes that would lie
                     // beyond the end of the document stay 0.
 597                 char charBytes[UTF8MaxBytes] = {static_cast<char>(leadByte),0,0,0};
 598                 for (int b=1; b<widthCharBytes && ((start+b) < Length()); b++)
 599                         charBytes[b] = cb.CharAt(static_cast<int>(start+b));
 600                 int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(charBytes), widthCharBytes);
 601                 if (utf8status & UTF8MaskInvalid)
 602                         return false;
 603                 end = start + widthCharBytes;
 604                 return true;
 605         }
 606 }
 607
 608 // Normalise a position so that it is not halfway through a two byte character.
 609 // This can occur in two situations -
 610 // When lines are terminated with \r\n pairs which should be treated as one character.
 611 // When displaying DBCS text such as Japanese.
 612 // If moving, move the position in the indicated direction.
     // UTF-8 sequences are handled as well, in the SC_CP_UTF8 branch below.
     // A moveDir greater than 0 moves to the end of the character, any other value
     // moves to its start. The \r\n check is only made when checkLineEnd is true.
 613 int Document::MovePositionOutsideChar(int pos, int moveDir, bool checkLineEnd) const {
 614         //Platform::DebugPrintf("NoCRLF %d %d\n", pos, moveDir);
 615         // If out of range, just return minimum/maximum value.
 616         if (pos <= 0)
 617                 return 0;
 618         if (pos >= Length())
 619                 return Length();
 620
 621         // PLATFORM_ASSERT(pos > 0 && pos < Length());
             // pos is between the '\r' and the '\n' of a pair when the pair starts at
             // pos - 1.
 622         if (checkLineEnd && IsCrLf(pos - 1)) {
 623                 if (moveDir > 0)
 624                         return pos + 1;
 625                 else
 626                         return pos - 1;
 627         }
 628
 629         if (dbcsCodePage) {
 630                 if (SC_CP_UTF8 == dbcsCodePage) {
 631                         unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
 632                         // If ch is not a trail byte then pos is valid intercharacter position
 633                         if (UTF8IsTrailByte(ch)) {
 634                                 int startUTF = pos;
 635                                 int endUTF = pos;
 636                                 if (InGoodUTF8(pos, startUTF, endUTF)) {
 637                                         // ch is a trail byte within a UTF-8 character
 638                                         if (moveDir > 0)
 639                                                 pos = endUTF;
 640                                         else
 641                                                 pos = startUTF;
 642                                 }
 643                                 // Else invalid UTF-8 so return position of isolated trail byte
 644                         }
 645                 } else {
 646                         // Anchor DBCS calculations at start of line because start of line can
 647                         // not be a DBCS trail byte.
 648                         int posStartLine = LineStart(LineFromPosition(pos));
 649                         if (pos == posStartLine)
 650                                 return pos;
 651
 652                         // Step back until a non-lead-byte is found.
 653                         int posCheck = pos;
 654                         while ((posCheck > posStartLine) && IsDBCSLeadByte(cb.CharAt(posCheck-1)))
 655                                 posCheck--;
 656
 657                         // Check from known start of character.
                             // Walk forward one whole character at a time: landing exactly on pos
                             // means pos is a boundary, stepping over pos means pos is inside
                             // the character that starts at posCheck.
 658                         while (posCheck < pos) {
 659                                 int mbsize = IsDBCSLeadByte(cb.CharAt(posCheck)) ? 2 : 1;
 660                                 if (posCheck + mbsize == pos) {
 661                                         return pos;
 662                                 } else if (posCheck + mbsize > pos) {
 663                                         if (moveDir > 0) {
 664                                                 return posCheck + mbsize;
 665                                         } else {
 666                                                 return posCheck;
 667                                         }
 668                                 }
 669                                 posCheck += mbsize;
 670                         }
 671                 }
 672         }
 673
 674         return pos;
 675 }
 676
 677 // NextPosition moves between valid positions - it can not handle a position in the middle of a
 678 // multi-byte character. It is used to iterate through text more efficiently than MovePositionOutsideChar.
 679 // A \r\n pair is treated as two characters.
 680 int Document::NextPosition(int pos, int moveDir) const {
 681         // If out of range, just return minimum/maximum value.
 682         int increment = (moveDir > 0) ? 1 : -1;
 683         if (pos + increment <= 0)
 684                 return 0;
 685         if (pos + increment >= Length())
 686                 return Length();
 687
 688         if (dbcsCodePage) {
 689                 if (SC_CP_UTF8 == dbcsCodePage) {
 690                         if (increment == 1) {
 691                                 // Simple forward movement case so can avoid some checks
 692                                 const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(pos));
 693                                 if (UTF8IsAscii(leadByte)) {
 694                                         // Single byte character or invalid
 695                                         pos++;
 696                                 } else {
 697                                         const int widthCharBytes = UTF8BytesOfLead[leadByte];
 698                                         char charBytes[UTF8MaxBytes] = {static_cast<char>(leadByte),0,0,0};
 699                                         for (int b=1; b<widthCharBytes; b++)
 700                                                 charBytes[b] = cb.CharAt(static_cast<int>(pos+b));
 701                                         int utf8status = UTF8Classify(reinterpret_cast<const unsigned char *>(charBytes), widthCharBytes);
 702                                         if (utf8status & UTF8MaskInvalid)
 703                                                 pos++;
 704                                         else
 705                                                 pos += utf8status & UTF8MaskWidth;
 706                                 }
 707                         } else {
 708                                 // Examine byte before position
 709                                 pos--;
 710                                 unsigned char ch = static_cast<unsigned char>(cb.CharAt(pos));
 711                                 // If ch is not a trail byte then pos is valid intercharacter position
 712                                 if (UTF8IsTrailByte(ch)) {
 713                                         // If ch is a trail byte in a valid UTF-8 character then return start of character
 714                                         int startUTF = pos;
 715                                         int endUTF = pos;
 716                                         if (InGoodUTF8(pos, startUTF, endUTF)) {
 717                                                 pos = startUTF;
 718                                         }
 719                                         // Else invalid UTF-8 so return position of isolated trail byte
 720                                 }
 721                         }
 722                 } else {
 723                         if (moveDir > 0) {
 724                                 int mbsize = IsDBCSLeadByte(cb.CharAt(pos)) ? 2 : 1;
 725                                 pos += mbsize;
 726                                 if (pos > Length())
 727                                         pos = Length();
 728                         } else {
 729                                 // Anchor DBCS calculations at start of line because start of line can
 730                                 // not be a DBCS trail byte.
 731                                 int posStartLine = LineStart(LineFromPosition(pos));
 732                                 // See http://msdn.microsoft.com/en-us/library/cc194792%28v=MSDN.10%29.aspx
 733                                 // http://msdn.microsoft.com/en-us/library/cc194790.aspx
 734                                 if ((pos - 1) <= posStartLine) {
 735                                         return pos - 1;
 736                                 } else if (IsDBCSLeadByte(cb.CharAt(pos - 1))) {
 737                                         // Must actually be trail byte
 738                                         return pos - 2;
 739                                 } else {
 740                                         // Otherwise, step back until a non-lead-byte is found.
 741                                         int posTemp = pos - 1;
 742                                         while (posStartLine <= --posTemp && IsDBCSLeadByte(cb.CharAt(posTemp)))
 743                                                 ;
 744                                         // Now posTemp+1 must point to the beginning of a character,
 745                                         // so figure out whether we went back an even or an odd
 746                                         // number of bytes and go back 1 or 2 bytes, respectively.
 747                                         return (pos - 1 - ((pos - posTemp) & 1));
 748                                 }
 749                         }
 750                 }
 751         } else {
 752                 pos += increment;
 753         }
 754
 755         return pos;
 756 }
 757
 758 bool Document::NextCharacter(int &pos, int moveDir) const {
 759         // Returns true if pos changed
 760         int posNext = NextPosition(pos, moveDir);
 761         if (posNext == pos) {
 762                 return false;
 763         } else {
 764                 pos = posNext;
 765                 return true;
 766         }
 767 }
 768
 769 static inline int UnicodeFromBytes(const unsigned char *us) {
 770         if (us[0] < 0xC2) {
 771                 return us[0];
 772         } else if (us[0] < 0xE0) {
 773                 return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F);
 774         } else if (us[0] < 0xF0) {
 775                 return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F);
 776         } else if (us[0] < 0xF5) {
 777                 return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
 778         }
 779         return us[0];
 780 }
 781
 782 // Return -1  on out-of-bounds
 783 int SCI_METHOD Document::GetRelativePosition(int positionStart, int characterOffset) const {
 784         int pos = positionStart;
 785         if (dbcsCodePage) {
 786                 const int increment = (characterOffset > 0) ? 1 : -1;
 787                 while (characterOffset != 0) {
 788                         const int posNext = NextPosition(pos, increment);
 789                         if (posNext == pos)
 790                                 return INVALID_POSITION;
 791                         pos = posNext;
 792                         characterOffset -= increment;
 793                 }
 794         } else {
 795                 pos = positionStart + characterOffset;
 796                 if ((pos < 0) || (pos > Length()))
 797                         return INVALID_POSITION;
 798         }
 799         return pos;
 800 }
 801
 802 int SCI_METHOD Document::GetCharacterAndWidth(int position, int *pWidth) const {
 803         int character;
 804         int bytesInCharacter = 1;
 805         if (dbcsCodePage) {
 806                 const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(position));
 807                 if (SC_CP_UTF8 == dbcsCodePage) {
 808                         if (UTF8IsAscii(leadByte)) {
 809                                 // Single byte character or invalid
 810                                 character =  leadByte;
 811                         } else {
 812                                 const int widthCharBytes = UTF8BytesOfLead[leadByte];
 813                                 unsigned char charBytes[UTF8MaxBytes] = {leadByte,0,0,0};
 814                                 for (int b=1; b<widthCharBytes; b++)
 815                                         charBytes[b] = static_cast<unsigned char>(cb.CharAt(position+b));
 816                                 int utf8status = UTF8Classify(charBytes, widthCharBytes);
 817                                 if (utf8status & UTF8MaskInvalid) {
 818                                         // Report as singleton surrogate values which are invalid Unicode
 819                                         character =  0xDC80 + leadByte;
 820                                 } else {
 821                                         bytesInCharacter = utf8status & UTF8MaskWidth;
 822                                         character = UnicodeFromBytes(charBytes);
 823                                 }
 824                         }
 825                 } else {
 826                         if (IsDBCSLeadByte(leadByte)) {
 827                                 bytesInCharacter = 2;
 828                                 character = (leadByte << 8) | static_cast<unsigned char>(cb.CharAt(position+1));
 829                         } else {
 830                                 character = leadByte;
 831                         }
 832                 }
 833         } else {
 834                 character = cb.CharAt(position);
 835         }
 836         if (pWidth) {
 837                 *pWidth = bytesInCharacter;
 838         }
 839         return character;
 840 }
 841
 842 int SCI_METHOD Document::CodePage() const {
 843         return dbcsCodePage;
 844 }
 845
 846 bool SCI_METHOD Document::IsDBCSLeadByte(char ch) const {
 847         // Byte ranges found in Wikipedia articles with relevant search strings in each case
 848         unsigned char uch = static_cast<unsigned char>(ch);
 849         switch (dbcsCodePage) {
 850                 case 932:
 851                         // Shift_jis
 852                         return ((uch >= 0x81) && (uch <= 0x9F)) ||
 853                                 ((uch >= 0xE0) && (uch <= 0xFC));
 854                                 // Lead bytes F0 to FC may be a Microsoft addition.
 855                 case 936:
 856                         // GBK
 857                         return (uch >= 0x81) && (uch <= 0xFE);
 858                 case 949:
 859                         // Korean Wansung KS C-5601-1987
 860                         return (uch >= 0x81) && (uch <= 0xFE);
 861                 case 950:
 862                         // Big5
 863                         return (uch >= 0x81) && (uch <= 0xFE);
 864                 case 1361:
 865                         // Korean Johab KS C-5601-1992
 866                         return
 867                                 ((uch >= 0x84) && (uch <= 0xD3)) ||
 868                                 ((uch >= 0xD8) && (uch <= 0xDE)) ||
 869                                 ((uch >= 0xE0) && (uch <= 0xF9));
 870         }
 871         return false;
 872 }
 873
 874 static inline bool IsSpaceOrTab(int ch) {
 875         return ch == ' ' || ch == '\t';
 876 }
 877
 878 // Need to break text into segments near lengthSegment but taking into
 879 // account the encoding to not break inside a UTF-8 or DBCS character
 880 // and also trying to avoid breaking inside a pair of combining characters.
 881 // The segment length must always be long enough (more than 4 bytes)
 882 // so that there will be at least one whole character to make a segment.
 883 // For UTF-8, text must consist only of valid whole characters.
 884 // In preference order from best to worst:
 885 //   1) Break after space
 886 //   2) Break before punctuation
 887 //   3) Break after whole character
 888
 889 int Document::SafeSegment(const char *text, int length, int lengthSegment) const {
 890         if (length <= lengthSegment)
 891                 return length;
 892         int lastSpaceBreak = -1;
 893         int lastPunctuationBreak = -1;
 894         int lastEncodingAllowedBreak = 0;
 895         for (int j=0; j < lengthSegment;) {
 896                 unsigned char ch = static_cast<unsigned char>(text[j]);
 897                 if (j > 0) {
 898                         if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) {
 899                                 lastSpaceBreak = j;
 900                         }
 901                         if (ch < 'A') {
 902                                 lastPunctuationBreak = j;
 903                         }
 904                 }
 905                 lastEncodingAllowedBreak = j;
 906
 907                 if (dbcsCodePage == SC_CP_UTF8) {
 908                         j += UTF8BytesOfLead[ch];
 909                 } else if (dbcsCodePage) {
 910                         j += IsDBCSLeadByte(ch) ? 2 : 1;
 911                 } else {
 912                         j++;
 913                 }
 914         }
 915         if (lastSpaceBreak >= 0) {
 916                 return lastSpaceBreak;
 917         } else if (lastPunctuationBreak >= 0) {
 918                 return lastPunctuationBreak;
 919         }
 920         return lastEncodingAllowedBreak;
 921 }
 922
 923 EncodingFamily Document::CodePageFamily() const {
 924         if (SC_CP_UTF8 == dbcsCodePage)
 925                 return efUnicode;
 926         else if (dbcsCodePage)
 927                 return efDBCS;
 928         else
 929                 return efEightBit;
 930 }
 931
 932 void Document::ModifiedAt(int pos) {
 933         if (endStyled > pos)
 934                 endStyled = pos;
 935 }
 936
 937 void Document::CheckReadOnly() {
 938         if (cb.IsReadOnly() && enteredReadOnlyCount == 0) {
 939                 enteredReadOnlyCount++;
 940                 NotifyModifyAttempt();
 941                 enteredReadOnlyCount--;
 942         }
 943 }
 944
 945 // Document only modified by gateways DeleteChars, InsertString, Undo, Redo, and SetStyleAt.
 946 // SetStyleAt does not change the persistent state of a document
 947
 948 bool Document::DeleteChars(int pos, int len) {
 949         if (pos < 0)
 950                 return false;
 951         if (len <= 0)
 952                 return false;
 953         if ((pos + len) > Length())
 954                 return false;
 955         CheckReadOnly();
 956         if (enteredModification != 0) {
 957                 return false;
 958         } else {
 959                 enteredModification++;
 960                 if (!cb.IsReadOnly()) {
 961                         NotifyModified(
 962                             DocModification(
 963                                 SC_MOD_BEFOREDELETE | SC_PERFORMED_USER,
 964                                 pos, len,
 965                                 0, 0));
 966                         int prevLinesTotal = LinesTotal();
 967                         bool startSavePoint = cb.IsSavePoint();
 968                         bool startSequence = false;
 969                         const char *text = cb.DeleteChars(pos, len, startSequence);
 970                         if (startSavePoint && cb.IsCollectingUndo())
 971                                 NotifySavePoint(!startSavePoint);
 972                         if ((pos < Length()) || (pos == 0))
 973                                 ModifiedAt(pos);
 974                         else
 975                                 ModifiedAt(pos-1);
 976                         NotifyModified(
 977                             DocModification(
 978                                 SC_MOD_DELETETEXT | SC_PERFORMED_USER | (startSequence?SC_STARTACTION:0),
 979                                 pos, len,
 980                                 LinesTotal() - prevLinesTotal, text));
 981                 }
 982                 enteredModification--;
 983         }
 984         return !cb.IsReadOnly();
 985 }
 986
 987 /**
 988  * Insert a string with a length.
 989  */
 990 int Document::InsertString(int position, const char *s, int insertLength) {
 991         if (insertLength <= 0) {
 992                 return 0;
 993         }
 994         CheckReadOnly();        // Application may change read only state here
 995         if (cb.IsReadOnly()) {
 996                 return 0;
 997         }
 998         if (enteredModification != 0) {
 999                 return 0;
1000         }
1001         enteredModification++;
1002         insertionSet = false;
1003         insertion.clear();
1004         NotifyModified(
1005                 DocModification(
1006                         SC_MOD_INSERTCHECK,
1007                         position, insertLength,
1008                         0, s));
1009         if (insertionSet) {
1010                 s = insertion.c_str();
1011                 insertLength = static_cast<int>(insertion.length());
1012         }
1013         NotifyModified(
1014                 DocModification(
1015                         SC_MOD_BEFOREINSERT | SC_PERFORMED_USER,
1016                         position, insertLength,
1017                         0, s));
1018         int prevLinesTotal = LinesTotal();
1019         bool startSavePoint = cb.IsSavePoint();
1020         bool startSequence = false;
1021         const char *text = cb.InsertString(position, s, insertLength, startSequence);
1022         if (startSavePoint && cb.IsCollectingUndo())
1023                 NotifySavePoint(!startSavePoint);
1024         ModifiedAt(position);
1025         NotifyModified(
1026                 DocModification(
1027                         SC_MOD_INSERTTEXT | SC_PERFORMED_USER | (startSequence?SC_STARTACTION:0),
1028                         position, insertLength,
1029                         LinesTotal() - prevLinesTotal, text));
1030         if (insertionSet) {     // Free memory as could be large
1031                 std::string().swap(insertion);
1032         }
1033         enteredModification--;
1034         return insertLength;
1035 }
1036
1037 void Document::ChangeInsertion(const char *s, int length) {
1038         insertionSet = true;
1039         insertion.assign(s, length);
1040 }
1041
1042 int SCI_METHOD Document::AddData(char *data, int length) {
1043         try {
1044                 int position = Length();
1045                 InsertString(position, data, length);
1046         } catch (std::bad_alloc &) {
1047                 return SC_STATUS_BADALLOC;
1048         } catch (...) {
1049                 return SC_STATUS_FAILURE;
1050         }
1051         return 0;
1052 }
1053
1054 void * SCI_METHOD Document::ConvertToDocument() {
1055         return this;
1056 }
1057
1058 int Document::Undo() {
1059         int newPos = -1;
1060         CheckReadOnly();
1061         if ((enteredModification == 0) && (cb.IsCollectingUndo())) {
1062                 enteredModification++;
1063                 if (!cb.IsReadOnly()) {
1064                         bool startSavePoint = cb.IsSavePoint();
1065                         bool multiLine = false;
1066                         int steps = cb.StartUndo();
1067                         //Platform::DebugPrintf("Steps=%d\n", steps);
1068                         int coalescedRemovePos = -1;
1069                         int coalescedRemoveLen = 0;
1070                         int prevRemoveActionPos = -1;
1071                         int prevRemoveActionLen = 0;
1072                         for (int step = 0; step < steps; step++) {
1073                                 const int prevLinesTotal = LinesTotal();
1074                                 const Action &action = cb.GetUndoStep();
1075                                 if (action.at == removeAction) {
1076                                         NotifyModified(DocModification(
1077                                                                         SC_MOD_BEFOREINSERT | SC_PERFORMED_UNDO, action));
1078                                 } else if (action.at == containerAction) {
1079                                         DocModification dm(SC_MOD_CONTAINER | SC_PERFORMED_UNDO);
1080                                         dm.token = action.position;
1081                                         NotifyModified(dm);
1082                                         if (!action.mayCoalesce) {
1083                                                 coalescedRemovePos = -1;
1084                                                 coalescedRemoveLen = 0;
1085                                                 prevRemoveActionPos = -1;
1086                                                 prevRemoveActionLen = 0;
1087                                         }
1088                                 } else {
1089                                         NotifyModified(DocModification(
1090                                                                         SC_MOD_BEFOREDELETE | SC_PERFORMED_UNDO, action));
1091                                 }
1092                                 cb.PerformUndoStep();
1093                                 if (action.at != containerAction) {
1094                                         ModifiedAt(action.position);
1095                                         newPos = action.position;
1096                                 }
1097
1098                                 int modFlags = SC_PERFORMED_UNDO;
1099                                 // With undo, an insertion action becomes a deletion notification
1100                                 if (action.at == removeAction) {
1101                                         newPos += action.lenData;
1102                                         modFlags |= SC_MOD_INSERTTEXT;
1103                                         if ((coalescedRemoveLen > 0) &&
1104                                                 (action.position == prevRemoveActionPos || action.position == (prevRemoveActionPos + prevRemoveActionLen))) {
1105                                                 coalescedRemoveLen += action.lenData;
1106                                                 newPos = coalescedRemovePos + coalescedRemoveLen;
1107                                         } else {
1108                                                 coalescedRemovePos = action.position;
1109                                                 coalescedRemoveLen = action.lenData;
1110                                         }
1111                                         prevRemoveActionPos = action.position;
1112                                         prevRemoveActionLen = action.lenData;
1113                                 } else if (action.at == insertAction) {
1114                                         modFlags |= SC_MOD_DELETETEXT;
1115                                         coalescedRemovePos = -1;
1116                                         coalescedRemoveLen = 0;
1117                                         prevRemoveActionPos = -1;
1118                                         prevRemoveActionLen = 0;
1119                                 }
1120                                 if (steps > 1)
1121                                         modFlags |= SC_MULTISTEPUNDOREDO;
1122                                 const int linesAdded = LinesTotal() - prevLinesTotal;
1123                                 if (linesAdded != 0)
1124                                         multiLine = true;
1125                                 if (step == steps - 1) {
1126                                         modFlags |= SC_LASTSTEPINUNDOREDO;
1127                                         if (multiLine)
1128                                                 modFlags |= SC_MULTILINEUNDOREDO;
1129                                 }
1130                                 NotifyModified(DocModification(modFlags, action.position, action.lenData,
1131                                                                                            linesAdded, action.data));
1132                         }
1133
1134                         bool endSavePoint = cb.IsSavePoint();
1135                         if (startSavePoint != endSavePoint)
1136                                 NotifySavePoint(endSavePoint);
1137                 }
1138                 enteredModification--;
1139         }
1140         return newPos;
1141 }
1142
1143 int Document::Redo() {
1144         int newPos = -1;
1145         CheckReadOnly();
1146         if ((enteredModification == 0) && (cb.IsCollectingUndo())) {
1147                 enteredModification++;
1148                 if (!cb.IsReadOnly()) {
1149                         bool startSavePoint = cb.IsSavePoint();
1150                         bool multiLine = false;
1151                         int steps = cb.StartRedo();
1152                         for (int step = 0; step < steps; step++) {
1153                                 const int prevLinesTotal = LinesTotal();
1154                                 const Action &action = cb.GetRedoStep();
1155                                 if (action.at == insertAction) {
1156                                         NotifyModified(DocModification(
1157                                                                         SC_MOD_BEFOREINSERT | SC_PERFORMED_REDO, action));
1158                                 } else if (action.at == containerAction) {
1159                                         DocModification dm(SC_MOD_CONTAINER | SC_PERFORMED_REDO);
1160                                         dm.token = action.position;
1161                                         NotifyModified(dm);
1162                                 } else {
1163                                         NotifyModified(DocModification(
1164                                                                         SC_MOD_BEFOREDELETE | SC_PERFORMED_REDO, action));
1165                                 }
1166                                 cb.PerformRedoStep();
1167                                 if (action.at != containerAction) {
1168                                         ModifiedAt(action.position);
1169                                         newPos = action.position;
1170                                 }
1171
1172                                 int modFlags = SC_PERFORMED_REDO;
1173                                 if (action.at == insertAction) {
1174                                         newPos += action.lenData;
1175                                         modFlags |= SC_MOD_INSERTTEXT;
1176                                 } else if (action.at == removeAction) {
1177                                         modFlags |= SC_MOD_DELETETEXT;
1178                                 }
1179                                 if (steps > 1)
1180                                         modFlags |= SC_MULTISTEPUNDOREDO;
1181                                 const int linesAdded = LinesTotal() - prevLinesTotal;
1182                                 if (linesAdded != 0)
1183                                         multiLine = true;
1184                                 if (step == steps - 1) {
1185                                         modFlags |= SC_LASTSTEPINUNDOREDO;
1186                                         if (multiLine)
1187                                                 modFlags |= SC_MULTILINEUNDOREDO;
1188                                 }
1189                                 NotifyModified(
1190                                         DocModification(modFlags, action.position, action.lenData,
1191                                                                         linesAdded, action.data));
1192                         }
1193
1194                         bool endSavePoint = cb.IsSavePoint();
1195                         if (startSavePoint != endSavePoint)
1196                                 NotifySavePoint(endSavePoint);
1197                 }
1198                 enteredModification--;
1199         }
1200         return newPos;
1201 }
1202
1203 void Document::DelChar(int pos) {
1204         DeleteChars(pos, LenChar(pos));
1205 }
1206
1207 void Document::DelCharBack(int pos) {
1208         if (pos <= 0) {
1209                 return;
1210         } else if (IsCrLf(pos - 2)) {
1211                 DeleteChars(pos - 2, 2);
1212         } else if (dbcsCodePage) {
1213                 int startChar = NextPosition(pos, -1);
1214                 DeleteChars(startChar, pos - startChar);
1215         } else {
1216                 DeleteChars(pos - 1, 1);
1217         }
1218 }
1219
// Return the first tab stop strictly after column pos, with stops every
// tabSize columns. tabSize must not be 0.
static int NextTab(int pos, int tabSize) {
	const int pastPreviousStop = pos % tabSize;
	return pos - pastPreviousStop + tabSize;
}
1223
// Build the white space needed to indent by indent columns.
// When insertSpaces is false as many tabs as fit (one per tabSize columns) are
// used, followed by spaces for the remainder; otherwise only spaces are used.
// A non-positive indent yields an empty string. A non-positive tabSize cannot
// make progress with tabs, so it falls back to spaces instead of looping forever.
static std::string CreateIndentation(int indent, int tabSize, bool insertSpaces) {
	if (indent <= 0)
		return std::string();
	int tabs = 0;
	if (!insertSpaces && (tabSize > 0))
		tabs = indent / tabSize;
	const int spaces = indent - tabs * tabSize;
	std::string indentation(static_cast<size_t>(tabs), '\t');
	indentation.append(static_cast<size_t>(spaces), ' ');
	return indentation;
}
1238
1239 int SCI_METHOD Document::GetLineIndentation(int line) {
1240         int indent = 0;
1241         if ((line >= 0) && (line < LinesTotal())) {
1242                 int lineStart = LineStart(line);
1243                 int length = Length();
1244                 for (int i = lineStart; i < length; i++) {
1245                         char ch = cb.CharAt(i);
1246                         if (ch == ' ')
1247                                 indent++;
1248                         else if (ch == '\t')
1249                                 indent = NextTab(indent, tabInChars);
1250                         else
1251                                 return indent;
1252                 }
1253         }
1254         return indent;
1255 }
1256
1257 int Document::SetLineIndentation(int line, int indent) {
1258         int indentOfLine = GetLineIndentation(line);
1259         if (indent < 0)
1260                 indent = 0;
1261         if (indent != indentOfLine) {
1262                 std::string linebuf = CreateIndentation(indent, tabInChars, !useTabs);
1263                 int thisLineStart = LineStart(line);
1264                 int indentPos = GetLineIndentPosition(line);
1265                 UndoGroup ug(this);
1266                 DeleteChars(thisLineStart, indentPos - thisLineStart);
1267                 return thisLineStart + InsertString(thisLineStart, linebuf.c_str(),
1268                         static_cast<int>(linebuf.length()));
1269         } else {
1270                 return GetLineIndentPosition(line);
1271         }
1272 }
1273
1274 int Document::GetLineIndentPosition(int line) const {
1275         if (line < 0)
1276                 return 0;
1277         int pos = LineStart(line);
1278         int length = Length();
1279         while ((pos < length) && IsSpaceOrTab(cb.CharAt(pos))) {
1280                 pos++;
1281         }
1282         return pos;
1283 }
1284
1285 int Document::GetColumn(int pos) {
1286         int column = 0;
1287         int line = LineFromPosition(pos);
1288         if ((line >= 0) && (line < LinesTotal())) {
1289                 for (int i = LineStart(line); i < pos;) {
1290                         char ch = cb.CharAt(i);
1291                         if (ch == '\t') {
1292                                 column = NextTab(column, tabInChars);
1293                                 i++;
1294                         } else if (ch == '\r') {
1295                                 return column;
1296                         } else if (ch == '\n') {
1297                                 return column;
1298                         } else if (i >= Length()) {
1299                                 return column;
1300                         } else {
1301                                 column++;
1302                                 i = NextPosition(i, 1);
1303                         }
1304                 }
1305         }
1306         return column;
1307 }
1308
1309 int Document::CountCharacters(int startPos, int endPos) const {
1310         startPos = MovePositionOutsideChar(startPos, 1, false);
1311         endPos = MovePositionOutsideChar(endPos, -1, false);
1312         int count = 0;
1313         int i = startPos;
1314         while (i < endPos) {
1315                 count++;
1316                 if (IsCrLf(i))
1317                         i++;
1318                 i = NextPosition(i, 1);
1319         }
1320         return count;
1321 }
1322
1323 int Document::FindColumn(int line, int column) {
1324         int position = LineStart(line);
1325         if ((line >= 0) && (line < LinesTotal())) {
1326                 int columnCurrent = 0;
1327                 while ((columnCurrent < column) && (position < Length())) {
1328                         char ch = cb.CharAt(position);
1329                         if (ch == '\t') {
1330                                 columnCurrent = NextTab(columnCurrent, tabInChars);
1331                                 if (columnCurrent > column)
1332                                         return position;
1333                                 position++;
1334                         } else if (ch == '\r') {
1335                                 return position;
1336                         } else if (ch == '\n') {
1337                                 return position;
1338                         } else {
1339                                 columnCurrent++;
1340                                 position = NextPosition(position, 1);
1341                         }
1342                 }
1343         }
1344         return position;
1345 }
1346
1347 void Document::Indent(bool forwards, int lineBottom, int lineTop) {
1348         // Dedent - suck white space off the front of the line to dedent by equivalent of a tab
1349         for (int line = lineBottom; line >= lineTop; line--) {
1350                 int indentOfLine = GetLineIndentation(line);
1351                 if (forwards) {
1352                         if (LineStart(line) < LineEnd(line)) {
1353                                 SetLineIndentation(line, indentOfLine + IndentSize());
1354                         }
1355                 } else {
1356                         SetLineIndentation(line, indentOfLine - IndentSize());
1357                 }
1358         }
1359 }
1360
1361 // Convert line endings for a piece of text to a particular mode.
1362 // Stop at len or when a NUL is found.
1363 std::string Document::TransformLineEnds(const char *s, size_t len, int eolModeWanted) {
1364         std::string dest;
1365         for (size_t i = 0; (i < len) && (s[i]); i++) {
1366                 if (s[i] == '\n' || s[i] == '\r') {
1367                         if (eolModeWanted == SC_EOL_CR) {
1368                                 dest.push_back('\r');
1369                         } else if (eolModeWanted == SC_EOL_LF) {
1370                                 dest.push_back('\n');
1371                         } else { // eolModeWanted == SC_EOL_CRLF
1372                                 dest.push_back('\r');
1373                                 dest.push_back('\n');
1374                         }
1375                         if ((s[i] == '\r') && (i+1 < len) && (s[i+1] == '\n')) {
1376                                 i++;
1377                         }
1378                 } else {
1379                         dest.push_back(s[i]);
1380                 }
1381         }
1382         return dest;
1383 }
1384
1385 void Document::ConvertLineEnds(int eolModeSet) {
1386         UndoGroup ug(this);
1387
1388         for (int pos = 0; pos < Length(); pos++) {
1389                 if (cb.CharAt(pos) == '\r') {
1390                         if (cb.CharAt(pos + 1) == '\n') {
1391                                 // CRLF
1392                                 if (eolModeSet == SC_EOL_CR) {
1393                                         DeleteChars(pos + 1, 1); // Delete the LF
1394                                 } else if (eolModeSet == SC_EOL_LF) {
1395                                         DeleteChars(pos, 1); // Delete the CR
1396                                 } else {
1397                                         pos++;
1398                                 }
1399                         } else {
1400                                 // CR
1401                                 if (eolModeSet == SC_EOL_CRLF) {
1402                                         pos += InsertString(pos + 1, "\n", 1); // Insert LF
1403                                 } else if (eolModeSet == SC_EOL_LF) {
1404                                         pos += InsertString(pos, "\n", 1); // Insert LF
1405                                         DeleteChars(pos, 1); // Delete CR
1406                                         pos--;
1407                                 }
1408                         }
1409                 } else if (cb.CharAt(pos) == '\n') {
1410                         // LF
1411                         if (eolModeSet == SC_EOL_CRLF) {
1412                                 pos += InsertString(pos, "\r", 1); // Insert CR
1413                         } else if (eolModeSet == SC_EOL_CR) {
1414                                 pos += InsertString(pos, "\r", 1); // Insert CR
1415                                 DeleteChars(pos, 1); // Delete LF
1416                                 pos--;
1417                         }
1418                 }
1419         }
1420
1421 }
1422
1423 bool Document::IsWhiteLine(int line) const {
1424         int currentChar = LineStart(line);
1425         int endLine = LineEnd(line);
1426         while (currentChar < endLine) {
1427                 if (cb.CharAt(currentChar) != ' ' && cb.CharAt(currentChar) != '\t') {
1428                         return false;
1429                 }
1430                 ++currentChar;
1431         }
1432         return true;
1433 }
1434
1435 int Document::ParaUp(int pos) const {
1436         int line = LineFromPosition(pos);
1437         line--;
1438         while (line >= 0 && IsWhiteLine(line)) { // skip empty lines
1439                 line--;
1440         }
1441         while (line >= 0 && !IsWhiteLine(line)) { // skip non-empty lines
1442                 line--;
1443         }
1444         line++;
1445         return LineStart(line);
1446 }
1447
1448 int Document::ParaDown(int pos) const {
1449         int line = LineFromPosition(pos);
1450         while (line < LinesTotal() && !IsWhiteLine(line)) { // skip non-empty lines
1451                 line++;
1452         }
1453         while (line < LinesTotal() && IsWhiteLine(line)) { // skip empty lines
1454                 line++;
1455         }
1456         if (line < LinesTotal())
1457                 return LineStart(line);
1458         else // end of a document
1459                 return LineEnd(line-1);
1460 }
1461
1462 CharClassify::cc Document::WordCharClass(unsigned char ch) const {
1463         if ((SC_CP_UTF8 == dbcsCodePage) && (!UTF8IsAscii(ch)))
1464                 return CharClassify::ccWord;
1465         return charClass.GetClass(ch);
1466 }
1467
1468 /**
1469  * Used by commmands that want to select whole words.
1470  * Finds the start of word at pos when delta < 0 or the end of the word when delta >= 0.
1471  */
1472 int Document::ExtendWordSelect(int pos, int delta, bool onlyWordCharacters) {
1473         CharClassify::cc ccStart = CharClassify::ccWord;
1474         if (delta < 0) {
1475                 if (!onlyWordCharacters)
1476                         ccStart = WordCharClass(cb.CharAt(pos-1));
1477                 while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart))
1478                         pos--;
1479         } else {
1480                 if (!onlyWordCharacters && pos < Length())
1481                         ccStart = WordCharClass(cb.CharAt(pos));
1482                 while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart))
1483                         pos++;
1484         }
1485         return MovePositionOutsideChar(pos, delta, true);
1486 }
1487
1488 /**
1489  * Find the start of the next word in either a forward (delta >= 0) or backwards direction
1490  * (delta < 0).
1491  * This is looking for a transition between character classes although there is also some
1492  * additional movement to transit white space.
1493  * Used by cursor movement by word commands.
1494  */
1495 int Document::NextWordStart(int pos, int delta) {
1496         if (delta < 0) {
1497                 while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace))
1498                         pos--;
1499                 if (pos > 0) {
1500                         CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1));
1501                         while (pos > 0 && (WordCharClass(cb.CharAt(pos - 1)) == ccStart)) {
1502                                 pos--;
1503                         }
1504                 }
1505         } else {
1506                 CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos));
1507                 while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == ccStart))
1508                         pos++;
1509                 while (pos < (Length()) && (WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace))
1510                         pos++;
1511         }
1512         return pos;
1513 }
1514
1515 /**
1516  * Find the end of the next word in either a forward (delta >= 0) or backwards direction
1517  * (delta < 0).
1518  * This is looking for a transition between character classes although there is also some
1519  * additional movement to transit white space.
1520  * Used by cursor movement by word commands.
1521  */
1522 int Document::NextWordEnd(int pos, int delta) {
1523         if (delta < 0) {
1524                 if (pos > 0) {
1525                         CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos-1));
1526                         if (ccStart != CharClassify::ccSpace) {
1527                                 while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == ccStart) {
1528                                         pos--;
1529                                 }
1530                         }
1531                         while (pos > 0 && WordCharClass(cb.CharAt(pos - 1)) == CharClassify::ccSpace) {
1532                                 pos--;
1533                         }
1534                 }
1535         } else {
1536                 while (pos < Length() && WordCharClass(cb.CharAt(pos)) == CharClassify::ccSpace) {
1537                         pos++;
1538                 }
1539                 if (pos < Length()) {
1540                         CharClassify::cc ccStart = WordCharClass(cb.CharAt(pos));
1541                         while (pos < Length() && WordCharClass(cb.CharAt(pos)) == ccStart) {
1542                                 pos++;
1543                         }
1544                 }
1545         }
1546         return pos;
1547 }
1548
1549 /**
1550  * Check that the character at the given position is a word or punctuation character and that
1551  * the previous character is of a different character class.
1552  */
1553 bool Document::IsWordStartAt(int pos) const {
1554         if (pos > 0) {
1555                 CharClassify::cc ccPos = WordCharClass(CharAt(pos));
1556                 return (ccPos == CharClassify::ccWord || ccPos == CharClassify::ccPunctuation) &&
1557                         (ccPos != WordCharClass(CharAt(pos - 1)));
1558         }
1559         return true;
1560 }
1561
1562 /**
1563  * Check that the character at the given position is a word or punctuation character and that
1564  * the next character is of a different character class.
1565  */
1566 bool Document::IsWordEndAt(int pos) const {
1567         if (pos < Length()) {
1568                 CharClassify::cc ccPrev = WordCharClass(CharAt(pos-1));
1569                 return (ccPrev == CharClassify::ccWord || ccPrev == CharClassify::ccPunctuation) &&
1570                         (ccPrev != WordCharClass(CharAt(pos)));
1571         }
1572         return true;
1573 }
1574
1575 /**
1576  * Check that the given range is has transitions between character classes at both
1577  * ends and where the characters on the inside are word or punctuation characters.
1578  */
1579 bool Document::IsWordAt(int start, int end) const {
1580         return IsWordStartAt(start) && IsWordEndAt(end);
1581 }
1582
1583 bool Document::MatchesWordOptions(bool word, bool wordStart, int pos, int length) const {
1584         return (!word && !wordStart) ||
1585                         (word && IsWordAt(pos, pos + length)) ||
1586                         (wordStart && IsWordStartAt(pos));
1587 }
1588
1589 bool Document::HasCaseFolder(void) const {
1590         return pcf != 0;
1591 }
1592
1593 void Document::SetCaseFolder(CaseFolder *pcf_) {
1594         delete pcf;
1595         pcf = pcf_;
1596 }
1597
1598 Document::CharacterExtracted Document::ExtractCharacter(int position) const {
1599         const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(position));
1600         if (UTF8IsAscii(leadByte)) {
1601                 // Common case: ASCII character
1602                 return CharacterExtracted(leadByte, 1);
1603         }
1604         const int widthCharBytes = UTF8BytesOfLead[leadByte];
1605         unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
1606         for (int b=1; b<widthCharBytes; b++)
1607                 charBytes[b] = static_cast<unsigned char>(cb.CharAt(position + b));
1608         int utf8status = UTF8Classify(charBytes, widthCharBytes);
1609         if (utf8status & UTF8MaskInvalid) {
1610                 // Treat as invalid and use up just one byte
1611                 return CharacterExtracted(unicodeReplacementChar, 1);
1612         } else {
1613                 return CharacterExtracted(UnicodeFromBytes(charBytes), utf8status & UTF8MaskWidth);
1614         }
1615 }
1616
1617 /**
1618  * Find text in document, supporting both forward and backward
1619  * searches (just pass minPos > maxPos to do a backward search)
1620  * Has not been tested with backwards DBCS searches yet.
1621  */
1622 long Document::FindText(int minPos, int maxPos, const char *search,
1623                         bool caseSensitive, bool word, bool wordStart, bool regExp, int flags,
1624                         int *length) {
1625         if (*length <= 0)
1626                 return minPos;
1627         if (regExp) {
1628                 if (!regex)
1629                         regex = CreateRegexSearch(&charClass);
1630                 return regex->FindText(this, minPos, maxPos, search, caseSensitive, word, wordStart, flags, length);
1631         } else {
1632
1633                 const bool forward = minPos <= maxPos;
1634                 const int increment = forward ? 1 : -1;
1635
1636                 // Range endpoints should not be inside DBCS characters, but just in case, move them.
1637                 const int startPos = MovePositionOutsideChar(minPos, increment, false);
1638                 const int endPos = MovePositionOutsideChar(maxPos, increment, false);
1639
1640                 // Compute actual search ranges needed
1641                 const int lengthFind = *length;
1642
1643                 //Platform::DebugPrintf("Find %d %d %s %d\n", startPos, endPos, ft->lpstrText, lengthFind);
1644                 const int limitPos = Platform::Maximum(startPos, endPos);
1645                 int pos = startPos;
1646                 if (!forward) {
1647                         // Back all of a character
1648                         pos = NextPosition(pos, increment);
1649                 }
1650                 if (caseSensitive) {
1651                         const int endSearch = (startPos <= endPos) ? endPos - lengthFind + 1 : endPos;
1652                         const char charStartSearch =  search[0];
1653                         while (forward ? (pos < endSearch) : (pos >= endSearch)) {
1654                                 if (CharAt(pos) == charStartSearch) {
1655                                         bool found = (pos + lengthFind) <= limitPos;
1656                                         for (int indexSearch = 1; (indexSearch < lengthFind) && found; indexSearch++) {
1657                                                 found = CharAt(pos + indexSearch) == search[indexSearch];
1658                                         }
1659                                         if (found && MatchesWordOptions(word, wordStart, pos, lengthFind)) {
1660                                                 return pos;
1661                                         }
1662                                 }
1663                                 if (!NextCharacter(pos, increment))
1664                                         break;
1665                         }
1666                 } else if (SC_CP_UTF8 == dbcsCodePage) {
1667                         const size_t maxFoldingExpansion = 4;
1668                         std::vector<char> searchThing(lengthFind * UTF8MaxBytes * maxFoldingExpansion + 1);
1669                         const int lenSearch = static_cast<int>(
1670                                 pcf->Fold(&searchThing[0], searchThing.size(), search, lengthFind));
1671                         char bytes[UTF8MaxBytes + 1];
1672                         char folded[UTF8MaxBytes * maxFoldingExpansion + 1];
1673                         while (forward ? (pos < endPos) : (pos >= endPos)) {
1674                                 int widthFirstCharacter = 0;
1675                                 int posIndexDocument = pos;
1676                                 int indexSearch = 0;
1677                                 bool characterMatches = true;
1678                                 for (;;) {
1679                                         const unsigned char leadByte = static_cast<unsigned char>(cb.CharAt(posIndexDocument));
1680                                         bytes[0] = leadByte;
1681                                         int widthChar = 1;
1682                                         if (!UTF8IsAscii(leadByte)) {
1683                                                 const int widthCharBytes = UTF8BytesOfLead[leadByte];
1684                                                 for (int b=1; b<widthCharBytes; b++) {
1685                                                         bytes[b] = cb.CharAt(posIndexDocument+b);
1686                                                 }
1687                                                 widthChar = UTF8Classify(reinterpret_cast<const unsigned char *>(bytes), widthCharBytes) & UTF8MaskWidth;
1688                                         }
1689                                         if (!widthFirstCharacter)
1690                                                 widthFirstCharacter = widthChar;
1691                                         if ((posIndexDocument + widthChar) > limitPos)
1692                                                 break;
1693                                         const int lenFlat = static_cast<int>(pcf->Fold(folded, sizeof(folded), bytes, widthChar));
1694                                         folded[lenFlat] = 0;
1695                                         // Does folded match the buffer
1696                                         characterMatches = 0 == memcmp(folded, &searchThing[0] + indexSearch, lenFlat);
1697                                         if (!characterMatches)
1698                                                 break;
1699                                         posIndexDocument += widthChar;
1700                                         indexSearch += lenFlat;
1701                                         if (indexSearch >= lenSearch)
1702                                                 break;
1703                                 }
1704                                 if (characterMatches && (indexSearch == static_cast<int>(lenSearch))) {
1705                                         if (MatchesWordOptions(word, wordStart, pos, posIndexDocument - pos)) {
1706                                                 *length = posIndexDocument - pos;
1707                                                 return pos;
1708                                         }
1709                                 }
1710                                 if (forward) {
1711                                         pos += widthFirstCharacter;
1712                                 } else {
1713                                         if (!NextCharacter(pos, increment))
1714                                                 break;
1715                                 }
1716                         }
1717                 } else if (dbcsCodePage) {
1718                         const size_t maxBytesCharacter = 2;
1719                         const size_t maxFoldingExpansion = 4;
1720                         std::vector<char> searchThing(lengthFind * maxBytesCharacter * maxFoldingExpansion + 1);
1721                         const int lenSearch = static_cast<int>(
1722                                 pcf->Fold(&searchThing[0], searchThing.size(), search, lengthFind));
1723                         while (forward ? (pos < endPos) : (pos >= endPos)) {
1724                                 int indexDocument = 0;
1725                                 int indexSearch = 0;
1726                                 bool characterMatches = true;
1727                                 while (characterMatches &&
1728                                         ((pos + indexDocument) < limitPos) &&
1729                                         (indexSearch < lenSearch)) {
1730                                         char bytes[maxBytesCharacter + 1];
1731                                         bytes[0] = cb.CharAt(pos + indexDocument);
1732                                         const int widthChar = IsDBCSLeadByte(bytes[0]) ? 2 : 1;
1733                                         if (widthChar == 2)
1734                                                 bytes[1] = cb.CharAt(pos + indexDocument + 1);
1735                                         if ((pos + indexDocument + widthChar) > limitPos)
1736                                                 break;
1737                                         char folded[maxBytesCharacter * maxFoldingExpansion + 1];
1738                                         const int lenFlat = static_cast<int>(pcf->Fold(folded, sizeof(folded), bytes, widthChar));
1739                                         folded[lenFlat] = 0;
1740                                         // Does folded match the buffer
1741                                         characterMatches = 0 == memcmp(folded, &searchThing[0] + indexSearch, lenFlat);
1742                                         indexDocument += widthChar;
1743                                         indexSearch += lenFlat;
1744                                 }
1745                                 if (characterMatches && (indexSearch == static_cast<int>(lenSearch))) {
1746                                         if (MatchesWordOptions(word, wordStart, pos, indexDocument)) {
1747                                                 *length = indexDocument;
1748                                                 return pos;
1749                                         }
1750                                 }
1751                                 if (!NextCharacter(pos, increment))
1752                                         break;
1753                         }
1754                 } else {
1755                         const int endSearch = (startPos <= endPos) ? endPos - lengthFind + 1 : endPos;
1756                         std::vector<char> searchThing(lengthFind + 1);
1757                         pcf->Fold(&searchThing[0], searchThing.size(), search, lengthFind);
1758                         while (forward ? (pos < endSearch) : (pos >= endSearch)) {
1759                                 bool found = (pos + lengthFind) <= limitPos;
1760                                 for (int indexSearch = 0; (indexSearch < lengthFind) && found; indexSearch++) {
1761                                         char ch = CharAt(pos + indexSearch);
1762                                         char folded[2];
1763                                         pcf->Fold(folded, sizeof(folded), &ch, 1);
1764                                         found = folded[0] == searchThing[indexSearch];
1765                                 }
1766                                 if (found && MatchesWordOptions(word, wordStart, pos, lengthFind)) {
1767                                         return pos;
1768                                 }
1769                                 if (!NextCharacter(pos, increment))
1770                                         break;
1771                         }
1772                 }
1773         }
1774         //Platform::DebugPrintf("Not found\n");
1775         return -1;
1776 }
1777
1778 const char *Document::SubstituteByPosition(const char *text, int *length) {
1779         if (regex)
1780                 return regex->SubstituteByPosition(this, text, length);
1781         else
1782                 return 0;
1783 }
1784
1785 int Document::LinesTotal() const {
1786         return cb.Lines();
1787 }
1788
1789 void Document::SetDefaultCharClasses(bool includeWordClass) {
1790     charClass.SetDefaultCharClasses(includeWordClass);
1791 }
1792
1793 void Document::SetCharClasses(const unsigned char *chars, CharClassify::cc newCharClass) {
1794     charClass.SetCharClasses(chars, newCharClass);
1795 }
1796
1797 int Document::GetCharsOfClass(CharClassify::cc characterClass, unsigned char *buffer) {
1798     return charClass.GetCharsOfClass(characterClass, buffer);
1799 }
1800
1801 void SCI_METHOD Document::StartStyling(int position, char) {
1802         endStyled = position;
1803 }
1804
1805 bool SCI_METHOD Document::SetStyleFor(int length, char style) {
1806         if (enteredStyling != 0) {
1807                 return false;
1808         } else {
1809                 enteredStyling++;
1810                 int prevEndStyled = endStyled;
1811                 if (cb.SetStyleFor(endStyled, length, style)) {
1812                         DocModification mh(SC_MOD_CHANGESTYLE | SC_PERFORMED_USER,
1813                                            prevEndStyled, length);
1814                         NotifyModified(mh);
1815                 }
1816                 endStyled += length;
1817                 enteredStyling--;
1818                 return true;
1819         }
1820 }
1821
1822 bool SCI_METHOD Document::SetStyles(int length, const char *styles) {
1823         if (enteredStyling != 0) {
1824                 return false;
1825         } else {
1826                 enteredStyling++;
1827                 bool didChange = false;
1828                 int startMod = 0;
1829                 int endMod = 0;
1830                 for (int iPos = 0; iPos < length; iPos++, endStyled++) {
1831                         PLATFORM_ASSERT(endStyled < Length());
1832                         if (cb.SetStyleAt(endStyled, styles[iPos])) {
1833                                 if (!didChange) {
1834                                         startMod = endStyled;
1835                                 }
1836                                 didChange = true;
1837                                 endMod = endStyled;
1838                         }
1839                 }
1840                 if (didChange) {
1841                         DocModification mh(SC_MOD_CHANGESTYLE | SC_PERFORMED_USER,
1842                                            startMod, endMod - startMod + 1);
1843                         NotifyModified(mh);
1844                 }
1845                 enteredStyling--;
1846                 return true;
1847         }
1848 }
1849
1850 void Document::EnsureStyledTo(int pos) {
1851         if ((enteredStyling == 0) && (pos > GetEndStyled())) {
1852                 IncrementStyleClock();
1853                 if (pli && !pli->UseContainerLexing()) {
1854                         int lineEndStyled = LineFromPosition(GetEndStyled());
1855                         int endStyledTo = LineStart(lineEndStyled);
1856                         pli->Colourise(endStyledTo, pos);
1857                 } else {
1858                         // Ask the watchers to style, and stop as soon as one responds.
1859                         for (std::vector<WatcherWithUserData>::iterator it = watchers.begin();
1860                                 (pos > GetEndStyled()) && (it != watchers.end()); ++it) {
1861                                 it->watcher->NotifyStyleNeeded(this, it->userData, pos);
1862                         }
1863                 }
1864         }
1865 }
1866
1867 void Document::LexerChanged() {
1868         // Tell the watchers the lexer has changed.
1869         for (std::vector<WatcherWithUserData>::iterator it = watchers.begin(); it != watchers.end(); ++it) {
1870                 it->watcher->NotifyLexerChanged(this, it->userData);
1871         }
1872 }
1873
1874 int SCI_METHOD Document::SetLineState(int line, int state) {
1875         int statePrevious = static_cast<LineState *>(perLineData[ldState])->SetLineState(line, state);
1876         if (state != statePrevious) {
1877                 DocModification mh(SC_MOD_CHANGELINESTATE, LineStart(line), 0, 0, 0, line);
1878                 NotifyModified(mh);
1879         }
1880         return statePrevious;
1881 }
1882
1883 int SCI_METHOD Document::GetLineState(int line) const {
1884         return static_cast<LineState *>(perLineData[ldState])->GetLineState(line);
1885 }
1886
1887 int Document::GetMaxLineState() {
1888         return static_cast<LineState *>(perLineData[ldState])->GetMaxLineState();
1889 }
1890
1891 void SCI_METHOD Document::ChangeLexerState(int start, int end) {
1892         DocModification mh(SC_MOD_LEXERSTATE, start, end-start, 0, 0, 0);
1893         NotifyModified(mh);
1894 }
1895
1896 StyledText Document::MarginStyledText(int line) const {
1897         LineAnnotation *pla = static_cast<LineAnnotation *>(perLineData[ldMargin]);
1898         return StyledText(pla->Length(line), pla->Text(line),
1899                 pla->MultipleStyles(line), pla->Style(line), pla->Styles(line));
1900 }
1901
1902 void Document::MarginSetText(int line, const char *text) {
1903         static_cast<LineAnnotation *>(perLineData[ldMargin])->SetText(line, text);
1904         DocModification mh(SC_MOD_CHANGEMARGIN, LineStart(line), 0, 0, 0, line);
1905         NotifyModified(mh);
1906 }
1907
1908 void Document::MarginSetStyle(int line, int style) {
1909         static_cast<LineAnnotation *>(perLineData[ldMargin])->SetStyle(line, style);
1910         NotifyModified(DocModification(SC_MOD_CHANGEMARGIN, LineStart(line), 0, 0, 0, line));
1911 }
1912
1913 void Document::MarginSetStyles(int line, const unsigned char *styles) {
1914         static_cast<LineAnnotation *>(perLineData[ldMargin])->SetStyles(line, styles);
1915         NotifyModified(DocModification(SC_MOD_CHANGEMARGIN, LineStart(line), 0, 0, 0, line));
1916 }
1917
1918 void Document::MarginClearAll() {
1919         int maxEditorLine = LinesTotal();
1920         for (int l=0; l<maxEditorLine; l++)
1921                 MarginSetText(l, 0);
1922         // Free remaining data
1923         static_cast<LineAnnotation *>(perLineData[ldMargin])->ClearAll();
1924 }
1925
1926 StyledText Document::AnnotationStyledText(int line) const {
1927         LineAnnotation *pla = static_cast<LineAnnotation *>(perLineData[ldAnnotation]);
1928         return StyledText(pla->Length(line), pla->Text(line),
1929                 pla->MultipleStyles(line), pla->Style(line), pla->Styles(line));
1930 }
1931
1932 void Document::AnnotationSetText(int line, const char *text) {
1933         if (line >= 0 && line < LinesTotal()) {
1934                 const int linesBefore = AnnotationLines(line);
1935                 static_cast<LineAnnotation *>(perLineData[ldAnnotation])->SetText(line, text);
1936                 const int linesAfter = AnnotationLines(line);
1937                 DocModification mh(SC_MOD_CHANGEANNOTATION, LineStart(line), 0, 0, 0, line);
1938                 mh.annotationLinesAdded = linesAfter - linesBefore;
1939                 NotifyModified(mh);
1940         }
1941 }
1942
1943 void Document::AnnotationSetStyle(int line, int style) {
1944         static_cast<LineAnnotation *>(perLineData[ldAnnotation])->SetStyle(line, style);
1945         DocModification mh(SC_MOD_CHANGEANNOTATION, LineStart(line), 0, 0, 0, line);
1946         NotifyModified(mh);
1947 }
1948
1949 void Document::AnnotationSetStyles(int line, const unsigned char *styles) {
1950         if (line >= 0 && line < LinesTotal()) {
1951                 static_cast<LineAnnotation *>(perLineData[ldAnnotation])->SetStyles(line, styles);
1952         }
1953 }
1954
1955 int Document::AnnotationLines(int line) const {
1956         return static_cast<LineAnnotation *>(perLineData[ldAnnotation])->Lines(line);
1957 }
1958
1959 void Document::AnnotationClearAll() {
1960         int maxEditorLine = LinesTotal();
1961         for (int l=0; l<maxEditorLine; l++)
1962                 AnnotationSetText(l, 0);
1963         // Free remaining data
1964         static_cast<LineAnnotation *>(perLineData[ldAnnotation])->ClearAll();
1965 }
1966
1967 void Document::IncrementStyleClock() {
1968         styleClock = (styleClock + 1) % 0x100000;
1969 }
1970
1971 void SCI_METHOD Document::DecorationFillRange(int position, int value, int fillLength) {
1972         if (decorations.FillRange(position, value, fillLength)) {
1973                 DocModification mh(SC_MOD_CHANGEINDICATOR | SC_PERFORMED_USER,
1974                                                         position, fillLength);
1975                 NotifyModified(mh);
1976         }
1977 }
1978
1979 bool Document::AddWatcher(DocWatcher *watcher, void *userData) {
1980         WatcherWithUserData wwud(watcher, userData);
1981         std::vector<WatcherWithUserData>::iterator it =
1982                 std::find(watchers.begin(), watchers.end(), wwud);
1983         if (it != watchers.end())
1984                 return false;
1985         watchers.push_back(wwud);
1986         return true;
1987 }
1988
1989 bool Document::RemoveWatcher(DocWatcher *watcher, void *userData) {
1990         std::vector<WatcherWithUserData>::iterator it =
1991                 std::find(watchers.begin(), watchers.end(), WatcherWithUserData(watcher, userData));
1992         if (it != watchers.end()) {
1993                 watchers.erase(it);
1994                 return true;
1995         }
1996         return false;
1997 }
1998
1999 void Document::NotifyModifyAttempt() {
2000         for (std::vector<WatcherWithUserData>::iterator it = watchers.begin(); it != watchers.end(); ++it) {
2001                 it->watcher->NotifyModifyAttempt(this, it->userData);
2002         }
2003 }
2004
2005 void Document::NotifySavePoint(bool atSavePoint) {
2006         for (std::vector<WatcherWithUserData>::iterator it = watchers.begin(); it != watchers.end(); ++it) {
2007                 it->watcher->NotifySavePoint(this, it->userData, atSavePoint);
2008         }
2009 }
2010
2011 void Document::NotifyModified(DocModification mh) {
2012         if (mh.modificationType & SC_MOD_INSERTTEXT) {
2013                 decorations.InsertSpace(mh.position, mh.length);
2014         } else if (mh.modificationType & SC_MOD_DELETETEXT) {
2015                 decorations.DeleteRange(mh.position, mh.length);
2016         }
2017         for (std::vector<WatcherWithUserData>::iterator it = watchers.begin(); it != watchers.end(); ++it) {
2018                 it->watcher->NotifyModified(this, mh, it->userData);
2019         }
2020 }
2021
2022 bool Document::IsWordPartSeparator(char ch) const {
2023         return (WordCharClass(ch) == CharClassify::ccWord) && IsPunctuation(ch);
2024 }
2025
/**
 * Move left from @a pos to the start of the preceding word part.
 * The scan steps back one byte, skips any run of word part separators and
 * then walks back over a run of characters of the same kind as the character
 * found there (lower case, upper case, digit, punctuation, space or non-ASCII).
 * @return the new position; @a pos is returned unchanged when it is not greater than 0.
 */
int Document::WordPartLeft(int pos) {
	if (pos > 0) {
		--pos;
		char startChar = cb.CharAt(pos);
		if (IsWordPartSeparator(startChar)) {
			// Step back over the whole run of separators
			while (pos > 0 && IsWordPartSeparator(cb.CharAt(pos))) {
				--pos;
			}
		}
		if (pos > 0) {
			// startChar decides which kind of run is scanned; pos then moves to the
			// character before it so each loop below tests the preceding characters.
			startChar = cb.CharAt(pos);
			--pos;
			if (IsLowerCase(startChar)) {
				while (pos > 0 && IsLowerCase(cb.CharAt(pos)))
					--pos;
				// A single upper case letter in front of the lower case run is kept
				// as part of it; anything else is stepped back over.
				if (!IsUpperCase(cb.CharAt(pos)) && !IsLowerCase(cb.CharAt(pos)))
					++pos;
			} else if (IsUpperCase(startChar)) {
				while (pos > 0 && IsUpperCase(cb.CharAt(pos)))
					--pos;
				// The loop may stop on a character outside the run: move forward onto the run
				if (!IsUpperCase(cb.CharAt(pos)))
					++pos;
			} else if (IsADigit(startChar)) {
				while (pos > 0 && IsADigit(cb.CharAt(pos)))
					--pos;
				if (!IsADigit(cb.CharAt(pos)))
					++pos;
			} else if (IsPunctuation(startChar)) {
				while (pos > 0 && IsPunctuation(cb.CharAt(pos)))
					--pos;
				if (!IsPunctuation(cb.CharAt(pos)))
					++pos;
			} else if (isspacechar(startChar)) {
				while (pos > 0 && isspacechar(cb.CharAt(pos)))
					--pos;
				if (!isspacechar(cb.CharAt(pos)))
					++pos;
			} else if (!IsASCII(startChar)) {
				while (pos > 0 && !IsASCII(cb.CharAt(pos)))
					--pos;
				if (IsASCII(cb.CharAt(pos)))
					++pos;
			} else {
				// Unclassified character: undo the step back so pos is on startChar
				++pos;
			}
		}
	}
	return pos;
}
2075
/**
 * Move right from @a pos to the end of the word part that starts there.
 * Any leading run of word part separators is skipped first; the scan then
 * advances over a run of characters of the same kind as the character found
 * there (non-ASCII, lower case, upper case, digit, punctuation or space).
 * @return the new position, never scanned beyond Length().
 */
int Document::WordPartRight(int pos) {
	char startChar = cb.CharAt(pos);
	int length = Length();
	if (IsWordPartSeparator(startChar)) {
		while (pos < length && IsWordPartSeparator(cb.CharAt(pos)))
			++pos;
		// Classify the run by the first character after the separators
		startChar = cb.CharAt(pos);
	}
	if (!IsASCII(startChar)) {
		while (pos < length && !IsASCII(cb.CharAt(pos)))
			++pos;
	} else if (IsLowerCase(startChar)) {
		while (pos < length && IsLowerCase(cb.CharAt(pos)))
			++pos;
	} else if (IsUpperCase(startChar)) {
		if (IsLowerCase(cb.CharAt(pos + 1))) {
			// One upper case letter followed by lower case: take the capital and the lower case run
			++pos;
			while (pos < length && IsLowerCase(cb.CharAt(pos)))
				++pos;
		} else {
			while (pos < length && IsUpperCase(cb.CharAt(pos)))
				++pos;
		}
		// An upper case run directly followed by lower case: give the last capital
		// back so it can start the next word part.
		if (IsLowerCase(cb.CharAt(pos)) && IsUpperCase(cb.CharAt(pos - 1)))
			--pos;
	} else if (IsADigit(startChar)) {
		while (pos < length && IsADigit(cb.CharAt(pos)))
			++pos;
	} else if (IsPunctuation(startChar)) {
		while (pos < length && IsPunctuation(cb.CharAt(pos)))
			++pos;
	} else if (isspacechar(startChar)) {
		while (pos < length && isspacechar(cb.CharAt(pos)))
			++pos;
	} else {
		// Unclassified character: step over just this one
		++pos;
	}
	return pos;
}
2115
/**
 * True for the two line end characters, carriage return and line feed.
 */
bool IsLineEndChar(char c) {
	switch (c) {
	case '\r':
	case '\n':
		return true;
	default:
		return false;
	}
}
2119
2120 int Document::ExtendStyleRange(int pos, int delta, bool singleLine) {
2121         int sStart = cb.StyleAt(pos);
2122         if (delta < 0) {
2123                 while (pos > 0 && (cb.StyleAt(pos) == sStart) && (!singleLine || !IsLineEndChar(cb.CharAt(pos))))
2124                         pos--;
2125                 pos++;
2126         } else {
2127                 while (pos < (Length()) && (cb.StyleAt(pos) == sStart) && (!singleLine || !IsLineEndChar(cb.CharAt(pos))))
2128                         pos++;
2129         }
2130         return pos;
2131 }
2132
/**
 * Return the brace that pairs with @a ch for (), [], {} and <>,
 * or '\0' when @a ch is not one of those eight characters.
 */
static char BraceOpposite(char ch) {
	// Partners sit next to each other, so flipping the low bit of the index
	// of one brace gives the index of the other.
	static const char bracePairs[] = "()[]{}<>";
	for (int i = 0; bracePairs[i] != '\0'; i++) {
		if (bracePairs[i] == ch)
			return bracePairs[i ^ 1];
	}
	return '\0';
}
2155
2156 // TODO: should be able to extend styled region to find matching brace
2157 int Document::BraceMatch(int position, int /*maxReStyle*/) {
2158         char chBrace = CharAt(position);
2159         char chSeek = BraceOpposite(chBrace);
2160         if (chSeek == '\0')
2161                 return - 1;
2162         char styBrace = static_cast<char>(StyleAt(position));
2163         int direction = -1;
2164         if (chBrace == '(' || chBrace == '[' || chBrace == '{' || chBrace == '<')
2165                 direction = 1;
2166         int depth = 1;
2167         position = NextPosition(position, direction);
2168         while ((position >= 0) && (position < Length())) {
2169                 char chAtPos = CharAt(position);
2170                 char styAtPos = static_cast<char>(StyleAt(position));
2171                 if ((position > GetEndStyled()) || (styAtPos == styBrace)) {
2172                         if (chAtPos == chBrace)
2173                                 depth++;
2174                         if (chAtPos == chSeek)
2175                                 depth--;
2176                         if (depth == 0)
2177                                 return position;
2178                 }
2179                 int positionBeforeMove = position;
2180                 position = NextPosition(position, direction);
2181                 if (position == positionBeforeMove)
2182                         break;
2183         }
2184         return - 1;
2185 }
2186
2187 /**
2188  * Implementation of RegexSearchBase for the default built-in regular expression engine
2189  */
2190 class BuiltinRegex : public RegexSearchBase {
2191 public:
2192         explicit BuiltinRegex(CharClassify *charClassTable) : search(charClassTable) {}
2193
2194         virtual ~BuiltinRegex() {
2195         }
2196
2197         virtual long FindText(Document *doc, int minPos, int maxPos, const char *s,
2198                         bool caseSensitive, bool word, bool wordStart, int flags,
2199                         int *length);
2200
2201         virtual const char *SubstituteByPosition(Document *doc, const char *text, int *length);
2202
2203 private:
2204         RESearch search;
2205         std::string substituted;
2206 };
2207
2208 namespace {
2209
2210 /**
2211 * RESearchRange keeps track of search range.
2212 */
2213 class RESearchRange {
2214 public:
2215         const Document *doc;
2216         int increment;
2217         int startPos;
2218         int endPos;
2219         int lineRangeStart;
2220         int lineRangeEnd;
2221         int lineRangeBreak;
2222         RESearchRange(const Document *doc_, int minPos, int maxPos) : doc(doc_) {
2223                 increment = (minPos <= maxPos) ? 1 : -1;
2224
2225                 // Range endpoints should not be inside DBCS characters, but just in case, move them.
2226                 startPos = doc->MovePositionOutsideChar(minPos, 1, false);
2227                 endPos = doc->MovePositionOutsideChar(maxPos, 1, false);
2228
2229                 lineRangeStart = doc->LineFromPosition(startPos);
2230                 lineRangeEnd = doc->LineFromPosition(endPos);
2231                 if ((increment == 1) &&
2232                         (startPos >= doc->LineEnd(lineRangeStart)) &&
2233                         (lineRangeStart < lineRangeEnd)) {
2234                         // the start position is at end of line or between line end characters.
2235                         lineRangeStart++;
2236                         startPos = doc->LineStart(lineRangeStart);
2237                 } else if ((increment == -1) &&
2238                         (startPos <= doc->LineStart(lineRangeStart)) &&
2239                         (lineRangeStart > lineRangeEnd)) {
2240                         // the start position is at beginning of line.
2241                         lineRangeStart--;
2242                         startPos = doc->LineEnd(lineRangeStart);
2243                 }
2244                 lineRangeBreak = lineRangeEnd + increment;
2245         }
2246         Range LineRange(int line) const {
2247                 Range range(doc->LineStart(line), doc->LineEnd(line));
2248                 if (increment == 1) {
2249                         if (line == lineRangeStart)
2250                                 range.start = startPos;
2251                         if (line == lineRangeEnd)
2252                                 range.end = endPos;
2253                 } else {
2254                         if (line == lineRangeEnd)
2255                                 range.start = endPos;
2256                         if (line == lineRangeStart)
2257                                 range.end = startPos;
2258                 }
2259                 return range;
2260         }
2261 };
2262
2263 // Define a way for the Regular Expression code to access the document
2264 class DocumentIndexer : public CharacterIndexer {
2265         Document *pdoc;
2266         int end;
2267 public:
2268         DocumentIndexer(Document *pdoc_, int end_) :
2269                 pdoc(pdoc_), end(end_) {
2270         }
2271
2272         virtual ~DocumentIndexer() {
2273         }
2274
2275         virtual char CharAt(int index) {
2276                 if (index < 0 || index >= end)
2277                         return 0;
2278                 else
2279                         return pdoc->CharAt(index);
2280         }
2281 };
2282
2283 #ifdef CXX11_REGEX
2284
2285 class ByteIterator : public std::iterator<std::bidirectional_iterator_tag, char> {
2286 public:
2287         const Document *doc;
2288         Position position;
2289         ByteIterator(const Document *doc_ = 0, Position position_ = 0) : doc(doc_), position(position_) {
2290         }
2291         ByteIterator(const ByteIterator &other) {
2292                 doc = other.doc;
2293                 position = other.position;
2294         }
2295         ByteIterator &operator=(const ByteIterator &other) {
2296                 if (this != &other) {
2297                         doc = other.doc;
2298                         position = other.position;
2299                 }
2300                 return *this;
2301         }
2302         char operator*() const {
2303                 return doc->CharAt(position);
2304         }
2305         ByteIterator &operator++() {
2306                 position++;
2307                 return *this;
2308         }
2309         ByteIterator operator++(int) {
2310                 ByteIterator retVal(*this);
2311                 position++;
2312                 return retVal;
2313         }
2314         ByteIterator &operator--() {
2315                 position--;
2316                 return *this;
2317         }
2318         bool operator==(const ByteIterator &other) const {
2319                 return doc == other.doc && position == other.position;
2320         }
2321         bool operator!=(const ByteIterator &other) const {
2322                 return doc != other.doc || position != other.position;
2323         }
2324         int Pos() const {
2325                 return position;
2326         }
2327         int PosRoundUp() const {
2328                 return position;
2329         }
2330 };
2331
2332 // On Windows, wchar_t is 16 bits wide and on Unix it is 32 bits wide.
2333 // Would be better to use sizeof(wchar_t) or similar to differentiate
2334 // but easier for now to hard-code platforms.
2335 // C++11 has char16_t and char32_t but neither Clang nor Visual C++
2336 // appear to allow specializing basic_regex over these.
2337
2338 #ifdef _WIN32
2339 #define WCHAR_T_IS_16 1
2340 #else
2341 #define WCHAR_T_IS_16 0
2342 #endif
2343
2344 #if WCHAR_T_IS_16
2345
2346 // On Windows, report non-BMP characters as 2 separate surrogates as that
2347 // matches wregex since it is based on wchar_t.
2348 class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> {
2349         // These 3 fields determine the iterator position and are used for comparisons
2350         const Document *doc;
2351         Position position;
2352         size_t characterIndex;
2353         // Remaining fields are derived from the determining fields so are excluded in comparisons
2354         unsigned int lenBytes;
2355         size_t lenCharacters;
2356         wchar_t buffered[2];
2357 public:
2358         UTF8Iterator(const Document *doc_ = 0, Position position_ = 0) :
2359                 doc(doc_), position(position_), characterIndex(0), lenBytes(0), lenCharacters(0) {
2360                 buffered[0] = 0;
2361                 buffered[1] = 0;
2362         }
2363         UTF8Iterator(const UTF8Iterator &other) {
2364                 doc = other.doc;
2365                 position = other.position;
2366                 characterIndex = other.characterIndex;
2367                 lenBytes = other.lenBytes;
2368                 lenCharacters = other.lenCharacters;
2369                 buffered[0] = other.buffered[0];
2370                 buffered[1] = other.buffered[1];
2371         }
2372         UTF8Iterator &operator=(const UTF8Iterator &other) {
2373                 if (this != &other) {
2374                         doc = other.doc;
2375                         position = other.position;
2376                         characterIndex = other.characterIndex;
2377                         lenBytes = other.lenBytes;
2378                         lenCharacters = other.lenCharacters;
2379                         buffered[0] = other.buffered[0];
2380                         buffered[1] = other.buffered[1];
2381                 }
2382                 return *this;
2383         }
2384         wchar_t operator*() {
2385                 if (lenCharacters == 0) {
2386                         ReadCharacter();
2387                 }
2388                 return buffered[characterIndex];
2389         }
2390         UTF8Iterator &operator++() {
2391                 if ((characterIndex + 1) < (lenCharacters)) {
2392                         characterIndex++;
2393                 } else {
2394                         position += lenBytes;
2395                         ReadCharacter();
2396                         characterIndex = 0;
2397                 }
2398                 return *this;
2399         }
2400         UTF8Iterator operator++(int) {
2401                 UTF8Iterator retVal(*this);
2402                 if ((characterIndex + 1) < (lenCharacters)) {
2403                         characterIndex++;
2404                 } else {
2405                         position += lenBytes;
2406                         ReadCharacter();
2407                         characterIndex = 0;
2408                 }
2409                 return retVal;
2410         }
2411         UTF8Iterator &operator--() {
2412                 if (characterIndex) {
2413                         characterIndex--;
2414                 } else {
2415                         position = doc->NextPosition(position, -1);
2416                         ReadCharacter();
2417                         characterIndex = lenCharacters - 1;
2418                 }
2419                 return *this;
2420         }
2421         bool operator==(const UTF8Iterator &other) const {
2422                 // Only test the determining fields, not the character widths and values derived from this
2423                 return doc == other.doc &&
2424                         position == other.position &&
2425                         characterIndex == other.characterIndex;
2426         }
2427         bool operator!=(const UTF8Iterator &other) const {
2428                 // Only test the determining fields, not the character widths and values derived from this
2429                 return doc != other.doc ||
2430                         position != other.position ||
2431                         characterIndex != other.characterIndex;
2432         }
2433         int Pos() const {
2434                 return position;
2435         }
2436         int PosRoundUp() const {
2437                 if (characterIndex)
2438                         return position + lenBytes;     // Force to end of character
2439                 else
2440                         return position;
2441         }
2442 private:
2443         void ReadCharacter() {
2444                 Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position);
2445                 lenBytes = charExtracted.widthBytes;
2446                 if (charExtracted.character == unicodeReplacementChar) {
2447                         lenCharacters = 1;
2448                         buffered[0] = static_cast<wchar_t>(charExtracted.character);
2449                 } else {
2450                         lenCharacters = UTF16FromUTF32Character(charExtracted.character, buffered);
2451                 }
2452         }
2453 };
2454
2455 #else
2456
2457 // On Unix, report non-BMP characters as single characters
2458
2459 class UTF8Iterator : public std::iterator<std::bidirectional_iterator_tag, wchar_t> {
2460         const Document *doc;
2461         Position position;
2462 public:
2463         UTF8Iterator(const Document *doc_=0, Position position_=0) : doc(doc_), position(position_) {
2464         }
2465         UTF8Iterator(const UTF8Iterator &other) {
2466                 doc = other.doc;
2467                 position = other.position;
2468         }
2469         UTF8Iterator &operator=(const UTF8Iterator &other) {
2470                 if (this != &other) {
2471                         doc = other.doc;
2472                         position = other.position;
2473                 }
2474                 return *this;
2475         }
2476         wchar_t operator*() const {
2477                 Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position);
2478                 return charExtracted.character;
2479         }
2480         UTF8Iterator &operator++() {
2481                 position = doc->NextPosition(position, 1);
2482                 return *this;
2483         }
2484         UTF8Iterator operator++(int) {
2485                 UTF8Iterator retVal(*this);
2486                 position = doc->NextPosition(position, 1);
2487                 return retVal;
2488         }
2489         UTF8Iterator &operator--() {
2490                 position = doc->NextPosition(position, -1);
2491                 return *this;
2492         }
2493         bool operator==(const UTF8Iterator &other) const {
2494                 return doc == other.doc && position == other.position;
2495         }
2496         bool operator!=(const UTF8Iterator &other) const {
2497                 return doc != other.doc || position != other.position;
2498         }
2499         int Pos() const {
2500                 return position;
2501         }
2502         int PosRoundUp() const {
2503                 return position;
2504         }
2505 };
2506
2507 #endif
2508
2509 std::regex_constants::match_flag_type MatchFlags(const Document *doc, int startPos, int endPos) {
2510         std::regex_constants::match_flag_type flagsMatch = std::regex_constants::match_default;
2511         if (!doc->IsLineStartPosition(startPos))
2512                 flagsMatch |= std::regex_constants::match_not_bol;
2513         if (!doc->IsLineEndPosition(endPos))
2514                 flagsMatch |= std::regex_constants::match_not_eol;
2515         return flagsMatch;
2516 }
2517
2518 template<typename Iterator, typename Regex>
2519 bool MatchOnLines(const Document *doc, const Regex &regexp, const RESearchRange &resr, RESearch &search) {
2520         bool matched = false;
2521         std::match_results<Iterator> match;
2522
2523         // MSVC and libc++ have problems with ^ and $ matching line ends inside a range
2524         // If they didn't then the line by line iteration could be removed for the forwards
2525         // case and replaced with the following 4 lines:
2526         //      Iterator uiStart(doc, startPos);
2527         //      Iterator uiEnd(doc, endPos);
2528         //      flagsMatch = MatchFlags(doc, startPos, endPos);
2529         //      matched = std::regex_search(uiStart, uiEnd, match, regexp, flagsMatch);
2530
2531         // Line by line.
2532         for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) {
2533                 const Range lineRange = resr.LineRange(line);
2534                 Iterator itStart(doc, lineRange.start);
2535                 Iterator itEnd(doc, lineRange.end);
2536                 std::regex_constants::match_flag_type flagsMatch = MatchFlags(doc, lineRange.start, lineRange.end);
2537                 matched = std::regex_search(itStart, itEnd, match, regexp, flagsMatch);
2538                 // Check for the last match on this line.
2539                 if (matched) {
2540                         if (resr.increment == -1) {
2541                                 while (matched) {
2542                                         Iterator itNext(doc, match[0].second.PosRoundUp());
2543                                         flagsMatch = MatchFlags(doc, itNext.Pos(), lineRange.end);
2544                                         std::match_results<Iterator> matchNext;
2545                                         matched = std::regex_search(itNext, itEnd, matchNext, regexp, flagsMatch);
2546                                         if (matched) {
2547                                                 if (match[0].first == match[0].second) {
2548                                                         // Empty match means failure so exit
2549                                                         return false;
2550                                                 }
2551                                                 match = matchNext;
2552                                         }
2553                                 }
2554                                 matched = true;
2555                         }
2556                         break;
2557                 }
2558         }
2559         if (matched) {
2560                 for (size_t co = 0; co < match.size(); co++) {
2561                         search.bopat[co] = match[co].first.Pos();
2562                         search.eopat[co] = match[co].second.PosRoundUp();
2563                         size_t lenMatch = search.eopat[co] - search.bopat[co];
2564                         search.pat[co].resize(lenMatch);
2565                         for (size_t iPos = 0; iPos < lenMatch; iPos++) {
2566                                 search.pat[co][iPos] = doc->CharAt(iPos + search.bopat[co]);
2567                         }
2568                 }
2569         }
2570         return matched;
2571 }
2572
2573 long Cxx11RegexFindText(Document *doc, int minPos, int maxPos, const char *s,
2574         bool caseSensitive, int *length, RESearch &search) {
2575         const RESearchRange resr(doc, minPos, maxPos);
2576         try {
2577                 //ElapsedTime et;
2578                 std::regex::flag_type flagsRe = std::regex::ECMAScript;
2579                 // Flags that apper to have no effect:
2580                 // | std::regex::collate | std::regex::extended;
2581                 if (!caseSensitive)
2582                         flagsRe = flagsRe | std::regex::icase;
2583
2584                 // Clear the RESearch so can fill in matches
2585                 search.Clear();
2586
2587                 bool matched = false;
2588                 if (SC_CP_UTF8 == doc->dbcsCodePage) {
2589                         unsigned int lenS = static_cast<unsigned int>(strlen(s));
2590                         std::vector<wchar_t> ws(lenS + 1);
2591 #if WCHAR_T_IS_16
2592                         size_t outLen = UTF16FromUTF8(s, lenS, &ws[0], lenS);
2593 #else
2594                         size_t outLen = UTF32FromUTF8(s, lenS, reinterpret_cast<unsigned int *>(&ws[0]), lenS);
2595 #endif
2596                         ws[outLen] = 0;
2597                         std::wregex regexp;
2598 #if defined(__APPLE__)
2599                         // Using a UTF-8 locale doesn't change to Unicode over a byte buffer so '.'
2600                         // is one byte not one character.
2601                         // However, on OS X this makes wregex act as Unicode
2602                         std::locale localeU("en_US.UTF-8");
2603                         regexp.imbue(localeU);
2604 #endif
2605                         regexp.assign(&ws[0], flagsRe);
2606                         matched = MatchOnLines<UTF8Iterator>(doc, regexp, resr, search);
2607
2608                 } else {
2609                         std::regex regexp;
2610                         regexp.assign(s, flagsRe);
2611                         matched = MatchOnLines<ByteIterator>(doc, regexp, resr, search);
2612                 }
2613
2614                 int posMatch = -1;
2615                 if (matched) {
2616                         posMatch = search.bopat[0];
2617                         *length = search.eopat[0] - search.bopat[0];
2618                 }
2619                 // Example - search in doc/ScintillaHistory.html for
2620                 // [[:upper:]]eta[[:space:]]
2621                 // On MacBook, normally around 1 second but with locale imbued -> 14 seconds.
2622                 //double durSearch = et.Duration(true);
2623                 //Platform::DebugPrintf("Search:%9.6g \n", durSearch);
2624                 return posMatch;
2625         } catch (std::regex_error &) {
2626                 // Failed to create regular expression
2627                 throw RegexError();
2628         } catch (...) {
2629                 // Failed in some other way
2630                 return -1;
2631         }
2632 }
2633
2634 #endif
2635
2636 }
2637
2638 long BuiltinRegex::FindText(Document *doc, int minPos, int maxPos, const char *s,
2639                         bool caseSensitive, bool, bool, int flags,
2640                         int *length) {
2641
2642 #ifdef CXX11_REGEX
2643         if (flags & SCFIND_CXX11REGEX) {
2644                         return Cxx11RegexFindText(doc, minPos, maxPos, s,
2645                         caseSensitive, length, search);
2646         }
2647 #endif
2648
2649         const RESearchRange resr(doc, minPos, maxPos);
2650
2651         const bool posix = (flags & SCFIND_POSIX) != 0;
2652
2653         const char *errmsg = search.Compile(s, *length, caseSensitive, posix);
2654         if (errmsg) {
2655                 return -1;
2656         }
2657         // Find a variable in a property file: \$(\([A-Za-z0-9_.]+\))
2658         // Replace first '.' with '-' in each property file variable reference:
2659         //     Search: \$(\([A-Za-z0-9_-]+\)\.\([A-Za-z0-9_.]+\))
2660         //     Replace: $(\1-\2)
2661         int pos = -1;
2662         int lenRet = 0;
2663         const char searchEnd = s[*length - 1];
2664         const char searchEndPrev = (*length > 1) ? s[*length - 2] : '\0';
2665         for (int line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) {
2666                 int startOfLine = doc->LineStart(line);
2667                 int endOfLine = doc->LineEnd(line);
2668                 if (resr.increment == 1) {
2669                         if (line == resr.lineRangeStart) {
2670                                 if ((resr.startPos != startOfLine) && (s[0] == '^'))
2671                                         continue;       // Can't match start of line if start position after start of line
2672                                 startOfLine = resr.startPos;
2673                         }
2674                         if (line == resr.lineRangeEnd) {
2675                                 if ((resr.endPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))
2676                                         continue;       // Can't match end of line if end position before end of line
2677                                 endOfLine = resr.endPos;
2678                         }
2679                 } else {
2680                         if (line == resr.lineRangeEnd) {
2681                                 if ((resr.endPos != startOfLine) && (s[0] == '^'))
2682                                         continue;       // Can't match start of line if end position after start of line
2683                                 startOfLine = resr.endPos;
2684                         }
2685                         if (line == resr.lineRangeStart) {
2686                                 if ((resr.startPos != endOfLine) && (searchEnd == '$') && (searchEndPrev != '\\'))
2687                                         continue;       // Can't match end of line if start position before end of line
2688                                 endOfLine = resr.startPos;
2689                         }
2690                 }
2691
2692                 DocumentIndexer di(doc, endOfLine);
2693                 int success = search.Execute(di, startOfLine, endOfLine);
2694                 if (success) {
2695                         pos = search.bopat[0];
2696                         // Ensure only whole characters selected
2697                         search.eopat[0] = doc->MovePositionOutsideChar(search.eopat[0], 1, false);
2698                         lenRet = search.eopat[0] - search.bopat[0];
2699                         // There can be only one start of a line, so no need to look for last match in line
2700                         if ((resr.increment == -1) && (s[0] != '^')) {
2701                                 // Check for the last match on this line.
2702                                 int repetitions = 1000; // Break out of infinite loop
2703                                 while (success && (search.eopat[0] <= endOfLine) && (repetitions--)) {
2704                                         success = search.Execute(di, pos+1, endOfLine);
2705                                         if (success) {
2706                                                 if (search.eopat[0] <= minPos) {
2707                                                         pos = search.bopat[0];
2708                                                         lenRet = search.eopat[0] - search.bopat[0];
2709                                                 } else {
2710                                                         success = 0;
2711                                                 }
2712                                         }
2713                                 }
2714                         }
2715                         break;
2716                 }
2717         }
2718         *length = lenRet;
2719         return pos;
2720 }
2721
2722 const char *BuiltinRegex::SubstituteByPosition(Document *doc, const char *text, int *length) {
2723         substituted.clear();
2724         DocumentIndexer di(doc, doc->Length());
2725         search.GrabMatches(di);
2726         for (int j = 0; j < *length; j++) {
2727                 if (text[j] == '\\') {
2728                         if (text[j + 1] >= '0' && text[j + 1] <= '9') {
2729                                 unsigned int patNum = text[j + 1] - '0';
2730                                 unsigned int len = search.eopat[patNum] - search.bopat[patNum];
2731                                 if (!search.pat[patNum].empty())        // Will be null if try for a match that did not occur
2732                                         substituted.append(search.pat[patNum].c_str(), len);
2733                                 j++;
2734                         } else {
2735                                 j++;
2736                                 switch (text[j]) {
2737                                 case 'a':
2738                                         substituted.push_back('\a');
2739                                         break;
2740                                 case 'b':
2741                                         substituted.push_back('\b');
2742                                         break;
2743                                 case 'f':
2744                                         substituted.push_back('\f');
2745                                         break;
2746                                 case 'n':
2747                                         substituted.push_back('\n');
2748                                         break;
2749                                 case 'r':
2750                                         substituted.push_back('\r');
2751                                         break;
2752                                 case 't':
2753                                         substituted.push_back('\t');
2754                                         break;
2755                                 case 'v':
2756                                         substituted.push_back('\v');
2757                                         break;
2758                                 case '\\':
2759                                         substituted.push_back('\\');
2760                                         break;
2761                                 default:
2762                                         substituted.push_back('\\');
2763                                         j--;
2764                                 }
2765                         }
2766                 } else {
2767                         substituted.push_back(text[j]);
2768                 }
2769         }
2770         *length = static_cast<int>(substituted.length());
2771         return substituted.c_str();
2772 }
2773
2774 #ifndef SCI_OWNREGEX
2775
2776 #ifdef SCI_NAMESPACE
2777
2778 RegexSearchBase *Scintilla::CreateRegexSearch(CharClassify *charClassTable) {
2779         return new BuiltinRegex(charClassTable);
2780 }
2781
2782 #else
2783
2784 RegexSearchBase *CreateRegexSearch(CharClassify *charClassTable) {
2785         return new BuiltinRegex(charClassTable);
2786 }
2787
2788 #endif
2789
2790 #endif