docs/HtmlAgilityPack/HtmlDocument.cs

   1 // HtmlAgilityPack V1.0 - Simon Mourier <simon underscore mourier at hotmail dot com>
   2 using System;
   3 using System.Collections;
   4 using System.Collections.Generic;
   5 using System.IO;
   6 using System.Text;
   7 using System.Text.RegularExpressions;
   8 using System.Xml;
   9 using System.Xml.XPath;
  10
  11 namespace HtmlAgilityPack
  12 {
  13     /// <summary>
  14     /// Represents a complete HTML document.
  15     /// </summary>
  16     public class HtmlDocument : IXPathNavigable
  17     {
  18         #region Fields
  19
        // ---- Parser / loader state ----
        // Fields marked NOTE(review) are not read or written in the visible part of
        // this file; their descriptions are assumptions to confirm against Parse().

        private int _c; // NOTE(review): presumably the character currently being examined by the parser - confirm
        private Crc32 _crc32; // checksum accumulator; CheckSum reports 0 while this is null
        private HtmlAttribute _currentattribute; // NOTE(review): presumably the attribute being parsed - confirm
        private HtmlNode _currentnode; // NOTE(review): presumably the node being parsed - confirm
        private Encoding _declaredencoding; // exposed by DeclaredEncoding; reset to null at the start of Load/DetectEncoding
        private HtmlNode _documentnode; // root node; recreated by the constructor and by every Load/DetectEncoding
        private bool _fullcomment; // NOTE(review): parser flag, meaning not visible here
        private int _index; // NOTE(review): presumably the parser's position inside _text - confirm
        internal Hashtable _lastnodes = new Hashtable(); // NOTE(review): key/value types not visible here
        private HtmlNode _lastparentnode; // NOTE(review): parser bookkeeping, meaning not visible here
        private int _line; // NOTE(review): presumably the current line number while parsing - confirm
        private int _lineposition, _maxlineposition; // NOTE(review): presumably column tracking while parsing - confirm
        internal Hashtable _nodesid; // id lookup used by GetElementbyId (keys are lower-cased there); null when OptionUseIdAttribute is false
        private ParseState _oldstate; // NOTE(review): presumably the previous parser state - confirm
        private bool _onlyDetectEncoding; // set to true by DetectEncoding(TextReader), false by Load(TextReader)
        internal Hashtable _openednodes; // allocated only when OptionCheckSyntax is true; Load scans its values to report unclosed tags
        private List<HtmlParseError> _parseerrors = new List<HtmlParseError>(); // backing list for ParseErrors
        private string _remainder; // exposed by Remainder
        private int _remainderOffset; // exposed by RemainderOffset
        private ParseState _state; // NOTE(review): presumably the current parser state - confirm
        private Encoding _streamencoding; // CurrentEncoding of the StreamReader that was loaded, or null for other readers
        internal string _text; // complete source text read from the reader by Load/DetectEncoding
  42
        // Public option fields. They are plain fields (not properties); fields without
        // an initializer start out as false / null.

        /// <summary>
        /// Adds Debugging attributes to node. Default is false.
        /// </summary>
        public bool OptionAddDebuggingAttributes;

        /// <summary>
        /// Defines if closing for non closed nodes must be done at the end or directly in the document.
        /// Setting this to true can actually change how browsers render the page. Default is false.
        /// </summary>
        public bool OptionAutoCloseOnEnd; // close errors at the end

        /// <summary>
        /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
        /// </summary>
        public bool OptionCheckSyntax = true;

        /// <summary>
        /// Defines if a checksum must be computed for the document while parsing. Default is false.
        /// </summary>
        public bool OptionComputeChecksum;

        /// <summary>
        /// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
        /// </summary>
        public Encoding OptionDefaultStreamEncoding = Encoding.Default;

        /// <summary>
        /// Defines if source text must be extracted while parsing errors.
        /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically
        /// affected (slowed down) if set to true.
        /// Default is false.
        /// </summary>
        public bool OptionExtractErrorSourceText;

        /// <summary>
        /// Defines the maximum length of the source text extracted for a parse error. Default is 100.
        /// Only relevant when OptionExtractErrorSourceText is true.
        /// </summary>
        public int OptionExtractErrorSourceTextMaxLength = 100;

        /// <summary>
        /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
        /// </summary>
        public bool OptionFixNestedTags; // fix li, tr, th, td tags

        /// <summary>
        /// Defines if output must conform to XML, instead of HTML. Default is false.
        /// </summary>
        public bool OptionOutputAsXml;

        /// <summary>
        /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
        /// </summary>
        public bool OptionOutputOptimizeAttributeValues;

        /// <summary>
        /// Defines if names must be output with their original case. Useful for asp.net tags and attributes.
        /// Default is false.
        /// </summary>
        public bool OptionOutputOriginalCase;

        /// <summary>
        /// Defines if name must be output in uppercase. Default is false.
        /// </summary>
        public bool OptionOutputUpperCase;

        /// <summary>
        /// Defines if declared encoding must be read from the document.
        /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
        /// Default is true.
        /// </summary>
        public bool OptionReadEncoding = true;

        /// <summary>
        /// Defines the name of a node that will throw the StopperNodeException when found as an end node. Default is null.
        /// </summary>
        public string OptionStopperNodeName;

        /// <summary>
        /// Defines if the 'id' attribute must be specifically used. Default is true.
        /// When false, no id lookup table is built during loading and GetElementbyId throws.
        /// </summary>
        public bool OptionUseIdAttribute = true;

        /// <summary>
        /// Defines if empty nodes must be written as closed during output. Default is false.
        /// </summary>
        public bool OptionWriteEmptyNodes;
 131
 132         #endregion
 133
 134         #region Static Members
 135
        // Exception message text shared inside the assembly.
        // NOTE(review): this message is not used in the visible part of this file; presumably
        // node insertion/removal code elsewhere throws with it - confirm.
        internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";

        // Message thrown by GetElementbyId when no id lookup table exists
        // (the table is only created when OptionUseIdAttribute is true at load time).
        internal static readonly string HtmlExceptionUseIdAttributeFalse =
            "You need to set UseIdAttribute property to true to enable this feature";
 140
 141         #endregion
 142
 143         #region Constructors
 144
 145         /// <summary>
 146         /// Creates an instance of an HTML document.
 147         /// </summary>
 148         public HtmlDocument()
 149         {
 150             _documentnode = CreateNode(HtmlNodeType.Document, 0);
 151         }
 152
 153         #endregion
 154
 155         #region Properties
 156
 157         /// <summary>
 158         /// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
 159         /// </summary>
 160         public int CheckSum
 161         {
 162             get
 163             {
 164                 if (_crc32 == null)
 165                 {
 166                     return 0;
 167                 }
 168                 else
 169                 {
 170                     return (int) _crc32.CheckSum;
 171                 }
 172             }
 173         }
 174
 175         /// <summary>
 176         /// Gets the document's declared encoding.
 177         /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
 178         /// </summary>
 179         public Encoding DeclaredEncoding
 180         {
 181             get { return _declaredencoding; }
 182         }
 183
 184         /// <summary>
 185         /// Gets the root node of the document.
 186         /// </summary>
 187         public HtmlNode DocumentNode
 188         {
 189             get { return _documentnode; }
 190         }
 191
 192         /// <summary>
 193         /// Gets the document's output encoding.
 194         /// </summary>
 195         public Encoding Encoding
 196         {
 197             get { return GetOutEncoding(); }
 198         }
 199
 200         /// <summary>
 201         /// Gets a list of parse errors found in the document.
 202         /// </summary>
 203         public IEnumerable<HtmlParseError> ParseErrors
 204         {
 205             get { return _parseerrors; }
 206         }
 207
 208         /// <summary>
 209         /// Gets the remaining text.
 210         /// Will always be null if OptionStopperNodeName is null.
 211         /// </summary>
 212         public string Remainder
 213         {
 214             get { return _remainder; }
 215         }
 216
 217         /// <summary>
 218         /// Gets the offset of Remainder in the original Html text.
 219         /// If OptionStopperNodeName is null, this will return the length of the original Html text.
 220         /// </summary>
 221         public int RemainderOffset
 222         {
 223             get { return _remainderOffset; }
 224         }
 225
 226         /// <summary>
 227         /// Gets the document's stream encoding.
 228         /// </summary>
 229         public Encoding StreamEncoding
 230         {
 231             get { return _streamencoding; }
 232         }
 233
 234         #endregion
 235
 236         #region IXPathNavigable Members
 237
 238         /// <summary>
 239         /// Creates a new XPathNavigator object for navigating this HTML document.
 240         /// </summary>
 241         /// <returns>An XPathNavigator object. The XPathNavigator is positioned on the root of the document.</returns>
 242         public XPathNavigator CreateNavigator()
 243         {
 244             return new HtmlNodeNavigator(this, _documentnode);
 245         }
 246
 247         #endregion
 248
 249         #region Public Methods
 250
 251         /// <summary>
 252         /// Gets a valid XML name.
 253         /// </summary>
 254         /// <param name="name">Any text.</param>
 255         /// <returns>A string that is a valid XML name.</returns>
 256         public static string GetXmlName(string name)
 257         {
 258             string xmlname = string.Empty;
 259             bool nameisok = true;
 260             for (int i = 0; i < name.Length; i++)
 261             {
 262                 // names are lcase
 263                 // note: we are very limited here, too much?
 264                 if (((name[i] >= 'a') && (name[i] <= 'z')) ||
 265                     ((name[i] >= '0') && (name[i] <= '9')) ||
 266                     //                                  (name[i]==':') || (name[i]=='_') || (name[i]=='-') || (name[i]=='.')) // these are bads in fact
 267                     (name[i] == '_') || (name[i] == '-') || (name[i] == '.'))
 268                 {
 269                     xmlname += name[i];
 270                 }
 271                 else
 272                 {
 273                     nameisok = false;
 274                     byte[] bytes = Encoding.UTF8.GetBytes(new char[] {name[i]});
 275                     for (int j = 0; j < bytes.Length; j++)
 276                     {
 277                         xmlname += bytes[j].ToString("x2");
 278                     }
 279                     xmlname += "_";
 280                 }
 281             }
 282             if (nameisok)
 283             {
 284                 return xmlname;
 285             }
 286             return "_" + xmlname;
 287         }
 288
 289         /// <summary>
 290         /// Applies HTML encoding to a specified string.
 291         /// </summary>
 292         /// <param name="html">The input string to encode. May not be null.</param>
 293         /// <returns>The encoded string.</returns>
 294         public static string HtmlEncode(string html)
 295         {
 296             if (html == null)
 297             {
 298                 throw new ArgumentNullException("html");
 299             }
 300             // replace & by &amp; but only once!
 301             Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);
 302             return rx.Replace(html, "&amp;").Replace("<", "&lt;").Replace(">", "&gt;").Replace("\"", "&quot;");
 303         }
 304
 305         /// <summary>
 306         /// Determines if the specified character is considered as a whitespace character.
 307         /// </summary>
 308         /// <param name="c">The character to check.</param>
 309         /// <returns>true if if the specified character is considered as a whitespace character.</returns>
 310         public static bool IsWhiteSpace(int c)
 311         {
 312             if ((c == 10) || (c == 13) || (c == 32) || (c == 9))
 313             {
 314                 return true;
 315             }
 316             return false;
 317         }
 318
 319         /// <summary>
 320         /// Creates an HTML attribute with the specified name.
 321         /// </summary>
 322         /// <param name="name">The name of the attribute. May not be null.</param>
 323         /// <returns>The new HTML attribute.</returns>
 324         public HtmlAttribute CreateAttribute(string name)
 325         {
 326             if (name == null)
 327             {
 328                 throw new ArgumentNullException("name");
 329             }
 330             HtmlAttribute att = CreateAttribute();
 331             att.Name = name;
 332             return att;
 333         }
 334
 335         /// <summary>
 336         /// Creates an HTML attribute with the specified name.
 337         /// </summary>
 338         /// <param name="name">The name of the attribute. May not be null.</param>
 339         /// <param name="value">The value of the attribute.</param>
 340         /// <returns>The new HTML attribute.</returns>
 341         public HtmlAttribute CreateAttribute(string name, string value)
 342         {
 343             if (name == null)
 344             {
 345                 throw new ArgumentNullException("name");
 346             }
 347             HtmlAttribute att = CreateAttribute(name);
 348             att.Value = value;
 349             return att;
 350         }
 351
 352         /// <summary>
 353         /// Creates an HTML comment node.
 354         /// </summary>
 355         /// <returns>The new HTML comment node.</returns>
 356         public HtmlCommentNode CreateComment()
 357         {
 358             return (HtmlCommentNode) CreateNode(HtmlNodeType.Comment);
 359         }
 360
 361         /// <summary>
 362         /// Creates an HTML comment node with the specified comment text.
 363         /// </summary>
 364         /// <param name="comment">The comment text. May not be null.</param>
 365         /// <returns>The new HTML comment node.</returns>
 366         public HtmlCommentNode CreateComment(string comment)
 367         {
 368             if (comment == null)
 369             {
 370                 throw new ArgumentNullException("comment");
 371             }
 372             HtmlCommentNode c = CreateComment();
 373             c.Comment = comment;
 374             return c;
 375         }
 376
 377         /// <summary>
 378         /// Creates an HTML element node with the specified name.
 379         /// </summary>
 380         /// <param name="name">The qualified name of the element. May not be null.</param>
 381         /// <returns>The new HTML node.</returns>
 382         public HtmlNode CreateElement(string name)
 383         {
 384             if (name == null)
 385             {
 386                 throw new ArgumentNullException("name");
 387             }
 388             HtmlNode node = CreateNode(HtmlNodeType.Element);
 389             node.Name = name;
 390             return node;
 391         }
 392
 393         /// <summary>
 394         /// Creates an HTML text node.
 395         /// </summary>
 396         /// <returns>The new HTML text node.</returns>
 397         public HtmlTextNode CreateTextNode()
 398         {
 399             return (HtmlTextNode) CreateNode(HtmlNodeType.Text);
 400         }
 401
 402         /// <summary>
 403         /// Creates an HTML text node with the specified text.
 404         /// </summary>
 405         /// <param name="text">The text of the node. May not be null.</param>
 406         /// <returns>The new HTML text node.</returns>
 407         public HtmlTextNode CreateTextNode(string text)
 408         {
 409             if (text == null)
 410             {
 411                 throw new ArgumentNullException("text");
 412             }
 413             HtmlTextNode t = CreateTextNode();
 414             t.Text = text;
 415             return t;
 416         }
 417
 418         /// <summary>
 419         /// Detects the encoding of an HTML stream.
 420         /// </summary>
 421         /// <param name="stream">The input stream. May not be null.</param>
 422         /// <returns>The detected encoding.</returns>
 423         public Encoding DetectEncoding(Stream stream)
 424         {
 425             if (stream == null)
 426             {
 427                 throw new ArgumentNullException("stream");
 428             }
 429             return DetectEncoding(new StreamReader(stream));
 430         }
 431
 432         /// <summary>
 433         /// Detects the encoding of an HTML file.
 434         /// </summary>
 435         /// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
 436         /// <returns>The detected encoding.</returns>
 437         public Encoding DetectEncoding(string path)
 438         {
 439             if (path == null)
 440             {
 441                 throw new ArgumentNullException("path");
 442             }
 443             StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding);
 444             Encoding encoding = DetectEncoding(sr);
 445             sr.Close();
 446             return encoding;
 447         }
 448
 449         /// <summary>
 450         /// Detects the encoding of an HTML text provided on a TextReader.
 451         /// </summary>
 452         /// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
 453         /// <returns>The detected encoding.</returns>
 454         public Encoding DetectEncoding(TextReader reader)
 455         {
 456             if (reader == null)
 457             {
 458                 throw new ArgumentNullException("reader");
 459             }
 460             _onlyDetectEncoding = true;
 461             if (OptionCheckSyntax)
 462             {
 463                 _openednodes = new Hashtable();
 464             }
 465             else
 466             {
 467                 _openednodes = null;
 468             }
 469
 470             if (OptionUseIdAttribute)
 471             {
 472                 _nodesid = new Hashtable();
 473             }
 474             else
 475             {
 476                 _nodesid = null;
 477             }
 478
 479             StreamReader sr = reader as StreamReader;
 480             if (sr != null)
 481             {
 482                 _streamencoding = sr.CurrentEncoding;
 483             }
 484             else
 485             {
 486                 _streamencoding = null;
 487             }
 488             _declaredencoding = null;
 489
 490             _text = reader.ReadToEnd();
 491             _documentnode = CreateNode(HtmlNodeType.Document, 0);
 492
 493             // this is almost a hack, but it allows us not to muck with the original parsing code
 494             try
 495             {
 496                 Parse();
 497             }
 498             catch (EncodingFoundException ex)
 499             {
 500                 return ex.Encoding;
 501             }
 502             return null;
 503         }
 504
 505         /// <summary>
 506         /// Detects the encoding of an HTML document from a file first, and then loads the file.
 507         /// </summary>
 508         /// <param name="path">The complete file path to be read.</param>
 509         public void DetectEncodingAndLoad(string path)
 510         {
 511             DetectEncodingAndLoad(path, true);
 512         }
 513
 514         /// <summary>
 515         /// Detects the encoding of an HTML document from a file first, and then loads the file.
 516         /// </summary>
 517         /// <param name="path">The complete file path to be read. May not be null.</param>
 518         /// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
 519         public void DetectEncodingAndLoad(string path, bool detectEncoding)
 520         {
 521             if (path == null)
 522             {
 523                 throw new ArgumentNullException("path");
 524             }
 525             Encoding enc;
 526             if (detectEncoding)
 527             {
 528                 enc = DetectEncoding(path);
 529             }
 530             else
 531             {
 532                 enc = null;
 533             }
 534
 535             if (enc == null)
 536             {
 537                 Load(path);
 538             }
 539             else
 540             {
 541                 Load(path, enc);
 542             }
 543         }
 544
 545         /// <summary>
 546         /// Detects the encoding of an HTML text.
 547         /// </summary>
 548         /// <param name="html">The input html text. May not be null.</param>
 549         /// <returns>The detected encoding.</returns>
 550         public Encoding DetectEncodingHtml(string html)
 551         {
 552             if (html == null)
 553             {
 554                 throw new ArgumentNullException("html");
 555             }
 556             StringReader sr = new StringReader(html);
 557             Encoding encoding = DetectEncoding(sr);
 558             sr.Close();
 559             return encoding;
 560         }
 561
 562         /// <summary>
 563         /// Gets the HTML node with the specified 'id' attribute value.
 564         /// </summary>
 565         /// <param name="id">The attribute id to match. May not be null.</param>
 566         /// <returns>The HTML node with the matching id or null if not found.</returns>
 567         public HtmlNode GetElementbyId(string id)
 568         {
 569             if (id == null)
 570             {
 571                 throw new ArgumentNullException("id");
 572             }
 573             if (_nodesid == null)
 574             {
 575                 throw new Exception(HtmlExceptionUseIdAttributeFalse);
 576             }
 577
 578             return _nodesid[id.ToLower()] as HtmlNode;
 579         }
 580
 581         /// <summary>
 582         /// Loads an HTML document from a stream.
 583         /// </summary>
 584         /// <param name="stream">The input stream.</param>
 585         public void Load(Stream stream)
 586         {
 587             Load(new StreamReader(stream, OptionDefaultStreamEncoding));
 588         }
 589
 590         /// <summary>
 591         /// Loads an HTML document from a stream.
 592         /// </summary>
 593         /// <param name="stream">The input stream.</param>
 594         /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 595         public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
 596         {
 597             Load(new StreamReader(stream, detectEncodingFromByteOrderMarks));
 598         }
 599
 600         /// <summary>
 601         /// Loads an HTML document from a stream.
 602         /// </summary>
 603         /// <param name="stream">The input stream.</param>
 604         /// <param name="encoding">The character encoding to use.</param>
 605         public void Load(Stream stream, Encoding encoding)
 606         {
 607             Load(new StreamReader(stream, encoding));
 608         }
 609
 610         /// <summary>
 611         /// Loads an HTML document from a stream.
 612         /// </summary>
 613         /// <param name="stream">The input stream.</param>
 614         /// <param name="encoding">The character encoding to use.</param>
 615         /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 616         public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
 617         {
 618             Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks));
 619         }
 620
 621         /// <summary>
 622         /// Loads an HTML document from a stream.
 623         /// </summary>
 624         /// <param name="stream">The input stream.</param>
 625         /// <param name="encoding">The character encoding to use.</param>
 626         /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 627         /// <param name="buffersize">The minimum buffer size.</param>
 628         public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
 629         {
 630             Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize));
 631         }
 632
 633         /// <summary>
 634         /// Loads an HTML document from a file.
 635         /// </summary>
 636         /// <param name="path">The complete file path to be read. May not be null.</param>
 637         public void Load(string path)
 638         {
 639             if (path == null)
 640             {
 641                 throw new ArgumentNullException("path");
 642             }
 643             StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding);
 644             Load(sr);
 645             sr.Close();
 646         }
 647
 648         /// <summary>
 649         /// Loads an HTML document from a file.
 650         /// </summary>
 651         /// <param name="path">The complete file path to be read. May not be null.</param>
 652         /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 653         public void Load(string path, bool detectEncodingFromByteOrderMarks)
 654         {
 655             if (path == null)
 656             {
 657                 throw new ArgumentNullException("path");
 658             }
 659             StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks);
 660             Load(sr);
 661             sr.Close();
 662         }
 663
 664         /// <summary>
 665         /// Loads an HTML document from a file.
 666         /// </summary>
 667         /// <param name="path">The complete file path to be read. May not be null.</param>
 668         /// <param name="encoding">The character encoding to use. May not be null.</param>
 669         public void Load(string path, Encoding encoding)
 670         {
 671             if (path == null)
 672             {
 673                 throw new ArgumentNullException("path");
 674             }
 675             if (encoding == null)
 676             {
 677                 throw new ArgumentNullException("encoding");
 678             }
 679             StreamReader sr = new StreamReader(path, encoding);
 680             Load(sr);
 681             sr.Close();
 682         }
 683
 684         /// <summary>
 685         /// Loads an HTML document from a file.
 686         /// </summary>
 687         /// <param name="path">The complete file path to be read. May not be null.</param>
 688         /// <param name="encoding">The character encoding to use. May not be null.</param>
 689         /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 690         public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
 691         {
 692             if (path == null)
 693             {
 694                 throw new ArgumentNullException("path");
 695             }
 696             if (encoding == null)
 697             {
 698                 throw new ArgumentNullException("encoding");
 699             }
 700             StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks);
 701             Load(sr);
 702             sr.Close();
 703         }
 704
 705         /// <summary>
 706         /// Loads an HTML document from a file.
 707         /// </summary>
 708         /// <param name="path">The complete file path to be read. May not be null.</param>
 709         /// <param name="encoding">The character encoding to use. May not be null.</param>
 710         /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 711         /// <param name="buffersize">The minimum buffer size.</param>
 712         public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
 713         {
 714             if (path == null)
 715             {
 716                 throw new ArgumentNullException("path");
 717             }
 718             if (encoding == null)
 719             {
 720                 throw new ArgumentNullException("encoding");
 721             }
 722             StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize);
 723             Load(sr);
 724             sr.Close();
 725         }
 726
 727         /// <summary>
 728         /// Loads the HTML document from the specified TextReader.
 729         /// </summary>
 730         /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
 731         public void Load(TextReader reader)
 732         {
 733             // all Load methods pass down to this one
 734             if (reader == null)
 735             {
 736                 throw new ArgumentNullException("reader");
 737             }
 738
 739             _onlyDetectEncoding = false;
 740
 741             if (OptionCheckSyntax)
 742             {
 743                 _openednodes = new Hashtable();
 744             }
 745             else
 746             {
 747                 _openednodes = null;
 748             }
 749
 750             if (OptionUseIdAttribute)
 751             {
 752                 _nodesid = new Hashtable();
 753             }
 754             else
 755             {
 756                 _nodesid = null;
 757             }
 758
 759             StreamReader sr = reader as StreamReader;
 760             if (sr != null)
 761             {
 762                 try
 763                 {
 764                     // trigger bom read if needed
 765                     sr.Peek();
 766                 }
 767                     // ReSharper disable EmptyGeneralCatchClause
 768                 catch (Exception)
 769                     // ReSharper restore EmptyGeneralCatchClause
 770                 {
 771                     // void on purpose
 772                 }
 773                 _streamencoding = sr.CurrentEncoding;
 774             }
 775             else
 776             {
 777                 _streamencoding = null;
 778             }
 779             _declaredencoding = null;
 780
 781             _text = reader.ReadToEnd();
 782             _documentnode = CreateNode(HtmlNodeType.Document, 0);
 783             Parse();
 784
 785             if (OptionCheckSyntax)
 786             {
 787                 foreach (HtmlNode node in _openednodes.Values)
 788                 {
 789                     if (!node._starttag) // already reported
 790                     {
 791                         continue;
 792                     }
 793
 794                     string html;
 795                     if (OptionExtractErrorSourceText)
 796                     {
 797                         html = node.OuterHtml;
 798                         if (html.Length > OptionExtractErrorSourceTextMaxLength)
 799                         {
 800                             html = html.Substring(0, OptionExtractErrorSourceTextMaxLength);
 801                         }
 802                     }
 803                     else
 804                     {
 805                         html = string.Empty;
 806                     }
 807                     AddError(
 808                         HtmlParseErrorCode.TagNotClosed,
 809                         node._line, node._lineposition,
 810                         node._streamposition, html,
 811                         "End tag </" + node.Name + "> was not found");
 812                 }
 813
 814                 // we don't need this anymore
 815                 _openednodes.Clear();
 816             }
 817         }
 818
 819         /// <summary>
 820         /// Loads the HTML document from the specified string.
 821         /// </summary>
 822         /// <param name="html">String containing the HTML document to load. May not be null.</param>
 823         public void LoadHtml(string html)
 824         {
 825             if (html == null)
 826             {
 827                 throw new ArgumentNullException("html");
 828             }
 829             StringReader sr = new StringReader(html);
 830             Load(sr);
 831             sr.Close();
 832         }
 833
 834         /// <summary>
 835         /// Saves the HTML document to the specified stream.
 836         /// </summary>
 837         /// <param name="outStream">The stream to which you want to save.</param>
 838         public void Save(Stream outStream)
 839         {
 840             StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
 841             Save(sw);
 842         }
 843
 844         /// <summary>
 845         /// Saves the HTML document to the specified stream.
 846         /// </summary>
 847         /// <param name="outStream">The stream to which you want to save. May not be null.</param>
 848         /// <param name="encoding">The character encoding to use. May not be null.</param>
 849         public void Save(Stream outStream, Encoding encoding)
 850         {
 851             if (outStream == null)
 852             {
 853                 throw new ArgumentNullException("outStream");
 854             }
 855             if (encoding == null)
 856             {
 857                 throw new ArgumentNullException("encoding");
 858             }
 859             StreamWriter sw = new StreamWriter(outStream, encoding);
 860             Save(sw);
 861         }
 862
 863         /// <summary>
 864         /// Saves the mixed document to the specified file.
 865         /// </summary>
 866         /// <param name="filename">The location of the file where you want to save the document.</param>
 867         public void Save(string filename)
 868         {
 869             StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding());
 870             Save(sw);
 871             sw.Close();
 872         }
 873
 874         /// <summary>
 875         /// Saves the mixed document to the specified file.
 876         /// </summary>
 877         /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
 878         /// <param name="encoding">The character encoding to use. May not be null.</param>
 879         public void Save(string filename, Encoding encoding)
 880         {
 881             if (filename == null)
 882             {
 883                 throw new ArgumentNullException("filename");
 884             }
 885             if (encoding == null)
 886             {
 887                 throw new ArgumentNullException("encoding");
 888             }
 889             StreamWriter sw = new StreamWriter(filename, false, encoding);
 890             Save(sw);
 891             sw.Close();
 892         }
 893
 894         /// <summary>
 895         /// Saves the HTML document to the specified StreamWriter.
 896         /// </summary>
 897         /// <param name="writer">The StreamWriter to which you want to save.</param>
 898         public void Save(StreamWriter writer)
 899         {
 900             Save((TextWriter) writer);
 901         }
 902
 903         /// <summary>
 904         /// Saves the HTML document to the specified TextWriter.
 905         /// </summary>
 906         /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
 907         public void Save(TextWriter writer)
 908         {
 909             if (writer == null)
 910             {
 911                 throw new ArgumentNullException("writer");
 912             }
 913             DocumentNode.WriteTo(writer);
 914         }
 915
 916         /// <summary>
 917         /// Saves the HTML document to the specified XmlWriter.
 918         /// </summary>
 919         /// <param name="writer">The XmlWriter to which you want to save.</param>
 920         public void Save(XmlWriter writer)
 921         {
 922             DocumentNode.WriteTo(writer);
 923             writer.Flush();
 924         }
 925
 926         #endregion
 927
 928         #region Internal Methods
 929
 930         internal HtmlAttribute CreateAttribute()
 931         {
 932             return new HtmlAttribute(this);
 933         }
 934
 935         internal HtmlNode CreateNode(HtmlNodeType type)
 936         {
 937             return CreateNode(type, -1);
 938         }
 939
 940         internal HtmlNode CreateNode(HtmlNodeType type, int index)
 941         {
 942             switch (type)
 943             {
 944                 case HtmlNodeType.Comment:
 945                     return new HtmlCommentNode(this, index);
 946
 947                 case HtmlNodeType.Text:
 948                     return new HtmlTextNode(this, index);
 949
 950                 default:
 951                     return new HtmlNode(type, this, index);
 952             }
 953         }
 954
 955         internal Encoding GetOutEncoding()
 956         {
 957             // when unspecified, use the stream encoding first
 958             if (_declaredencoding != null)
 959             {
 960                 return _declaredencoding;
 961             }
 962             else
 963             {
 964                 if (_streamencoding != null)
 965                 {
 966                     return _streamencoding;
 967                 }
 968             }
 969             return OptionDefaultStreamEncoding;
 970         }
 971
 972         internal HtmlNode GetXmlDeclaration()
 973         {
 974             if (!_documentnode.HasChildNodes)
 975             {
 976                 return null;
 977             }
 978
 979             foreach (HtmlNode node in _documentnode._childnodes)
 980             {
 981                 if (node.Name == "?xml") // it's ok, names are case sensitive
 982                 {
 983                     return node;
 984                 }
 985             }
 986             return null;
 987         }
 988
 989         internal void SetIdForNode(HtmlNode node, string id)
 990         {
 991             if (!OptionUseIdAttribute)
 992             {
 993                 return;
 994             }
 995
 996             if ((_nodesid == null) || (id == null))
 997             {
 998                 return;
 999             }
1000
1001             if (node == null)
1002             {
1003                 _nodesid.Remove(id.ToLower());
1004             }
1005             else
1006             {
1007                 _nodesid[id.ToLower()] = node;
1008             }
1009         }
1010
1011         internal void UpdateLastParentNode()
1012         {
1013             do
1014             {
1015                 if (_lastparentnode.Closed)
1016                 {
1017                     _lastparentnode = _lastparentnode.ParentNode;
1018                 }
1019             } while ((_lastparentnode != null) && (_lastparentnode.Closed));
1020             if (_lastparentnode == null)
1021             {
1022                 _lastparentnode = _documentnode;
1023             }
1024         }
1025
1026         #endregion
1027
1028         #region Private Methods
1029
1030         private HtmlParseError AddError(
1031             HtmlParseErrorCode code,
1032             int line,
1033             int linePosition,
1034             int streamPosition,
1035             string sourceText,
1036             string reason)
1037         {
1038             HtmlParseError err = new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);
1039             _parseerrors.Add(err);
1040             return err;
1041         }
1042
1043         private void CloseCurrentNode()
1044         {
1045             if (_currentnode.Closed) // text or document are by def closed
1046                 return;
1047
1048             bool error = false;
1049
1050             // find last node of this kind
1051             HtmlNode prev = (HtmlNode) _lastnodes[_currentnode.Name];
1052             if (prev == null)
1053             {
1054                 if (HtmlNode.IsClosedElement(_currentnode.Name))
1055                 {
1056                     // </br> will be seen as <br>
1057                     _currentnode.CloseNode(_currentnode);
1058
1059                     // add to parent node
1060                     if (_lastparentnode != null)
1061                     {
1062                         HtmlNode foundNode = null;
1063                         Stack futureChild = new Stack();
1064                         for (HtmlNode node = _lastparentnode.LastChild; node != null; node = node.PreviousSibling)
1065                         {
1066                             if ((node.Name == _currentnode.Name) && (!node.HasChildNodes))
1067                             {
1068                                 foundNode = node;
1069                                 break;
1070                             }
1071                             futureChild.Push(node);
1072                         }
1073                         if (foundNode != null)
1074                         {
1075                             HtmlNode node = null;
1076                             while (futureChild.Count != 0)
1077                             {
1078                                 node = (HtmlNode) futureChild.Pop();
1079                                 _lastparentnode.RemoveChild(node);
1080                                 foundNode.AppendChild(node);
1081                             }
1082                         }
1083                         else
1084                         {
1085                             _lastparentnode.AppendChild(_currentnode);
1086                         }
1087                     }
1088                 }
1089                 else
1090                 {
1091                     // node has no parent
1092                     // node is not a closed node
1093
1094                     if (HtmlNode.CanOverlapElement(_currentnode.Name))
1095                     {
1096                         // this is a hack: add it as a text node
1097                         HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
1098                         closenode._outerlength = _currentnode._outerlength;
1099                         ((HtmlTextNode) closenode).Text = ((HtmlTextNode) closenode).Text.ToLower();
1100                         if (_lastparentnode != null)
1101                         {
1102                             _lastparentnode.AppendChild(closenode);
1103                         }
1104                     }
1105                     else
1106                     {
1107                         if (HtmlNode.IsEmptyElement(_currentnode.Name))
1108                         {
1109                             AddError(
1110                                 HtmlParseErrorCode.EndTagNotRequired,
1111                                 _currentnode._line, _currentnode._lineposition,
1112                                 _currentnode._streamposition, _currentnode.OuterHtml,
1113                                 "End tag </" + _currentnode.Name + "> is not required");
1114                         }
1115                         else
1116                         {
1117                             // node cannot overlap, node is not empty
1118                             AddError(
1119                                 HtmlParseErrorCode.TagNotOpened,
1120                                 _currentnode._line, _currentnode._lineposition,
1121                                 _currentnode._streamposition, _currentnode.OuterHtml,
1122                                 "Start tag <" + _currentnode.Name + "> was not found");
1123                             error = true;
1124                         }
1125                     }
1126                 }
1127             }
1128             else
1129             {
1130                 if (OptionFixNestedTags)
1131                 {
1132                     if (FindResetterNodes(prev, GetResetters(_currentnode.Name)))
1133                     {
1134                         AddError(
1135                             HtmlParseErrorCode.EndTagInvalidHere,
1136                             _currentnode._line, _currentnode._lineposition,
1137                             _currentnode._streamposition, _currentnode.OuterHtml,
1138                             "End tag </" + _currentnode.Name + "> invalid here");
1139                         error = true;
1140                     }
1141                 }
1142
1143                 if (!error)
1144                 {
1145                     _lastnodes[_currentnode.Name] = prev._prevwithsamename;
1146                     prev.CloseNode(_currentnode);
1147                 }
1148             }
1149
1150
1151             // we close this node, get grandparent
1152             if (!error)
1153             {
1154                 if ((_lastparentnode != null) &&
1155                     ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
1156                      (_currentnode._starttag)))
1157                 {
1158                     UpdateLastParentNode();
1159                 }
1160             }
1161         }
1162
1163         private string CurrentAttributeName()
1164         {
1165             return _text.Substring(_currentattribute._namestartindex, _currentattribute._namelength);
1166         }
1167
1168         private string CurrentAttributeValue()
1169         {
1170             return _text.Substring(_currentattribute._valuestartindex, _currentattribute._valuelength);
1171         }
1172
1173         private string CurrentNodeInner()
1174         {
1175             return _text.Substring(_currentnode._innerstartindex, _currentnode._innerlength);
1176         }
1177
1178         private string CurrentNodeName()
1179         {
1180             return _text.Substring(_currentnode._namestartindex, _currentnode._namelength);
1181         }
1182
1183         private string CurrentNodeOuter()
1184         {
1185             return _text.Substring(_currentnode._outerstartindex, _currentnode._outerlength);
1186         }
1187
1188
1189         private void DecrementPosition()
1190         {
1191             _index--;
1192             if (_lineposition == 1)
1193             {
1194                 _lineposition = _maxlineposition;
1195                 _line--;
1196             }
1197             else
1198             {
1199                 _lineposition--;
1200             }
1201         }
1202
1203         private HtmlNode FindResetterNode(HtmlNode node, string name)
1204         {
1205             HtmlNode resetter = (HtmlNode) _lastnodes[name];
1206             if (resetter == null)
1207                 return null;
1208             if (resetter.Closed)
1209             {
1210                 return null;
1211             }
1212             if (resetter._streamposition < node._streamposition)
1213             {
1214                 return null;
1215             }
1216             return resetter;
1217         }
1218
1219         private bool FindResetterNodes(HtmlNode node, string[] names)
1220         {
1221             if (names == null)
1222             {
1223                 return false;
1224             }
1225             for (int i = 0; i < names.Length; i++)
1226             {
1227                 if (FindResetterNode(node, names[i]) != null)
1228                 {
1229                     return true;
1230                 }
1231             }
1232             return false;
1233         }
1234
1235         private void FixNestedTag(string name, string[] resetters)
1236         {
1237             if (resetters == null)
1238                 return;
1239
1240             HtmlNode prev;
1241
1242             // if we find a previous unclosed same name node, without a resetter node between, we must close it
1243             prev = (HtmlNode) _lastnodes[name];
1244             if ((prev != null) && (!prev.Closed))
1245             {
1246                 // try to find a resetter node, if found, we do nothing
1247                 if (FindResetterNodes(prev, resetters))
1248                 {
1249                     return;
1250                 }
1251
1252                 // ok we need to close the prev now
1253                 // create a fake closer node
1254                 HtmlNode close = new HtmlNode(prev.NodeType, this, -1);
1255                 close._endnode = close;
1256                 prev.CloseNode(close);
1257             }
1258         }
1259
1260         private void FixNestedTags()
1261         {
1262             // we are only interested by start tags, not closing tags
1263             if (!_currentnode._starttag)
1264                 return;
1265
1266             string name = CurrentNodeName();
1267             FixNestedTag(name, GetResetters(name));
1268         }
1269
1270         private string[] GetResetters(string name)
1271         {
1272             switch (name)
1273             {
1274                 case "li":
1275                     return new string[] {"ul"};
1276
1277                 case "tr":
1278                     return new string[] {"table"};
1279
1280                 case "th":
1281                 case "td":
1282                     return new string[] {"tr", "table"};
1283
1284                 default:
1285                     return null;
1286             }
1287         }
1288
1289         private void IncrementPosition()
1290         {
1291             if (_crc32 != null)
1292             {
1293                 // REVIEW: should we add some checksum code in DecrementPosition too?
1294                 _crc32.AddToCRC32(_c);
1295             }
1296
1297             _index++;
1298             _maxlineposition = _lineposition;
1299             if (_c == 10)
1300             {
1301                 _lineposition = 1;
1302                 _line++;
1303             }
1304             else
1305             {
1306                 _lineposition++;
1307             }
1308         }
1309
1310         private bool NewCheck()
1311         {
1312             if (_c != '<')
1313             {
1314                 return false;
1315             }
1316             if (_index < _text.Length)
1317             {
1318                 if (_text[_index] == '%')
1319                 {
1320                     switch (_state)
1321                     {
1322                         case ParseState.AttributeAfterEquals:
1323                             PushAttributeValueStart(_index - 1);
1324                             break;
1325
1326                         case ParseState.BetweenAttributes:
1327                             PushAttributeNameStart(_index - 1);
1328                             break;
1329
1330                         case ParseState.WhichTag:
1331                             PushNodeNameStart(true, _index - 1);
1332                             _state = ParseState.Tag;
1333                             break;
1334                     }
1335                     _oldstate = _state;
1336                     _state = ParseState.ServerSideCode;
1337                     return true;
1338                 }
1339             }
1340
1341             if (!PushNodeEnd(_index - 1, true))
1342             {
1343                 // stop parsing
1344                 _index = _text.Length;
1345                 return true;
1346             }
1347             _state = ParseState.WhichTag;
1348             if ((_index - 1) <= (_text.Length - 2))
1349             {
1350                 if (_text[_index] == '!')
1351                 {
1352                     PushNodeStart(HtmlNodeType.Comment, _index - 1);
1353                     PushNodeNameStart(true, _index);
1354                     PushNodeNameEnd(_index + 1);
1355                     _state = ParseState.Comment;
1356                     if (_index < (_text.Length - 2))
1357                     {
1358                         if ((_text[_index + 1] == '-') &&
1359                             (_text[_index + 2] == '-'))
1360                         {
1361                             _fullcomment = true;
1362                         }
1363                         else
1364                         {
1365                             _fullcomment = false;
1366                         }
1367                     }
1368                     return true;
1369                 }
1370             }
1371             PushNodeStart(HtmlNodeType.Element, _index - 1);
1372             return true;
1373         }
1374
1375         private void Parse()
1376         {
1377             int lastquote = 0;
1378             if (OptionComputeChecksum)
1379             {
1380                 _crc32 = new Crc32();
1381             }
1382
1383             _lastnodes = new Hashtable();
1384             _c = 0;
1385             _fullcomment = false;
1386             _parseerrors = new List<HtmlParseError>();
1387             _line = 1;
1388             _lineposition = 1;
1389             _maxlineposition = 1;
1390
1391             _state = ParseState.Text;
1392             _oldstate = _state;
1393             _documentnode._innerlength = _text.Length;
1394             _documentnode._outerlength = _text.Length;
1395             _remainderOffset = _text.Length;
1396
1397             _lastparentnode = _documentnode;
1398             _currentnode = CreateNode(HtmlNodeType.Text, 0);
1399             _currentattribute = null;
1400
1401             _index = 0;
1402             PushNodeStart(HtmlNodeType.Text, 0);
1403             while (_index < _text.Length)
1404             {
1405                 _c = _text[_index];
1406                 IncrementPosition();
1407
1408                 switch (_state)
1409                 {
1410                     case ParseState.Text:
1411                         if (NewCheck())
1412                             continue;
1413                         break;
1414
1415                     case ParseState.WhichTag:
1416                         if (NewCheck())
1417                             continue;
1418                         if (_c == '/')
1419                         {
1420                             PushNodeNameStart(false, _index);
1421                         }
1422                         else
1423                         {
1424                             PushNodeNameStart(true, _index - 1);
1425                             DecrementPosition();
1426                         }
1427                         _state = ParseState.Tag;
1428                         break;
1429
1430                     case ParseState.Tag:
1431                         if (NewCheck())
1432                             continue;
1433                         if (IsWhiteSpace(_c))
1434                         {
1435                             PushNodeNameEnd(_index - 1);
1436                             if (_state != ParseState.Tag)
1437                                 continue;
1438                             _state = ParseState.BetweenAttributes;
1439                             continue;
1440                         }
1441                         if (_c == '/')
1442                         {
1443                             PushNodeNameEnd(_index - 1);
1444                             if (_state != ParseState.Tag)
1445                                 continue;
1446                             _state = ParseState.EmptyTag;
1447                             continue;
1448                         }
1449                         if (_c == '>')
1450                         {
1451                             PushNodeNameEnd(_index - 1);
1452                             if (_state != ParseState.Tag)
1453                                 continue;
1454                             if (!PushNodeEnd(_index, false))
1455                             {
1456                                 // stop parsing
1457                                 _index = _text.Length;
1458                                 break;
1459                             }
1460                             if (_state != ParseState.Tag)
1461                                 continue;
1462                             _state = ParseState.Text;
1463                             PushNodeStart(HtmlNodeType.Text, _index);
1464                         }
1465                         break;
1466
1467                     case ParseState.BetweenAttributes:
1468                         if (NewCheck())
1469                             continue;
1470
1471                         if (IsWhiteSpace(_c))
1472                             continue;
1473
1474                         if ((_c == '/') || (_c == '?'))
1475                         {
1476                             _state = ParseState.EmptyTag;
1477                             continue;
1478                         }
1479
1480                         if (_c == '>')
1481                         {
1482                             if (!PushNodeEnd(_index, false))
1483                             {
1484                                 // stop parsing
1485                                 _index = _text.Length;
1486                                 break;
1487                             }
1488
1489                             if (_state != ParseState.BetweenAttributes)
1490                                 continue;
1491                             _state = ParseState.Text;
1492                             PushNodeStart(HtmlNodeType.Text, _index);
1493                             continue;
1494                         }
1495
1496                         PushAttributeNameStart(_index - 1);
1497                         _state = ParseState.AttributeName;
1498                         break;
1499
1500                     case ParseState.EmptyTag:
1501                         if (NewCheck())
1502                             continue;
1503
1504                         if (_c == '>')
1505                         {
1506                             if (!PushNodeEnd(_index, true))
1507                             {
1508                                 // stop parsing
1509                                 _index = _text.Length;
1510                                 break;
1511                             }
1512
1513                             if (_state != ParseState.EmptyTag)
1514                                 continue;
1515                             _state = ParseState.Text;
1516                             PushNodeStart(HtmlNodeType.Text, _index);
1517                             continue;
1518                         }
1519                         _state = ParseState.BetweenAttributes;
1520                         break;
1521
1522                     case ParseState.AttributeName:
1523                         if (NewCheck())
1524                             continue;
1525
1526                         if (IsWhiteSpace(_c))
1527                         {
1528                             PushAttributeNameEnd(_index - 1);
1529                             _state = ParseState.AttributeBeforeEquals;
1530                             continue;
1531                         }
1532                         if (_c == '=')
1533                         {
1534                             PushAttributeNameEnd(_index - 1);
1535                             _state = ParseState.AttributeAfterEquals;
1536                             continue;
1537                         }
1538                         if (_c == '>')
1539                         {
1540                             PushAttributeNameEnd(_index - 1);
1541                             if (!PushNodeEnd(_index, false))
1542                             {
1543                                 // stop parsing
1544                                 _index = _text.Length;
1545                                 break;
1546                             }
1547                             if (_state != ParseState.AttributeName)
1548                                 continue;
1549                             _state = ParseState.Text;
1550                             PushNodeStart(HtmlNodeType.Text, _index);
1551                             continue;
1552                         }
1553                         break;
1554
1555                     case ParseState.AttributeBeforeEquals:
1556                         if (NewCheck())
1557                             continue;
1558
1559                         if (IsWhiteSpace(_c))
1560                             continue;
1561                         if (_c == '>')
1562                         {
1563                             if (!PushNodeEnd(_index, false))
1564                             {
1565                                 // stop parsing
1566                                 _index = _text.Length;
1567                                 break;
1568                             }
1569                             if (_state != ParseState.AttributeBeforeEquals)
1570                                 continue;
1571                             _state = ParseState.Text;
1572                             PushNodeStart(HtmlNodeType.Text, _index);
1573                             continue;
1574                         }
1575                         if (_c == '=')
1576                         {
1577                             _state = ParseState.AttributeAfterEquals;
1578                             continue;
1579                         }
1580                         // no equals, no whitespace, it's a new attrribute starting
1581                         _state = ParseState.BetweenAttributes;
1582                         DecrementPosition();
1583                         break;
1584
1585                     case ParseState.AttributeAfterEquals:
1586                         if (NewCheck())
1587                             continue;
1588
1589                         if (IsWhiteSpace(_c))
1590                             continue;
1591
1592                         if ((_c == '\'') || (_c == '"'))
1593                         {
1594                             _state = ParseState.QuotedAttributeValue;
1595                             PushAttributeValueStart(_index, _c);
1596                             lastquote = _c;
1597                             continue;
1598                         }
1599                         if (_c == '>')
1600                         {
1601                             if (!PushNodeEnd(_index, false))
1602                             {
1603                                 // stop parsing
1604                                 _index = _text.Length;
1605                                 break;
1606                             }
1607                             if (_state != ParseState.AttributeAfterEquals)
1608                                 continue;
1609                             _state = ParseState.Text;
1610                             PushNodeStart(HtmlNodeType.Text, _index);
1611                             continue;
1612                         }
1613                         PushAttributeValueStart(_index - 1);
1614                         _state = ParseState.AttributeValue;
1615                         break;
1616
1617                     case ParseState.AttributeValue:
1618                         if (NewCheck())
1619                             continue;
1620
1621                         if (IsWhiteSpace(_c))
1622                         {
1623                             PushAttributeValueEnd(_index - 1);
1624                             _state = ParseState.BetweenAttributes;
1625                             continue;
1626                         }
1627
1628                         if (_c == '>')
1629                         {
1630                             PushAttributeValueEnd(_index - 1);
1631                             if (!PushNodeEnd(_index, false))
1632                             {
1633                                 // stop parsing
1634                                 _index = _text.Length;
1635                                 break;
1636                             }
1637                             if (_state != ParseState.AttributeValue)
1638                                 continue;
1639                             _state = ParseState.Text;
1640                             PushNodeStart(HtmlNodeType.Text, _index);
1641                             continue;
1642                         }
1643                         break;
1644
1645                     case ParseState.QuotedAttributeValue:
1646                         if (_c == lastquote)
1647                         {
1648                             PushAttributeValueEnd(_index - 1);
1649                             _state = ParseState.BetweenAttributes;
1650                             continue;
1651                         }
1652                         if (_c == '<')
1653                         {
1654                             if (_index < _text.Length)
1655                             {
1656                                 if (_text[_index] == '%')
1657                                 {
1658                                     _oldstate = _state;
1659                                     _state = ParseState.ServerSideCode;
1660                                     continue;
1661                                 }
1662                             }
1663                         }
1664                         break;
1665
1666                     case ParseState.Comment:
1667                         if (_c == '>')
1668                         {
1669                             if (_fullcomment)
1670                             {
1671                                 if ((_text[_index - 2] != '-') ||
1672                                     (_text[_index - 3] != '-'))
1673                                 {
1674                                     continue;
1675                                 }
1676                             }
1677                             if (!PushNodeEnd(_index, false))
1678                             {
1679                                 // stop parsing
1680                                 _index = _text.Length;
1681                                 break;
1682                             }
1683                             _state = ParseState.Text;
1684                             PushNodeStart(HtmlNodeType.Text, _index);
1685                             continue;
1686                         }
1687                         break;
1688
1689                     case ParseState.ServerSideCode:
1690                         if (_c == '%')
1691                         {
1692                             if (_index < _text.Length)
1693                             {
1694                                 if (_text[_index] == '>')
1695                                 {
1696                                     switch (_oldstate)
1697                                     {
1698                                         case ParseState.AttributeAfterEquals:
1699                                             _state = ParseState.AttributeValue;
1700                                             break;
1701
1702                                         case ParseState.BetweenAttributes:
1703                                             PushAttributeNameEnd(_index + 1);
1704                                             _state = ParseState.BetweenAttributes;
1705                                             break;
1706
1707                                         default:
1708                                             _state = _oldstate;
1709                                             break;
1710                                     }
1711                                     IncrementPosition();
1712                                 }
1713                             }
1714                         }
1715                         break;
1716
1717                     case ParseState.PcData:
1718                         // look for </tag + 1 char
1719
1720                         // check buffer end
1721                         if ((_currentnode._namelength + 3) <= (_text.Length - (_index - 1)))
1722                         {
1723                             if (string.Compare(_text.Substring(_index - 1, _currentnode._namelength + 2),
1724                                                "</" + _currentnode.Name, true) == 0)
1725                             {
1726                                 int c = _text[_index - 1 + 2 + _currentnode.Name.Length];
1727                                 if ((c == '>') || (IsWhiteSpace(c)))
1728                                 {
1729                                     // add the script as a text node
1730                                     HtmlNode script = CreateNode(HtmlNodeType.Text,
1731                                                                  _currentnode._outerstartindex +
1732                                                                  _currentnode._outerlength);
1733                                     script._outerlength = _index - 1 - script._outerstartindex;
1734                                     _currentnode.AppendChild(script);
1735
1736
1737                                     PushNodeStart(HtmlNodeType.Element, _index - 1);
1738                                     PushNodeNameStart(false, _index - 1 + 2);
1739                                     _state = ParseState.Tag;
1740                                     IncrementPosition();
1741                                 }
1742                             }
1743                         }
1744                         break;
1745                 }
1746             }
1747
1748             // finish the current work
1749             if (_currentnode._namestartindex > 0)
1750             {
1751                 PushNodeNameEnd(_index);
1752             }
1753             PushNodeEnd(_index, false);
1754
1755             // we don't need this anymore
1756             _lastnodes.Clear();
1757         }
1758
1759         private void PushAttributeNameEnd(int index)
1760         {
1761             _currentattribute._namelength = index - _currentattribute._namestartindex;
1762             _currentnode.Attributes.Append(_currentattribute);
1763         }
1764
1765         private void PushAttributeNameStart(int index)
1766         {
1767             _currentattribute = CreateAttribute();
1768             _currentattribute._namestartindex = index;
1769             _currentattribute.Line = _line;
1770             _currentattribute._lineposition = _lineposition;
1771             _currentattribute._streamposition = index;
1772         }
1773
1774         private void PushAttributeValueEnd(int index)
1775         {
1776             _currentattribute._valuelength = index - _currentattribute._valuestartindex;
1777         }
1778
1779         private void PushAttributeValueStart(int index)
1780         {
1781             PushAttributeValueStart(index, 0);
1782         }
1783
1784         private void PushAttributeValueStart(int index, int quote)
1785         {
1786             _currentattribute._valuestartindex = index;
1787             if (quote == '\'')
1788                 _currentattribute.QuoteType = AttributeValueQuote.SingleQuote;
1789         }
1790
1791         private bool PushNodeEnd(int index, bool close)
1792         {
1793             _currentnode._outerlength = index - _currentnode._outerstartindex;
1794
1795             if ((_currentnode._nodetype == HtmlNodeType.Text) ||
1796                 (_currentnode._nodetype == HtmlNodeType.Comment))
1797             {
1798                 // forget about void nodes
1799                 if (_currentnode._outerlength > 0)
1800                 {
1801                     _currentnode._innerlength = _currentnode._outerlength;
1802                     _currentnode._innerstartindex = _currentnode._outerstartindex;
1803                     if (_lastparentnode != null)
1804                     {
1805                         _lastparentnode.AppendChild(_currentnode);
1806                     }
1807                 }
1808             }
1809             else
1810             {
1811                 if ((_currentnode._starttag) && (_lastparentnode != _currentnode))
1812                 {
1813                     // add to parent node
1814                     if (_lastparentnode != null)
1815                     {
1816                         _lastparentnode.AppendChild(_currentnode);
1817                     }
1818
1819                     ReadDocumentEncoding(_currentnode);
1820
1821                     // remember last node of this kind
1822                     HtmlNode prev = (HtmlNode) _lastnodes[_currentnode.Name];
1823                     _currentnode._prevwithsamename = prev;
1824                     _lastnodes[_currentnode.Name] = _currentnode;
1825
1826                     // change parent?
1827                     if ((_currentnode.NodeType == HtmlNodeType.Document) ||
1828                         (_currentnode.NodeType == HtmlNodeType.Element))
1829                     {
1830                         _lastparentnode = _currentnode;
1831                     }
1832
1833                     if (HtmlNode.IsCDataElement(CurrentNodeName()))
1834                     {
1835                         _state = ParseState.PcData;
1836                         return true;
1837                     }
1838
1839                     if ((HtmlNode.IsClosedElement(_currentnode.Name)) ||
1840                         (HtmlNode.IsEmptyElement(_currentnode.Name)))
1841                     {
1842                         close = true;
1843                     }
1844                 }
1845             }
1846
1847             if ((close) || (!_currentnode._starttag))
1848             {
1849                 if ((OptionStopperNodeName != null) && (_remainder == null) &&
1850                     (string.Compare(_currentnode.Name, OptionStopperNodeName, true) == 0))
1851                 {
1852                     _remainderOffset = index;
1853                     _remainder = _text.Substring(_remainderOffset);
1854                     CloseCurrentNode();
1855                     return false; // stop parsing
1856                 }
1857                 CloseCurrentNode();
1858             }
1859             return true;
1860         }
1861
1862         private void PushNodeNameEnd(int index)
1863         {
1864             _currentnode._namelength = index - _currentnode._namestartindex;
1865             if (OptionFixNestedTags)
1866             {
1867                 FixNestedTags();
1868             }
1869         }
1870
1871         private void PushNodeNameStart(bool starttag, int index)
1872         {
1873             _currentnode._starttag = starttag;
1874             _currentnode._namestartindex = index;
1875         }
1876
1877         private void PushNodeStart(HtmlNodeType type, int index)
1878         {
1879             _currentnode = CreateNode(type, index);
1880             _currentnode._line = _line;
1881             _currentnode._lineposition = _lineposition;
1882             if (type == HtmlNodeType.Element)
1883             {
1884                 _currentnode._lineposition--;
1885             }
1886             _currentnode._streamposition = index;
1887         }
1888
1889         private void ReadDocumentEncoding(HtmlNode node)
1890         {
1891             if (!OptionReadEncoding)
1892                 return;
1893             // format is
1894             // <meta http-equiv="content-type" content="text/html;charset=iso-8859-1" />
1895
1896             // when we append a child, we are in node end, so attributes are already populated
1897             if (node._namelength == 4) // quick check, avoids string alloc
1898             {
1899                 if (node.Name == "meta") // all nodes names are lowercase
1900                 {
1901                     HtmlAttribute att = node.Attributes["http-equiv"];
1902                     if (att != null)
1903                     {
1904                         if (string.Compare(att.Value, "content-type", true) == 0)
1905                         {
1906                             HtmlAttribute content = node.Attributes["content"];
1907                             if (content != null)
1908                             {
1909                                 string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
1910                                 if (charset != null && (charset = charset.Trim()).Length > 0)
1911                                 {
1912                                     _declaredencoding = Encoding.GetEncoding(charset.Trim());
1913                                     if (_onlyDetectEncoding)
1914                                     {
1915                                         throw new EncodingFoundException(_declaredencoding);
1916                                     }
1917
1918                                     if (_streamencoding != null)
1919                                     {
1920                                         if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
1921                                         {
1922                                             AddError(
1923                                                 HtmlParseErrorCode.CharsetMismatch,
1924                                                 _line, _lineposition,
1925                                                 _index, node.OuterHtml,
1926                                                 "Encoding mismatch between StreamEncoding: " +
1927                                                 _streamencoding.WebName + " and DeclaredEncoding: " +
1928                                                 _declaredencoding.WebName);
1929                                         }
1930                                     }
1931                                 }
1932                             }
1933                         }
1934                     }
1935                 }
1936             }
1937         }
1938
1939         #endregion
1940
1941         #region Nested type: ParseState
1942
1943         private enum ParseState
1944         {
1945             Text,
1946             WhichTag,
1947             Tag,
1948             BetweenAttributes,
1949             EmptyTag,
1950             AttributeName,
1951             AttributeBeforeEquals,
1952             AttributeAfterEquals,
1953             AttributeValue,
1954             Comment,
1955             QuotedAttributeValue,
1956             ServerSideCode,
1957             PcData
1958         }
1959
1960         #endregion
1961     }
1962 }