1 // HtmlAgilityPack V1.0 - Simon Mourier <simon underscore mourier at hotmail dot com>
7 using System
.Xml
.Serialization
;
11 namespace HtmlAgilityPack
14 /// A utility class to get HTML document from HTTP.
21 /// Represents the method that will handle the PostResponse event.
23 public delegate void PostResponseHandler(HttpWebRequest request
, HttpWebResponse response
);
26 /// Represents the method that will handle the PreHandleDocument event.
28 public delegate void PreHandleDocumentHandler(HtmlDocument document
);
31 /// Represents the method that will handle the PreRequest event.
33 public delegate bool PreRequestHandler(HttpWebRequest request
);
39 private bool _autoDetectEncoding
= true;
40 private bool _cacheOnly
;
42 private string _cachePath
;
43 private bool _fromCache
;
44 private int _requestDuration
;
45 private Uri _responseUri
;
46 private HttpStatusCode _statusCode
= HttpStatusCode
.OK
;
47 private int _streamBufferSize
= 1024;
48 private bool _useCookies
;
49 private bool _usingCache
;
52 /// Occurs after an HTTP request has been executed.
54 public PostResponseHandler PostResponse
;
57 /// Occurs before an HTML document is handled.
59 public PreHandleDocumentHandler PreHandleDocument
;
62 /// Occurs before an HTTP request is executed.
64 public PreRequestHandler PreRequest
;
/// <summary>
/// Gets or sets a value indicating whether the document encoding must be detected automatically.
/// </summary>
/// <value>The flag held in the <c>_autoDetectEncoding</c> backing field.</value>
public bool AutoDetectEncoding
{
    get
    {
        return this._autoDetectEncoding;
    }
    set
    {
        this._autoDetectEncoding = value;
    }
}
80 /// Gets or Sets a value indicating whether to get document only from the cache.
81 /// If this is set to true and document is not found in the cache, nothing will be loaded.
85 get { return _cacheOnly; }
88 if ((value) && !UsingCache
)
90 throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
/// <summary>
/// Gets or sets the cache path. If null, no caching mechanism will be used.
/// </summary>
/// <value>The path held in the <c>_cachePath</c> backing field.</value>
public string CachePath
{
    get
    {
        return this._cachePath;
    }
    set
    {
        this._cachePath = value;
    }
}
/// <summary>
/// Gets a value indicating whether the last document was retrieved from the cache.
/// </summary>
/// <value>The flag held in the <c>_fromCache</c> backing field (read-only).</value>
public bool FromCache
{
    get
    {
        return this._fromCache;
    }
}
/// <summary>
/// Gets the last request duration in milliseconds.
/// </summary>
/// <value>The duration held in the <c>_requestDuration</c> backing field (read-only).</value>
public int RequestDuration
{
    get
    {
        return this._requestDuration;
    }
}
/// <summary>
/// Gets the URI of the Internet resource that actually responded to the request.
/// </summary>
/// <value>The URI held in the <c>_responseUri</c> backing field (read-only).</value>
public Uri ResponseUri
{
    get
    {
        return this._responseUri;
    }
}
/// <summary>
/// Gets the last request status.
/// </summary>
/// <value>The status held in the <c>_statusCode</c> backing field (read-only).</value>
public HttpStatusCode StatusCode
{
    get
    {
        return this._statusCode;
    }
}
/// <summary>
/// Gets or sets the size of the buffer used for memory operations.
/// </summary>
/// <value>The size held in the <c>_streamBufferSize</c> backing field.</value>
/// <exception cref="ArgumentException">The assigned value is zero or negative.</exception>
public int StreamBufferSize
{
    get { return _streamBufferSize; }
    set
    {
        // Validate the incoming value rather than the current field: the field is
        // initialised to a positive number, so testing it would never reject anything.
        if (value <= 0)
        {
            throw new ArgumentException("Size must be greater than zero.");
        }
        _streamBufferSize = value;
    }
}
/// <summary>
/// Gets or sets a value indicating whether cookies will be stored.
/// </summary>
/// <value>The flag held in the <c>_useCookies</c> backing field.</value>
public bool UseCookies
{
    get
    {
        return this._useCookies;
    }
    set
    {
        this._useCookies = value;
    }
}
163 /// Gets or Sets a value indicating whether the caching mechanisms should be used or not.
165 public bool UsingCache
169 if (_cachePath
== null)
177 if ((value) && (_cachePath
== null))
179 throw new HtmlWebException("You need to define a CachePath first.");
187 #region Public Methods
190 /// Gets the MIME content type for a given path extension.
192 /// <param name="extension">The input path extension.</param>
193 /// <param name="def">The default content type to return if any error occurs.</param>
194 /// <returns>The path extension's MIME content type.</returns>
195 public static string GetContentTypeForExtension(string extension
, string def
)
197 if (string.IsNullOrEmpty(extension
))
201 string contentType
= "";
204 RegistryKey reg
= Registry
.ClassesRoot
;
205 reg
= reg
.OpenSubKey(extension
, false);
206 if (reg
!= null) contentType
= (string)reg
.GetValue("", def
);
216 /// Gets the path extension for a given MIME content type.
218 /// <param name="contentType">The input MIME content type.</param>
219 /// <param name="def">The default path extension to return if any error occurs.</param>
220 /// <returns>The MIME content type's path extension.</returns>
221 public static string GetExtensionForContentType(string contentType
, string def
)
223 if (string.IsNullOrEmpty(contentType
))
230 RegistryKey reg
= Registry
.ClassesRoot
;
231 reg
= reg
.OpenSubKey(@"MIME\Database\Content Type\" + contentType
, false);
232 if (reg
!= null) ext
= (string)reg
.GetValue("Extension", def
);
/// <summary>
/// Creates an instance of the given type from the specified Internet resource.
/// Forwards to the overload taking a stylesheet URL and argument list, passing null for both.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="type">The requested type.</param>
/// <returns>A newly created instance.</returns>
public object CreateInstance(string url, Type type)
{
    // No stylesheet and no stylesheet arguments for this overload.
    string xsltUrl = null;
    XsltArgumentList xsltArgs = null;

    object instance = CreateInstance(url, xsltUrl, xsltArgs, type);
    return instance;
}
/// <summary>
/// Creates an instance of the given type from the specified Internet resource.
/// Forwards to the overload that also takes an XML path, passing null for that path.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An <see cref="XsltArgumentList"/> containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="type">The requested type.</param>
/// <returns>A newly created instance.</returns>
public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type)
{
    object instance = CreateInstance(
        htmlUrl,
        xsltUrl,
        xsltArgs,
        type,
        null);
    return instance;
}
266 /// Creates an instance of the given type from the specified Internet resource.
268 /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
269 /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
270 /// <param name="xsltArgs">An <see cref="XsltArgumentList"/> containing the namespace-qualified arguments used as input to the transform.</param>
271 /// <param name="type">The requested type.</param>
272 /// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes.</param>
273 /// <returns>A newly created instance.</returns>
274 public object CreateInstance(string htmlUrl
, string xsltUrl
, XsltArgumentList xsltArgs
, Type type
,
277 StringWriter sw
= new StringWriter();
278 XmlTextWriter writer
= new XmlTextWriter(sw
);
281 LoadHtmlAsXml(htmlUrl
, writer
);
287 LoadHtmlAsXml(htmlUrl
, xsltUrl
, xsltArgs
, writer
);
291 LoadHtmlAsXml(htmlUrl
, xsltUrl
, xsltArgs
, writer
, xmlPath
);
295 StringReader sr
= new StringReader(sw
.ToString());
296 XmlTextReader reader
= new XmlTextReader(sr
);
297 XmlSerializer serializer
= new XmlSerializer(type
);
301 o
= serializer
.Deserialize(reader
);
303 catch (InvalidOperationException ex
)
305 throw new Exception(ex
+ ", --- xml:" + sw
);
/// <summary>
/// Gets an HTML document from an Internet resource and saves it to the specified file.
/// Forwards to the overload taking an HTTP method, using "GET".
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where you want to save the document.</param>
public void Get(string url, string path)
{
    const string method = "GET";
    Get(url, path, method);
}
/// <summary>
/// Gets an HTML document from an Internet resource and saves it to the specified file,
/// using the supplied proxy and credentials. Forwards to the overload taking an HTTP method, using "GET".
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where you want to save the document.</param>
/// <param name="proxy">The proxy handed on to the request.</param>
/// <param name="credentials">The credentials handed on to the request.</param>
public void Get(string url, string path, WebProxy proxy, NetworkCredential credentials)
{
    const string method = "GET";
    Get(url, path, proxy, credentials, method);
}
/// <summary>
/// Gets an HTML document from an Internet resource and saves it to the specified file.
/// Only the http and https schemes are accepted.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where you want to save the document.</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
/// <exception cref="HtmlWebException">The URL scheme is neither http nor https.</exception>
public void Get(string url, string path, string method)
{
    Uri target = new Uri(url);

    bool isWebScheme = (target.Scheme == Uri.UriSchemeHttps)
                       || (target.Scheme == Uri.UriSchemeHttp);
    if (!isWebScheme)
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + target.Scheme + "'.");
    }

    // No in-memory document, proxy or credentials: the result only goes to the file.
    Get(target, method, path, null, null, null);
}
/// <summary>
/// Gets an HTML document from an Internet resource and saves it to the specified file,
/// using the supplied proxy and credentials. Only the http and https schemes are accepted.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where you want to save the document.</param>
/// <param name="proxy">The proxy handed on to the request.</param>
/// <param name="credentials">The credentials handed on to the request.</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
/// <exception cref="HtmlWebException">The URL scheme is neither http nor https.</exception>
public void Get(string url, string path, WebProxy proxy, NetworkCredential credentials, string method)
{
    Uri target = new Uri(url);

    bool isWebScheme = (target.Scheme == Uri.UriSchemeHttps)
                       || (target.Scheme == Uri.UriSchemeHttp);
    if (!isWebScheme)
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + target.Scheme + "'.");
    }

    // No in-memory document: the result only goes to the file.
    Get(target, method, path, null, proxy, credentials);
}
375 /// Gets the cache file path for a specified url.
377 /// <param name="uri">The url for which to retrieve the cache path. May not be null.</param>
378 /// <returns>The cache file path.</returns>
379 public string GetCachePath(Uri uri
)
383 throw new ArgumentNullException("uri");
387 throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
390 if (uri
.AbsolutePath
== "/")
392 cachePath
= Path
.Combine(_cachePath
, ".htm");
396 cachePath
= Path
.Combine(_cachePath
, (uri
.Host
+ uri
.AbsolutePath
).Replace('/', '\\'));
/// <summary>
/// Gets an HTML document from an Internet resource.
/// Forwards to the overload taking an HTTP method, using "GET".
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <returns>A new HTML document.</returns>
public HtmlDocument Load(string url)
{
    const string method = "GET";
    HtmlDocument document = Load(url, method);
    return document;
}
/// <summary>
/// Gets an HTML document from an Internet resource through a proxy, using the "GET" method.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="proxyHost">Host to use for Proxy</param>
/// <param name="proxyPort">Port the Proxy is on</param>
/// <param name="userId">User Id for Authentication</param>
/// <param name="password">Password for Authentication</param>
/// <returns>A new HTML document.</returns>
public HtmlDocument Load(string url, string proxyHost, int proxyPort, string userId, string password)
{
    // Create the proxy; local addresses bypass it.
    WebProxy myProxy = new WebProxy(proxyHost, proxyPort);
    myProxy.BypassProxyOnLocal = true;

    // Credentials are only built when both parts are supplied; otherwise null is
    // handed on to the proxy-aware overload.
    NetworkCredential myCreds = null;
    if ((userId != null) && (password != null))
    {
        myCreds = new NetworkCredential(userId, password);
    }

    return Load(url, "GET", myProxy, myCreds);
}
/// <summary>
/// Loads an HTML document from an http, https or file URL.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND. Only used for http and https URLs.</param>
/// <returns>A new HTML document.</returns>
/// <exception cref="HtmlWebException">The URL scheme is not http, https or file.</exception>
public HtmlDocument Load(string url, string method)
{
    Uri uri = new Uri(url);
    HtmlDocument doc;

    if ((uri.Scheme == Uri.UriSchemeHttps) || (uri.Scheme == Uri.UriSchemeHttp))
    {
        // Web resources go through the request pipeline, without proxy or credentials.
        doc = LoadUrl(uri, method, null, null);
    }
    else if (uri.Scheme == Uri.UriSchemeFile)
    {
        doc = new HtmlDocument();
        // NOTE(review): LoadUrl sets OptionAutoCloseOnEnd = false and OptionFixNestedTags = true,
        // whereas file loads end up with OptionAutoCloseOnEnd = true - confirm which is intended.
        doc.OptionAutoCloseOnEnd = true;
        doc.DetectEncodingAndLoad(url, _autoDetectEncoding);
    }
    else
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
    }

    // Give the subscriber, if any, a chance to touch the document before it is returned.
    if (PreHandleDocument != null)
    {
        PreHandleDocument(doc);
    }

    return doc;
}
/// <summary>
/// Loads an HTML document from an http, https or file URL, using the supplied proxy and credentials for web requests.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND. Only used for http and https URLs.</param>
/// <param name="proxy">Proxy to use with this request</param>
/// <param name="credentials">Credentials to use when authenticating</param>
/// <returns>A new HTML document.</returns>
/// <exception cref="HtmlWebException">The URL scheme is not http, https or file.</exception>
public HtmlDocument Load(string url, string method, WebProxy proxy, NetworkCredential credentials)
{
    Uri uri = new Uri(url);
    HtmlDocument doc;

    if ((uri.Scheme == Uri.UriSchemeHttps) || (uri.Scheme == Uri.UriSchemeHttp))
    {
        doc = LoadUrl(uri, method, proxy, credentials);
    }
    else if (uri.Scheme == Uri.UriSchemeFile)
    {
        // Proxy and credentials play no part when reading a file URL.
        doc = new HtmlDocument();
        // NOTE(review): LoadUrl sets OptionAutoCloseOnEnd = false and OptionFixNestedTags = true,
        // whereas file loads end up with OptionAutoCloseOnEnd = true - confirm which is intended.
        doc.OptionAutoCloseOnEnd = true;
        doc.DetectEncodingAndLoad(url, _autoDetectEncoding);
    }
    else
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
    }

    // Give the subscriber, if any, a chance to touch the document before it is returned.
    if (PreHandleDocument != null)
    {
        PreHandleDocument(doc);
    }

    return doc;
}
515 /// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter.
517 /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
518 /// <param name="writer">The XmlTextWriter to which you want to save.</param>
519 public void LoadHtmlAsXml(string htmlUrl
, XmlTextWriter writer
)
521 HtmlDocument doc
= Load(htmlUrl
);
/// <summary>
/// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter, after an XSLT transformation.
/// Forwards to the overload that also takes an XML path, passing null for that path.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="writer">The XmlTextWriter to which you want to save.</param>
public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer)
{
    LoadHtmlAsXml(
        htmlUrl,
        xsltUrl,
        xsltArgs,
        writer,
        null);
}
538 /// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter, after an XSLT transformation.
540 /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp". May not be null.</param>
541 /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
542 /// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
543 /// <param name="writer">The XmlTextWriter to which you want to save.</param>
544 /// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes.</param>
545 public void LoadHtmlAsXml(string htmlUrl
, string xsltUrl
, XsltArgumentList xsltArgs
, XmlTextWriter writer
,
550 throw new ArgumentNullException("htmlUrl");
553 HtmlDocument doc
= Load(htmlUrl
);
557 XmlTextWriter w
= new XmlTextWriter(xmlPath
, doc
.Encoding
);
561 if (xsltArgs
== null)
563 xsltArgs
= new XsltArgumentList();
566 // add some useful variables to the xslt doc
567 xsltArgs
.AddParam("url", "", htmlUrl
);
568 xsltArgs
.AddParam("requestDuration", "", RequestDuration
);
569 xsltArgs
.AddParam("fromCache", "", FromCache
);
571 XslCompiledTransform xslt
= new XslCompiledTransform();
573 xslt
.Transform(doc
, xsltArgs
, writer
);
578 #region Private Methods
/// <summary>
/// Makes <paramref name="target"/> writable: an existing file has its read-only
/// attribute cleared, otherwise the file's directory is created when it does not exist yet.
/// </summary>
/// <param name="target">The path of the file about to be written.</param>
private static void FilePreparePath(string target)
{
    if (File.Exists(target))
    {
        // Strip only the read-only bit; all other attributes are kept.
        FileAttributes atts = File.GetAttributes(target);
        File.SetAttributes(target, atts & ~FileAttributes.ReadOnly);
        return;
    }

    string dir = Path.GetDirectoryName(target);

    // GetDirectoryName yields null for a root path and "" for a bare file name.
    // There is nothing to create in either case, and CreateDirectory would throw.
    if (!string.IsNullOrEmpty(dir) && !Directory.Exists(dir))
    {
        Directory.CreateDirectory(dir);
    }
}
/// <summary>
/// Returns <paramref name="t"/> truncated to the whole second.
/// </summary>
/// <param name="t">The date and time to truncate.</param>
/// <returns>A new <see cref="DateTime"/> carrying the same year, month, day, hour, minute and second, with the sub-second part set to zero.</returns>
private static DateTime RemoveMilliseconds(DateTime t)
{
    // Drop everything below one second by working on the raw tick count.
    long subSecondTicks = t.Ticks % TimeSpan.TicksPerSecond;
    long wholeSecondTicks = t.Ticks - subSecondTicks;
    return new DateTime(wholeSecondTicks);
}
602 // ReSharper disable UnusedMethodReturnValue.Local
603 private static long SaveStream(Stream stream
, string path
, DateTime touchDate
, int streamBufferSize
)
604 // ReSharper restore UnusedMethodReturnValue.Local
606 FilePreparePath(path
);
607 FileStream fs
= new FileStream(path
, FileMode
.Create
, FileAccess
.Write
);
608 BinaryReader br
= null;
609 BinaryWriter bw
= null;
613 br
= new BinaryReader(stream
);
614 bw
= new BinaryWriter(fs
);
619 buffer
= br
.ReadBytes(streamBufferSize
);
620 len
+= buffer
.Length
;
621 if (buffer
.Length
> 0)
625 } while (buffer
.Length
> 0);
643 File
.SetLastWriteTime(path
, touchDate
);
647 private HttpStatusCode
Get(Uri uri
, string method
, string path
, HtmlDocument doc
, IWebProxy proxy
,
650 string cachePath
= null;
652 bool oldFile
= false;
654 req
= WebRequest
.Create(uri
) as HttpWebRequest
;
661 proxy
.Credentials
= creds
;
662 req
.Credentials
= creds
;
666 proxy
.Credentials
= CredentialCache
.DefaultCredentials
;
667 req
.Credentials
= CredentialCache
.DefaultCredentials
;
673 _requestDuration
= 0;
674 int tc
= Environment
.TickCount
;
677 cachePath
= GetCachePath(req
.RequestUri
);
678 if (File
.Exists(cachePath
))
680 req
.IfModifiedSince
= File
.GetLastAccessTime(cachePath
);
687 if (!File
.Exists(cachePath
))
689 throw new HtmlWebException("File was not found at cache path: '" + cachePath
+ "'");
694 IOLibrary
.CopyAlways(cachePath
, path
);
696 File
.SetLastWriteTime(path
, File
.GetLastWriteTime(cachePath
));
699 return HttpStatusCode
.NotModified
;
704 req
.CookieContainer
= new CookieContainer();
707 if (PreRequest
!= null)
709 // allow our user to change the request at will
710 if (!PreRequest(req
))
712 return HttpStatusCode
.ResetContent
;
718 // foreach(Cookie cookie in req.CookieContainer.GetCookies(req.RequestUri))
720 // HtmlLibrary.Trace("Cookie " + cookie.Name + "=" + cookie.Value + " path=" + cookie.Path + " domain=" + cookie.Domain);
725 HttpWebResponse resp
;
729 resp
= req
.GetResponse() as HttpWebResponse
;
731 catch (WebException we
)
733 _requestDuration
= Environment
.TickCount
- tc
;
734 resp
= (HttpWebResponse
)we
.Response
;
741 IOLibrary
.CopyAlways(cachePath
, path
);
743 File
.SetLastWriteTime(path
, File
.GetLastWriteTime(cachePath
));
745 return HttpStatusCode
.NotModified
;
752 _requestDuration
= Environment
.TickCount
- tc
;
756 // allow our user to get some info from the response
757 if (PostResponse
!= null)
759 PostResponse(req
, resp
);
762 _requestDuration
= Environment
.TickCount
- tc
;
763 _responseUri
= resp
.ResponseUri
;
765 bool html
= IsHtmlContent(resp
.ContentType
);
768 if ((resp
.ContentEncoding
!= null) && (resp
.ContentEncoding
.Length
> 0))
770 respenc
= Encoding
.GetEncoding(resp
.ContentEncoding
);
777 if (resp
.StatusCode
== HttpStatusCode
.NotModified
)
784 IOLibrary
.CopyAlways(cachePath
, path
);
786 File
.SetLastWriteTime(path
, File
.GetLastWriteTime(cachePath
));
788 return resp
.StatusCode
;
792 // this should *never* happen...
793 throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
796 Stream s
= resp
.GetResponseStream();
801 // NOTE: LastModified does not contain milliseconds, so we strip them from the file's timestamp
802 SaveStream(s
, cachePath
, RemoveMilliseconds(resp
.LastModified
), _streamBufferSize
);
805 SaveCacheHeaders(req
.RequestUri
, resp
);
809 // copy and touch the file
810 IOLibrary
.CopyAlways(cachePath
, path
);
811 File
.SetLastWriteTime(path
, File
.GetLastWriteTime(cachePath
));
816 // try to work in-memory
817 if ((doc
!= null) && (html
))
821 doc
.Load(s
, respenc
);
831 return resp
.StatusCode
;
834 private string GetCacheHeader(Uri requestUri
, string name
, string def
)
836 // note: some headers are collection (ex: www-authenticate)
837 // we don't handle that here
838 XmlDocument doc
= new XmlDocument();
839 doc
.Load(GetCacheHeadersPath(requestUri
));
841 doc
.SelectSingleNode("//h[translate(@n, 'abcdefghijklmnopqrstuvwxyz','ABCDEFGHIJKLMNOPQRSTUVWXYZ')='" +
842 name
.ToUpper() + "']");
847 // attribute should exist
848 return node
.Attributes
[name
].Value
;
/// <summary>
/// Gets the path of the file holding the cached headers for the given URL.
/// </summary>
/// <param name="uri">The URL whose cache path is looked up through <c>GetCachePath</c>.</param>
/// <returns>The cache path of <paramref name="uri"/> with ".h.xml" appended.</returns>
private string GetCacheHeadersPath(Uri uri)
{
    // The suffix is appended to the document's cache path, not combined as a sub-path.
    string documentPath = GetCachePath(uri);
    return string.Concat(documentPath, ".h.xml");
}
/// <summary>
/// Tells whether the file at <paramref name="path"/> counts as HTML, judged by the
/// content type looked up for its extension.
/// </summary>
/// <param name="path">The file path whose extension is examined.</param>
/// <returns>The result of <c>IsHtmlContent</c> for the content type found for the extension.</returns>
private bool IsCacheHtmlContent(string path)
{
    string extension = Path.GetExtension(path);

    // null is the fallback content type handed to the lookup.
    return IsHtmlContent(GetContentTypeForExtension(extension, null));
}
/// <summary>
/// Tells whether a MIME content type denotes HTML.
/// </summary>
/// <param name="contentType">The content type to test. May be null.</param>
/// <returns>
/// <c>true</c> when <paramref name="contentType"/> starts with "text/html", ignoring case;
/// <c>false</c> otherwise, including when it is null.
/// </returns>
private bool IsHtmlContent(string contentType)
{
    // IsCacheHtmlContent passes null as the fallback content type, so null can reach here.
    if (contentType == null)
    {
        return false;
    }

    // MIME types are protocol tokens: compare ordinally instead of lower-casing
    // with the current culture.
    return contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Requests <paramref name="uri"/> into a fresh document and records the resulting status code.
/// When the status is NotModified, the document is loaded from the cache path instead.
/// </summary>
/// <param name="uri">The URL to request.</param>
/// <param name="method">The HTTP method handed on to the request.</param>
/// <param name="proxy">The proxy handed on to the request.</param>
/// <param name="creds">The credentials handed on to the request.</param>
/// <returns>The document created for this request.</returns>
private HtmlDocument LoadUrl(Uri uri, string method, WebProxy proxy, NetworkCredential creds)
{
    HtmlDocument result = new HtmlDocument();
    result.OptionAutoCloseOnEnd = false;
    result.OptionFixNestedTags = true;

    // No target file path: the response is meant for the in-memory document.
    HttpStatusCode status = Get(uri, method, null, result, proxy, creds);
    _statusCode = status;

    if (status != HttpStatusCode.NotModified)
    {
        return result;
    }

    // Not modified: read the document from its cache path.
    result.DetectEncodingAndLoad(GetCachePath(uri));
    return result;
}
882 private void SaveCacheHeaders(Uri requestUri
, HttpWebResponse resp
)
884 // we cache the original headers aside the cached document.
885 string file
= GetCacheHeadersPath(requestUri
);
886 XmlDocument doc
= new XmlDocument();
887 doc
.LoadXml("<c></c>");
888 XmlNode cache
= doc
.FirstChild
;
889 foreach (string header
in resp
.Headers
)
891 XmlNode entry
= doc
.CreateElement("h");
892 XmlAttribute att
= doc
.CreateAttribute("n");
894 entry
.Attributes
.Append(att
);
896 att
= doc
.CreateAttribute("v");
897 att
.Value
= resp
.Headers
[header
];
898 entry
.Attributes
.Append(att
);
900 cache
.AppendChild(entry
);