using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;
using System.Web;

namespace MyDownloader.Spider.Parsers.Html
{
    /// <summary>
    /// Extracts link-like resources (anchor hrefs, image sources, frame and
    /// iframe sources) from an HTML document using regular expressions and
    /// returns them as absolute <see cref="Uri"/> instances.
    /// </summary>
    public class HtmlParser : IDisposable
    {
        // Every pattern captures the quoted attribute value into group 1.
        //
        // NOTE(review): none of the lookbehinds is anchored on the opening '<' of the tag:
        //  - the href pattern matches after any text ending in "a" followed by whitespace
        //    (for example an "area" tag), not only anchor tags;
        //  - the frame pattern also matches "iframe src=...", so frame results include iframes;
        //  - the frame/iframe patterns require "src" to be the first attribute of the tag.
        // The patterns are intentionally left unchanged here; confirm with callers before tightening.
        private const string RegExprHREF = @"(?<=a\s+([^>]+\s+)?href\=[\x27\x22])(?<1>[^\x27\x22]*)(?=[\x27\x22])";
        private const string RegExprIMG = @"(?<=img\s+([^>]+\s+)?src\=[\x27\x22])(?<1>[^\x27\x22]*)(?=[\x27\x22])";
        private const string RegExprIFrame = @"(?<=iframe\s+src\=[\x27\x22])(?<1>[^\x27\x22]*)(?=[\x27\x22])";
        private const string RegExprFrame = @"(?<=frame\s+src\=[\x27\x22])(?<1>[^\x27\x22]*)(?=[\x27\x22])";

        private const RegexOptions SharedOptions =
            RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled;

        private static readonly Regex RegExFindHref = new Regex(RegExprHREF, SharedOptions);
        private static readonly Regex RegExFindImg = new Regex(RegExprIMG, SharedOptions);
        private static readonly Regex RegExFindIFrame = new Regex(RegExprIFrame, SharedOptions);
        private static readonly Regex RegExFindFrame = new Regex(RegExprFrame, SharedOptions);

        // Lookup table indexed by (int)UrlType in GetResources.
        // NOTE(review): assumes UrlType's numeric values follow this order
        // (href, img, iframe, frame) - confirm against the enum declaration.
        private static readonly Regex[] regs =
        {
            RegExFindHref,
            RegExFindImg,
            RegExFindIFrame,
            RegExFindFrame
        };

        private string htmlData;

        /// <summary>
        /// Reads the whole HTML document from <paramref name="htmlStream"/> into memory.
        /// </summary>
        /// <param name="htmlStream">
        /// Stream containing the HTML text. It is read to the end and then closed,
        /// because the <see cref="StreamReader"/> wrapping it is disposed.
        /// </param>
        public HtmlParser(Stream htmlStream)
        {
            using (StreamReader reader = new StreamReader(htmlStream))
            {
                htmlData = reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Returns the URLs of the requested kind found in the document.
        /// </summary>
        /// <param name="urlType">Kind of resource; its integer value selects the regular expression.</param>
        /// <param name="baseUri">Absolute URL used to resolve relative links; may be null.</param>
        /// <returns>A lazily evaluated sequence of absolute URIs.</returns>
        public IEnumerable<Uri> GetResources(UrlType urlType, string baseUri)
        {
            return ExtractUris(regs[(int)urlType], baseUri, this.htmlData);
        }

        /// <summary>Returns the URLs matched by the href pattern.</summary>
        /// <param name="baseUri">Absolute URL used to resolve relative links; may be null.</param>
        public IEnumerable<Uri> GetHrefs(string baseUri)
        {
            return ExtractUris(RegExFindHref, baseUri, this.htmlData);
        }

        /// <summary>Returns the URLs matched by the image src pattern.</summary>
        /// <param name="baseUri">Absolute URL used to resolve relative links; may be null.</param>
        public IEnumerable<Uri> GetImages(string baseUri)
        {
            return ExtractUris(RegExFindImg, baseUri, this.htmlData);
        }

        /// <summary>Returns the URLs matched by the frame src pattern.</summary>
        /// <param name="baseUri">Absolute URL used to resolve relative links; may be null.</param>
        public IEnumerable<Uri> GetFrames(string baseUri)
        {
            return ExtractUris(RegExFindFrame, baseUri, this.htmlData);
        }

        /// <summary>Returns the URLs matched by the iframe src pattern.</summary>
        /// <param name="baseUri">Absolute URL used to resolve relative links; may be null.</param>
        public IEnumerable<Uri> GetIFrames(string baseUri)
        {
            return ExtractUris(RegExFindIFrame, baseUri, this.htmlData);
        }

        /// <summary>
        /// Lazily walks every match of <paramref name="regExpr"/> in <paramref name="html"/>,
        /// HTML-decodes the captured attribute value, skips values that are not real
        /// resource links and yields the remaining ones as absolute URIs.
        /// Values that cannot be turned into a URI are skipped silently.
        /// </summary>
        private static IEnumerable<Uri> ExtractUris(Regex regExpr, string baseUrl, string html)
        {
            for (Match m = regExpr.Match(html); m.Success; m = m.NextMatch())
            {
                // Decode first so that entity-encoded prefixes (e.g. "&#35;top")
                // are seen by the filter below in their real form.
                string href = HttpUtility.HtmlDecode(m.Groups[1].ToString());

                if (IsNonResourceLink(href))
                {
                    continue;
                }

                Uri uri = null;
                try
                {
                    uri = ConvertToAbsoluteUrl(href, baseUrl);
                }
                catch (UriFormatException)
                {
                    // Best effort: a malformed link must not abort the whole scan.
                }

                if (uri != null)
                {
                    yield return uri;
                }
            }
        }

        /// <summary>
        /// Tells whether the attribute value is empty, a same-page fragment, or a
        /// mailto:/javascript: pseudo link. Scheme prefixes are compared without
        /// regard to case or culture.
        /// </summary>
        private static bool IsNonResourceLink(string href)
        {
            return String.IsNullOrEmpty(href)
                || href.StartsWith("#", StringComparison.Ordinal)
                || href.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
                || href.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Converts <paramref name="url"/> into an absolute URI.
        /// </summary>
        /// <param name="url">Absolute or relative URL taken from the document.</param>
        /// <param name="baseUrl">Absolute URL used to resolve relative links; may be null.</param>
        /// <returns>The absolute URI, or null when the URL cannot be resolved.</returns>
        private static Uri ConvertToAbsoluteUrl(string url, string baseUrl)
        {
            // A URL without "://" is treated as relative whenever a base is available,
            // so the absolute parse is only attempted when it has a chance to succeed
            // or when there is nothing to resolve against.
            bool hasSchemeDelimiter = url.IndexOf(Uri.SchemeDelimiter, StringComparison.Ordinal) >= 0;

            Uri absolute;
            if ((hasSchemeDelimiter || baseUrl == null)
                && Uri.TryCreate(url, UriKind.Absolute, out absolute))
            {
                return absolute;
            }

            if (baseUrl == null)
            {
                return null;
            }

            // Fall back to resolving against the base URL.
            Uri baseUri;
            Uri combined;
            if (Uri.TryCreate(baseUrl, UriKind.Absolute, out baseUri)
                && Uri.TryCreate(baseUri, url, out combined))
            {
                return combined;
            }

            return null;
        }

        #region IDisposable Members

        /// <summary>
        /// Drops the reference to the loaded HTML text. Sequences obtained from this
        /// parser must not be enumerated afterwards: there is no text left to match
        /// and <see cref="Regex.Match(string)"/> rejects a null input.
        /// </summary>
        public void Dispose()
        {
            this.htmlData = null;
        }

        #endregion
    }
}