DotNet 笔记

东方欲晓,莫道君起早。

C#网页采集。使用Fizzler和HtmlAgilityPack

下载第三方dll文件

1:首先在项目中引用这三个dll(HtmlAgilityPack.dll、Fizzler.dll、Fizzler.Systems.HtmlAgilityPack.dll)

2:自己封装一个帮助类

    /// <summary>
    /// Helper for downloading HTML pages and selecting nodes from them with CSS selectors.
    /// Parsing is done by HtmlAgilityPack; the <c>QuerySelectorAll</c> extension is expected
    /// to come from Fizzler's HtmlAgilityPack integration.
    /// </summary>
    public class HtmlHelper
    {
        /// <summary>Maximum time, in milliseconds, to wait for the server to start responding.</summary>
        private const int RequestTimeoutMilliseconds = 30000;

        /// <summary>
        /// Downloads the page at <paramref name="Url"/> and returns its body as text.
        /// </summary>
        /// <param name="Url">Address of the page to download.</param>
        /// <param name="Code">Character encoding used to decode the response body.</param>
        /// <returns>The page content, or <c>null</c> when the download fails for any reason.</returns>
        public static string GetHtml(string Url, Encoding Code)
        {
            return GetPageByHttpWebRequest(Url, Code);
        }

        /// <summary>
        /// Performs an HTTP GET for <paramref name="url"/> and decodes the body with
        /// <paramref name="encoding"/>, transparently decompressing gzip-encoded responses.
        /// </summary>
        /// <returns>The decoded body, or <c>null</c> if any step throws.</returns>
        private static string GetPageByHttpWebRequest(string url, Encoding encoding)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                // HTTP method tokens are case-sensitive; use the canonical upper-case form.
                request.Method = "GET";
                request.Timeout = RequestTimeoutMilliseconds;

                // Dispose the response as well as the streams so the underlying
                // connection is released even when reading fails part-way through.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = OpenBodyStream(response))
                using (StreamReader reader = new StreamReader(body, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
            catch
            {
                // Deliberate best-effort contract: every failure (bad URL, network error,
                // timeout, corrupt gzip data, ...) is reported to the caller as null.
                return null;
            }
        }

        /// <summary>
        /// Returns the response body stream, wrapped in a <see cref="GZipStream"/> when the
        /// server declares <c>Content-Encoding: gzip</c>.
        /// </summary>
        private static Stream OpenBodyStream(HttpWebResponse response)
        {
            Stream raw = response.GetResponseStream();

            // Ordinal, case-insensitive comparison: culture-sensitive lowering can
            // mis-handle the letter 'I' (e.g. under Turkish culture), and the static
            // Equals overload is also safe if the header value is null.
            bool isGzip = string.Equals(
                response.ContentEncoding,
                "gzip",
                System.StringComparison.OrdinalIgnoreCase);

            return isGzip ? new GZipStream(raw, CompressionMode.Decompress) : raw;
        }

        /// <summary>
        /// Downloads a page and returns the nodes matching a CSS selector.
        /// </summary>
        /// <param name="Url">Address of the page to download.</param>
        /// <param name="CSSLoad">CSS selector path identifying the nodes to return.</param>
        /// <param name="Code">Character encoding used to decode the response body.</param>
        /// <returns>
        /// The matching nodes; an empty sequence when the page could not be downloaded.
        /// </returns>
        public static IEnumerable<HtmlNode> GetUrlInfo(string Url, string CSSLoad, Encoding Code)
        {
            string html = GetHtml(Url, Code);
            if (html == null)
            {
                // GetHtml signals a failed download with null; hand back "no matches"
                // instead of passing null on to the HTML parser.
                return new HtmlNode[0];
            }

            return GetHtmlInfo(html, CSSLoad);
        }

        /// <summary>
        /// Parses an HTML string and returns the nodes matching a CSS selector.
        /// </summary>
        /// <param name="html">HTML markup to parse.</param>
        /// <param name="CSSLoad">CSS selector path identifying the nodes to return.</param>
        /// <returns>The nodes of the parsed document that match the selector.</returns>
        public static IEnumerable<HtmlNode> GetHtmlInfo(string html, string CSSLoad)
        {
            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd = true,
                OptionFixNestedTags = true,
                OptionReadEncoding = true
            };

            htmlDoc.LoadHtml(html);

            // Run the CSS selector against the document root.
            return htmlDoc.DocumentNode.QuerySelectorAll(CSSLoad);
        }
    }

 

3:使用代码如下

                // 1: The URL of the page to scrape.
                string domain = "http://www.gushiwen.org/default_1.aspx";
                // 2: The CSS selector path of the content to collect.
                string csspath = "html body div.main3 div.left div.sons div.cont p a b";
                // 3: Fetch the collection of nodes matching the selector.
                //    UTF8 is a static member declared on Encoding; referencing it through
                //    ASCIIEncoding compiles to the same thing but wrongly suggests ASCII.
                var rs = HtmlHelper.GetUrlInfo(domain, csspath, Encoding.UTF8);
                // 4: Iterate over the collection.
                foreach (HtmlAgilityPack.HtmlNode n in rs)
                {
                    /*
                    -- n represents one element node
                    n.Attributes  e.g. n.Attributes.Where(r => r.Name.ToLower() == "style").FirstOrDefault() // get the style attribute
                    n.ChildNodes
                    n.FirstChild
                    n.HasAttributes
                    n.HasChildNodes
                    n.InnerHtml
                    n.InnerText
                    n.LastChild
                    n.NextSibling
                    n.NodeType
                    n.ParentNode
                    n.PreviousSibling
                    n.XPath
                    plus the various HTML node manipulation methods
                     */
                }

优点:

1:十分简单,对程序员而言,十分灵活。

2:虽然与市面上的火车头、八爪鱼相比,功能差得太多,但性能绝对比它们好得多,因为不用再用浏览器渲染一遍。

 

缺点:

1:AJAX页面的采集需要自己特殊处理。

2:验证码问题,登录问题,等等也都需要自己处理。

3:需要看得懂程序代码,非程序员不能使用。

 

 

Loading