WebBrowser结合HtmlAgilityPack获取百度HTML

代码归类 Views
WebBrowser结合HtmlAgilityPack获取百度HTML

以下是实现代码

 using GetCodes;
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using Baiyuez.framework;
using NPOI.SS.UserModel;
using NPOI.XSSF.UserModel;
using System.Security.Permissions;
using System.Threading;
namespace GetBaiDuIndexData
{
    public partial class Form1 : Form
    {
        // HTTP helper used by LoadData to download the Baidu news front page.
        private HttpHelper http = new HttpHelper();

        // Most recent body HTML captured from webBrowser1; assigned by GetDocHtml and read by LoadData.
        public string content = "";

        /// <summary>
        /// Creates the form by calling InitializeComponent, which is defined in
        /// another part of this partial class.
        /// </summary>
        public Form1()
        {
            this.InitializeComponent();
        }
        #region 内存回收
        /// <summary>
        /// Legacy import of kernel32 SetProcessWorkingSetSize, kept unchanged for existing callers.
        /// The size parameters are declared as 32-bit <c>int</c> although the native parameters
        /// are pointer-sized (SIZE_T), so prefer <see cref="ClearMemory"/> over calling this directly.
        /// </summary>
        [System.Runtime.InteropServices.DllImport("kernel32.dll", EntryPoint = "SetProcessWorkingSetSize")]
        public static extern int SetProcessWorkingSetSize(IntPtr process, int minSize, int maxSize);

        /// <summary>
        /// Correctly sized import of the same native function: both sizes are pointer-sized so that
        /// the "-1 / -1" trim sentinel is passed intact on 32-bit and 64-bit processes alike.
        /// </summary>
        [System.Runtime.InteropServices.DllImport("kernel32.dll", EntryPoint = "SetProcessWorkingSetSize", SetLastError = true)]
        private static extern bool SetProcessWorkingSetSizeNative(IntPtr process, IntPtr minSize, IntPtr maxSize);

        /// <summary>
        /// Forces a garbage collection, waits for pending finalizers and, on Windows NT platforms,
        /// asks the OS to trim the current process's working set.
        /// </summary>
        /// <remarks>
        /// The trim is best-effort: the native call's result is intentionally ignored, as before.
        /// </remarks>
        public static void ClearMemory()
        {
            GC.Collect();
            GC.WaitForPendingFinalizers();

            if (Environment.OSVersion.Platform != PlatformID.Win32NT)
            {
                return;
            }

            // Process is IDisposable; release it once the handle has been used.
            using (System.Diagnostics.Process current = System.Diagnostics.Process.GetCurrentProcess())
            {
                // (SIZE_T)-1 for both limits is the documented "trim as much as possible" request.
                IntPtr trimSentinel = new IntPtr(-1);
                SetProcessWorkingSetSizeNative(current.Handle, trimSentinel, trimSentinel);
            }
        }
        #endregion
        #region 基础函数=顺序写入日志文件、倒序将内容写入到日志文件
        /// <summary>
        /// Reads the whole text content of a file, creating an empty file first when it does not exist.
        /// </summary>
        /// <param name="fileName">Path of the file to read.</param>
        /// <returns>The file's full text; an empty string for a file that was just created.</returns>
        /// <remarks>
        /// I/O failures propagate as their original exception (for example an IOException) with the
        /// original stack trace, instead of being re-wrapped in a bare <see cref="Exception"/>.
        /// Callers that catch <see cref="Exception"/> keep working.
        /// </remarks>
        public string ReadFile(string fileName)
        {
            if (!File.Exists(fileName))
            {
                // Keep the original side effect: a missing file is created empty.
                File.Create(fileName).Close();
            }

            // 'using' guarantees the reader is closed even when ReadToEnd throws.
            using (StreamReader reader = new StreamReader(fileName))
            {
                return reader.ReadToEnd();
            }
        }
        /// <summary>
        /// Writes text to a file located in the application's startup directory, replacing the
        /// file's previous content.
        /// </summary>
        /// <param name="fileName">File name, appended to <c>Application.StartupPath + "/"</c>.</param>
        /// <param name="fileContent">Text to write.</param>
        /// <param name="isDaoxu">
        /// When true, the new text is written first, followed by "\r\n" and the file's previous
        /// content (newest entry on top). When false, only the new text is written.
        /// </param>
        /// <param name="encode">Encoding used both to read the previous content and to write the file.</param>
        /// <remarks>
        /// The method is best-effort: failures never reach the caller. They are reported through
        /// <c>System.Diagnostics.Debug</c> instead of being dropped silently.
        /// </remarks>
        public void WriteFile(string fileName, string fileContent, bool isDaoxu, Encoding encode)
        {
            try
            {
                string filePath = Application.StartupPath + "/" + fileName;

                string output = fileContent;
                if (isDaoxu)
                {
                    // The previous content must be read before the file is truncated below.
                    string oldContent = File.Exists(filePath)
                        ? File.ReadAllText(filePath, encode)
                        : string.Empty;
                    output = fileContent + "\r\n" + oldContent;
                }

                // FileMode.Create truncates an existing file or creates a new one.
                using (FileStream stream = new FileStream(filePath, FileMode.Create))
                using (StreamWriter writer = new StreamWriter(stream, encode))
                {
                    writer.Write(output);
                }
            }
            catch (Exception exception)
            {
                // Deliberately not rethrown (callers rely on this never throwing), but leave a trace.
                System.Diagnostics.Debug.WriteLine("WriteFile failed for '" + fileName + "': " + exception);
            }
        }
        #endregion
        // Delegate shape used to re-dispatch SetLabelText through Invoke.
        delegate void labDelegate(string str);

        /// <summary>
        /// Sets the text of label1. When label1 reports InvokeRequired, the call is
        /// re-dispatched to itself through Invoke instead of touching the control directly.
        /// </summary>
        /// <param name="str">Text to display.</param>
        private void SetLabelText(string str)
        {
            if (!label1.InvokeRequired)
            {
                label1.Text = str;
                return;
            }

            labDelegate marshalled = new labDelegate(SetLabelText);
            Invoke(marshalled, new object[] { str });
        }
        // Delegate shape used to re-dispatch SetButton through Invoke.
        delegate void btnDelegate(bool enabled);

        /// <summary>
        /// Enables or disables button1. When button1 reports InvokeRequired, the call is
        /// re-dispatched to itself through Invoke instead of touching the control directly.
        /// </summary>
        /// <param name="enabled">New value for button1.Enabled.</param>
        private void SetButton(bool enabled)
        {
            if (!button1.InvokeRequired)
            {
                button1.Enabled = enabled;
                return;
            }

            btnDelegate marshalled = new btnDelegate(SetButton);
            Invoke(marshalled, new object[] { enabled });
        }
        /// <summary>
        /// Click handler for button1: starts LoadData on a background thread, but only
        /// once webBrowser1 reports that its page has finished loading.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            bool pageLoaded = webBrowser1.ReadyState == WebBrowserReadyState.Complete;
            if (!pageLoaded)
            {
                MessageBox.Show("请等待页面加载完成!");
                return;
            }

            // Background thread, so it does not keep the process alive after the form closes.
            Thread worker = new Thread(new ThreadStart(LoadData)) { IsBackground = true };
            worker.Start();
        }
        // Matches an <a ... href='...'>...</a> element and captures the link target in group "href".
        private static readonly Regex AnchorRegex = new Regex(
            @"<\s*a\s+[^>]*href\s*=\s*[""'](?<href>[^""']*)[""'][^>]*>(?<ihtml>[\s\S]+?)<\s*/\s*a\s*>",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Worker-thread entry point: collects the hot-news block (downloaded over HTTP) and the
        /// city-news block (taken from the HTML captured from webBrowser1), and writes both as
        /// sheets of a timestamped .xlsx file in the application's startup directory.
        /// </summary>
        /// <remarks>
        /// Any failure is reported with a message box and traced through Debug. button1 is
        /// re-enabled whether the run succeeds or fails.
        /// </remarks>
        private void LoadData()
        {
            try
            {
                SetButton(false);
                SetLabelText("程序正在获取数据,大约5-10秒....");

                IWorkbook workbook = new XSSFWorkbook();
                // Hot news keeps the original "titles without spaces" output; see FillNewsSheet.
                FillNewsSheet(workbook.CreateSheet("热点要闻"), FetchHotNewsHtml(), true);
                FillNewsSheet(workbook.CreateSheet("城市新闻"), ExtractCityNewsHtml(), false);

                // "HH" (24-hour clock): with "hh" a morning and an afternoon run in the same
                // minute slot produced the same file name and overwrote each other.
                string excelPath = Application.StartupPath + "/" + DateTime.Now.ToString("yyyy-MM-dd_HH-mm") + ".xlsx";

                // The file is only created once all data is ready, so a failed scrape no longer
                // leaves an empty workbook behind, and the stream is closed even if Write throws.
                using (FileStream fs = File.Create(excelPath))
                {
                    workbook.Write(fs);
                }

                SetLabelText("生成完毕!" + DateTime.Now);
            }
            catch (Exception ex)
            {
                System.Diagnostics.Debug.WriteLine(ex);
                MessageBox.Show("程序发生未知错误,有3种可能!\r\r1.百度改版\r\r2.系统写入权限不足\r\r3.程序有BUG");
            }
            finally
            {
                try
                {
                    // Previously the button stayed disabled forever after a failed run.
                    SetButton(true);
                }
                catch (InvalidOperationException)
                {
                    // The form was closed while the worker was running; nothing left to re-enable.
                }
            }
        }

        // Downloads the Baidu news front page and returns the hot-news block with its wrapper markup removed.
        private string FetchHotNewsHtml()
        {
            HttpItem item = new HttpItem()
            {
                URL = "http://news.baidu.com/",
                Method = "GET",
                Cookie = "",
                Accept = "image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/msword, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-shockwave-flash, */*",
            };
            HttpResult httpresult = http.GetHtml(item);

            // Every double quote becomes a single quote and tabs / line breaks are dropped,
            // which is why all markers below are written with single-quoted attributes.
            string page = httpresult.Html;
            page = page.Replace("\"", "'").Replace("\t", "").Replace("\n", "").Replace("\r", "");

            // Presumably returns the text between the two markers - confirm against Utilities.GetMidStr.
            string hotNews = Utilities.GetMidStr(page, "<div id='pane-news' class='mod-tab-pane active'>", "</div><div id='pane-recommend'", false);

            // Removed one after another, in the same order as before.
            string[] wrapperMarkup =
            {
                "<div class='hotnews' alog-group='focustop-hotnews'>",
                "<ul class='ulist focuslistnews'>",
                "<li class='bold-item'>",
                "<i class='dot'></i>",
                "<strong>", "</strong>",
                "<span class='dot'></span>",
                "</div>", "<ul>", "</ul>",
                "<li>;", "</li>",
                "<li class='hdline0'>", "<li class='hdline1'>", "<li class='hdline2'>",
                "<li class='hdline3'>", "<li class='hdline4'>", "<li class='hdline5'>",
            };
            foreach (string markup in wrapperMarkup)
            {
                hotNews = hotNews.Replace(markup, "");
            }
            return hotNews;
        }

        // Normalises the HTML captured from webBrowser1 and returns the inner HTML of the
        // <ul id='localnews-focus'> list with its wrapper markup removed.
        private string ExtractCityNewsHtml()
        {
            // Work on a local copy: rewriting the shared 'content' field made every further
            // click wrap 'l-middle-col' in yet another pair of quotes.
            string cityPage = content.Replace("    ", "");
            cityPage = cityPage.Replace("\"", "'").Replace("\t", "").Replace("\n", "").Replace("\r", "");
            // NOTE(review): presumably compensates for unquoted attributes in the browser's innerHTML - confirm.
            cityPage = cityPage.Replace("l-middle-col", "'l-middle-col'");

            WriteFile("news.htm", cityPage, false, Encoding.UTF8);

            // NOTE(review): the file is written as UTF-8 but loaded with a gb2312 override - confirm this is intended.
            HtmlAgilityPack.HtmlWeb htmlWeb = new HtmlAgilityPack.HtmlWeb();
            htmlWeb.OverrideEncoding = Encoding.GetEncoding("gb2312");
            HtmlAgilityPack.HtmlDocument htmlDoc = htmlWeb.Load(Application.StartupPath + "/news.htm");
            HtmlAgilityPack.HtmlNode listNode = htmlDoc.DocumentNode.SelectSingleNode("//ul[@id='localnews-focus']");
            if (listNode == null)
            {
                // Fail with a descriptive exception (handled by LoadData) instead of a NullReferenceException.
                throw new InvalidOperationException("Element <ul id='localnews-focus'> was not found in the captured page.");
            }

            string cityNews = listNode.InnerHtml;
            cityNews = cityNews.Replace("<span class='dot'></span>", "").Replace("<li class='bold-item'>", "");
            cityNews = cityNews.Replace("<li>;", "").Replace("</li>", "");
            return cityNews;
        }

        // Writes the header row plus one row per "</a>"-terminated fragment of 'html':
        // running number, title text, link target and the fixed source name.
        private static void FillNewsSheet(ISheet sheet, string html, bool stripSpacesFromTitle)
        {
            IRow header = sheet.CreateRow(0);
            header.CreateCell(0).SetCellValue("序号");
            header.CreateCell(1).SetCellValue("标题");
            header.CreateCell(2).SetCellValue("地址");
            header.CreateCell(3).SetCellValue("来源");

            int rowIndex = 1;
            foreach (string fragment in Regex.Split(html, "</a>", RegexOptions.IgnoreCase))
            {
                // The split consumed the closing tag; put it back so the anchor regex can match.
                // Group names are case-sensitive: the pattern declares "href", so reading "HREF"
                // always produced an empty link.
                string href = "";
                foreach (Match match in AnchorRegex.Matches(fragment + "</a>"))
                {
                    href = match.Groups["href"].Value;
                }

                // Spaces used to be stripped from the whole block before link extraction, which
                // fused "<a href" into "<ahref" and defeated the regex. Strip them for the title only.
                string titleSource = stripSpacesFromTitle ? fragment.Replace(" ", "") : fragment;
                string title = HttpCommon.ReplaceHtml(titleSource);

                // The text after the last "</a>" is not a news item; skip rows with nothing to show.
                if (href.Length == 0 && (title == null || title.Trim().Length == 0))
                {
                    continue;
                }

                IRow row = sheet.CreateRow(rowIndex);
                row.CreateCell(0).SetCellValue(rowIndex);
                row.CreateCell(1).SetCellValue(title);
                row.CreateCell(2).SetCellValue(href);
                row.CreateCell(3).SetCellValue("百度");
                rowIndex++;
            }
        }
        /// <summary>
        /// Click handler for button2: navigates webBrowser1 to the Baidu news front page.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            const string newsUrl = "http://news.baidu.com";
            webBrowser1.Navigate(newsUrl);
        }
        /// <summary>
        /// DocumentCompleted handler for webBrowser1: queues GetDocHtml on the thread pool to
        /// capture the page HTML, then enables button1.
        /// </summary>
        private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            WaitCallback captureHtml = new WaitCallback(GetDocHtml);
            ThreadPool.QueueUserWorkItem(captureHtml);

            button1.Enabled = true;
        }
        /// <summary>
        /// Thread-pool callback: runs GetHtml through Invoke and stores the result's string
        /// form in the content field.
        /// </summary>
        /// <param name="o">Unused state object supplied by the thread pool.</param>
        void GetDocHtml(object o)
        {
            Func<string> readBody = GetHtml;
            content = this.Invoke(readBody).ToString();
        }
        /// <summary>
        /// Returns the inner HTML of the body of the document currently shown in webBrowser1.
        /// </summary>
        /// <returns>
        /// The body's inner HTML, or an empty string when there is no document, no body element,
        /// or the body has no inner HTML.
        /// </returns>
        /// <remarks>
        /// GetDocHtml calls this from a thread-pool work item, where an unhandled
        /// NullReferenceException would terminate the process; missing parts therefore yield
        /// an empty string instead of throwing.
        /// </remarks>
        private string GetHtml()
        {
            var document = webBrowser1.Document;
            if (document == null)
            {
                return string.Empty;
            }

            var body = document.Body;
            if (body == null)
            {
                return string.Empty;
            }

            return body.InnerHtml ?? string.Empty;
        }
    }
}

下 载 人已下载

下载说明:

1、解压密码:无

2、只有部分模板会提供多页面下载,未加说明都是只有一个首页index.html模板。

3、如果您发现文件有错,或者您有其他更好的意见、建议请给我们留言,我们会及时处理!

4、如果您遇到什么问题,也可加入本站QQ1828984798咨询!

*下载本站资源以及作品仅供学习研究之用,若发现任何组织机构及个人有用于商业目的者,必追究其法律责任*

评论 --
  • 消灭零回复