好得很程序员自学网

<tfoot draggable='sEl'></tfoot>

基于C#实现网页爬虫

本文通过实例分享基于 C# 实现网页爬虫的详细代码,供大家参考,具体内容如下。

HTTP请求工具类:

功能:

1、获取网页html

2、下载网络图片

代码如下:

using System;

using System.Collections.Generic;

using System.IO;

using System.Linq;

using System.Net;

using System.Text;

using System.Threading.Tasks;

using System.Windows.Forms;

 

namespace Utils

{

   /// <summary>

   /// HTTP请求工具类

   /// </summary>

   public class HttpRequestUtil

   {

     /// <summary>

     /// 获取页面html

     /// </summary>

     public static string GetPageHtml( string url)

     {

       // 设置参数

       HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;

       request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)" ;

       //发送请求并获取相应回应数据

       HttpWebResponse response = request.GetResponse() as HttpWebResponse;

       //直到request.GetResponse()程序才开始向目标网页发送Post请求

       Stream responseStream = response.GetResponseStream();

       StreamReader sr = new StreamReader(responseStream, Encoding.UTF8);

       //返回结果网页(html)代码

       string content = sr.ReadToEnd();

       return content;

     }

 

     /// <summary>

     /// Http下载文件

     /// </summary>

     public static void HttpDownloadFile( string url)

     {

       int pos = url.LastIndexOf( "/" ) + 1;

       string fileName = url.Substring(pos);

       string path = Application.StartupPath + "\\download" ;

       if (!Directory.Exists(path))

       {

         Directory.CreateDirectory(path);

       }

       string filePathName = path + "\\" + fileName;

       if (File.Exists(filePathName)) return ;

 

       // 设置参数

       HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;

       request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)" ;

       request.Proxy = null ;

       //发送请求并获取相应回应数据

       HttpWebResponse response = request.GetResponse() as HttpWebResponse;

       //直到request.GetResponse()程序才开始向目标网页发送Post请求

       Stream responseStream = response.GetResponseStream();

 

       //创建本地文件写入流

       Stream stream = new FileStream(filePathName, FileMode.Create);

 

       byte [] bArr = new byte [1024];

       int size = responseStream.Read(bArr, 0, ( int )bArr.Length);

       while (size > 0)

       {

         stream.Write(bArr, 0, size);

         size = responseStream.Read(bArr, 0, ( int )bArr.Length);

       }

       stream.Close();

       responseStream.Close();

     }

   }

}

多线程爬取网页代码:

?

using System;

using System.Collections.Generic;

using System.ComponentModel;

using System.Data;

using System.Drawing;

using System.IO;

using System.Linq;

using System.Text;

using System.Text.RegularExpressions;

using System.Threading;

using System.Threading.Tasks;

using System.Windows.Forms;

using Utils;

 

namespace 爬虫

{

   public partial class Form1 : Form

   {

     List<Thread> threadList = new List<Thread>();

     Thread thread = null ;

 

     public Form1()

     {

       InitializeComponent();

     }

 

     private void button1_Click( object sender, EventArgs e)

     {

       DateTime dtStart = DateTime.Now;

       button3.Enabled = true ;

       button2.Enabled = true ;

       button1.Enabled = false ;

       int page = 0;

       int count = 0;

       int personCount = 0;

       lblPage.Text = "已完成页数:0" ;

       int index = 0;

 

       for ( int i = 1; i <= 10; i++)

       {

         thread = new Thread( new ParameterizedThreadStart( delegate ( object obj)

         {

           for ( int j = 1; j <= 10; j++)

           {

             try

             {

               index = (Convert.ToInt32(obj) - 1) * 10 + j;

               string pageHtml = HttpRequestUtil.GetPageHtml( "http://tt.mop测试数据/c44/0/1_" + index.ToString() + ".html" );

               Regex regA = new Regex( "<a[\\s]+class=\"J-userPic([^<>]*?)[\\s]+href=\"([^\"]*?)\"" );

               Regex regImg = new Regex( "<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"" );

               MatchCollection mc = regA.Matches(pageHtml);

               foreach (Match match in mc)

               {

                 int start = match.ToString().IndexOf( "href=\"" );

                 string url = match.ToString().Substring(start + 6);

                 int end = url.IndexOf( "\"" );

                 url = url.Substring(0, end);

                 if (url.IndexOf( "/" ) == 0)

                 {

                   string imgPageHtml = HttpRequestUtil.GetPageHtml( "http://tt.mop测试数据" + url);

                   personCount++;

                   lblPerson.Invoke( new Action( delegate () { lblPerson.Text = "已完成条数:" + personCount.ToString(); }));

                   MatchCollection mcImgPage = regImg.Matches(imgPageHtml);

                   foreach (Match matchImgPage in mcImgPage)

                   {

                     start = matchImgPage.ToString().IndexOf( "src=\"" );

                     string imgUrl = matchImgPage.ToString().Substring(start + 5);

                     end = imgUrl.IndexOf( "\"" );

                     imgUrl = imgUrl.Substring(0, end);

                     if (imgUrl.IndexOf( "http://i1" ) == 0)

                     {

                       try

                       {

                         HttpRequestUtil.HttpDownloadFile(imgUrl);

                         count++;

                         lblNum.Invoke( new Action( delegate ()

                         {

                           lblNum.Text = "已下载图片数" + count.ToString();

                           DateTime dt = DateTime.Now;

                           double time = dt.Subtract(dtStart).TotalSeconds;

                           if (time > 0)

                           {

                             lblSpeed.Text = "速度:" + (count / time).ToString( "0.0" ) + "张/秒" ;

                           }

                         }));

                       }

                       catch { }

                       Thread.Sleep(1);

                     }

                   }

                 }

               }

             }

             catch { }

             page++;

             lblPage.Invoke( new Action( delegate () { lblPage.Text = "已完成页数:" + page.ToString(); }));

 

             if (page == 100)

             {

               button1.Invoke( new Action( delegate () { button1.Enabled = true ; }));

               MessageBox.Show( "完成!" );

             }

           }

         }));

         thread.Start(i);

         threadList.Add(thread);

       }

     }

 

     private void button2_Click( object sender, EventArgs e)

     {

       button1.Invoke( new Action( delegate ()

       {

         foreach (Thread thread in threadList)

         {

           if (thread.ThreadState == ThreadState.Suspended)

           {

             thread.Resume();

           }

           thread.Abort();

         }

         button1.Enabled = true ;

         button2.Enabled = false ;

         button3.Enabled = false ;

         button4.Enabled = false ;

       }));

     }

 

     private void Form1_FormClosing( object sender, FormClosingEventArgs e)

     {

       foreach (Thread thread in threadList)

       {

         thread.Abort();

       }

     }

 

     private void button3_Click( object sender, EventArgs e)

     {

       foreach (Thread thread in threadList)

       {

         if (thread.ThreadState == ThreadState.Running)

         {

           thread.Suspend();

         }

       }

       button3.Enabled = false ;

       button4.Enabled = true ;

     }

 

     private void button4_Click( object sender, EventArgs e)

     {

       foreach (Thread thread in threadList)

       {

         if (thread.ThreadState == ThreadState.Suspended)

         {

           thread.Resume();

         }

       }

       button3.Enabled = true ;

       button4.Enabled = false ;

     }

   }

}

截图:

以上就是本文的全部内容,希望对大家的学习有所帮助。

dy("nrwz");

查看更多关于基于C#实现网页爬虫的详细内容...

  阅读:45次