基于C#实现网页爬虫

				 
	本文实例为大家分享了基于C#实现网页爬虫的详细代码，供大家参考，具体内容如下：

	HTTP请求工具类：

	功能：

	1、获取网页html

	2、下载网络图片

				 ? 

									 using   System; 

									 using   System.Collections.Generic; 

									 using   System.IO; 

									 using   System.Linq; 

									 using   System.Net; 

									 using   System.Text; 

									 using   System.Threading.Tasks; 

									 using   System.Windows.Forms; 

									 namespace Utils
									 {
									     /// <summary>
									     /// HTTP helper methods: fetch the HTML of a page and download a remote
									     /// file into the application's "download" directory.
									     /// </summary>
									     public class HttpRequestUtil
									     {
									         /// <summary>User-Agent header sent with every request.</summary>
									         private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

									         /// <summary>
									         /// Downloads the body of <paramref name="url"/> and returns it decoded as UTF-8 text.
									         /// </summary>
									         /// <param name="url">Absolute http/https URL of the page.</param>
									         /// <returns>The response body as a string.</returns>
									         /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
									         /// <exception cref="ArgumentException"><paramref name="url"/> is not an http/https URL.</exception>
									         /// <exception cref="WebException">The request failed.</exception>
									         public static string GetPageHtml(string url)
									         {
									             HttpWebRequest request = CreateRequest(url);

									             // Nothing is sent until GetResponse() is called; the method is left at its
									             // default, so this is a GET request. The using blocks release the
									             // connection even when reading fails.
									             using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
									             using (Stream responseStream = response.GetResponseStream())
									             using (StreamReader sr = new StreamReader(responseStream, Encoding.UTF8))
									             {
									                 return sr.ReadToEnd();
									             }
									         }

									         /// <summary>
									         /// Downloads <paramref name="url"/> into "&lt;startup path&gt;\download\&lt;file name&gt;".
									         /// Does nothing when a file with that name already exists.
									         /// </summary>
									         /// <param name="url">Absolute http/https URL whose last path segment is the file name.</param>
									         /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
									         /// <exception cref="ArgumentException">
									         /// <paramref name="url"/> is not an http/https URL or does not end in a file name.
									         /// </exception>
									         /// <exception cref="WebException">The request failed.</exception>
									         /// <exception cref="IOException">The local file could not be written.</exception>
									         public static void HttpDownloadFile(string url)
									         {
									             if (url == null)
									             {
									                 throw new ArgumentNullException("url");
									             }

									             string fileName = GetFileNameFromUrl(url);
									             string path = Path.Combine(Application.StartupPath, "download");

									             // CreateDirectory is a no-op when the directory already exists.
									             Directory.CreateDirectory(path);

									             string filePathName = Path.Combine(path, fileName);
									             if (File.Exists(filePathName))
									             {
									                 return;
									             }

									             HttpWebRequest request = CreateRequest(url);
									             request.Proxy = null;

									             // Write to a uniquely named temporary file first, so that an interrupted
									             // download never leaves a truncated file under the final name (which
									             // later calls would skip as "already downloaded").
									             string tempPathName = filePathName + "." + Guid.NewGuid().ToString("N") + ".part";
									             try
									             {
									                 using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
									                 using (Stream responseStream = response.GetResponseStream())
									                 using (FileStream fileStream = new FileStream(tempPathName, FileMode.Create, FileAccess.Write, FileShare.None))
									                 {
									                     responseStream.CopyTo(fileStream);
									                 }

									                 // Another caller may have finished the same file in the meantime.
									                 if (!File.Exists(filePathName))
									                 {
									                     File.Move(tempPathName, filePathName);
									                 }
									             }
									             finally
									             {
									                 // After a successful move the temporary path no longer exists and
									                 // Delete does nothing; otherwise this removes the partial download.
									                 File.Delete(tempPathName);
									             }
									         }

									         /// <summary>
									         /// Creates an HTTP request for <paramref name="url"/> with the shared User-Agent set.
									         /// </summary>
									         private static HttpWebRequest CreateRequest(string url)
									         {
									             if (url == null)
									             {
									                 throw new ArgumentNullException("url");
									             }

									             // WebRequest.Create returns a non-HTTP request type for schemes such as
									             // file: or ftp:, in which case the cast yields null.
									             HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
									             if (request == null)
									             {
									                 throw new ArgumentException("Only http/https URLs are supported: " + url, "url");
									             }

									             request.UserAgent = UserAgent;
									             return request;
									         }

									         /// <summary>
									         /// Extracts the local file name from the last path segment of <paramref name="url"/>,
									         /// ignoring any query string or fragment.
									         /// </summary>
									         private static string GetFileNameFromUrl(string url)
									         {
									             int cut = url.IndexOfAny(new char[] { '?', '#' });
									             string withoutQuery = cut >= 0 ? url.Substring(0, cut) : url;
									             string lastSegment = withoutQuery.Substring(withoutQuery.LastIndexOf('/') + 1);

									             // The URL comes from scraped pages: Path.GetFileName drops any directory
									             // part (e.g. "..\") so the file cannot land outside the download folder.
									             string fileName = Path.GetFileName(lastSegment);
									             if (fileName.Length == 0)
									             {
									                 throw new ArgumentException("URL does not end in a file name: " + url, "url");
									             }

									             return fileName;
									         }
									     }
									 }

	多线程爬取网页代码：

				 ? 

									 using   System; 

									 using   System.Collections.Generic; 

									 using   System.ComponentModel; 

									 using   System.Data; 

									 using   System.Drawing; 

									 using   System.IO; 

									 using   System.Linq; 

									 using   System.Text; 

									 using   System.Text.RegularExpressions; 

									 using   System.Threading; 

									 using   System.Threading.Tasks; 

									 using   System.Windows.Forms; 

									 using   Utils; 

									 namespace 爬虫
									 {
									     /// <summary>
									     /// Main window of the crawler. button1 starts a crawl on several worker
									     /// threads, button2 stops it, button3 pauses it and button4 resumes it.
									     /// Stop and pause are cooperative: a worker reacts at its next checkpoint,
									     /// i.e. after the HTTP request it is currently running has finished.
									     /// </summary>
									     public partial class Form1 : Form
									     {
									         /// <summary>Scheme and host prepended to every crawled path.</summary>
									         private const string SiteRoot = "http://tt.mop测试数据";

									         /// <summary>Number of worker threads started per crawl.</summary>
									         private const int WorkerCount = 10;

									         /// <summary>Number of list pages each worker processes.</summary>
									         private const int PagesPerWorker = 10;

									         // Regex instances are thread-safe for matching, so one shared instance
									         // replaces the per-page construction.
									         private static readonly Regex regA = new Regex("<a[\\s]+class=\"J-userPic([^<>]*?)[\\s]+href=\"([^\"]*?)\"");
									         private static readonly Regex regImg = new Regex("<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"");

									         /// <summary>Guards <see cref="paused"/> and is pulsed whenever it changes.</summary>
									         private readonly object pauseLock = new object();

									         /// <summary>True while workers should wait; only accessed under <see cref="pauseLock"/>.</summary>
									         private bool paused;

									         /// <summary>State of the most recently started crawl; only touched on the UI thread.</summary>
									         private CrawlRun currentRun;

									         /// <summary>
									         /// Counters and cancellation flag of one crawl. Each click on the start button
									         /// gets its own instance, so workers of a stopped crawl never affect a new one.
									         /// </summary>
									         private sealed class CrawlRun
									         {
									             /// <summary>Time the crawl was started; basis for the speed display.</summary>
									             public readonly DateTime Started = DateTime.Now;

									             /// <summary>Set to true to make the workers of this crawl exit.</summary>
									             public volatile bool Cancelled;

									             /// <summary>Finished list pages; modified only through Interlocked.</summary>
									             public int Pages;

									             /// <summary>Fetched detail pages; modified only through Interlocked.</summary>
									             public int Persons;

									             /// <summary>Downloaded images; modified only through Interlocked.</summary>
									             public int Images;
									         }

									         public Form1()
									         {
									             InitializeComponent();
									         }

									         /// <summary>
									         /// Start button: resets the display and starts the worker threads.
									         /// </summary>
									         private void button1_Click(object sender, EventArgs e)
									         {
									             button3.Enabled = true;
									             button2.Enabled = true;
									             button1.Enabled = false;
									             lblPage.Text = "已完成页数：0";

									             // Retire workers that may still be finishing a request from an earlier crawl.
									             CancelCurrentRun();

									             CrawlRun run = new CrawlRun();
									             currentRun = run;

									             for (int i = 1; i <= WorkerCount; i++)
									             {
									                 // Copy the loop variable so every worker captures its own number.
									                 int workerNumber = i;
									                 Thread worker = new Thread(new ThreadStart(delegate()
									                 {
									                     CrawlPages(workerNumber, run);
									                 }));

									                 // Background threads do not keep the process alive after the form closes.
									                 worker.IsBackground = true;
									                 worker.Start();
									             }
									         }

									         /// <summary>
									         /// Stop button: cancels the running crawl and resets the button states.
									         /// </summary>
									         private void button2_Click(object sender, EventArgs e)
									         {
									             CancelCurrentRun();
									             button1.Enabled = true;
									             button2.Enabled = false;
									             button3.Enabled = false;
									             button4.Enabled = false;
									         }

									         /// <summary>
									         /// Cancels the running crawl when the form is closing.
									         /// </summary>
									         private void Form1_FormClosing(object sender, FormClosingEventArgs e)
									         {
									             CancelCurrentRun();
									         }

									         /// <summary>
									         /// Pause button: makes the workers wait at their next checkpoint.
									         /// </summary>
									         private void button3_Click(object sender, EventArgs e)
									         {
									             SetPaused(true);
									             button3.Enabled = false;
									             button4.Enabled = true;
									         }

									         /// <summary>
									         /// Resume button: releases workers waiting because of a pause.
									         /// </summary>
									         private void button4_Click(object sender, EventArgs e)
									         {
									             SetPaused(false);
									             button3.Enabled = true;
									             button4.Enabled = false;
									         }

									         /// <summary>
									         /// Worker thread body: processes the list pages assigned to
									         /// <paramref name="workerNumber"/> (1-based) and reports progress.
									         /// </summary>
									         private void CrawlPages(int workerNumber, CrawlRun run)
									         {
									             for (int j = 1; j <= PagesPerWorker; j++)
									             {
									                 if (!WaitWhilePaused(run))
									                 {
									                     return;
									                 }

									                 // Local variable: a shared index could be overwritten by another
									                 // worker between computing it and building the URL.
									                 int pageIndex = (workerNumber - 1) * PagesPerWorker + j;
									                 try
									                 {
									                     CrawlListPage(pageIndex, run);
									                 }
									                 catch (Exception ex)
									                 {
									                     // Best effort: a failed page is skipped, but the error is traced
									                     // instead of being silently discarded.
									                     System.Diagnostics.Trace.WriteLine(ex);
									                 }

									                 if (run.Cancelled)
									                 {
									                     return;
									                 }

									                 // The page counts as finished even when it failed, as before.
									                 int finished = Interlocked.Increment(ref run.Pages);
									                 PostToUi(run, delegate()
									                 {
									                     lblPage.Text = "已完成页数：" + ReadCounter(ref run.Pages).ToString();
									                 });

									                 // Interlocked.Increment returns each value exactly once, so exactly
									                 // one worker observes the final page.
									                 if (finished == WorkerCount * PagesPerWorker)
									                 {
									                     PostToUi(run, delegate()
									                     {
									                         button1.Enabled = true;
									                         MessageBox.Show("完成！");
									                     });
									                 }
									             }
									         }

									         /// <summary>
									         /// Fetches one list page, follows every site-relative user link on it and
									         /// downloads the matching images found on each linked page.
									         /// </summary>
									         private void CrawlListPage(int pageIndex, CrawlRun run)
									         {
									             string pageHtml = HttpRequestUtil.GetPageHtml(SiteRoot + "/c44/0/1_" + pageIndex.ToString() + ".html");

									             foreach (Match match in regA.Matches(pageHtml))
									             {
									                 if (!WaitWhilePaused(run))
									                 {
									                     return;
									                 }

									                 // Group 2 is the href value captured by the pattern.
									                 string url = match.Groups[2].Value;
									                 if (!url.StartsWith("/", StringComparison.Ordinal))
									                 {
									                     continue;
									                 }

									                 string imgPageHtml = HttpRequestUtil.GetPageHtml(SiteRoot + url);
									                 Interlocked.Increment(ref run.Persons);
									                 PostToUi(run, delegate()
									                 {
									                     lblPerson.Text = "已完成条数：" + ReadCounter(ref run.Persons).ToString();
									                 });

									                 foreach (Match imgMatch in regImg.Matches(imgPageHtml))
									                 {
									                     if (!WaitWhilePaused(run))
									                     {
									                         return;
									                     }

									                     // Group 1 is the src value captured by the pattern.
									                     string imgUrl = imgMatch.Groups[1].Value;
									                     if (!imgUrl.StartsWith("http://i1", StringComparison.Ordinal))
									                     {
									                         continue;
									                     }

									                     DownloadImage(imgUrl, run);
									                     Thread.Sleep(1);
									                 }
									             }
									         }

									         /// <summary>
									         /// Downloads one image and, on success, updates the image count and speed labels.
									         /// </summary>
									         private void DownloadImage(string imgUrl, CrawlRun run)
									         {
									             try
									             {
									                 HttpRequestUtil.HttpDownloadFile(imgUrl);
									             }
									             catch (Exception ex)
									             {
									                 // Best effort: a failed image must not abort the rest of the page.
									                 System.Diagnostics.Trace.WriteLine(ex);
									                 return;
									             }

									             Interlocked.Increment(ref run.Images);
									             PostToUi(run, delegate()
									             {
									                 int images = ReadCounter(ref run.Images);
									                 lblNum.Text = "已下载图片数" + images.ToString();

									                 double time = DateTime.Now.Subtract(run.Started).TotalSeconds;
									                 if (time > 0)
									                 {
									                     lblSpeed.Text = "速度：" + (images / time).ToString("0.0") + "张/秒";
									                 }
									             });
									         }

									         /// <summary>
									         /// Checkpoint for workers: blocks while the crawl is paused.
									         /// </summary>
									         /// <returns>False when <paramref name="run"/> was cancelled and the worker must exit.</returns>
									         private bool WaitWhilePaused(CrawlRun run)
									         {
									             lock (pauseLock)
									             {
									                 while (paused && !run.Cancelled)
									                 {
									                     Monitor.Wait(pauseLock);
									                 }
									             }

									             return !run.Cancelled;
									         }

									         /// <summary>
									         /// Sets the pause flag and wakes every waiting worker so it re-checks its state.
									         /// </summary>
									         private void SetPaused(bool value)
									         {
									             lock (pauseLock)
									             {
									                 paused = value;
									                 Monitor.PulseAll(pauseLock);
									             }
									         }

									         /// <summary>
									         /// Marks the current crawl, if any, as cancelled and releases paused workers
									         /// so that they notice the cancellation.
									         /// </summary>
									         private void CancelCurrentRun()
									         {
									             if (currentRun != null)
									             {
									                 currentRun.Cancelled = true;
									             }

									             SetPaused(false);
									         }

									         /// <summary>
									         /// Queues <paramref name="update"/> on the UI thread without blocking the worker.
									         /// The update is dropped when the form is gone or <paramref name="run"/> was cancelled.
									         /// </summary>
									         private void PostToUi(CrawlRun run, Action update)
									         {
									             if (IsDisposed || !IsHandleCreated)
									             {
									                 return;
									             }

									             try
									             {
									                 BeginInvoke(new Action(delegate()
									                 {
									                     // Checked on the UI thread so a stopped crawl cannot overwrite
									                     // the labels of a newer one.
									                     if (!run.Cancelled)
									                     {
									                         update();
									                     }
									                 }));
									             }
									             catch (InvalidOperationException)
									             {
									                 // The form was closed between the check above and BeginInvoke
									                 // (ObjectDisposedException derives from InvalidOperationException).
									             }
									         }

									         /// <summary>
									         /// Reads a counter that other threads modify through Interlocked.
									         /// </summary>
									         private static int ReadCounter(ref int counter)
									         {
									             // Compare-exchange with equal values changes nothing and returns the current value.
									             return Interlocked.CompareExchange(ref counter, 0, 0);
									         }
									     }
									 }

	截图：

	以上就是本文的全部内容，希望对大家的学习有所帮助。

			 dy("nrwz"); 
			
查看更多关于基于C#实现网页爬虫的详细内容...
声明：本文来自网络，不代表【好得很程序员自学网】立场，转载请注明出处：http://haodehen.cn/did57152
更新时间：2022-09-26 阅读：45次