
Building an Enhanced Multithreaded Web Crawler in C#

Last time I threw together a crawler for a colleague at the office, and it wasn't very polished. This time a company project actually needs one, so I gave it another round of work: it now collects image URLs from pages, downloads the images, and runs the downloads on worker threads with the progress shown in the UI.

The idea: first fetch the entire content of the seed URL, collect the images on that page, then collect the links on that page, push those links into a queue, and keep going — collect images, collect links — in an endless loop.
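
A minimal sketch of that loop, as I'd write it (my own illustration, not verbatim from the project; it assumes the HttpHelper methods shown later in the post):

using System.Collections.Generic;

public static class Crawler
{
    // Breadth-first crawl: pull a page off the queue, collect its images,
    // then collect its links and enqueue the ones we haven't seen yet.
    public static void Crawl(string seedUrl)
    {
        Queue<string> pending = new Queue<string>();
        HashSet<string> visited = new HashSet<string>();
        pending.Enqueue(seedUrl);

        while (pending.Count > 0)
        {
            string page = pending.Dequeue();
            if (!visited.Add(page)) continue; // already crawled

            // Images found on this page (handed off to the downloader)
            List<string> images = HttpHelper.GetHtmlImageUrlList(page);

            // Enqueue unseen links so the crawl keeps going
            foreach (string link in HttpHelper.GetHttpLinks(page))
            {
                if (!visited.Contains(link)) pending.Enqueue(link);
            }
        }
    }
}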


Both the page-content fetching and the URL crawling have been improved. Here's the code; where it falls short, please point it out!

Page content fetching: HtmlCodeRequest.

URL crawling: GetHttpLinks, which uses a regex to sift the links out of the HTML.

Image scraping: GetHtmlImageUrlList, which uses a regex to sift the img tags out of the HTML.

All three are wrapped in a single helper class, HttpHelper.


/// <summary>
/// Fetches the full HTML of a page.
/// </summary>
/// <param name="url">The page URL.</param>
/// <returns>The HTML source, or an empty string on failure.</returns>
public static string HtmlCodeRequest(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return "";
    }
    try
    {
        // Create the request
        HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(url);
        // Keep the connection alive so it can be reused across requests
        httpRequest.KeepAlive = true;
        // Set the request method
        httpRequest.Method = "GET";
        // Set the header values; note the UserAgent property takes only the
        // value, without a "User-Agent:" prefix
        httpRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        httpRequest.Accept = "*/*";
        httpRequest.Headers.Add("Accept-Language", "zh-CN,en-US;q=0.5");
        httpRequest.ServicePoint.Expect100Continue = false;
        httpRequest.Timeout = 5000;
        httpRequest.AllowAutoRedirect = true; // follow 302 redirects
        ServicePointManager.DefaultConnectionLimit = 30;
        // Get the response
        HttpWebResponse webRes = (HttpWebResponse)httpRequest.GetResponse();
        // Read the response stream as text
        string content = string.Empty;
        using (System.IO.Stream stream = webRes.GetResponseStream())
        {
            using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, System.Text.Encoding.GetEncoding("utf-8")))
            {
                content = reader.ReadToEnd();
            }
        }
        // Tear down the request
        httpRequest.Abort();
        // Return the page content
        return content;
    }
    catch (Exception)
    {
        return "";
    }
}

/// <summary>
/// Extracts the URL of every image on a page.
/// </summary>
/// <param name="url">The page URL.</param>
/// <returns>A list of image URLs.</returns>
public static List<string> GetHtmlImageUrlList(string url)
{
    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }
    // Regex that matches img tags and captures their src attribute
    Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

    // Run the regex over the HTML
    MatchCollection matches = regImg.Matches(html);
    List<string> sUrlList = new List<string>();

    // Collect the captured src values
    foreach (Match match in matches)
        sUrlList.Add(match.Groups["imgUrl"].Value);
    return sUrlList;
}
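
One caveat: the captured src values can be relative paths. Here is a small sketch of resolving them against the page URL before downloading (my own addition; UrlNormalizer and ToAbsolute are illustrative names, not part of the project):

using System;
using System.Collections.Generic;

public static class UrlNormalizer
{
    // Resolve each src against the page it came from, e.g.
    // ("http://example.com/news/", "../img/a.jpg") -> "http://example.com/img/a.jpg"
    public static List<string> ToAbsolute(string pageUrl, List<string> srcList)
    {
        List<string> result = new List<string>();
        Uri baseUri = new Uri(pageUrl);
        foreach (string src in srcList)
        {
            // Uri.TryCreate handles absolute, root-relative and relative paths alike
            Uri absolute;
            if (Uri.TryCreate(baseUri, src, out absolute))
            {
                result.Add(absolute.ToString());
            }
        }
        return result;
    }
}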

 

 

/// <summary>
/// Extracts the links on a page.
/// </summary>
/// <param name="url">The page URL.</param>
/// <returns>A list of links.</returns>
public static List<string> GetHttpLinks(string url)
{
    // Fetch the page content
    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }
    // Match bare http(s) links
    const string pattern2 = @"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r2 = new Regex(pattern2, RegexOptions.IgnoreCase);
    // Collect the matches
    MatchCollection m2 = r2.Matches(html);
    List<string> links = new List<string>();
    foreach (Match url2 in m2)
    {
        // Skip URLs rejected by the helper checks, and duplicates
        if (StringHelper.CheckUrlIsLegal(url2.ToString()) || !StringHelper.IsPureUrl(url2.ToString()) || links.Contains(url2.ToString()))
            continue;
        links.Add(url2.ToString());
    }
    // Match links inside href attributes
    const string pattern = @"(?i)<a\s[^>]*?href=(['""]?)(?!javascript|__dopostback)(?<url>[^'""\s*#<>]+)[^>]*>";
    Regex r = new Regex(pattern, RegexOptions.IgnoreCase);
    // Collect the matches
    MatchCollection m = r.Matches(html);
    foreach (Match url1 in m)
    {
        string href1 = url1.Groups["url"].Value;
        // Prefix relative hrefs with the site root
        if (!href1.Contains("http"))
        {
            href1 = Global.WebUrl + href1;
        }
        if (!StringHelper.IsPureUrl(href1) || links.Contains(href1)) continue;
        links.Add(href1);
    }
    return links;
}

There's a cap on queued image-download tasks here: 200. If the backlog goes over that, the producer thread waits five seconds. The download itself is invoked through an asynchronous delegate; a sketch of how that dispatch can be wired up follows the method below.


public string DownloadImg(string url)
{
    if (!string.IsNullOrEmpty(url))
    {
        try
        {
            // Prefix relative URLs with the site root
            if (!url.Contains("http"))
            {
                url = Global.WebUrl + url;
            }
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 2000;
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
            // Follow 302 redirects
            request.AllowAutoRedirect = true;
            WebResponse response = request.GetResponse();
            Stream reader = response.GetResponseStream();
            // File name
            string aFirstName = Guid.NewGuid().ToString();
            // Extension
            string aLastName = url.Substring(url.LastIndexOf(".") + 1, (url.Length - url.LastIndexOf(".") - 1));
            FileStream writer = new FileStream(Global.FloderUrl + aFirstName + "." + aLastName, FileMode.OpenOrCreate, FileAccess.Write);
            byte[] buff = new byte[512];
            // Number of bytes actually read
            int c = 0;
            while ((c = reader.Read(buff, 0, buff.Length)) > 0)
            {
                writer.Write(buff, 0, c);
            }
            writer.Close();
            writer.Dispose();
            reader.Close();
            reader.Dispose();
            response.Close();
            return (aFirstName + "." + aLastName);
        }
        catch (Exception)
        {
            return "Error: URL " + url;
        }
    }
    return "Error: empty URL";
}
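
For the throttling itself, here is one way the dispatch could look. This is a sketch under my own assumptions: DownloadImg lives on HttpHelper as an instance method, and delegate BeginInvoke is available (i.e. .NET Framework; it is not supported on .NET Core/5+):

using System.Threading;

public class DownloadDispatcher
{
    private delegate string DownloadHandler(string url);
    private int pendingCount = 0;       // downloads currently in flight
    private const int MaxTasks = 200;   // the 200-task cap described above

    public void Dispatch(string imgUrl, HttpHelper helper)
    {
        // Back off while the backlog sits at the cap
        while (pendingCount >= MaxTasks)
        {
            Thread.Sleep(5000); // the five-second wait described above
        }
        Interlocked.Increment(ref pendingCount);
        // Invoke DownloadImg asynchronously through a delegate
        DownloadHandler handler = helper.DownloadImg;
        handler.BeginInvoke(imgUrl, ar =>
        {
            string savedName = handler.EndInvoke(ar);
            Interlocked.Decrement(ref pendingCount);
            // savedName could be marshalled back to the UI here
        }, null);
    }
}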

That's all for now; further improvements are left to you! Readers are welcome to get in touch and trade ideas.
