
Building an Enhanced Multithreaded Web Crawler in C#

Last time I threw together a crawler for a colleague at the office, and it wasn't very polished. This time a company project actually needs one, so I gave it another round of work: it now collects image URLs from pages, downloads the images, and runs the downloads on worker threads with the progress shown in the UI.

The idea: first fetch the entire content of the seed URL, collect the images on that page, then collect the links on that page, push those links into a queue, and keep going — collect images, collect links — in an endless loop.
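
A minimal sketch of that loop, as I'd write it (my own illustration, not verbatim from the project; it assumes the HttpHelper methods shown later in the post):

using System.Collections.Generic;

public static class Crawler
{
    // Breadth-first crawl: pull a page off the queue, collect its images,
    // then collect its links and enqueue the ones we haven't seen yet.
    public static void Crawl(string seedUrl)
    {
        Queue<string> pending = new Queue<string>();
        HashSet<string> visited = new HashSet<string>();
        pending.Enqueue(seedUrl);

        while (pending.Count > 0)
        {
            string page = pending.Dequeue();
            if (!visited.Add(page)) continue; // already crawled

            // Images found on this page (handed off to the downloader)
            List<string> images = HttpHelper.GetHtmlImageUrlList(page);

            // Enqueue unseen links so the crawl keeps going
            foreach (string link in HttpHelper.GetHttpLinks(page))
            {
                if (!visited.Contains(link)) pending.Enqueue(link);
            }
        }
    }
}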


Both the page-content fetching and the URL crawling have been improved. Here's the code; where it falls short, please point it out!

Page content fetching: HtmlCodeRequest.

URL crawling: GetHttpLinks, which uses a regex to sift the links out of the HTML.

Image scraping: GetHtmlImageUrlList, which uses a regex to sift the img tags out of the HTML.

All three are wrapped in a single helper class, HttpHelper.


/// <summary>
/// Fetches the full HTML of a page.
/// </summary>
/// <param name="url">The page URL.</param>
/// <returns>The HTML source, or an empty string on failure.</returns>
public static string HtmlCodeRequest(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return "";
    }
    try
    {
        // Create the request
        HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(url);
        // Keep the connection alive so it can be reused across requests
        httpRequest.KeepAlive = true;
        // Set the request method
        httpRequest.Method = "GET";
        // Set the header values; note the UserAgent property takes only the
        // value, without a "User-Agent:" prefix
        httpRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        httpRequest.Accept = "*/*";
        httpRequest.Headers.Add("Accept-Language", "zh-CN,en-US;q=0.5");
        httpRequest.ServicePoint.Expect100Continue = false;
        httpRequest.Timeout = 5000;
        httpRequest.AllowAutoRedirect = true; // follow 302 redirects
        ServicePointManager.DefaultConnectionLimit = 30;
        // Get the response
        HttpWebResponse webRes = (HttpWebResponse)httpRequest.GetResponse();
        // Read the response stream as text
        string content = string.Empty;
        using (System.IO.Stream stream = webRes.GetResponseStream())
        {
            using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, System.Text.Encoding.GetEncoding("utf-8")))
            {
                content = reader.ReadToEnd();
            }
        }
        // Tear down the request
        httpRequest.Abort();
        // Return the page content
        return content;
    }
    catch (Exception)
    {
        return "";
    }
}

/// <summary>
/// Extracts the URL of every image on a page.
/// </summary>
/// <param name="url">The page URL.</param>
/// <returns>A list of image URLs.</returns>
public static List<string> GetHtmlImageUrlList(string url)
{
    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }
    // Regex that matches img tags and captures their src attribute
    Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

    // Run the regex over the HTML
    MatchCollection matches = regImg.Matches(html);
    List<string> sUrlList = new List<string>();

    // Collect the captured src values
    foreach (Match match in matches)
        sUrlList.Add(match.Groups["imgUrl"].Value);
    return sUrlList;
}
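
One caveat: the captured src values can be relative paths. Here is a small sketch of resolving them against the page URL before downloading (my own addition; UrlNormalizer and ToAbsolute are illustrative names, not part of the project):

using System;
using System.Collections.Generic;

public static class UrlNormalizer
{
    // Resolve each src against the page it came from, e.g.
    // ("http://example.com/news/", "../img/a.jpg") -> "http://example.com/img/a.jpg"
    public static List<string> ToAbsolute(string pageUrl, List<string> srcList)
    {
        List<string> result = new List<string>();
        Uri baseUri = new Uri(pageUrl);
        foreach (string src in srcList)
        {
            // Uri.TryCreate handles absolute, root-relative and relative paths alike
            Uri absolute;
            if (Uri.TryCreate(baseUri, src, out absolute))
            {
                result.Add(absolute.ToString());
            }
        }
        return result;
    }
}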

 

 

/// <summary>
/// Extracts the links on a page.
/// </summary>
/// <param name="url">The page URL.</param>
/// <returns>A list of links.</returns>
public static List<string> GetHttpLinks(string url)
{
    // Fetch the page content
    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }
    // Match bare http(s) links
    const string pattern2 = @"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r2 = new Regex(pattern2, RegexOptions.IgnoreCase);
    // Collect the matches
    MatchCollection m2 = r2.Matches(html);
    List<string> links = new List<string>();
    foreach (Match url2 in m2)
    {
        // Skip URLs rejected by the helper checks, and duplicates
        if (StringHelper.CheckUrlIsLegal(url2.ToString()) || !StringHelper.IsPureUrl(url2.ToString()) || links.Contains(url2.ToString()))
            continue;
        links.Add(url2.ToString());
    }
    // Match links inside href attributes
    const string pattern = @"(?i)<a\s[^>]*?href=(['""]?)(?!javascript|__dopostback)(?<url>[^'""\s*#<>]+)[^>]*>";
    Regex r = new Regex(pattern, RegexOptions.IgnoreCase);
    // Collect the matches
    MatchCollection m = r.Matches(html);
    foreach (Match url1 in m)
    {
        string href1 = url1.Groups["url"].Value;
        // Prefix relative hrefs with the site root
        if (!href1.Contains("http"))
        {
            href1 = Global.WebUrl + href1;
        }
        if (!StringHelper.IsPureUrl(href1) || links.Contains(href1)) continue;
        links.Add(href1);
    }
    return links;
}

There's a cap on queued image-download tasks here: 200. If the backlog goes over that, the producer thread waits five seconds. The download itself is invoked through an asynchronous delegate; a sketch of how that dispatch can be wired up follows the method below.


public string DownloadImg(string url)
{
    if (!string.IsNullOrEmpty(url))
    {
        try
        {
            // Prefix relative URLs with the site root
            if (!url.Contains("http"))
            {
                url = Global.WebUrl + url;
            }
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 2000;
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
            // Follow 302 redirects
            request.AllowAutoRedirect = true;
            WebResponse response = request.GetResponse();
            Stream reader = response.GetResponseStream();
            // File name
            string aFirstName = Guid.NewGuid().ToString();
            // Extension
            string aLastName = url.Substring(url.LastIndexOf(".") + 1, (url.Length - url.LastIndexOf(".") - 1));
            FileStream writer = new FileStream(Global.FloderUrl + aFirstName + "." + aLastName, FileMode.OpenOrCreate, FileAccess.Write);
            byte[] buff = new byte[512];
            // Number of bytes actually read
            int c = 0;
            while ((c = reader.Read(buff, 0, buff.Length)) > 0)
            {
                writer.Write(buff, 0, c);
            }
            writer.Close();
            writer.Dispose();
            reader.Close();
            reader.Dispose();
            response.Close();
            return (aFirstName + "." + aLastName);
        }
        catch (Exception)
        {
            return "Error: URL " + url;
        }
    }
    return "Error: empty URL";
}
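
For the throttling itself, here is one way the dispatch could look. This is a sketch under my own assumptions: DownloadImg lives on HttpHelper as an instance method, and delegate BeginInvoke is available (i.e. .NET Framework; it is not supported on .NET Core/5+):

using System.Threading;

public class DownloadDispatcher
{
    private delegate string DownloadHandler(string url);
    private int pendingCount = 0;       // downloads currently in flight
    private const int MaxTasks = 200;   // the 200-task cap described above

    public void Dispatch(string imgUrl, HttpHelper helper)
    {
        // Back off while the backlog sits at the cap
        while (pendingCount >= MaxTasks)
        {
            Thread.Sleep(5000); // the five-second wait described above
        }
        Interlocked.Increment(ref pendingCount);
        // Invoke DownloadImg asynchronously through a delegate
        DownloadHandler handler = helper.DownloadImg;
        handler.BeginInvoke(imgUrl, ar =>
        {
            string savedName = handler.EndInvoke(ar);
            Interlocked.Decrement(ref pendingCount);
            // savedName could be marshalled back to the UI here
        }, null);
    }
}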

That's all for now; further improvements are left to you! Readers are welcome to get in touch and trade ideas.
