Last time I put together a crawler to help a colleague at the company. It wasn't very polished, but now a company project needs it, so I made another round of changes: added collection of page URLs and images, downloading, and multi-threaded handling of the URL/image downloads on the UI side.
The idea: first fetch the full content of the seed URL, collect the images on that page, then collect the links on it and push them onto a queue; for each queued URL, keep collecting images, then collecting links, and loop indefinitely.
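To make that loop concrete, here is a minimal sketch, assuming the HttpHelper methods shown later in this post; seedUrl, imageQueue, and the visited set are placeholder names for illustration only, and in the real project this work runs on worker threads:
// Minimal sketch of the crawl loop described above (not the project's actual code).
// Assumes using System.Collections.Generic; seedUrl and imageQueue are placeholder names.
Queue<string> pending = new Queue<string>();
HashSet<string> visited = new HashSet<string>();
pending.Enqueue(seedUrl);

while (pending.Count > 0)
{
    string current = pending.Dequeue();
    if (!visited.Add(current)) continue;               // skip pages already crawled

    // 1) collect the images on this page and hand them to the download queue
    foreach (string img in HttpHelper.GetHtmlImageUrlList(current))
        imageQueue.Enqueue(img);

    // 2) collect the links on this page and push them back onto the crawl queue
    foreach (string link in HttpHelper.GetHttpLinks(current))
        if (!visited.Contains(link))
            pending.Enqueue(link);
}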
Here is a screenshot first.
Both the page-content fetching and the link crawling have been improved. Below is the code; if anything is lacking, please point it out!
Page content fetching: HtmlCodeRequest.
Link crawling: GetHttpLinks, which uses a regex to filter the links out of the HTML.
Image extraction: GetHtmlImageUrlList, which uses a regex to filter the img tags out of the HTML.
All of these are wrapped in one helper class, HttpHelper.
/// <summary>
/// Fetch the HTML source of the given URL.
/// </summary>
/// <param name="url">The page URL to request</param>
/// <returns>The HTML source, or an empty string on failure</returns>
public static string HtmlCodeRequest(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return "";
    }
    try
    {
        // Create the request
        HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(url);
        // Keep the connection alive so it can be reused
        httpRequest.KeepAlive = true;
        // HTTP method
        httpRequest.Method = "GET";
        // Request headers
        httpRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        httpRequest.Accept = "*/*";
        httpRequest.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
        httpRequest.ServicePoint.Expect100Continue = false;
        httpRequest.Timeout = 5000;
        httpRequest.AllowAutoRedirect = true; // follow 302 redirects
        ServicePointManager.DefaultConnectionLimit = 30;
        // Get the response
        HttpWebResponse webRes = (HttpWebResponse)httpRequest.GetResponse();
        // Read the response stream as text
        string content = string.Empty;
        using (System.IO.Stream stream = webRes.GetResponseStream())
        {
            using (System.IO.StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("utf-8")))
            {
                content = reader.ReadToEnd();
            }
        }
        // Abort the request
        httpRequest.Abort();
        // Return the page content
        return content;
    }
    catch (Exception)
    {
        return "";
    }
}
/// <summary>
/// Extract all image URLs from a page.
/// </summary>
/// <param name="url">The page URL to crawl</param>
/// <returns>A list of image URLs</returns>
public static List<string> GetHtmlImageUrlList(string url)
{
    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }
    // Regex that matches img tags and captures the src attribute
    Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
    // Find all matches in the HTML
    MatchCollection matches = regImg.Matches(html);
    List<string> sUrlList = new List<string>();
    // Collect the captured src values
    foreach (Match match in matches)
        sUrlList.Add(match.Groups["imgUrl"].Value);
    return sUrlList;
}
/// <summary>
/// Extract all page links from a page.
/// </summary>
/// <param name="url">The page URL to crawl</param>
/// <returns>A list of absolute links</returns>
public static List<string> GetHttpLinks(string url)
{
    // Fetch the page content
    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }
    // Match absolute http(s) links
    const string pattern2 = @"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r2 = new Regex(pattern2, RegexOptions.IgnoreCase);
    // Collect the matches
    MatchCollection m2 = r2.Matches(html);
    List<string> links = new List<string>();
    foreach (Match url2 in m2)
    {
        if (StringHelper.CheckUrlIsLegal(url2.ToString()) || !StringHelper.IsPureUrl(url2.ToString()) || links.Contains(url2.ToString()))
            continue;
        links.Add(url2.ToString());
    }
    // Match links inside href attributes
    const string pattern = @"(?i)<a\s[^>]*?href=(['""]?)(?!javascript|__dopostback)(?<url>[^'""\s*#<>]+)[^>]*>";
    Regex r = new Regex(pattern, RegexOptions.IgnoreCase);
    // Collect the matches
    MatchCollection m = r.Matches(html);
    foreach (Match url1 in m)
    {
        string href1 = url1.Groups["url"].Value;
        if (!href1.Contains("http"))
        {
            href1 = Global.WebUrl + href1;   // make relative links absolute
        }
        if (!StringHelper.IsPureUrl(href1) || links.Contains(href1)) continue;
        links.Add(href1);
    }
    return links;
}
The image downloader caps the number of queued download tasks at 200; if the cap is exceeded, the thread waits 5 seconds before retrying. The download itself is invoked asynchronously through a delegate (a rough sketch of the throttling follows the download method below).
public string DownloadImg(string url)
{
    if (!string.IsNullOrEmpty(url))
    {
        try
        {
            if (!url.Contains("http"))
            {
                url = Global.WebUrl + url;   // make relative image URLs absolute
            }
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 2000;
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
            // Follow 302 redirects
            request.AllowAutoRedirect = true;
            WebResponse response = request.GetResponse();
            Stream reader = response.GetResponseStream();
            // File name
            string aFirstName = Guid.NewGuid().ToString();
            // Extension
            string aLastName = url.Substring(url.LastIndexOf(".") + 1, (url.Length - url.LastIndexOf(".") - 1));
            FileStream writer = new FileStream(Global.FloderUrl + aFirstName + "." + aLastName, FileMode.OpenOrCreate, FileAccess.Write);
            byte[] buff = new byte[512];
            // Number of bytes actually read
            int c = 0;
            while ((c = reader.Read(buff, 0, buff.Length)) > 0)
            {
                writer.Write(buff, 0, c);
            }
            writer.Close();
            writer.Dispose();
            reader.Close();
            reader.Dispose();
            response.Close();
            return (aFirstName + "." + aLastName);
        }
        catch (Exception)
        {
            return "Error: URL " + url;
        }
    }
    return "Error: URL is empty";
}
That's all for now; further improvements are left to you! Readers are welcome to reach out and discuss.
dy("nrwz");
查看更多关于C#制作多线程处理强化版网络爬虫的详细内容...