A while back I needed to scrape some information from web pages. I knew nothing about crawlers at the time, so I looked into WebMagic and wrote a simple crawler with it.
First, an introduction to WebMagic:
WebMagic has a fully modular design that covers the whole crawler lifecycle: link extraction, page downloading, content extraction, and persistence. It supports multi-threaded and distributed crawling, automatic retries, custom UA/cookies, and more.
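Those lifecycle stages map onto WebMagic's pluggable components: the Downloader fetches pages, the PageProcessor extracts links and content, the Scheduler manages the URL queue, and the Pipeline persists results. As a minimal sketch (not from the original post), here is the crawler built later in this article wired onto a Spider with WebMagic's default components spelled out explicitly:

```java
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.scheduler.QueueScheduler;

public class SpiderWiring {
    public static void main(String[] args) {
        // CsdnBlogPageProcessor is the PageProcessor defined later in this post.
        Spider.create(new CsdnBlogPageProcessor())
                .setDownloader(new HttpClientDownloader()) // download pages over HTTP (the default Downloader)
                .setScheduler(new QueueScheduler())        // in-memory, de-duplicating URL queue (the default Scheduler)
                .addPipeline(new ConsolePipeline())        // print extracted fields to the console (the default Pipeline)
                .addUrl("http://blog.csdn.net/chenyufeng1991")
                .thread(5)
                .run();
    }
}
```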
Maven dependencies:
```xml
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

<!-- variant of webmagic-extension that excludes the slf4j-log4j12 binding,
     for projects that already ship their own SLF4J implementation -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
```
JDBC mode:
```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;

public class CsdnBlogDao {

    private Connection conn = null;
    private Statement stmt = null;

    public CsdnBlogDao() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String url = "jdbc:mysql://localhost:3306/test?"
                    + "user=***&password=***3&useUnicode=true&characterEncoding=utf8";
            conn = DriverManager.getConnection(url);
            stmt = conn.createStatement();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    public int add(CsdnBlog csdnBlog) {
        try {
            String sql = "insert into `test`.`csdnblog` (`keyes`, `titles`, `content`, `dates`, `tags`,"
                    + " `category`, `views`, `comments`, `copyright`) values (?, ?, ?, ?, ?, ?, ?, ?, ?);";
            PreparedStatement ps = conn.prepareStatement(sql);
            ps.setInt(1, csdnBlog.getKey());
            ps.setString(2, csdnBlog.getTitle());
            ps.setString(3, csdnBlog.getContent());
            ps.setString(4, csdnBlog.getDates());
            ps.setString(5, csdnBlog.getTags());
            ps.setString(6, csdnBlog.getCategory());
            ps.setInt(7, csdnBlog.getView());
            ps.setInt(8, csdnBlog.getComments());
            ps.setInt(9, csdnBlog.getCopyright());
            return ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return -1;
    }
}
```
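One thing to be aware of before moving on: every `new CsdnBlogDao()` opens a connection that is never closed, and the crawler below constructs a fresh DAO per article. Here is a sketch (not the original code, same `java.sql` imports as above) of the same insert using try-with-resources, so each call opens and cleans up its own connection; the URL and masked credentials are copied from above:

```java
public int add(CsdnBlog csdnBlog) {
    String url = "jdbc:mysql://localhost:3306/test?"
            + "user=***&password=***3&useUnicode=true&characterEncoding=utf8";
    String sql = "insert into `test`.`csdnblog` (`keyes`, `titles`, `content`, `dates`, `tags`,"
            + " `category`, `views`, `comments`, `copyright`) values (?, ?, ?, ?, ?, ?, ?, ?, ?)";
    // try-with-resources closes the statement and connection even on failure
    try (Connection conn = DriverManager.getConnection(url);
         PreparedStatement ps = conn.prepareStatement(sql)) {
        ps.setInt(1, csdnBlog.getKey());
        ps.setString(2, csdnBlog.getTitle());
        ps.setString(3, csdnBlog.getContent());
        ps.setString(4, csdnBlog.getDates());
        ps.setString(5, csdnBlog.getTags());
        ps.setString(6, csdnBlog.getCategory());
        ps.setInt(7, csdnBlog.getView());
        ps.setInt(8, csdnBlog.getComments());
        ps.setInt(9, csdnBlog.getCopyright());
        return ps.executeUpdate();
    } catch (SQLException e) {
        e.printStackTrace();
        return -1;
    }
}
```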
Entity class:
```java
public class CsdnBlog {

    private int key;          // ID
    private String title;     // title
    private String dates;     // date
    private String tags;      // tags
    private String category;  // category
    private int view;         // view count
    private int comments;     // comment count
    private int copyright;    // whether the post is original
    private String content;   // body text

    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }

    public int getKey() { return key; }
    public void setKey(int key) { this.key = key; }

    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }

    public String getDates() { return dates; }
    public void setDates(String dates) { this.dates = dates; }

    public String getTags() { return tags; }
    public void setTags(String tags) { this.tags = tags; }

    public String getCategory() { return category; }
    public void setCategory(String category) { this.category = category; }

    public int getView() { return view; }
    public void setView(int view) { this.view = view; }

    public int getComments() { return comments; }
    public void setComments(int comments) { this.comments = comments; }

    public int getCopyright() { return copyright; }
    public void setCopyright(int copyright) { this.copyright = copyright; }

    public String toString() {
        return "CsdnBlog [key=" + key + ", title=" + title + ", content=" + content
                + ", dates=" + dates + ", tags=" + tags + ", category=" + category
                + ", view=" + view + ", comments=" + comments + ", copyright=" + copyright + "]";
    }
}
```

Launcher class:
```java
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class CsdnBlogPageProcessor implements PageProcessor {

    private static String username = "chenyufeng1991"; // the CSDN username to crawl
    private static int size = 0; // total number of articles crawled

    // site configuration: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    // process() is the core hook for custom crawler logic; the extraction code goes here
    public void process(Page page) {
        // list page
        if (!page.getUrl().regex("http://blog.csdn.net/" + username + "/article/details/\\d+").match()) {
            // add all article pages
            page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links() // restrict extraction to the article list area
                    .regex("/" + username + "/article/details/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // use replace to turn relative URLs into absolute ones
                    .all());
            // add the other list pages
            page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links() // restrict extraction to the pagination area
                    .regex("/" + username + "/article/list/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // same relative-to-absolute trick
                    .all());
        // article page
        } else {
            size++; // one more article
            // store the scraped data in a CsdnBlog object, ready for the database
            CsdnBlog csdnBlog = new CsdnBlog();
            // set the ID
            csdnBlog.setKey(Integer.parseInt(
                    page.getUrl().regex("http://blog.csdn.net/" + username + "/article/details/(\\d+)").get()));
            // set the title
            csdnBlog.setTitle(
                    page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get());
            // set the content
            csdnBlog.setContent(
                    page.getHtml().xpath("//div[@class='article_content']/allText()").get());
            // set the date
            csdnBlog.setDates(
                    page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get());
            // set the tags (there may be several, joined with commas)
            csdnBlog.setTags(listToString(page.getHtml()
                    .xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));
            // set the categories (there may be several, joined with commas)
            csdnBlog.setCategory(listToString(
                    page.getHtml().xpath("//div[@class='category_r']/label/span/text()").all()));
            // set the view count
            csdnBlog.setView(Integer.parseInt(page.getHtml()
                    .xpath("//div[@class='article_r']/span[@class='link_view']").regex("(\\d+)人阅读").get()));
            // set the comment count
            csdnBlog.setComments(Integer.parseInt(page.getHtml()
                    .xpath("//div[@class='article_r']/span[@class='link_comments']").regex("\\((\\d+)\\)").get()));
            // set whether the post is original
            csdnBlog.setCopyright(page.getHtml().regex("bog_copyright").match() ? 1 : 0);
            // save the object to the database
            new CsdnBlogDao().add(csdnBlog);
            // print the object to the console
            System.out.println(csdnBlog);
        }
    }

    // join a List<String> into a comma-separated String
    public static String listToString(List<String> stringList) {
        if (stringList == null) {
            return null;
        }
        StringBuilder result = new StringBuilder();
        boolean flag = false;
        for (String string : stringList) {
            if (flag) {
                result.append(",");
            } else {
                flag = true;
            }
            result.append(string);
        }
        return result.toString();
    }

    public static void main(String[] args) {
        long startTime, endTime;
        System.out.println("[Crawler started]...");
        startTime = System.currentTimeMillis();
        // start from the user's blog home page, with 5 threads
        Spider.create(new CsdnBlogPageProcessor())
                .addUrl("http://blog.csdn.net/" + username).thread(5).run();
        endTime = System.currentTimeMillis();
        System.out.println("[Crawler finished] crawled " + size + " articles in about "
                + ((endTime - startTime) / 1000) + " seconds; they have been saved to the database.");
    }
}
```
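A design note: WebMagic's idiomatic place for persistence is a Pipeline rather than a `new CsdnBlogDao().add(...)` call inside `process()`. Below is a minimal sketch under that assumption; `CsdnBlogPipeline` is a name introduced here, and it expects `process()` to call `page.putField("blog", csdnBlog)` instead of writing to the database itself:

```java
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

// Hypothetical pipeline: reuses one DAO (and thus one connection) for the whole crawl.
public class CsdnBlogPipeline implements Pipeline {

    private final CsdnBlogDao dao = new CsdnBlogDao();

    @Override
    public void process(ResultItems resultItems, Task task) {
        CsdnBlog blog = resultItems.get("blog"); // set via page.putField("blog", csdnBlog) in the processor
        if (blog != null) {
            dao.add(blog);
        }
    }
}
```

It would be wired in with `Spider.create(new CsdnBlogPageProcessor()).addPipeline(new CsdnBlogPipeline())`, which keeps extraction and storage decoupled.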
MySQL mode (Spring Boot + MyBatis):
```java
public class GamePageProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(GamePageProcessor.class);
    private static DianjingService d;
    private static BannerService bs;
    private static SportService ss;
    private static YuleNewsService ys;
    private static UpdateService ud;

    // site configuration: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // boot the Spring context, then pull the service beans out of it
        ConfigurableApplicationContext context = SpringApplication.run(GamePageProcessor.class, args);
        d = context.getBean(DianjingService.class);
        // Spider.create(new GamePageProcessor()).addUrl("<URL>").thread(5).run();
    }

    // process() is the core hook for custom crawler logic; the extraction code goes here
    public void process(Page page) {
        Selectable url = page.getUrl();
        if (url.toString().equals("<URL>")) {
            DianjingVideo dv = new DianjingVideo();
            // titles
            List<String> ls = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-title']/a/text()").all();
            // hrefs of the <a> tags
            List<String> ls1 = page.getHtml().xpath("//div[@class='v']/div[@class='v-link']/a/@href").all();
            // dates
            List<String> ls2 = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-entry']/div[@class='v-meta-data']/span[@class='r']/text()").all();
            // photos
            List<String> ls3 = page.getHtml().xpath("//div[@class='v']/div[@class='v-thumb']/img/@src").all();

            for (int i = 0; i < 5; i++) {
                dv.setTitles(ls.get(i));
                dv.setCategory("");
                dv.setDates(ls2.get(i));
                dv.setHrefs(ls1.get(i));
                dv.setPhoto(ls3.get(i));
                dv.setSources("");
                d.addVideo(dv);
            }
        }
    }
}
```
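One caveat with the loop above: it reuses a single DianjingVideo instance and mutates it on every pass, which is only safe if addVideo persists the values immediately; if the service merely keeps the reference, all five records end up with the last iteration's data. It will also throw IndexOutOfBoundsException when any of the four lists has fewer than five entries. A hedged sketch of a safer loop, using the same hypothetical service and setters:

```java
// Sketch: one fresh object per record, bounded by the shortest extracted list.
int n = Math.min(Math.min(ls.size(), ls1.size()), Math.min(ls2.size(), ls3.size()));
for (int i = 0; i < Math.min(5, n); i++) {
    DianjingVideo dv = new DianjingVideo(); // new instance per record
    dv.setTitles(ls.get(i));
    dv.setCategory("");
    dv.setDates(ls2.get(i));
    dv.setHrefs(ls1.get(i));
    dv.setPhoto(ls3.get(i));
    dv.setSources("");
    d.addVideo(dv);
}
```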
Controller:
```java
@Controller
@RequestMapping(value = "/dianjing")
public class DianjingController {

    @Autowired
    private DianjingService s;

    /* mobile games */
    @RequestMapping("/dianjing")
    @ResponseBody
    public Object dianjing() {
        List<Dianjing> list = s.find2();
        JSONObject jo = new JSONObject();
        if (list != null) {
            jo.put("code", 0);
            jo.put("success", true);
            jo.put("count", list.size());
            jo.put("list", list);
        }
        return jo;
    }
}
```

The entity class is not shown here.
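For reference, when `find2()` returns data this endpoint responds with JSON shaped like `{"code":0,"success":true,"count":<n>,"list":[...]}`; when it returns null, the guard is skipped and an empty JSONObject (`{}`) comes back.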
DAO layer:
```java
@Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) "
        + "values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
int addDJ(Dianjing dj);
```
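`@Insert` here is a MyBatis annotation, so this method needs to sit on a mapper interface that MyBatis can discover. A minimal sketch of what that interface might look like under Spring Boot; the interface name and the `@Mapper` registration are my assumptions, only `addDJ` itself comes from the post:

```java
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;

// Hypothetical mapper interface; only the addDJ method appears in the original post.
@Mapper
public interface DianjingDao {

    @Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) "
            + "values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
    int addDJ(Dianjing dj);
}
```

With mybatis-spring-boot-starter on the classpath, Spring Boot generates the implementation and lets it be @Autowired like any other bean.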
That's the whole of this walkthrough of building a Java crawler with Spring Boot and WebMagic, persisting with JDBC and MySQL. I hope it gives you a useful reference.
Original post: https://www.cnblogs.com/NCL--/p/8608336.html