縱橫小說分布式采集 · Lucene案例開發

轉載請注明出處：[http://blog.csdn.net/xiaojimanman/article/details/46812645](http://blog.csdn.net/xiaojimanman/article/details/46812645)

[http://www.llwjy.com/blogdetail/9df464b20cca5405c7ce07e2fb2d768f.html](http://www.llwjy.com/blogdetail/9df464b20cca5405c7ce07e2fb2d768f.html)

個人博客站已經上線了，網址 [www.llwjy.com](http://www.llwjy.com)~歡迎各位吐槽~

-------------------------------------------------------------------------------------------------

在前面的幾篇博客中，我們已經介紹了如何采集縱橫小說網站上的信息以及如何把這些信息持久化到數據庫中，現在我們就開始介紹如何做分布式采集，讓各個模塊之間可以完美的配合。

**采集類修改**

在開始介紹分布式采集之前，我們需要對之前介紹的采集類添加一些方法，也就是返回上一篇博客中介紹的小說javabean，具體源碼還請參照個人網站上的[博客源碼](http://www.llwjy.com/source.html)。

1.簡介頁

簡介頁需要添加一個方法，讓它返回簡介頁的數據信息，具體如下：

~~~
/**
 * @return a NovelIntroModel filled from this page's getters
 * @Author:lulei
 * @Description: Parse the intro page and collect its data into one bean
 */
public NovelIntroModel getNovelIntro() {
    NovelIntroModel bean = new NovelIntroModel();
    // The record id is derived from the page URL via ParseMD5.parseStrToMd5L32.
    bean.setMd5Id(ParseMD5.parseStrToMd5L32(this.pageUrl));
    bean.setName(getName());
    bean.setAuthor(getAuthor());
    bean.setDescription(getDesc());
    bean.setType(getType());
    bean.setLastChapter(getLatestChapter());
    bean.setChapterlisturl(getChapterListUrl());
    bean.setWordCount(getWordCount());
    bean.setKeyWords(keyWords());
    return bean;
}
~~~

2.閱讀頁

閱讀頁內同樣需要添加一個方法，讓它返回閱讀頁內的數據信息，具體如下：

~~~
/**
 * @return a NovelReadModel carrying the title, word count and content of this page
 * @Author:lulei
 * @Description: Parse the reading page and collect its data into one bean
 */
public NovelReadModel getNovelRead(){
    NovelReadModel novel = new NovelReadModel();
    novel.setTitle(getTitle());
    novel.setWordCount(getWordCount());
    novel.setContent(getContent());
    return novel;
}
~~~

這些方法都是對之前類中的方法做一個整合，將之前分析到的數據組裝成一個javabean返回，方便后面的操作。

**各頁采集線程類**

在實現分布式采集的時候，就需要編寫各個頁面的采集線程類，讓他來控制各頁面的采集業務，下面我們就一一介紹：

1.更新列表頁線程

這個線程的主要功能就是監控更新列表頁的數據，提取頁面上的簡介頁URL，認為它們是有更新的頁面，將對應的信息持久化到數據庫中，具體實現如下：

~~~
/**
 *@Description: Thread that polls one update-list page
 */
package com.lulei.crawl.novel.zongheng;

import java.util.List;
import java.util.concurrent.TimeUnit;

import com.lulei.db.novel.zongheng.ZonghengDb;

public class UpdateListThread extends Thread{

    private boolean flag = false;
    private String url;// URL of the update-list page to crawl
    private int frequency;// pause between two crawl rounds, in seconds

    /**
     * @param name      thread name
     * @param url       URL of the update-list page to poll
     * @param frequency pause between two rounds, in seconds
     */
    public UpdateListThread(String name, String url, int frequency){
        super(name);
        this.url = url;
        this.frequency = frequency;
    }

    /**
     * Repeatedly fetches the update-list page, extracts the page URLs and
     * stores them through ZonghengDb.saveInfoUrls, sleeping {@code frequency}
     * seconds between rounds. The loop ends when the thread is interrupted.
     */
    @Override
    public void run() {
        flag = true;
        ZonghengDb db = new ZonghengDb();
        while (flag){
            try {
                UpdateList updateList = new UpdateList(url);
                List<String> urls = updateList.getPageUrls(true);
                db.saveInfoUrls(urls);
            } catch (Exception e) {
                // A single failed round must not end the polling loop.
                e.printStackTrace();
            }
            // Sleep after every round, failed ones included; otherwise a
            // failing page or database would be retried in a tight loop.
            try {
                TimeUnit.SECONDS.sleep(frequency);
            } catch (InterruptedException e) {
                // Keep the interrupt status and treat it as a stop request.
                Thread.currentThread().interrupt();
                flag = false;
            }
        }
    }

    public static void main(String[] args) {
        UpdateListThread thread = new UpdateListThread("llist", "http://book.zongheng.com/store/c0/c0/b9/u0/p1/v0/s9/t0/ALL.html", 60);
        thread.start();
    }
}
~~~

2.簡介頁&章節列表頁線程類

由于一個簡介頁就對應一個章節列表頁，所以我們就把這兩個線程合為一個線程，讓其實現小說簡介信息的采集以及小說章節列表信息的采集，具體實現如下：

~~~
/**
 *@Description: Thread that crawls novel intro pages and their chapter lists
 */
package com.lulei.crawl.novel.zongheng;

import java.util.List;
import java.util.concurrent.TimeUnit;

import com.lulei.crawl.novel.zongheng.model.NovelIntroModel;
import com.lulei.db.novel.zongheng.ZonghengDb;

public class IntroPageThread extends Thread {
    private boolean flag = false;

    public IntroPageThread(String name) {
        super(name);
    }

    /**
     * Repeatedly takes one pending intro-page URL from the database, crawls
     * the intro page and its chapter-list page, and stores both results.
     * Sleeps 500 ms after a handled page and 1000 ms when there was nothing
     * to do or the round failed. The loop ends when the thread is interrupted.
     */
    @Override
    public void run() {
        flag = true;
        ZonghengDb db = new ZonghengDb();
        while (flag) {
            long pauseMillis = 1000;
            // The try sits inside the loop so that one bad page cannot
            // terminate the worker for good.
            try {
                // Fetch one pending intro-page URL.
                String url = db.getRandIntroPageUrl(1);
                if (url != null) {
                    IntroPage intro = new IntroPage(url);
                    NovelIntroModel bean = intro.getNovelIntro();
                    // Crawl the chapter-list page that belongs to this intro page.
                    ChapterPage chapterPage = new ChapterPage(bean.getChapterlisturl());
                    List<String[]> chapters = chapterPage.getChaptersInfo();
                    bean.setChapterCount(chapters == null ? 0 : chapters.size());
                    // Update the novel's intro information.
                    db.updateInfo(bean);
                    // Queue the chapters for crawling. The line above already
                    // anticipates a null list, so do not pass null on.
                    if (chapters != null) {
                        db.saveChapters(chapters);
                    }
                    pauseMillis = 500;
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            try {
                TimeUnit.MILLISECONDS.sleep(pauseMillis);
            } catch (InterruptedException e) {
                // Keep the interrupt status and treat it as a stop request.
                Thread.currentThread().interrupt();
                flag = false;
            }
        }
    }

    public static void main(String[] args) {
        IntroPageThread thread = new IntroPageThread("novelinfo");
        thread.start();
    }
}
~~~

3.閱讀頁線程

這個線程的主要功能就是將小說閱讀頁的信息采集并持久化到數據庫中，具體如下：

~~~
/**
 *@Description: Thread that crawls novel reading pages
 */
package com.lulei.crawl.novel.zongheng;

import java.util.concurrent.TimeUnit;

import com.lulei.crawl.novel.zongheng.model.NovelChapterModel;
import com.lulei.crawl.novel.zongheng.model.NovelReadModel;
import com.lulei.db.novel.zongheng.ZonghengDb;
import com.lulei.util.ParseMD5;

public class ReadPageThread extends Thread {
    private boolean flag = false;

    public ReadPageThread(String name) {
        super(name);
    }

    /**
     * Repeatedly takes one pending chapter from the database, crawls its
     * reading page and stores the result. Sleeps 500 ms after a stored page
     * and 1000 ms when there was nothing to store or the round failed. The
     * loop ends when the thread is interrupted.
     */
    @Override
    public void run() {
        flag = true;
        ZonghengDb db = new ZonghengDb();
        while (flag) {
            long pauseMillis = 1000;
            try {
                // Fetch one pending reading page.
                NovelChapterModel chapter = db.getRandReadPageUrl(1);
                if (chapter != null) {
                    ReadPage read = new ReadPage(chapter.getUrl());
                    NovelReadModel novel = read.getNovelRead();
                    // A null result falls through to the idle pause instead
                    // of looping again immediately.
                    if (novel != null) {
                        novel.setChapterId(chapter.getChapterId());
                        novel.setTime(chapter.getTime());
                        novel.setUrl(chapter.getUrl());
                        // Store the reading-page data.
                        db.saveNovelRead(novel);
                        // Mark the chapter as no longer needing a crawl.
                        db.updateChapterState(ParseMD5.parseStrToMd5L32(novel.getUrl()), 0);
                        pauseMillis = 500;
                    }
                }
            } catch(Exception e){
                e.printStackTrace();
            }
            try {
                TimeUnit.MILLISECONDS.sleep(pauseMillis);
            } catch (InterruptedException e) {
                // Keep the interrupt status and treat it as a stop request.
                Thread.currentThread().interrupt();
                flag = false;
            }
        }
    }

    public static void main(String[] args) {
        ReadPageThread thread = new ReadPageThread("novel read page");
        thread.start();
    }
}
~~~

**分布式采集**

上面已經介紹完了各個線程完成的工作，下面就需要一個類來控制管理這些線程，讓其運行起來，具體代碼如下：

~~~
/**
 *@Description: Starts the crawl threads for list, intro and reading pages
 */
package com.lulei.crawl.novel.zongheng;

import java.util.List;

import com.lulei.crawl.novel.zongheng.model.CrawlListInfo;
import com.lulei.db.novel.zongheng.ZonghengDb;

public class CrawStart {
    // The three "already started" flags are plain static fields with no
    // synchronization; call the start methods from a single thread.
    private static boolean booleanCrawlList = false;
    private static boolean booleanCrawlIntro = false;
    // number of intro-page crawl threads
    private static int crawlIntroThreadNum = 2;
    private static boolean booleanCrawlRead = false;
    // number of reading-page crawl threads
    private static int crawlReadThreadNum = 10;

    /**
     * @Author:lulei
     * @Description: Start one UpdateListThread per configured update-list page.
     *               Does nothing if the list crawl was already started.
     */
    public void startCrawlList(){
        if (booleanCrawlList) {
            return;
        }
        ZonghengDb db = new ZonghengDb();
        List<CrawlListInfo> infos = db.getCrawlListInfos();
        if (infos == null) {
            // Nothing was started, so leave the flag clear and let a later
            // call try again.
            return;
        }
        booleanCrawlList = true;
        for (CrawlListInfo info : infos) {
            // Skip entries without a URL.
            if (info.getUrl() == null || "".equals(info.getUrl())) {
                continue;
            }
            UpdateListThread thread = new UpdateListThread(info.getInfo(), info.getUrl(), info.getFrequency());
            thread.start();
        }
    }

    /**
     * @Author:lulei
     * @Description: Start the intro-page / chapter-list threads.
     *               Does nothing if they were already started.
     */
    public void startCrawlIntro() {
        if (booleanCrawlIntro) {
            return;
        }
        booleanCrawlIntro = true;
        for (int i = 0; i < crawlIntroThreadNum; i++) {
            IntroPageThread thread = new IntroPageThread("novel info thread" + i);
            thread.start();
        }
    }

    /**
     * @Author:lulei
     * @Description: Start the reading-page threads.
     *               Does nothing if they were already started.
     */
    public void startCrawlRead() {
        if (booleanCrawlRead) {
            return;
        }
        booleanCrawlRead = true;
        for (int i = 0; i < crawlReadThreadNum; i++) {
            ReadPageThread thread = new ReadPageThread("novel read page" + i);
            thread.start();
        }
    }

    public static void main(String[] args) {
        CrawStart start = new CrawStart();
        start.startCrawlList();
        start.startCrawlIntro();
        start.startCrawlRead();
    }
}
~~~

**運行結果**

通過上面的這幾個步驟，縱橫小說的分布式采集程序已經完成，下面就為大家展示一下采集后的數據庫截圖：

![](https://box.kancloud.cn/2016-02-22_56ca7bf2a3397.jpg)

**寫在最后**

在上面的線程實現中，有很多的配置信息，比如說線程中的兩個請求之間的間隔時間以及各類線程的數量，像這些信息我們都可以將其寫到配置文件中，方便之后的修改（這里寫到程序中是方便大家的理解，還請見諒）。

----------------------------------------------------------------------------------------------------

ps:最近發現其他網站可能會對博客轉載，上面并沒有源鏈接，如想查看更多關于基于[lucene的案例開發](http://blog.csdn.net/xiaojimanman/article/category/2841877)
請[點擊這里](http://www.llwjy.com/blogtype/lucene.html)。或訪問網址 http://blog.csdn.net/xiaojimanman/article/category/2841877 或 http://www.llwjy.com/blogtype/lucene.html

-------------------------------------------------------------------------------------------------

小福利

-------------------------------------------------------------------------------------------------

個人在極客學院上的《Lucene案例開發》課程已經上線了（目前上線到第二課），歡迎大家吐槽~

[第一課：Lucene概述](http://www.jikexueyuan.com/course/937.html)

[第二課：Lucene 常用功能介紹](http://www.jikexueyuan.com/course/1292.html)