php爬蟲 · web開發筆記

> 網絡爬蟲（又被稱為網頁蜘蛛，網絡機器人，在FOAF社區中間，更經常的稱為網頁追逐者），是一種按照一定的規則，自動地抓取萬維網信息的程序或者腳本。另外一些不常使用的名字還有螞蟻、自動索引、模擬程序或者蠕蟲。以上的定義來自百度百科。今天我就給大家實現一個可以簡易爬取新聞的小爬蟲。當然，如果嚴格意義上講，把它當成一個成熟的爬蟲，那還相差很遠，只能說算是一個小的試驗。但是，它基本可以滿足我們從一些網站上，采集一些有用的信息下來的目的了。首先來介紹一下，我們需要準備哪些工具： 1. 可以啟動多線程請求的 curl 類 2. 可以像 jquery 那樣解析 dom 的 phpQuery 類 3. ThinkPHP5命令行工具下面我們來一一添加這些工具，并完成簡單爬蟲的制作。 ## 添加 curl 類其實 php 的http請求類庫有很多的，其中很優秀的有 guzzle 。但是本教程不打算采用這個（因為我也不太熟悉這個類庫）。當然，我們制作的是一個簡單的小爬蟲，可替代的方案有很多，甚至你可以直接使用 file\_get\_contents 。考慮到簡單的并發抓取的問題，于是我在網上尋找了一個不是很復雜，但是很好用的 curl 類庫。我們新建 extend\\curl\\MultiCurl.php ~~~ <?php namespace curl; /* * Multi curl in PHP * @author rainyluo * @date 2016-04-15 */ class MultiCurl { //urls needs to be fetched public $targets = []; //parallel running curl threads public $threads = 10; //curl options public $curlOpt = []; //callback function public $callback = null; //debug ,will show log using echo public $debug = true; //multi curl handler private $mh = null; //curl running signal private $runningSig = null; /** * 架構函數 */ public function __construct() { $this->mh = curl_multi_init(); $this->curlOpt = [ CURLOPT_HEADER => false, CURLOPT_CONNECTTIMEOUT => 10, CURLOPT_TIMEOUT => 10, CURLOPT_AUTOREFERER => true, CURLOPT_RETURNTRANSFER => true, CURLOPT_FOLLOWLOCATION => true, CURLOPT_MAXREDIRS => 5, ]; $this->callback = function ($html) { echo md5($html); echo "fetched"; echo "\r\n"; }; } /** * 設置目標數 * @param unknown $urls * @return \extend\MultiCurl */ public function setTargets($urls) { $this->targets = $urls; return $this; } /** * 設置線程數 * @param unknown $threads * @return \extend\MultiCurl */ public function setThreads($threads) { $this->threads = intval($threads); return $this; } /** * 設置回調函數 * @param unknown $func * @return \extend\MultiCurl */ public function setCallback($func) { $this->callback = $func; return $this; } /* * start running */ public function run() { $this->initPool(); $this->runCurl(); } /* * run multi curl */ private function runCurl() { do { //start request thread and wait for 
return,if there's no return in 1s,continue add request thread do { curl_multi_exec($this->mh, $this->runningSig); //$this->log("exec results...running sig is" . $this->runningSig); $return = curl_multi_select($this->mh, 1.0); if ($return > 0) { //$this->log("there is a return...$return"); break; } unset($return); } while ($this->runningSig > 0); //if there is return,read it while ($returnInfo = curl_multi_info_read($this->mh)) { $handler = $returnInfo["handle"]; if ($returnInfo["result"] == CURLE_OK) { $url = curl_getinfo($handler, CURLINFO_EFFECTIVE_URL); //$this->log($url . "returns data"); $callback = $this->callback; $callback(curl_multi_getcontent($handler)); } else { $url = curl_getinfo($handler, CURLINFO_EFFECTIVE_URL); //$this->log("$url fetch error." . curl_error($handler)); } curl_multi_remove_handle($this->mh, $handler); curl_close($handler); unset($handler); //add new targets into curl thread if ($this->targets) { $threadsIdel = $this->threads - $this->runningSig; //$this->log("idel threads:" . $threadsIdel); if ($threadsIdel < 0) continue; for ($i = 0; $i < $threadsIdel; $i++) { $t = array_pop($this->targets); if (!$t) continue; $task = curl_init($t); curl_setopt_array($task, $this->curlOpt); curl_multi_add_handle($this->mh, $task); //$this->log("new task adds!" . $task); $this->runningSig += 1; unset($task); } } else { //$this->log("targets all finished"); } } } while ($this->runningSig); } /* * init multi curl pool */ private function initPool() { if (count($this->targets) < $this->threads) $this->threads = count($this->targets); //init curl handler pool ... 
for ($i = 1; $i <= $this->threads; $i++) { $task = curl_init(array_pop($this->targets)); curl_setopt_array($task, $this->curlOpt); curl_multi_add_handle($this->mh, $task); //$this->log("init pool thread one"); unset($task); } // $this->log("init pool done"); } /** * 日志函數 * @param unknown $log * @return boolean */ private function log($log) { if (!$this->debug) return false; ob_start(); echo "---------- " . date("Y-m-d H:i", time()) . "-------------"; if (is_array($log)) { echo json_encode($log); } else { echo $log; } $m = memory_get_usage(); echo "memory:" . intval($m / 1024) . "kb\r\n"; echo "\r\n"; flush(); ob_end_flush(); unset($log); } /** * 析構函數 */ public function __destruct() { //$this->log("curl ends."); curl_multi_close($this->mh); } } ~~~ ## 下載 phpquery 類庫我們到[packagist](https://packagist.org/)搜索 phpquery ，會看到 phpquery 包的詳情。復制安裝命令，打開 cmd，進入項目根目錄 ~~~ composer require electrolinux/phpquery ~~~ 等待下載完成即可。接下來，應該講解 thinkphp5 命令行的用法，可是我感覺這不是重點。怎么使用，你可以參考[自定義命令行](http://www.hmoore.net/manual/thinkphp5/235129)。下面我想給大家講解一下這個 curl 類庫和如何通過 phpquery 解析數據，得到我們想要的數據。方便起見，我只演示采集文章標題和地址。新建一個表記錄這些數據： ~~~ DROP TABLE IF EXISTS `article_title`; CREATE TABLE `article_title` ( `id` int(11) NOT NULL AUTO_INCREMENT, `title` varchar(155) NOT NULL COMMENT '文章標題', `href` varchar(155) NOT NULL COMMENT '文章鏈接', PRIMARY KEY (`id`) ) ENGINE=MyISAM DEFAULT CHARSET=utf8; ~~~ 本文演示，以砍柴網為例。砍柴網的創頻道[http://www.ikanchai.com/article/](http://www.ikanchai.com/article/)。采集一般我們選擇列表頁來進行，因為這里的內容集中，而且 url 有一定的規律性，我們點擊下面的分頁按鈕，地址欄就會展示出有規律的列表地址。 ~~~ http://www.ikanchai.com/article/index_1.shtml http://www.ikanchai.com/article/index_2.shtml ~~~ 這樣每一頁的列表頁，都是通過 index\_x 后的數字來表示，因此我們很容易構建出很多的采集 url。下面來講解一下，那個 curl 類的基本使用方法： ~~~ $mu = new MultiCurl(); // 需要采集的列表數據 $urls = [ 'http://www.ikanchai.com/article/index_1.shtml', 'http://www.ikanchai.com/article/index_2.shtml' ]; // 獲取內容回調函數 $callback = function($html) { // do something }; $mu->setTargets($urls)->setCallback($callback)->setThreads(5)->run(); ~~~ > 1. 
實例化 curl 類 > 2. 定義需要采集的 url 集合 > 3. 定義成功采集之后的回調函數 > 4. 設置采集集合，設置回調函數，設置啟動線程數，啟動采集我們要做的重點就是，如何在回調函數中，解析出文章的標題和地址，并且存入數據庫。 ![](https://box.kancloud.cn/d3c427cab264448b6f4ad17088c555d4_1496x867.jpg) 我們通過 F12 可以看出，他的文章內容都是在 ~~~ <div class="hlgd-content"></div> ~~~ 里面包裹的，而且所有的標題是以一個循環結構展現的。循環結構的 div 為 ~~~ <div class="hlgd-box"></div> ~~~ 因此，我們在回調函數中的 phpquery 要這么寫： ~~~ // 獲取內容回調函數 $callback = function($html) { $res = \phpQuery::newDocument($html); // 所有標題區域的 div 對象 $div = $res['.hlgd-content .hlgd-box']; // 循環獲取查看每一個 div 中的標題信息 foreach($div as $div){ $title = pq($div)->find('h3 a')->attr('title'); $href = pq($div)->find('h3 a')->attr('href'); db('article_title')->insert(compact('title', 'href')); } }; ~~~ > 如果你熟悉 jquery 的話，很容易理解這部分的寫法。另外，具體的 phpquery 該如何使用，篇幅有限，水平有限，請自行百度。 ## 通過 command 進行采集我們在 application\\command.php 中定義： ~~~ return [ 'app\index\command\Spider' ]; ~~~ 建立命令類文件，新建application/index/command/Spider.php ~~~ <?php namespace app\index\command; use think\console\Command; use think\console\Input; use think\console\Output; use curl\MultiCurl; class Spider extends Command { protected function configure() { $this->setName('spider')->setDescription('spider running '); } protected function execute(Input $input, Output $output) { $mu = new MultiCurl(); // 需要采集的列表數據 $urls = [ 'http://www.ikanchai.com/article/index_1.shtml', 'http://www.ikanchai.com/article/index_2.shtml' ]; // 獲取內容回調函數 $callback = function($html) { $res = \phpQuery::newDocument($html); // 所有標題區域的 div 對象 $div = $res['.hlgd-content .hlgd-box']; // 循環獲取查看每一個 div 中的標題信息 foreach($div as $div){ $title = pq($div)->find('h3 a')->attr('title'); $href = pq($div)->find('h3 a')->attr('href'); db('article_title')->insert(compact('title', 'href')); } }; $mu->setTargets($urls)->setCallback($callback)->setThreads(5)->run(); $output->writeln("complete"); } } ~~~ 打開 cmd 進入系統根目錄，執行 ~~~ php think spider ~~~ ![](https://box.kancloud.cn/620567b14d3310d2c08053dc5062527a_396x139.jpg) 看到 complete 則采集完成。打開表可見： 
![](https://box.kancloud.cn/93b4df037565fd330719c8781c923748_613x893.jpg)