常見問題 · pyspider文檔

## 如何把采集結果存入mysql

<http://www.jishubu.net/yunwei/python/424.html>

pyspider是個非常強大簡單易用的爬蟲框架，但是默認軟件會把采集的所有字段打包保存到默認的數據庫中，和其它軟件沒法整合。現在需求是需要把采集的字段做為單獨的字段保存到自定義的mysql數據庫中，本人技術能力有限，個人感覺實現方法不是最優的，大家有能力的請自行改進，沒能力的湊合著用吧。

或是直接下載py腳本：把 pyspider的結果存入自定義的mysql數據庫中[mysqldb.zip](http://www.jishubu.net/wp-content/plugins/wp-ueditor/ueditor/php/upload/8521423797887.zip)

~~~
'''
Simple example of saving pyspider results to a custom MySQL database.

Usage:
    1. Put this file in pyspider/pyspider/database/mysql/ and name it
       mysqldb.py.
    2. Edit the database settings in this file and create the matching
       database and table.
    3. In the project script, import this code with
       from pyspider.database.mysql.mysqldb import SQL
    4. Override on_result, instantiate SQL and call replace() (the first
       argument is the table name, the remaining ones are the result fields).

A simple example:

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-01-26 13:12:04
# Project: jishubu.net

from pyspider.libs.base_handler import *
from pyspider.database.mysql.mysqldb import SQL


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://www.jishubu.net/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('p.pic a[href^="http"]').items():
            print(each.attr.href)
            # Schedule the detail page; otherwise on_result never gets a title.
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('HTML>BODY#presc>DIV.main>DIV.prices_box.wid980.clearfix>DIV.detail_box>DL.assort.tongyong>DD>A').text(),
        }

    def on_result(self, result):
        # print(result)
        if not result or not result['title']:
            return
        sql = SQL()
        sql.replace('info', **result)
'''
from datetime import date, datetime, timedelta

from six import itervalues
import mysql.connector
from mysql.connector import errorcode


class SQL:
    """Minimal MySQL helper that writes pyspider results into a custom table.

    The settings below are class attributes; edit them to match your
    database. The connection is stored on the class and shared by every
    instance.
    """

    username = 'pyspider'   # database user name
    password = 'pyspider'   # database password
    database = 'result'     # database (schema) name; None connects without one
    host = 'localhost'      # database host address
    connection = ''         # shared connection; '' means "not connected yet"
    placeholder = '%s'      # parameter marker understood by mysql.connector

    def __init__(self):
        """Open the shared connection unless a live one already exists."""
        # The example creates a new SQL() for every result, so reuse the
        # connection instead of opening (and leaking) one per result.
        if SQL.connection == '' or not SQL.connection.is_connected():
            self.connect()

    def escape(self, string):
        """Return *string* quoted as a MySQL identifier (table/column name)."""
        # Double embedded backticks so the name cannot break out of the quotes.
        return '`%s`' % ('%s' % string).replace('`', '``')

    def connect(self):
        """Connect to MySQL using the class-level settings.

        Returns True on success. On failure the reason is printed and False
        is returned.
        """
        config = {
            'user': SQL.username,
            'password': SQL.password,
            'host': SQL.host,
        }
        if SQL.database is not None:
            config['database'] = SQL.database

        try:
            SQL.connection = mysql.connector.connect(**config)
        except mysql.connector.Error as err:
            if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
                print("The credentials you provided are not correct.")
            elif err.errno == errorcode.ER_BAD_DB_ERROR:
                print("The database you provided does not exist.")
            else:
                print("Something went wrong: {}".format(err))
            return False
        return True

    def replace(self, tablename=None, **values):
        """REPLACE one row into *tablename*.

        tablename -- name of the target table.
        values    -- column=value pairs; the values are sent as query
                     parameters, never interpolated into the SQL text.

        Returns True when the row was written and committed, otherwise
        prints the reason and returns False.
        """
        if SQL.connection == '':
            print("Please connect first")
            return False
        if not tablename:
            print("Please provide a table name")
            return False

        table = self.escape(tablename)
        if values:
            columns = ", ".join(self.escape(k) for k in values)
            markers = ", ".join([self.placeholder] * len(values))
            sql_query = "REPLACE INTO %s (%s) VALUES (%s)" % (table, columns, markers)
        else:
            # MySQL has no DEFAULT VALUES clause; an empty column/value list
            # writes a row made of the column defaults.
            sql_query = "REPLACE INTO %s () VALUES ()" % table

        cur = None
        try:
            cur = SQL.connection.cursor()
            if values:
                cur.execute(sql_query, list(itervalues(values)))
            else:
                cur.execute(sql_query)
            SQL.connection.commit()
            return True
        except mysql.connector.Error as err:
            print("An error occured: {}".format(err))
            return False
        finally:
            # Always release the cursor, on success and on failure.
            if cur is not None:
                cur.close()
~~~

## module ：No module named mysqldb

上面的腳本依賴 `mysql.connector`，需要先安裝 MySQL Connector/Python，下載地址：

`http://ftp.ntu.edu.tw/MySQL/Downloads/Connector-Python/`