### **Step 1: Create the project**
~~~
scrapy startproject cententjob
~~~
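Running the command generates a project skeleton roughly like the following (a sketch of the standard Scrapy layout; details can differ slightly between Scrapy versions):
~~~
cententjob/
    scrapy.cfg            # deploy configuration
    cententjob/
        __init__.py
        items.py          # item definitions (Step 3)
        middlewares.py
        pipelines.py      # item pipelines (Step 5)
        settings.py       # project settings (Step 6)
        spiders/
            __init__.py   # spider modules go here (Step 4)
~~~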
### **Step 2: Generate the spider, specifying its name and the site to crawl**
~~~
scrapy genspider cententjob hr.tencent.com
~~~
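The genspider command creates spiders/cententjob.py with a minimal stub, roughly like this (the exact template varies by Scrapy version); Step 4 below replaces it with the real parsing logic:
~~~
# -*- coding: utf-8 -*-
import scrapy


class CententjobSpider(scrapy.Spider):
    name = 'cententjob'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['http://hr.tencent.com/']

    def parse(self, response):
        pass
~~~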
### **Step 3: Write items.py to define the data to extract**
~~~
import scrapy


class CententjobItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    category = scrapy.Field()
    number = scrapy.Field()
    address = scrapy.Field()
    create_time = scrapy.Field()
~~~
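A CententjobItem instance behaves much like a dict, except that only the declared fields can be assigned. A quick illustration (not part of the project files):
~~~
from cententjob.items import CententjobItem

job = CententjobItem(title='Backend Engineer')
job['address'] = 'Shenzhen'     # OK: 'address' is a declared field
# job['salary'] = '20k'         # would raise KeyError: field not declared
print(dict(job))                # {'title': 'Backend Engineer', 'address': 'Shenzhen'}
~~~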
### **Step 4: Write the spider file spiders/cententjob.py to handle requests and responses and extract the data (yield item)**
~~~
import scrapy
from cententjob.items import CententjobItem


class CententjobSpider(scrapy.Spider):
    # Debug print: runs once when the class is loaded, which is why the class name
    # also shows up in the `scrapy list` output in Step 7
    print(CententjobItem)

    name = 'cententjob'
    # Allowed domains for the crawl (optional)
    # allowed_domains = ['centent.com']

    # Base URL and offset used for pagination
    baseUrl = 'https://hr.tencent.com/position.php?start='
    offset = 0
    start_urls = [baseUrl + str(offset)]

    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            jobs = CententjobItem()
            # Extract the fields of each position
            jobs['title'] = node.xpath("./td[1]/a/text()").extract()[0]
            jobs['link'] = node.xpath("./td[1]/a/@href").extract()[0]
            try:
                jobs['category'] = node.xpath("./td[2]/text()").extract()[0]
            except IndexError:
                jobs['category'] = ''
            jobs['number'] = node.xpath("./td[3]/text()").extract()[0]
            jobs['address'] = node.xpath("./td[4]/text()").extract()[0]
            jobs['create_time'] = node.xpath("./td[5]/text()").extract()[0]
            yield jobs

        # Follow the "next page" link as long as it is not disabled (class 'noactive')
        if not response.xpath("//a[@class='noactive' and @id='next']/@href"):
            url = response.xpath("//a[@id='next']/@href").extract()[0]
            allUrl = 'https://hr.tencent.com/' + url
            yield scrapy.Request(allUrl, callback=self.parse)

        # Alternative: paginate by incrementing the start offset
        # if self.offset < 3110:
        #     self.offset += 10
        #     url = self.baseUrl + str(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)
~~~
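Before running the whole spider, it can be handy to check the XPath expressions in isolation with Scrapy's Selector. Below is a small sketch against a made-up HTML fragment that mimics one row of the listing table (the fragment and its values are assumptions, not the real page):
~~~
from scrapy.selector import Selector

# Hypothetical fragment shaped like one row of the job listing table
html = """
<table>
  <tr class="even">
    <td><a href="position_detail.php?id=1">Backend Engineer</a></td>
    <td>Technology</td><td>2</td><td>Shenzhen</td><td>2018-06-01</td>
  </tr>
</table>
"""

sel = Selector(text=html)
for node in sel.xpath("//tr[@class='even'] | //tr[@class='odd']"):
    print(node.xpath("./td[1]/a/text()").extract_first())   # Backend Engineer
    print(node.xpath("./td[1]/a/@href").extract_first())    # position_detail.php?id=1
    print(node.xpath("./td[4]/text()").extract_first())     # Shenzhen
~~~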
### **Step 5: Write pipelines.py, the pipeline that handles the item data returned by the spider**
~~~
import MySQLdb


class CententjobPipeline(object):
    def __init__(self):
        # Connect to the database
        self.con = MySQLdb.connect(
            host='127.0.0.1',
            port=3306,
            db='ganji',
            user='root',
            passwd='123456',
            charset='utf8',
            use_unicode=True,
        )
        # Cursor used for inserts and queries
        self.cu = self.con.cursor()

    def open_spider(self, spider):
        print('Pipeline opened, starting to store items')

    def process_item(self, item, spider):
        try:
            # Skip records whose link is already in the table (duplicates)
            self.cu.execute("select 1 from centent where link = %s", (item['link'],))
            if self.cu.fetchone():
                print('Duplicate record, skipping')
            else:
                insert_sql = ("insert into centent "
                              "(title, link, category, number, address, create_time) "
                              "values (%s, %s, %s, %s, %s, %s)")
                self.cu.execute(insert_sql, (item['title'], item['link'], item['category'],
                                             item['number'], item['address'], item['create_time']))
                self.con.commit()
        except Exception as error:
            # Log the error instead of crashing the spider
            spider.logger.error(error)
        return item

    def close_spider(self, spider):
        self.con.close()
        print('Pipeline closed')
~~~
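The pipeline assumes a database named ganji containing a table named centent. The original post does not show the schema, so the script below is only a guessed setup matching the columns the pipeline inserts:
~~~
import MySQLdb

# Assumed schema: column types are a guess based on the fields used by the pipeline
con = MySQLdb.connect(host='127.0.0.1', port=3306, user='root',
                      passwd='123456', charset='utf8')
cu = con.cursor()
cu.execute("CREATE DATABASE IF NOT EXISTS ganji DEFAULT CHARSET utf8")
cu.execute("""
    CREATE TABLE IF NOT EXISTS ganji.centent (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        link VARCHAR(255),
        category VARCHAR(64),
        number VARCHAR(16),
        address VARCHAR(64),
        create_time VARCHAR(32)
    ) DEFAULT CHARSET=utf8
""")
con.commit()
con.close()
~~~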
### **Step 6: Edit settings.py to enable the pipeline and other related settings**
~~~
ITEM_PIPELINES = {
    'cententjob.pipelines.CententjobPipeline': 300,
}
~~~
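Besides enabling the pipeline, a couple of other settings are often adjusted for a crawl like this one (optional; the values below are only illustrative):
~~~
# Be polite to the target site: add a small delay between requests
DOWNLOAD_DELAY = 1

# Send a browser-like User-Agent instead of the default Scrapy one
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}
~~~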
### **Step 7: Run the spider**
~~~
C:\Users\Administrator\Desktop\cententjob>scrapy list
<class 'cententjob.items.CententjobItem'>
cententjob
C:\Users\Administrator\Desktop\cententjob>scrapy crawl cententjob
~~~
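If you just want a quick look at the scraped data without MySQL, Scrapy's built-in feed export can write the items straight to a file with the -o option:
~~~
scrapy crawl cententjob -o jobs.json
~~~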
Now just sit back and wait for the crawl to finish.