8-3-亞馬遜案例 · Scrapy框架

# -*- coding: utf-8 -*-
# Amazon spider example (tutorial section 8-3, Scrapy framework).
# Illustration: https://img.kancloud.cn/41/e0/41e066af9a6c25a24868d9667253ec98_1241x333.jpg
#
# - Requirement: scrape book information from Amazon.
# - Goal: scrape the top-level book categories, book URLs, list-page
#   pagination URLs, book titles, book authors and book prices.
# - Start URL:
#   https://www.amazon.cn/%E5%9B%BE%E4%B9%A6/b/ref=sd_allcat_books_l1?ie=UTF8&node=658390051
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider

# Goal: scrape Amazon book information: title, cover image URL, book URL,
# author, publisher, publication date, price, the top-level category the
# book belongs to, its sub-category, and the category URLs.
#
# Approach:
#   1. First implement the spider as a plain Scrapy CrawlSpider.
#   2. Convert it to a RedisCrawlSpider:
#      2.1 Change the base class to RedisCrawlSpider.
#      2.2 Replace start_urls with redis_key.
#      2.3 Update the settings file (configuring it once is enough for
#          all spiders in the project).

# Captures (publisher, publication date) from the product-details list item,
# e.g. ('中信出版社', '2018年7月1日'). Written as a raw string so that the
# escaped parentheses are not treated as (invalid) string escape sequences,
# and compiled once instead of on every parsed page.
_PUBLISHER_RE = re.compile(r'<li><b>出版社:</b> (.+?);.*?\((.+?)\)</li>')

# Item-key prefixes for the category links, in the order they are read from
# the page: big (top-level), middle and small category.
_CATEGORY_PREFIXES = ('b', 'm', 's')


# Step 2.1: inherit from RedisCrawlSpider instead of CrawlSpider.
class AmazonSpider(RedisCrawlSpider):
    """Crawl Amazon book categories and list pages, yielding one dict per book.

    The three ``rules`` follow category links, follow list-page pagination
    links, and send book detail pages to :meth:`parse_item`.
    """

    name = 'amazon'
    allowed_domains = ['amazon.cn']

    # Step 2.2: start_urls is replaced by redis_key, which names the key in
    # the Redis database that holds the start URLs. The CrawlSpider version
    # used:
    # start_urls = ['https://www.amazon.cn/圖書/b/ref=sa_menu_top_books_l1?ie=UTF8&node=658390051']
    redis_key = 'amazon:start_urls'

    rules = (
        # 1. Extract category URLs.
        #    restrict_xpaths limits link extraction to the given page region.
        Rule(
            LinkExtractor(restrict_xpaths='//*[@id="leftNav"]/ul[1]/ul/div/li'),
            follow=True,
        ),
        # 2. Extract list-page pagination URLs.
        Rule(
            LinkExtractor(restrict_xpaths='//*[@id="pagn"]'),
            follow=True,
        ),
        # 3. Extract book detail-page URLs and parse them.
        Rule(
            LinkExtractor(restrict_xpaths='//a[contains(@class, "s-access-detail-page")]'),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Parse a book detail page and yield a single item dict.

        Always sets ``book_name``, ``book_img``, ``book_url``,
        ``book_author`` and ``book_price``. ``book_publisher`` and
        ``book_publish_date`` are set only when the publisher line matches.
        ``b_``/``m_``/``s_`` ``category_name`` and ``category_url`` are set
        for as many of the first three category links as the page contains.

        Args:
            response: The response for the book detail page.

        Yields:
            dict: The scraped book data.
        """
        item = {}
        # Book title.
        item['book_name'] = response.xpath('//*[contains(@id,"roductTitle")]/text()').extract_first()
        # Cover image URL.
        item['book_img'] = response.xpath('//*[contains(@id, "mgBlkFront")]/@src').extract_first()
        # Book URL.
        item['book_url'] = response.url
        # Author(s): all author link texts concatenated.
        item['book_author'] = ''.join(response.xpath('//*[@id="bylineInfo"]/span/a/text()').extract())
        # Price.
        item['book_price'] = response.xpath('//span[contains(@class, "a-color-price")]/text()').extract_first()

        # Publisher and publication date come from the raw HTML; only the
        # first occurrence is used.
        publish = _PUBLISHER_RE.search(response.text)
        if publish:
            item['book_publisher'], item['book_publish_date'] = publish.groups()

        # Category links that have text; the first three are the big,
        # middle and small category. zip() stops at the shorter sequence, so
        # pages with fewer links simply get fewer category keys.
        category_links = response.xpath('//span[@class="a-list-item"]/a[text()]')
        for prefix, link in zip(_CATEGORY_PREFIXES, category_links):
            item[prefix + '_category_name'] = link.xpath('./text()').extract_first().strip()
            item[prefix + '_category_url'] = response.urljoin(link.xpath('./@href').extract_first())

        # Hand the item over to the engine.
        yield item