A Scrapy spider that POSTs a JSON body and parses the JSON responses:

~~~
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from kunnanyuan.spider.spider.common import deal_date, transfrom, get_id
from ..items import XkItem
import json


class XkSdl10822Spider(scrapy.Spider):
    name = 'XK-FJM-0102'
    url = 'http://222.76.243.118:8090/publicity/get_double_publicity_record_list'
    # Build the request headers. Capture them with Postman, the browser's
    # network debugger, or ApiPost (see my post 爬蟲騷操作之30秒寫爬蟲(實用)):
    # just convert the capture to Python format and paste it in, done in seconds.
    headers = {
        'Origin': 'http://222.76.243.118:8090',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        # This header is required; without it the server will not accept the
        # JSON body and the parsing below will not work.
        'Content-Type': 'application/json; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'http://222.76.243.118:8090/page/double_publicity/allow.html',
        'Connection': 'keep-alive',
    }

    # Build the initial requests.
    def start_requests(self):
        # Lazy shortcut: no pagination loop limiting each request, just fetch
        # all records at once -- mainly with later incremental crawls in mind.
        data = {
            'listSql': '',
            'linesPerPage': "6704",
            'currentPage': "1",
            'deptId': '',
            'searchKeyword': '',
            'tag': 'ALLOW'
        }
        yield scrapy.Request(url=self.url, body=json.dumps(data), method='POST',
                             headers=self.headers, callback=self.parse_list)

    # Split the data into pages of a given size.
    # def parse_page(self, response):
    #     self.parse_list(response)
    #     if self.page == 1:
    #         ...  # the rest omitted

    def parse_list(self, response):
        # The response is JSON; convert it to a Python dict.
        tr1 = json.loads(response.text)
        # Treat tr1 as one big dict and fetch values by key.
        if tr1.get("message") == "請求成功":  # "request succeeded"
            data = tr1.get('data')  # can also be written data = tr1['data']; same below
            record_list = data.get('list')
            # Iterate over every record in the JSON list.
            for i in record_list:
                if i['legalPersonDocNumber'] is not None:
                    identifier = i['legalPersonDocNumber']
                else:
                    identifier = i['naturalPersonDocNumber']
                if i['jgFr'] is not None:
                    organization = i['jgFr']
                else:
                    organization = i['jgZr']
                businessId = i['businessId']
                record_id = i['id']
                objectType = i['objectType']
                createdAt = deal_date(i['businessCreateDate'].split('000000')[0])
                source_url = ("http://222.76.243.118:8090/page/double_publicity/publicity_detail.html"
                              "?id={}&businessId={}&tag=ALLOW&objectType={}".format(
                                  str(record_id), str(businessId), str(objectType)))
                prPrincipal = i['objectName']
                data = {
                    "businessId": businessId,
                    "id": record_id,
                    'objectType': objectType,
                    'tag': "ALLOW",
                    'pictureMinHeight': '628',
                    'pictureMinWidth': '1200'
                }
                url = "http://222.76.243.118:8090/publicity/get_publicity_detail_picture"
                yield Request(url, callback=self.parse4, body=json.dumps(data),
                              method='POST', headers=self.headers,
                              meta={"identifier": identifier,
                                    "organization": organization,
                                    "businessId": businessId,
                                    "createdAt": createdAt,
                                    "source_url": source_url,
                                    "prPrincipal": prPrincipal})

    # Parse the detail response and yield the item.
    def parse4(self, response):
        item = XkItem()
        item['identifier'] = response.meta["identifier"]
        item['organization'] = response.meta["organization"]
        print(item['organization'])
        # item['businessId'] = response.meta["businessId"]
        item['createdAt'] = response.meta["createdAt"]
        item['source_url'] = response.meta['source_url']
        item['prPrincipal'] = response.meta['prPrincipal']
        item['type'] = transfrom(str(item['organization']))
        item['fileType'] = "jpg"
        item['pid'] = get_id(str(item['identifier']))
        item['idMethod'] = '2'
        tr2 = json.loads(response.text)
        if tr2.get("message") == "請求成功":
            data = tr2.get('data')
            path = data.get('path')
            item['images'] = "http://222.76.243.118:8090/" + path
            yield item
~~~

Or:

~~~
# coding=utf-8
import scrapy
import json


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.test.com/test/get_data"
    ]

    def parse(self, response):
        # body_as_unicode() is called so unicode-encoded data is handled
        # correctly (in modern Scrapy, response.text does the same).
        sites = json.loads(response.body_as_unicode())
        # print(sites['k'])
        numbers = sites['k'].split(',')
        print(numbers)
~~~
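
The first spider leans on three helpers, `deal_date`, `transfrom`, and `get_id`, imported from a project-local `common` module that is not shown here. Their real implementations are unknown; judging only from the call sites, a minimal hypothetical sketch might look like this (names kept as in the spider, bodies are my assumptions):

~~~
# Hypothetical sketch of kunnanyuan/spider/spider/common.py -- the real
# module is not shown in the post; these bodies are guesses from the call sites.
import hashlib
from datetime import datetime


def deal_date(raw):
    """Normalize a raw date like '20190501' (businessCreateDate with the
    trailing '000000' already split off) to 'YYYY-MM-DD'."""
    return datetime.strptime(raw.strip(), '%Y%m%d').strftime('%Y-%m-%d')


def transfrom(organization):
    """Map the organization string to an internal type label
    (placeholder mapping; the real rule is unknown)."""
    return '01' if organization else '02'


def get_id(identifier):
    """Derive a stable primary key from the document number."""
    return hashlib.md5(identifier.encode('utf-8')).hexdigest()
~~~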
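As the header comment notes, the quickest way to get such requests right is to capture them in the browser's network debugger or Postman and convert them to Python. Before wiring the request into Scrapy, it can be sanity-checked with a plain `requests` call; this is a standalone sketch of the same POST, reusing the payload fields from the spider above:

~~~
import json
import requests

url = 'http://222.76.243.118:8090/publicity/get_double_publicity_record_list'
headers = {
    # Required so the server parses the body as JSON.
    'Content-Type': 'application/json; charset=UTF-8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Referer': 'http://222.76.243.118:8090/page/double_publicity/allow.html',
}
payload = {
    'listSql': '',
    'linesPerPage': '10',   # fetch a small page while testing
    'currentPage': '1',
    'deptId': '',
    'searchKeyword': '',
    'tag': 'ALLOW',
}
resp = requests.post(url, data=json.dumps(payload), headers=headers, timeout=10)
print(resp.json().get('message'))  # expect "請求成功" ("request succeeded")
~~~

Once this returns the expected message, the same URL, headers, and JSON body can be dropped into `scrapy.Request(..., method='POST', body=json.dumps(payload))` as in the spider.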