# 3.1 使用urllib
## 1.相關鏈接
官方文檔:[https://docs.python.org/3/library/urllib.html](https://docs.python.org/3/library/urllib.html)
測試網站:httpbin.org
## 2.內容
Urllib 庫,是 Python 內置的 HTTP 請求庫,包含四個模塊:
* urllib.request:用於發送請求、打開URL
* urllib.error:異常處理模塊,如果出現請求錯誤,我們可以捕獲這些異常,然後進行重試或其他操作,保證程序不會意外終止
* urllib.parse:用於解析URL
* urllib.robotparser:用於解析robots.txt文件
## 一個簡單的例子
以下例子封裝成一個類的形式,採集中彩網雙色球的數據(這個版本只打印各分頁的鏈接,不採集頁面內容)
主要知識點:
* [ ] 使用urllib.parse解析URL
* [ ] 遞歸調用
~~~
import re
import urllib
import urllib.parse
import urllib.request
from urllib.error import URLError
from bs4 import BeautifulSoup
class GetDoubleColorBallNumber(object):
    """Collect the list-page URLs of the double-color-ball lottery results.

    Instantiating the class immediately fetches the first list page, reads
    the total page count from it, and prints one URL per list page. This
    version only prints the links; it does not scrape the page contents.
    """

    def __init__(self):
        urls = self.getUrls()
        for url in urls:
            print(url)

    def getUrls(self):
        """Return the URLs of every list page that needs to be scraped.

        Returns:
            A list of page URLs, numbered from 1 to the page count found on
            the first list page. An empty list is returned when the first
            page could not be downloaded.
        """
        url = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list.html'
        html = self.getResponseContent(url)
        if html is None:
            # The download failed (already reported by download()); there is
            # nothing to parse, so do not hand None to the HTML parser.
            return []
        soup = BeautifulSoup(html, 'lxml')
        # NOTE(review): a compiled regex is matched with search(), so this
        # selects every tag whose name contains "p", not only <p> tags.
        # The last such tag is expected to hold the page count in <strong>.
        tag = soup.find_all(re.compile('p'))[-1]
        pages = tag.strong.get_text()
        print('頁面總數:' + pages)
        return [
            'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(i) + '.html'
            for i in range(1, int(pages) + 1)
        ]

    def getResponseContent(self, url):
        """Download ``url`` with a browser-like User-Agent and 4 retries.

        Returns:
            The decoded page text, or None when the download failed.
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1) QQBrowser/6.0'
        html = self.download(url, num_retries=4, user_agent=user_agent)
        return html

    def download(self, url, user_agent='wswp', proxy=None, num_retries=3):
        """Download a page and return its body decoded as UTF-8.

        Args:
            url: Address of the page to fetch.
            user_agent: Value sent in the ``User-agent`` request header.
            proxy: Optional proxy address, applied to the scheme of ``url``.
            num_retries: How many more attempts are allowed after a 5xx
                server error.

        Returns:
            The page text, or None when the request failed.

        Raises:
            UnicodeDecodeError: If the response body is not valid UTF-8.
        """
        print("Downloading:%s" % url)
        headers = {'User-agent': user_agent}
        request = urllib.request.Request(url, headers=headers)
        opener = urllib.request.build_opener()
        if proxy:
            proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib.request.ProxyHandler(proxy_params))
        try:
            # The context manager closes the response (and its connection)
            # even if read() or decode() raises.
            with opener.open(request) as response:
                return response.read().decode('utf-8')  # python3
        except URLError as e:
            # Only HTTPError carries a status code; a plain URLError
            # (DNS failure, refused connection, ...) has just a reason.
            code = getattr(e, 'code', None)
            print("Download error:(code:%s,reason:%s)" % (code, e.reason))
            # Retry only on server-side (5xx) errors, and only while attempts
            # remain, so the recursion is always bounded.
            if num_retries > 0 and code is not None and 500 <= code < 600:
                print("Retrying .... ")
                return self.download(url, user_agent, proxy, num_retries - 1)
        return None
if __name__ == "__main__":
    # Script entry point: constructing the object runs the whole crawl and
    # prints the discovered list-page URLs.
    GetDoubleColorBallNumber()
~~~