The code in this article is excerpted (with minor changes) from the book 《Python程序设计(第2版)》 by 董付国 (清华大学出版社). It does not use the scrapy crawler framework; instead it uses the standard-library urllib module to fetch web pages. If a page contains any of the keywords of interest, the page is saved as a local file, and the crawl depth is limited so that the crawler does not wander across the whole Internet.
import os
import re
import urllib.request as lib


def craw_links(url, depth, keywords, processed):
    '''url: the url to crawl
    depth: the remaining depth to crawl
    keywords: the tuple of keywords to focus on
    processed: the urls already crawled
    '''
    if url.startswith(('http://', 'https://')):
        if url not in processed:
            # mark this url as processed
            processed.append(url)
        else:
            # avoid processing the same url again
            return
        print('Crawling ' + url + '...')
        with lib.urlopen(url) as fp:
            # Python 3 returns bytes, so decode before searching
            contents = fp.read()
            contents_decoded = contents.decode('UTF-8')
        # form a regular expression from the keywords
        pattern = '|'.join(keywords)
        # if this page contains any of the keywords, save it to a file
        flag = False
        searched = None
        if pattern:
            searched = re.search(pattern, contents_decoded)
        else:
            # if no keywords are given, save every page
            flag = True
        if flag or searched:
            # build a file name from the url, replacing characters
            # that are not allowed in file names
            filename = url.replace(':', '_').replace('/', '_')
            with open(os.path.join('craw', filename), 'wb') as out_fp:
                out_fp.write(contents)
        # find all the links in the current page
        links = re.findall('href="(.*?)"', contents_decoded)
        # crawl all links found in the current page
        for link in links:
            # convert a relative path to an absolute url
            if not link.startswith(('http://', 'https://')):
                try:
                    index = url.rindex('/')
                    link = url[0:index + 1] + link
                except ValueError:
                    pass
            # control the crawl depth and only follow html pages
            if depth > 0 and link.endswith(('.htm', '.html')):
                craw_links(link, depth - 1, keywords, processed)


if __name__ == '__main__':
    processed = []
    keywords = ('datetime', 'KeyWord2')
    if not os.path.exists('craw') or not os.path.isdir('craw'):
        os.mkdir('craw')
    start_url = r'https://docs.python.org/3/library/index.html'
    craw_links(start_url, 1, keywords, processed)
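
A note on decoding: contents.decode('UTF-8') assumes every fetched page is UTF-8 encoded; a page in a different encoding would raise UnicodeDecodeError and stop the crawl. A more tolerant variant (a sketch, not part of the book's code) asks the response for its declared charset and falls back to UTF-8 with replacement characters:

        # inside the 'with lib.urlopen(url) as fp:' block
        contents = fp.read()
        # fp.headers is an email.message.Message, so get_content_charset()
        # returns the charset declared in the Content-Type header, or None
        charset = fp.headers.get_content_charset() or 'UTF-8'
        contents_decoded = contents.decode(charset, errors='replace')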
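
A note on relative links: the rindex('/') trick above works for simple relative paths such as 'os.html', but the standard library already provides urllib.parse.urljoin, which also handles forms like '../index.html' and '/3/library/os.html'. A minimal sketch (resolve_link is a hypothetical helper, not from the book):

import urllib.parse

def resolve_link(base_url, link):
    # urljoin leaves absolute links untouched and resolves
    # relative ones against the page they appear on
    return urllib.parse.urljoin(base_url, link)

# for example:
# resolve_link('https://docs.python.org/3/library/index.html', 'os.html')
# returns 'https://docs.python.org/3/library/os.html'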