A Python Web Crawler Written Without the Scrapy Framework

2016-12-30 董付国 Python小屋

The code in this article is adapted, with minor changes, from 《Python程序设计(第2版)》 (Python Programming, 2nd edition, 董付国, Tsinghua University Press). It does not use the Scrapy crawler framework; instead it fetches pages with the standard-library module urllib. If a page contains any of the keywords of interest, the page is saved as a local file, and the crawl depth is limited so that the crawler does not wander across the whole Internet.

import re
import os
import urllib.request as lib


def craw_links(url, depth, keywords, processed):
    '''url: the url to crawl
    depth: the remaining crawl depth
    keywords: a tuple of keywords to look for
    processed: the urls already crawled
    '''
    if url.startswith(('http://', 'https://')):
        if url not in processed:
            # mark this url as processed
            processed.append(url)
        else:
            # avoid processing the same url again
            return

        print('Crawling ' + url + '...')

        try:
            with lib.urlopen(url) as fp:
                # Python 3 returns bytes, so decode before searching the text
                contents = fp.read()
                contents_decoded = contents.decode('UTF-8', errors='ignore')
        except Exception:
            # skip pages that cannot be fetched
            return

        # form a regular expression from the keywords
        pattern = '|'.join(keywords)
        # if this page contains any of the keywords, save it to a file
        flag = False
        searched = None
        if pattern:
            searched = re.search(pattern, contents_decoded)
        else:
            # if no keywords are given, save every page
            flag = True

        if flag or searched:
            # build a file name that is safe on the local file system
            filename = url.replace(':', '_').replace('/', '_')
            with open(os.path.join('craw', filename), 'wb') as fp:
                fp.write(contents)

        # find all the links in the current page
        links = re.findall('href="(.*?)"', contents_decoded)
        # crawl all the links in the current page
        for link in links:
            # convert relative paths to absolute urls
            if not link.startswith(('http://', 'https://')):
                try:
                    index = url.rindex('/')
                    link = url[:index+1] + link
                except ValueError:
                    # the url contains no '/', leave the link unchanged
                    pass

            # control the crawl depth
            if depth > 0 and link.endswith(('.htm', '.html')):
                craw_links(link, depth-1, keywords, processed)
                

if __name__ == '__main__':
    processed = []
    keywords = ('datetime', 'KeyWord2')
    if not os.path.isdir('craw'):
        os.mkdir('craw')
    start_url = r'https://docs.python.org/3/library/index.html'
    craw_links(start_url, 1, keywords, processed)
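
The link handling above is deliberately simple: the regular expression only matches double-quoted href attributes (and also picks up the href of <link> tags), and the rindex('/') trick pastes relative links onto the current directory without normalizing '..' segments, so the same page can be reached under different urls. As a variation that is not part of the book code, the sketch below stays within the standard library and shows how links could be extracted with html.parser and resolved with urllib.parse.urljoin; the class LinkCollector and the function extract_links are illustrative names introduced here.

# A sketch, not from the book: extract <a href=...> links with the
# standard-library html.parser and resolve them with urllib.parse.urljoin.
# LinkCollector and extract_links are illustrative names used only here.
from html.parser import HTMLParser
from urllib.parse import urljoin


class LinkCollector(HTMLParser):
    '''Collect the href attribute of every <a> tag that is fed to it.'''
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)


def extract_links(page_url, html_text):
    '''Return absolute urls for all <a href=...> links in html_text.'''
    collector = LinkCollector()
    collector.feed(html_text)
    # urljoin resolves relative paths and '..' segments against
    # the page the link was found on
    return [urljoin(page_url, link) for link in collector.links]

Inside craw_links, the for-loop could then iterate over extract_links(url, contents_decoded) instead of the re.findall result, with the relative-path branch removed.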