A Python Web Crawler Written Without the scrapy Framework


The code in this article is excerpted, with minor changes, from 《Python程序设计(第2版)》 (*Python Programming*, 2nd Edition, by 董付国, Tsinghua University Press). Instead of the scrapy crawler framework, it uses the standard-library module urllib to fetch web pages. If a page contains any of the keywords of interest, the page is saved as a local file, and the crawl depth is limited so that the crawler does not try to walk the entire Internet.

import re
import os
import urllib.request as lib
def craw_links(url, depth, keywords, processed):
    '''url: the URL to crawl
    depth: the remaining crawl depth
    keywords: a tuple of keywords to look for
    processed: the list of URLs already crawled
    '''
    
    if url.startswith(('http://', 'https://')):        
        if url not in processed:
            # mark this url as processed
            processed.append(url)
        else:
            # avoid processing the same url again
            return
        
        print('Crawling ' + url + '...')
        
        with lib.urlopen(url) as fp:
            # Python 3 returns bytes, so decode before searching;
            # errors='replace' keeps non-UTF-8 pages from crashing the crawl
            contents = fp.read()
            contents_decoded = contents.decode('utf-8', errors='replace')
        # join the keywords into one regular expression, escaping
        # them so that they are matched as literal text
        pattern = '|'.join(map(re.escape, keywords))
        # if this page contains any of the keywords, save it to a file
        flag = False
        searched = None
        if pattern:
            searched = re.search(pattern, contents_decoded)
        else:
            # if no keywords are given, save every page
            flag = True
            
        if flag or searched:
            # build a platform-independent path inside the craw directory
            filename = url.replace(':', '_').replace('/', '_')
            with open(os.path.join('craw', filename), 'wb') as fp:
                fp.write(contents)
                
        # find all the links in the current page (a deliberately simple
        # regular expression; it only matches double-quoted href values)
        links = re.findall('href="(.*?)"', contents_decoded)
        # craw all links in the current page
        for link in links:
            # turn a relative path into an absolute URL
            if not link.startswith(('http://', 'https://')):
                try:
                    index = url.rindex('/')
                    link = url[0:index+1] + link
                except ValueError:
                    # no '/' in the URL; leave the link unchanged
                    pass
                
            # control the crawl depth
            if depth>0 and link.endswith(('.htm','.html')):
                craw_links(link, depth-1, keywords, processed)
                
if __name__ == '__main__':
    processed = []
    keywords = ('datetime', 'KeyWord2')
    # create the output directory if it does not already exist
    os.makedirs('craw', exist_ok=True)
    start_url = r'https://docs.python.org/3/library/index.html'
    craw_links(start_url, 1, keywords, processed)
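
As printed above, the crawler assumes every fetch succeeds: one unreachable link raises an exception and stops the whole crawl, and some servers reject requests carrying urllib's default User-Agent. Below is a minimal sketch of a more defensive fetch helper, assuming a browser-like User-Agent is acceptable to the target site (the name fetch_page and the header value are illustrative choices, not part of the book's code):

import urllib.error
import urllib.request as lib

def fetch_page(url, timeout=10):
    '''Return the raw bytes of url, or None if the fetch fails.'''
    # some servers refuse the default Python-urllib User-Agent,
    # so present a browser-like one instead (an arbitrary choice)
    request = lib.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with lib.urlopen(request, timeout=timeout) as fp:
            return fp.read()
    except urllib.error.URLError:
        # a single unreachable page should not stop the crawl
        return None

With such a helper, the with lib.urlopen(url) block in craw_links could be replaced by contents = fetch_page(url) followed by an early return when the result is None. As given, the script saves matching pages into the craw directory under sanitized names; the start page, for example, would be stored as craw/https___docs.python.org_3_library_index.html.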
