python - multi-process spider

炽凤亮尧 2022-10-28


Problem overview:

  1. Scrape the articles in the 性福 section of 君不见 (junbujian.cc), plus the article thumbnail images on the list pages
  2. One process for the URL queue; ten processes each for articles and images (see the skeleton sketch below)
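
In outline, this is a producer/consumer layout: one producer process fills two multiprocessing.Queue instances, and two pools of ten worker processes drain them. A minimal skeleton of that shape, independent of the full solution below (the names producer and worker are illustrative, not part of the solution):

import multiprocessing

def producer(queue):
    for item in range(100):        # stand-in for scraped URLs
        queue.put(item)

def worker(queue):
    while not queue.empty():       # same draining style as the solution below
        item = queue.get(timeout=2)
        # ... process item ...

if __name__ == '__main__':
    queue = multiprocessing.Queue()
    prod = multiprocessing.Process(target=producer, args=(queue,))
    prod.start()
    prod.join()
    workers = [multiprocessing.Process(target=worker, args=(queue,)) for _ in range(10)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()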


Solution:


#!/usr/bin/env python
# coding=utf-8

import multiprocessing
import urllib2
import re
import uuid
import os
import time


class Spider():
    def __init__(self):
        self.lock = multiprocessing.Lock()
        # queue of article URLs
        self.queue = multiprocessing.Queue()
        # queue of thumbnail image URLs
        self.imgs = multiprocessing.Queue()
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'}
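        # NOTE: on platforms that start processes by fork (e.g. Linux),
        # the Queue and Lock above are inherited by the worker processes,
        # which is why they can be shared through self.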

    def getUrls(self):
        # producer: scan the 50 list pages, queue article and thumbnail URLs
        urls = ['http://www.junbujian.cc/xingfu/' + str(i) + '.html' for i in range(1, 51)]
        for url in urls:
            request = urllib2.Request(url, headers=self.headers)
            try:
                response = urllib2.urlopen(request)
                content = response.read().decode('utf-8')
            except Exception:
                content = ''
                print 'error: ' + url
            pattern = re.compile(r'<a target="_blank" class="thumbnail" href="(.*?)">')
            for a_url in re.findall(pattern, content):
                self.queue.put(a_url, timeout=2)
            img_pat = re.compile(r'<img src="(.*?)" class="thumb">')
            for i_url in re.findall(img_pat, content):
                self.imgs.put(i_url, timeout=2)

    def getImg(self):
        # consumer: download queued thumbnails into ./img/
        while not self.imgs.empty():
            img_url = self.imgs.get(timeout=2)
            if not os.path.exists('./img/'):
                os.mkdir('./img/')
            img_name = './img/' + str(uuid.uuid1()) + '.jpg'
            try:
                # download first, so a failed request leaves no empty file behind
                data = urllib2.urlopen(img_url).read()
            except Exception:
                print 'error: ' + img_url
                continue
            with open(img_name, 'wb') as f:
                f.write(data)
            # serialize output so messages from the 10 workers don't interleave
            self.lock.acquire()
            print multiprocessing.current_process().name, ' ', img_name, ' saved...'
            self.lock.release()

    def getArticle(self):
        # consumer: fetch queued article pages and save the article body as HTML
        while not self.queue.empty():
            art_url = self.queue.get(timeout=2)
            request = urllib2.Request(art_url, headers=self.headers)
            try:
                response = urllib2.urlopen(request)
                content = response.read().decode('utf-8')
            except Exception:
                print 'error:', art_url
                content = ''
            # re.S changes the meaning of '.' so it matches every character,
            # not every character except \n; the article body spans lines
            pat = re.compile(r'<article class="article-content">(.*?)</article>', re.S)
            article = re.findall(pat, content)
            if not os.path.exists('./data'):
                os.mkdir('./data')
            try:
                with open('./data/' + str(uuid.uuid1()) + '.html', 'w') as f:
                    f.write(multiprocessing.current_process().name + '\n' + article[0].encode('utf-8'))
            except Exception:
                # no match (or write failure): log the URL instead of crashing
                with open('./data/error.txt', 'a') as e:
                    e.write('url:' + art_url + '\n')
            print '...'

    def run(self):
        # 1 producer process for the URL queues
        urls_proc = multiprocessing.Process(target=self.getUrls)
        urls_proc.daemon = True
        urls_proc.start()
        print 'starting urls_proc...'
        # give the producer a head start, so the queues are not empty
        # when the consumers begin polling
        time.sleep(20)

        # 10 image workers and 10 article workers
        imgs_proc_list = []
        art_proc_list = []
        for i in range(10):
            imgs_proc = multiprocessing.Process(target=self.getImg)
            imgs_proc_list.append(imgs_proc)
            imgs_proc.daemon = True
            imgs_proc.start()
            print 'starting proc', i
        for i in range(10):
            art_proc = multiprocessing.Process(target=self.getArticle)
            art_proc_list.append(art_proc)
            art_proc.daemon = True
            art_proc.start()
            print 'starting proc_a', i

        # bounded joins: wait at most 15s per process, then exit
        urls_proc.join(15)
        for proc in imgs_proc_list:
            proc.join(15)
        for proc in art_proc_list:
            proc.join(15)
        print 'end...'


if __name__ == '__main__':
    spider = Spider()
    spider.run()
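
One caveat worth noting: draining with "while not queue.empty()" is racy. A worker can observe an empty queue before the producer has finished filling it, which is why the solution needs the fixed 20-second sleep. A common alternative is a sentinel-based shutdown; a minimal sketch, assuming one sentinel per worker (producer, worker, and NUM_WORKERS are illustrative names):

import multiprocessing

NUM_WORKERS = 10

def producer(queue):
    for url in ['u1', 'u2', 'u3']:    # stand-in for scraped URLs
        queue.put(url)
    for _ in range(NUM_WORKERS):      # one sentinel per worker
        queue.put(None)

def worker(queue):
    while True:
        item = queue.get()            # blocks until something arrives
        if item is None:              # sentinel: producer is done
            break
        # ... download/process item ...

if __name__ == '__main__':
    q = multiprocessing.Queue()
    procs = [multiprocessing.Process(target=worker, args=(q,)) for _ in range(NUM_WORKERS)]
    for p in procs:
        p.start()
    producer(q)
    for p in procs:
        p.join()

With this shape, the workers block instead of polling, so the head-start sleep and the bounded joins are no longer needed.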


