How multithreading works
Multithreading diagram
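
A minimal sketch of the idea (the worker function and thread count are illustrative): several threads run inside one process, each executing the same function concurrently.

import threading

def worker(worker_id):
    # Each thread runs this function independently
    print('worker %d running in thread %s' % (worker_id, threading.current_thread().name))

threads = [threading.Thread(target=worker, args=(i,)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait for every worker to finish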

Queue (the queue object)
queue is part of the Python standard library and can be imported directly with from queue import Queue; a queue is the most common way for threads to exchange data.
Thoughts on multithreading in Python
Locking shared resources is an essential step. Queue is thread-safe, so whenever it fits the use case, a queue is the recommended choice.
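
A minimal sketch of why locking matters (the counter and thread count are illustrative): without the Lock, the concurrent read-modify-write on the shared variable can lose updates.

import threading

counter = 0
lock = threading.Lock()

def increment():
    global counter
    for _ in range(100000):
        with lock:  # protects the read-modify-write on counter
            counter += 1

threads = [threading.Thread(target=increment) for _ in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter)  # 500000 with the lock; without it, updates can be lost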
Create a "queue" object
pageQueue = Queue(10)  # holds at most 10 items
Put a value into the queue
for page in range(1, 11):
    pageQueue.put(page)
Take a value out of the queue
pageQueue.get()  # blocks by default until an item is available
The Queue class
Queue is thread-safe
     Create a "queue" object
     Common queue methods (exercised in the sketch after this list)
         put()
         get(block)
         empty()
         full()
         qsize()
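
A quick sketch exercising these methods (queue size and values are arbitrary):

from queue import Queue

q = Queue(3)               # maxsize of 3
print(q.empty())           # True: nothing queued yet
q.put('a')
q.put('b')
q.put('c')
print(q.full())            # True: maxsize reached
print(q.qsize())           # 3
print(q.get())             # 'a' (FIFO order)
print(q.get(block=False))  # 'b'; raises queue.Empty instead of blocking when the queue is empty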
Queue "lock" and thread "lock": Queue.join() versus Thread.join()
import threading
from queue import Queue, Empty

dataQueue = Queue(100)
exitFlag = False

class MyThread(threading.Thread):
    def __init__(self, q):
        super().__init__()
        self.queue = q

    def run(self):
        super().run()
        global exitFlag
        while True:
            if exitFlag:
                print('++++++++++++++++++++++++++exit')
                break
            try:
                # Non-blocking get: raises Empty when the queue has no items
                print('------------------------', self.queue.get(False))
                self.queue.task_done()
            except Empty:
                pass

def main():
    for i in range(100):
        dataQueue.put(i)
    threads = []
    for i in range(5):
        thread = MyThread(dataQueue)
        threads.append(thread)
        thread.start()
    # Queue "lock": uncomment to block here until every item has been matched
    # by a task_done() call; without it, exitFlag may be set before the
    # workers have drained the queue
    # dataQueue.join()
    global exitFlag
    exitFlag = True
    print('exit ------------------------------------------------')
    # Thread "lock": wait for every worker thread to terminate
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()

Another example: scraping the dushu.com book site
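# Two-stage pipeline: CrawlThread workers pull page numbers from task_queue,
# download the HTML, and put (html, page) pairs on parse_queue; ParseThread
# workers pull from parse_queue, extract the book fields with BeautifulSoup,
# and append lines to a shared file under a Lock.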
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
import threading
from threading import Lock
url = 'https://www.dushu.com/book/1175_%d.html'
task_queue = Queue(100)
parse_queue = Queue(100)
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d=1572418328; Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d=1572418390',
    'Host': 'www.dushu.com',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}
# Flag that tells the parse threads to exit
exit_flag = False
# A handful of these workers acts like a simple thread pool
class CrawlThread(threading.Thread):
    def __init__(self, q_task:Queue,q_parse:Queue) -> None:
        super().__init__()
        self.q_task = q_task
        self.q_parse = q_parse
    def run(self) -> None:
        super().run()
        self.spider()
    # Keep pulling page numbers until the task queue is empty
    def spider(self):
        while True:
            try:
                # Non-blocking get avoids hanging if another worker takes the
                # last task between an empty() check and a blocking get()
                taskId = self.q_task.get(block=False)
            except Empty:
                print('+++++++ crawl thread %s has no more tasks, exiting +++++++'
                      % threading.current_thread().name)
                break
            response = requests.get(url % taskId, headers=headers)
            response.encoding = 'utf-8'
            html = response.text
            # Hand the raw HTML to the parse stage together with the page number
            self.q_parse.put((html, taskId))
            self.q_task.task_done()
            print('------ crawl thread %s finished page %d -------'
                  % (threading.current_thread().name, taskId))
# Crawl stage: queue the page numbers and start the crawl workers
def crawl():
    for i in range(1,101):
        task_queue.put(i)
    for i in range(5):
        t = CrawlThread(task_queue,parse_queue)
        t.start()
class ParseThread(threading.Thread):
    def __init__(self,q_parse:Queue,lock:Lock,fp):
        super().__init__()
        self.q_parse = q_parse
        self.lock = lock
        self.fp = fp
    def run(self):
        super().run()
        self.parse()
    def parse(self):
        while True:
            if exit_flag:
                print('----------- parse thread %s done, exiting ------------'
                      % threading.current_thread().name)
                break
            try:
                # Non-blocking get: raises Empty when no page is waiting
                html, taskId = self.q_parse.get(block=False)
            except Empty:
                continue  # nothing to parse yet; loop back and re-check exit_flag
            try:
                soup = BeautifulSoup(html, 'lxml')
                books = soup.select('div[class="bookslist"] > ul > li')
                print('----------------', len(books))
                for book in books:
                    book_url = book.find('img').attrs['src']
                    book_title = book.select('h3 a')[0]['title']
                    book_author = book.select('p')[0].get_text()
                    book_describe = book.select('p')[1].get_text()
                    # The file object is shared by every parse thread,
                    # so writes are serialized with the lock
                    with self.lock:
                        self.fp.write('%s\t%s\t%s\t%s\n'
                                      % (book_url, book_title, book_author, book_describe))
            finally:
                self.q_parse.task_done()  # always balance the get() so join() can finish
            print('********** parse thread %s finished parsing page %d ***********'
                  % (threading.current_thread().name, taskId))
# Parse stage: extract the book data and save it
def parse(fp):
    lock = Lock()
    for i in range(5):
        t = ParseThread(parse_queue,lock,fp)
        t.start()
if __name__ == '__main__':
    crawl()
    fp = open('./book.txt','a',encoding='utf-8')
    parse(fp)
    # Queue join: blocks until every queued task has been marked done with
    # task_done(), so the lines below only run after all work is finished
    task_queue.join()
    parse_queue.join()
    fp.close()
    exit_flag = True  # tell the parse threads to stop (assigned at module level, so no global statement is needed)
    print('execution reached this point!!!!!!!!!!!!!!')

Multithreading implementation
     Book site: http://www.qwsy.com/shuku.aspx?&page=1
     Import the packages
     Define the variables
     Create the crawl threads and start them
         Crawl thread
     Create the parse threads and start them
         Parse thread
             Queue.get(block=True/False)
     join() the queues and threads to make sure all work is finished
     End the task (an alternative shutdown pattern is sketched after this outline)
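
As an alternative to a global exit flag, a common shutdown pattern is to enqueue one sentinel value per worker: each worker exits when it receives one, so no flag and no non-blocking get are needed. A minimal sketch (names and counts are illustrative):

import threading
from queue import Queue

SENTINEL = None
q = Queue()

def worker():
    while True:
        item = q.get()  # blocking get: no busy-waiting on a flag
        if item is SENTINEL:
            q.task_done()
            break       # one sentinel stops exactly one worker
        print('processing', item)
        q.task_done()

threads = [threading.Thread(target=worker) for _ in range(5)]
for t in threads:
    t.start()
for i in range(10):
    q.put(i)
for _ in threads:
    q.put(SENTINEL)  # one sentinel per worker
for t in threads:
    t.join()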