# Procedural version: fetch one listing page of the Baidu Tieba "段子"
# (jokes) forum and print every thread title/body pair found in it.
#
# 1. Build the URL
from urllib import request
import re

# Page offset within the forum listing (50 results per page).
page = 50
url = "https://tieba.baidu.com/f?kw=%E6%AE%B5%E5%AD%90&ie=utf-8&pn=" + str(page)

# 2. Add a browser User-Agent header and fetch the page
try:
    # Desktop-browser User-Agent so the server returns the full HTML page.
    headrs = {"User-Agent": " Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
    # Build the request with the custom headers attached.
    req = request.Request(url, headers=headrs)
    # Open the URL.
    resp = request.urlopen(req)

    # 3. Extract every joke on the page with a regular expression
    content = resp.read().decode('utf-8')
    # <a rel="noopener"  : literal anchor prefix
    # .*?                : lazily skip uninteresting attributes
    # (.*?)              : capture groups for the useful data
    # \s                 : whitespace terminates the title attribute
    # group(1) = title attribute value, group(2) = link text
    pattern = re.compile(r'<a rel="noopener".*?title=(.*?)\s.*?>(.*?)</a>')
    # Match against the downloaded HTML.
    items = re.findall(pattern, content)
    # Print the parsed results.
    for i in items:
        print("标题:"+i[0]+" 内容:"+i[1])

except request.URLError as e:
    # Print the HTTP status code if the error carries one.
    if hasattr(e, 'code'):
        print(e.code)
    # Print the failure reason if available.
    if hasattr(e, 'reason'):
        print(e.reason)
# ---- Object-oriented version (面向对象模式) ----
 from urllib import request
 import re
  
 class tieba:
     #初始化
     def __init__(self):
         # 定义url
         self.url="https://tieba.baidu.com/f?kw=%E6%AE%B5%E5%AD%90&ie=utf-8&pn="
         # 定义请求头
         self.headrs={"User-Agent":" Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
         #列表,存储解析后的结果
         self.stories=[]
     #下载页面
     def getPage(self,page_number):
         try:
             # 定义请求,传入请求头
             req=request.Request(self.url+str(page_number),headers=self.headrs)
             # 打开网页
             resp=request.urlopen(req)
             # 打印响应码,解码
             content=resp.read().decode("utf-8")
             return content
         except request.URLError as e:
             # 打印响应码
             if hasattr(e, 'code'):
                 print(e.code)
             # 打印异常原因
             if hasattr(e, 'reason'):
                 print(e.reason)
     #解析页面
     def rexgPage(self,content):
         # 定义正则表达式
         # <a rel="noopener" 具体的东西
         # .*? 匹配没用的数据
         # (.*?)匹配有用数据分组
         # \s 空格
         pattern = re.compile(r'<a rel="noopener".*?title=(.*?)\s.*?>(.*?)</a>')
         # 匹配html
         items = re.findall(pattern, content)
         # 添加解析的内容
         for i in items:
             # print("标题:" + i[0] + " 内容:" + i[1])
             self.stories.append("标题:" + i[0] + " 内容:" + i[1])
     #显示解析的内容
     def getContent(self):
         for i in self.stories:
             print(i)
  
  
 #创建对象
 c=tieba()
 #调用方法
 c.rexgPage(c.getPage(100))
 c.getContent()










