思路
- 我们用360查询电话号码,然后就会返回我们想要的电话归属地(省份、市级)以及来源分类(联通、电信、移动)
- 然后我们批量导入查询,并写入excel文档
- 接口测试1:https://www.so.com/s?ie=utf-8&q=0532-66776800
- 接口测试2:https://www.so.com/s?ie=utf-8&q=0532-66776800(注:此链接与接口测试1完全相同,疑为原文笔误,应为另一个不同的测试号码或接口)
- 案例为了分块测试代码,我们使用jupyter notebook 编写,完整代码在文末。
测试请求流程
# Target: the 360 search results page for a phone-number query; the page
# embeds the number's region (province/city) and carrier.
url=r"https://www.so.com/s?ie=utf-8&q=18529108189"
import requests
import re
# Plain GET with no headers — works for this interactive test; the final
# script below adds a User-Agent to avoid being rejected.
r=requests.get(url)
#r.encoding ="utf-8" #enable when Chinese text in the response is garbled
print(type(r)) #type: <class 'requests.models.Response'>
print(r. status_code)#HTTP status: 200, or 404, 500, etc.
print(type(r.text)) #type of the response body: <class 'str'>
print(r.cookies) #cookies: <RequestsCookieJar[<Cookie(.*)/>]>
print(r.text) #full page content
点击返回值的任意位置,按ctrl+f,输入我们测试的电话号码归属地“广州”回车,定位有我们需要的内容。
测试解析内容
from bs4 import BeautifulSoup
# Parse the HTML response with the lxml backend.
soup = BeautifulSoup(r.text, 'lxml')
# Pretty-printed copy (standard indentation); BeautifulSoup also repairs
# missing or malformed nodes while parsing. Kept here for inspection only.
r1 = soup.prettify()
# Node targeting: the <p class="mh-detail"> element on the 360 results page
# holds the number / region / carrier text.
A = soup.find_all('p', {'class': "mh-detail"})[0].get_text()
A = A.replace("\n", " ").strip()  # drop newlines, trim leading/trailing spaces
print(A)
A = A.split()  # split on whitespace into a list of fields
L = len(A)
print(L)
# The number of fields varies with how much info 360 returns; pad the
# missing slots with "未知" (unknown). r"\D" strips every non-digit from the
# number field (raw string — "\D" unescaped is an invalid escape sequence).
if L == 1:
    A0, A1, A2, A3 = re.sub(r"\D", "", A[0]), "未知", "未知", "未知"
elif L == 2:
    A0, A1, A2, A3 = re.sub(r"\D", "", A[0]), A[1], "未知", "未知"
elif L == 3:
    # NOTE(review): province and city both take A[1] here; looks deliberate
    # for a merged "province+city" field — confirm against real responses.
    A0, A1, A2, A3 = re.sub(r"\D", "", A[0]), A[1], A[1], "未知"
else:
    A0, A1, A2, A3 = re.sub(r"\D", "", A[0]), A[1], A[1], A[3]
data = [A0, A1, A2, A3]  # [number, province, city, carrier]
print(data)
ok,请求解析的测试都没问题了。当然封装代码的时候我还要考虑请求超时,号码错误等异常,此部分见文末的完整代码。
测试批量获取电话号码
import numpy as np
import pandas as pd
# Assume the phone numbers live in an Excel workbook.
# NOTE: the original passed encoding='utf-8', but pandas' read_excel has no
# such parameter (it was deprecated and then removed — .xlsx is binary, no
# text encoding applies); passing it raises TypeError on modern pandas.
df = pd.read_excel(r'D:/Case_data/telephone.xlsx')
display(df)  # Jupyter-only helper that renders the DataFrame inline
# Walk the "电话" (phone) column one number at a time.
for i in df["电话"]:
    print(i)
测试将请求解析结果写入
import csv
# Destination file for the lookup results.
file_name = "D:/Case_data/telephone_result.csv"
# Use a context manager so the handle is closed even if a write raises
# (the original open()/close() pair leaked the file on error).
# gb18030 lets Excel on a Chinese-locale Windows open the CSV correctly;
# newline='' is required by the csv module to avoid blank rows on Windows.
with open(file_name, "w+", newline='', encoding='gb18030') as f:
    writer = csv.writer(f, dialect='excel')
    # Header row first.
    writer.writerow(['电话', '省份', '市级', '分类'])
    # Then the parsed result from the previous step.
    writer.writerows([[A0, A1, A2, A3]])
构造完整代码
# 导入需要用到的库
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import csv
import time
# 请求和解析和写入
def response(url):
    """Fetch one 360 search page, parse the phone's region/carrier info
    out of it, and append one row via the module-level csv ``writer``.

    Parameters
    ----------
    url : str
        Full 360 search URL, e.g. ``https://www.so.com/s?ie=utf-8&q=<phone>``.

    Notes
    -----
    - Relies on a module-level ``writer`` (csv.writer) already being open.
    - Sleeps a random 3-10 s before each request to soften the site's
      anti-scraping throttling.
    - Any request or parse failure is reported and swallowed so a batch
      run keeps going.
    """
    try:
        # Local import, matching the file's inline-import style; the
        # original called random.randint without ever importing random.
        import random
        # Plain browser User-Agent so the request is not rejected outright.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
        }
        time.sleep(random.randint(3, 10))  # randomized delay between requests
        r = requests.get(url, headers=headers, timeout=60)
        # r.encoding = "utf-8"  # enable when Chinese text comes back garbled
        soup = BeautifulSoup(r.text, 'lxml')  # parse with the lxml backend
        # The <p class="mh-detail"> node carries the number/region/carrier text.
        A = soup.find_all('p', {'class': "mh-detail"})[0].get_text()
        A = A.replace("\n", " ").strip()  # drop newlines, trim surrounding spaces
        A = A.split()  # split on whitespace into a list of fields
        L = len(A)
        # Pad missing fields with "未知" (unknown); r"\D" strips non-digits
        # from the number (raw string fixes the invalid "\D" escape).
        if L == 1:
            A0, A1, A2, A3 = re.sub(r"\D", "", A[0]), "未知", "未知", "未知"
        elif L == 2:
            A0, A1, A2, A3 = re.sub(r"\D", "", A[0]), A[1], "未知", "未知"
        elif L == 3:
            A0, A1, A2, A3 = re.sub(r"\D", "", A[0]), A[1], A[1], "未知"
        else:
            A0, A1, A2, A3 = re.sub(r"\D", "", A[0]), A[1], A[1], A[3]
        # Append one result row: [number, province, city, carrier].
        writer.writerows([[A0, A1, A2, A3]])
    except (requests.exceptions.RequestException, AttributeError, IndexError):
        # The original caught HTTPError/URLError/socket.timeout, none of
        # which were imported, so the handler itself raised NameError.
        # requests.RequestException covers timeouts, connection failures
        # and HTTP errors raised by requests.get.
        print("请求失败")
        return
# 读取数据并构造批量查询与写入
def resultsA(read_file_path):
    """Read phone numbers from an Excel file and look each one up.

    Parameters
    ----------
    read_file_path : str
        Path of an .xlsx workbook with a "电话" (phone) column.

    Side effects: one csv row is appended per number via response().
    """
    # NOTE: the original passed encoding='utf-8', but read_excel has no such
    # parameter on modern pandas (TypeError); .xlsx is binary and needs none.
    df = pd.read_excel(read_file_path)
    for i in df["电话"]:
        print(i)  # progress indicator: the number currently being queried
        url = r"https://www.so.com/s?ie=utf-8&q=" + str(i)
        response(url)  # fetch, parse and write one result row
#执行
if __name__ == '__main__':
    read_file_path = r'D:/Case_data/telephone.xlsx'     # input: phone-number workbook
    to_file_path = r"D:/Case_data/telephone_result.csv"  # output: result CSV
    # Context manager replaces the original open()/close() pair, so the
    # file is closed (and buffered rows flushed) even if resultsA raises.
    # Note: `writer` bound here is module-level, which response() relies on.
    with open(to_file_path, "w+", newline='', encoding='gb18030') as f:
        writer = csv.writer(f, dialect='excel')
        # Header row first: phone / province / city / carrier.
        writer.writerow(['电话', '省份', '市级', '分类'])
        resultsA(read_file_path)  # run the batch lookup
这就是我们的完整代码啦,不过这个请求页面的反爬机制还是相对严格,请求不了几次就不让请求了,建议大家设置请求睡眠时长和构造IP池访问。我们下一文再分享另外一个接口。