1、爬虫的应用领域 搜索引擎
2、谷歌分析网站 Elements、Console、Sources、Network
3、数据怎么爬取 目标网站,发起请求-请求方式和请求数据
4、爬取QQ音乐排行榜数据
5、数据解析-BeautifulSoup
6、持久化数据到数据库
import requests
from bs4 import BeautifulSoup
import mysql.connector
def send_requests():
    """Fetch the QQ Music toplist page and hand the response to the parser.

    Network-only entry step. Raises requests.HTTPError on a non-2xx
    status and requests.Timeout if the server stalls.
    """
    url = "https://y.qq.com/n/ryqq/toplist/4"
    headers = {
        # Browser-like UA so the site serves the normal HTML page.
        # Fixed the original's malformed token sequence ("KABUL" -> "KHTML",
        # missing spaces between product tokens).
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
        )
    }
    # A timeout prevents the script from hanging forever on a dead connection.
    resp = requests.get(url=url, headers=headers, timeout=10)
    resp.raise_for_status()  # fail fast instead of parsing an error page
    parser_content(resp)
def parser_content(resp):
    """Parse the toplist HTML and persist one row per song.

    resp: a requests.Response whose .text holds the chart page HTML.
    Extracts rank, song name, artist and duration from each <li> in the
    song list and passes the collected rows to save_mysql().
    """
    bs = BeautifulSoup(resp.text, 'html.parser')
    ul = bs.find('ul', class_='songlist__list')
    li_list = ul.find_all('li')
    rows = []
    # enumerate replaces the original hand-rolled counter.
    for rank, item in enumerate(li_list, start=1):
        # The top-3 entries carry an extra modifier class on the rank element.
        if rank <= 3:
            number = item.find('div', class_='songlist__number songlist__number--top').text
        else:
            number = item.find('div', class_='songlist__number').text
        # The song-name cell also contains a trailing "播放" (play) label;
        # keep only the text before it. (The original's extra ''.join was a
        # no-op on an already-plain string and has been dropped.)
        songname = item.find('div', class_='songlist__songname').text.split('播放')[0]
        artist = item.find('div', class_='songlist__artist').text
        duration = item.find('div', class_='songlist__time').text
        rows.append([number, songname, artist, duration])
    save_mysql(rows)
# Module-level MySQL connection and cursor, shared by save_mysql().
# NOTE(review): credentials and database name are hard-coded, the connection
# is opened as an import-time side effect, and it is never explicitly closed.
my_db = mysql.connector.connect(host='localhost', user='root', password='root', database='python_db',auth_plugin='mysql_native_password')
my_cursor = my_db.cursor()
def save_mysql(lst):
    """Batch-insert chart rows into tbl_qqmusic via the module-level cursor.

    lst: list of [songlist_number, songname, songlist_artist, songlist_time]
    rows, matching the four %s placeholders in the statement below.
    Relies on the module-level my_db / my_cursor created at import time.
    """
    # Parameterized SQL — values are bound by the driver, not string-formatted.
    sql = 'insert into tbl_qqmusic (songlist_number, songname, songlist_artist, songlist_time) values (%s,%s,%s,%s)'
    # One INSERT per row, executed as a single batch call.
    my_cursor.executemany(sql, lst)
    # Commit so the rows are actually persisted.
    my_db.commit()
    print("保存完毕")
if __name__ == '__main__':
    # Script entry point: fetch -> parse -> save to MySQL.
    send_requests()
注:数据库知识:
启动mysql
net start mysql
进入数据库
mysql -u root -p
空密码
修改密码
alter user user() identified by "root";
新建立数据库
create database python_lianjia;
查询数据库
show databases;
选择数据库
use python_lianjia;
查询数据库下面的所有表
show tables
删除数据库
drop database python_lianjia;
退出数据库
quit
关闭数据库服务
net stop mysql
本例中创建数据库:
create table tbl_qqmusic(
id int(4) primary key auto_increment,
songlist_number varchar(255),
songname varchar(255),
songlist_artist varchar(255),
songlist_time varchar(255)
);
python中数据库操作:
my_db = mysql.connector.connect(host='localhost', user='root', password='root', database='python_lianjia',
auth_plugin='mysql_native_password')
my_cursor = my_db.cursor()
print(my_db)
# sql语句
sql = 'insert into tbl_lianjia (title,positionInfo,houseInfo,followInfo,totalPrice,unitPrice) values (%s,%s,%s,%s,%s,%s)'
# 执行批量插入
my_cursor.executemany(sql, lst)
# 提交事务
my_db.commit()
总结:
1、BeautifulSoup数据解析
html = resp.text
bs = BeautifulSoup(html, 'html.parser')
ul = bs.find('ul', class_='songlist__list')
li_list = ul.find_all('li')
# print(li_list)
lst = []
2、持久化数据到数据库
import mysql.connector
my_db = mysql.connector.connect(host='localhost', user='root', password='root', database='python_db',auth_plugin='mysql_native_password')
my_cursor = my_db.cursor()
def save_mysql(lst):
# print(my_db)
# sql语句
sql = 'insert into tbl_qqmusic (songlist_number, songname, songlist_artist, songlist_time) values (%s,%s,%s,%s)'
# 执行批量插入
my_cursor.executemany(sql, lst)
# 提交事务
my_db.commit()
print("保存完毕")
3、数据库建表(在数据库中操作)
create table tbl_qqmusic(
id int(4) primary key auto_increment,
songlist_number varchar(255),
songname varchar(255),
songlist_artist varchar(255),
songlist_time varchar(255)
);
4、数据库基本操作(在数据库中操作)
进入数据库
mysql -u root -p
空密码
修改密码
alter user user() identified by "root";
新建立数据库
create database python_lianjia;
查询数据库
show databases;
选择数据库
use python_lianjia;
查询数据库下面的所有表
show tables
删除数据库
drop database python_lianjia;
退出数据库
quit