hive sql 行列转换 开窗函数 炸裂函数
准备原始数据集
学生表 student.csv
001,彭于晏,1995-05-16,男
002,胡歌,1994-03-20,男
003,周杰伦,1995-04-30,男
004,刘德华,1998-08-28,男
005,唐国强,1993-09-10,男
006,陈道明,1992-11-12,男
007,陈坤,1999-04-09,男
008,吴京,1994-02-06,男
009,郭德纲,1992-12-05,男
010,于谦,1998-08-23,男
011,潘长江,1995-05-27,男
012,杨紫,1996-12-21,女
013,蒋欣,1997-11-08,女
014,赵丽颖,1990-01-09,女
015,刘亦菲,1993-01-14,女
016,周冬雨,1990-06-18,女
017,范冰冰,1992-07-04,女
018,李冰冰,1993-09-24,女
019,邓紫棋,1994-08-31,女
020,宋丹丹,1991-03-01,女
讲师表 teacher.csv
1001,张高数
1002,李体音
1003,王子文
1004,刘丽英
课程表 course.csv
01,语文,1003
02,数学,1001
03,英语,1004
04,体育,1002
05,音乐,1002
分数表 score.csv
001,01,94
002,01,74
004,01,85
005,01,64
006,01,71
007,01,48
008,01,56
009,01,75
010,01,84
011,01,61
012,01,44
013,01,47
014,01,81
015,01,90
016,01,71
017,01,58
018,01,38
019,01,46
020,01,89
001,02,63
002,02,84
004,02,93
005,02,44
006,02,90
007,02,55
008,02,34
009,02,78
010,02,68
011,02,49
012,02,74
013,02,35
014,02,39
015,02,48
016,02,89
017,02,34
018,02,58
019,02,39
020,02,59
001,03,79
002,03,87
004,03,89
005,03,99
006,03,59
007,03,70
008,03,39
009,03,60
010,03,47
011,03,70
012,03,62
013,03,93
014,03,32
015,03,84
016,03,71
017,03,55
018,03,49
019,03,93
020,03,81
001,04,54
002,04,100
004,04,59
005,04,85
007,04,63
009,04,79
010,04,34
013,04,69
014,04,40
016,04,94
017,04,34
020,04,50
005,05,85
007,05,63
009,05,79
015,05,59
018,05,87
员工表 emp.csv
7369,张三,研发,800.00,30
7499,李四,财务,1600.00,20
7521,王五,行政,1250.00,10
7566,赵六,销售,2975.00,40
7654,侯七,研发,1250.00,30
7698,马八,研发,2850.00,30
7782,金九,行政,2450.0,30
7788,银十,行政,3000.00,10
7839,小芳,销售,5000.00,40
7844,小明,销售,1500.00,40
7876,小李,行政,1100.00,10
7900,小元,讲师,950.00,30
7902,小海,行政,3000.00,10
7934,小红明,讲师,1300.00,30
7934,小红,讲师,1300.00,
雇员表 employee.csv
张无忌,男,1980/02/12,2022/08/09,销售,3000,12000,阿朱_小昭,张小无:8_张小忌:9
赵敏,女,1982/05/18,2022/09/10,行政,9000,2000,阿三_阿四,赵小敏:8
宋青书,男,1981/03/15,2022/04/09,研发,18000,1000,王五_赵六,宋小青:7_宋小书:5
周芷若,女,1981/03/17,2022/04/10,研发,18000,1000,王五_赵六,宋小青:7_宋小书:5
郭靖,男,1985/03/11,2022/07/19,销售,2000,13000,南帝_北丐,郭芙:5_郭襄:4
黄蓉,女,1982/12/13,2022/06/11,行政,12000,null,东邪_西毒,郭芙:5_郭襄:4
杨过,男,1988/01/30,2022/08/13,前台,5000,null,郭靖_黄蓉,杨小过:2
小龙女,女,1985/02/12,2022/09/24,前台,6000,null,张三_李四,杨小过:2
电影表 movie.txt
《疑犯追踪》-悬疑,动作,科幻,剧情
《Lie to me》-悬疑,警匪,动作,心理,剧情
《战狼2》-战争,动作,灾难
订单表 order.csv
1,1001,小元,2022-01-01,10
2,1002,小海,2022-01-02,15
3,1001,小元,2022-02-03,23
4,1002,小海,2022-01-04,29
5,1001,小元,2022-01-05,46
6,1001,小元,2022-04-06,42
7,1002,小海,2022-01-07,50
8,1001,小元,2022-01-08,50
9,1003,小辉,2022-04-08,62
10,1003,小辉,2022-04-09,62
11,1004,小猛,2022-05-10,12
12,1003,小辉,2022-04-11,75
13,1004,小猛,2022-06-12,80
14,1003,小辉,2022-04-13,94
创建数据库和数据表
-- Create the tutorial database only when it is missing, so the script can be re-run.
create database if not exists chap06;
-- Make chap06 the current database for the statements that follow.
use chap06;
-- Student table: external text table over comma-separated rows, one row per line.
create external table student (
    stu_id   string comment '学生ID',
    stu_name string comment '学生姓名',
    birthday string comment '出生日期',
    gender   string comment '学生性别'
)
row format delimited
    fields terminated by ','
    lines terminated by '\n'
stored as textfile
location '/quiz03/student';
-- Replace the table contents with the local CSV file.
load data local inpath '/root/data/data02/student.csv'
    overwrite into table student;
-- Quick look at the loaded rows.
select *
from student;
-- Teacher table: external text table over comma-separated rows.
-- The column comments now describe teacher fields. They previously repeated
-- the course comments.
create external table teacher (
    tea_id   string comment '讲师ID',
    tea_name string comment '讲师姓名'
)
row format delimited
    fields terminated by ','
    lines terminated by '\n'
stored as textfile
location '/quiz03/teacher';
-- Replace the table contents with the local CSV file.
load data local inpath '/root/data/data02/teacher.csv'
    overwrite into table teacher;
-- Quick look at the loaded rows.
select *
from teacher;
-- Course table: each course row carries the id of its teacher in tea_id.
create external table course (
    course_id   string comment '课程ID',
    course_name string comment '课程名称',
    tea_id      string comment '讲师ID'
)
row format delimited
    fields terminated by ','
    lines terminated by '\n'
stored as textfile
location '/quiz03/course';
-- Replace the table contents with the local CSV file.
load data local inpath '/root/data/data02/course.csv'
    overwrite into table course;
-- Quick look at the loaded rows.
select *
from course;
-- Score table: one row per (student, course) with an integer score.
create external table score (
    stu_id    string comment '学生ID',
    course_id string comment '课程ID',
    score     int    comment '成绩'
)
row format delimited
    fields terminated by ','
    lines terminated by '\n'
stored as textfile
location '/quiz03/score';
-- Replace the table contents with the local CSV file.
load data local inpath '/root/data/data02/score.csv'
    overwrite into table score;
-- Quick look at the loaded rows.
select *
from score;
-- Employee table used by the collect_list / collect_set examples.
-- NOTE(review): this location is under /quiz01 while the other tables use
-- /quiz03 or /quiz04 - confirm that is intended.
create external table emp (
    emp_id     int          comment '员工ID',
    emp_name   string       comment '员工姓名',
    emp_job    string       comment '员工岗位',
    emp_salary decimal(8,2) comment '员工薪资',
    dept_id    int          comment '员工隶属部门ID'
)
row format delimited
    fields terminated by ','
    lines terminated by '\n'
stored as textfile
location '/quiz01/emp';
-- Replace the table contents with the local CSV file.
load data local inpath '/root/data/data02/emp.csv'
    overwrite into table emp;
-- Quick look at the loaded rows.
select *
from emp;
-- Staff table with complex columns:
--   friends  is an array whose items are separated by _
--   children is a map whose entries are separated by _ and whose key and value are separated by :
create external table employee (
    name     string          comment '姓名',
    sex      string          comment '性别',
    birthday string          comment '出生年月',
    hiredate string          comment '入职日期',
    job      string          comment '岗位',
    salary   int             comment '薪资',
    bonus    int             comment '奖金',
    friends  array<string>   comment '朋友',
    children map<string,int> comment '孩子'
)
row format delimited
    fields terminated by ','
    collection items terminated by '_'
    map keys terminated by ':'
    lines terminated by '\n'
stored as textfile
location '/quiz04/employee';
-- overwrite keeps the load idempotent, matching the student/teacher/course/score/emp loads.
-- Without it every re-run appends a second copy of the file.
load data local inpath '/root/data/data02/employee.csv'
    overwrite into table employee;
-- Quick look at the loaded rows.
select *
from employee;
-- Movie table: the title and the comma-separated category list are split on the - character.
create external table movie (
    name     string comment '电影名称',
    category string comment '电影分类'
)
row format delimited
    fields terminated by '-'
    lines terminated by '\n'
stored as textfile
location '/quiz04/movie';
-- overwrite keeps the load idempotent, matching the other table loads in this script.
-- Without it every re-run appends a second copy of the file.
load data local inpath '/root/data/data02/movie.txt'
    overwrite into table movie;
-- Quick look at the loaded rows.
select *
from movie;
-- Order table. The name is always written in backticks because order is a SQL keyword.
create external table `order` (
    order_id     string comment '订单id',
    user_id      string comment '用户id',
    user_name    string comment '用户姓名',
    order_date   string comment '下单日期',
    order_amount int    comment '订单金额'
)
row format delimited
    fields terminated by ','
    lines terminated by '\n'
stored as textfile
location '/quiz04/order';
-- overwrite keeps the load idempotent, matching the other table loads in this script.
-- Without it every re-run appends a second copy of the file.
load data local inpath '/root/data/data02/order.csv'
    overwrite into table `order`;
-- Quick look at the loaded rows.
select *
from `order`;
行列转换
列转行
-- Long-format score table: one row per (student, course).
create table test (
    stu_name    string,
    course_name string,
    score       int
);
-- score is an int column, so the values are integer literals rather than quoted strings.
insert into test values
    ('张三', '语文', 80),
    ('张三', '数学', 90),
    ('李四', '语文', 85),
    ('李四', '数学', 95);
select * from test;
-- Pivot: one output row per student, one column per course.
-- Each case expression yields the score for its course and null otherwise,
-- and max ignores the nulls within the group.
select
    stu_name,
    max(case when course_name = '语文' then score end) as yuwen,
    max(case when course_name = '数学' then score end) as shuxue
from test
group by stu_name;

-- All emp_job values gathered into one array, duplicates kept.
select collect_list(emp_job) as job_list
from emp;
-- Distinct emp_job values gathered into one array.
select collect_set(emp_job) as job_set
from emp;
-- Number of distinct emp_job values.
select size(collect_set(emp_job)) as job_count
from emp;
-- Distinct emp_job values joined into a single string with - as separator.
select concat_ws('-', collect_set(emp_job)) as job_string
from emp;
-- Round trip: join the distinct values into a string, then split it back into an array.
select split(concat_ws('-', collect_set(emp_job)), '-') as job_item
from emp;
行转列
-- Wide-format sales table: one row per employee, one column per month.
create table sales (
    emp_name string,
    january  int,
    february int,
    march    int
);
insert into sales values
    ('张三', 1000, 2000, 3000),
    ('李四', 1500, 2500, 3500);
select * from sales;

-- Step 1 (inner query): unpivot the three month columns into (emp_name, yue, sale) rows.
-- Step 2 (outer query): pivot back to one row per employee.
-- Each month is picked by its yue label, not by its position in a collect_list array.
-- collect_list does not guarantee element order after union all and group by,
-- so indexing the array could assign a value to the wrong month.
select
    t.emp_name,
    max(case when t.yue = 'january'  then t.sale end) as january,
    max(case when t.yue = 'february' then t.sale end) as february,
    max(case when t.yue = 'march'    then t.sale end) as march
from (
    select emp_name, 'january' as yue, january as sale from sales
    union all
    select emp_name, 'february' as yue, february as sale from sales
    union all
    select emp_name, 'march' as yue, march as sale from sales
) t
group by t.emp_name;
UDF UDTF UDAF
explode
-- explode on an array: one output row per element.
select
    explode(array('java', 'python', 'scala', 'go')) as course;
-- explode on a map: one output row per entry, exposed as two columns.
select
    explode(map('name', '李昊哲', 'gender', '1')) as (key, value);
posexplode
-- posexplode: like explode on an array, with the element position as an extra first column.
select
    posexplode(array('java', 'python', 'scala', 'go')) as (pos, course);
inline
-- inline: turns an array of structs into rows, one column per struct field.
select
    inline(
        array(
            named_struct('id', 1, 'name', '李昊哲', 'gender', '1'),
            named_struct('id', 2, 'name', '李哲', 'gender', '0'),
            named_struct('id', 3, 'name', '李大宝', 'gender', '1')
        )
    ) as (id, name, gender);
lateral view
-- One output row per (employee, friend).
select *
from employee
    lateral view explode(friends) t as friend;
-- One output row per (employee, child), with the map key and value as two columns.
select *
from employee
    lateral view explode(children) t as children_name, children_age;
-- Two lateral views chained: every friend is combined with every child of the same employee.
select *
from employee
    lateral view explode(friends) t1 as friend
    lateral view explode(children) t2 as children_name, children_age;
-- Same expansion with an explicit column list, which leaves out the raw
-- friends and children columns.
select
    name,
    sex,
    birthday,
    hiredate,
    job,
    salary,
    bonus,
    friend,
    children_name,
    children_age
from employee e
    lateral view explode(friends) t1 as friend
    lateral view explode(children) t2 as children_name, children_age;
UDTF 案例
-- Number of movies per category: split the comma-separated category string,
-- explode it to one row per (movie, category), then count per category.
select
    cate,
    count(name) as quantity
from movie
    lateral view explode(split(category, ',')) tmp as cate
group by cate;
窗口函数(开窗函数)
聚合函数
统计每个用户截至每次下单的累计下单总额
-- Running total of order_amount per user, accumulated in order_date sequence
-- over all of the orders of that user.
-- The partition is user_id only. Adding the month to the partition would
-- restart the total every month, which is the next example, not this one.
select *,
    sum(order_amount) over (
        partition by user_id
        order by order_date
        rows between unbounded preceding and current row
    ) as sum_order_amount
from `order`;
-- Same running total written with the shorthand frame:
-- rows unbounded preceding ends the frame at the current row.
select *,
    sum(order_amount) over (
        partition by user_id
        order by order_date
        rows unbounded preceding
    ) as sum_order_amount
from `order`;
统计每个用户截至每次下单的当月累积下单总额
-- Running total of order_amount per user within each calendar month.
-- substr(order_date, 1, 7) is the yyyy-MM prefix, so the total restarts every month.
-- The frame stops at the current row. Ending it at unbounded following would
-- put the full month total on every row, not the amount accumulated up to
-- each order.
select *,
    sum(order_amount) over (
        partition by user_id, substr(order_date, 1, 7)
        order by order_date
        rows between unbounded preceding and current row
    ) as sum_order_amount
from `order`;
最近三笔订单总金额
当前订单金额与前两笔订单金额的总和
-- Per user, in order_date sequence: amount of the current order plus the two
-- orders before it. Fewer rows are summed at the start of a partition.
select *,
    sum(order_amount) over (
        partition by user_id
        order by order_date
        rows between 2 preceding and current row
    ) as sum_order_amount
from `order`;
当前订单金额与后两笔订单金额的总和
-- Per user, in order_date sequence: amount of the current order plus the two
-- orders after it. Fewer rows are summed at the end of a partition.
-- The shorthand frame (rows N ...) only accepts a start boundary of
-- unbounded preceding, N preceding or current row, with the end implied as
-- current row. A frame that looks forward must use the between form.
select *,
    sum(order_amount) over (
        partition by user_id
        order by order_date
        rows between current row and 2 following
    ) as sum_order_amount
from `order`;
当前订单金额与前一笔订单和后一笔订单金额的总和
-- Per user, in order_date sequence: the previous order, the current order
-- and the next order summed together.
select *,
    sum(order_amount) over (
        partition by user_id
        order by order_date
        rows between 1 preceding and 1 following
    ) as sum_order_amount
from `order`;
分析函数 lag lead first_value last_value
lag lead
统计每个用户每次下单距离上次下单相隔的天数(首次下单按0天算)
-- Days between each order and the previous order of the same user.
-- lag falls back to the current order_date when there is no previous row,
-- so the first order of every user gets a gap of 0 days.
-- Every order is returned together with its gap. Filtering on gap = 0 would
-- keep only the first orders and never show the gap.
select
    order_id,
    user_id,
    user_name,
    order_date,
    order_amount,
    datediff(order_date, pre_order_date) as diff_days
from (
    select
        order_id,
        user_id,
        user_name,
        order_date,
        order_amount,
        lag(order_date, 1, order_date) over (
            partition by user_id
            order by order_date
        ) as pre_order_date
    from `order`
) t;
每个用户每个月首笔订单时间
-- First order of each user in each calendar month, found with lag.
-- The first row of a (user_id, yyyy-MM) partition has no predecessor, so lag
-- returns the order_date of the row itself and the day difference is 0.
-- NOTE(review): a later order placed on the same date as its predecessor
-- also has a difference of 0 and would be returned as well.
select
    order_id,
    user_id,
    user_name,
    order_date,
    order_amount
from (
    select
        order_id,
        user_id,
        user_name,
        order_date,
        order_amount,
        lag(order_date, 1, order_date) over (
            partition by user_id, substr(order_date, 1, 7)
            order by order_date
        ) as pre_order_date
    from `order`
) t
where datediff(order_date, pre_order_date) = 0;
每个用户每个月最后笔订单时间
-- Last order of each user in each calendar month, found with lead.
-- The last row of a (user_id, yyyy-MM) partition has no successor, so lead
-- returns the order_date of the row itself and the day difference is 0.
-- NOTE(review): an order followed by another order on the same date also
-- has a difference of 0 and would be returned as well.
select
    order_id,
    user_id,
    user_name,
    order_date,
    order_amount
from (
    select
        order_id,
        user_id,
        user_name,
        order_date,
        order_amount,
        lead(order_date, 1, order_date) over (
            partition by user_id, substr(order_date, 1, 7)
            order by order_date
        ) as next_order_date
    from `order`
) t
where datediff(order_date, next_order_date) = 0;
每个岗位先入职的员工和后入职的员工工资差
-- Within each job, ordered by hiredate: the salary of each employee minus the
-- salary of the next hired employee in the same job.
-- lead falls back to the salary of the row itself for the last hire of a job,
-- so that row gets a difference of 0.
select
    name,
    sex,
    birthday,
    hiredate,
    job,
    salary,
    bonus,
    friends,
    children,
    new_salary,
    (salary - new_salary) as salary_diff
from (
    select
        name,
        sex,
        birthday,
        hiredate,
        job,
        salary,
        bonus,
        friends,
        children,
        lead(salary, 1, salary) over (
            partition by job
            order by hiredate
        ) as new_salary
    from employee
) t;
first_value last_value
每个用户每个月首笔订单时间
-- Every order annotated with the earliest order_date of the same user in the same month.
select
    order_id,
    user_id,
    user_name,
    order_date,
    order_amount,
    first_value(order_date) over (
        partition by user_id, substr(order_date, 1, 7)
        order by order_date
    ) as first_order_value
from `order`;
每个用户每个月最后笔订单时间
-- Every order annotated with the latest order_date of the same user in the same month.
-- The frame runs from the current row to the end of the partition, so
-- last_value sees the final row of the month.
select
    order_id,
    user_id,
    user_name,
    order_date,
    order_amount,
    last_value(order_date) over (
        partition by user_id, substr(order_date, 1, 7)
        order by order_date
        rows between current row and unbounded following
    ) as last_order_value
from `order`;
每个用户每个月首笔订单时间和最后笔订单时间
-- Every order annotated with both the earliest and the latest order_date of
-- the same user in the same month.
select
    order_id,
    user_id,
    user_name,
    order_date,
    order_amount,
    first_value(order_date) over (
        partition by user_id, substr(order_date, 1, 7)
        order by order_date
    ) as first_order_value,
    last_value(order_date) over (
        partition by user_id, substr(order_date, 1, 7)
        order by order_date
        rows between current row and unbounded following
    ) as last_order_value
from `order`;
-- Keep only the orders placed on the first or the last order date of the user
-- within the month. The inner query computes both dates per row.
select
    order_id,
    user_id,
    user_name,
    order_date,
    order_amount,
    first_order_value,
    last_order_value
from (
    select
        order_id,
        user_id,
        user_name,
        order_date,
        order_amount,
        first_value(order_date) over (
            partition by user_id, substr(order_date, 1, 7)
            order by order_date
        ) as first_order_value,
        last_value(order_date) over (
            partition by user_id, substr(order_date, 1, 7)
            order by order_date
            rows between current row and unbounded following
        ) as last_order_value
    from `order`
) t
where order_date = first_order_value
   or order_date = last_order_value;
排序函数
分组排序取TopN
-- Top scores per course without window functions.
-- Each score row a is joined to every row b of the same course whose score is
-- at least as high, including a itself. The number of joined rows is the
-- position of a within its course, and rows with a position of at most 5 are kept.
select
    a.course_id,
    a.stu_id,
    a.score
from score a
left join score b
    on a.course_id = b.course_id
   and a.score <= b.score
group by
    a.stu_id,
    a.course_id,
    a.score
having count(a.stu_id) <= 5
order by
    a.course_id,
    a.score desc;
-- Top 5 scores per course via a correlated count.
-- A row is kept when fewer than 5 rows of the same course have a strictly
-- higher score, which is the same set that rank() <= 5 returns.
-- The comparison is < 5. With <= 5 the row that has exactly five better
-- scores above it, the sixth place, would also be kept.
select
    s1.course_id,
    s1.stu_id,
    s1.score
from score s1
where (
    select count(*)
    from score s2
    where s2.course_id = s1.course_id
      and s2.score > s1.score
) < 5
order by
    s1.course_id,
    s1.score desc;
row_number
-- Top 5 rows per course by score using row_number.
-- Numbers are unique within a course, so exactly 5 rows are kept per course
-- when the course has at least 5 scores.
select *
from (
    select
        course_id,
        stu_id,
        score,
        row_number() over (
            partition by course_id
            order by score desc
        ) as mum
    from score
) t
where mum <= 5;
rank
-- Top 5 per course by score using rank.
-- Equal scores share a rank and the following rank numbers are skipped.
select *
from (
    select
        course_id,
        stu_id,
        score,
        rank() over (
            partition by course_id
            order by score desc
        ) as mum
    from score
) t
where mum <= 5;
dense_rank
-- Top 5 distinct score levels per course using dense_rank.
-- Equal scores share a rank and no rank numbers are skipped.
select *
from (
    select
        course_id,
        stu_id,
        score,
        dense_rank() over (
            partition by course_id
            order by score desc
        ) as mum
    from score
) t
where mum <= 5;
每个月每个消费总金额前三名的用户
-- Orders of the three highest monthly spending levels.
-- Innermost query t: every order gets the total amount of its user for its month.
--   The frame covers the whole partition, so all rows of a user and month
--   carry the same total.
-- Middle query t1: dense_rank of that total within the month, highest first.
-- Outer query: keep the order rows whose monthly total ranks 1 to 3.
select
    order_id,
    user_id,
    user_name,
    order_date,
    order_amount,
    total_order_amount,
    rank_total_order_amount
from (
    select
        order_id,
        user_id,
        user_name,
        order_date,
        order_amount,
        total_order_amount,
        dense_rank() over (
            partition by substr(order_date, 1, 7)
            order by total_order_amount desc
        ) as rank_total_order_amount
    from (
        select
            order_id,
            user_id,
            user_name,
            order_date,
            order_amount,
            sum(order_amount) over (
                partition by substr(order_date, 1, 7), user_id
                order by order_date
                rows between unbounded preceding and unbounded following
            ) as total_order_amount
        from `order`
    ) t
) t1
where rank_total_order_amount <= 3;