0
点赞
收藏
分享

微信扫一扫

手写一个民用Tomcat (06)

司马吹风 4小时前 阅读 2
hive

一、CTE语法

-- Derived-table form: the inner query's result is given the alias t1 and the
-- outer query reads from it as if it were an ordinary table.
SELECT empno,
       ename
FROM (
    SELECT *
    FROM emp
) t1;
-- 基本用法
with 临时表名 as(查询语句)
select * from 临时表名
​
-- 多个计算结果保存
with tb1 as(查询语句),
    tb2 as(查询语句 select * from tb1),
    tb3 as(查询语句)
    .....
select * from tb3 join tb2
-- Minimal CTE: tb1 names the result of the inner query so that the main
-- SELECT can read from it like a table.
WITH tb1 AS (
    SELECT *
    FROM emp
)
SELECT ename,
       sal
FROM tb1;
-- Top 3 users per shop by row count in test2, written as nested subqueries.
--   t1: one row per (user_id, shop) pair with its row count (cnt)
--   t2: t1 plus rk, a per-shop row number ordered by cnt, highest first
SELECT
    t2.shop,
    t2.user_id,
    t2.cnt
FROM (
    SELECT
        t1.*,
        ROW_NUMBER() OVER (PARTITION BY t1.shop ORDER BY t1.cnt DESC) rk
    FROM (
        SELECT
            user_id,
            shop,
            COUNT(*) AS cnt
        FROM test2
        GROUP BY
            user_id,
            shop
    ) t1
) t2
WHERE rk <= 3;
-- Same top-3-per-shop calculation as the nested-subquery query, split into
-- named CTE steps so that each stage can be read on its own.
WITH tb1 AS (
    -- Step 1: number of rows in test2 for every (user_id, shop) pair.
    SELECT
        user_id,
        shop,
        COUNT(*) AS cnt
    FROM test2
    GROUP BY
        user_id,
        shop
),
tb2 AS (
    -- Step 2: number the pairs inside each shop, highest count first.
    SELECT
        tb1.user_id,
        tb1.shop,
        tb1.cnt,
        ROW_NUMBER() OVER (PARTITION BY tb1.shop ORDER BY tb1.cnt DESC) AS rk
    FROM tb1
)
-- Return only shop / user_id / cnt (not the helper column rk) so the result
-- has the same columns as the nested-subquery form of this query.
SELECT
    tb2.shop,
    tb2.user_id,
    tb2.cnt
FROM tb2
WHERE tb2.rk <= 3;

二、爆炸函数和合并函数

  • explode方法

    • 爆炸函数,可以将数组中的数据拆分多行

-- Demo table for the explode examples. Rows are read as comma-delimited text;
-- hobby is a single string column (the example queries split it on '-').
CREATE TABLE tb_user (
    id    INT,
    name  STRING,
    hobby STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
​
-- split() turns the '-'-separated hobby string into an array column.
SELECT
    id,
    name,
    SPLIT(hobby, '-') AS hobby
FROM tb_user;

-- explode() cannot sit in the SELECT list next to other columns, so it is
-- selected on its own here.
SELECT
    EXPLODE(SPLIT(hobby, '-')) AS hobby
FROM tb_user;

-- To show the exploded values together with other columns, use a lateral view.
-- Shape: LATERAL VIEW <explode call> <view alias> AS <column alias>
SELECT
    id,
    name,
    new_hobby
FROM tb_user
LATERAL VIEW EXPLODE(SPLIT(hobby, '-')) tb1 AS new_hobby;
​
-- Counter-example: a plain JOIN against the exploded subquery is NOT a way to
-- attach the exploded values to their source rows. The join has no ON
-- condition, so nothing ties an exploded hobby back to the row it came from.
SELECT *
FROM tb_user
JOIN (
    SELECT
        EXPLODE(SPLIT(hobby, '-')) AS hobby
    FROM tb_user
) tb1;
  • collect方法

    • 将一列数据中的多行数据合并成一行

-- collect_list: merges the values of the column into one array, keeping duplicates.
SELECT COLLECT_LIST(name)
FROM tb_user;

-- collect_set: merges the values of the column into one array, removing duplicates.
SELECT COLLECT_SET(name)
FROM tb_user;
-- Demo table of page visits (id, visitor name, visited url), read as
-- comma-delimited text.
CREATE TABLE tb_visit (
    id   INT,
    name STRING,
    url  STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
​
-- Look at the raw visit rows.
SELECT *
FROM tb_visit;

-- Distinct visitor names across the whole table, merged into one array.
SELECT COLLECT_SET(name)
FROM tb_visit;

-- For each user, the distinct set of URLs that user visited.
SELECT
    name,
    COLLECT_SET(url)
FROM tb_visit
GROUP BY name;

三、随机抽样

  • 格式

SELECT ... FROM tbl TABLESAMPLE(BUCKET x OUT OF y ON(colname | rand()))
​
y表示将表数据随机划分成y份(y个桶)
x表示取这y份中的第x份(即第x个桶)的数据作为样本,而不是抽取x份
colname表示随机的依据基于某个列的值
rand()表示随机的依据基于整行
-- Demo student table for the sampling examples, read as comma-delimited text.
CREATE TABLE tb_stu (
    id     INT,
    name   STRING,
    age    INT,
    gender INT,
    dt     STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
​
-- Bucket sampling keyed on a column: rows are assigned to 30 buckets based on
-- name, and the rows of bucket 2 are returned.
SELECT *
FROM tb_stu TABLESAMPLE (BUCKET 2 OUT OF 30 ON name);

-- Random sampling: rows are assigned to 20 buckets using rand() instead of a
-- column, bucket 2 is taken as the sample, and the sample is counted per gender.
WITH tb1 AS (
    SELECT *
    FROM tb_stu TABLESAMPLE (BUCKET 2 OUT OF 20 ON RAND())
)
SELECT
    gender,
    COUNT(*)
FROM tb1
GROUP BY gender;

四、虚拟列(了解)

INPUT__FILE__NAME,显示数据行所在的具体文件
BLOCK__OFFSET__INSIDE__FILE,显示数据行所在文件的偏移量
ROW__OFFSET__INSIDE__BLOCK,显示数据所在HDFS块的偏移量 
    此虚拟列需要设置:SET hive.exec.rowoffset=true 才可使用
-- INPUT__FILE__NAME: show, next to every row, the file the row was read from.
SELECT
    *,
    INPUT__FILE__NAME
FROM brand;

-- The virtual column can also be used as a filter: only rows from one file.
SELECT *
FROM brand
WHERE INPUT__FILE__NAME = 'hdfs://node1:8020/user/hive/warehouse/pydata.db/brand/000001_0';

-- BLOCK__OFFSET__INSIDE__FILE: offset of each row inside its file.
SELECT
    *,
    BLOCK__OFFSET__INSIDE__FILE
FROM tb_stu;

-- ROW__OFFSET__INSIDE__BLOCK is only available after enabling this setting.
SET hive.exec.rowoffset=true;
SELECT
    *,
    ROW__OFFSET__INSIDE__BLOCK
FROM tb_stu;
​

五、快速建表

  • like语法

    • 只复制原始表的表结构(字段名、字段类型等元数据),不会复制行数据

    • 创建之后是一个空表

create table 新的表名 like 原始表名
  • as语法

    • 会将原始数据表的内容全部复制一份到新表中

create table 新的表名 as select * from 原始表

-- Source table for both quick-create forms.
SELECT *
FROM tb_user;

-- LIKE: creates tb_user_new with the definition of tb_user but no rows,
-- so the query that follows returns an empty result.
CREATE TABLE tb_user_new
    LIKE tb_user;

SELECT *
FROM tb_user_new;

-- AS SELECT: creates tb_user_new_new and fills it with the rows of the query.
CREATE TABLE tb_user_new_new AS
SELECT *
FROM tb_user;

SELECT *
FROM tb_user_new_new;

六、视图

create view 视图名  as 查询语句
-- Store a query as a view: sum_view adds 1 for every tb_user row whose name
-- is not NULL and 0 for every other row.
CREATE VIEW sum_view AS
SELECT
    SUM(CASE WHEN name IS NOT NULL THEN 1 ELSE 0 END)
FROM tb_user;

-- Querying the view runs the stored statement.
SELECT *
FROM sum_view;
-- NOTE(review): MODIFY COLUMN ... CHARACTER SET is not HiveQL; these look like
-- statements for the relational database that backs the Hive metastore
-- (database hive3 here) - confirm where they are meant to run.

-- (1) Column comments and table comments
USE hive3;
ALTER TABLE COLUMNS_V2
    MODIFY COLUMN COMMENT VARCHAR(256) CHARACTER SET utf8;
ALTER TABLE TABLE_PARAMS
    MODIFY COLUMN PARAM_VALUE VARCHAR(4000) CHARACTER SET utf8;

-- (2) Partition column comments
ALTER TABLE PARTITION_PARAMS
    MODIFY COLUMN PARAM_VALUE VARCHAR(4000) CHARACTER SET utf8;
ALTER TABLE PARTITION_KEYS
    MODIFY COLUMN PKEY_COMMENT VARCHAR(4000) CHARACTER SET utf8;

-- (3) Index comments
ALTER TABLE INDEX_PARAMS
    MODIFY COLUMN PARAM_VALUE VARCHAR(4000) CHARACTER SET utf8;

七、数据压缩和存储格式

7-1 数据压缩

压缩格式 | 压缩格式所在的类
Zlib | org.apache.hadoop.io.compress.DefaultCodec
Gzip | org.apache.hadoop.io.compress.GzipCodec
Bzip2 | org.apache.hadoop.io.compress.BZip2Codec
Lzo | com.hadoop.compression.lzo.LzoCodec
Lz4 | org.apache.hadoop.io.compress.Lz4Codec
Snappy | org.apache.hadoop.io.compress.SnappyCodec

7-2 存储格式

STORED AS orc tblproperties ("orc.compress"="SNAPPY");
-- Same three columns as tb_visit, but stored as ORC with Snappy compression.
CREATE TABLE tb_visit_new (
    id   INT,
    name STRING,
    url  STRING
)
STORED AS ORC                                -- ORC (columnar) storage format
TBLPROPERTIES ("orc.compress"="SNAPPY");     -- compression used for the ORC data
​
-- Copy every row of tb_visit into the ORC table, then read the copy back.
INSERT INTO tb_visit_new
SELECT *
FROM tb_visit;

SELECT *
FROM tb_visit_new;
举报

相关推荐

0 条评论