hive04
hive查询
语法结构:
SELECT [ALL | DISTINCT] 字段名, 字段名, ...
FROM 表名 [[inner | left outer | right outer | full outer | left semi] JOIN 表名 ON 关联条件]
[WHERE 非聚合条件]
[GROUP BY 分组字段名]
[HAVING 聚合条件]
[ORDER BY 排序字段名 asc | desc]
[CLUSTER BY 字段名 | [DISTRIBUTE BY 字段名 SORT BY 字段名]]
[LIMIT x,y]
类sql基本查询
知识点:
基础查询格式: select distinct 字段名 from 表名;
注意: *代表所有字段 distinct去重 as给表或者字段起别名
条件查询格式: select distinct 字段名 from 表名 where 条件;
比较运算符: = > < >= <= != <>
逻辑运算符: and or not
模糊查询: %代表任意0个或者多个字符 _代表任意1个字符
空判断: 为空is null 不为空is not null
范围查询: x到y的连续范围:between x and y x或者y或者z类的非连续范围: in(x,y,z)
排序查询格式: select distinct 字段名 from 表名 [where 条件] order by 排序字段名 asc|desc ;
asc : 升序 默认升序
desc: 降序
聚合查询格式: select 聚合函数(字段名) from 表名;
聚合函数: 又叫分组函数或者统计函数
聚合函数: count() sum() avg() max() min()
分组查询格式: select 分组字段名,聚合函数(字段名) from 表名 [where 非聚合条件] group by 分组字段名 [having 聚合条件];
注意: 当分组查询的时候,select后的字段名要么在groupby后出现过,要么放在聚合函数内,否则报错
where和having区别?
区别1: 书写顺序不同,where在group by关键字前,having在group by关键字后
区别2: 执行顺序不同,where在分组之前过滤数据,having在分组之后过滤数据
区别3: 筛选数据不同,where只能在分组之前过滤非聚合数据,having在分组之后主要过滤聚合数据
区别4: 操作对象不同,where底层操作伪表,having底层操作运算区
分页查询格式: select 字段名 from 表名 [ order by 排序字段名 asc|desc] limit x,y;
x: 起始索引 默认从0开始,如果x为0可以省略 计算格式: x=(页数-1)*y
y: 本次查询记录数
数据准备:
-- Order table used by the basic-query exercises.
-- Stored as delimited text with tab-separated fields.
CREATE TABLE orders (
    orderId        BIGINT    COMMENT '订单id',
    orderNo        STRING    COMMENT '订单编号',
    shopId         BIGINT    COMMENT '门店id',
    userId         BIGINT    COMMENT '用户id',
    orderStatus    TINYINT   COMMENT '订单状态 -3:用户拒收 -2:未付款的订单 -1:用户取消 0:待发货 1:配送中 2:用户确认收货',
    goodsMoney     DOUBLE    COMMENT '商品金额',
    deliverMoney   DOUBLE    COMMENT '运费',
    totalMoney     DOUBLE    COMMENT '订单金额(包括运费)',
    realTotalMoney DOUBLE    COMMENT '实际订单金额(折扣后金额)',
    payType        TINYINT   COMMENT '支付方式,0:未知;1:支付宝,2:微信;3、现金;4、其他',
    isPay          TINYINT   COMMENT '是否支付 0:未支付 1:已支付',
    userName       STRING    COMMENT '收件人姓名',
    userAddress    STRING    COMMENT '收件人地址',
    userPhone      STRING    COMMENT '收件人电话',
    createTime     TIMESTAMP COMMENT '下单时间',
    payTime        TIMESTAMP COMMENT '支付时间',
    totalPayFee    INT       COMMENT '总支付金额'
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
数仓分层思想:
-- One database per warehouse layer: xls_ods, xls_dw, xls_da.
CREATE DATABASE xls_ods;
CREATE DATABASE xls_dw;
CREATE DATABASE xls_da;
-- Build xls_dw.dw_orders from orders, replacing the numeric payType code
-- with its text label. ispay is carried through unchanged; the following
-- ALTER / INSERT OVERWRITE statements convert it (and orderstatus) to labels.
-- Each output column name must be unique in a CTAS, so payType appears only
-- once (as the CASE alias) and ispay occupies the next position.
create table xls_dw.dw_orders as
select
    orderid,
    orderno,
    shopid,
    userid,
    orderstatus,
    goodsmoney,
    delivermoney,
    totalmoney,
    realtotalmoney,
    case
        when payType=0 then '未知'
        when payType=1 then '支付宝'
        when payType=2 then '微信'
        when payType=3 then '现金'
        when payType=4 then '其他'
    end as payType,
    ispay,
    username,
    useraddress,
    userphone,
    createtime,
    paytime,
    totalpayfee
from orders;
-- Widen the two code columns to string so they can hold text labels.
-- dw_orders is created in xls_dw, so the table name is qualified here;
-- an unqualified name would resolve against whatever database is current.
alter table xls_dw.dw_orders change orderstatus orderstatus string;
alter table xls_dw.dw_orders change ispay ispay string;
-- Reload xls_dw.dw_orders from orders, translating the three code columns
-- (orderstatus, payType, isPay) into their text labels.
INSERT OVERWRITE TABLE xls_dw.dw_orders
SELECT
    orderid,
    orderno,
    shopid,
    userid,
    CASE
        WHEN orderstatus = -3 THEN '用户拒收'
        WHEN orderstatus = -2 THEN '未付款的订单'
        WHEN orderstatus = -1 THEN '用户取消'
        WHEN orderstatus = 0  THEN '待发货'
        WHEN orderstatus = 1  THEN '配送中'
        WHEN orderstatus = 2  THEN '用户确认收货'
    END AS orderstatus,
    goodsmoney,
    delivermoney,
    totalmoney,
    realtotalmoney,
    CASE
        WHEN payType = 0 THEN '未知'
        WHEN payType = 1 THEN '支付宝'
        WHEN payType = 2 THEN '微信'
        WHEN payType = 3 THEN '现金'
        WHEN payType = 4 THEN '其他'
    END AS payType,
    CASE
        WHEN isPay = 0 THEN '未支付'
        WHEN isPay = 1 THEN '已支付'
    END AS isPay,
    username,
    useraddress,
    userphone,
    createtime,
    paytime,
    totalpayfee
FROM orders;
课堂练习:
-- Filter by recipient name.
SELECT
    userName,
    userPhone
FROM orders
WHERE userName = '邓力夫';

-- Same filter, de-duplicated, with column aliases.
SELECT DISTINCT
    userName  AS name,
    userPhone AS phone
FROM orders
WHERE userName = '邓力夫';

-- Table alias usage.
SELECT
    o.userName,
    o.userPhone
FROM orders AS o;

-- Distinct payment-type codes.
SELECT DISTINCT payType
FROM orders;
-- Rebuild da_gd_orders from the orders whose address starts with '广东省'.
DROP TABLE IF EXISTS da_gd_orders;

CREATE TABLE da_gd_orders AS
SELECT *
FROM orders
WHERE userAddress LIKE '广东省%';

-- Row count for the same address filter.
SELECT COUNT(*)
FROM orders
WHERE userAddress LIKE '广东省%';

-- Row count per isPay value.
SELECT
    isPay,
    COUNT(*) AS cnt
FROM orders
GROUP BY isPay;
-- All four queries look only at rows with isPay = 1, grouped per user.

-- Largest realTotalMoney per user.
SELECT
    userId,
    username,
    MAX(realTotalMoney)
FROM orders
WHERE isPay = 1
GROUP BY
    userId,
    username;

-- Average realTotalMoney per user.
SELECT
    userId,
    username,
    AVG(realTotalMoney)
FROM orders
WHERE isPay = 1
GROUP BY
    userId,
    username;

-- Keep only users whose average exceeds 10000 (HAVING refers to the select alias).
SELECT
    userId,
    username,
    AVG(realTotalMoney) AS avg_money
FROM orders
WHERE isPay = 1
GROUP BY
    userId,
    username
HAVING avg_money > 10000;

-- Same filter, applied to the average rounded to 2 decimals.
SELECT
    userId,
    username,
    ROUND(AVG(realTotalMoney), 2)
FROM orders
WHERE isPay = 1
GROUP BY
    userId,
    username
HAVING ROUND(AVG(realTotalMoney), 2) > 10000;
-- '广东省' orders, highest realTotalMoney first.
SELECT *
FROM orders
WHERE userAddress LIKE '广东省%'
ORDER BY realTotalMoney DESC;

-- Same ordering, first 5 rows only.
SELECT *
FROM orders
WHERE userAddress LIKE '广东省%'
ORDER BY realTotalMoney DESC
LIMIT 5;
类sql多表查询
知识点:
交叉连接格式: select 字段名 from 左表 cross join 右表;
注意: 交叉连接产生的结果叫笛卡尔积,此种方式慎用!!!
内连接格式: select 字段名 from 左表 inner join 右表 on 左右表关联条件;
特点: 相当于只取两个表的交集
左外连接格式: select 字段名 from 左表 left outer join 右表 on 左右表关联条件;
特点: 以左表为主,左表数据全部保留,右表只保留和左表有交集的部分
右外连接格式: select 字段名 from 左表 right outer join 右表 on 左右表关联条件;
特点: 以右表为主,右表数据全部保留,左表只保留和右表有交集的部分
自连接: 本质是一个特殊的内外连接,最大特点就是左右表是同一个表
应用场景: 比较局限,场景1: 存储省市县三级数据的区域表 场景2: 存储上下级信息的员工表
子查询: 本质是一个select语句作为另外一个select语句的一部分(表或者条件)
注意: 子查询作为表使用的话必须取别名
数据准备:
-- User table for the multi-table exercises; tab-delimited text.
CREATE TABLE users (
    userId         INT,
    loginName      STRING,
    loginSecret    INT,
    loginPwd       STRING,
    userSex        TINYINT,
    userName       STRING,
    trueName       STRING,
    brithday       DATE,
    userPhoto      STRING,
    userQQ         STRING,
    userPhone      STRING,
    userScore      INT,
    userTotalScore INT,
    userFrom       TINYINT,
    userMoney      DOUBLE,
    lockMoney      DOUBLE,
    createTime     TIMESTAMP,
    payPwd         STRING,
    rechargeMoney  DOUBLE
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';

-- Load the data file into the table, then peek at one row.
LOAD DATA INPATH '/source/itheima_users.txt' INTO TABLE users;

SELECT *
FROM users
LIMIT 1;
练习:
-- Cross join: every users row paired with every orders row.
SELECT *
FROM users
CROSS JOIN orders;

-- Inner join on userId.
SELECT *
FROM users u
INNER JOIN orders o
    ON u.userId = o.userId;

-- Left outer join on userId.
SELECT *
FROM users u
LEFT OUTER JOIN orders o
    ON u.userId = o.userId;

-- Right outer join on userId.
SELECT *
FROM users u
RIGHT OUTER JOIN orders o
    ON u.userId = o.userId;
-- Subquery used as a table: pick the '北京市' row(s) with a non-null pid,
-- then return the titles of the areas rows whose pid points at them.
SELECT xian.title
FROM (
    SELECT *
    FROM areas
    WHERE title = '北京市'
      AND pid IS NOT NULL
) city
JOIN areas xian
    ON city.id = xian.pid;
-- Self-join of areas: return the titles of rows whose pid equals the id of
-- the '北京市' row(s) that have a non-null pid.
-- The statement ends with a single terminator; a second bare ';' would be
-- submitted as an empty statement.
select xian.title
from areas city
join areas xian
    on city.id = xian.pid
where city.title = '北京市'
  and city.pid is not null;
-- Subquery used as a condition: titles of rows whose pid matches the id of
-- the '北京市' row(s) with a non-null pid.
-- IN is used instead of '=' so the query still works if the subquery returns
-- more than one id.
-- NOTE(review): '=' against a subquery also needs scalar-subquery support,
-- which older Hive releases lack - confirm the target version.
select title
from areas
where pid in (
    select id
    from areas
    where title = '北京市'
      and pid is not null
);
hive其他join操作
知识点:
全外连接: 左表 full [outer] join 右表 on 条件
左半开连接: 左表 left semi join 右表 on 条件
示例:
-- Full outer join on userId.
SELECT *
FROM users u
FULL OUTER JOIN orders o
    ON u.userId = o.userId;

-- Left semi join on userId.
SELECT *
FROM users u
LEFT SEMI JOIN orders o
    ON u.userId = o.userId;
hive其他排序操作[练习]
知识点:
set mapreduce.job.reduces: 查看当前设置的reduce数量 默认结果是-1,代表自动匹配reduce数量和桶数量一致
set mapreduce.job.reduces = 数量 : -- 修改reduces数量
cluster by 字段名: 分桶且正序排序 弊端: 分和排序是同一个字段,相对不灵活
distribute by 字段名 sort by 字段名: distribute by负责分,sort by负责排序, 相对比较灵活
order by 字段名:全局排序
注意: cluster by 和 distribute by 字段名 sort by 字段名 受当前设置的reduces数量影响,但是设置的reduces数量对order by无影响,因为orderby就是全局排序,就是一个reduce
建表的时候指定分桶字段和排序字段: clustered by (字段名) sorted by (字段名) into 桶数量 buckets
注意: 如果建表的时候设置了桶数量,那么reduces建议设置值-1或者值大于桶数量
示例:
-- Student table for the sorting exercises; comma-delimited text.
CREATE TABLE students (
    id     INT,
    name   STRING,
    gender STRING,
    age    INT,
    cls    STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

LOAD DATA INPATH '/source/students.txt' INTO TABLE students;

SELECT *
FROM students
LIMIT 1;
-- Print the current value of mapreduce.job.reduces.
set mapreduce.job.reduces;

SELECT *
FROM students
CLUSTER BY id;

-- Three reducers, then cluster by age.
set mapreduce.job.reduces=3;

SELECT *
FROM students
CLUSTER BY age;

-- Back to -1: distribute on name, sort on age descending.
set mapreduce.job.reduces = -1;

SELECT *
FROM students
DISTRIBUTE BY name
SORT BY age DESC;

-- Same query with two reducers.
set mapreduce.job.reduces = 2;

SELECT *
FROM students
DISTRIBUTE BY name
SORT BY age DESC;

-- Global ordering by age descending.
SELECT *
FROM students
ORDER BY age DESC;
抽样查询
知识点:
TABLESAMPLE抽样好处: 尽可能实现随机抽样,并且不走MR查询效率相对较快
基于随机分桶抽样格式: SELECT 字段名 FROM tbl TABLESAMPLE(BUCKET x OUT OF y ON(字段名 | rand()))
y:表示将表数据随机划分成y份(y个桶)
x:表示从划分出的y份中抽取第x份(第x个桶)的数据作为取样
| : 或者
字段名: 表示随机的依据基于某个列的值,每次按相关规则取样结果都是一致
rand(): 表示随机的依据基于整行,每次取样结果不同
示例:
-- Bucket sampling keyed on orderid.
SELECT *
FROM orders TABLESAMPLE (BUCKET 1 OUT OF 10 ON orderid);

-- Bucket sampling keyed on rand().
SELECT *
FROM orders TABLESAMPLE (BUCKET 1 OUT OF 10 ON rand());

-- Sampling by row count.
SELECT *
FROM orders TABLESAMPLE (100 ROWS);

-- Sampling by percentage.
SELECT *
FROM orders TABLESAMPLE (10 PERCENT);

-- Sampling by data size.
SELECT *
FROM orders TABLESAMPLE (16k);

SELECT *
FROM orders TABLESAMPLE (167k);

SELECT *
FROM orders TABLESAMPLE (1m);

-- Alternative without TABLESAMPLE: distribute and sort on rand(), keep 100 rows.
SELECT *
FROM orders
DISTRIBUTE BY rand()
SORT BY rand()
LIMIT 100;
正则模糊查询
sql模糊查询关键字: like 任意0个或者多个: % 任意1个: _
正则模糊查询关键字: rlike 任意0个或者多个: .* 任意1个: . 正则语法还有很多...
-- Each LIKE query is followed by RLIKE form(s) of a similar filter.

-- Address starting with '广东省'.
SELECT *
FROM orders
WHERE userAddress LIKE '广东省%';

SELECT *
FROM orders
WHERE userAddress RLIKE '广东省.*';

-- Address shaped like 'xx省 xx市 xx区'.
SELECT *
FROM orders
WHERE userAddress LIKE '__省 __市 __区';

SELECT *
FROM orders
WHERE userAddress RLIKE '..省 ..市 ..区';

-- Name starting with 张, 王 or 邓.
SELECT *
FROM orders
WHERE username LIKE '张%'
   OR username LIKE '王%'
   OR username LIKE '邓%';

SELECT *
FROM orders
WHERE username RLIKE '[张王邓].*';

SELECT *
FROM orders
WHERE username RLIKE "[张王邓].+";

-- Phone starting with 188 followed by eight characters.
SELECT *
FROM orders
WHERE userPhone LIKE '188________';

SELECT *
FROM orders
WHERE userPhone RLIKE '188........';

SELECT *
FROM orders
WHERE userPhone RLIKE '188.{8}';

-- Phone containing 188, four literal asterisks, then four digits.
SELECT *
FROM orders
WHERE userPhone RLIKE '188\\*{4}[0-9]{4}';

SELECT *
FROM orders
WHERE userPhone RLIKE '188\\*{4}\\d{4}';
union联合查询
知识点:
union联合查询: 就是把两个select语句结果合并成一个临时结果集,整体可以用于其他sql操作
union [distinct]: 去重,distinct关键字可以省略(默认就是去重)
union all : 不去重
示例:
-- product must exist before it can be inserted into. Its three columns
-- (pid, pname, cid) are the ones the union queries below select.
-- NOTE(review): column types are assumed to mirror category (varchar(100)) - confirm.
create table if not exists product(
    pid varchar(100),
    pname varchar(100),
    cid varchar(100)
);
insert into product values('p1','联想','c1'),('p2','小米','c2'),('p3','华为',null);

-- Category lookup table; 'c3' has no matching product and 'p3' has no category.
create table category(
    cid varchar(100),
    cname varchar(100)
);
insert into category values('c1','电脑'),('c2','手机'),('c3','服饰');
-- Left join UNION right join: duplicate rows are removed.
SELECT pid, pname, p.cid, cname
FROM product p
LEFT JOIN category c
    ON p.cid = c.cid
UNION
SELECT pid, pname, c.cid, cname
FROM product p
RIGHT JOIN category c
    ON p.cid = c.cid;

-- Same two inputs with UNION ALL: duplicates are kept.
SELECT pid, pname, p.cid, cname
FROM product p
LEFT JOIN category c
    ON p.cid = c.cid
UNION ALL
SELECT pid, pname, c.cid, cname
FROM product p
RIGHT JOIN category c
    ON p.cid = c.cid;

-- Full join of the same two tables, for comparison.
SELECT pid, pname, c.cid, cname
FROM product p
FULL JOIN category c
    ON p.cid = c.cid;
CTE表达式
CTE: 公用表表达式(CTE)是一个在查询中定义的临时命名结果集,定义之后可以在from子句中使用它。
注意: 每个CTE仅被定义一次(但在其作用域内可以被引用任意次),仅适用于当前运行的sql语句
语法如下:
with 临时结果集的别名1 as (子查询语句),
临时结果集的别名2 as (子查询语句)
...
select 字段名 from 临时结果集的别名;
内置虚拟列
知识点:
虚拟列是Hive内置的可以在查询语句中使用的特殊标记,可以查询数据本身的详细参数。
Hive目前可用3个虚拟列:
INPUT__FILE__NAME:显示数据行所在的具体文件
BLOCK__OFFSET__INSIDE__FILE:显示数据行所在文件的偏移量
ROW__OFFSET__INSIDE__BLOCK:显示数据所在HDFS块的偏移量 注意: 此虚拟列需要设置:SET hive.exec.rowoffset=true 才可使用
示例:
-- Enable the row-offset virtual column before selecting it.
set hive.exec.rowoffset=true;

select
    *,
    INPUT__FILE__NAME,
    BLOCK__OFFSET__INSIDE__FILE,
    ROW__OFFSET__INSIDE__BLOCK
from students_bucket;
hive函数[预习]
函数分类标准[重点]
知识点:
原生分类标准: 内置函数 和 用户定义函数(UDF,UDAF,UDTF)
分类标准扩大化: 用户定义函数分类标准的扩大化,本来,UDF 、UDAF、UDTF这3个标准是针对用户自定义函数分类的; 但是,现在可以将这个分类标准扩大到hive中所有的函数,包括内置函数和自定义函数;
目前hive三大标准
UDF:(User-Defined-Function)普通函数: 特点是一进一出(输入一行数据输出一行数据) 举例: split
UDAF:(User-Defined Aggregation Function)聚合函数: 特点是多进一出(输入多行输出一行) 举例: count sum max min avg
UDTF:(User-Defined Table-Generating Functions)表生成函数: 特点是一进多出(输入一行输出多行) 举例: explode
查询所有hive函数名称: show functions;
查看某函数使用帮助文档: desc function [extended] 函数名; 注意: 加上extended关键字能查看详细信息示例
示例:
-- Scratch database for the function demos.
CREATE DATABASE hive04;
USE hive04;

-- List functions, then show short and extended help for split.
SHOW FUNCTIONS;
DESC FUNCTION split;
DESC FUNCTION EXTENDED split;

-- split: one string in, one array out.
SELECT split('one,two,three', ',');
SELECT '苹果-香蕉-西瓜-哈密瓜-火龙果-榴莲';
SELECT split('苹果-香蕉-西瓜-哈密瓜-火龙果-榴莲', '-');

-- explode: one array in, one row per element out.
DESC FUNCTION EXTENDED explode;
SELECT explode(array("苹果", "香蕉", "西瓜", "哈密瓜", "火龙果", "榴莲"));
复杂类型函数
知识点:
hive复杂类型: array struct map
array类型: 又叫数组类型,存储同类型的单数据的集合
取值: 字段名[索引] 注意: 索引从0开始
获取长度的函数: size(字段名) 常用
判断是否包含某个数据的函数: array_contains(字段名,某数据) 常用
对数组进行排序的函数: sort_array(数组)
struct类型: 又叫结构类型,可以存储不同类型单数据的集合
取值: 字段名.子字段名n
map类型: 又叫映射类型,存储键值对数据的映射(根据key找value)
取值: 字段名[key]
获取长度的函数: size(字段名) 常用
获取所有key的函数: map_keys() 常用
获取所有value的函数: map_values() 常用
示例:
-- array helpers
SELECT array('binzi', '666', '888');
SELECT size(array('binzi', '666', '888'));
SELECT array_contains(array('binzi', '666', '888'), 'binzi');
SELECT sort_array(array(3, 1, 5, 2, 4));

-- map helpers
SELECT map('a', 1, 'b', 2, 'c', 3);
SELECT size(map('a', 1, 'b', 2, 'c', 3));
SELECT map_keys(map('a', 1, 'b', 2, 'c', 3));
SELECT map_values(map('a', 1, 'b', 2, 'c', 3));
字符串函数
知识点:
字符串常见的函数:
concat: 字符串紧凑拼接到一起生成新字符串
concat_ws: 字符串用指定分隔符拼接到一起生成新字符串 常用
length: 获取字符串长度 常用
lower: 把字符串中的字母全部变成小写
upper: 把字符串中的字母全部变成大写
trim: 把字符串两端的空白去除 常用
拓展字符串函数
substr: 截取字符串 常用
replace: 替换字符串 常用
regexp_replace: 正则替换字符串
parse_url: 解析url(统一资源定位符) 组成: 协议://主机地址:端口号/资源路径?查询参数
get_json_object: 获取json对象解析对应数据
示例:
-- Concatenation, length, case conversion.
SELECT concat('binzi', '666', '888');
SELECT concat_ws('-', 'binzi', '666', '888');
SELECT length('binzi-666');
SELECT lower('BINZI-666');
SELECT upper('binzi-666');

-- trim: the untrimmed literal first, then the trimmed result.
SELECT ' binzi 666 ';
SELECT trim(' binzi 666 ');

-- substr with start/length, start only, and a negative start.
SELECT substr('binzi666', 1, 2);
SELECT substr('binzi666', 1);
SELECT substr('binzi666', -4);

-- substr applied to a date string.
SELECT `current_date`();
SELECT substr('2023-05-21', 1, 4);
SELECT substr('2023-05-21', 6, 2);
SELECT substr('2023-05-21', -2, 2);

-- Literal and regex replacement / extraction.
SELECT replace('你TMD哦', 'TMD', '***');
SELECT regexp_replace('binzi-666', '\\d+', 'num');
SELECT regexp_extract('binzi-666-888', '(\\d+)-(\\d+)', 1);

-- parse_url: individual URL parts, then individual query parameters.
SELECT parse_url('http://www.itcast.cn/path/binzi.html?user=binzi&pwd=123', 'HOST');
SELECT parse_url('http://www.itcast.cn/path/binzi.html?user=binzi&pwd=123', 'PATH');
SELECT parse_url('http://www.itcast.cn/path/binzi.html?user=binzi&pwd=123', 'QUERY');
SELECT parse_url('http://www.itcast.cn/path/binzi.html?user=binzi&pwd=123', 'QUERY', 'user');
SELECT parse_url('http://www.itcast.cn/path/binzi.html?user=binzi&pwd=123', 'QUERY', 'pwd');

-- get_json_object on a JSON object and on a JSON array.
SELECT get_json_object('{"name":"杨过", "age":"18"}', '$.name');
SELECT get_json_object('[{"name":"杨过", "age":"18"}, {"name":"小龙女", "age":"26"}]', '$.[1]');
SELECT get_json_object('[{"name":"杨过", "age":"18"}, {"name":"小龙女", "age":"26"}]', '$.[1].name');
日期时间函数
知识点:
current_timestamp: 获取时间原点到现在的秒/毫秒,底层自动转换方便查看的日期格式 常用
to_date: 字符串格式时间戳转日期(年月日)
current_date: 获取当前日期(年月日) 常用
year: 获取指定日期时间中的年 常用
month:获取指定日期时间中的月 常用
day:获取指定日期时间中的日 常用
hour:获取指定日期时间中的时
minute:获取指定日期时间中的分
second:获取指定日期时间中的秒
dayofmonth: 获取指定日期时间中的月中第几天
dayofweek:获取指定日期时间中的周中第几天
quarter:获取指定日期时间中的所属季度
weekofyear:获取指定日期时间中的年中第几周
datediff: 获取两个指定时间的差值 常用
date_add: 在指定日期时间上加几天 常用
date_sub: 在指定日期时间上减几天
unix_timestamp: 获取unix时间戳(时间原点到现在的秒/毫秒) 注意: 可以使用yyyyMMdd HH:mm:ss进行格式化转换
from_unixtime: 把unix时间戳转换为日期格式的时间 注意: 如果传入的参数是0,获取的是时间原点1970-01-01 00:00:00
示例:
-- Current instant / date.
SELECT unix_timestamp();
SELECT current_timestamp();
SELECT current_date();

-- Timestamp to date.
SELECT to_date('2023-05-21 11:19:31.222000000');
SELECT to_date(current_timestamp());

-- Field extraction from one fixed timestamp string.
SELECT year('2023-05-21 11:19:31.222000000');
SELECT month('2023-05-21 11:19:31.222000000');
SELECT day('2023-05-21 11:19:31.222000000');
SELECT hour('2023-05-21 11:19:31.222000000');
SELECT minute('2023-05-21 11:19:31.222000000');
SELECT second('2023-05-21 11:19:31.222000000');
SELECT dayofmonth('2023-05-21 11:19:31.222000000');
SELECT dayofweek('2023-05-21 11:19:31.222000000');
SELECT quarter('2023-05-21 11:19:31.222000000');
SELECT weekofyear('2023-05-21 11:19:31.222000000');

-- Date arithmetic; date_add / date_sub are shown with both positive and negative offsets.
SELECT datediff('2023-05-21', '2023-05-09');
SELECT date_add(current_timestamp(), 1);
SELECT date_sub(current_timestamp(), -1);
SELECT date_sub(current_timestamp(), 1);
SELECT date_add(current_timestamp(), -1);

-- Unix timestamp conversions in both directions.
SELECT unix_timestamp();
SELECT unix_timestamp("2023-5-21 11:38:56");
SELECT unix_timestamp('20230521 11:38:56', 'yyyyMMdd HH:mm:ss');
SELECT from_unixtime(1684669136);
SELECT from_unixtime(0);
数学函数
pi: 生成π结果
round: 指定小数保留位数 常用
rand: 生成0-1的随机数
ceil: 向上取整
floor: 向下取整
-- rand, pi, and rounding pi three different ways.
SELECT rand();
SELECT '3.1415926';
SELECT pi();
SELECT round(pi(), 4);
SELECT ceil(pi());
SELECT floor(pi());
条件函数[练习]
知识点:
if(参数1,参数2,参数3): 如果参数1结果为true,就执行参数2内容,否则执行参数3的内容
case...when...then...end: 条件判断类似于编程语言中的if..else if ...else... 常用
isnull(数据) : 为空null: true 不为空:false
isnotnull(数据): 不为空: true 为空null:false
nvl(数据,参数2): 如果数据不为空打印数据,为空null打印第二个参数 常用
coalesce(参数1,参数2...): 从左到右依次查找,返回第一个不是null的值,如果找到最后都是null,就返回null 常用
示例:
-- if(): a true condition and a false condition.
SELECT if(10 > 5, '真', '假');
SELECT if(10 < 5, '真', '假');

-- Simple CASE: compare one value against each WHEN.
SELECT
    CASE 7
        WHEN 1 THEN '周一上班'
        WHEN 2 THEN '周二上班'
        WHEN 3 THEN '周三上班'
        WHEN 4 THEN '周四上班'
        WHEN 5 THEN '周五上班'
        WHEN 6 THEN '周六休息'
        WHEN 7 THEN '周日休息'
        ELSE '老弟啊,你是外星人吗?'
    END;

-- Searched CASE: each WHEN carries its own boolean condition.
SELECT
    CASE
        WHEN 7 == 1 THEN '周一上班'
        WHEN 7 == 2 THEN '周二上班'
        WHEN 7 == 3 THEN '周三上班'
        WHEN 7 == 4 THEN '周四上班'
        WHEN 7 == 5 THEN '周五上班'
        WHEN 7 == 6 THEN '周六休息'
        WHEN 7 == 7 THEN '周日休息'
        ELSE '老弟啊,你是外星人吗?'
    END;

-- Null tests and null defaults.
SELECT isnull(null);
SELECT isnotnull('斌子');
SELECT nvl('binzi', '666');
SELECT nvl(null, '666');

-- coalesce with the first non-null value moving to the right, then all nulls.
SELECT COALESCE(null, 11, 22, 33);
SELECT COALESCE(null, null, 22, 33);
SELECT COALESCE(null, null, null, 33);
SELECT COALESCE(null, null, null, 0);
SELECT COALESCE(null, null, null, null);
类型转换函数
类型转换: cast(数据 as 要转换的类型) 常用
-- Explicit casts between numeric and string types, including a non-numeric string.
SELECT CAST(3.14 AS INT);
SELECT CAST(3.14 AS STRING);
SELECT CAST('3.14' AS FLOAT);
SELECT CAST('3.14' AS INT);
SELECT CAST('binzi' AS INT);

-- Arithmetic between a string literal and a number.
SELECT '3' + 3;

-- concat_ws with a bare int argument, then the same call with the int cast to string.
-- NOTE(review): the first form is presumably included to show it being
-- rejected (concat_ws expects string arguments) - confirm.
SELECT concat_ws('_', 'binzi', 666, '888');
SELECT concat_ws('_', 'binzi', CAST(666 AS STRING), '888');
数据脱敏函数
-- Hash-based masking.
SELECT mask_hash('binzi');

-- mask with default and with caller-supplied replacement characters.
SELECT mask("abc123DEF");
SELECT mask("abc123DEF", '大', '小', '数');
SELECT mask("abc123DEF", '/', '.', '%');

-- Mask or show only the first / last 6 characters.
SELECT mask_first_n("abc123DEF", 6);
SELECT mask_last_n("abc123DEF", 6);
SELECT mask_show_first_n("abc123DEF", 6);
SELECT mask_show_last_n("abc123DEF", 6);
其他函数
-- Hash / digest functions; the length() calls measure two hex-string literals.
SELECT hash("binzi");
SELECT md5("binzi");
SELECT length('072853027b387fcf891a610137f8dc1b');
SELECT sha1("binzi");
SELECT length('66368c80ca9125f9a8a945aaf1e1ec3f8b21f7f9');
SELECT sha2("binzi", 224);
SELECT sha2("binzi", 512);
SELECT crc32("binzi");

-- Session information.
SELECT
    current_user(),
    logged_in_user(),
    current_database(),
    version();
炸裂函数实战[练习]
知识点:
把一个容器的多个数据炸裂出单独展示: explode(容器)
炸裂函数配合侧视图使用格式:select 原表别名.字段名,侧视图名.字段名 from 原表 原表别名 lateral view explode(要炸开的字段) 侧视图名 as 字段名 ;
示例:
-- explode on an array and on a map.
SELECT explode(array('binzi', '666', '888'));
SELECT explode(map('a', 1, 'b', 2, 'c', 3));
实战
-- One row per team; champion_year is an array whose items are '|'-separated
-- inside a ','-separated record.
CREATE TABLE the_nba_championship (
    team_name     STRING,
    champion_year ARRAY<STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|';

LOAD DATA INPATH '/source/The_NBA_Championship.txt' INTO TABLE the_nba_championship;

SELECT *
FROM the_nba_championship;

-- explode on its own: one row per array element, without team_name.
SELECT explode(champion_year) AS year
FROM the_nba_championship;

-- explode through a lateral view keeps team_name beside each year;
-- the result is sorted by year descending.
WITH tmp AS (
    SELECT
        a.team_name,
        b.year
    FROM the_nba_championship a
    LATERAL VIEW explode(champion_year) b AS year
)
SELECT *
FROM tmp
ORDER BY year DESC;
高频面试题[练习]

行转列
知识点:
collect_set(字段名): 把多个数据收集到一起,默认去重
collect_list(字段名): 把多个数据收集到一起,默认不去重
把多个子串用指定分隔符拼接成一个大字符串: concat_ws(分隔符,多个数据...) 注意: 如果拼接数据不是字符串可以使用cast转换
需求:
示例:
-- Gather col3 values into a list per (col1, col2) group.
SELECT
    col1,
    col2,
    collect_list(col3)
FROM row2col2
GROUP BY
    col1,
    col2;

-- Same grouping, with the list joined into one '-'-separated string.
SELECT
    col1,
    col2,
    concat_ws('-', collect_list(CAST(col3 AS STRING)))
FROM row2col2
GROUP BY
    col1,
    col2;
列转行
知识点
把字符串按照指定分隔符切割: split(字符串,分隔符)
炸裂函数配合侧视图使用格式: select 原表别名.字段名,侧视图名.字段名 from 原表 原表别名 lateral view explode(要炸开的字段) 侧视图名 as 字段名 ;
需求

示例
-- Tab-delimited source table; col3 is split on ',' below.
CREATE TABLE col2row2 (
    col1 STRING,
    col2 STRING,
    col3 STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';

LOAD DATA INPATH '/source/c2r2.txt' INTO TABLE col2row2;

SELECT *
FROM col2row2;

-- Split col3 on ',' and emit one row per piece beside col1 / col2.
SELECT
    col1,
    col2,
    lv.col33
FROM col2row2
LATERAL VIEW explode(split(col3, ',')) lv AS col33;
JSON文件处理
知识点:
get_json_object: 获取json对象解析对应数据 一次只能提取一个字段
json_tuple: 直接获取json对应数据 这是一个UDTF函数 可以一次解析提取多个字段
注意: 因为json_tuple是UDTF函数,所以也可以配合侧视图使用
示例:
-- Single string column that receives the contents of the JSON file.
CREATE TABLE tb_json_test1 (
    json STRING
);

LOAD DATA INPATH '/source/device.json' INTO TABLE tb_json_test1;

SELECT *
FROM tb_json_test1;

-- Variant 1: one get_json_object call per extracted field.
CREATE TABLE device1 AS
SELECT
    get_json_object(json, "$.device")     AS device,
    get_json_object(json, "$.deviceType") AS deviceType,
    get_json_object(json, "$.signal")     AS signal,
    get_json_object(json, "$.time")       AS stime
FROM tb_json_test1;

-- Variant 2: json_tuple extracts all four fields in one call.
CREATE TABLE device2 AS
SELECT
    json_tuple(json, "device", "deviceType", "signal", "time")
        AS (device, deviceType, signal, stime)
FROM tb_json_test1;

-- Variant 3: json_tuple through a lateral view.
SELECT
    device,
    deviceType,
    signal,
    stime
FROM tb_json_test1
LATERAL VIEW json_tuple(json, "device", "deviceType", "signal", "time") b
    AS device, deviceType, signal, stime;
-- Table backed by the JSON SerDe: the fields are declared as typed columns
-- instead of being extracted from a string column.
CREATE TABLE tb_json_test2 (
    device     STRING,
    deviceType STRING,
    signal     DOUBLE,
    `time`     STRING
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;

LOAD DATA INPATH '/source/device.json' INTO TABLE tb_json_test2;

SELECT *
FROM tb_json_test2;
开窗函数
基础使用[回顾]
基础知识点[重点]
开窗函数格式: select ... 开窗函数 over(partition by 分组字段名 order by 排序字段名 asc|desc) ... from 表名;
聚合开窗函数: 原来学的聚合函数(max,min,sum,count,avg)配合over()使用的时候,这些聚合函数也可以叫开窗函数
排序开窗函数: row_number dense_rank rank
row_number: 巧记: 1234 特点: 唯一且连续
dense_rank: 巧记: 1223 特点: 并列且连续
rank : 巧记: 1224 特点: 并列不连续
-- Aggregate used as a window function: the empty OVER() applies max(col3)
-- across all rows while each input row is still returned.
SELECT
    col1,
    MAX(col3) OVER ()
FROM row2col2;
-- device1.signal is produced by get_json_object, which returns a string, so
-- ordering or taking max on the raw column compares text rather than numbers.
-- Casting to double makes the ranking and max numeric; double matches the
-- signal type declared in tb_json_test2.

-- Rank all rows by signal, highest first.
select *,
    row_number() over (order by cast(signal as double) desc),
    dense_rank() over (order by cast(signal as double) desc),
    rank() over (order by cast(signal as double) desc)
from device1;

-- Rank within each deviceType.
select *,
    row_number() over (partition by deviceType order by cast(signal as double) desc),
    dense_rank() over (partition by deviceType order by cast(signal as double) desc),
    rank() over (partition by deviceType order by cast(signal as double) desc)
from device1;

-- Highest signal per deviceType, repeated on every row of that type.
select *,
    max(cast(signal as double)) over(partition by deviceType)
from device1;
-- Page-view counts per cookie and time; comma-delimited text.
CREATE TABLE website_pv_info (
    cookieid   STRING,
    createtime STRING,
    pv         INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

-- Visited URL per cookie and time; comma-delimited text.
CREATE TABLE website_url_info (
    cookieid   STRING,
    createtime STRING,
    url        STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

SELECT *
FROM website_pv_info;

SELECT *
FROM website_url_info;

-- Running pv total per cookie, ordered by createtime.
SELECT
    cookieid,
    createtime,
    pv,
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime) AS current_total_pv
FROM website_pv_info;
开窗函数控制范围
开窗函数控制范围: rows between
- x preceding:往前x行
- x following:往后x行
- current row:当前行
- unbounded: 无边界(配合preceding表示起点,配合following表示终点)
- unbounded preceding :表示从前面的起点 第一行
- unbounded following :表示到后面的终点 最后一行
-- Every query sums pv per cookieid ordered by createtime; only the frame differs.

-- No explicit frame.
SELECT
    cookieid,
    createtime,
    pv,
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime) AS pv1
FROM website_pv_info;

-- First row of the partition through the current row.
SELECT
    cookieid,
    createtime,
    pv,
    SUM(pv) OVER (
        PARTITION BY cookieid
        ORDER BY createtime
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS pv2
FROM website_pv_info;

-- Three rows back through the current row.
SELECT
    cookieid,
    createtime,
    pv,
    SUM(pv) OVER (
        PARTITION BY cookieid
        ORDER BY createtime
        ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
    ) AS pv4
FROM website_pv_info;

-- Three rows back through one row ahead.
SELECT
    cookieid,
    createtime,
    pv,
    SUM(pv) OVER (
        PARTITION BY cookieid
        ORDER BY createtime
        ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING
    ) AS pv5
FROM website_pv_info;

-- Current row through the last row of the partition.
SELECT
    cookieid,
    createtime,
    pv,
    SUM(pv) OVER (
        PARTITION BY cookieid
        ORDER BY createtime
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
    ) AS pv6
FROM website_pv_info;

-- Whole partition.
SELECT
    cookieid,
    createtime,
    pv,
    SUM(pv) OVER (
        PARTITION BY cookieid
        ORDER BY createtime
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS pv6
FROM website_pv_info;
其他开窗函数
其他开窗函数: ntile lag和lead first_value和last_value
ntile(x)功能: 将分组排序之后的数据分成指定的x个部分(x个桶)
注意ntile规则:尽量平均分配 ,优先满足最小(编号1)的桶,彼此最多不相差1个。
lag: 用于统计窗口内往上第n行值
lead:用于统计窗口内往下第n行值
first_value: 取分组内排序后,截止到当前行,第一个值
last_value : 取分组内排序后,截止到当前行,最后一个值
注意: 窗口函数结果都是单独生成一列存储对应数据
-- Split each cookie's rows, ordered by createtime, into 3 numbered buckets.
select
    cookieid,
    createtime,
    pv,
    ntile(3) over (partition by cookieid order by createtime) as rn2
from website_pv_info
order by
    cookieid,
    createtime;

-- Bucket by pv descending and keep only bucket 1 of each cookie.
select *
from (
    select
        cookieid,
        createtime,
        pv,
        ntile(3) over (partition by cookieid order by pv desc) as rn
    from website_pv_info
) tmp
where rn = 1;
-- All windows partition by cookieid and order by createtime.

-- lag: createtime from 1 and 2 rows earlier; the 2-row form supplies a default value.
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER (PARTITION BY cookieid ORDER BY createtime) rn,
    LAG(createtime, 1) OVER (PARTITION BY cookieid ORDER BY createtime) la1,
    LAG(createtime, 2, '2000-01-01 00:00:00') OVER (PARTITION BY cookieid ORDER BY createtime) la2
FROM website_url_info;

-- lead: createtime from 1 and 2 rows later; the 2-row form supplies a default value.
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER (PARTITION BY cookieid ORDER BY createtime) rn,
    LEAD(createtime, 1) OVER (PARTITION BY cookieid ORDER BY createtime) la1,
    LEAD(createtime, 2, '2000-01-01 00:00:00') OVER (PARTITION BY cookieid ORDER BY createtime) la2
FROM website_url_info;

-- first_value of url within the window.
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER (PARTITION BY cookieid ORDER BY createtime) rn,
    FIRST_VALUE(url) OVER (PARTITION BY cookieid ORDER BY createtime) fv
FROM website_url_info;

-- last_value of url, with the frame widened to the whole partition.
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER (PARTITION BY cookieid ORDER BY createtime) rn,
    LAST_VALUE(url) OVER (
        PARTITION BY cookieid
        ORDER BY createtime
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) fv
FROM website_url_info;