表生成函数
集合函数
--集合函数 select sort_array(array('y','z','q')); size(Map<K,V>)--返回一个值 map_keys(Map<K,V>)--返回一个数组 map_values(Map<K,V>)--返回一个数组
条件控制函数
--条件控制函数(case when) --查询用户id,name,age(如果年龄30以下,显示青年人,30-40,显示中年人,40以上,老年人) select id,name,info.age, case when info.age<30 then '青年' when info.age<40 and info.age>30 then '中年人' else '老年人' end from t_user; +-----+-----------+------+------+--+ | id | name | age | _c3 | +-----+-----------+------+------+--+ | 1 | zhangsan | 18 | 青年 | | 2 | lisi | 28 | 青年 | | 3 | wangwu | 38 | 中年人 | | 4 | 赵六 | 26 | 青年 | | 5 | 钱琪 | 35 | 中年人 | | 6 | 王八 | 48 | 老年人 | +-----+-----------+------+------+--+ --IF --需求,如果主演中有徐峥的就是好片,否则不是 select movie_name, if(array_contains(actors,'徐峥'),'好片','不是好片'), first_show from t_movie; +-------------+-------+-------------+--+ | movie_name | _c1 | first_show | +-------------+-------+-------------+--+ | 无名之辈 | 不是好片 | 2018-11-16 | | 我不是药神 | 好片 | 2018-07-05 | | 一出好戏 | 不是好片 | 2018-08-10 | | 中国机长 | 不是好片 | 2018-05-18 | | 囧妈 | 好片 | 2020-01-25 | +-------------+-------+-------------+--+
分析函数
--分析函数 --row_number() over()函数:分组TOPN --有如下数据 vi row_number.dat 1,18,a,male 2,19,b,male 3,22,c,female 4,16,d,female 5,30,e,male 6,26,f,female --建表 create table t_rn(id int,age int,name string,sex string) row format delimited fields terminated by ','; --导数据 load data local inpath '/root/hivetest/row_number.dat' into table t_rn; --查询出每种性别中年龄最大的两条数据(思路:分组——排序——标记序号) select id,name,age,sex, row_number() over(partition by sex order by age desc) as rn from t_rn; +-----+-------+------+---------+-----+--+ | id | name | age | sex | rn | +-----+-------+------+---------+-----+--+ | 6 | f | 26 | female | 1 | | 3 | c | 22 | female | 2 | | 4 | d | 16 | female | 3 | | 5 | e | 30 | male | 1 | | 2 | b | 19 | male | 2 | | 1 | a | 18 | male | 3 | +-----+-------+------+---------+-----+--+ select id,name,age,sex from (select id,name,age,sex, row_number() over(partition by sex order by age desc) as rn from t_rn) tmp where rn<3; +-----+-------+------+---------+--+ | id | name | age | sex | +-----+-------+------+---------+--+ | 6 | f | 26 | female | | 3 | c | 22 | female | | 5 | e | 30 | male | | 2 | b | 19 | male | +-----+-------+------+---------+--+
级联报表查询
--级联报表查询 --数据 vi accu.dat A,2015-01,5 A,2015-01,15 B,2015-01,5 A,2015-01,8 B,2015-01,25 A,2015-01,5 C,2015-01,10 ... --建表 create table t_access_times(name string,month string,times int) row format delimited fields terminated by ','; --导数据 load data local inpath '/root/hivetest/accu.dat' into table t_access_times; --查询每个人当月累计次数 --1.先查出每个人每个月的次数并存成一张新表(总额表) create table t_access_number as select name,month,sum(times) as amount from t_access_times group by name,month; +-----------------------+------------------------+-------------------------+--+ | t_access_number.name | t_access_number.month | t_access_number.amount | +-----------------------+------------------------+-------------------------+--+ | A | 2015-01 | 33 | | A | 2015-02 | 10 | | A | 2015-03 | 20 | | B | 2015-01 | 30 | | B | 2015-02 | 15 | | B | 2015-03 | 45 | | C | 2015-01 | 30 | | C | 2015-02 | 40 | | C | 2015-03 | 30 | +-----------------------+------------------------+-------------------------+--+ --2.将新表(总额表)自连接 select name,month,sum(b_amount) as accumulate from (select a.name as name,a.month as month,b.amount as b_amount from t_access_number a left join t_access_number b on a.name=b.name where b.month<=a.month) tmp group by name,month; +-------+----------+-------------+--+ | name | month | accumulate | +-------+----------+-------------+--+ | A | 2015-01 | 33 | | A | 2015-02 | 43 | | A | 2015-03 | 63 | | B | 2015-01 | 30 | | B | 2015-02 | 45 | | B | 2015-03 | 90 | | C | 2015-01 | 30 | | C | 2015-02 | 70 | | C | 2015-03 | 100 | +-------+----------+-------------+--+
窗口分析函数
--窗口分析函数 --求出每个人截至到每个月的总额 --sum()over()函数:可以实现窗口中进行逐行累加 select name,month,amount, sum(amount)over(partition by name order by month rows between unbounded preceding and current row) as accumlate from t_access_number; +-------+----------+---------+------------+--+ | name | month | amount | accumlate | +-------+----------+---------+------------+--+ | A | 2015-01 | 33 | 33 | | A | 2015-02 | 10 | 43 | | A | 2015-03 | 20 | 63 | | B | 2015-01 | 30 | 30 | | B | 2015-02 | 15 | 45 | | B | 2015-03 | 45 | 90 | | C | 2015-01 | 30 | 30 | | C | 2015-02 | 40 | 70 | | C | 2015-03 | 30 | 100 | +-------+----------+---------+------------+--+
hive 自定义函数
--hive 自定义函数 /* 有如下json数据:rating.json {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"} {"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"} {"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"} {"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"} 需要导入hive中进行数据分析 */ --建表映射上述数据 create table t_ratingjson(json string); --导数据 load data local inpath '/root/hivetest/rating.json' into table t_ratingjson; --想把上面的原始数据变成如下形式: 1193,5,978300760,1 661,3,978302109,1 914,3,978301968,1 3408,4,978300275,1 --思路:如果能够定义一个json解析函数 create table t_rate as select myjson(json,1) as movie,cast(myjson(json,2) as int) as rate,myjson(json,3) as ts,myjson(json,4) as uid from t_ratingjson; select * from t_rate; +---------------+--------------+------------+-------------+--+ | t_rate.movie | t_rate.rate | t_rate.ts | t_rate.uid | +---------------+--------------+------------+-------------+--+ | 1193 | 5 | 978300760 | 1 | | 661 | 3 | 978302109 | 1 | | 914 | 3 | 978301968 | 1 | | 3408 | 4 | 978300275 | 1 | | 2355 | 5 | 978824291 | 1 | | 1197 | 3 | 978302268 | 1 | | 1287 | 5 | 978302039 | 1 | | 2804 | 5 | 978300719 | 1 | | 594 | 4 | 978302268 | 1 | | 919 | 4 | 978301368 | 1 | --解决: --hive中如何定义自己的函数: --1、先写一个java类(extends UDF,重载方法public C evaluate(A a,B b)),实现你所想要的函数的功能(传入一个json字符串和一个脚标,返回一个值) --2、将java程序打成jar包,上传到hive所在的机器 --3、在hive命令行中将jar包添加到classpath : hive>add jar /root/hivetest/myjson.jar; --4、在hive命令中用命令创建一个函数叫做myjson,关联你所写的这个java类 hive> create temporary function myjson as 'cn.hive.json.MyJsonParser'; --查询每个人评论过几部电影 select uid,count(1) from t_rate group by uid limit 20; +-------+-------+--+ | uid | _c1 | +-------+-------+--+ | 1 | 53 | | 10 | 401 | | 100 | 76 | | 1000 | 84 | | 1001 | 377 | | 1002 | 66 | | 1003 | 29 | | 1004 | 481 | | 1005 | 92 | | 1006 | 44 | | 1007 | 32 | | 1008 | 50 | | 1009 | 52 | | 101 | 106 | | 1010 | 1004 |
json解析函数
--json解析函数 select json_tuple(json,'movie','rate','timeStamp','uid') as (movie,rate,time,uid) from t_ratingjson limit 20; +--------+-------+------------+------+--+ | movie | rate | time | uid | +--------+-------+------------+------+--+ | 1193 | 5 | 978300760 | 1 | | 661 | 3 | 978302109 | 1 | | 914 | 3 | 978301968 | 1 | | 3408 | 4 | 978300275 | 1 | | 2355 | 5 | 978824291 | 1 | | 1197 | 3 | 978302268 | 1 | | 1287 | 5 | 978302039 | 1 |