hive的函数-阿里云开发者社区

hive的函数

2022-11-24 121

版权

本文内容由阿里云实名注册用户自发贡献，版权归原作者所有，阿里云开发者社区不拥有其著作权，亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容，填写侵权投诉表单进行举报，一经查实，本社区将立刻删除涉嫌侵权内容。

简介： hive的函数

表生成函数

集合函数

--集合函数
select sort_array(array('y','z','q'));
size(Map<K,V>)--返回一个值
map_keys(Map<K,V>)--返回一个数组
map_values(Map<K,V>)--返回一个数组

条件控制函数

--条件控制函数（case when）
--查询用户id,name,age（如果年龄30以下，显示青年人，30-40，显示中年人，40以上，老年人）
select id,name,info.age,
case 
when info.age<30 then '青年'
when info.age<40 and info.age>30 then '中年人'
else '老年人'
end
from t_user;
+-----+-----------+------+------+--+
| id  |   name    | age  | _c3  |
+-----+-----------+------+------+--+
| 1   | zhangsan  | 18   | 青年   |
| 2   | lisi      | 28   | 青年   |
| 3   | wangwu    | 38   | 中年人  |
| 4   | 赵六        | 26   | 青年   |
| 5   | 钱琪        | 35   | 中年人  |
| 6   | 王八        | 48   | 老年人  |
+-----+-----------+------+------+--+
--IF
--需求，如果主演中有徐峥的就是好片，否则不是
select movie_name,
if(array_contains(actors,'徐峥'),'好片','不是好片'),
first_show
from t_movie;
+-------------+-------+-------------+--+
| movie_name  |  _c1  | first_show  |
+-------------+-------+-------------+--+
| 无名之辈        | 不是好片  | 2018-11-16  |
| 我不是药神       | 好片    | 2018-07-05  |
| 一出好戏        | 不是好片  | 2018-08-10  |
| 中国机长        | 不是好片  | 2018-05-18  |
| 囧妈          | 好片    | 2020-01-25  |
+-------------+-------+-------------+--+

分析函数

--分析函数
--row_number() over()函数：分组TOPN
--有如下数据  vi row_number.dat
1,18,a,male
2,19,b,male
3,22,c,female
4,16,d,female
5,30,e,male
6,26,f,female
--建表
create table t_rn(id int,age int,name string,sex string)
row format delimited fields terminated by ',';
--导数据
load data local inpath '/root/hivetest/row_number.dat' into table t_rn;
--查询出每种性别中年龄最大的两条数据（思路：分组——排序——标记序号）
select id,name,age,sex,
row_number() over(partition by sex order by age desc) as rn
from t_rn;
+-----+-------+------+---------+-----+--+
| id  | name  | age  |   sex   | rn  |
+-----+-------+------+---------+-----+--+
| 6   | f     | 26   | female  | 1   |
| 3   | c     | 22   | female  | 2   |
| 4   | d     | 16   | female  | 3   |
| 5   | e     | 30   | male    | 1   |
| 2   | b     | 19   | male    | 2   |
| 1   | a     | 18   | male    | 3   |
+-----+-------+------+---------+-----+--+
select id,name,age,sex
from
(select id,name,age,sex,
row_number() over(partition by sex order by age desc) as rn
from t_rn) tmp
where rn<3;
+-----+-------+------+---------+--+
| id  | name  | age  |   sex   |
+-----+-------+------+---------+--+
| 6   | f     | 26   | female  |
| 3   | c     | 22   | female  |
| 5   | e     | 30   | male    |
| 2   | b     | 19   | male    |
+-----+-------+------+---------+--+

级联报表查询

--级联报表查询
--数据 vi accu.dat
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
C,2015-01,10
...
--建表
create table t_access_times(name string,month string,times int)
row format delimited fields terminated by ',';
--导数据
load data local inpath '/root/hivetest/accu.dat' into table t_access_times;
--查询每个人当月累计次数
--1.先查出每个人每个月的次数并存成一张新表（总额表）
create table t_access_number
as
select name,month,sum(times) as amount
from t_access_times
group by name,month;
+-----------------------+------------------------+-------------------------+--+
| t_access_number.name  | t_access_number.month  | t_access_number.amount  |
+-----------------------+------------------------+-------------------------+--+
| A                     | 2015-01                | 33                      |
| A                     | 2015-02                | 10                      |
| A                     | 2015-03                | 20                      |
| B                     | 2015-01                | 30                      |
| B                     | 2015-02                | 15                      |
| B                     | 2015-03                | 45                      |
| C                     | 2015-01                | 30                      |
| C                     | 2015-02                | 40                      |
| C                     | 2015-03                | 30                      |
+-----------------------+------------------------+-------------------------+--+
--2.将新表（总额表）自连接
select name,month,sum(b_amount) as accumulate
from
(select a.name as name,a.month as month,b.amount as b_amount
from t_access_number a left join t_access_number b
on a.name=b.name
where b.month<=a.month) tmp
group by name,month;
+-------+----------+-------------+--+
| name  |  month   | accumulate  |
+-------+----------+-------------+--+
| A     | 2015-01  | 33          |
| A     | 2015-02  | 43          |
| A     | 2015-03  | 63          |
| B     | 2015-01  | 30          |
| B     | 2015-02  | 45          |
| B     | 2015-03  | 90          |
| C     | 2015-01  | 30          |
| C     | 2015-02  | 70          |
| C     | 2015-03  | 100         |
+-------+----------+-------------+--+

窗口分析函数

--窗口分析函数
--求出每个人截至到每个月的总额
--sum()over()函数：可以实现窗口中进行逐行累加
select name,month,amount,
sum(amount)over(partition by name order by month rows between unbounded preceding and current row) as accumlate
from t_access_number;
+-------+----------+---------+------------+--+
| name  |  month   | amount  | accumlate  |
+-------+----------+---------+------------+--+
| A     | 2015-01  | 33      | 33         |
| A     | 2015-02  | 10      | 43         |
| A     | 2015-03  | 20      | 63         |
| B     | 2015-01  | 30      | 30         |
| B     | 2015-02  | 15      | 45         |
| B     | 2015-03  | 45      | 90         |
| C     | 2015-01  | 30      | 30         |
| C     | 2015-02  | 40      | 70         |
| C     | 2015-03  | 30      | 100        |
+-------+----------+---------+------------+--+

hive 自定义函数

--hive 自定义函数
/*
有如下json数据：rating.json
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
需要导入hive中进行数据分析
*/
--建表映射上述数据
create table t_ratingjson(json string);
--导数据
load data local inpath '/root/hivetest/rating.json' into table t_ratingjson;
--想把上面的原始数据变成如下形式：
  1193,5,978300760,1
  661,3,978302109,1
  914,3,978301968,1
  3408,4,978300275,1
--思路：如果能够定义一个json解析函数
create table t_rate
as
select myjson(json,1) as movie,cast(myjson(json,2) as int) as rate,myjson(json,3) as ts,myjson(json,4) as uid from t_ratingjson;
select * from t_rate;
+---------------+--------------+------------+-------------+--+
| t_rate.movie  | t_rate.rate  | t_rate.ts  | t_rate.uid  |
+---------------+--------------+------------+-------------+--+
| 1193          | 5            | 978300760  | 1           |
| 661           | 3            | 978302109  | 1           |
| 914           | 3            | 978301968  | 1           |
| 3408          | 4            | 978300275  | 1           |
| 2355          | 5            | 978824291  | 1           |
| 1197          | 3            | 978302268  | 1           |
| 1287          | 5            | 978302039  | 1           |
| 2804          | 5            | 978300719  | 1           |
| 594           | 4            | 978302268  | 1           |
| 919           | 4            | 978301368  | 1           |
--解决：
--hive中如何定义自己的函数：
--1、先写一个java类（extends UDF,重载方法public C evaluate(A a,B b)），实现你所想要的函数的功能（传入一个json字符串和一个脚标，返回一个值）
--2、将java程序打成jar包，上传到hive所在的机器
--3、在hive命令行中将jar包添加到classpath ：    
      hive>add jar /root/hivetest/myjson.jar;
--4、在hive命令中用命令创建一个函数叫做myjson，关联你所写的这个java类
      hive> create temporary function myjson as 'cn.hive.json.MyJsonParser';
--查询每个人评论过几部电影
select uid,count(1)
from t_rate
group by uid limit 20;
+-------+-------+--+
|  uid  |  _c1  |
+-------+-------+--+
| 1     | 53    |
| 10    | 401   |
| 100   | 76    |
| 1000  | 84    |
| 1001  | 377   |
| 1002  | 66    |
| 1003  | 29    |
| 1004  | 481   |
| 1005  | 92    |
| 1006  | 44    |
| 1007  | 32    |
| 1008  | 50    |
| 1009  | 52    |
| 101   | 106   |
| 1010  | 1004  |

json解析函数

--json解析函数
select json_tuple(json,'movie','rate','timeStamp','uid')
as (movie,rate,time,uid)
from t_ratingjson limit 20;
+--------+-------+------------+------+--+
| movie  | rate  |    time    | uid  |
+--------+-------+------------+------+--+
| 1193   | 5     | 978300760  | 1    |
| 661    | 3     | 978302109  | 1    |
| 914    | 3     | 978301968  | 1    |
| 3408   | 4     | 978300275  | 1    |
| 2355   | 5     | 978824291  | 1    |
| 1197   | 3     | 978302268  | 1    |
| 1287   | 5     | 978302039  | 1    |

hive的函数

表生成函数

集合函数

条件控制函数

分析函数

级联报表查询

窗口分析函数

hive 自定义函数

json解析函数

热门文章

最新文章

相关课程

相关电子书

热门

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

阿里云MVP

话题

直播

下载

镜像站

技术资料

插件

hive的函数

表生成函数

集合函数

条件控制函数

分析函数

级联报表查询

窗口分析函数

hive 自定义函数

json解析函数

热门文章

最新文章

相关课程

相关电子书