clickhouse在做用户画像标签时,怎么去做圈选,表结构应该是怎么样的,我们应该怎么去处理,能够使其高性能的圈选,尽可能缩小其占用的存储空间?
这个问题,我通过代码给大家做下的演示
先在hive中对数据预处理
--最初表结构
create table f_tag_table( tag Int, tag_name String, cust_num String );
--插入数据
INSERT INTO f_tag_table values (1001, '存款客户', '1,2,3'), (2001,'国债客户', '2,3,4'), (3001,'卡黑名单客户', '2,4'), (4001,'短信黑名单', '3,4');
现在的表是这样的:
tag | tag_name | cust_num |
1001 | 存款客户 | 1,2,3 |
2001 | 国债客户 | 2,3,4 |
3001 | 卡黑名单客户 | 2,4 |
4001 | 短信黑名单 | 3,4 |
重新建一张表
create table usr_table( id int, tag_name varchar(30), tag int );
把表f_tag_table的数据插入进去
insert into usr_table select usr, tag_name, tag from f_tag_table lateral view explode(split(cust_num, ',')) tmp as usr;
现在的表结构如下
usr | tag_name | tag |
1 | 存款客户 | 1001 |
2 | 存款客户 | 1001 |
3 | 存款客户 | 1001 |
2 | 国债客户 | 2001 |
3 | 国债客户 | 2001 |
4 | 国债客户 | 2001 |
2 | 卡黑名单客户 | 3001 |
4 | 卡黑名单客户 | 3001 |
3 | 短信黑名单 | 4001 |
4 | 短信黑名单 | 4001 |
--写入到文件中
./hive -e " select * from ck.usr_table " | tr "\t" "," > /fileName.csv
--在clickhouse中建库建表
create database ck; use ck; create table usr_table( id UInt32, tag_name String, tag UInt32 )ENGINE = MergeTree() ORDER BY id;
--写入到CK中
clickhouse-client -m -u default -h 192.168.88.161 --query 'INSERT INTO ck.usr_table FORMAT CSV' < /fileName.csv
-- 存储表结构
create table tag_table( tag UInt32, tag_name String, cust_num AggregateFunction(groupBitmap, UInt64 ) )ENGINE = AggregatingMergeTree() ORDER BY (tag, tag_name) SETTINGS index_granularity = 128; insert into tag_table select tag, tag_name, groupBitmapState(toUInt64(id)) as cust_num from usr_table group by tag,tag_name;
--查询表
select tag, tag_name, bitmapToArray(cust_num) from tag_table;
现在表是这样的:
tag | tag_name | cust_num |
1001 | 存款客户 | [1,2,3] |
2001 | 国债客户 | [2,3,4] |
3001 | 卡黑名单客户 | [2,4] |
4001 | 短信黑名单 | [3,4] |
用bitmap进行圈选
WITH ( SELECT cust_num from tag_table where tag ='1001' ) AS tag1, ( SELECT cust_num from tag_table where tag ='2001' ) AS tag2, ( SELECT cust_num from tag_table where tag ='3001' ) AS tag3, ( SELECT cust_num from tag_table where tag ='4001' ) AS tag4 select bitmapToArray(bitmapAndnot(bitmapOr(tag1, tag2),bitmapOr(tag3, tag4))) as customer;