5.Flink Sink Operators in Practice
5.1.Sink Operator Overview
- The Flink programming model: Source -> Transformation -> Sink
(1) Sink output targets
    - Predefined sinks
        - writeAsText (deprecated)
    - Custom sinks
        - SinkFunction
        - RichSinkFunction
            - The Rich variants expose a richer API, adding open() and close() methods for things such as initializing connections
    - Bundled connectors shipped by Flink itself
        - Kafka, Elasticsearch, etc.
    - Apache Bahir
        - Kafka, Elasticsearch, Redis, etc.
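A quick sketch of the predefined and custom styles listed above (assuming an existing DataStream<String> named ds; apart from that, the calls are standard DataStream API):

// Predefined sinks: print to stdout, or write to a text file (writeAsText is the deprecated API mentioned above)
ds.print();
ds.writeAsText("/tmp/sink-out");

// Custom sink: implement SinkFunction, or extend RichSinkFunction when open()/close() hooks are needed
ds.addSink(new SinkFunction<String>() {
    @Override
    public void invoke(String value, Context context) throws Exception {
        System.out.println("custom sink received: " + value);
    }
});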
5.2.Custom Sink Writing to MySQL
(1) Deploy a MySQL instance
docker pull mysql:5.7
docker run -itd -p 3306:3306 --name my-mysql -e MYSQL_ROOT_PASSWORD=123456 mysql:5.7
(2) Connect to MySQL and create the table
CREATE TABLE `video_order` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `user_id` int(11) DEFAULT NULL,
  `money` int(11) DEFAULT NULL,
  `title` varchar(32) DEFAULT NULL,
  `trade_no` varchar(64) DEFAULT NULL,
  `create_time` date DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
(3) Add the Flink JDBC connector and MySQL driver dependencies
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-jdbc_2.12</artifactId>
    <version>1.12.0</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.25</version>
</dependency>
(4) Implement the MysqlSink
/**
 * @author lixiang
 */
public class MysqlSink extends RichSinkFunction<VideoOrderDO> {

    private Connection conn = null;
    private PreparedStatement ps = null;

    @Override
    public void open(Configuration parameters) throws Exception {
        System.out.println("---open---");
        conn = DriverManager.getConnection("jdbc:mysql://192.168.139.20:3306/flink?useUnicode=true&characterEncoding=utf8&allowMultiQueries=true&serverTimezone=Asia/Shanghai", "root", "123456");
        String sql = "INSERT INTO `video_order` (`user_id`, `money`, `title`, `trade_no`, `create_time`) VALUES(?,?,?,?,?);";
        ps = conn.prepareStatement(sql);
    }

    @Override
    public void invoke(VideoOrderDO videoOrder, Context context) throws Exception {
        // bind the ? placeholders of the prepared statement
        ps.setInt(1, videoOrder.getUserId());
        ps.setInt(2, videoOrder.getMoney());
        ps.setString(3, videoOrder.getTitle());
        ps.setString(4, videoOrder.getTradeNo());
        // java.sql.Date built from the java.util.Date in the POJO
        ps.setDate(5, new Date(videoOrder.getCreateTime().getTime()));
        int i = ps.executeUpdate();
        System.out.println("Processed record, insert succeeded: " + (i > 0));
    }

    @Override
    public void close() throws Exception {
        // close the statement before the connection
        if (ps != null) {
            ps.close();
        }
        if (conn != null) {
            conn.close();
        }
        System.out.println("---close---");
    }
}
(5) Wire the MysqlSink into a job
public class FlinkMainSink {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(new Configuration());
        env.setParallelism(1);

        DataStreamSource<VideoOrderDO> source = env.addSource(new VideoOrderSource());

        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(
                3, // number of restart attempts
                Time.of(10, TimeUnit.SECONDS) // delay between attempts
        ));

        source.print("received record");
        source.addSink(new MysqlSink());

        // start the job
        env.execute("custom sink job");
    }
}
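For reference, the flink-connector-jdbc dependency added in step (3) also ships a ready-made JdbcSink; a sketch of the same insert written against it (reusing the table, URL and VideoOrderDO fields from above; this replaces the hand-written MysqlSink only for illustration):

// JdbcSink variant: the connector manages the connection lifecycle and statement reuse for us
SinkFunction<VideoOrderDO> jdbcSink = JdbcSink.<VideoOrderDO>sink(
        "INSERT INTO `video_order` (`user_id`, `money`, `title`, `trade_no`, `create_time`) VALUES(?,?,?,?,?)",
        (ps, order) -> {
            ps.setInt(1, order.getUserId());
            ps.setInt(2, order.getMoney());
            ps.setString(3, order.getTitle());
            ps.setString(4, order.getTradeNo());
            ps.setDate(5, new java.sql.Date(order.getCreateTime().getTime()));
        },
        new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                .withUrl("jdbc:mysql://192.168.139.20:3306/flink?useUnicode=true&characterEncoding=utf8&serverTimezone=Asia/Shanghai")
                .withDriverName("com.mysql.cj.jdbc.Driver")
                .withUsername("root")
                .withPassword("123456")
                .build());
source.addSink(jdbcSink);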
(6) Run result
5.3.Custom Sink Writing to Redis
(1) Deploy a Redis instance
# pull the image
docker pull redis
# start the container
docker run -d --name redis -p 6379:6379 redis --requirepass "123456"
(2) How can Flink talk to Redis?
    - Option 1: a hand-written custom sink (a sketch follows this list)
    - Option 2: an existing connector
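The rest of this section uses option 2 (the Bahir connector). For comparison, option 1 might look like the sketch below, built on the Jedis client; the jedis dependency and the connection settings are assumptions made only for this illustration:

// Option 1: a hand-written RichSinkFunction that writes to Redis through the Jedis client
public class MyRedisCustomSink extends RichSinkFunction<Tuple2<String, Integer>> {

    private Jedis jedis;

    @Override
    public void open(Configuration parameters) throws Exception {
        // one connection per subtask
        jedis = new Jedis("192.168.139.20", 6379);
        jedis.auth("123456");
    }

    @Override
    public void invoke(Tuple2<String, Integer> value, Context context) throws Exception {
        // store each title's count in a hash, mirroring what the connector-based sink does below
        jedis.hset("VIDEO_ORDER_COUNTER", value.f0, value.f1.toString());
    }

    @Override
    public void close() throws Exception {
        if (jedis != null) {
            jedis.close();
        }
    }
}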
(3) The core of the connector-based Redis sink is the RedisMapper interface; you implement its three methods in your own mapper class
    - getCommandDescription: chooses the Redis data structure and, for hash commands, the name of the outer key
    - getKeyFromData: extracts the key from a record
    - getValueFromData: extracts the value from a record
(4) Add the Redis connector dependency (Apache Bahir) and integrate Redis through the connector
<dependency>
    <groupId>org.apache.bahir</groupId>
    <artifactId>flink-connector-redis_2.11</artifactId>
    <version>1.0</version>
</dependency>
(5) Implement the RedisMapper
/**
 * The generic type is the record type this mapper handles.
 * @author lixiang
 */
public class MyRedisSink implements RedisMapper<Tuple2<String, Integer>> {

    /**
     * Choose the Redis data structure and the hash key name.
     */
    @Override
    public RedisCommandDescription getCommandDescription() {
        return new RedisCommandDescription(RedisCommand.HSET, "VIDEO_ORDER_COUNTER");
    }

    /**
     * Return the key.
     */
    @Override
    public String getKeyFromData(Tuple2<String, Integer> value) {
        return value.f0;
    }

    /**
     * Return the value.
     */
    @Override
    public String getValueFromData(Tuple2<String, Integer> value) {
        return value.f1.toString();
    }
}
(6) Write the Flink job
/**
 * @author lixiang
 */
public class FlinkRedisDemo {
    public static void main(String[] args) throws Exception {
        // build the execution environment, the entry point of the job
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // a hand-built bounded source could be used instead:
        /*DataStream<VideoOrderDO> ds = env.fromElements(
                new VideoOrderDO(5, 32, "java", "2123143432", new Date()),
                new VideoOrderDO(5, 40, "spring", "2123143432", new Date()),
                new VideoOrderDO(5, 60, "springBoot", "2233143432", new Date()),
                new VideoOrderDO(5, 29, "springBoot", "2125643432", new Date()),
                new VideoOrderDO(5, 67, "docker", "2129843432", new Date()),
                new VideoOrderDO(5, 89, "java", "2120943432", new Date()));*/

        // use the custom source
        DataStream<VideoOrderDO> ds = env.addSource(new VideoOrderSource());

        // map: emit one (title, 1) record per order so we can count later
        DataStream<Tuple2<String, Integer>> mapDS = ds.map(new MapFunction<VideoOrderDO, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(VideoOrderDO value) throws Exception {
                return new Tuple2<>(value.getTitle(), 1);
            }
        });

        KeyedStream<Tuple2<String, Integer>, String> keyedStream = mapDS.keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
            @Override
            public String getKey(Tuple2<String, Integer> value) throws Exception {
                return value.f0;
            }
        });

        DataStream<Tuple2<String, Integer>> sumDS = keyedStream.sum(1);

        // print the running counts
        sumDS.print();

        FlinkJedisPoolConfig conf = new FlinkJedisPoolConfig.Builder()
                .setHost("192.168.139.20")
                .setPassword("123456")
                .setPort(6379)
                .build();
        sumDS.addSink(new RedisSink<>(conf, new MyRedisSink()));

        // a DataStream job needs execute(); the job name is arbitrary
        env.execute("custom redis job");
    }
}
/**
 * Custom source.
 * @author lixiang
 */
public class VideoOrderSource extends RichParallelSourceFunction<VideoOrderDO> {

    private volatile Boolean flag = true;
    private Random random = new Random();
    private static List<String> list = new ArrayList<>();

    static {
        list.add("SpringBoot2.x");
        list.add("Linux");
        list.add("Flink");
        list.add("Kafka");
        list.add("SpringCloud");
        list.add("SpringBoot");
        list.add("Docker");
        list.add("Netty");
    }

    @Override
    public void run(SourceContext<VideoOrderDO> sourceContext) throws Exception {
        // emit one random order per second until cancelled
        while (flag) {
            Thread.sleep(1000);
            int userId = random.nextInt(10);
            int money = random.nextInt(100);
            String title = list.get(random.nextInt(list.size()));
            String uuid = UUID.randomUUID().toString();
            sourceContext.collect(new VideoOrderDO(userId, money, title, uuid, new Date()));
        }
    }

    /**
     * Cancel the source.
     */
    @Override
    public void cancel() {
        flag = false;
    }
}
5.4.Integrating Flink with the Kafka Connector
(1) Set up a Kafka environment
    - Pull the images and start the containers
# deploy ZooKeeper
docker run -d --name zookeeper -p 2181:2181 -t wurstmeister/zookeeper

# deploy Kafka (replace the IP with your own)
docker run -d --name kafka \
-p 9092:9092 \
-e KAFKA_BROKER_ID=0 \
-e KAFKA_ZOOKEEPER_CONNECT=192.168.139.20:2181 \
-e KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://192.168.139.20:9092 \
-e KAFKA_LISTENERS=PLAINTEXT://0.0.0.0:9092 \
wurstmeister/kafka
- Enter the container and create a topic
# enter the container and create the topic
docker exec -it kafka /bin/bash
cd /opt/kafka
bin/kafka-topics.sh --create --zookeeper 192.168.139.20:2181 --replication-factor 1 --partitions 1 --topic test-topic
- Produce messages and consume them
# start a console producer
bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test-topic

# start a console consumer; --from-beginning consumes from the first message
bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic test-topic --from-beginning
(2) Reading from and writing to Kafka in Flink
    - Earlier we wrote our own SourceFunction; Flink also ships connectors for external systems, for example for reading from Kafka
    - The connector provided by Flink
    - Add the dependency
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_${scala.version}</artifactId>
    <version>${flink.version}</version>
</dependency>
- Write the Flink job
/**
 * @author lixiang
 */
public class FlinkKafka {
    public static void main(String[] args) throws Exception {
        // build the execution environment, the entry point of the job
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        Properties props = new Properties();
        // Kafka broker address
        props.setProperty("bootstrap.servers", "192.168.139.20:9092");
        // consumer group
        props.setProperty("group.id", "video-order-group");
        // key/value string (de)serialization
        props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // offset reset policy
        props.setProperty("auto.offset.reset", "latest");
        // auto commit
        props.setProperty("enable.auto.commit", "true");
        props.setProperty("auto.commit.interval.ms", "2000");
        // a background thread checks for Kafka partition changes every 10 s
        props.setProperty("flink.partition-discovery.interval-millis", "10000");

        // consume messages from test-topic
        FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>("test-topic", new SimpleStringSchema(), props);
        consumer.setStartFromGroupOffsets();
        DataStream<String> consumerDS = env.addSource(consumer);
        consumerDS.print("message received from test-topic");

        // process each incoming message
        DataStream<String> dataStream = consumerDS.map(new MapFunction<String, String>() {
            @Override
            public String map(String value) throws Exception {
                return "new course order: " + value;
            }
        });

        // send the processed messages to order-topic
        FlinkKafkaProducer<String> producer = new FlinkKafkaProducer<>("order-topic", new SimpleStringSchema(), props);
        dataStream.addSink(producer);

        env.execute("kafka job");
    }
}
- Test: send messages to test-topic and consume them from order-topic
# start a console producer
bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test-topic

# start a console consumer
bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic order-topic --from-beginning
6.Common Flink Transformation Operators
6.1.Map and FlatMap in Practice
(1) The map operation (Java API)
    - One-to-one transformation of records
/**
 * Flink map operator demo.
 * @author lixiang
 */
public class FlinkMapDemo {
    public static void main(String[] args) throws Exception {
        // build the environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStream<VideoOrderDO> streamSource = env.addSource(new VideoOrderSource());
        streamSource.print("before");

        DataStream<Tuple2<String, Integer>> dataStream = streamSource.map(new MapFunction<VideoOrderDO, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(VideoOrderDO value) throws Exception {
                return new Tuple2<>(value.getTitle(), value.getMoney());
            }
        });
        dataStream.print("after");

        env.execute("map job");
    }
}
(2) The flatMap operation (Java API)
    - One-to-many transformation of records
/**
 * Flink flatMap operator demo.
 * @author lixiang
 */
public class FlinkFlatMapDemo {
    public static void main(String[] args) throws Exception {
        // build the environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStream<String> ds = env.fromElements("java&35,spring&20,springboot&30",
                "springcloud&21,shiro&39,docker&56,linux&87",
                "netty&98,kafka&48");
        ds.print("before");

        DataStream<Tuple2<String, Integer>> out = ds.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] element = value.split(",");
                for (String s : element) {
                    String[] eles = s.split("&");
                    out.collect(new Tuple2<>(eles[0], Integer.parseInt(eles[1])));
                }
            }
        });
        out.print("after");

        env.execute("flatMap job");
    }
}
6.2.RichMap and RichFlatMap in Practice
    - The Rich variants add open() and close() methods, typically used to initialize connections
    - The Rich* API methods open(), close(), setRuntimeContext(), etc. are invoked per parallel subtask
        - For example, with a parallelism of 4, open()/close() are each triggered 4 times, once in each of the 4 subtasks
    - Examples: RichMapFunction, RichFlatMapFunction, RichSourceFunction, etc.
(1) RichMapFunction in practice
/**
 * Flink RichMapFunction demo.
 * @author lixiang
 */
public class FlinkMapDemo {
    public static void main(String[] args) throws Exception {
        // build the environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStream<VideoOrderDO> streamSource = env.addSource(new VideoOrderSource());
        streamSource.print("before");

        DataStream<Tuple2<String, Integer>> dataStream = streamSource.map(new RichMapFunction<VideoOrderDO, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(VideoOrderDO value) throws Exception {
                return new Tuple2<>(value.getTitle(), value.getMoney());
            }

            @Override
            public void open(Configuration parameters) throws Exception {
                System.out.println("open() called");
            }

            @Override
            public void close() throws Exception {
                System.out.println("close() called");
            }
        });
        dataStream.print("after");

        env.execute("map job");
    }
}
(2) RichFlatMapFunction in practice
/**
 * Flink RichFlatMapFunction demo.
 * @author lixiang
 */
public class FlinkFlatMapDemo {
    public static void main(String[] args) throws Exception {
        // build the environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStream<String> ds = env.fromElements("java&35,spring&20,springboot&30",
                "springcloud&21,shiro&39,docker&56,linux&87",
                "netty&98,kafka&48");
        ds.print("before");

        DataStream<Tuple2<String, Integer>> out = ds.flatMap(new RichFlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] element = value.split(",");
                for (String s : element) {
                    String[] eles = s.split("&");
                    out.collect(new Tuple2<>(eles[0], Integer.parseInt(eles[1])));
                }
            }

            @Override
            public void open(Configuration parameters) throws Exception {
                System.out.println("open() called");
            }

            @Override
            public void close() throws Exception {
                System.out.println("close() called");
            }
        });
        out.print("after");

        env.execute("flatMap job");
    }
}
6.3.KeyBy Grouping in Practice
    - keyBy partitions the stream by a field: records with the same key end up in the same group, and aggregations then run within each group
/**
 * Flink keyBy operator demo.
 * @author lixiang
 */
public class FlinkKeyByDemo {
    public static void main(String[] args) throws Exception {
        // build the environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStream<VideoOrderDO> dataStream = env.addSource(new VideoOrderSource());

        // group by title
        KeyedStream<VideoOrderDO, String> keyedStream = dataStream.keyBy(new KeySelector<VideoOrderDO, String>() {
            @Override
            public String getKey(VideoOrderDO value) throws Exception {
                return value.getTitle();
            }
        });

        // within each group, keep a running sum of money
        SingleOutputStreamOperator<VideoOrderDO> sumDS = keyedStream.sum("money");

        // map to (title, money) for output
        DataStream<Tuple2<String, Integer>> outputStreamOperator = sumDS.map(new MapFunction<VideoOrderDO, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(VideoOrderDO value) throws Exception {
                return new Tuple2<>(value.getTitle(), value.getMoney());
            }
        });
        outputStreamOperator.print();

        env.execute("keyBy job");
    }
}
6.4.Filter and Sum in Practice
    - filter: the filtering operator
    - sum: the summation operator
/**
 * Flink filter operator demo.
 * Keep only orders with money > 30, group by title, sum money per group, then map for output.
 * @author lixiang
 */
public class FlinkFliterDemo {
    public static void main(String[] args) throws Exception {
        // build the execution environment, the entry point of the job; it holds the global parameters
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        env.setParallelism(1);

        DataStreamSource<VideoOrderDO> ds = env.addSource(new VideoOrderSource());

        DataStream<Tuple2<String, Integer>> out = ds.filter(new FilterFunction<VideoOrderDO>() {
            @Override
            public boolean filter(VideoOrderDO value) throws Exception {
                return value.getMoney() > 30;
            }
        }).keyBy(new KeySelector<VideoOrderDO, String>() {
            @Override
            public String getKey(VideoOrderDO value) throws Exception {
                return value.getTitle();
            }
        }).sum("money").map(new MapFunction<VideoOrderDO, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(VideoOrderDO value) throws Exception {
                return new Tuple2<>(value.getTitle(), value.getMoney());
            }
        });
        out.print();

        env.execute("filter sum job");
    }
}
6.5.Reduce Aggregation in Practice
    - The reduce function
    - After keyBy, sum and reduce can produce the same aggregation result
    - Differences between reduce and sum
        - With sum("xxx"), pass a field index for tuples and a property name for POJOs
        - sum is a simple built-in aggregation, reduce lets you define the aggregation logic yourself, and aggregate (available on windowed streams, see section 7.2) supports even more complex custom aggregation
/**
 * Flink reduce operator demo.
 * @author lixiang
 */
public class FlinkReduceDemo {
    public static void main(String[] args) throws Exception {
        // build the execution environment, the entry point of the job; it holds the global parameters
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        env.setParallelism(1);

        DataStreamSource<VideoOrderDO> ds = env.addSource(new VideoOrderSource());

        SingleOutputStreamOperator<Tuple2<String, Integer>> reduce = ds.keyBy(new KeySelector<VideoOrderDO, String>() {
            @Override
            public String getKey(VideoOrderDO value) throws Exception {
                return value.getTitle();
            }
        }).reduce(new AggregationFunction<VideoOrderDO>() {
            // value1 is the accumulated record, value2 is the newly arrived record,
            // so value1's money carries the running total and value2's money is added to it
            @Override
            public VideoOrderDO reduce(VideoOrderDO value1, VideoOrderDO value2) throws Exception {
                value1.setMoney(value1.getMoney() + value2.getMoney());
                return value1;
            }
        }).map(new MapFunction<VideoOrderDO, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(VideoOrderDO value) throws Exception {
                return new Tuple2<>(value.getTitle(), value.getMoney());
            }
        });
        reduce.print();

        env.execute("reduce job");
    }
}
6.6.maxBy / max / minBy / min in Practice
    - After keyBy, use maxBy/minBy in the downstream operator if you want the whole record that holds the extreme value within each group
    - With max/min you cannot be sure which record the other fields were taken from
    - If you keyBy on a POJO field and then aggregate with max/min, only the aggregated field is updated; the other fields keep their old values, so the emitted object can be inconsistent
    - Use maxBy/minBy when every field of the emitted object should come from the record with the extreme value
    - The problem with max/min:
/**
 * maxBy / max / minBy / min usage demo.
 * @author lixiang
 */
public class FlinkMinMaxDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        //env.setParallelism(1);

        DataStream<VideoOrderDO> ds = env.fromElements(
                new VideoOrderDO(5, 32, "java", "2123143432", new Date()),
                new VideoOrderDO(25, 40, "spring", "2123143432", new Date()),
                new VideoOrderDO(45, 60, "springBoot", "2233143432", new Date()),
                new VideoOrderDO(15, 29, "springBoot", "2125643432", new Date()),
                new VideoOrderDO(54, 67, "java", "2129843432", new Date()),
                new VideoOrderDO(59, 89, "java", "2120943432", new Date()));

        SingleOutputStreamOperator<VideoOrderDO> out = ds.keyBy(new KeySelector<VideoOrderDO, String>() {
            @Override
            public String getKey(VideoOrderDO value) throws Exception {
                return value.getTitle();
            }
        }).max("money");
        out.print();

        env.execute("max job");
    }
}
- maxBy/minBy do not have this problem:
/**
 * maxBy / max / minBy / min usage demo.
 * @author lixiang
 */
public class FlinkMinMaxDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        //env.setParallelism(1);

        DataStream<VideoOrderDO> ds = env.fromElements(
                new VideoOrderDO(5, 32, "java", "2123143432", new Date()),
                new VideoOrderDO(25, 40, "spring", "2123143432", new Date()),
                new VideoOrderDO(45, 60, "springBoot", "2233143432", new Date()),
                new VideoOrderDO(15, 29, "springBoot", "2125643432", new Date()),
                new VideoOrderDO(54, 67, "java", "2129843432", new Date()),
                new VideoOrderDO(59, 89, "java", "2120943432", new Date()));

        SingleOutputStreamOperator<VideoOrderDO> out = ds.keyBy(new KeySelector<VideoOrderDO, String>() {
            @Override
            public String getKey(VideoOrderDO value) throws Exception {
                return value.getTitle();
            }
        }).maxBy("money");
        out.print();

        env.execute("max job");
    }
}
7.Flink Sliding and Tumbling Time Windows and Triggers
7.1.Window Introduction and Use Cases
    - Background
        - A data stream is produced continuously and without end, yet the business needs aggregated statistics, e.g. every 10 s compute the clicks or turnover of the last 5 minutes
        - Windows split the unbounded stream into finite-sized "buckets" so the program can compute over the data inside each window
        - Think of a window as a bucket: each window span is one bucket, e.g. 8:00-9:00 is one bucket and 9:00-10:00 is another
    - Classification
        - Time windows: the window is defined by a time rule
            - time-sliding-window: sliding time window
            - time-tumbling-window: tumbling time window
        - Session windows: aggregate the data that belongs to one session (a short sketch follows this list)
        - Count windows: the window is defined by a number of elements
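Session windows are not revisited in the later examples, so a minimal sketch of declaring one (the keyed VideoOrderDO stream matches the later examples; the 30-second gap is an arbitrary value chosen for illustration):

// Session window: for each title, a new window starts whenever no order arrives for 30 seconds
env.addSource(new VideoOrderSource())
        .keyBy(VideoOrderDO::getTitle)
        .window(ProcessingTimeSessionWindows.withGap(Time.seconds(30)))
        .sum("money")
        .print("money per session");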
(1) Window properties
    - Sliding windows
        - The window has a fixed size
        - Window data overlaps
        - Example: every 10 s, count the orders of the last 1 minute
    - Tumbling windows
        - The window has a fixed size
        - Window data does not overlap
        - Example: every 10 s, count the orders of the last 10 s
(2) Window size and slide interval
    - Tumbling window: size = slide, e.g. every 10 s aggregate the data of the last 10 s
    - Sliding window: size > slide, e.g. every 5 s aggregate the data of the last 10 s
7.2.Window API and Usage Flow
(1) Which window API to use
    - With keyBy, use the window() API
    - Without keyBy, use the windowAll() API; it runs as a single, non-parallel task (a short sketch follows this list)
    - A window's interval is left-closed, right-open
    - countWindow is not deprecated, but timeWindow was deprecated in 1.12; use window() with an explicit assigner instead
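A minimal sketch of the non-keyed variant mentioned above (assuming the VideoOrderSource used throughout this chapter):

// windowAll: no keyBy, so all records pass through one non-parallel window operator
env.addSource(new VideoOrderSource())
        .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(10)))
        .sum("money")
        .print("total money every 10s");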
(2) Window assigners
    - Define how elements are assigned to windows, i.e. they route every record to the correct window
    - window() takes a WindowAssigner; Flink itself ships assigners such as the Tumbling and Sliding ones
(3) Window triggers
    - A trigger controls when a window fires
    - Every WindowAssigner has a default trigger; custom triggers are also supported
(4) Window functions: operate on the data inside a window (a combined sketch of both styles follows this list)
    - Incremental aggregation functions
      aggregate(aggFunction, new WindowFunction(){ ... })
        - The window keeps an intermediate accumulator; every new element is folded into it to produce a new intermediate value, which is stored back in the window
        - Common incremental aggregation functions: ReduceFunction, AggregateFunction
        - min, max and sum are simple aggregations that need no custom logic
      AggregateFunction<IN, ACC, OUT>: IN is the input type, ACC the intermediate accumulator type, OUT the output type; it aggregates the data of the current window
    - Full-window functions
      apply(new WindowFunction(){ ... }) / process(new ProcessWindowFunction(){ ... })
        - The window first buffers all of its elements; the computation only runs once the window's data is complete and the trigger fires
        - Common full-window functions: WindowFunction (may be deprecated in the future) and ProcessWindowFunction (has access to the window context and therefore more information)
      WindowFunction<IN, OUT, KEY, W extends Window>: IN is the input type, OUT the output type, KEY the key type, W the window type
    - When you need the lowest-level per-element API, use process():
      process(new KeyedProcessFunction(){ processElement, onTimer })
        - It handles every element individually, roughly equivalent to map + flatMap + filter combined
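A combined sketch of the two window-function styles described above, reusing the VideoOrderDO/VideoOrderSource from earlier sections; the average-price computation and the Tuple2 accumulator are made-up examples, and imports are omitted as in the other listings:

public class FlinkWindowFunctionSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        KeyedStream<VideoOrderDO, String> keyedStream = env.addSource(new VideoOrderSource())
                .keyBy(new KeySelector<VideoOrderDO, String>() {
                    @Override
                    public String getKey(VideoOrderDO value) throws Exception {
                        return value.getTitle();
                    }
                });

        // Incremental aggregation: AggregateFunction<IN, ACC, OUT> computes the average money per title;
        // the accumulator is a Tuple2<total money, count>.
        // A custom trigger could be attached between window() and aggregate(), e.g. .trigger(CountTrigger.of(100)).
        keyedStream
                .window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
                .aggregate(new AggregateFunction<VideoOrderDO, Tuple2<Integer, Integer>, Double>() {
                    @Override
                    public Tuple2<Integer, Integer> createAccumulator() {
                        return new Tuple2<>(0, 0);
                    }

                    @Override
                    public Tuple2<Integer, Integer> add(VideoOrderDO value, Tuple2<Integer, Integer> acc) {
                        return new Tuple2<>(acc.f0 + value.getMoney(), acc.f1 + 1);
                    }

                    @Override
                    public Double getResult(Tuple2<Integer, Integer> acc) {
                        return acc.f1 == 0 ? 0d : (double) acc.f0 / acc.f1;
                    }

                    @Override
                    public Tuple2<Integer, Integer> merge(Tuple2<Integer, Integer> a, Tuple2<Integer, Integer> b) {
                        return new Tuple2<>(a.f0 + b.f0, a.f1 + b.f1);
                    }
                })
                .print("avg money per title");

        // Full-window function: ProcessWindowFunction buffers the window and can read its context (start/end time)
        keyedStream
                .window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
                .process(new ProcessWindowFunction<VideoOrderDO, String, String, TimeWindow>() {
                    @Override
                    public void process(String key, Context context, Iterable<VideoOrderDO> elements, Collector<String> out) {
                        int count = 0;
                        for (VideoOrderDO ignored : elements) {
                            count++;
                        }
                        out.collect(key + " window [" + context.window().getStart() + ", " + context.window().getEnd() + ") count=" + count);
                    }
                })
                .print("orders per window");

        env.execute("window function sketch");
    }
}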
7.3.Tumbling Time Windows
    - Tumbling windows
        - The window has a fixed size
        - Window data does not overlap
        - E.g. with a 5-minute tumbling window, the unbounded stream is split by time into the windows [0:00, 0:05), [0:05, 0:10), [0:10, 0:15), and so on
    - Code example
/**
 * Tumbling-window demo.
 * @author lixiang
 */
public class FlinkTumblingDemo {
    public static void main(String[] args) throws Exception {
        // build the environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        DataStream<VideoOrderDO> ds = env.addSource(new VideoOrderSource());

        KeyedStream<VideoOrderDO, String> keyedStream = ds.keyBy(new KeySelector<VideoOrderDO, String>() {
            @Override
            public String getKey(VideoOrderDO value) throws Exception {
                return value.getTitle();
            }
        });

        // every 5 s, sum the money of the last 5 s per title
        SingleOutputStreamOperator<Map<String, Object>> map = keyedStream
                .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
                .sum("money")
                .map(new MapFunction<VideoOrderDO, Map<String, Object>>() {
                    @Override
                    public Map<String, Object> map(VideoOrderDO value) throws Exception {
                        Map<String, Object> map = new HashMap<>();
                        map.put("title", value.getTitle());
                        map.put("money", value.getMoney());
                        map.put("createDate", TimeUtil.toDate(value.getCreateTime()));
                        return map;
                    }
                });
        map.print();

        env.execute("Tumbling Window job");
    }
}
7.4.Sliding Time Windows
    - Sliding windows
        - The window has a fixed size
        - Window data overlaps
        - Example: every 5 s count the orders of the last 20 s
/**
 * Sliding-window demo.
 * @author lixiang
 */
public class FlinkSlidingDemo {
    public static void main(String[] args) throws Exception {
        // build the environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        DataStream<VideoOrderDO> ds = env.addSource(new VideoOrderSource());

        KeyedStream<VideoOrderDO, String> keyedStream = ds.keyBy(new KeySelector<VideoOrderDO, String>() {
            @Override
            public String getKey(VideoOrderDO value) throws Exception {
                return value.getTitle();
            }
        });

        // every 5 s, aggregate the data of the last 20 s
        SingleOutputStreamOperator<Map<String, Object>> map = keyedStream
                .window(SlidingProcessingTimeWindows.of(Time.seconds(20), Time.seconds(5)))
                .sum("money")
                .map(new MapFunction<VideoOrderDO, Map<String, Object>>() {
                    @Override
                    public Map<String, Object> map(VideoOrderDO value) throws Exception {
                        Map<String, Object> map = new HashMap<>();
                        map.put("title", value.getTitle());
                        map.put("money", value.getMoney());
                        map.put("createDate", TimeUtil.toDate(value.getCreateTime()));
                        return map;
                    }
                });
        map.print();

        env.execute("Sliding Window job");
    }
}
7.5.Count Windows
    - Count-based tumbling windows and sliding count windows
    - countWindow(5): within each key group, the window fires once 5 records have accumulated
/**
 * Count-window demo.
 * @author lixiang
 */
public class FlinkWindow1Demo {
    public static void main(String[] args) throws Exception {
        // build the environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        DataStream<VideoOrderDO> ds = env.addSource(new VideoOrderSource());

        KeyedStream<VideoOrderDO, String> keyedStream = ds.keyBy(new KeySelector<VideoOrderDO, String>() {
            @Override
            public String getKey(VideoOrderDO value) throws Exception {
                return value.getTitle();
            }
        });

        // fire once 5 records of the same key have arrived
        SingleOutputStreamOperator<Map<String, Object>> map = keyedStream
                .countWindow(5)
                .sum("money")
                .map(new MapFunction<VideoOrderDO, Map<String, Object>>() {
                    @Override
                    public Map<String, Object> map(VideoOrderDO value) throws Exception {
                        Map<String, Object> map = new HashMap<>();
                        map.put("title", value.getTitle());
                        map.put("money", value.getMoney());
                        map.put("createDate", TimeUtil.toDate(value.getCreateTime()));
                        return map;
                    }
                });
        map.print();

        env.execute("Count Window job");
    }
}
- countWindow(5, 2): every 2 arriving records trigger an aggregation over the most recent 5 records
/**
 * Sliding count-window demo.
 * @author lixiang
 */
public class FlinkWindow1Demo {
    public static void main(String[] args) throws Exception {
        // build the environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        DataStream<VideoOrderDO> ds = env.addSource(new VideoOrderSource());

        KeyedStream<VideoOrderDO, String> keyedStream = ds.keyBy(new KeySelector<VideoOrderDO, String>() {
            @Override
            public String getKey(VideoOrderDO value) throws Exception {
                return value.getTitle();
            }
        });

        // every 2 arriving records, aggregate the most recent 5 records of the same key
        SingleOutputStreamOperator<Map<String, Object>> map = keyedStream
                .countWindow(5, 2)
                .sum("money")
                .map(new MapFunction<VideoOrderDO, Map<String, Object>>() {
                    @Override
                    public Map<String, Object> map(VideoOrderDO value) throws Exception {
                        Map<String, Object> map = new HashMap<>();
                        map.put("title", value.getTitle());
                        map.put("money", value.getMoney());
                        map.put("createDate", TimeUtil.toDate(value.getCreateTime()));
                        return map;
                    }
                });
        map.print();

        env.execute("Count Window job");
    }
}