package com.zorkdata.datamask.kafka;

import com.zorkdata.datamask.domain.HadoopParam;
import com.zorkdata.datamask.domain.KafkaParam;
import com.zorkdata.datamask.util.ParamUtils;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.flink.streaming.connectors.fs.bucketing.DateTimeBucketer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.text.SimpleDateFormat;
import java.time.ZoneId;
import java.util.Date;
import java.util.Map;
import java.util.Properties;

/**
 * Flink streaming job that reads string records from a Kafka topic and writes
 * them to day-partitioned files on HDFS.
 *
 * <p>NOTE(review): despite the class name, no masking transformation is applied
 * to the records in this class; they are forwarded to the sink unchanged.
 *
 * @author Xie Sen
 * @Description Kafka data masking
 * @Email xiesen310@163.com
 * @Date 2020/10/21 14:51
 */
public class KafkaMask {
    /** Name under which the job is submitted to Flink. */
    private static final String JOB_NAME = "国泰交易日志脱敏job";

    /** Kafka consumer group used by the source. */
    private static final String CONSUMER_GROUP_ID = "group1";

    /** Date pattern used to name the HDFS bucket directories (one per day). */
    private static final String BUCKET_DATE_PATTERN = "yyyy-MM-dd";

    /** Time zone used to decide which day a record's bucket belongs to. */
    private static final String BUCKET_ZONE_ID = "Asia/Shanghai";

    /** Maximum size of a part file before rolling: 384 MiB. */
    private static final long BATCH_SIZE_BYTES = 1024L * 1024L * 384L;

    /** Maximum age of a part file before rolling: one hour. */
    private static final long BATCH_ROLLOVER_INTERVAL_MS = 1000L * 60L * 60L;

    /** Shared value (one minute) for the inactive-bucket and async timeouts. */
    private static final long ONE_MINUTE_MS = 60L * 1000L;

    /**
     * Builds and runs the Kafka-to-HDFS job, blocking until the job terminates.
     *
     * <p>The map is converted to a {@link KafkaParam} via
     * {@link ParamUtils#initKafkaConf} and is also registered as the job's
     * global parameters.
     *
     * @param conf request parameters describing the Kafka source and HDFS destination
     * @throws IllegalStateException if the Flink job fails to execute; the original
     *                               failure is attached as the cause
     */
    public static void maskKafkaMsg(Map<String, String> conf) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        KafkaParam kafkaParam = ParamUtils.initKafkaConf(conf);
        env.getConfig().setGlobalJobParameters(ParameterTool.fromMap(conf));

        SingleOutputStreamOperator<String> kafkaRecords = env
                .addSource(new FlinkKafkaConsumer<>(kafkaParam.getTopic(),
                        new SimpleStringSchema(), buildConsumerProperties(kafkaParam)))
                .setParallelism(1);

        // TODO filter records by date, startTime and endTime
        kafkaRecords.addSink(buildHdfsSink(kafkaParam.getHdfsDest()));

        try {
            env.execute(JOB_NAME);
        } catch (Exception e) {
            // Surface the failure to the caller instead of only printing it, so a
            // failed job is not mistaken for a successful run.
            throw new IllegalStateException("Failed to execute Flink job: " + JOB_NAME, e);
        }
    }

    /**
     * Creates the Kafka consumer configuration.
     *
     * <p>All values are stored as strings through {@link Properties#setProperty}:
     * {@link Properties#getProperty} returns {@code null} for non-String values,
     * so settings stored as {@code Boolean}/{@code Integer} objects are invisible
     * to any code that reads them that way.
     */
    private static Properties buildConsumerProperties(KafkaParam kafkaParam) {
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", kafkaParam.getServers());
        props.setProperty("zookeeper.connect", kafkaParam.getZookeeper());
        props.setProperty("group.id", CONSUMER_GROUP_ID);
        props.setProperty("enable.auto.commit", "false");
        props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.setProperty("auto.offset.reset", "earliest");
        props.setProperty("max.poll.records", "1000");
        return props;
    }

    /** Creates the HDFS sink that writes records under {@code basePath}, one directory per day. */
    private static BucketingSink<String> buildHdfsSink(String basePath) {
        BucketingSink<String> hdfsSink = new BucketingSink<>(basePath);
        // One bucket directory per calendar day, evaluated in Shanghai time.
        hdfsSink.setBucketer(new DateTimeBucketer<String>(BUCKET_DATE_PATTERN, ZoneId.of(BUCKET_ZONE_ID)));
        // Roll to a new part file once the current one reaches the size limit ...
        hdfsSink.setBatchSize(BATCH_SIZE_BYTES);
        // ... or once it has been open for the rollover interval.
        hdfsSink.setBatchRolloverInterval(BATCH_ROLLOVER_INTERVAL_MS);
        hdfsSink.setPendingSuffix("ccc");
        hdfsSink.setInactiveBucketThreshold(ONE_MINUTE_MS);
        hdfsSink.setInactiveBucketCheckInterval(ONE_MINUTE_MS);
        hdfsSink.setAsyncTimeout(ONE_MINUTE_MS);
        return hdfsSink;
    }
}
