package com.zorkdata.datamask;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.TypeReference;
import com.zorkdata.datamask.constants.Constants;
import com.zorkdata.datamask.domain.LogData;
import com.zorkdata.datamask.domain.TransactionLog;
import com.zorkdata.datamask.util.MaskUtil;
import com.zorkdata.datamask.util.ZorkParameterUtil;
import org.apache.avro.mapred.AvroInputFormat;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapred.HadoopInputFormat;
import org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.flink.streaming.connectors.fs.bucketing.DateTimeBucketer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.ZoneId;
import java.util.*;

/**
 * Description : 国泰交易日志脱敏job
 *
 * @author : wanghaiying (<a href="wanghaiying@zorkdata.com.cn">wanghaiying@zorkdata.com.cn</a>)
 * Date : Create in 2020/9/18 17:35
 */
public class TransactionLogMask {

    private static Logger LOG = LoggerFactory.getLogger(TransactionLogMask.class);
    private static String source = "hdfs";
    private static String hdfsSrc;
    private static String hdfsDest;
    private static String core;
    private static String date;
    private static Long startTime;
    private static Long endTime;

    private static String namePattern = "";

    private static String mobilePattern = "";

    private static String phonePattern = "";

    private static String emailPattern = "";

    public static void main(String[] args) throws Exception {
        if ("hdfs".equals(source)) {
            maskHdfsLog(args);
        } else if ("kafka".equals(source)) {
            maskKafkaMsg(args);
        }
    }

    /**
     * 初始化配置文件
     *
     * @param conf
     */
    private static void initConf(Map conf) {
        source = String.valueOf(conf.get(Constants.SOURCE)).trim();
        hdfsSrc = String.valueOf(conf.get(Constants.HDFS_SRC)).trim();
        hdfsDest = String.valueOf(conf.get(Constants.HDFS_DEST)).trim();
        core = String.valueOf(conf.get(Constants.CORE)).trim();
        date = String.valueOf(conf.get(Constants.DATE)).trim();
        startTime = Long.parseLong(String.valueOf(conf.get(Constants.START_TIME)).trim());
        endTime = Long.parseLong(String.valueOf(conf.get(Constants.END_TIME)).trim());
    }

    /**
     * hdfs日志文件脱敏
     *
     * @param args 请求参数
     * @return void
     */
    public static void maskHdfsLog(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        JobConf jobConf = new JobConf();
        jobConf.set("avro.output.schema", TransactionLog.SCHEMA$.toString(true));
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");

        Map<String, String> conf = ZorkParameterUtil.readParameter(args);
        LOG.info("配置文件: " + conf);
        initConf(conf);

        ParameterTool parameterTool = ParameterTool.fromMap(conf);
        env.getConfig().setGlobalJobParameters(parameterTool);

        List<String> logFiles = filterHdfsLogFiles(hdfsSrc, date, startTime, endTime);
        for (String logFile : logFiles) {
            /**
             * 读取hdfs日志文件，avro反序列化处理
             */
            HadoopInputFormat<Object, Object> hadoopInputFormat = new HadoopInputFormat<Object, Object>
                    (new AvroInputFormat(), Object.class, Object.class, jobConf);
            AvroInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(logFile));
            DataSource<Tuple2<Object, Object>> hdfsLogInput = env.createInput(hadoopInputFormat);
//            hdfsLogInput.print();

            /**
             * 脱敏算子
             */
            FlatMapOperator<Tuple2<Object, Object>, Object> maskFlatMapOperator = hdfsLogInput.flatMap(new FlatMapFunction<Tuple2<Object, Object>, Object>() {
                @Override
                public void flatMap(Tuple2<Object, Object> value, Collector<Object> collector) throws Exception {
                    LogData logData = JSON.parseObject(value.getField(0).toString(), new TypeReference<LogData>() {
                    });
                    //根据日志事件的核心信息做过滤
                    if (null != core && logData.getDimensions().get("hostname").indexOf("c9") > -1) {
                        //根据日志事件的timestamp做过滤
                        Long timestamp = utc2timestamp(logData.getTimestamp());
                        if (null != timestamp && timestamp > startTime && timestamp < endTime || Boolean.TRUE) {
                            Map maskResult = MaskUtil.mask(logData.getNormalFields());
                            logData.setNormalFields(maskResult);
                            collector.collect(logData);
                        }
                    }
                }
            });
//            maskFlatMapOperator.print();
            // 获取目标hdfs的输出目录
            String logFileName = logFile.split("/")[logFile.split("/").length - 1];
            String filePath = hdfsDest + logFileName.replace(".avro", "");
            HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat<>(new AvroOutputFormat(), jobConf);
            FileOutputFormat.setOutputPath(jobConf, new Path(filePath));

            /**
             * avro序列化算子
             */
            maskFlatMapOperator.map(new MapFunction<Object, Tuple2<AvroWrapper<LogData>, NullWritable>>() {
                @Override
                public Tuple2<AvroWrapper<LogData>, NullWritable> map(Object value) throws Exception {
                    AvroKey<LogData> key = new AvroKey<LogData>((LogData) value);
                    Tuple2<AvroWrapper<LogData>, NullWritable> tupple = new Tuple2<AvroWrapper<LogData>, NullWritable>(key, NullWritable.get());
                    return tupple;
                }
            }).output(hadoopOutputFormat);

            try {
                env.execute("国泰交易日志脱敏job");
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * kafka消息数据脱敏
     *
     * @param args 请求参数
     * @return void
     */
    public static void maskKafkaMsg(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        ParameterTool params = ParameterTool.fromArgs(args);

        String servers = params.get("servers");
        String zookeeper = params.get("zookeeper");
        String topic = params.get("topic");
        String hdfsDest = params.get("hdfs-dest");
        String core = params.get("core", "c1");
        String date = params.get("date", sdf.format(new Date()));
        String startTime = params.get("startTime");
        String endTime = params.get("endTime");

        Properties props = new Properties();
        props.put("bootstrap.servers", servers);
        props.put("zookeeper.connect", zookeeper);
        props.put("group.id", "group1");
        props.put("enable.auto.commit", false);
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("auto.offset.reset", "earliest");
        props.put("max.poll.records", 1000);
        SingleOutputStreamOperator<String> dataStreamSource = env.addSource(new FlinkKafkaConsumer<>(topic, new SimpleStringSchema(), props)).setParallelism(1);

        // TODO 根据date、startTime、endTime过滤
        BucketingSink<String> hdfsSink = new BucketingSink<>(hdfsDest);
        //创建一个按照时间创建目录的bucketer,默认是yyyy-MM-dd--HH，时区默认是美国时间。这里我都改了，一天创建一次目录，上海时间
        hdfsSink.setBucketer(new DateTimeBucketer<String>("yyyy-MM-dd", ZoneId.of("Asia/Shanghai")));
        //设置每个文件的最大大小 ,默认是384M(1024 * 1024 * 384)
        hdfsSink.setBatchSize(1024 * 1024 * 384);
        //设置多少时间，就换一个文件写
        hdfsSink.setBatchRolloverInterval(1000 * 60 * 60);
        hdfsSink.setPendingSuffix("ccc");
        hdfsSink.setInactiveBucketThreshold(60 * 1000L);
        hdfsSink.setInactiveBucketCheckInterval(60 * 1000L);
        hdfsSink.setAsyncTimeout(60 * 1000);
        dataStreamSource.addSink(hdfsSink);
        try {
            env.execute("国泰交易日志脱敏job");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * 过滤hdfs日志文件
     *
     * @param hdfs      hdfs地址
     * @param date      日期
     * @param startTime 起始时间
     * @param endTime   结束时间
     * @return hdfs文件列表
     */
    private static List<String> filterHdfsLogFiles(String hdfs, String date, Long startTime, Long endTime) {
        if (!hdfs.endsWith("/")) {
            hdfs += "/";
        }
        String path = hdfs;
        if (null != date) {
            path = hdfs + date;
        }
        Configuration conf = new Configuration();
        List<String> logFiles = new ArrayList<>();
        try {
            FileSystem fileSystem = null;
            try {
                fileSystem = FileSystem.get(new URI("hdfs://cdh-2:8020/"), conf, "hdfs");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            RemoteIterator<LocatedFileStatus> locatedFileStatusRemoteIterator = fileSystem.listFiles(new Path(path), false);
            while (locatedFileStatusRemoteIterator.hasNext()) {
                LocatedFileStatus next = locatedFileStatusRemoteIterator.next();
                long modificationTime = next.getModificationTime();
                // 根据文件的修改时间做过滤，获取用户指定时间段内的文件
                if (modificationTime > startTime && modificationTime < endTime) {
                    Path hdfsFilePath = next.getPath();
                    logFiles.add(hdfsFilePath.toString());
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }
        return logFiles;
    }

    /**
     * UTC时间转
     *
     * @param utcTime UTC时间
     * @return unix时间戳
     */
    public static Long utc2timestamp(String utcTime) {
        SimpleDateFormat utcFormater = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS+08:00");
        utcFormater.setTimeZone(TimeZone.getTimeZone("asia/shanghai"));//时区定义并进行时间获取
        Date gpsUTCDate = null;
        try {
            gpsUTCDate = utcFormater.parse(utcTime);
        } catch (ParseException e) {
            System.out.println("时间戳格式转换异常：" + utcTime + e.getMessage());
            return null;
        }
        return gpsUTCDate.getTime();
    }
}
