package com.zorkdata.desensitization.hadoop;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.TypeReference;
import com.zorkdata.desensitization.avro.AvroSchemaDef;
import com.zorkdata.desensitization.config.RegularExpressions;
import com.zorkdata.desensitization.constans.ConfigConstants;
import com.zorkdata.desensitization.constans.GeneralConstants;
import com.zorkdata.desensitization.function.DesensitizationFunction;
import com.zorkdata.desensitization.schmea.LogData;
import com.zorkdata.desensitization.utils.DateUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroInputFormat;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapred.HadoopInputFormat;
import org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.util.Collector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

/**
 * Flink batch job that reads Avro log files from HDFS, keeps the events whose hostname
 * dimension contains the configured {@code core} value and whose timestamp lies inside the
 * configured time window, passes their normal fields through {@link DesensitizationFunction}
 * and writes the resulting events back to HDFS in Avro format.
 *
 * <p>Typical usage: {@code new HdfsLogDesensitization().initRegular(regex).initConf(conf)
 * .desensitizationHdfsLog()}.
 *
 * <p>The class is {@link Serializable} because the anonymous Flink functions created in
 * {@link #desensitizationHdfsLog(Map)} read instance fields and therefore capture {@code this}.
 *
 * @author: LiaoMingtao
 * @date: 2020/10/26
 */
@Slf4j
public class HdfsLogDesensitization implements Serializable {

    private static final long serialVersionUID = 1L;

    /** JobConf key under which the Avro output schema is stored. */
    private static final String AVRO_OUTPUT_SCHEMA = "avro.output.schema";
    /** Key of the dimension that is matched against {@link #core}. */
    private static final String HOSTNAME = "hostname";
    /**
     * Format strings handed to {@link DesensitizationFunction}. Kept as a plain mutable
     * {@link ArrayList} (no double-brace anonymous subclass) so callee behaviour is unchanged.
     */
    private static final List<String> DATA_FORMATS =
            new ArrayList<>(Arrays.asList(",", ".", "@", "-", ":"));

    private String jobName;
    private int sourceParallelism;
    private int transformerParallelism;
    private int sinkParallelism;
    /** Group size used by {@link #changeList(List)}; values &lt;= 0 mean "a single group". */
    private int maxFileNum;
    private String avroOutputSchema;
    private List<String> fieldsWhiteList;
    private String core;
    private String hdfsUri;
    private String hdfsUser;
    private String hdfsSrc;
    private String hdfsDest;
    private String startTime;
    private String endTime;
    private long startTimestamp;
    private long endTimestamp;
    private Map<String, String> confMap;
    private Map<String, String> regularMap;

    /**
     * Creates an unconfigured instance; call {@link #initRegular(Map)} and
     * {@link #initConf(Map)} before running the job.
     */
    public HdfsLogDesensitization() {
        // Intentionally empty: configuration happens through the init* methods.
    }

    /**
     * Stores the regular-expression configuration later used to build {@link RegularExpressions}.
     *
     * @param regularMap regular-expression configuration
     * @return this instance, for chaining
     */
    public HdfsLogDesensitization initRegular(Map<String, String> regularMap) {
        this.regularMap = regularMap;
        return this;
    }

    /**
     * Reads all job settings from the given configuration map.
     *
     * <p>NOTE: string settings are read through {@link String#valueOf(Object)}, so a missing key
     * yields the literal text {@code "null"} rather than a {@code null} reference. Missing
     * parallelism settings make {@link Integer#parseInt(String)} throw.
     *
     * @param conf job configuration
     * @return this instance, for chaining
     * @throws NumberFormatException if one of the parallelism settings is missing or not an integer
     */
    public HdfsLogDesensitization initConf(Map<String, String> conf) {
        this.jobName = String.valueOf(conf.get(ConfigConstants.JOB_NAME));
        this.sourceParallelism = Integer.parseInt(conf.get(ConfigConstants.SOURCE_PARALLELISM));
        this.transformerParallelism = Integer.parseInt(conf.get(ConfigConstants.TRANSFORMER_PARALLELISM));
        this.sinkParallelism = Integer.parseInt(conf.get(ConfigConstants.SINK_PARALLELISM));
        String[] fieldsWhiteListArray = String.valueOf(conf.get(ConfigConstants.FIELDS_WHITE_LIST))
                .trim().split(GeneralConstants.COMMA);
        this.fieldsWhiteList = new ArrayList<>(Arrays.asList(fieldsWhiteListArray));
        this.avroOutputSchema = new Schema.Parser().parse(AvroSchemaDef.ZORK_LOG_SCHEMA).toString(true);
        this.hdfsUri = String.valueOf(conf.get(ConfigConstants.HDFS_URI)).trim();
        this.hdfsUser = String.valueOf(conf.get(ConfigConstants.HDFS_USER)).trim();
        // Source and destination are configured relative to the HDFS URI.
        this.hdfsSrc = hdfsUri + String.valueOf(conf.get(ConfigConstants.HDFS_SRC)).trim();
        this.hdfsDest = hdfsUri + String.valueOf(conf.get(ConfigConstants.HDFS_DEST)).trim();
        this.core = String.valueOf(conf.get(ConfigConstants.CORE)).trim();
        this.startTime = String.valueOf(conf.get(ConfigConstants.START_TIME));
        this.endTime = String.valueOf(conf.get(ConfigConstants.END_TIME));
        this.startTimestamp = DateUtil.time2Timestamp(startTime);
        this.endTimestamp = DateUtil.time2Timestamp(endTime);
        this.confMap = conf;
        return this;
    }

    /**
     * Runs the job with the configuration previously passed to {@link #initConf(Map)}.
     *
     * @throws Exception if the Flink job fails
     */
    public void desensitizationHdfsLog() throws Exception {
        desensitizationHdfsLog(this.confMap);
    }

    /**
     * Builds and executes the Flink batch job: Avro source on HDFS, filter + desensitization
     * flat-map, Avro sink on HDFS.
     *
     * <p>If no input file is selected the method logs a warning and returns without submitting
     * a job. Job failures are logged with their stack trace and rethrown.
     *
     * @param conf configuration published as Flink global job parameters
     * @throws Exception if the Flink job fails
     */
    public void desensitizationHdfsLog(Map<String, String> conf) throws Exception {
        // Initialise the execution environment and the shared Hadoop job configuration.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        JobConf jobConf = new JobConf();
        jobConf.set(AVRO_OUTPUT_SCHEMA, this.avroOutputSchema);
        ParameterTool parameterTool = ParameterTool.fromMap(conf);
        env.getConfig().setGlobalJobParameters(parameterTool);
        RegularExpressions regularExpressions = new RegularExpressions(this.regularMap);
        DesensitizationFunction desensitizationFunction = new DesensitizationFunction(regularExpressions);

        // Source
        List<String> logFiles = filterHdfsLogFiles(hdfsSrc, hdfsUri, hdfsUser);
        if (logFiles.isEmpty()) {
            // An empty path list cannot be registered as Hadoop input; fail soft with a clear message.
            log.warn("No HDFS log files selected under {} for [{}, {}]; job {} not submitted",
                    hdfsSrc, startTime, endTime, jobName);
            return;
        }
        String logFileListString = list2String(logFiles);
        HadoopInputFormat<Object, Object> hadoopInputFormat = new HadoopInputFormat<>
                (new AvroInputFormat(), Object.class, Object.class, jobConf);
        AvroInputFormat.addInputPaths(hadoopInputFormat.getJobConf(), logFileListString);
        DataSource<Tuple2<Object, Object>> hdfsLogInput = env
                .createInput(hadoopInputFormat).setParallelism(sourceParallelism);

        // Transformer. An anonymous class (not a lambda) is kept so Flink can read the generic types.
        FlatMapOperator<Tuple2<Object, Object>, Object> maskFlatMapOperator =
                hdfsLogInput.flatMap(new FlatMapFunction<Tuple2<Object, Object>, Object>() {
                    @Override
                    public void flatMap(Tuple2<Object, Object> value, Collector<Object> collector) {
                        LogData logData = JSON.parseObject(value.getField(0).toString(),
                                new TypeReference<LogData>() {
                                });
                        // Filter on the host ("core") carried by the event.
                        if (!matchesCore(logData)) {
                            return;
                        }
                        // Filter on the event timestamp.
                        Long timestamp = DateUtil.utc2timestamp(logData.getTimestamp());
                        if (!inTimeWindow(timestamp)) {
                            return;
                        }
                        Map desensitization = desensitizationFunction.
                                desensitization(logData.getNormalFields(), fieldsWhiteList, DATA_FORMATS);
                        logData.setNormalFields(desensitization);
                        collector.collect(logData);
                    }
                }).setParallelism(transformerParallelism);

        // Sink: target directory on HDFS.
        FileOutputFormat.setOutputPath(jobConf, new Path(hdfsDest));
        HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat<>(new AvroOutputFormat(), jobConf);

        // Wrap every event in an AvroKey so the Hadoop Avro output format can serialise it.
        maskFlatMapOperator.map(new MapFunction<Object, Tuple2<AvroWrapper<LogData>, NullWritable>>() {
            @Override
            public Tuple2<AvroWrapper<LogData>, NullWritable> map(Object value) throws Exception {
                AvroKey<LogData> key = new AvroKey<>((LogData) value);
                return new Tuple2<>(key, NullWritable.get());
            }
        }).setParallelism(transformerParallelism).output(hadoopOutputFormat).setParallelism(sinkParallelism);

        try {
            env.execute(jobName);
        } catch (Exception e) {
            // Keep the stack trace and let the caller see the failure instead of hiding it.
            log.error("Flink job {} failed", jobName, e);
            throw e;
        }
    }

    /**
     * Tells whether the event carries a hostname dimension that contains {@link #core}.
     * Records that could not be parsed, or that have no dimensions / no hostname, do not match.
     *
     * @param logData parsed event, may be {@code null}
     * @return {@code true} if the event should be kept by the host filter
     */
    private boolean matchesCore(LogData logData) {
        return null != core
                && null != logData
                && null != logData.getDimensions()
                && null != logData.getDimensions().get(HOSTNAME)
                && logData.getDimensions().get(HOSTNAME).contains(core);
    }

    /**
     * Tells whether the timestamp lies in the inclusive range
     * [{@link #startTimestamp}, {@link #endTimestamp}].
     *
     * @param timestamp event timestamp, may be {@code null}
     * @return {@code true} if the timestamp is non-null and inside the window
     */
    private boolean inTimeWindow(Long timestamp) {
        return null != timestamp && timestamp >= startTimestamp && timestamp <= endTimestamp;
    }

    /**
     * Splits the list into groups of {@link #maxFileNum} entries and joins every group into one
     * comma-separated string.
     *
     * @param list entries to group
     * @return one comma-separated string per group
     */
    private List<String> changeList(List<String> list) {
        List<String> resultList = new ArrayList<>();
        for (List<String> group : subList(list, maxFileNum)) {
            resultList.add(list2String(group));
        }
        return resultList;
    }

    /**
     * Splits a list into consecutive groups of at most {@code n} elements.
     *
     * @param sourceList list to split
     * @param n          group size; when {@code n <= 0} the source list itself is returned as the
     *                   only group
     * @param <T>        element type
     * @return the groups in source order; empty when {@code sourceList} is empty and {@code n > 0}
     */
    private <T> List<List<T>> subList(List<T> sourceList, int n) {
        List<List<T>> rsList = new ArrayList<>();
        if (n <= 0) {
            rsList.add(sourceList);
            return rsList;
        }
        int listSize = sourceList.size();
        for (int from = 0; from < listSize; from += n) {
            // Clamp against the remaining size so the last group may be shorter than n.
            int to = from + Math.min(n, listSize - from);
            rsList.add(new ArrayList<>(sourceList.subList(from, to)));
        }
        return rsList;
    }

    /**
     * Joins the entries with {@link GeneralConstants#COMMA}.
     *
     * @param list entries to join
     * @return the joined string
     */
    private String list2String(List<String> list) {
        return String.join(GeneralConstants.COMMA, list);
    }

    /**
     * Lists the files directly under {@code path} (non-recursive) whose modification time is
     * later than {@link #startTimestamp}.
     *
     * <p>I/O errors are logged and the files collected so far are returned.
     *
     * @param fileSystem file system to query
     * @param path       directory to list
     * @return full paths of the selected files
     */
    private List<String> getHdfsLogFilesByPath(FileSystem fileSystem, String path) {
        List<String> logFiles = new ArrayList<>();
        try {
            RemoteIterator<LocatedFileStatus> files = fileSystem.listFiles(new Path(path), false);
            while (files.hasNext()) {
                LocatedFileStatus status = files.next();
                // Keep only files last modified after the start of the requested period.
                if (status.getModificationTime() > startTimestamp) {
                    logFiles.add(status.getPath().toString());
                }
            }
        } catch (IOException e) {
            log.error("Failed to list HDFS files under {}", path, e);
        }
        return logFiles;
    }

    /**
     * Collects the log files of every date directory between {@link #startTime} and
     * {@link #endTime} below {@code hdfsSrc}.
     *
     * <p>A dedicated {@link FileSystem} instance is opened and closed here. The cached instance
     * returned by {@code FileSystem.get} is shared within the JVM and must not be closed by
     * one of its users.
     *
     * <p>Errors are logged and the files collected so far are returned.
     *
     * @param hdfsSrc  source directory, e.g. {@code hdfs://cdh-2:8020/...}
     * @param hdfsUri  URI of the HDFS cluster
     * @param hdfsUser user used to access HDFS
     * @return full paths of all selected log files
     */
    private List<String> filterHdfsLogFiles(String hdfsSrc, String hdfsUri, String hdfsUser) {
        String baseDir = hdfsSrc.endsWith(GeneralConstants.FILE_SEPARATOR)
                ? hdfsSrc
                : hdfsSrc + GeneralConstants.FILE_SEPARATOR;
        Configuration conf = new Configuration();
        List<String> logFiles = new ArrayList<>();
        List<String> betweenDate = DateUtil.getBetweenDate(startTime, endTime);
        // NOTE(review): date2date presumably converts the dates into directory names - confirm.
        List<String> dateList = DateUtil.date2date(betweenDate);
        try (FileSystem fileSystem = FileSystem.newInstance(new URI(hdfsUri), conf, hdfsUser)) {
            for (String item : dateList) {
                logFiles.addAll(getHdfsLogFilesByPath(fileSystem, baseDir + item));
            }
        } catch (IOException | URISyntaxException e) {
            log.error("Failed to access HDFS at {} as user {}", hdfsUri, hdfsUser, e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can react to it.
            Thread.currentThread().interrupt();
            log.error("Interrupted while connecting to HDFS at {}", hdfsUri, e);
        }
        return logFiles;
    }
}
