package com.zorkdata.datamask.hadoop;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.TypeReference;
import com.zorkdata.datamask.constant.StrConstants;
import com.zorkdata.datamask.domain.LogData;
import com.zorkdata.datamask.domain.HadoopParam;
import com.zorkdata.datamask.domain.TransactionLog;
import com.zorkdata.datamask.util.DateUtils;
import com.zorkdata.datamask.util.MaskUtil;
import com.zorkdata.datamask.util.ParamUtils;
import org.apache.avro.mapred.AvroInputFormat;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapred.HadoopInputFormat;
import org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.util.Collector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Masks sensitive data in Avro-encoded log files stored on HDFS.
 *
 * <p>For every source file selected by {@code filterHdfsLogFiles} a Flink batch job is executed
 * that reads the Avro records, masks their normal fields with {@code MaskUtil.mask} and writes the
 * result back to HDFS in Avro format.
 *
 * @author 谢森
 * @Email xiesen310@163.com
 * @Date 2020/10/21 14:29
 */
public class HadoopMask {
    private static final Logger logger = LoggerFactory.getLogger(HadoopMask.class);

    /**
     * NameNode URI used when listing the source files.
     * NOTE(review): hard-coded cluster address; consider deriving it from the request parameters.
     */
    private static final String HDFS_URI = "hdfs://cdh-2:8020/";

    /** User name the HDFS file system is opened as. */
    private static final String HDFS_USER = "hdfs";

    /** Key of the dimension that carries the host name of a log event. */
    private static final String HOSTNAME_DIMENSION = "hostname";

    /**
     * Substring a host name must contain for the event to be processed.
     * NOTE(review): hard-coded; the configured "core" value is only checked for presence and is
     * not used for matching - confirm whether it was meant to replace this marker.
     */
    private static final String CORE_HOST_MARKER = "c9";

    /** Name under which each Flink job is submitted. */
    private static final String JOB_NAME = "国泰交易日志脱敏job";

    /**
     * Masks the HDFS log files described by the request parameters.
     *
     * <p>One Flink job is executed per source file. A failure of a single job is logged and does
     * not prevent the remaining files from being processed.
     *
     * @param conf request parameters; converted to a {@code HadoopParam} via
     *             {@code ParamUtils.initHadoopConf} and registered as global job parameters
     * @throws Exception if the parameters cannot be initialised or a job cannot be assembled
     */
    public static void maskHdfsLog(Map<String, String> conf) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        JobConf baseJobConf = new JobConf();
        baseJobConf.set("avro.output.schema", TransactionLog.SCHEMA$.toString(true));
        HadoopParam hadoopParam = ParamUtils.initHadoopConf(conf);
        ParameterTool parameterTool = ParameterTool.fromMap(conf);
        env.getConfig().setGlobalJobParameters(parameterTool);

        List<String> logFiles = filterHdfsLogFiles(hadoopParam.getHdfsSrc(), hadoopParam.getDate(),
                hadoopParam.getStartTime(), hadoopParam.getEndTime());
        if (logFiles.isEmpty()) {
            logger.warn("No HDFS log files matched the requested source directory and time range");
        }

        for (String logFile : logFiles) {
            // Every file gets its own JobConf copy. addInputPath() appends to the configuration,
            // so sharing one JobConf would make each later job re-read all earlier files as well.
            JobConf jobConf = new JobConf(baseJobConf);

            // Read the HDFS log file; the Avro input format takes care of deserialization.
            HadoopInputFormat<Object, Object> hadoopInputFormat = new HadoopInputFormat<Object, Object>
                    (new AvroInputFormat(), Object.class, Object.class, jobConf);
            AvroInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(logFile));
            DataSource<Tuple2<Object, Object>> hdfsLogInput = env.createInput(hadoopInputFormat);

            // Masking operator: filters the events and masks the normal fields of the survivors.
            FlatMapOperator<Tuple2<Object, Object>, Object> maskFlatMapOperator =
                    hdfsLogInput.flatMap(new FlatMapFunction<Tuple2<Object, Object>, Object>() {
                        @Override
                        public void flatMap(Tuple2<Object, Object> value, Collector<Object> collector) throws Exception {
                            LogData logData = JSON.parseObject(value.getField(0).toString(),
                                    new TypeReference<LogData>() {
                                    });
                            if (null == logData || null == hadoopParam.getCore()
                                    || null == logData.getDimensions()) {
                                return;
                            }
                            // Filter on the core information of the log event.
                            String hostname = logData.getDimensions().get(HOSTNAME_DIMENSION);
                            if (null == hostname || hostname.indexOf(CORE_HOST_MARKER) < 0) {
                                return;
                            }
                            // Filter on the timestamp of the log event; events without a parsable
                            // timestamp or outside the requested window are dropped.
                            Long timestamp = DateUtils.utc2timestamp(logData.getTimestamp());
                            if (!isWithinRange(timestamp, hadoopParam.getStartTime(), hadoopParam.getEndTime())) {
                                return;
                            }
                            Map maskResult = MaskUtil.mask(logData.getNormalFields());
                            logData.setNormalFields(maskResult);
                            collector.collect(logData);
                        }
                    });

            // Derive the output directory from the source file name without its Avro suffix.
            // NOTE(review): the output is placed under getHdfsSrc(), i.e. next to the source
            // files - confirm that a dedicated destination directory was not intended.
            String[] pathSegments = logFile.split(StrConstants.FILE_SEPARATOR);
            String logFileName = pathSegments[pathSegments.length - 1];
            String filePath = hadoopParam.getHdfsSrc() + logFileName.replace(StrConstants.AVRO_SUFFIX,
                    StrConstants.EMPTY_STR);
            FileOutputFormat.setOutputPath(jobConf, new Path(filePath));
            HadoopOutputFormat<AvroWrapper<LogData>, NullWritable> hadoopOutputFormat =
                    new HadoopOutputFormat<AvroWrapper<LogData>, NullWritable>(
                            new AvroOutputFormat<LogData>(), jobConf);

            // Avro serialization operator: wraps every masked record for the Avro output format.
            maskFlatMapOperator.map(new MapFunction<Object, Tuple2<AvroWrapper<LogData>, NullWritable>>() {
                @Override
                public Tuple2<AvroWrapper<LogData>, NullWritable> map(Object value) throws Exception {
                    AvroWrapper<LogData> key = new AvroKey<LogData>((LogData) value);
                    return new Tuple2<AvroWrapper<LogData>, NullWritable>(key, NullWritable.get());
                }
            }).output(hadoopOutputFormat);

            try {
                env.execute(JOB_NAME);
            } catch (Exception e) {
                // Best effort: keep going with the remaining files, but leave a trace in the log.
                logger.error("Masking job failed for HDFS log file {}", logFile, e);
            }
        }
    }


    /**
     * Lists the HDFS log files whose modification time lies within the requested window.
     *
     * <p>The directory is listed non-recursively. Listing errors are logged and result in the
     * files collected so far (possibly none) being returned.
     *
     * @param hdfs      HDFS source directory
     * @param date      optional sub-directory (date) appended to {@code hdfs}; may be {@code null}
     * @param startTime exclusive lower bound of the modification time; {@code null} means unbounded
     * @param endTime   exclusive upper bound of the modification time; {@code null} means unbounded
     * @return paths of the matching HDFS files, never {@code null}
     */
    private static List<String> filterHdfsLogFiles(String hdfs, String date, Long startTime, Long endTime) {
        List<String> logFiles = new ArrayList<>();
        if (null == hdfs) {
            logger.error("HDFS source directory is not set, no log files can be listed");
            return logFiles;
        }
        String path = hdfs.endsWith(StrConstants.FILE_SEPARATOR) ? hdfs : hdfs + StrConstants.FILE_SEPARATOR;
        if (null != date) {
            path += date;
        }
        try {
            FileSystem fileSystem = FileSystem.get(new URI(HDFS_URI), new Configuration(), HDFS_USER);
            RemoteIterator<LocatedFileStatus> statuses = fileSystem.listFiles(new Path(path), false);
            while (statuses.hasNext()) {
                LocatedFileStatus status = statuses.next();
                // Keep only the files modified within the user-specified time window.
                if (isWithinRange(status.getModificationTime(), startTime, endTime)) {
                    logFiles.add(status.getPath().toString());
                }
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can react to it.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while connecting to HDFS at {}", HDFS_URI, e);
        } catch (IOException | URISyntaxException e) {
            logger.error("Failed to list HDFS log files under {}", path, e);
        }
        return logFiles;
    }

    /**
     * Checks whether a value lies strictly between two optional bounds.
     *
     * @param value     value to test; {@code null} never matches
     * @param startTime exclusive lower bound, or {@code null} for no lower bound
     * @param endTime   exclusive upper bound, or {@code null} for no upper bound
     * @return {@code true} if {@code value} is non-null and inside the open interval
     */
    private static boolean isWithinRange(Long value, Long startTime, Long endTime) {
        if (null == value) {
            return false;
        }
        boolean afterStart = null == startTime || value > startTime;
        boolean beforeEnd = null == endTime || value < endTime;
        return afterStart && beforeEnd;
    }
}
