package com.zorkdata.desensitization.hadoop;

import com.zorkdata.desensitization.config.JobConfig;
import com.zorkdata.desensitization.constans.GeneralConstants;
import com.zorkdata.desensitization.exception.ZorkException;
import com.zorkdata.desensitization.function.DesensitizationFunction;
import com.zorkdata.desensitization.schmea.LogData;
import com.zorkdata.desensitization.utils.DateUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.avro.mapred.AvroInputFormat;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.commons.lang3.StringUtils;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapred.HadoopInputFormat;
import org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

/**
 * @author: LiaoMingtao
 * @date: 2021/2/24
 */
@Slf4j
public class HdfsLogDesensitization implements Serializable {
    private static final long serialVersionUID = -6253583122681202967L;

    private static final String AVRO_OUTPUT_SCHEMA = "avro.output.schema";
    private JobConfig jobConfig;

    public HdfsLogDesensitization initJobConfig(JobConfig jobConfig) {
        this.jobConfig = jobConfig;
        return this;
    }

    public void desensitizationHdfsLog() {
        desensitizationHdfsLog(this.jobConfig);
    }

    public JobConf config(String jobName, String path) {
        JobConf conf = new JobConf(HdfsLogDesensitization.class);
        conf.setJobName(jobName);
        conf.addResource("classpath:" + path + "/core-site.xml");
        conf.addResource("classpath:" + path + "/hdfs-site.xml");
///        conf.addResource("classpath:" + path + "/mapred-site.xml");
        log.info("配置文件加载:"+"classpath:" + path + "/core-site.xml");
        log.info("配置文件加载:"+"classpath:" + path + "/hdfs-site.xml");
        return conf;
    }

    public void desensitizationHdfsLog(JobConfig jobConfig) {
        // 初始化flink job env
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        JobConf jobConfInput = new JobConf();
        if (!StringUtils.isEmpty(jobConfig.getHdfsSourcePath())) {
            jobConfInput = config("jobConfInput", jobConfig.getHdfsSourcePath());
        }
        jobConfInput.set(AVRO_OUTPUT_SCHEMA, jobConfig.getAvroOutputSchema());

        // source部分
        // 1、通过时间获取文件夹信息
        List<String> logFiles = filterHdfsLogFiles(jobConfig.getHdfsSrc(), jobConfig.getHdfsUri(), jobConfig.getHdfsUser());
        String logFileListString = list2String(logFiles);
        HadoopInputFormat<Object, Object> hadoopInputFormat = new HadoopInputFormat<>
                (new AvroInputFormat(), Object.class, Object.class, jobConfInput);
        AvroInputFormat.addInputPaths(hadoopInputFormat.getJobConf(), logFileListString);
        // 2、创建datasource
        DataSource<Tuple2<Object, Object>> hdfsLogInput = env
                .createInput(hadoopInputFormat).setParallelism(jobConfig.getSourceParallelism());
        // transformer部分
        FlatMapOperator<Tuple2<Object, Object>, LogData> flatMapOperator =
                hdfsLogInput.flatMap(new DesensitizationFunction<Tuple2<Object, Object>, LogData>(jobConfig));
        // sink部分
        // 获取目标hdfs的输出目录

        JobConf jobConfOutput = new JobConf();
        if (!StringUtils.isEmpty(jobConfig.getHdfsSinkPath())) {
            jobConfOutput = config("jobConfOutput", jobConfig.getHdfsSinkPath());
        }
        jobConfOutput.set(AVRO_OUTPUT_SCHEMA, jobConfig.getAvroOutputSchema());
        String filePath = jobConfig.getHdfsDest();
        HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat<>(new AvroOutputFormat(), jobConfOutput);
        FileOutputFormat.setOutputPath(jobConfOutput, new Path(filePath));
        // avro序列化算子(.writeAsText("file:///lmt/output"); 本地写入)
        flatMapOperator.map(new MapFunction<LogData, Tuple2<AvroWrapper<LogData>, NullWritable>>() {
                    @Override
                    public Tuple2<AvroWrapper<LogData>, NullWritable> map(LogData value) throws Exception {
                        AvroKey<LogData> key = new AvroKey<>(value);
                        return new Tuple2<>(key, NullWritable.get());
                    }
                }).setParallelism(jobConfig.getTransformerParallelism())
                // .writeAsText("file:///lmt/output", org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE)
                .output(hadoopOutputFormat)
                .setParallelism(jobConfig.getSinkParallelism());
        try {
            env.execute(jobConfig.getJobName());
        } catch (Exception e) {
            log.error(String.valueOf(e));
        }
    }

    /**
     * 获取hdfs日志文件的所有文件路径
     *
     * @param hdfsSrc  hdfs地址 eg: /tmp/
     * @param hdfsUri  hdfs的URI eg: hdfs://cdh-2:8020/
     * @param hdfsUser hfs用户名 eg: hdfs
     * @return hdfs日志文件的所有文件路径
     */
    public List<String> filterHdfsLogFiles(String hdfsSrc, String hdfsUri, String hdfsUser) {
        if (!hdfsSrc.endsWith(GeneralConstants.FILE_SEPARATOR)) {
            hdfsSrc += GeneralConstants.FILE_SEPARATOR;
        }
        String path = hdfsSrc;
        Configuration conf = new Configuration();
        List<String> logFiles = new ArrayList<>();
        FileSystem fileSystem = null;
        List<String> betweenDate = DateUtils.getBetweenDate(jobConfig.getStartTime(), jobConfig.getEndTime());
        List<String> dateList = DateUtils.date2date(betweenDate);
        if (!dateList.isEmpty()) {
            try {
                fileSystem = FileSystem.get(new URI(hdfsUri), conf, hdfsUser);
                for (String item : dateList) {
                    path = hdfsSrc + item;
                    List<String> hdfsLogFiles = null;
                    try {
                        hdfsLogFiles = getHdfsLogFilesByPath(fileSystem, path);
                        logFiles.addAll(hdfsLogFiles);
                    } catch (ZorkException e) {
                        e.printStackTrace();
                        log.error(String.valueOf(e));
                    }
                }
            } catch (IOException e) {
                log.error(String.valueOf(e));
            } catch (InterruptedException e) {
                log.error(String.valueOf(e));
            } catch (URISyntaxException e) {
                log.error(String.valueOf(e));
            } finally {
                if (null != fileSystem) {
                    try {
                        fileSystem.close();
                    } catch (IOException e) {
                        log.error(String.valueOf(e));
                    }
                }
            }
        } else {
            log.warn("{} -- {} 时间段内无数据,请注意时间范围", jobConfig.getStartTime(), jobConfig.getEndTime());
        }
        return logFiles;
    }

    /**
     * 将List按照每组n个元素进行分组
     *
     * @param sourceList 原始list
     * @param n          n个元素
     * @param <T>        泛型
     * @return List<List < T>>
     */
    private <T> List<List<T>> subList(List<T> sourceList, int n) {
        List<List<T>> rsList = new ArrayList<>();
        if (n <= 0) {
            rsList.add(sourceList);
            return rsList;
        }
        int listSize = sourceList.size();
        int groupNum = (sourceList.size() / n) + 1;
        for (int i = 0; i < groupNum; i++) {
            List<T> subList = new ArrayList<>();
            for (int j = i * n; j < (i + 1) * n; j++) {
                if (j < listSize) {
                    subList.add(sourceList.get(j));
                }
            }
            rsList.add(subList);
        }
        if (rsList.get(rsList.size() - 1).isEmpty()) {
            rsList.remove(rsList.size() - 1);
        }
        return rsList;
    }

    /**
     * list<string>转逗号分割的string
     *
     * @param list list<string>
     * @return String
     */
    private String list2String(List<String> list) {
        return String.join(GeneralConstants.COMMA, list);
    }

    /**
     * 通过路径获取
     *
     * @param fileSystem 文件系统
     * @param path       目录路径
     * @return 文件路径下所有文件全路径
     */
    private List<String> getHdfsLogFilesByPath(FileSystem fileSystem, String path) throws ZorkException {
        List<String> logFiles = new ArrayList<>();
        try {
            RemoteIterator<LocatedFileStatus> locatedFileStatusRemoteIterator =
                    fileSystem.listFiles(new Path(path), true);
            while (locatedFileStatusRemoteIterator.hasNext()) {
                LocatedFileStatus next = locatedFileStatusRemoteIterator.next();
                long modificationTime = next.getModificationTime();
                // 根据文件的修改时间做过滤，获取用户指定时间段内的文件
                if (modificationTime > jobConfig.getStartTimestamp()) {
                    Path hdfsFilePath = next.getPath();
                    logFiles.add(hdfsFilePath.toString());
                }
            }
        } catch (IOException e) {
            log.error(String.valueOf(e));
            throw new ZorkException(String.format("IO流异常:%s", e.getMessage()));
        }
        return logFiles;
    }

}
