白名单逻辑优化
正则表达式完善
| package com.zorkdata.datamask.constant; | ||
| /** | ||
| * Description : String constants naming the generic "reg_exp" entry and one entry per category | ||
| * of sensitive data: name, mobile, phone, email, 15-digit and 18-digit ID number, bank card, | ||
| * address, IP and MAC. | ||
| * NOTE(review): presumably these are the configuration keys under which each masking regular | ||
| * expression is looked up; confirm against the code that reads them. | ||
| * | ||
| * @author : wanghaiying (<a href="wanghaiying@zorkdata.com.cn">wanghaiying@zorkdata.com.cn</a>) | ||
| * Date : Create in 2020/10/20 15:32 | ||
| */ | ||
| public interface RegExpConstants { | ||
|
||
| String REG_EXP = "reg_exp"; | ||
| String NAME_REG_EXP = "name"; | ||
| String MOBILE_REG_EXP = "mobile"; | ||
| String PHONE_REG_EXP = "phone"; | ||
| String EMAIL_REG_EXP = "email"; | ||
| String ID15_REG_EXP = "id15"; | ||
| String ID18_REG_EXP = "id18"; | ||
| String BANK_CARD_REG_EXP = "bank_card"; | ||
| String ADDRESS_REG_EXP = "address"; | ||
| String IP_REG_EXP = "ip"; | ||
| String MAC_REG_EXP = "mac"; | ||
| } | ||
| ... | ... | @@ -2,6 +2,8 @@ package com.zorkdata.datamask.domain; |
| import lombok.Data; | ||
| import java.io.Serializable; | ||
| /** | ||
| * @author 谢森 | ||
| * @Description 参数实体类 | ||
| ... | ... | @@ -9,7 +11,10 @@ import lombok.Data; |
| * @Date 2020/10/21 14:33 | ||
| */ | ||
| @Data | ||
| public class HadoopParam { | ||
| public class HDFSLogQueryParam implements Serializable { | ||
|
||
| private static final long serialVersionUID = 1L; | ||
| private String source; | ||
| private String hdfsSrc; | ||
| private String hdfsDest; | ||
| ... | ... | @@ -18,7 +23,7 @@ public class HadoopParam { |
| private Long startTime; | ||
| private Long endTime; | ||
| public HadoopParam(String source, String hdfsSrc, String hdfsDest, String core, String date, Long startTime, | ||
| public HDFSLogQueryParam(String source, String hdfsSrc, String hdfsDest, String core, String date, Long startTime, | ||
| Long endTime) { | ||
| this.source = source; | ||
| this.hdfsSrc = hdfsSrc; | ||
| ... | ... | @@ -28,5 +33,4 @@ public class HadoopParam { |
| this.startTime = startTime; | ||
| this.endTime = endTime; | ||
| } | ||
| } | ||
| ... | ... | @@ -2,9 +2,10 @@ package com.zorkdata.datamask.hadoop; |
| import com.alibaba.fastjson.JSON; | ||
| import com.alibaba.fastjson.TypeReference; | ||
| import com.zorkdata.datamask.constant.ParamConstants; | ||
| import com.zorkdata.datamask.constant.StrConstants; | ||
| import com.zorkdata.datamask.domain.LogData; | ||
| import com.zorkdata.datamask.domain.HadoopParam; | ||
| import com.zorkdata.datamask.domain.HDFSLogQueryParam; | ||
| import com.zorkdata.datamask.domain.TransactionLog; | ||
| import com.zorkdata.datamask.util.DateUtils; | ||
| import com.zorkdata.datamask.util.MaskUtil; | ||
| ... | ... | @@ -38,14 +39,15 @@ import java.io.IOException; |
| import java.net.URI; | ||
| import java.net.URISyntaxException; | ||
| import java.util.ArrayList; | ||
| import java.util.Collections; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| /** | ||
| * @author 谢森 | ||
| * @Description hadoop 文件数据脱敏 | ||
| * @Email xiesen310@163.com | ||
| * @Date 2020/10/21 14:29 | ||
| * Description: hdfs日志文件脱敏 | ||
| * | ||
| * @author: wanghaiying (<a href="wanghaiying@zorkdata.com.cn">wanghaiying@zorkdata.com.cn</a>) | ||
| * Date: Create in 2020/9/23 9:30 | ||
| */ | ||
| public class HadoopMask { | ||
| private static final Logger logger = LoggerFactory.getLogger(HadoopMask.class); | ||
| ... | ... | @@ -61,12 +63,18 @@ public class HadoopMask { |
| env.setParallelism(1); | ||
| JobConf jobConf = new JobConf(); | ||
| jobConf.set("avro.output.schema", TransactionLog.SCHEMA$.toString(true)); | ||
| HadoopParam hadoopParam = ParamUtils.initHadoopConf(conf); | ||
| HDFSLogQueryParam hdfsLogQueryParam = ParamUtils.initHadoopConf(conf); | ||
| ParameterTool parameterTool = ParameterTool.fromMap(conf); | ||
| env.getConfig().setGlobalJobParameters(parameterTool); | ||
| List<String> logFiles = filterHdfsLogFiles(hadoopParam.getHdfsSrc(), hadoopParam.getDate(), | ||
| hadoopParam.getStartTime(), hadoopParam.getEndTime()); | ||
| MaskUtil maskUtil = ParamUtils.initMaskUtil(conf); | ||
| String[] fieldsWhiteListArray = String.valueOf(conf.get(ParamConstants.FIELDS_WHITE_LIST)).trim().split(","); | ||
| ArrayList< String> fieldsWhiteList = new ArrayList<String>(fieldsWhiteListArray.length); | ||
|
||
| Collections.addAll(fieldsWhiteList, fieldsWhiteListArray); | ||
| List<String> logFiles = filterHdfsLogFiles(hdfsLogQueryParam.getHdfsSrc(), hdfsLogQueryParam.getDate(), | ||
| hdfsLogQueryParam.getStartTime(), hdfsLogQueryParam.getEndTime()); | ||
| for (String logFile : logFiles) { | ||
| /** | ||
| ... | ... | @@ -88,14 +96,14 @@ public class HadoopMask { |
| new TypeReference<LogData>() { | ||
| }); | ||
| //根据日志事件的核心信息做过滤 | ||
| if (null != hadoopParam.getCore() && logData.getDimensions().get("hostname").indexOf("c9") > -1) { | ||
| if (null != hdfsLogQueryParam.getCore() && logData.getDimensions().get("hostname").indexOf("c9") > -1 ) { | ||
| //根据日志事件的timestamp做过滤 | ||
| Long timestamp = DateUtils.utc2timestamp(logData.getTimestamp()); | ||
| boolean flag = null != timestamp && timestamp > hadoopParam.getStartTime() | ||
| && timestamp < hadoopParam.getEndTime() || Boolean.TRUE; | ||
| boolean flag = null != timestamp && timestamp > hdfsLogQueryParam.getStartTime() | ||
| && timestamp < hdfsLogQueryParam.getEndTime() || Boolean.TRUE; | ||
| if (flag) { | ||
| Map maskResult = MaskUtil.mask(logData.getNormalFields()); | ||
| Map maskResult = maskUtil.mask(logData.getNormalFields(), fieldsWhiteList); | ||
| logData.setNormalFields(maskResult); | ||
| collector.collect(logData); | ||
| } | ||
| ... | ... | @@ -105,7 +113,7 @@ public class HadoopMask { |
| // 获取目标hdfs的输出目录 | ||
| String logFileName = | ||
| logFile.split(StrConstants.FILE_SEPARATOR)[logFile.split(StrConstants.FILE_SEPARATOR).length - 1]; | ||
| String filePath = hadoopParam.getHdfsSrc() + logFileName.replace(StrConstants.AVRO_SUFFIX, | ||
| String filePath = hdfsLogQueryParam.getHdfsDest() + logFileName.replace(StrConstants.AVRO_SUFFIX, | ||
| StrConstants.EMPTY_STR); | ||
| HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat<>(new AvroOutputFormat(), jobConf); | ||
| FileOutputFormat.setOutputPath(jobConf, new Path(filePath)); | ||
| ... | ... | |
| package com.zorkdata.datamask.util; | ||
| import java.util.ArrayList; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.io.Serializable; | ||
| import java.util.*; | ||
| import java.util.regex.Matcher; | ||
| import java.util.regex.Pattern; | ||
| /** | ||
| * Description : | ||
| * Description: | ||
| * | ||
| * @author : wanghaiying (<a href="wanghaiying@zorkdata.com.cn">wanghaiying@zorkdata.com.cn</a>) | ||
| * Date : Create in 2020/9/23 9:30 | ||
| * RegularExpression | ||
| * @author: wanghaiying (<a href="wanghaiying@zorkdata.com.cn">wanghaiying@zorkdata.com.cn</a>) | ||
| * Date: Create in 2020/9/23 9:30 | ||
| */ | ||
| public class MaskUtil { | ||
| public class MaskUtil implements Serializable { | ||
| private static final long serialVersionUID = 1L; | ||
| public static final int DEFAULT_MAP_CAPACITY = 16; | ||
| private MaskRegexConfig maskRegexConfig; | ||
| /** | ||
| * 姓名正则 | ||
| */ | ||
| static Pattern namePattern = Pattern.compile("([\\u4e00-\\u9fa5]{1,20}|[a-zA-Z\\.\\s]{1,20})"); | ||
| private String nameRegExp; | ||
| /** | ||
| * 手机号正则 | ||
| */ | ||
| static Pattern mobilePattern = Pattern.compile("((13[0-9])|(14[5,7])|(15[0-3,5-9])|(17[0,3,5-8])|(18[0-9])|(147))" + | ||
| "\\d{8}"); | ||
| private String mobileRegExp; | ||
| /** | ||
| * 电话号码正则 | ||
| */ | ||
| static Pattern phonePattern = Pattern.compile("(\\d{3,4}-)?\\d{6,8}"); | ||
| private String phoneRegExp; | ||
| /** | ||
| * 邮箱正则 | ||
| */ | ||
| static Pattern emailPattern = Pattern.compile("\\w+([-+.]\\w+)*@\\w+([-.]\\w+)*\\.\\w+([-.]\\w+)*"); | ||
| private String emailRegExp; | ||
| /** | ||
| * 身份证号码(15位)正则 | ||
| */ | ||
| static Pattern idPattern15 = Pattern.compile("[1-9]\\d{7}((0\\d)|(1[0-2]))(([0|1|2]\\d)|3[0-1])\\d{3}"); | ||
| private String idRegExp15; | ||
| /** | ||
| * 身份证号码(18位)正则 | ||
| */ | ||
| static Pattern idPattern18 = Pattern.compile("[1-9]\\d{5}[1-9]\\d{3}((0\\d)|(1[0-2]))(([0|1|2]\\d)|3[0-1])\\d{3}" + | ||
| "([0-9Xx])"); | ||
| private String idRegExp18; | ||
| /** | ||
| * 银行卡号码正则 | ||
| */ | ||
| private String bankCardRegExp; | ||
| /** | ||
| * 家庭住址正则 | ||
| */ | ||
| static Pattern addressPattern = Pattern.compile("([\\u4E00-\\u9FA5A-Za-z0-9_]+(省|市|区|县|道|路|街|号|弄|条|室)){2,}"); | ||
| private String addressRegExp; | ||
| /** | ||
| * ip地址正则 | ||
| * // static Pattern ipPattern = Pattern.compile("^((\\d|[1-9]\\d|1\\d\\d|2[0-4]\\d|25[0-5]|[*])\\.){3} | ||
| * // (\\d|[1-9]\\d|1\\d\\d|2[0-4]\\d|25[0-5]|[*])$"); | ||
| */ | ||
| static Pattern ipPattern = Pattern.compile("((2[0-4]\\d|25[0-5]|[01]?\\d\\d?)\\.){3}" + | ||
| "(2[0-4]\\d|25[0-5]|[01]?\\d\\d?)"); | ||
| private String ipRegExp; | ||
| /** | ||
| * mac地址正则 | ||
| */ | ||
| static Pattern macPattern = Pattern.compile("([A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}"); | ||
| static List<Pattern> patterns = new ArrayList<Pattern>() {{ | ||
| add(macPattern); | ||
| add(emailPattern); | ||
| add(ipPattern); | ||
| add(namePattern); | ||
| add(idPattern18); | ||
| add(idPattern15); | ||
| add(mobilePattern); | ||
| add(phonePattern); | ||
| add(addressPattern); | ||
| private String macRegExp; | ||
| List<Pattern> patterns = new ArrayList<Pattern>() {{ | ||
|
|
||
| }}; | ||
| public static Map mask(Map map) { | ||
| public MaskUtil(String nameRegExp, String mobileRegExp, String phoneRegExp, String emailRegExp, String idRegExp15, String idRegExp18, String bankCardRegExp, String addressRegExp, String ipRegExp, String macRegExp) { | ||
|
||
| this.nameRegExp = nameRegExp; | ||
| this.mobileRegExp = mobileRegExp; | ||
| this.phoneRegExp = phoneRegExp; | ||
| this.emailRegExp = emailRegExp; | ||
| this.idRegExp15 = idRegExp15; | ||
| this.idRegExp18 = idRegExp18; | ||
| this.bankCardRegExp = bankCardRegExp; | ||
| this.addressRegExp = addressRegExp; | ||
| this.ipRegExp = ipRegExp; | ||
| this.macRegExp = macRegExp; | ||
| } | ||
| public Map mask(Map map, ArrayList whiteList) { | ||
|
||
| patterns.add(Pattern.compile(this.nameRegExp)); | ||
| patterns.add(Pattern.compile(this.macRegExp)); | ||
| patterns.add(Pattern.compile(this.emailRegExp)); | ||
| patterns.add(Pattern.compile(this.ipRegExp)); | ||
| patterns.add(Pattern.compile(this.nameRegExp)); | ||
| patterns.add(Pattern.compile(this.idRegExp18)); | ||
| patterns.add(Pattern.compile(this.idRegExp15)); | ||
| patterns.add(Pattern.compile(this.bankCardRegExp)); | ||
| patterns.add(Pattern.compile(this.mobileRegExp)); | ||
| patterns.add(Pattern.compile(this.phoneRegExp)); | ||
| patterns.add(Pattern.compile(this.addressRegExp)); | ||
| map.forEach((k, v) -> { | ||
| if (!whiteList.contains(k)) { | ||
| String value = v.toString(); | ||
| for (Pattern pattern : patterns) { | ||
| Matcher matcher = pattern.matcher(value); | ||
| ... | ... | @@ -86,25 +110,40 @@ public class MaskUtil { |
| } | ||
| } | ||
| map.put(k, value); | ||
| } else { | ||
| map.put(k, v); | ||
| } | ||
| }); | ||
| return map; | ||
| } | ||
| public static void main(String[] args) { | ||
| MaskUtil maskUtil = new MaskUtil(); | ||
| MaskUtil maskUtil = new MaskUtil("[\\u4e00-\\u9fa5]{1,20}|[a-zA-Z\\\\.\\\\s]{1,20}", "((13[0-9])|(14[5,7])|(15[0-3,5-9])|(17[0,3,5-8])|(18[0-9])|(147))\\d{8}", | ||
| "(\\d{3,4}-)?\\d{6,8}", "\\w+([-+.]\\w+)*@\\w+([-.]\\w+)*\\.\\w+([-.]\\w+)*", "[1-9]\\d{7}((0\\d)|(1[0-2]))(([0|1|2]\\d)|3[0-1])\\d{3}", | ||
| "[1-9]\\d{5}[1-9]\\d{3}((0\\d)|(1[0-2]))(([0|1|2]\\d)|3[0-1])\\d{3}([0-9Xx])", "([1-9]{1})(\\d{11}|\\d{15}|\\d{16}|\\d{17}|\\d{18})", | ||
| "([\u4E00-\u9FA5A-Za-z0-9_]+(省|市|区|县|道|路|街|号|弄|条|室)){2,}", "((2[0-4]\\d|25[0-5]|[01]?\\d\\d?)\\.){3}(2[0-4]\\d|25[0-5]|[01]?\\d\\d?)", | ||
| "([A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}"); | ||
| Map map = new HashMap(DEFAULT_MAP_CAPACITY); | ||
| map.put("姓名", "王海鹰"); | ||
| map.put("身份证号", "372925199008075158"); | ||
| map.put("手机号", "15000101879"); | ||
| map.put("电话", "021-61341606"); | ||
| map.put("邮箱", "wanghaiying@zork.com"); | ||
| map.put("住址", "上海市浦东新区碧波路690号1弄"); | ||
| map.put("住址2", "上海市浦东新区张江微电子港304-2室"); | ||
| map.put("ip地址", "192.168.70.2"); | ||
| map.put("mac地址", "3c-78-43-25-80-bd"); | ||
| map.put("message", "王海鹰,372925199008075158#15000101879"); | ||
| // map.put("身份证号", "372925199008075158"); | ||
|
||
| // map.put("手机号", "15000101879"); | ||
| // map.put("电话", "021-61341606"); | ||
| // map.put("邮箱", "wanghaiying@zork.com"); | ||
| // map.put("住址", "上海市浦东新区碧波路690号1弄"); | ||
| // map.put("住址2", "上海市浦东新区张江微电子港304-2室"); | ||
| // map.put("ip地址", "192.168.70.2"); | ||
| // map.put("mac地址", "3c-78-43-25-80-bd"); | ||
| // map.put("message", "王海鹰,372925199008075158#15000101879"); | ||
|
||
| map.put("messid", "0000011404342B32233DDCDA"); | ||
| System.out.println(maskUtil.mask(map)); | ||
| map.put("bsflag", "0000011404342B32233DDCDA"); | ||
| map.put("normalFields", "13811110000-110101199003075517-上海市浦东新区张江微电子港-zorkdata@163.com-123456789-wanghaiying123-王海鹰-192.168.1.1-00-50-56-C0-00-08-6227002470170278192"); | ||
| String[] fieldsWhiteListArray = "messid,fundid,custid,orgid,brhid,secuid,bankcode,market,ordersno,ordergroup,count,poststr,stkcode,bsflag,orderamt,price,qty,bankcode,tacode,ofcode,transacc,taacc".split(","); | ||
| ArrayList< String> fieldsWhiteList = new ArrayList<String>(fieldsWhiteListArray.length); | ||
|
||
| Collections.addAll(fieldsWhiteList, fieldsWhiteListArray); | ||
| System.out.println(maskUtil.mask(map, fieldsWhiteList)); | ||
|
||
| } | ||
| } | ||
- SonarQube analysis reported 151 issues
  - 🚫 19 critical
  - ⚠ 83 major
  - 🔽 48 minor
  - ℹ 1 info
Watch the comments in this conversation to review them.
Top 30 extra issues
Note: The following issues were found on lines that were not modified in the commit. Because these issues can't be reported as line comments, they are summarized here:
-
🚫 Move constants to a class or enum.📘 -
🚫 Move constants to a class or enum.📘 -
🚫 Add a nested comment explaining why this method is empty, throw an UnsupportedOperationException or complete the implementation.📘 -
🚫 Change this "try" to a try-with-resources. (sonar.java.source not set. Assuming 7 or greater.)📘 -
🚫 Refactor this code to not throw exceptions in finally blocks.📘 -
🚫 Refactor this code to not throw exceptions in finally blocks.📘 -
🚫 Define a constant instead of duplicating this literal "序列化失败" 15 times.📘 -
🚫 Define a constant instead of duplicating this literal " {\n" 7 times.📘 -
🚫 Define a constant instead of duplicating this literal " "type": [\n" 7 times.📘 -
🚫 Define a constant instead of duplicating this literal " "string",\n" 4 times.📘 -
🚫 Define a constant instead of duplicating this literal " "null"\n" 4 times.📘 -
🚫 [Define a constant instead of duplicating this literal " \]\n" 7 times.](https://git.zorkdata.com/wanghaiying/transactionlogmask/blob/aa763a65a73ff81969cf0be645465505d846bd66/src/main/java/com/zorkdata/datamask/util/avro/LogAvroMacroDef.java#L20)📘 -
🚫 Define a constant instead of duplicating this literal " },\n" 6 times.📘 -
🚫 Define a constant instead of duplicating this literal " "null",\n" 3 times.📘 -
🚫 Define a constant instead of duplicating this literal " {\n" 3 times.📘 -
🚫 Define a constant instead of duplicating this literal " "type": "map",\n" 3 times.📘 -
🚫 Define a constant instead of duplicating this literal " }\n" 3 times.📘 -
⚠ Define and throw a dedicated exception instead of using a generic one.📘 -
⚠ Remove this unused "source" private field.📘 -
⚠ Remove this unused "hdfsSrc" private field.📘 -
⚠ Remove this unused "hdfsDest" private field.📘 -
⚠ Remove this unused "core" private field.📘 -
⚠ Remove this unused "date" private field.📘 -
⚠ Remove this unused "startTime" private field.📘 -
⚠ Remove this unused "endTime" private field.📘 -
⚠ Remove this unused "servers" private field.📘 -
⚠ Remove this unused "zookeeper" private field.📘 -
⚠ Remove this unused "topic" private field.📘 -
⚠ Remove this unused "hdfsDest" private field.📘 -
⚠ Remove this unused "core" private field.📘
- ... 107 more
-