Java 敏感词过滤算法
jopen
9年前
1.DFA算法
DFA算法的原理可以参考这里。简单来说,就是通过Map构造出一棵敏感词树:从根节点出发,到任意一个带有结束标记(isEnd 为 true)的节点为止的路径构成一个敏感词(结束节点不一定是叶子节点,例如一个敏感词可以是另一个敏感词的前缀),例如下图:
代码简单实现如下:
public class TextFilterUtil { //日志 private static final Logger LOG = LoggerFactory.getLogger(TextFilterUtil.class); //敏感词库 private static HashMap sensitiveWordMap = null; //默认编码格式 private static final String ENCODING = "gbk"; //敏感词库的路径 private static final InputStream in = TextFilterUtil.class.getClassLoader().getResourceAsStream("sensitive/keyWords.txt"); /** * 初始化敏感词库 */ private static void init() { //读取文件 Set<String> keyWords = readSensitiveWords(); //创建敏感词库 sensitiveWordMap = new HashMap<>(keyWords.size()); for (String keyWord : keyWords) { createKeyWord(keyWord); } } /** * 构建敏感词库 * * @param keyWord */ private static void createKeyWord(String keyWord) { if (sensitiveWordMap == null) { LOG.error("sensitiveWordMap 未初始化!"); return; } Map nowMap = sensitiveWordMap; for (Character c : keyWord.toCharArray()) { Object obj = nowMap.get(c); if (obj == null) { Map<String, Object> childMap = new HashMap<>(); childMap.put("isEnd", "false"); nowMap.put(c, childMap); nowMap = childMap; } else { nowMap = (Map) obj; } } nowMap.put("isEnd", "true"); } /** * 读取敏感词文件 * * @return */ private static Set<String> readSensitiveWords() { Set<String> keyWords = new HashSet<>(); BufferedReader reader = null; try { reader = new BufferedReader(new InputStreamReader(in, ENCODING)); String line; while ((line = reader.readLine()) != null) { keyWords.add(line.trim()); } } catch (UnsupportedEncodingException e) { LOG.error("敏感词库文件转码失败!"); } catch (FileNotFoundException e) { LOG.error("敏感词库文件不存在!"); } catch (IOException e) { LOG.error("敏感词库文件读取失败!"); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } reader = null; } } return keyWords; } /** * 检查敏感词 * * @return */ private static List<String> checkSensitiveWord(String text) { if (sensitiveWordMap == null) { init(); } List<String> sensitiveWords = new ArrayList<>(); Map nowMap = sensitiveWordMap; for (int i = 0; i < text.length(); i++) { Character word = text.charAt(i); Object obj = 
nowMap.get(word); if (obj == null) { continue; } int j = i + 1; Map childMap = (Map) obj; while (j < text.length()) { if ("true".equals(childMap.get("isEnd"))) { sensitiveWords.add(text.substring(i, j)); } obj = childMap.get(text.charAt(j)); if (obj != null) { childMap = (Map) obj; } else { break; } j++; } } return sensitiveWords; } }
2.TTMP算法
TTMP算法由网友原创,关于它的起源可以查看这里。TTMP算法的原理是将敏感词拆分成“脏字”的序列:只有当待比对的字符串完全由“脏字”组成时,才去判断它是否为敏感词,从而减少比对次数。这个算法的简单实现如下:
public class TextFilterUtil { //日志 private static final Logger LOG = LoggerFactory.getLogger(TextFilterUtil.class); //默认编码格式 private static final String ENCODING = "gbk"; //敏感词库的路径 private static final InputStream in = TextFilterUtil.class.getClassLoader().getResourceAsStream("sensitive/keyWords.txt"); //脏字库 private static Set<Character> sensitiveCharSet = null; //敏感词库 private static Set<String> sensitiveWordSet = null; /** * 初始化敏感词库 */ private static void init() { //初始化容器 sensitiveCharSet = new HashSet<>(); sensitiveWordSet = new HashSet<>(); //读取文件 创建敏感词库 readSensitiveWords(); } /** * 读取本地的敏感词文件 * * @return */ private static void readSensitiveWords() { BufferedReader reader = null; try { reader = new BufferedReader(new InputStreamReader(in, ENCODING)); String line; while ((line = reader.readLine()) != null) { String word = line.trim(); sensitiveWordSet.add(word); for (Character c : word.toCharArray()) { sensitiveCharSet.add(c); } } } catch (UnsupportedEncodingException e) { LOG.error("敏感词库文件转码失败!"); } catch (FileNotFoundException e) { LOG.error("敏感词库文件不存在!"); } catch (IOException e) { LOG.error("敏感词库文件读取失败!"); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } reader = null; } } return; } /** * 检查敏感词 * * @return */ private static List<String> checkSensitiveWord(String text) { if (sensitiveWordSet == null || sensitiveCharSet == null) { init(); } List<String> sensitiveWords = new ArrayList<>(); for (int i = 0; i < text.length(); i++) { Character word = text.charAt(i); if (!sensitiveCharSet.contains(word)) { continue; } int j = i; while (j < text.length()) { if (!sensitiveCharSet.contains(word)) { break; } String key = text.substring(i, j + 1); if (sensitiveWordSet.contains(key)) { sensitiveWords.add(key); } j++; } } return sensitiveWords; } }
注:以上代码实现仅用于展示思路,在实际使用中还有很多地方可以优化。