当前位置:
首页 > temp > 简明python教程 >
-
C# 实现敏感词过滤
该敏感词过滤的实现采用的是 DFA 算法,参考文章:https://blog.csdn.net/chenssy/article/details/26961957
具体实现步骤如下:
第一步,构建 敏感词库(WordsLibrary) 类:
using System.Collections.Generic;
using System.Linq;
using System;

namespace ContentSafe.SensitiveWord
{
    /// <summary>
    /// Sensitive-word library: holds the word list and the character tree built from it.
    /// </summary>
    public class WordsLibrary
    {
        /// <summary>
        /// One node of the word tree.
        /// </summary>
        public class ItemTree
        {
            /// <summary>Character stored at this node.</summary>
            public char Item { get; set; }

            /// <summary>True when the path from the root down to this node spells a complete word.</summary>
            public bool IsEnd { get; set; }

            /// <summary>Child nodes; left as null for nodes that have no children.</summary>
            public List<ItemTree> Child { get; set; }
        }

        /// <summary>
        /// Root of the word tree. The root itself carries the placeholder character 'R';
        /// the first characters of the words are its children.
        /// </summary>
        public ItemTree Library { get; private set; }

        /// <summary>
        /// The sensitive words the tree is built from.
        /// </summary>
        public string[] Words { get; protected set; }

        /// <summary>
        /// Creates a library from whatever <see cref="LoadWords"/> puts into <see cref="Words"/>.
        /// </summary>
        public WordsLibrary()
        {
            LoadWords();
            Init();
        }

        /// <summary>
        /// Creates a library from an explicit word list.
        /// </summary>
        /// <param name="words">
        /// Sensitive words. When null, the words supplied by <see cref="LoadWords"/> (if any) are used.
        /// </param>
        public WordsLibrary(string[] words)
        {
            LoadWords();

            // The tree must be built AFTER the supplied words are stored; building it first
            // (as a chained parameterless constructor would) leaves the tree empty.
            if (words != null)
            {
                Words = words;
            }

            Init();
        }

        /// <summary>
        /// Hook for subclasses to populate <see cref="Words"/>. The base implementation does nothing.
        /// It is invoked from the constructors before the tree is built, so an override runs
        /// before the subclass's own constructor body.
        /// </summary>
        public virtual void LoadWords()
        {
        }

        /// <summary>
        /// Builds <see cref="Library"/> from <see cref="Words"/>, substituting a single empty
        /// word when no words have been set.
        /// </summary>
        private void Init()
        {
            if (Words == null)
            {
                Words = new[] { "" };
            }

            Library = new ItemTree() { Item = 'R', IsEnd = false, Child = CreateTree(Words) };
        }

        /// <summary>
        /// Builds the first level of the word tree (and everything below it).
        /// </summary>
        /// <param name="words">Sensitive words; null or empty entries are skipped.</param>
        /// <returns>The first-level nodes. Never null; empty when there is nothing to index.</returns>
        private List<ItemTree> CreateTree(string[] words)
        {
            List<ItemTree> roots = new List<ItemTree>();
            if (words == null)
            {
                return roots;
            }

            foreach (string word in words)
            {
                if (!string.IsNullOrEmpty(word))
                {
                    Insert(roots, word);
                }
            }

            return roots;
        }

        /// <summary>
        /// Inserts one word into the tree, reusing existing nodes for shared prefixes and
        /// marking the node of the last character as a word end.
        /// </summary>
        /// <param name="roots">First-level node list of the tree.</param>
        /// <param name="word">Word to insert; null or empty words are ignored.</param>
        private static void Insert(List<ItemTree> roots, string word)
        {
            if (string.IsNullOrEmpty(word))
            {
                return;
            }

            List<ItemTree> level = roots;
            ItemTree current = null;

            for (int i = 0; i < word.Length; i++)
            {
                char cha = word[i];

                // Child lists are created lazily so that leaf nodes keep Child == null.
                if (level == null)
                {
                    level = new List<ItemTree>();
                    current.Child = level;
                }

                ItemTree next = level.Find(e => e.Item == cha);
                if (next == null)
                {
                    next = new ItemTree() { Item = cha, IsEnd = false, Child = null };
                    level.Add(next);
                }

                current = next;
                level = current.Child;
            }

            current.IsEnd = true;
        }
    }
}
第二步,构建 敏感词检测(ContentCheck) 类:
using System.Collections.Generic;
using System.Linq;
using System;

namespace ContentSafe.SensitiveWord
{
    /// <summary>
    /// Detects and masks sensitive words in a text using a word tree from <see cref="WordsLibrary"/>.
    /// </summary>
    public class ContentCheck
    {
        /// <summary>
        /// Position and length of one sensitive word found in the scanned text.
        /// </summary>
        private struct MatchSpan
        {
            public readonly int Start;
            public readonly int Length;

            public MatchSpan(int start, int length)
            {
                Start = start;
                Length = length;
            }
        }

        /// <summary>
        /// Text used by the parameterless overloads.
        /// </summary>
        public string Text { private get; set; }

        /// <summary>
        /// Root of the word tree to scan with.
        /// </summary>
        public WordsLibrary.ItemTree Library { private get; set; }

        /// <summary>
        /// Creates a checker with no word tree and no text; set <see cref="Library"/> before use.
        /// </summary>
        public ContentCheck()
        {
        }

        /// <summary>
        /// Creates a checker that scans with the tree of the given library.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <exception cref="ArgumentNullException"><paramref name="library"/> is null.</exception>
        /// <exception cref="ArgumentException">The library has no word tree.</exception>
        public ContentCheck(WordsLibrary library)
        {
            if (library == null)
            {
                throw new ArgumentNullException("library");
            }

            if (library.Library == null)
            {
                throw new ArgumentException("敏感词库未初始化", "library");
            }

            Library = library.Library;
        }

        /// <summary>
        /// Creates a checker for the given library and remembers the text to scan.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <param name="text">Text to scan.</param>
        /// <exception cref="ArgumentNullException">An argument is null.</exception>
        public ContentCheck(WordsLibrary library, string text) : this(library)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text", "检测文本不能为null");
            }

            Text = text;
        }

        /// <summary>
        /// Scans the text left to right and returns the non-overlapping sensitive words found,
        /// in order of position. At each position the longest word in the tree wins; after a
        /// hit the scan resumes right behind it, otherwise it advances by one character.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>Match positions; empty when nothing was found.</returns>
        /// <exception cref="InvalidOperationException">No word tree has been set.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        private List<MatchSpan> FindMatches(string text)
        {
            if (Library == null)
            {
                throw new InvalidOperationException("未设置敏感词库 词树");
            }

            if (text == null)
            {
                throw new ArgumentNullException("text", "检测文本不能为null");
            }

            List<MatchSpan> matches = new List<MatchSpan>();
            int start = 0;
            while (start < text.Length)
            {
                int length = LongestMatchLength(text, start);
                if (length > 0)
                {
                    matches.Add(new MatchSpan(start, length));
                    start += length;
                }
                else
                {
                    start++;
                }
            }

            return matches;
        }

        /// <summary>
        /// Walks the word tree along the text beginning at <paramref name="start"/>.
        /// </summary>
        /// <param name="text">Text being scanned.</param>
        /// <param name="start">Index of the first character to match.</param>
        /// <returns>Length of the longest word starting at that index, or 0 when there is none.</returns>
        private int LongestMatchLength(string text, int start)
        {
            int longest = 0;
            List<WordsLibrary.ItemTree> level = Library.Child;

            for (int pos = start; pos < text.Length && level != null; pos++)
            {
                char cha = text[pos];
                WordsLibrary.ItemTree node = level.Find(e => e.Item == cha);
                if (node == null)
                {
                    break;
                }

                // A node without children is treated as a word end even if IsEnd is not set.
                // Remembering the last end seen keeps a shorter word when a longer
                // candidate sharing its prefix fails further on.
                if (node.IsEnd || node.Child == null)
                {
                    longest = pos - start + 1;
                }

                level = node.Child;
            }

            return longest;
        }

        /// <summary>
        /// Replaces every character of every sensitive word in the text.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <param name="text">Text to scan.</param>
        /// <param name="newChar">Replacement character.</param>
        /// <returns>The masked text, or the original text when nothing was found.</returns>
        public static string SensitiveWordsReplace(WordsLibrary library, string text, char newChar = '*')
        {
            return new ContentCheck(library).SensitiveWordsReplace(text, newChar);
        }

        /// <summary>
        /// Replaces every character of every sensitive word in the text.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <param name="newChar">Replacement character.</param>
        /// <returns>The masked text, or the original text when nothing was found.</returns>
        public string SensitiveWordsReplace(string text, char newChar = '*')
        {
            List<MatchSpan> matches = FindMatches(text);
            if (matches.Count == 0)
            {
                return text;
            }

            char[] chars = text.ToCharArray();
            foreach (MatchSpan match in matches)
            {
                for (int k = match.Start; k < match.Start + match.Length; k++)
                {
                    chars[k] = newChar;
                }
            }

            return new string(chars);
        }

        /// <summary>
        /// Replaces every character of every sensitive word in <see cref="Text"/>.
        /// </summary>
        /// <param name="newChar">Replacement character.</param>
        /// <returns>The masked text, or the original text when nothing was found.</returns>
        /// <exception cref="InvalidOperationException"><see cref="Text"/> has not been set.</exception>
        public string SensitiveWordsReplace(char newChar = '*')
        {
            if (Text == null)
            {
                throw new InvalidOperationException("未设置检测文本");
            }

            return SensitiveWordsReplace(Text, newChar);
        }

        /// <summary>
        /// Lists the distinct sensitive words found in the text.
        /// </summary>
        /// <param name="library">Sensitive-word library.</param>
        /// <param name="text">Text to scan.</param>
        /// <returns>Distinct words in order of first appearance, or null when nothing was found.</returns>
        public static List<string> FindSensitiveWords(WordsLibrary library, string text)
        {
            ContentCheck check = new ContentCheck(library, text);
            return check.FindSensitiveWords();
        }

        /// <summary>
        /// Lists the distinct sensitive words found in the text.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>
        /// Distinct words in order of first appearance, or null when nothing was found
        /// (null rather than an empty list is kept for existing callers).
        /// </returns>
        public List<string> FindSensitiveWords(string text)
        {
            List<MatchSpan> matches = FindMatches(text);
            if (matches.Count == 0)
            {
                return null;
            }

            // Each hit is cut out by its own span, so two words that sit directly next to
            // each other are reported separately instead of being glued together.
            return matches
                .Select(m => text.Substring(m.Start, m.Length))
                .Distinct()
                .ToList();
        }

        /// <summary>
        /// Lists the distinct sensitive words found in <see cref="Text"/>.
        /// </summary>
        /// <returns>Distinct words in order of first appearance, or null when nothing was found.</returns>
        /// <exception cref="InvalidOperationException"><see cref="Text"/> has not been set.</exception>
        public List<string> FindSensitiveWords()
        {
            if (Text == null)
            {
                throw new InvalidOperationException("未设置检测文本");
            }

            return FindSensitiveWords(Text);
        }
    }
}
第三步,测试与使用方法:
// Sensitive words to look for; ready-made word lists can be found and downloaded online.
string[] words = new[] { "敏感词1", "敏感词2", "含有", "垃圾" };

// Build the word library. WordsLibrary can be subclassed: override LoadWords
// to import the words in a custom way.
WordsLibrary library = new WordsLibrary(words);

string text = "在任意一个文本中都可能包含敏感词1、2、3等等,只要含有敏感词都会被找出来,比如:垃圾";

// Create the checker for this library and text.
ContentCheck check = new ContentCheck(library, text);

// List of the sensitive words found in the text.
var list = check.FindSensitiveWords();

// The text with the sensitive words masked.
string str = check.SensitiveWordsReplace();
该实现方案不止这一种使用方法,更多使用方法可自行研究。
栏目列表
最新更新
nodejs爬虫
Python正则表达式完全指南
爬取豆瓣Top250图书数据
shp 地图文件批量添加字段
爬虫小试牛刀(爬取学校通知公告)
【python基础】函数-初识函数
【python基础】函数-返回值
HTTP请求:requests模块基础使用必知必会
Python初学者友好丨详解参数传递类型
如何有效管理爬虫流量?
2个场景实例讲解GaussDB(DWS)基表统计信息估
常用的 SQL Server 关键字及其含义
动手分析SQL Server中的事务中使用的锁
openGauss内核分析:SQL by pass & 经典执行
一招教你如何高效批量导入与更新数据
天天写SQL,这些神奇的特性你知道吗?
openGauss内核分析:执行计划生成
[IM002]Navicat ODBC驱动器管理器 未发现数据
初入Sql Server 之 存储过程的简单使用
SQL Server -- 解决存储过程传入参数作为s
关于JS定时器的整理
JS中使用Promise.all控制所有的异步请求都完
js中字符串的方法
import-local执行流程与node模块路径解析流程
检测数据类型的四种方法
js中数组的方法,32种方法
前端操作方法
数据类型
window.localStorage.setItem 和 localStorage.setIte
如何完美解决前端数字计算精度丢失与数