C# 屏蔽词过滤

摘要:
本文用字典树(Trie)实现屏蔽词过滤:TreeNode 保存单个字符及其子节点;Trie 负责添加屏蔽词,匹配时跳过常见分隔符、把重复字符(如 fuckkkk)一并屏蔽,对英文按单词边界判断(IsWordBegin / IsWordEnd),对非英文逐字符严格匹配,最后把命中的内容替换为 *。完整代码见下文。

参考:https://www.cnblogs.com/kubidemanong/p/10834993.html

/// <summary>
/// A single node of the banned-word trie: one character plus its child nodes.
/// </summary>
public class TreeNode
{
    /// <summary>The character this node represents.</summary>
    public char Char;

    /// <summary>True when the path from the root to this node spells a complete word.</summary>
    public bool IsEnd;

    /// <summary>Scratch slot holding the text index at which a match ended on this node.</summary>
    public int WordEndAt;

    // Children keyed by their character.
    private readonly Dictionary<char, TreeNode> NodeDict;

    /// <summary>Creates a node for <paramref name="c"/> with no children.</summary>
    /// <param name="c">The character stored in the node.</param>
    public TreeNode(char c)
    {
        Char = c;
        IsEnd = false;
        NodeDict = new Dictionary<char, TreeNode>();
    }

    /// <summary>Tells whether this node has a child for <paramref name="ch"/>.</summary>
    /// <param name="ch">The character to look up.</param>
    /// <returns>True if a child exists for the character.</returns>
    public bool ContainChar(char ch)
    {
        return NodeDict.ContainsKey(ch);
    }

    /// <summary>Returns the child for <paramref name="c"/>.</summary>
    /// <param name="c">The character to look up.</param>
    /// <returns>The child node, or null when there is none.</returns>
    public TreeNode GetChild(char c)
    {
        TreeNode child;
        // TryGetValue leaves 'child' as null when the key is missing.
        NodeDict.TryGetValue(c, out child);
        return child;
    }

    /// <summary>Returns the child for <paramref name="ch"/>, creating it first if needed.</summary>
    /// <param name="ch">The character of the child to fetch or create.</param>
    /// <returns>The existing or newly created child node.</returns>
    public TreeNode AddNode(char ch)
    {
        TreeNode insertNode;
        if (!NodeDict.TryGetValue(ch, out insertNode))
        {
            insertNode = new TreeNode(ch);
            NodeDict.Add(ch, insertNode);
        }
        return insertNode;
    }
}
/// <summary>
/// Banned-word filter backed by a trie of <see cref="TreeNode"/>.
/// Words are registered with <see cref="AddNode"/> and masked with '*' by <see cref="Filter"/>.
/// </summary>
public class Trie
{
    /// <summary>Root of the trie; pass it to <see cref="AddNode"/> to register a word.</summary>
    public TreeNode Root { get; }

    private readonly HashSet<char> SkipCharSet;      // characters ignored while matching
    private readonly HashSet<char> SeparateCharSet;  // common word separators

    // NOTE(review): the original literal was damaged (an unescaped quote and a raw line
    // break). This is a reconstruction from the characters that were visible: the
    // punctuation, a backslash, both quote characters and a newline. Confirm the set.
    private const string skipCharList = "`-=[]\\',.·/~!@#$%^&*()_+{}|:\"<>?*\n";

    /// <summary>Creates an empty filter and loads the skip/separator character sets.</summary>
    public Trie()
    {
        Root = new TreeNode(' ');
        SkipCharSet = new HashSet<char>(skipCharList);
        SeparateCharSet = new HashSet<char>(skipCharList);
    }

    /// <summary>
    /// Inserts <paramref name="word"/> below <paramref name="node"/> and marks its last
    /// character as a word end. A null or empty word is ignored.
    /// </summary>
    /// <param name="node">Node to insert under; normally <see cref="Root"/>.</param>
    /// <param name="word">The banned word to register.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="node"/> is null.</exception>
    public void AddNode(TreeNode node, string word)
    {
        if (node == null)
        {
            throw new System.ArgumentNullException(nameof(node));
        }
        if (string.IsNullOrEmpty(word))
        {
            return;
        }

        TreeNode current = node;
        foreach (char raw in word)
        {
            // Filter lower-cases its input, so words are stored lower-cased too;
            // otherwise a word registered with capitals could never match.
            current = current.AddNode(char.ToLowerInvariant(raw));
        }
        current.IsEnd = true;
    }

    // True when the character is ignored during matching.
    private bool IsSkipChar(char c)
    {
        return SkipCharSet.Contains(c);
    }

    // True for ASCII letters a-z / A-Z.
    private bool IsEnglishChar(char ch)
    {
        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    }

    // True when the character is a separator.
    private bool Separator(char ch)
    {
        return SeparateCharSet.Contains(ch);
    }

    // True when position 'pos' starts a word: the start of the text, a non-separator
    // right after a separator, or an English letter right after a non-English character.
    private bool IsWordBegin(string word, int pos)
    {
        if (pos == 0)
        {
            return true;
        }
        if (pos < word.Length)
        {
            char c1 = word[pos - 1];
            char c2 = word[pos];
            return (Separator(c1) && !Separator(c2)) || (!IsEnglishChar(c1) && IsEnglishChar(c2));
        }
        return false;
    }

    // True when position 'pos' ends a word: the end of the text, a non-separator
    // right before a separator, or an English letter right before a non-English character.
    private bool IsWordEnd(string word, int pos)
    {
        if (pos == word.Length - 1)
        {
            return true;
        }
        if (pos < word.Length)
        {
            char c1 = word[pos];
            char c2 = word[pos + 1];
            return (!Separator(c1) && Separator(c2)) || (IsEnglishChar(c1) && !IsEnglishChar(c2));
        }
        return false;
    }

    // Scans forward from 'begin', whose character already led to node 'first', and
    // returns the index of the last character of the longest banned word found,
    // or -1 when nothing matched.
    private int FindMatchEnd(string text, int begin, TreeNode first)
    {
        TreeNode current = first;
        TreeNode lastMatch = null;
        int matchEnd = -1;

        // A one-character banned word is complete at the starting position.
        if (current.IsEnd)
        {
            lastMatch = current;
            matchEnd = begin;
        }

        char previous = text[begin];
        for (int index = begin + 1; index < text.Length; ++index)
        {
            char ch = text[index];
            bool skippable = IsSkipChar(ch);
            TreeNode child = current.GetChild(ch);

            if (child != null)
            {
                current = child;
                // Skip characters may be part of a banned word but never end a match.
                if (!skippable && current.IsEnd)
                {
                    lastMatch = current;
                    matchEnd = index;
                }
            }
            else if (!skippable)
            {
                if (previous != ch)
                {
                    break;
                }
                // Repeated character, e.g. "fuccccccck": tolerate it. The match end only
                // moves when the repeat directly follows the last complete word.
                if (ReferenceEquals(current, lastMatch))
                {
                    matchEnd = index;
                }
            }
            // A skip character with no matching child is simply passed over.

            previous = ch;
        }

        return matchEnd;
    }

    /// <summary>
    /// Returns a copy of <paramref name="filterWord"/> in which every banned word is
    /// replaced by '*' characters. Matching is case-insensitive.
    /// </summary>
    /// <param name="filterWord">The text to filter.</param>
    /// <returns>The masked text; a null or empty input is returned unchanged.</returns>
    public string Filter(string filterWord)
    {
        if (string.IsNullOrEmpty(filterWord))
        {
            return filterWord;
        }

        string word = filterWord.ToLowerInvariant();
        StringBuilder result = new StringBuilder(filterWord);

        int begin = 0;
        while (begin < word.Length)
        {
            char ch = word[begin];

            // Strict matching tests every position as a possible start of a banned word;
            // otherwise, as for English, only whole words are tested.
            // Under strict matching, if [av] is banned then [avoid] would become [**oid].
            bool isStrict = !IsEnglishChar(ch);
            bool isWordBegin = isStrict || IsWordBegin(word, begin);

            if (isWordBegin && !IsSkipChar(ch))
            {
                TreeNode first = Root.GetChild(ch);
                if (first != null)
                {
                    int matchEnd = FindMatchEnd(word, begin, first);

                    // The match must stop at a word end: "have" contains "av",
                    // but not at a word end, so it is not a banned word.
                    if (matchEnd >= 0 && (isStrict || IsWordEnd(word, matchEnd)))
                    {
                        for (int i = begin; i <= matchEnd; ++i)
                        {
                            result[i] = '*';
                        }
                        begin = matchEnd;
                    }
                }
            }
            ++begin;
        }

        return result.ToString();
    }
}

测试用例:

 /// <summary>
 /// Console demo: registers a few banned words and prints three filtered samples.
 /// </summary>
 class Program
 {
     static void Main(string[] args)
     {
         Trie trie = new Trie();
         trie.AddNode(trie.Root, "fuc");
         trie.AddNode(trie.Root, "fuc bitch");
         trie.AddNode(trie.Root, "fuck");
         trie.AddNode(trie.Root, "bitch");
         trie.AddNode(trie.Root, "屠杀");

         // Repeated letters, and a tab character inside a word.
         Console.WriteLine(trie.Filter("是哦fuckkkkkk山大的撒bi\tch"));
         // "have" contains no banned word that ends on a word boundary.
         Console.WriteLine(trie.Filter("have world fuc bitch"));
         // A banned word surrounded by digits.
         Console.WriteLine(trie.Filter("1218fuck1"));

         // Keep the console window open until a key is pressed.
         Console.ReadKey();
     }
 }

结果:

C# 屏蔽词过滤第1张

免责声明:文章转载自《C# 屏蔽词过滤》仅用于学习参考。如对内容有疑问,请及时联系本站处理。

上篇:Vs 2008 解决方案的目录结构设置和管理(转) | 下篇:linux下mysql的大小写是否区分设置

宿迁高防,2C2G15M,22元/月;香港BGP,2C5G5M,25元/月 雨云优惠码:MjYwNzM=

相关文章

C#.NET 中的类型转换

C# 出来也有些日子了,最近由于编程的需要,对 C# 的类型转换做了一些研究,其内容涉及 C# 的装箱/拆箱/别名、数值类型间相互转换、字符的 ASCII 码和 Unicode 码、数值字符串和数值之间的转换、字符串和字符数组/字节数组之间的转换、各种数值类型和字节数组之间的转换、十六进制数输出以及日期型数据的一些转换处理,在这里与大家分享—— 1. 装箱...

【转】SSE4.1指令集系列之一

转自:http://www.cnblogs.com/celerychen/archive/2013/04/02/2995586.html 本文要介绍的是SSE4.1指令集中的几条整数指令及其在视频编码中的应用。 1.单指令32字节差分绝对值求和指令MPSADBW 这条指令类似于SSE的PSADBW,但它实现的功能更强大。包括微软官方网站上对这条指令的说明都...

java中unicode和中文相互转换

public class Test{    public static void main(String[] args)    {        String s = "中转地设置导出模板";        String tt = gbEncoding(s);    }    public static String gbEncoding(final Str...

sql server 数据类型

摘自:http://www.cnblogs.com/zhangwei595806165/archive/2012/02/23/2364746.html 1. 字符数据类型 字符数据类型包括varchar、char、nvarchar、nchar、text以及ntext。这些数据类型用于 存储字符数据。varchar和char类型的主要区别是数据填充。如果有一...

JSONChecker(C语言版本)

/* JSON_checker.h */ typedef struct JSON_checker_struct { int valid; int state; int depth; int top; int* stack; } * JSON_checker; extern JSON_che...

c++ 数组元素拷贝到容器(copy)

#include <iostream> // cout #include <algorithm> // copy #include <vector> // vector using namespace std; int main () { int myints[]={10,20...