应用开发

C# 敏感词过滤算法实现

时间:2010-12-5 17:23:32  作者:域名   来源:域名  查看:  评论:0
内容摘要:本文转载自微信公众号「UP技术控」,作者conan 。转载本文请联系UP技术控公众号。敏感词、文字过滤是一个网站必不可少的功能,如何设计一个好的、高效的过滤算法是非常有必要的。在实现文字过滤的算法中,

 

本文转载自微信公众号「UP技术控」,作者conan。转载本文请联系UP技术控公众号。

敏感词、文字过滤是一个网站必不可少的功能,如何设计一个好的、高效的过滤算法是非常有必要的。

在实现文字过滤的算法中,DFA是唯一比较好的实现算法。DFA即Deterministic Finite Automaton,也就是确定有穷自动机,它是通过event和当前的state得到下一个state,即event+state=nextstate。在实现敏感词过滤的算法中,我们必须要减少运算,而在DFA算法中几乎没有什么计算,有的只是状态的转换。

下面看一下在C#下的实现方式。

1、构建敏感词库类

/// <summary>
/// Builds the in-memory lexicon (<c>_memoryLexicon</c>) if it has not been built yet.
/// </summary>
/// <remarks>
/// Every word returned by <c>SensitiveWordBll.GetAllWords()</c> is indexed together with
/// its Traditional Chinese conversion (when that differs from the word itself). Words are
/// bucketed by their first character; the remainder of the word is stored in the bucket.
/// The lexicon field is only assigned once the whole table has been built, so a failed
/// load is not mistaken for a successfully loaded (empty) lexicon on the next call.
/// </remarks>
/// <returns>
/// <c>true</c> if the lexicon is available (already built, or built by this call);
/// <c>false</c> if the word source returned <c>null</c>.
/// </returns>
private bool LoadDictionary()
{
    // Already built: nothing to do.
    if (_memoryLexicon != null)
    {
        return true;
    }

    var words = new SensitiveWordBll().GetAllWords();
    if (words == null)
    {
        return false;
    }

    // Collect each word plus its Traditional Chinese variant.
    var wordList = new List<string>();
    foreach (string word in words)
    {
        // Skip null/empty entries instead of failing on them.
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        wordList.Add(word);
        var chineseWord = Microsoft.VisualBasic.Strings.StrConv(
            word,
            Microsoft.VisualBasic.VbStrConv.TraditionalChinese,
            0);
        if (word != chineseWord)
        {
            wordList.Add(chineseWord);
        }
    }

    // One slot per possible char value. char.MaxValue is the highest index (0xFFFF),
    // so the table needs char.MaxValue + 1 entries to make every char a valid index.
    var lexicon = new WordGroup[char.MaxValue + 1];
    foreach (var word in wordList)
    {
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        var group = lexicon[word[0]];
        if (group == null)
        {
            group = new WordGroup();
            lexicon[word[0]] = group;
        }

        // The first character is the bucket key; only the remainder is stored.
        group.Add(word.Substring(1));
    }

    // Publish only after the table is complete.
    _memoryLexicon = lexicon;
    return true;
}

2、构建敏感词检测类

/// <summary>
/// Tries to match <paramref name="blackWord"/> against <c>_sourceText</c>, starting at the
/// character that follows <c>_cursor</c>.
/// </summary>
/// <remarks>
/// Source characters for which <c>IsChs</c>, <c>IsNum</c> and <c>IsAlphabet</c> are all
/// false are skipped over. After a mismatch, the same word character is retried against
/// up to four further source positions before the match is abandoned.
/// Side effects: <c>_wordlenght</c> is reset to 0 and then incremented for every source
/// position consumed (skipped, retried or matched); <c>_nextCursor</c> is advanced as
/// matching proceeds.
/// </remarks>
/// <param name="blackWord">The characters to match, in order.</param>
/// <returns>
/// <c>true</c> if the last character of <paramref name="blackWord"/> was matched;
/// <c>false</c> otherwise, including when <paramref name="blackWord"/> is empty.
/// </returns>
private bool Check(string blackWord)
{
    // Maximum number of consecutive mismatching source positions tolerated.
    const int maxRetries = 4;

    _wordlenght = 0;
    // Index in the source text of the next character to examine.
    _nextCursor = _cursor + 1;
    var found = false;
    var continueCheck = 0;

    // Match the word one character at a time.
    for (var i = 0; i < blackWord.Length; i++)
    {
        // Number of special characters skipped before the candidate character.
        var offset = 0;

        if (_nextCursor >= _sourceText.Length)
        {
            // The source text ran out before the whole word was matched.
            found = false;
            break;
        }

        // Skip characters that are neither Chinese, numeric nor alphabetic.
        for (var y = _nextCursor; y < _sourceText.Length; y++)
        {
            if (!IsChs(_sourceText[y]) && !IsNum(_sourceText[y]) && !IsAlphabet(_sourceText[y]))
            {
                offset++;
                // Stop skipping once the candidate position would fall off the end.
                if (_nextCursor + offset >= _sourceText.Length)
                {
                    break;
                }

                _wordlenght++;
            }
            else
            {
                break;
            }
        }

        if (_nextCursor + offset >= _sourceText.Length)
        {
            found = false;
            break;
        }

        if (blackWord[i] == _sourceText[_nextCursor + offset])
        {
            found = true;
            continueCheck = 0;
        }
        else if (continueCheck < maxRetries && _nextCursor < _sourceText.Length - 1)
        {
            // Mismatch: stay on the same word character (undo the loop's i++) and
            // try it against the next source position.
            continueCheck++;
            i--;
        }
        else
        {
            found = false;
            break;
        }

        _nextCursor = _nextCursor + 1 + offset;
        _wordlenght++;
    }

    return found;
}
} // closes an enclosing scope that is opened outside this excerpt

3、测试与使用方法

// NOTE(review): the statements down to "return new string(tempString);" form the body of
// the filtering method; its signature is not part of this excerpt. They reference
// sourceText and isFirstCheckedReturn, which are presumably its parameters -- confirm.
_illegalWords = new List<string>();
if (string.IsNullOrEmpty(sourceText) && string.IsNullOrEmpty(_sourceText))
{
    return sourceText;
}

if (!string.IsNullOrEmpty(sourceText))
{
    _sourceText = sourceText;
}

_cursor = 0;
if (!LoadDictionary())
{
    // Lexicon unavailable: return the text unfiltered.
    return _sourceText;
}

var tempString = _sourceText.ToCharArray();
var sourceTextDbc = ToDBC(SourceText);
for (var i = 0; i < SourceText.Length; i++)
{
    // Look up the word group keyed by the current character.
    var group = _memoryLexicon[sourceTextDbc[i]];
    if (group != null)
    {
        for (var z = 0; z < group.Count(); z++)
        {
            string word = group.GetWord(z);

            bool matched;
            if (word.Length == 0)
            {
                // Nothing left to match after the key character, so Check is not
                // called. Reset the match length explicitly so a value left over
                // from an earlier failed Check is not reused.
                _wordlenght = 0;
                matched = true;
            }
            else
            {
                matched = Check(word);
            }

            if (matched)
            {
                if (isFirstCheckedReturn)
                {
                    return null;
                }

                // Record the matched text and mask it in the output buffer.
                var blackword = string.Empty;
                for (var pos = 0; pos < _wordlenght + 1; pos++)
                {
                    blackword += tempString[pos + _cursor].ToString();
                    tempString[pos + _cursor] = ReplaceChar;
                }

                _illegalWords.Add(blackword);
                // Jump past the matched span; the increments below move to the next character.
                _cursor = _cursor + _wordlenght;
                i = i + _wordlenght;
                break;
            }
        }
    }

    _cursor++;
}

return new string(tempString);

// ---- Usage example and rough timing comparison ----
var filter = new SensitiveWordFilter();
filter.SourceText = "dddddd";
var originalText = filter.SourceText;
filter.ResetMemoryLexicon();

// Time the lexicon-based filter. Stopwatch is used because DateTime.Now is too coarse
// for short intervals.
var filterTimer = System.Diagnostics.Stopwatch.StartNew();
var ss = filter.Filter();
filterTimer.Stop();
Console.WriteLine(filterTimer.Elapsed.TotalMilliseconds);
Console.WriteLine(ss);

// Baseline: naive string.Replace over every word in the word list file.
var words = System.IO.File.ReadAllLines(@"D:\Recv\敏感词库大全.txt", System.Text.Encoding.UTF8);
var ssx = originalText;
var replaceTimer = System.Diagnostics.Stopwatch.StartNew();
foreach (var word in words)
{
    if (word.Length > 0)
    {
        // Mask the word with the same number of '*' characters.
        ssx = ssx.Replace(word, new string('*', word.Length));
    }
}

replaceTimer.Stop();
Console.WriteLine(replaceTimer.Elapsed.TotalMilliseconds);
Console.WriteLine(ssx);
copyright © 2025 powered by 益强资讯全景  滇ICP备2023006006号-31sitemap