AC自动机算法实现

由于想试试做个分词系统, 从字典中找出指定字符串 找到了 AC自动机算法 维基百科, 之前没有接触过类似的, 所以不可避免的绕个远路,
先放个图

从 ‘yasherhs’ 中找出 ['say','she' ,'shr','he','her']

# 运行结果
# index end str
1 4 she
3 4 he
4 5 her

关系图:

#!/usr/bin/python
# -*- coding: utf-8 -*-
class Node:
  def __init__(self):
    self.value = None
    self.children = {}  # {char, Node}
    self.count = 0
    self.fail = None

s = ["say","she" ,"shr","he","her"]

def CMP(a, b):
  return b.count - a.count

class Trie:
  def __init__(self):
    self.root = Node()
    self.choose = []

  def insert(self, key, num):
    node = self.root
    for index in range(len(key)):
      i = key[index]
      if i not in node.children:
        newNode = Node()
        newNode.value = i
        node.children[i] = newNode
      node = node.children[i]
      if index == len(key) - 1:
        node.count = num


  def find_node(self, string):
    res_node = self.root
    try:
      for i in string:
        res_node = res_node.children[i]
    except:
      res_node = None
    return res_node

  def buildac(self):
    queue = []
    queue.append(self.root)
    while len(queue) > 0:
      temp = queue.pop()
      p = None
      for k, v in temp.children.items():
        # 指向 root,
        if temp == self.root:
          temp.children[k].fail = self.root
        else:
          # 寻找字母相同节点, 否则指向 root,
          p = temp.fail
          while p != None:
            if k in p.children:
              temp.children[k].fail = p.children[k]
              break
            p = p.fail
          if p is None:
            temp.children[k].fail = self.root
        queue.append(temp.children[k])

  def search(self, content):
    print(content)
    p = self.root

    for i in range(len(content)):
      index = content[i]
      while (index in p.children) == False and p != self.root:
          p = p.fail
      if (index in p.children) == False:
        continue
      p = p.children[index]
      t = p
      while(t != self.root and t.count > 0):
        print(t.count, i, s[t.count])
        t = t.fail




if __name__ == '__main__':
    trie = Trie()
    for i in range(len(s)):
      trie.insert(s[i], i)
    trie.buildac()
    trie.search('yasherhs')

折腾了大半天, 有空了好好唠唠