基于N-最短路径方法的中文词语粗分模型 implementation by python

毕竟要做一个分词系统, 接下来找算法
根据论文 基于N-最短路径方法的中文词语粗分模型, 以及上文的粗分模型, 来实现 n最短路径算法

一图胜千言

  1. 创建 graph, 连接边,
  2. 维护一个栈
  3. 倒序从最后一个节点, 查找 PreNode, 向前查询, 入栈, 取结果
  4. 对于入栈过的 PreNode, 往下移,
  5. 至于最短路径, 就很简单求出来了
    测试结果
    # 语料
    工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作
    # 切词结果/工信处
    处/处女
    女/女干事
    干事
    事
    每/每月
    月/月经
    经
    下/下属
    属
    科/科室
    室
    都
    要
    亲/亲口
    口/口交
    交/交代
    代
    口/口交
    交/交换/交换机
    换/换机
    机
    等
    技/技术/技术性
    术
    性/性器
    器/器件
    件
    的
    安/安装/安装工
    装/装工
    工/工作
    作
    # 最短路径
    sort path:
    [37, 35, 33, 32, 30, 27, 26, 23, 22, 21, 20, 18, 16, 15, 14, 12, 10, 9, 7, 6, 3, 0],
    [37, 35, 33, 32, 30, 27, 26, 23, 22, 21, 20, 18, 16, 15, 14, 12, 10, 9, 7, 6, 3, 2, 1, 0],
    [37, 35, 33, 32, 30, 27, 26, 23, 22, 21, 20, 18, 16, 15, 14, 12, 10, 9, 7, 6, 4, 2, 1, 0]
    # 结果
    工信处/女干事//月经//下属/科室///亲口/交代/2/4//交换机//技术性/器件//安装/工作
    
    可以看到消除歧义方面还是很欠缺的, 比如 每/月经/过, 23333

上代码:

#-*- encoding:utf-8 -*-
from __future__ import print_function

import sys
import copy
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except:
    pass

import codecs
from ac import Trie
class Node(object):
  """docstring for Node"""
  def __init__(self, value):
    super(Node, self).__init__()
    self.value = value
    self.pre = None


# 加载字典
print('read dict...')
string = '工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作'
# string = '他说的确实在理'
extra_dict = codecs.open('./extra_dict/dict.txt.big', 'r', 'utf-8').read()
# 暂时取4000个 节省计算资源
extra_dict = extra_dict.split('\n')
extra_dict = [x.split(' ')[0] for x in extra_dict]
print('dict length:', len(extra_dict))


# 查找词语
trie = Trie(extra_dict=extra_dict)
trie.buildac()
result = trie.search(string)
queue = [''] * len(string)

for i in result:
  if len(queue[i.start]) > 0:
    queue[i.start] += '/'
  queue[i.start] += i.value
print('\n'.join(queue))

class Vertex(object):
  def __init__(self, id):
    self.id = id
    self.pre_nodes = []
    self.current_index = 0
  def __le__(self, other):
    return self.id <= other.id
  def __eq__(self, other):
    return self.id == other.id
  def __hash__(self):
    return self.id
  def __str__(self):
    return str(self.id)
  def add_pre(self, from_pre):
    self.pre_nodes.append(from_pre)
    self.pre_nodes = sorted(self.pre_nodes)
  def get_current_node(self):
    return self.pre_nodes[self.current_index]
  def pop_pre(self):
    if self.current_index>= len(self.pre_nodes)-1:
      return
    else :
      self.current_index+=1
      return self.pre_nodes[self.current_index]
  def has_pre(self):
    if self.current_index >= len(self.pre_nodes)-1:
      return False
    return True

class Graph(object):
  def __init__(self, sentence, dictionary):
    self.vertexs = {}
    self.size = len(sentence)
    self.sentence = sentence
    self.dictionary = dictionary
    self.edges = []
    self.init_graph()

  def init_graph(self):
    for index in range(self.size+1):
      self.vertexs[index] = Vertex(index)
    for index in range(0, self.size):
      self.add_connect(index, index+1)
    for w in self.dictionary:
      if w.value not in self.sentence:
        continue
      if len(w.value) == 1:
        continue
      start_index = w.start
      end_index = start_index + len(w.value)
      self.add_connect(start_index, end_index)

  def add_connect(self, from_vertex, to_vertex):
    self.edges.append((from_vertex, to_vertex))
    self.vertexs[to_vertex].add_pre(from_vertex)

  def get_last(self):
    return self.size

  def get_vertex(self, vertex_id):
    return self.vertexs[vertex_id]

class NShotPath(object):
  def __init__(self):
    pass

  def seg(self, sentence, dictionary, n):
    if not sentence:
      return sentence
    stack = []
    path = []
    g = Graph(sentence, dictionary)
    last_node = g.get_last()
    stack.append(last_node)
    current_node = last_node
    while stack:
      while current_node!=0:
        first_node = g.get_vertex(current_node).get_current_node()
        stack.append(first_node)
        current_node = first_node
      path.append(copy.deepcopy(stack))
      current_node = stack.pop()
      while True:
        # print(stack)
        current_vertex = g.get_vertex(current_node)
        if not stack:
          break
        if not current_vertex.has_pre() and stack:
          current_node = stack.pop()
          continue
        else:
          stack.append(current_node)
          current_node = current_vertex.pop_pre()
          stack.append(current_node)
          break
    return path

path = NShotPath().seg(string, result, 3)
print('sort path:', path)
cut = []
path = path[0]
index = 0
for i in reversed(path):
  if i == 0:
    continue
  cut.append(string[index:i])
  index = i
print('result:','/'.join(cut))