基于N-最短路径方法的中文词语粗分模型
毕竟要做一个分词系统, 接下来找算法
根据论文 基于N-最短路径方法的中文词语粗分模型
, 以及上文的粗分模型, 来实现 n最短路径
算法
一图胜千言
- 创建 graph, 连接边,
- 维护一个栈
- 倒序从最后一个节点, 查找
PreNode
, 向前查询, 入栈, 取结果 - 对于入栈过的
PreNode
, 往下移, - 至于最短路径, 就很简单求出来了
可以看到消除歧义方面还是很欠缺的, 比如测试结果 # 语料 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作 # 切词结果 工/工信处 处/处女 女/女干事 干事 事 每/每月 月/月经 经 下/下属 属 科/科室 室 都 要 亲/亲口 口/口交 交/交代 代 口/口交 交/交换/交换机 换/换机 机 等 技/技术/技术性 术 性/性器 器/器件 件 的 安/安装/安装工 装/装工 工/工作 作 # 最短路径 sort path: [37, 35, 33, 32, 30, 27, 26, 23, 22, 21, 20, 18, 16, 15, 14, 12, 10, 9, 7, 6, 3, 0], [37, 35, 33, 32, 30, 27, 26, 23, 22, 21, 20, 18, 16, 15, 14, 12, 10, 9, 7, 6, 3, 2, 1, 0], [37, 35, 33, 32, 30, 27, 26, 23, 22, 21, 20, 18, 16, 15, 14, 12, 10, 9, 7, 6, 4, 2, 1, 0] # 结果 工信处/女干事/每/月经/过/下属/科室/都/要/亲口/交代/2/4/口/交换机/等/技术性/器件/的/安装/工作
每/月经/过
, 23333
上代码:
#-*- encoding:utf-8 -*-
from __future__ import print_function
import sys
import copy
try:
reload(sys)
sys.setdefaultencoding('utf-8')
except:
pass
import codecs
from ac import Trie
class Node(object):
"""docstring for Node"""
def __init__(self, value):
super(Node, self).__init__()
self.value = value
self.pre = None
# 加载字典
print('read dict...')
string = '工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作'
# string = '他说的确实在理'
extra_dict = codecs.open('./extra_dict/dict.txt.big', 'r', 'utf-8').read()
# 暂时取4000个 节省计算资源
extra_dict = extra_dict.split('\n')
extra_dict = [x.split(' ')[0] for x in extra_dict]
print('dict length:', len(extra_dict))
# 查找词语
trie = Trie(extra_dict=extra_dict)
trie.buildac()
result = trie.search(string)
queue = [''] * len(string)
for i in result:
if len(queue[i.start]) > 0:
queue[i.start] += '/'
queue[i.start] += i.value
print('\n'.join(queue))
class Vertex(object):
def __init__(self, id):
self.id = id
self.pre_nodes = []
self.current_index = 0
def __le__(self, other):
return self.id <= other.id
def __eq__(self, other):
return self.id == other.id
def __hash__(self):
return self.id
def __str__(self):
return str(self.id)
def add_pre(self, from_pre):
self.pre_nodes.append(from_pre)
self.pre_nodes = sorted(self.pre_nodes)
def get_current_node(self):
return self.pre_nodes[self.current_index]
def pop_pre(self):
if self.current_index>= len(self.pre_nodes)-1:
return
else :
self.current_index+=1
return self.pre_nodes[self.current_index]
def has_pre(self):
if self.current_index >= len(self.pre_nodes)-1:
return False
return True
class Graph(object):
def __init__(self, sentence, dictionary):
self.vertexs = {}
self.size = len(sentence)
self.sentence = sentence
self.dictionary = dictionary
self.edges = []
self.init_graph()
def init_graph(self):
for index in range(self.size+1):
self.vertexs[index] = Vertex(index)
for index in range(0, self.size):
self.add_connect(index, index+1)
for w in self.dictionary:
if w.value not in self.sentence:
continue
if len(w.value) == 1:
continue
start_index = w.start
end_index = start_index + len(w.value)
self.add_connect(start_index, end_index)
def add_connect(self, from_vertex, to_vertex):
self.edges.append((from_vertex, to_vertex))
self.vertexs[to_vertex].add_pre(from_vertex)
def get_last(self):
return self.size
def get_vertex(self, vertex_id):
return self.vertexs[vertex_id]
class NShotPath(object):
def __init__(self):
pass
def seg(self, sentence, dictionary, n):
if not sentence:
return sentence
stack = []
path = []
g = Graph(sentence, dictionary)
last_node = g.get_last()
stack.append(last_node)
current_node = last_node
while stack:
while current_node!=0:
first_node = g.get_vertex(current_node).get_current_node()
stack.append(first_node)
current_node = first_node
path.append(copy.deepcopy(stack))
current_node = stack.pop()
while True:
# print(stack)
current_vertex = g.get_vertex(current_node)
if not stack:
break
if not current_vertex.has_pre() and stack:
current_node = stack.pop()
continue
else:
stack.append(current_node)
current_node = current_vertex.pop_pre()
stack.append(current_node)
break
return path
path = NShotPath().seg(string, result, 3)
print('sort path:', path)
cut = []
path = path[0]
index = 0
for i in reversed(path):
if i == 0:
continue
cut.append(string[index:i])
index = i
print('result:','/'.join(cut))