#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import numpy as np
import math
class mmseg:
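    """MMSEG-style Chinese word segmenter.

    Ambiguities between overlapping dictionary words are resolved by four
    cascading rules: (1) maximum matching over chunks of up to three words,
    (2) largest average word length, (3) smallest variance of word lengths,
    (4) largest sum of log frequencies of single-character words.
    """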
    maxlen = 3    # longest dictionary word considered
    maxchunk = 3  # number of words per candidate chunk
def __init__(self):
self.result = []
def maximum_matching(self,words,dicts,result):
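        """Rule 1 (maximum matching): recursively enumerate every chunk of up
        to maxchunk words starting at the head of words (a word is a single
        character, or a dictionary entry of at most maxlen characters), then
        keep the chunks that cover the most characters. E.g. for the demo
        string 研究生命起源... the surviving chunks are [研究, 生命, 起源]
        and [研究生, 命, 起源] (6 characters each).
        """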
        if len(result) == mmseg.maxchunk or words == '':
            self.rule1.append(result)
            return
        # never scan further ahead than the text that remains
        longest = min(mmseg.maxlen, len(words))
        wlen = 1
        while wlen <= longest:
            # a single character is always a word; longer spans must be in the dictionary
            if wlen == 1 or words[:wlen] in dicts:
                self.maximum_matching(words[wlen:], dicts, result + [words[:wlen]])
            wlen += 1
        # rule 1 proper: keep only the chunks that cover the most characters
        # (this filter also runs redundantly in recursive calls, but only the
        # top-level return value is used)
        total_len = 0
        temp_arr = []
        for arr in self.rule1:
            curr_len = sum(len(i) for i in arr)
            if curr_len > total_len:
                total_len = curr_len
                temp_arr = [arr]
            elif curr_len == total_len:
                temp_arr.append(arr)
        return temp_arr
def largest_average_word_length(self,remain):
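        """Rule 2: keep the chunks with the largest average word length.
        E.g. [研究, 生命] (average 2.0) beats [研, 究, 生命] (average 4/3)."""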
        largest_average = 0.0
        temp_arr = []
        for arr in remain:
            curr_average = float(sum(len(i) for i in arr)) / len(arr)
            if curr_average > largest_average:
                largest_average = curr_average
                temp_arr = [arr]
            elif curr_average == largest_average:
                temp_arr.append(arr)
        return temp_arr
def smallest_variance(self,remain):
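        """Rule 3: keep the chunks whose word lengths vary the least
        (smallest variance, via numpy). E.g. [研究, 生命, 起源] (variance 0)
        beats [研究生, 命, 起源] (variance 2/3)."""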
        smallest_variance = float('inf')
        temp_arr = []
        for arr in remain:
            curr_variance = np.var([len(i) for i in arr])
            if curr_variance < smallest_variance:
                smallest_variance = curr_variance
                temp_arr = [arr]
            elif curr_variance == smallest_variance:
                temp_arr.append(arr)
        return temp_arr
def largest_degree_one_character_words(self,remain):
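        """Rule 4: keep the chunks whose one-character words have the largest
        summed log frequency (their "degree of morphemic freedom"). Frequent
        single characters such as 和 are more plausible as standalone words."""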
        largest_degree = 0
        temp_arr = []
        freqs = {u"和": 10, u"味": 3}  # toy one-character word frequency table
        for arr in remain:
            # only single characters count toward the degree of morphemic freedom
            curr_degree = sum(math.log(freqs[i]) for i in arr if len(i) == 1 and i in freqs)
            if curr_degree > largest_degree:
                largest_degree = curr_degree
                temp_arr = [arr]
            elif curr_degree == largest_degree:
                temp_arr.append(arr)
        return temp_arr
def segmenter(self,words,dicts):
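        """Apply rules 1-4 in order, dropping candidates at each step, then
        commit the first word of the winning chunk and recurse on the rest.
        Note: one recursion level per committed word, so very long inputs can
        hit Python's recursion limit."""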
        if len(words) == 0:
            return self.result
        self.rule1 = []  # scratch list filled by maximum_matching
        candidates = self.maximum_matching(words, dicts, [])
        # apply each tie-breaking rule only while more than one chunk survives
        if len(candidates) > 1:
            candidates = self.largest_average_word_length(candidates)
        if len(candidates) > 1:
            candidates = self.smallest_variance(candidates)
        if len(candidates) > 1:
            candidates = self.largest_degree_one_character_words(candidates)
        best = candidates[0][0]  # first word of the winning chunk
        self.result.append(best)
        return self.segmenter(words[len(best):], dicts)
words=u"研究生命起源真有意義和趣味"
dicts={u"研究",u"研究生",u"生命",u"起源",u"意義",u"和趣",u"趣味"}
ins = mmseg()
result = ins.segmenter(words,dicts)
print "/".join([i.encode("utf8") for i in result])