本文整理汇总了Python中extractor.Extractor.extract_emit_features方法的典型用法代码示例。如果您正苦于以下问题:Python Extractor.extract_emit_features方法的具体用法?Python Extractor.extract_emit_features怎么用?Python Extractor.extract_emit_features使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类extractor.Extractor
的用法示例。
在下文中一共展示了Extractor.extract_emit_features方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Segmentor
# 需要导入模块: from extractor import Extractor [as 别名]
# 或者: from extractor.Extractor import extract_emit_features [as 别名]
#.........这里部分代码省略.........
logging.warning("Max iteration number is not set or in valid state .")
logging.info("set it to default value .")
else :
self.max_iter = max_iter
logging.info("Max iteration is %d ." %(self.max_iter))
def _build_extractor(self):
    """Create the emit-feature extractor, backed by the inner lexicon."""
    self.extractor = Extractor(self.inner_lexicon)
def _build_constrain(self):
    """Create the decoding-constraint component."""
    self.constrain = Constrain()
def _build_decoder(self):
    """Create the decoder component."""
    self.decoder = Decoder()
def _build_inner_lexicon_and_predict_model_from_saving_path(self, model_path):
    """Restore a saved model (including its inner lexicon) for prediction.

    Allocates a fresh ``Model`` and delegates the actual deserialization
    from *model_path* to ``self._load``.
    """
    self.model = Model()
    self._load(model_path)
def _build_training_model(self):
    '''
    Initialize an empty model, then build its emit-feature space, its
    label space, and finally the transition matrix and weight space.

    Requires ``self.extractor`` and ``self.training_unigrams_data`` to be
    set; logs an error and returns early (leaving an empty model) otherwise.
    '''
    #! Init empty model
    logging.info("Initialize an empty model")
    self.model = Model()
    self.model.init_empty_model()
    #! build emit feature space
    logging.info("extract all training instance and build model feature space .")
    # NOTE: the original guard also tested `self.model is None`, which was
    # dead code — self.model was assigned just above. Only the extractor
    # and the training data can actually be missing here.
    if self.extractor is None or self.training_unigrams_data is None:
        logging.error("failed!")
        return
    for atom_line in self.training_unigrams_data:
        emit_feature_list = self.extractor.extract_emit_features(atom_line)
        self.model.add_emit_feature_list2feature_space(emit_feature_list)
    #! build label space
    logging.info("add labels to model label space .")
    self.model.add_labels2label_space((TAG_B, TAG_M, TAG_E, TAG_S))
    #! build feature trans mat and weight
    # (typo fixed: log message previously read "Inlitialize")
    logging.info("Initialize feature transition and weight space .")
    self.model.build_up_model()
def _build_inner_lexicon(self , threshold=1.) :
logging.info("build inner lexicon from training data .")
if self.raw_training_data is None :
logging.error('failed')
return
words_counter = Counter()
for raw_instance in self.raw_training_data :
#! len > 1 to ensure it is a lexicon
unicode_instance = [ WSAtomTranslator.trans_atom_gram_list2unicode_line(atom_instance_gram_list)
for atom_instance_gram_list in raw_instance if len(atom_instance_gram_list) > 1 ]
words_counter.update(unicode_instance)
total_freq = sum(words_counter.viewvalues())
lexicon_list = []
if threshold < 1. :
##! a fast and clearly implementation is using Counter.most_common(N) to return the threshold number words .
##! but it clearly will cause some words were added to lexicon dict while some ohter words with the same freq is cut off at tail . it is bad.
##! So do following logic to keep fair .
##! strategy changed ! the threshold freq is also accepted (orginal , we reject words with the edge frequnce )!
threshold_num = int( total_freq * threshold )
pre_freq = INF
words_has_same_freq = []
freq_counter = 0
for word , freq in words_counter.most_common() :