This article collects typical usage examples of the Python method tokenizer.Tokenizer.get_token_stat_schema: what the method does, how it is called, and what real code that uses it looks like. For further context you can also look at the enclosing class, tokenizer.Tokenizer.
One code example of the Tokenizer.get_token_stat_schema method is shown below.
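Before the example itself, one orientation note. Judging purely from how the example below uses the return value (an inference, not taken from the tokenizer source), get_token_stat_schema() returns a dict that maps counter names to zero; TextParser then accumulates into this dict as texts are parsed. A minimal sketch of that assumed contract:

# Hypothetical sketch of the assumed contract -- the key names are inferred
# from how the example below reads them, not from the tokenizer module itself.
class Tokenizer(object):
    def get_token_stat_schema(self):
        return {
            'token_cnt': 0,       # total tokens seen
            'bigram_cnt': 0,      # total bigrams seen
            'sentence_cnt': 0,    # total sentences seen
            'senti_sentence': 0,  # sentences containing sentiment words
        }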
Example 1: TextParser
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import get_token_stat_schema [as alias]
# The snippet also relies on the stdlib `operator` module and on the
# project's Logger and FeatureGetter classes (not shown on this page).
class TextParser(Logger):
    def __init__(self, debug=False, log=None, data_dir="data"):
        Logger.__init__(self, log, debug)
        # TODO: move these file names into a config
        stop_words = "stop_words.txt"
        punct = "punct_symb.txt"
        sent_end = "sentence_end.txt"
        abbr = "abbr.txt"
        senti_words = "product_senti_rus.txt"
        # features found across all texts
        self.stat = {
            'text_cnt': 0,
            'avg_sentence_per_text': 0,
            'avg_bigram_per_sentence': 0
        }
        self.tokenizer = Tokenizer(debug, log, data_dir, stop_words, punct, sent_end, abbr, senti_words)
        # get_token_stat_schema() supplies the per-text counter schema that
        # text_to_sent() accumulates into below
        self.stat['token_stat'] = self.tokenizer.get_token_stat_schema()
        self.feature_creator = FeatureGetter(debug=self.debug, weight_func_type="senti_trigram", range_val=2)
        self.csv_writer = None
        self.csv_writer_f = None

    def compute_final_stat(self):
        if self.stat['text_cnt'] == 0:
            self.__print__('ERR', "No texts have been analyzed")
            return
        self.stat['avg_sentence_per_text'] = float(self.stat['token_stat']['sentence_cnt']) / self.stat['text_cnt']
        self.stat['avg_bigram_per_sentence'] = float(self.stat['token_stat']['bigram_cnt']) / self.stat['token_stat']['sentence_cnt']

    def text_to_sent(self, text, features):
        # text -> [sentence], sentence -> [bigram|word]
        sentences = self.tokenizer.text_to_sent(text)
        if len(sentences) <= 2:
            return None
        # fetch the features extracted for this text
        token_features = self.tokenizer.get_token_stat()
        no_normalization = ['token_cnt', 'bigram_cnt', 'sentence_cnt']
        # accumulate the global stats
        for k in self.stat['token_stat'].keys():
            self.stat['token_stat'][k] += token_features[k]
            # normalize the remaining parameters to relative frequencies
            if k in no_normalization:
                continue
            division = 'token_cnt'
            if k == 'senti_sentence':
                division = 'sentence_cnt'
            token_features[k] = float(token_features[k]) / token_features[division]
        for k in token_features.keys():
            features[k] = token_features[k]
        return sentences

    # feature schema for 'text_to_features'
    def get_schema(self, as_utf8=False):
        schema = []
        schema.extend(self.stat['token_stat'].keys())
        schema.extend(self.feature_creator.get_schema(as_utf8))
        schema.append('unfound_words')
        return schema

    def text_to_features(self, text, as_utf8=False):
        features = {}
        # split into tokens and record the stats
        sentences = self.text_to_sent(text, features)
        if sentences is None:
            return None
        self.feature_creator.stat_reset()
        features.update(self.feature_creator.word_vec_senti_features(sentences, as_utf8))
        features['unfound_words'] = self.feature_creator.get_unfound_percent()
        return features

    # used for analysis only
    def get_fixed_word_len(self, texts_features, low_len, up_len):
        words = {}
        for text_f in texts_features:
            for sent in text_f['text']:
                for w in sent:
                    if len(w) > up_len or len(w) < low_len:
                        continue
                    if w in words:
                        words[w] += 1
                    else:
                        words[w] = 1
        words_freq = sorted(words.items(), key=operator.itemgetter(1))
        for w in words_freq:
            # ......... the rest of this method is omitted .........
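For orientation, a hedged usage sketch of the class above. It assumes the supporting modules (tokenizer, Logger, FeatureGetter) are importable and the word-list files are present under data/; the sample review text is invented for illustration:

# Hypothetical usage; the names come from the example above,
# the input text is made up.
parser = TextParser(debug=True, data_dir="data")
print(parser.get_schema())  # feature column names, e.g. for a CSV header
features = parser.text_to_features(u"Отличный товар. Быстрая доставка. Всем рекомендую.")
if features is None:
    # text_to_sent() rejects texts with two sentences or fewer
    print("text too short")
else:
    print(features['unfound_words'])  # share of words missing from the word vectors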