当前位置: 首页>>代码示例>>Python>>正文


Python Tokenizer.text_to_sent方法代码示例

本文整理汇总了Python中tokenizer.Tokenizer.text_to_sent方法的典型用法代码示例。如果您正苦于以下问题:Python Tokenizer.text_to_sent方法的具体用法?Python Tokenizer.text_to_sent怎么用?Python Tokenizer.text_to_sent使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在tokenizer.Tokenizer的用法示例。


在下文中一共展示了Tokenizer.text_to_sent方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: TextParser

# 需要导入模块: from tokenizer import Tokenizer [as 别名]
# 或者: from tokenizer.Tokenizer import text_to_sent [as 别名]
class TextParser(Logger):
	def __init__(self, debug=False, log=None, data_dir="data"):
		"""Set up the tokenizer, the feature extractor and the aggregate stats.

		Args:
			debug: enable verbose logging (forwarded to Logger/Tokenizer).
			log: log destination, forwarded as-is.
			data_dir: directory that holds the word-list resource files.
		"""
		Logger.__init__(self, log, debug)

		# TODO: move these resource file names into a config
		stop_words_file = "stop_words.txt"
		punct_file = "punct_symb.txt"
		sent_end_file = "sentence_end.txt"
		abbr_file = "abbr.txt"
		senti_words_file = "product_senti_rus.txt"

		# aggregate statistics over every text processed so far
		self.stat = {}
		self.stat['text_cnt'] = 0
		self.stat['avg_sentence_per_text'] = 0
		self.stat['avg_bigram_per_sentence'] = 0

		self.tokenizer = Tokenizer(
			debug, log, data_dir,
			stop_words_file, punct_file, sent_end_file,
			abbr_file, senti_words_file)
		# per-token counter schema, accumulated into by text_to_sent()
		self.stat['token_stat'] = self.tokenizer.get_token_stat_schema()

		self.feature_creator = FeatureGetter(debug=self.debug, weight_func_type="senti_trigram", range_val=2)

		# lazily initialized elsewhere — presumably by a CSV export helper
		self.csv_writer = None
		self.csv_writer_f = None

	def compute_final_stat(self):
		"""Compute the averaged statistics once all texts were processed.

		Fills stat['avg_sentence_per_text'] and stat['avg_bigram_per_sentence']
		from the accumulated counters.  Logs an error and returns early when
		no text has been analyzed yet.
		"""
		if self.stat['text_cnt'] == 0:
			self.__print__('ERR', "No texts have been analyzed")
			return

		sentence_cnt = self.stat['token_stat']['sentence_cnt']
		self.stat['avg_sentence_per_text'] = float(sentence_cnt) / self.stat['text_cnt']
		# Guard against ZeroDivisionError: nothing visible guarantees that
		# sentence_cnt is non-zero just because text_cnt is.
		if sentence_cnt:
			self.stat['avg_bigram_per_sentence'] = float(self.stat['token_stat']['bigram_cnt']) / sentence_cnt

	def text_to_sent(self, text, features):
		"""Tokenize *text* into sentences and collect per-text token features.

		Accumulates the raw counters into the global stat['token_stat'],
		normalizes the per-text values, and copies them into *features*.
		Returns the sentence list, or None for texts of two sentences or fewer.
		"""
		# text -> [sentence] , sentence -> [bigram|word]
		sentences = self.tokenizer.text_to_sent(text)
		if len(sentences) <= 2:
			return None

		# per-text counters extracted by the tokenizer
		token_features = self.tokenizer.get_token_stat()

		raw_counters = ('token_cnt', 'bigram_cnt', 'sentence_cnt')
		for key in self.stat['token_stat'].keys():
			# accumulate into the global statistics before normalizing
			self.stat['token_stat'][key] += token_features[key]
			if key in raw_counters:
				continue

			# normalize: senti_sentence is relative to the sentence count,
			# every other feature to the token count
			denominator = 'sentence_cnt' if key == 'senti_sentence' else 'token_cnt'
			token_features[key] = float(token_features[key]) / token_features[denominator]

		# expose the normalized per-text values to the caller
		features.update(token_features)

		return sentences

	# feature schema for 'text_to_features'
	# feature schema for 'text_to_features'
	def get_schema(self, as_utf8=False):
		"""Return the ordered list of feature names text_to_features emits."""
		feature_names = list(self.stat['token_stat'].keys())
		feature_names += self.feature_creator.get_schema(as_utf8)
		feature_names.append('unfound_words')
		return feature_names

	def text_to_features(self, text, as_utf8=False):
		"""Convert raw *text* into a flat feature dict.

		Returns None when the text is too short to analyze (see text_to_sent).
		"""
		features = {}
		# tokenize and collect the token-level statistics into `features`
		sentences = self.text_to_sent(text, features)
		if sentences is None:
			return None

		# sentiment features computed over the sentence list
		self.feature_creator.stat_reset()
		senti_features = self.feature_creator.word_vec_senti_features(sentences, as_utf8)
		features.update(senti_features)
		features['unfound_words'] = self.feature_creator.get_unfound_percent()

		return features

	# use for analys only
	def get_fixed_word_len(self, texts_features, low_len, up_len):
		words = {}
		for text_f in texts_features:
			for sent in text_f['text']:
				for w in sent:
					if len(w) > up_len or len(w) < low_len:
						continue
					if w in words.keys():
						words[w] += 1
					else:
						words[w] = 1

		words_freq = sorted(words.items(), key=operator.itemgetter(1))
		for w in words_freq:
#.........这里部分代码省略.........
开发者ID:KseniyaYakil,项目名称:news_sentiment_analys,代码行数:103,代码来源:text_parser.py


注:本文中的tokenizer.Tokenizer.text_to_sent方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。