本文整理汇总了Python中stemming.porter2.stem方法的典型用法代码示例。如果您正苦于以下问题:Python porter2.stem方法的具体用法?Python porter2.stem怎么用?Python porter2.stem使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类stemming.porter2
的用法示例。
在下文中一共展示了porter2.stem方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _calculate_word_scores
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def _calculate_word_scores(self, phrase_list):
    """Score words by frequency, weighted by their tendency to appear in
    multi-word candidate phrases.

    Args:
        phrase_list: iterable of phrases; each phrase is a sequence of word
            strings.

    Returns:
        dict mapping stemmed word -> score (frequency * average multiplier).
    """
    word_freq = nltk.FreqDist()
    word_multiplier = nltk.FreqDist()
    for phrase in phrase_list:
        # Words appearing in multi-word candidates get a higher multiplier
        # (capped at 2). NOTE: the original used len(filter(...)), which
        # raises TypeError on Python 3 because filter() returns an iterator;
        # a generator sum is version-safe.
        multi_word = min(2, sum(1 for x in phrase if not is_numeric(x)))
        for word in phrase:
            # Normalize by taking the stem
            stemmed = stem(word)
            word_freq[stemmed] += 1
            word_multiplier[stemmed] += multi_word
    # Average the accumulated multiplier over each word's frequency.
    for word in word_freq.keys():
        word_multiplier[word] = word_multiplier[word] / float(word_freq[word])
    word_scores = {}
    for word in word_freq.keys():
        word_scores[word] = word_freq[word] * word_multiplier[word]
    return word_scores
示例2: _calculate_phrase_scores
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def _calculate_phrase_scores(self, phrase_list, word_scores, metric='avg'):
    """Score each phrase as the average, sum, or max of its words' scores.

    Args:
        phrase_list: iterable of phrases (sequences of word strings).
        word_scores: dict mapping stemmed word -> numeric score.
        metric: 'avg', 'sum', or 'max'.

    Returns:
        dict mapping the space-joined phrase -> its score.
    """
    phrase_scores = {}
    for phrase in phrase_list:
        key = " ".join(phrase)
        score = 0
        if metric in ['avg', 'sum']:
            for w in phrase:
                score += word_scores[stem(w)]
            phrase_scores[key] = score
            # For 'avg', overwrite the summed value with the mean.
            if metric == 'avg':
                phrase_scores[key] = score / float(len(phrase))
        elif metric == 'max':
            for w in phrase:
                score = max(score, word_scores[stem(w)])
            phrase_scores[key] = score
    return phrase_scores
示例3: retrieve_dataset
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def retrieve_dataset(index_name, doc_type, weight=None):
    """Fetch up to 10000 docs from an Elasticsearch index and build a
    per-document, zone-weighted, stemmed term-frequency map.

    Args:
        index_name: Elasticsearch index to query.
        doc_type: document type within the index.
        weight: optional dict of zone -> multiplier; defaults to
            {'title': 5, 'abstract': 1}.

    Returns:
        dict mapping document id -> DocumentInfo with populated .tf.
    """
    # Avoid a mutable default argument (shared across calls); None means
    # "use the default zone weights".
    if weight is None:
        weight = {'title': 5, 'abstract': 1}
    es = Elasticsearch()
    results = es.search(index=index_name, doc_type=doc_type, size=10000)['hits']['hits']
    dataset = {}
    for res in results:
        doc = DocumentInfo(res['_id'])
        term_vectors = es.termvectors(index=index_name, doc_type=doc_type, id=res['_id'], offsets=False,
                                      payloads=False, positions=False, fields='title,abstract',
                                      field_statistics=False)['term_vectors']
        for zone in ('abstract', 'title'):
            terms = term_vectors[zone]['terms']
            for term, info in terms.items():
                stemmed = stem(term)
                # Keep only purely alphabetic stems; accumulate weighted tf.
                if stemmed.isalpha():
                    doc.tf[stemmed] = doc.tf.get(stemmed, 0) + info['term_freq'] * weight[zone]
        dataset[res['_id']] = doc
    return dataset
示例4: input_f_word
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def input_f_word():
    """Print every word of the module-level `input_file`, tab-separated
    with its stem, stripping common punctuation first.

    Reads `input_file` (a module-level path — TODO confirm) line by line.
    """
    # Compile once outside the loop. The original pattern contained an empty
    # trailing alternation ("|)") which matched the empty string and had no
    # effect on the output; a character class is equivalent and clearer.
    punct_re = re.compile(r"[,.()'\"]")
    with open(input_file) as lines:
        for line in lines:
            for word in line.strip().split():
                word = punct_re.sub('', word)
                # TODO(review): '?' and '-' are still not stripped here.
                print('{}\t{}'.format(word, stem(word)))
示例5: get_feature
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def get_feature():
    """Read sentiment.txt and return (labels, features).

    Each line is "<label> <word> <word> ...". Labels become ints; features
    are lists of stems with stopwords removed.

    Returns:
        tuple (target, feature): list of int labels and list of stem lists.
    """
    target = []
    feature = []
    # Use a context manager: the original `open(...)` handle was never closed.
    with open("sentiment.txt") as fh:
        for line in fh:
            y = line.split(" ")[0]
            x = [stem(w) for w in line.strip("\n").split(" ")[1:] if not if_stopword(w)]
            target.append(int(y))
            feature.append(x)
    return target, feature
示例6: word_stem
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def word_stem():
    """Print "<stem> \t <word>" for every word yielded by get_word(),
    with commas removed first."""
    for token in get_word():
        cleaned = token.replace(',', '')
        print(stem(cleaned), '\t', cleaned)
示例7: get_feature
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def get_feature(sentence):
    """Build a bag-of-stems feature dict from a whitespace-tokenized sentence.

    Lowercases each token, skips stoplist words, and counts stems.

    Returns:
        plain dict mapping stem -> count.
    """
    counts = defaultdict(int)
    for token in (t.lower() for t in sentence.split()):
        if in_stoplist(token):
            continue
        counts[stem(token)] += 1
    return dict(counts)
示例8: preprocess
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def preprocess(s):
    """Lowercase, split on single spaces, trim each token, drop tokens in
    the module-level stoplist `foobar`, and stem the remainder.

    Returns:
        list of stemmed tokens.
    """
    trimmed = [t.strip() for t in s.lower().strip().split(" ")]
    return [stem(t) for t in trimmed if t not in foobar]
示例9: clean_text
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def clean_text(text):
    """Clean text for TFIDF: strip punctuation, lowercase, drop tokens
    containing digits, and stem the rest.

    Returns:
        single space-joined string of stems.
    """
    # The original used ur'\p{P}+': the `ur` prefix is Python-2-only syntax
    # (SyntaxError on Python 3), and `\p{P}` (Unicode punctuation) is not
    # supported by the stdlib `re` module (it needs the third-party `regex`
    # package). [^\w\s] — neither word character nor whitespace — is the
    # closest stdlib approximation of "punctuation".
    new_text = re.sub(r'[^\w\s]+', ' ', text)
    new_text = [stem(i) for i in new_text.lower().split() if not
                re.findall(r'[0-9]', i)]
    new_text = ' '.join(new_text)
    return new_text
###############################################################################
示例10: tokenize
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def tokenize(self, text, use_stem=True):
    """Tokenize `text`, keeping tokens that are not punctuation, not in the
    stopword cache, and contain at least one leading alphanumeric character.

    Args:
        text: input string.
        use_stem: if True, tokens are lowercased and stemmed; otherwise
            only lowercased.

    Returns:
        list of token strings.
    """
    def _keep(w):
        # Same filter for both branches: drop punctuation, cached stopwords,
        # and tokens not starting with an alphanumeric character.
        return (w not in punctuation
                and not sqlpie.global_cache[sqlpie.Config.STOPWORDS].get(w.lower())
                and re.match('[A-Z0-9]', w, re.IGNORECASE))

    if use_stem:
        return [stem(w.lower()) for w in self.word_tokenize(text) if _keep(w)]
    return [w.lower() for w in self.word_tokenize(text) if _keep(w)]
示例11: normalize_term
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def normalize_term(s, is_query_term=False):
    """Normalize a term via the Indexer's non-stemming normalization, then
    apply the Porter2 stemmer."""
    normalized = Indexer.normalize_term_without_stemming(s, is_query_term)
    return stem(normalized)
示例12: make_word_stem
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def make_word_stem(data_in):
    """Generator yielding [word, stem(word)] pairs, one per input item,
    with surrounding whitespace stripped."""
    for raw in data_in:
        cleaned = raw.strip()
        yield [cleaned, stem(cleaned)]
示例13: create_features
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def create_features(x):
    """Build unigram feature counts for the tokens in `x`, keyed as
    'UNI:<stem>'.

    Returns:
        defaultdict mapping feature key -> count.
    """
    phi = defaultdict(lambda: 0)
    for token in x:
        phi["UNI:" + stem(token)] += 1
    return phi
示例14: create_features
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def create_features(x, feature_set):
    """Build unigram counts for tokens whose STEM appears in `feature_set`.

    NOTE: membership is tested on the stem, but the feature key uses the
    raw (unstemmed) token — this asymmetry is intentional here.

    Returns:
        defaultdict mapping 'UNI:<word>' -> count.
    """
    phi = defaultdict(lambda: 0)
    for token in x:
        if stem(token) in feature_set:
            phi["UNI:" + token] += 1
    return phi
示例15: test_create_features
# 需要导入模块: from stemming import porter2 [as 别名]
# 或者: from stemming.porter2 import stem [as 别名]
def test_create_features(x):
    """Build 'UNI:'-prefixed stem counts for the tokens in `x`.

    Returns:
        defaultdict mapping 'UNI:<stem>' -> count.
    """
    phi = defaultdict(lambda: 0)
    for stemmed in map(stem, x):
        phi["UNI:" + stemmed] += 1
    return phi