This article collects typical usage examples of the Python method nltk.corpus.stopwords.words. If you are struggling with questions such as how exactly to use stopwords.words, what it is good for, or how to call it in practice, the hand-picked code examples below may help. You can also explore the containing module, nltk.corpus.stopwords, for further usage examples.
Fifteen code examples of the stopwords.words method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
Example 1: pz_selective_sampling
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def pz_selective_sampling(self, pz_proba):
    """
    Selective sampling of pz (do max-sampling but prevent repeated words)
    """
    pz_proba = pz_proba.data
    z_proba, z_token = torch.topk(pz_proba, pz_proba.size(0), dim=2)
    z_token = z_token.transpose(0, 1)  # [B,Tz,top_Tz]
    all_sampled_z = []
    for b in range(z_token.size(0)):
        sampled_z = []
        for t in range(z_token.size(1)):
            for i in range(z_token.size(2)):
                if z_token[b][t][i] not in sampled_z:
                    sampled_z.append(z_token[b][t][i])
                    break
        all_sampled_z.append(sampled_z)
    return all_sampled_z
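The method above expects a PyTorch probability tensor shaped [Tz, B, V] and normally lives on a model class; since self is never used inside it, a rough standalone sketch on random data could look like this (the shapes and the placeholder self are assumptions for illustration only):

import torch

pz_proba = torch.softmax(torch.randn(5, 2, 100), dim=2)  # assumed Tz=5, B=2, V=100
sampled = pz_selective_sampling(object(), pz_proba)       # placeholder stands in for self
print(len(sampled))     # B lists
print(len(sampled[0]))  # up to Tz de-duplicated token ids per batch element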
Example 2: collocations
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        # print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
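This is the same pattern used by nltk.text.Text.collocations; assuming the NLTK stopwords and genesis corpora have been downloaded, it can be exercised roughly like this:

from nltk.corpus import genesis
from nltk.text import Text

# Requires nltk.download('genesis') and nltk.download('stopwords').
text = Text(genesis.words('english-web.txt'))
text.collocations(num=10)  # prints the 10 strongest stopword-free collocations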
Example 3: __init__
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def __init__(self,
             w=20,
             k=10,
             similarity_method=BLOCK_COMPARISON,
             stopwords=None,
             smoothing_method=DEFAULT_SMOOTHING,
             smoothing_width=2,
             smoothing_rounds=1,
             cutoff_policy=HC,
             demo_mode=False):
    if stopwords is None:
        from nltk.corpus import stopwords
        stopwords = stopwords.words('english')
    self.__dict__.update(locals())
    del self.__dict__['self']
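This initializer matches nltk.tokenize.TextTilingTokenizer: when no stopword list is supplied, the English list from nltk.corpus.stopwords is used, and all constructor arguments are stored on the instance via locals(). A rough usage sketch, following the corpus choice of NLTK's own texttiling demo (corpus downloads assumed):

from nltk.corpus import brown
from nltk.tokenize import TextTilingTokenizer

# Requires nltk.download('brown') and nltk.download('stopwords').
tt = TextTilingTokenizer(demo_mode=False)
segments = tt.tokenize(brown.raw()[:10000])  # split a long text into topical segments
print(len(segments))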
Example 4: from_words
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence. When window_size > 2, count non-contiguous bigrams, in the
    style of Church and Hanks's (1990) association ratio.
    """
    wfd = FreqDist()
    bfd = FreqDist()
    if window_size < 2:
        raise ValueError("Specify window_size at least 2")
    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        wfd[w1] += 1
        for w2 in window[1:]:
            if w2 is not None:
                bfd[(w1, w2)] += 1
    return cls(wfd, bfd, window_size=window_size)
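In NLTK this is the classmethod behind BigramCollocationFinder.from_words; a typical combination with stopwords.words looks like the following (the webtext corpus is just an illustrative choice):

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import stopwords, webtext

# Requires nltk.download('webtext') and nltk.download('stopwords').
ignored = set(stopwords.words('english'))
tokens = [w.lower() for w in webtext.words('grail.txt')]

finder = BigramCollocationFinder.from_words(tokens, window_size=2)
finder.apply_word_filter(lambda w: len(w) < 3 or w in ignored)
finder.apply_freq_filter(3)
print(finder.nbest(BigramAssocMeasures.pmi, 10))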
Example 5: extract_unigram_feats
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def extract_unigram_feats(document, unigrams, handle_negation=False):
    """
    Populate a dictionary of unigram features, reflecting the presence/absence in
    the document of each of the tokens in `unigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of words/tokens whose presence/absence has to be
        checked in `document`.
    :param handle_negation: if `handle_negation == True` apply `mark_negation`
        method to `document` before checking for unigram presence/absence.
    :return: a dictionary of unigram features {unigram : boolean}.

    >>> words = ['ice', 'police', 'riot']
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_unigram_feats(document, words).items())
    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
    """
    features = {}
    if handle_negation:
        document = mark_negation(document)
    for word in unigrams:
        features['contains({0})'.format(word)] = word in set(document)
    return features
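With handle_negation=True the document is first passed through the mark_negation helper (in NLTK, nltk.sentiment.util.mark_negation), which appends a _NEG suffix to tokens inside a negated scope, so negated variants can be listed as separate unigrams. A small hedged illustration:

from nltk.sentiment.util import extract_unigram_feats

document = "I did not enjoy this movie . it was dull".split()
feats = extract_unigram_feats(document, ['enjoy', 'enjoy_NEG', 'dull'], handle_negation=True)
print(sorted(feats.items()))
# Expected, assuming mark_negation tags tokens after "not" up to the next clause punctuation:
# [('contains(dull)', True), ('contains(enjoy)', False), ('contains(enjoy_NEG)', True)]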
Example 6: extract_bigram_feats
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
     ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
    return features

#////////////////////////////////////////////////////////////
#{ Helper Functions
#////////////////////////////////////////////////////////////
Example 7: preprocess
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def preprocess(text):
    min_length = 3
    text = re.sub(r'\d+', '#', text)
    text = re.sub(r'\.', ' eos ', text)
    # Tokenize
    words = map(lambda word: word.lower(), word_tokenize(text))
    tokens = words
    # Remove non-characters
    p = re.compile('[a-zA-Z#]+')
    # Keep alphabetic tokens of at least min_length that are not stopwords
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length
                                  and (token not in english_stopwords), tokens))
    # Encode to ascii
    filtered_tokens = [token.encode('ascii', 'ignore') for token in filtered_tokens]
    return filtered_tokens

# Modify this path
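The function above assumes a module-level english_stopwords collection plus re and word_tokenize imports; a minimal sketch of that surrounding setup (using a set for the stopwords is an assumption, chosen to keep the membership test fast):

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  # needs the 'punkt' tokenizer models

english_stopwords = set(stopwords.words('english'))  # assumed module-level global

print(preprocess("The 3 glaciers are melting. Scientists warned us in 2019."))
# e.g. [b'glaciers', b'melting', b'eos', b'scientists', b'warned', b'eos']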
Example 8: convert_string
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def convert_string(text):
    """Convert a string of text into a tokenized and filtered list of words.

    Parameters
    ----------
    text : str
        Text as one long string.

    Returns
    -------
    words_cleaned : list of str
        List of tokenized words, after processing.

    Notes
    -----
    This function sets text to lower case, and removes stopwords and punctuation.
    """
    words = word_tokenize(text)
    words_cleaned = [word.lower() for word in words if (
        (not word.lower() in stopwords.words('english')) & word.isalnum())]
    return words_cleaned
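Assuming word_tokenize and stopwords are imported at module level as the snippet implies, a quick call looks like this (note that stopwords.words('english') is re-read for every token, so callers with long texts may want to hoist it into a set first):

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  # needs the 'punkt' tokenizer models

print(convert_string("The quick brown fox, they said, jumps over the lazy dog!"))
# e.g. ['quick', 'brown', 'fox', 'said', 'jumps', 'lazy', 'dog']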
Example 9: _tokenize
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def _tokenize(self, row):
    '''
    Clean texts by removing special and non-chars,
    stem each word and remove stop words;
    return a list of tokenized words for each row.
    '''
    chars = re.sub(r'-|"|&', ' ', row)  # replace dashes, quotes, and ampersands
    chars = self.regex.sub('', chars)   # remove non-chars
    wordlist = str(chars).split()
    if self.stemming:
        # stem and remove stopwords
        wordlist = [self.stemmer.stem(word.lower()) for word in wordlist if word.lower() not in self.stop]
    else:
        wordlist = [word.lower() for word in wordlist if word.lower() not in self.stop]
    # create bigrams if enabled
    if self.bigrams:
        bigrams = []
        for i in range(len(wordlist) - 1):
            bigrams.append(wordlist[i] + " " + wordlist[i + 1])
        wordlist = wordlist + bigrams
    return wordlist
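The method relies on several attributes of self (a compiled regex, a stemmer, a stopword set, and the stemming/bigrams flags); a minimal sketch of a host class that supplies them (the class name, the regex, and the choice of SnowballStemmer are assumptions):

import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

class SimpleTokenizer:
    def __init__(self, stemming=True, bigrams=False):
        self.regex = re.compile(r'[^a-zA-Z ]')      # assumed: drop non-alphabetic chars
        self.stemmer = SnowballStemmer('english')   # assumed stemmer
        self.stop = set(stopwords.words('english'))
        self.stemming = stemming
        self.bigrams = bigrams

SimpleTokenizer._tokenize = _tokenize  # attach the method shown above

tok = SimpleTokenizer(bigrams=True)
print(tok._tokenize('Fast & reliable "word-stemming" demo'))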
Example 10: lightStemAr
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def lightStemAr(word_list):
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        word = arstemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in arstemmer.stop_words:    # exclude stop words from further processing
            word = arstemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = arstemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = arstemmer.waw(word)          # remove connective 'و' if it precedes a word beginning with 'و'
            word = arstemmer.norm(word, num=2)  # normalize the initial hamza to bare alif
        result.append(word)
    return ' '.join(result)

###################################################################################
# Combine rooting and light stemming: if the light stemming algorithm manages to reduce the word form, the light stem is returned; otherwise, the root is returned.
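ISRIStemmer ships with NLTK, so the light stemmer can be tried directly on a few Arabic tokens (the sample words are illustrative):

from nltk.stem.isri import ISRIStemmer  # already required by lightStemAr

# Roughly: "and the students", "the library"
print(lightStemAr(['والطلاب', 'المكتبة']))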
Example 11: distance_based_helper
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def distance_based_helper(self, P, Q, A):
    res = []
    U = set(stopwords.words('arabic')) & set(P)
    SQ = list(set(P) & set(Q) - U)
    for i in range(0, len(A)):
        SA = list(((set(A[i]) & set(P)) - set(Q)) - U)
        d = len(P) + 1
        if len(SQ) == 0 or len(SA) == 0:
            d = 1
        else:
            for q in SQ:
                for a in SA:
                    d = min(d, self.dist(P, q, a))
        d *= 1 / (len(P) - 1)
        res.append(d)
    return res
Example 12: get_last_words_from_parsed_title
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def get_last_words_from_parsed_title(s):
    words = s.split()
    if len(words) == 0:
        last_word = ""
        word_before_last = ""
        word2_before_last = ""
    else:
        last_word = words[len(words) - 1]
        word_before_last = ""
        word2_before_last = ""
        if len(words) > 1:
            word_before_last = words[len(words) - 2]
            if word_before_last == "and":
                word_before_last = ""
        if len(words) > 2 and word_before_last != "and":
            word2_before_last = words[len(words) - 3]
            if word2_before_last == "and":
                word2_before_last = ""
    return last_word, word_before_last, word2_before_last
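A quick illustration of the return value (the sample title is made up): the helper walks back from the end of the title and blanks out a bare "and":

print(get_last_words_from_parsed_title("metal deck and porch railing"))
# ('railing', 'porch', '')  -- the 'and' two positions from the end is blanked out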
Example 13: _use_stopwords
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def _use_stopwords(self, x):
    words = tokenizer.tokenize(x)
    words = [w for w in words if w not in eng_stopwords]
    x = " ".join(words)
    return x
Example 14: _apostrophes
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def _apostrophes(self, x):
    words = tokenizer.tokenize(x)
    words = [APOSTROPHES_WORDS[word] if word in APOSTROPHES_WORDS else word for word in words]
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if w not in eng_stopwords]
    x = " ".join(words)
    return x
Example 15: _lemmatize
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def _lemmatize(self, sent):
    words = [wn.lemmatize(word) for word in sent.split()]
    return ' '.join(words)
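Examples 13-15 assume a shared module-level setup (a tokenizer, an English stopword set, lemmatizers named lem and wn, and an APOSTROPHES_WORDS expansion map); a hedged sketch of what that context might look like, with the tokenizer choice and dictionary contents being assumptions:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# Requires nltk.download('stopwords') and nltk.download('wordnet').
tokenizer = TweetTokenizer()                      # assumed tokenizer
eng_stopwords = set(stopwords.words('english'))
lem = WordNetLemmatizer()                         # used as lem.lemmatize(word, "v")
wn = WordNetLemmatizer()                          # used as wn.lemmatize(word)
APOSTROPHES_WORDS = {"don't": "do not", "can't": "cannot", "it's": "it is"}  # illustrative subset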