This article collects typical usage examples of the Python function nltk.corpus.stopwords.words. If you have been wondering what exactly the words function does, how to call it, or where to find usage examples, the hand-picked code samples below should help.
The following presents 15 code examples of the words function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
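Before the examples, here is a minimal sketch of the basic API (illustrative only, not taken from any of the examples below): the stopword corpus has to be downloaded once, stopwords.fileids() lists the available languages, and stopwords.words(lang) returns a plain Python list of lowercase words.

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')                        # one-time download of the stopword corpus
print(stopwords.fileids())                        # available languages, e.g. 'english', 'dutch', 'portuguese'
english_stops = set(stopwords.words('english'))   # words() returns a list; a set gives fast lookups
print('the' in english_stops)                     # True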
Example 1: get_stopwords
def get_stopwords(include_trectext_syntax=True):
    ignore_words = ['<doc>', '</doc>', '<text>', '</text>']
    ignore_words.extend(stopwords.words('english'))
    ignore_words.extend(stopwords.words('dutch'))
    return set(ignore_words)
Example 2: find_opinions
def find_opinions(tokens, feature, feat, id):
    fg = 0
    for opinion in tokens:
        if opinion[0] == 'advmod' or opinion[0] == 'neg':
            if opinion[3].lower() in stopwords.words('english'):
                continue
            # endif
            if feature[1:3] == opinion[1:3]:
                fg = 1
                modifier_set.add(opinion[3])
                if id != -1:
                    mods[id].append(opinion[3])
                feat.write(
                    feature[3] + ' ' + feature[1] + ' ' +
                    opinion[3] + '\n')
            # endif
        # endif
        elif opinion[0] == 'dep':
            if opinion[3].lower() in stopwords.words('english'):
                continue
            # endif
            if feature[1:3] == opinion[1:3]:
                opinions_set.add(opinion[3])
                find_opinions(
                    tokens, ['nsubj', opinion[3], opinion[4], feature[3],
                             feature[4]], feat, -1)
        # endelif
    # endfor
    if fg == 0:
        feat.write(feature[3] + ' ' + feature[1] + '\n')
Example 3: find_features
def find_features(tokens, feat):
    i = 0
    for feature in tokens:
        if feature[0] == 'nsubj':
            if feature[3].lower() in stopwords.words('english'):
                continue
            if feature[1].lower() in stopwords.words('english'):
                continue
            if not valid_feature(tokens, feature):
                continue
            # endif
            mods.append([])
            features_set.add(feature[3])
            opinions_set.add(feature[1])
            find_opinions(tokens, feature, feat, len(mods) - 1)
            if i != 0:
                if tokens[i - 1][0] == 'nsubj' and tokens[i - 1][3:5] == feature[3:5]:
                    for mod in mods[len(mods) - 2]:
                        if mod not in mods[len(mods) - 1]:
                            mods[len(mods) - 1].append(mod)
                            feat.write(
                                feature[3] + ' ' + feature[1] + ' ' + mod + '\n')
            # endif
        i = i + 1
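Note that Examples 2 and 3 call stopwords.words('english') once per token inside their loops; the call returns a list, and an in test against a list is a linear scan. A minimal variant (the helper name is invented for illustration) caches the words in a set built once:

from nltk.corpus import stopwords

ENGLISH_STOPS = set(stopwords.words('english'))   # built a single time

def is_stopword(token):
    # constant-time membership test against the cached set
    return token.lower() in ENGLISH_STOPS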
Example 4: extract_bigrams
def extract_bigrams(self, text):
    text = self.remove_return_lines_and_quotes(text)
    bigrams = []
    st = PorterStemmer()
    stop = stopwords.words('english')
    more_stop_words = [
        '(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...']
    stop = stop + more_stop_words
    tokens = st.stem(text)
    tokens = nltk.word_tokenize(tokens.lower())
    tokens = [i for i in tokens if i not in stop]
    tokens = [word for word in tokens if len(word) > 2]
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(2)
    top_bigrams = finder.nbest(bigram_measures.pmi, 1000)
    for bg in top_bigrams:
        bg = " ".join(bg)
        tag = nltk.pos_tag([bg])[0]
        if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']:
            bigrams.append(tag[0])
    return bigrams
Example 5: clean
def clean(self, raw):
    letters_only = re.sub("[^a-zA-Z#@]", " ", raw)
    words = letters_only.split()
    for i in range(0, len(words)):
        if "#" in words[i]:
            s = words[i].split('#')
            words[i] = '# '.join(s)
        if "@" in words[i]:
            s = words[i].split('@')
            words[i] = '@ '.join(s)
        if "http" in words[i]:
            s = words[i].split('http')
            words[i] = "http".join(s)
    total_stop_words = set(stopwords.words("english"))
    removed_stop_words = set(stopwords.words("english")[0:20])
    stop_words = total_stop_words - removed_stop_words
    content_words = [w for w in words if not w in stop_words]
    return " ".join(content_words)
Example 6: removeStopWords
def removeStopWords(tokens, lang):
    filteredToken = tokens
    if lang == 'en':
        filteredToken = [w for w in tokens if not w in stopwords.words('english')]
    elif lang == 'es':
        filteredToken = [w for w in tokens if not w in stopwords.words('spanish')]
    return filteredToken
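A quick usage sketch for the helper above (the input tokens are invented for illustration):

tokens = ['this', 'is', 'a', 'simple', 'example']
print(removeStopWords(tokens, 'en'))   # expected: ['simple', 'example']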
Example 7: frequencounting4Up
def frequencounting4Up(Listings):
    """
    Get the keyword counts and the rank of the keywords.
    :param Listings: the input list of tweets
    :return: a list of tuples ranked by word count
    """
    MyCounter = Counter()
    chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&',
             '*', '(', ')', ' - ', '_', '+', '=', '@', ':', '\\', ',',
             ';', '~', '`', '<', '>', '|', '[', ']', '{', '}', '-', '"', '&', 'rt']
    UpdatingChars = ['&', 'rt', '', '#dctraffic', '#mdtraffic', '#vatraffic', 'amp', '-']
    # The section below filters common English words and punctuation out of the target tweets.
    for line in Listings:
        if type(line) is str:
            for word in line.strip().lower().split():
                if PunkRemovement(word.strip().lower()) not in UpdatingChars + stopwords.words(
                        'english') and not word.isdigit():
                    if len(word) > 1:
                        MyCounter[PunkRemovement(word.strip().lower())] += 1
        else:
            for word in line.text.decode('UTF-8').strip().lower().split():
                if PunkRemovement(word.strip().lower()) not in chars + stopwords.words('english'):
                    MyCounter[PunkRemovement(word.strip().lower())] += 1
    return MyCounter.most_common()
Example 8: freqgen_word
def freqgen_word(word):
    connect(word)
    # get english and french stopwords
    stopen = stopwords.words('english')
    stopfr = stopwords.words('french')
    #stopsp = stopwords.words('spanish')
    query = {}
    projection = {"text": 1}
    cursor = db.Tweetfind.find(query, projection)
    texts = pandas.Series(list(cursor))
    tokens = []
    for text in texts.values:
        tokens.extend([word.lower().strip(':;,#."-\'!') for word in text['text'].split()])
    filtered_tokens = []
    st = ['&', ' ', 'it\'s', 'haven\'t', 'can\'t', 'don\'t', 'i\'m', 'i\'ve', 'i\'ll', 'i\'d',
          '#', 'e', '@', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
          '14', '15', '16', '17', '18', '19', '20', 'rt', '(', ')']
    for word in tokens:
        try:
            if (not word.decode('utf-8') in stopen) and (not word.decode('utf-8') in stopfr):
                if not word in st:
                    filtered_tokens.append(word.decode('utf-8'))
        except:
            pass
    freq_dist = nltk.FreqDist(filtered_tokens)
    print type(freq_dist)
    #print freq_dist.plot(25)
    return freq_dist
Example 9: pre_process
def pre_process(self, text):
    for i in range(len(text)):
        text[i] = text[i].replace("-", " ")
        word_list = text[i].encode('ascii', 'ignore').lower().split(" ")
        processed_text = []
        count = 0
        for word in word_list:
            if word in stopwords.words('english'):
                continue
            if re.match('@\w+', word):
                continue
            if re.match('#\w+', word):
                continue
            word = re.sub('[0-9]+', 'gotNumber', word)
            word = re.sub('http(s)?.+', 'gotURL', word)
            word = re.sub('[^a-zA-Z0-9]', ' ', word)
            words = word.split(' ')
            for w in words:
                if w is not ' ' and len(w) > 1 and w not in stopwords.words('english'):
                    w = self.sno.stem(w)
                    processed_text.append(w)
                    count += 1
                    print '. ',
                    if count == 11:
                        print ''
                        count = 0
        text[i] = processed_text
        print ''
    return text
Example 10: fuzzer
def fuzzer(localstring, dbpstring):
    lwl = localstring.replace('-', '').replace(',.', '').split()
    lfwl = [w for w in lwl if not w in stopwords.words('english')]
    dwl = dbpstring.replace('-', '').split()
    dfwl = [w for w in dwl if not w in stopwords.words('english')]
    ratio = fuzz.token_sort_ratio(str(lfwl), str(dfwl))
    return ratio
Example 11: clean_total_words
def clean_total_words(data):
    all_text = list()
    for i in range(len(data)):
        all_text.append(data[i]['text'])
    words = list()
    for i in range(len(all_text)):
        words.append(nltk.word_tokenize(all_text[i]))
    wordss = list(itertools.chain.from_iterable(words))
    word_after_clean = list()
    for i in range(len(words)):
        wordss[i] = wordss[i].lower()
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
    for i in range(len(wordss)):
        if wordss[i] not in stop_words:
            word_after_clean.append(wordss[i])
    word_clean = list()
    for i in range(len(word_after_clean)):
        if word_after_clean[i].isalpha() == True:
            word_clean.append(word_after_clean[i])
    word_clea = list()
    for i in range(len(word_clean)):
        word_clea.append(word_clean[i].lower())
    stop_words = set(stopwords.words('english'))
    word_c = list()
    for i in range(len(word_clea)):
        if word_clea[i] not in stop_words:
            word_c.append(word_clea[i])
    return word_c
Example 12: extract_features
def extract_features(self, article, feats, threegram_sent_ppl, fourgram_sent_ppl,
                     fivegram_sent_ppl, sixgram_sent_ppl, index=None):
    featureSet = {}
    articleWords = article.replace("<s>", "").replace("</s>", "").split()
    featureSet["articlelen"] = len(articleWords)
    fx_words = [word for word in articleWords if word.lower() in stopwords.words('english')]
    featureSet["fxwordcount"] = len(fx_words) / len(articleWords)
    non_words = [word for word in articleWords if word.isalpha() != True]
    featureSet["nonwordcount"] = len(non_words) / len(articleWords)
    content_words = [word for word in articleWords if word.lower() not in stopwords.words('english')]
    featureSet["contentwordcount"] = len(content_words) / len(articleWords)
    featureSet["uniquewords"] = len(set(articleWords)) / len(articleWords)
    featureSet.update(feats)
    try:
        sents = [x for x in article.split("\n") if len(x) > 1]
        ppl_five = ppl_wrangling(sents, fivegram_sent_ppl)
        ppl_six = ppl_wrangling(sents, sixgram_sent_ppl)
        ppl_three = ppl_wrangling(sents, threegram_sent_ppl)
        ppl_four = ppl_wrangling(sents, fourgram_sent_ppl)
        featureSet["ppl-5"] = ppl_five
        featureSet["ppl-6"] = ppl_six
        featureSet["ppl-3"] = ppl_three
        featureSet["ppl-4"] = ppl_four
    except:
        pass
    featureSet.update(self.posTags(index, article))
    return featureSet
Example 13: evaluate_html
def evaluate_html(content, html_conf):
    fdist = FreqDist()
    if html_conf['usehtml'] == False:
        logging.info('Discarding HTML tags')
        return fdist
    logging.info("\tEvaluating HTML")
    # try with TITLE tag
    titles = re.findall("<title>[A-Za-z0-9 ]+</title>", content)
    for title in titles:
        root = etree.fromstring(title)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)
        for i in range(html_conf['title']):
            fdist.update(stems)
    # try with H1 tag
    headers = re.findall("<h1>[A-Za-z0-9 ]+</h1>", content)
    for header in headers:
        root = etree.fromstring(header)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)
        for i in range(html_conf['h1']):
            fdist.update(stems)
    return fdist
Example 14: palavrasChaves
def palavrasChaves(self):
    # NLTK call that returns the stopwords for English
    stopE = stopwords.words('english')
    # NLTK call that returns the stopwords for Portuguese
    stop = stopwords.words('portuguese')
    stopS = stopwords.words('spanish')
    palavrasChaves = []
    textoArtigo = []
    # strip punctuation from the title and split it into words
    for i in self.titulo.lower().replace(',', '').replace('.', '').replace('-', '').replace('(', '').replace(')', '').split():
        # drop Portuguese stopwords from the text of the article being presented
        if i not in stop:
            # drop English stopwords from the text of the article being presented
            if i not in stopE:
                # drop Spanish stopwords as well
                if i not in stopS:
                    # ignore words shorter than 3 characters, e.g. the verb "é"
                    if len(i) > 2:
                        textoArtigo.append(i)
    # frequency of each word in the body of the article
    freq = FreqDist(textoArtigo)
    # take the four most frequent words
    items = freq.items()[:4]
    # put the most frequent words into palavrasChaves
    for i in range(0, len(items)):
        palavrasChaves.append(items[i][0])
    return palavrasChaves
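Example 14 relies on the Python 2-era NLTK behaviour in which FreqDist.items() came back sorted by decreasing frequency, so freq.items()[:4] picked the four most frequent words. Under NLTK 3, where FreqDist subclasses collections.Counter, the equivalent is most_common. A minimal standalone sketch (the sample title string is invented for illustration):

from nltk import FreqDist
from nltk.corpus import stopwords

stops = set(stopwords.words('portuguese')) | set(stopwords.words('english')) | set(stopwords.words('spanish'))
titulo = "um exemplo de titulo de artigo sobre processamento de linguagem natural"  # invented sample title
tokens = [w for w in titulo.lower().split() if w not in stops and len(w) > 2]
freq = FreqDist(tokens)
keywords = [word for word, count in freq.most_common(4)]   # NLTK 3 replacement for freq.items()[:4]
print(keywords)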
Example 15: word_standardize
def word_standardize(sentences):
    tokens = []
    sentences_st = []
    for sent in sentences:
        tokens.extend(word_tokenize(sent))
        sentences_st.append(word_tokenize(sent))
    words = tokens
    st = LancasterStemmer()
    words = [w.lower() for w in words]
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
    st_words = [st.stem(w) for w in words]
    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if not w in stopwords.words('english')]
        sent = [w for w in sent if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        sent_result.append(sent)
    return st_words, sent_result