本文整理汇总了Python中stop_words.get_stop_words方法的典型用法代码示例。如果您正苦于以下问题:Python stop_words.get_stop_words方法的具体用法?Python stop_words.get_stop_words怎么用?Python stop_words.get_stop_words使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块stop_words的用法示例。
在下文中一共展示了stop_words.get_stop_words方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: LDA_train
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def LDA_train(doc):
    """Train a 10-topic LDA model on an iterable of raw documents.

    Each document is lower-cased, tokenized with the module-level
    ``tokenizer`` and stripped of English stop words. Documents that cannot
    be processed are skipped (best effort).

    Args:
        doc: Iterable of documents; each is expected to provide ``lower()``
            (presumably ``str`` -- confirm against callers).

    Returns:
        The ``models.ldamodel.LdaModel`` trained on the cleaned documents.
    """
    # A set gives O(1) membership tests instead of scanning the list for
    # every token of every document.
    en_stop = set(get_stop_words('en'))
    red = []
    for d in doc:
        try:
            tokens = tokenizer.tokenize(d.lower())
            stopped_tokens = [token for token in tokens if token not in en_stop]
        except Exception:
            # Skip documents that cannot be lower-cased or tokenized, but
            # let KeyboardInterrupt / SystemExit propagate (the original bare
            # ``except:`` swallowed those as well).
            continue
        red.append(stopped_tokens)
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(
        corpus, num_topics=10, id2word=dictionary, passes=1
    )
    return lda
# Returns the average probability of the words present in the LDA model for the input document (returns a float):
示例2: clean_up_words
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def clean_up_words(words):
    """Return the cleaned, lower-cased words that are not stop words.

    Every word is lower-cased and passed through ``clean_word``; the result
    is dropped when it appears in the site-specific list below or in the
    English list from ``get_stop_words``.

    Args:
        words: Iterable of words (each must provide ``lower()``).

    Returns:
        List of cleaned words, in input order, with stop words removed.
    """
    library_stop_words = get_stop_words('en')
    # Site-specific noise (share-widget residue, comment-section labels,
    # stray punctuation) on top of the packaged English list.
    custom_stop_words = [
        'the',
        'is',
        'and',
        'thisfacebooktwitteremailredditprint',
        '',
        'reply',
        'likelike',
        'likeliked',
        'comments',
        'commenting',
        '/',
        '=',
    ]
    kept = []
    for raw_word in words:
        candidate = clean_word(raw_word.lower())
        is_stop_word = (
            candidate in custom_stop_words or candidate in library_stop_words
        )
        if not is_stop_word:
            kept.append(candidate)
    return kept
示例3: __init__
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def __init__(self):
    """Load the English stop-word list and keep it on the instance."""
    english_words = get_stop_words("en")
    self.stop_words = english_words
示例4: __remove_stop_words
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def __remove_stop_words(self, docs):
    """Strip English stop words from every tokenized document.

    Args:
        docs: Iterable of documents, each an iterable of tokens. Tokens must
            be hashable (e.g. strings) because they are tested against a set.

    Returns:
        A list with one token list per input document, in the same order,
        with the English stop words removed.
    """
    # Loop-invariant: fetch the stop-word list once instead of once per
    # document, and keep it in a set for O(1) membership tests.
    en_stop = set(get_stop_words('en'))
    return [[token for token in doc if token not in en_stop] for doc in docs]
示例5: __init__
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def __init__(self):
    """Set up empty containers, the stop-word list and the tracked names."""
    self.episodeInfo = dict()
    self.Info = list()
    self.allTranscripts = dict()
    # Missing keys start at 0.
    self.vocabulary = collections.defaultdict(int)
    self.Stopwords = get_stop_words('en')
    self.impactActors = [
        "Leonard",
        "Sheldon",
        "Penny",
        "Howard",
        "Raj",
        "Amy",
        "Bernadette",
    ]
示例6: clean_up_words
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def clean_up_words(words):
    """Return the cleaned, lower-cased words that are not stop words.

    Every word is lower-cased and passed through ``clean_word``; results
    found in the local list or in the English list from ``get_stop_words``
    are discarded.

    Args:
        words: Iterable of words (each must provide ``lower()``).

    Returns:
        List of cleaned words, in input order, with stop words removed.
    """
    pkg_stop_words = get_stop_words('en')
    my_stop_words = ['the', 'is', 'and', 'thisfacebooktwitteremailredditprint']
    # Lazily clean each word, then keep only those in neither stop list.
    cleaned_words = (clean_word(word.lower()) for word in words)
    return [
        cleaned
        for cleaned in cleaned_words
        if cleaned not in my_stop_words and cleaned not in pkg_stop_words
    ]
示例7: __init__
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def __init__(self, language):
    """Store the stop words for ``language`` as a set on the instance.

    Args:
        language: Language identifier passed to ``stop_words.get_stop_words``.
    """
    words = stop_words.get_stop_words(language)
    self._stop_words = set(words)
示例8: load_stopwords
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def load_stopwords(language):
    """Return the stop words for ``language`` as slug fragments.

    Each stop word is passed through ``slugify`` and the result is split on
    "-", so one stop word may contribute several entries.

    Args:
        language: Language identifier passed to ``get_stop_words``.

    Returns:
        Flat list of slug fragments, in stop-word order.
    """
    fragments = []
    for word in get_stop_words(language):
        fragments.extend(slugify(word).split("-"))
    return fragments
示例9: test_get_stop_words
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def test_get_stop_words(self):
    """The full name 'english' yields the expected number of stop words."""
    english_words = get_stop_words('english')
    actual_count = len(english_words)
    self.assertEqual(actual_count, self.number_of_english_stop_words)
示例10: test_get_stop_words_language_mapping
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def test_get_stop_words_language_mapping(self):
    """The short code 'en' must resolve to the same list as 'english'."""
    short_code_words = get_stop_words('en')
    self.assertEqual(
        len(short_code_words), self.number_of_english_stop_words
    )
    full_name_words = get_stop_words('english')
    self.assertEqual(short_code_words, full_name_words)
示例11: test_get_stop_words_cache
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def test_get_stop_words_cache(self):
    """Loaded languages are cached; failed lookups leave no cache entry."""
    self.assertNotIn('french', stop_words.STOP_WORDS_CACHE)
    sw = get_stop_words('fr')
    self.assertIn('french', stop_words.STOP_WORDS_CACHE)

    original_stop_words_dir = stop_words.STOP_WORDS_DIR
    stop_words.STOP_WORDS_DIR = 'not-existing-directory'
    try:
        # With the data directory pointing nowhere, this lookup is
        # presumably served from the cache.
        self.assertEqual(sw, get_stop_words('french'))
    finally:
        # Restore the module global even when the assertion fails, so a
        # failure here cannot break unrelated tests.
        stop_words.STOP_WORDS_DIR = original_stop_words_dir

    try:
        get_stop_words('klingon')
    except Exception:
        # The lookup is allowed to fail; only the cache state matters.
        # Unlike a bare ``except:``, KeyboardInterrupt still propagates.
        pass
    self.assertNotIn('klingon', stop_words.STOP_WORDS_CACHE)
示例12: test_get_stop_words_unavailable_language
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def test_get_stop_words_unavailable_language(self):
    """Requesting an unsupported language must raise ``StopWordError``."""
    with self.assertRaises(StopWordError):
        get_stop_words('sindarin')
示例13: test_get_stop_words_install_issue
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def test_get_stop_words_install_issue(self):
    """A missing data directory must surface as ``StopWordError``."""
    original_stop_words_dir = stop_words.STOP_WORDS_DIR
    stop_words.STOP_WORDS_DIR = 'not-existing-directory'
    try:
        self.assertRaises(StopWordError, get_stop_words, 'german')
    finally:
        # Restore the module global even when the assertion fails, so the
        # bogus directory cannot leak into other tests.
        stop_words.STOP_WORDS_DIR = original_stop_words_dir
示例14: test_filters
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def test_filters(self):
    """A registered filter is applied to the list and can be removed again."""
    language = 'en'
    # NOTE(review): the second positional argument presumably disables the
    # cache so the filter takes effect -- confirm against the library.
    before = get_stop_words(language, False)
    # Pick a random stop word, then a random character from it.
    letter = random.choice(random.choice(before))

    def remove_letter(stopwords, language):
        return [word for word in stopwords if letter not in word]

    stop_words.add_filter(remove_letter)
    try:
        after = get_stop_words(language, False)
        for stopword in after:
            self.assertNotIn(letter, stopword)
    finally:
        # Always unregister the filter so a failed assertion above cannot
        # leave it installed for later tests.
        removed = stop_words.remove_filter(remove_letter)
    self.assertTrue(removed)
示例15: remove_stopwords
# 需要导入模块: import stop_words [as 别名]
# 或者: from stop_words import get_stop_words [as 别名]
def remove_stopwords(tokenized_data):
    """Return the tokens of ``tokenized_data`` that are not English stop words.

    Args:
        tokenized_data: Iterable of tokens.

    Returns:
        List of the tokens, in input order, that do not appear in the
        English list from ``get_stop_words``.
    """
    en_stop = get_stop_words('en')
    kept = []
    for token in tokenized_data:
        if token in en_stop:
            continue
        kept.append(token)
    return kept