This article collects typical usage examples of the Python attribute sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS. If you have been wondering what stop_words.ENGLISH_STOP_WORDS is or how to use it, the curated examples below may help. You can also explore the module it lives in, sklearn.feature_extraction.stop_words, for further usage.
The following shows 7 code examples of the stop_words.ENGLISH_STOP_WORDS attribute, sorted by popularity by default.
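Note that this attribute lives in an older module layout: the sklearn.feature_extraction.stop_words module was deprecated in scikit-learn 0.22 and removed in 0.24, and the same frozenset is exposed as sklearn.feature_extraction.text.ENGLISH_STOP_WORDS. A minimal sketch for inspecting the attribute, assuming only that scikit-learn is installed:

# Try the legacy location first, then the current one (scikit-learn >= 0.22).
try:
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
except ImportError:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print(type(ENGLISH_STOP_WORDS))     # <class 'frozenset'>
print('the' in ENGLISH_STOP_WORDS)  # True
print(len(ENGLISH_STOP_WORDS))      # 318 entries in recent releases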
Example 1: build_document_term_matrix
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def build_document_term_matrix(self):
    # Build a TF-IDF document-term matrix over the target bags of words,
    # filtering out scikit-learn's built-in English stop word list.
    self.tfidf_vectorizer = TfidfVectorizer(
        stop_words=ENGLISH_STOP_WORDS, lowercase=True,
        strip_accents="unicode",
        use_idf=True, norm="l2", min_df=Constants.MIN_DICTIONARY_WORD_COUNT,
        max_df=Constants.MAX_DICTIONARY_WORD_COUNT, ngram_range=(1, 1))
    self.document_term_matrix = \
        self.tfidf_vectorizer.fit_transform(self.target_bows)
    # Invert the vocabulary mapping (term -> column index) into an
    # index-ordered list of terms.
    vocabulary = self.tfidf_vectorizer.vocabulary_
    num_terms = len(vocabulary)
    self.terms = [""] * num_terms
    for term, index in vocabulary.items():
        self.terms[index] = term
    print("Created document-term matrix of size %d x %d" % (
        self.document_term_matrix.shape[0],
        self.document_term_matrix.shape[1]
    ))
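For comparison, passing stop_words="english" to TfidfVectorizer selects this same built-in frozenset without importing it explicitly. A minimal sketch with a toy corpus (the two documents here are illustrative, not from the original project):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "a dog chased the cat"]
vectorizer = TfidfVectorizer(stop_words="english")
matrix = vectorizer.fit_transform(docs)
print(sorted(vectorizer.vocabulary_))  # ['cat', 'chased', 'dog', 'mat', 'sat']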
Example 2: get_stopwords
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def get_stopwords():
    # Union of three common English stop word lists: NLTK, spaCy, and
    # scikit-learn. Also requires `from nltk.corpus import stopwords` and
    # `from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords`.
    nltk_stopwords = set(stopwords.words('english'))
    sklearn_stopwords = stop_words.ENGLISH_STOP_WORDS
    all_stopwords = set()
    all_stopwords |= spacy_stopwords
    all_stopwords |= nltk_stopwords
    all_stopwords |= sklearn_stopwords
    return all_stopwords
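A quick usage check, assuming the NLTK stopwords corpus has been fetched with nltk.download('stopwords'):

combined = get_stopwords()
print(len(combined))       # the union is larger than any single source list
print('the' in combined)   # True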
Example 3: _check_stop_list
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def _check_stop_list(stop):
    # Resolve the `stop` argument: the literal "english" selects the
    # built-in list, any other string is an error, None disables stop word
    # filtering, and anything else is treated as a custom collection.
    # (`six.string_types` requires `import six`.)
    if stop == "english":
        return ENGLISH_STOP_WORDS
    elif isinstance(stop, six.string_types):
        raise ValueError("not a built-in stop list: %s" % stop)
    elif stop is None:
        return None
    else:  # assume it's a collection
        return frozenset(stop)
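This helper mirrors the argument check used internally by scikit-learn's vectorizers. A short usage sketch:

print(_check_stop_list("english") is ENGLISH_STOP_WORDS)  # True
print(_check_stop_list(["foo", "bar"]))  # frozenset({'foo', 'bar'})
print(_check_stop_list(None))            # None
# _check_stop_list("german") would raise ValueError: not a built-in stop list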
Example 4: __init__
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def __init__(self):
    self.model = None
    self.spacynlp = spacy.load('en')
    # Merge spaCy's STOP_WORDS with a few contraction fragments and
    # scikit-learn's list; set unions keep the result duplicate-free and
    # work whether STOP_WORDS is a set or a list.
    self.stopwords = set(STOP_WORDS) | \
        {"n't", "'s", "'m", "ca"} | \
        set(ENGLISH_STOP_WORDS)
    self.punctuations = list(string.punctuation) + \
        ["-----", "---", "...", "'ve"]
Example 5: wordCount
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def wordCount(text):
    try:
        text = text.lower()
        # Replace punctuation, digits, and control characters with spaces
        # before splitting into candidate words.
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        txt = regex.sub(" ", text)
        # Count only words longer than 3 characters that are not stop words.
        words = [
            w
            for w in txt.split(" ")
            if w not in stop_words.ENGLISH_STOP_WORDS and len(w) > 3
        ]
        return len(words)
    except Exception:
        return 0
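A usage sketch (the sentence is illustrative):

print(wordCount("The quick, brown fox jumps over 2 lazy dogs!"))
# "the" and "over" are stop words and "fox" is only 3 characters long,
# so only "quick", "brown", "jumps", "lazy", "dogs" are counted -> 5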
Example 6: _loadSpecialWords
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def _loadSpecialWords(self):
    '''Load stop words, number prefixes, news agencies, and protest subject words.'''
    self.S_PREFIX = ['around', 'up to', 'as many as', 'some', 'many',
                     'nearly', 'more than', 'about']
    self.P_SUBJ = {
        'protest': ['protesters', 'protestors', 'demonstrators', 'activists',
                    'strikers', 'marchers', 'signatures',
                    'counter-demonstrators', 'counter-demonstraters',
                    'counter-protesters', 'counter-protestors',
                    'counterprotesters', 'counterprotestors']
    }
    self.AGW = ['Agence France-Presse, English Service',
                'Associated Press Worldstream, English Service']
    self.SWS = list(stop_words.ENGLISH_STOP_WORDS)
Example 7: normalize_text
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def normalize_text(raw_text, remove_stop_words=True, only_letters=True,
                   return_list=False, remove_one_char_words=True, **kwargs):
    '''
    Convert raw text into a clean text string.
    Method modified from code available at:
    https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
    Args:
        raw_text: Original text to clean and normalize
        remove_stop_words: Boolean value to trigger removal of stop words
        only_letters: Boolean value to trigger removal of characters that are not letters
        return_list: Boolean value to trigger return value as a list of words
        remove_one_char_words: Boolean value to trigger removal of words that are only a single character
    Returns:
        clean_text: Either a string or a list of words, filtered according to the function parameters.
    '''
    # Note: link_re, letter_re, letter_number_re, and python_stop_words are
    # module-level globals defined elsewhere in the original source file.
    # Remove web links
    clean_text = link_re.sub('', raw_text)
    # Remove HTML. Suppress UserWarnings from BeautifulSoup due to text with
    # tech info (ex: code, directory structure)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        clean_text = BeautifulSoup(clean_text, "lxml").get_text()
    # Only keep letters, or keep letters and numbers
    if only_letters:
        clean_text = letter_re.sub(" ", clean_text)
    else:
        clean_text = letter_number_re.sub(" ", clean_text)
    # Convert to lower case, split into individual words
    clean_text = clean_text.lower().split()
    # If numbers are allowed in words, remove candidate words that only contain numbers
    if not only_letters:
        clean_text = [w for w in clean_text if not all(i.isdigit() for i in w)]
    # Remove stop words
    if remove_stop_words:
        clean_text = [w for w in clean_text if w not in python_stop_words]
        clean_text = [w for w in clean_text if w not in ENGLISH_STOP_WORDS]
    # Remove words that are only a single character in length
    if remove_one_char_words:
        clean_text = [w for w in clean_text if len(w) > 1]
    # Return as string or list based on parameters
    if return_list:
        return clean_text
    return " ".join(clean_text)