This article collects typical usage examples of the Python attribute sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS. If you have been wondering what stop_words.ENGLISH_STOP_WORDS is or how to use it, the curated examples below may help. You can also explore the module it lives in, sklearn.feature_extraction.stop_words, for further usage.
The following shows 7 code examples of the stop_words.ENGLISH_STOP_WORDS attribute, sorted by popularity by default.
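Note that this attribute lives in an older module layout: the sklearn.feature_extraction.stop_words module was deprecated in scikit-learn 0.22 and removed in 0.24, and the same frozenset is exposed as sklearn.feature_extraction.text.ENGLISH_STOP_WORDS. A minimal sketch for inspecting the attribute, assuming only that scikit-learn is installed:

# Try the legacy location first, then the current one (scikit-learn >= 0.22).
try:
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
except ImportError:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print(type(ENGLISH_STOP_WORDS))     # <class 'frozenset'>
print('the' in ENGLISH_STOP_WORDS)  # True
print(len(ENGLISH_STOP_WORDS))      # 318 entries in recent releases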
Example 1: build_document_term_matrix
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def build_document_term_matrix(self):
    # Build a TF-IDF document-term matrix over the target bags of words,
    # filtering out scikit-learn's built-in English stop word list.
    self.tfidf_vectorizer = TfidfVectorizer(
        stop_words=ENGLISH_STOP_WORDS, lowercase=True,
        strip_accents="unicode",
        use_idf=True, norm="l2", min_df=Constants.MIN_DICTIONARY_WORD_COUNT,
        max_df=Constants.MAX_DICTIONARY_WORD_COUNT, ngram_range=(1, 1))
    self.document_term_matrix = \
        self.tfidf_vectorizer.fit_transform(self.target_bows)
    # Invert the vocabulary mapping (term -> column index) into an
    # index-ordered list of terms.
    vocabulary = self.tfidf_vectorizer.vocabulary_
    num_terms = len(vocabulary)
    self.terms = [""] * num_terms
    for term, index in vocabulary.items():
        self.terms[index] = term
    print("Created document-term matrix of size %d x %d" % (
        self.document_term_matrix.shape[0],
        self.document_term_matrix.shape[1]
    ))
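For comparison, passing stop_words="english" to TfidfVectorizer selects this same built-in frozenset without importing it explicitly. A minimal sketch with a toy corpus (the two documents here are illustrative, not from the original project):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "a dog chased the cat"]
vectorizer = TfidfVectorizer(stop_words="english")
matrix = vectorizer.fit_transform(docs)
print(sorted(vectorizer.vocabulary_))  # ['cat', 'chased', 'dog', 'mat', 'sat']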
Example 2: get_stopwords
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def get_stopwords():
    # Union of three common English stop word lists: NLTK, spaCy, and
    # scikit-learn. Also requires `from nltk.corpus import stopwords` and
    # `from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords`.
    nltk_stopwords = set(stopwords.words('english'))
    sklearn_stopwords = stop_words.ENGLISH_STOP_WORDS
    all_stopwords = set()
    all_stopwords |= spacy_stopwords
    all_stopwords |= nltk_stopwords
    all_stopwords |= sklearn_stopwords
    return all_stopwords
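A quick usage check, assuming the NLTK stopwords corpus has been fetched with nltk.download('stopwords'):

combined = get_stopwords()
print(len(combined))       # the union is larger than any single source list
print('the' in combined)   # True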
Example 3: _check_stop_list
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def _check_stop_list(stop):
    # Resolve the `stop` argument: the literal "english" selects the
    # built-in list, any other string is an error, None disables stop word
    # filtering, and anything else is treated as a custom collection.
    # (`six.string_types` requires `import six`.)
    if stop == "english":
        return ENGLISH_STOP_WORDS
    elif isinstance(stop, six.string_types):
        raise ValueError("not a built-in stop list: %s" % stop)
    elif stop is None:
        return None
    else:  # assume it's a collection
        return frozenset(stop)
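This helper mirrors the argument check used internally by scikit-learn's vectorizers. A short usage sketch:

print(_check_stop_list("english") is ENGLISH_STOP_WORDS)  # True
print(_check_stop_list(["foo", "bar"]))  # frozenset({'foo', 'bar'})
print(_check_stop_list(None))            # None
# _check_stop_list("german") would raise ValueError: not a built-in stop list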
Example 4: __init__
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def __init__(self):
    self.model = None
    self.spacynlp = spacy.load('en')
    # Merge spaCy's STOP_WORDS with a few contraction fragments and
    # scikit-learn's list; set unions keep the result duplicate-free and
    # work whether STOP_WORDS is a set or a list.
    self.stopwords = set(STOP_WORDS) | \
        {"n't", "'s", "'m", "ca"} | \
        set(ENGLISH_STOP_WORDS)
    self.punctuations = list(string.punctuation) + \
        ["-----", "---", "...", "'ve"]
Example 5: wordCount
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def wordCount(text):
    try:
        text = text.lower()
        # Replace punctuation, digits, and control characters with spaces
        # before splitting into candidate words.
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        txt = regex.sub(" ", text)
        # Count only words longer than 3 characters that are not stop words.
        words = [
            w
            for w in txt.split(" ")
            if w not in stop_words.ENGLISH_STOP_WORDS and len(w) > 3
        ]
        return len(words)
    except Exception:
        return 0
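A usage sketch (the sentence is illustrative):

print(wordCount("The quick, brown fox jumps over 2 lazy dogs!"))
# "the" and "over" are stop words and "fox" is only 3 characters long,
# so only "quick", "brown", "jumps", "lazy", "dogs" are counted -> 5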
Example 6: _loadSpecialWords
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def _loadSpecialWords(self):
    '''Load stop words, number prefixes, news agencies, and protest subject words.'''
    self.S_PREFIX = ['around', 'up to', 'as many as', 'some', 'many',
                     'nearly', 'more than', 'about']
    self.P_SUBJ = {
        'protest': ['protesters', 'protestors', 'demonstrators', 'activists',
                    'strikers', 'marchers', 'signatures',
                    'counter-demonstrators', 'counter-demonstraters',
                    'counter-protesters', 'counter-protestors',
                    'counterprotesters', 'counterprotestors']
    }
    self.AGW = ['Agence France-Presse, English Service',
                'Associated Press Worldstream, English Service']
    self.SWS = list(stop_words.ENGLISH_STOP_WORDS)
Example 7: normalize_text
# Required import: from sklearn.feature_extraction import stop_words [as alias]
# Or: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def normalize_text(raw_text, remove_stop_words=True, only_letters=True,
                   return_list=False, remove_one_char_words=True, **kwargs):
    '''
    Convert raw text into a clean text string.
    Method modified from code available at:
    https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
    Args:
        raw_text: Original text to clean and normalize
        remove_stop_words: Boolean value to trigger removal of stop words
        only_letters: Boolean value to trigger removal of characters that are not letters
        return_list: Boolean value to trigger return value as a list of words
        remove_one_char_words: Boolean value to trigger removal of words that are only a single character
    Returns:
        clean_text: Either a string or a list of words, filtered according to the function parameters.
    '''
    # Note: link_re, letter_re, letter_number_re, and python_stop_words are
    # module-level globals defined elsewhere in the original source file.
    # Remove web links
    clean_text = link_re.sub('', raw_text)
    # Remove HTML. Suppress UserWarnings from BeautifulSoup due to text with
    # tech info (ex: code, directory structure)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        clean_text = BeautifulSoup(clean_text, "lxml").get_text()
    # Only keep letters, or keep letters and numbers
    if only_letters:
        clean_text = letter_re.sub(" ", clean_text)
    else:
        clean_text = letter_number_re.sub(" ", clean_text)
    # Convert to lower case, split into individual words
    clean_text = clean_text.lower().split()
    # If numbers are allowed in words, remove candidate words that only contain numbers
    if not only_letters:
        clean_text = [w for w in clean_text if not all(i.isdigit() for i in w)]
    # Remove stop words
    if remove_stop_words:
        clean_text = [w for w in clean_text if w not in python_stop_words]
        clean_text = [w for w in clean_text if w not in ENGLISH_STOP_WORDS]
    # Remove words that are only a single character in length
    if remove_one_char_words:
        clean_text = [w for w in clean_text if len(w) > 1]
    # Return as string or list based on parameters
    if return_list:
        return clean_text
    return " ".join(clean_text)