

Python stop_words.ENGLISH_STOP_WORDS Attribute Code Examples

This article collects typical usage examples of the sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS attribute in Python. If you are wondering what stop_words.ENGLISH_STOP_WORDS does, how to use it, or want to see it in real code, the curated examples below should help. You can also explore other usage examples from the sklearn.feature_extraction.stop_words module.


The following presents 7 code examples of the stop_words.ENGLISH_STOP_WORDS attribute, ordered by popularity by default.
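
Before the individual examples, here is a minimal sketch of the attribute itself. Note that the sklearn.feature_extraction.stop_words module was deprecated in scikit-learn 0.22 and removed in 0.24; newer code imports ENGLISH_STOP_WORDS from sklearn.feature_extraction.text instead:

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

print(type(ENGLISH_STOP_WORDS))         # <class 'frozenset'>
print(len(ENGLISH_STOP_WORDS))          # 318 built-in English stop words
print("the" in ENGLISH_STOP_WORDS)      # True
print("protest" in ENGLISH_STOP_WORDS)  # False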

Example 1: build_document_term_matrix

# Required module: from sklearn.feature_extraction import stop_words [as alias]
# Or alternatively: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def build_document_term_matrix(self):

        self.tfidf_vectorizer = TfidfVectorizer(
            stop_words=ENGLISH_STOP_WORDS, lowercase=True,
            strip_accents="unicode",
            use_idf=True, norm="l2", min_df=Constants.MIN_DICTIONARY_WORD_COUNT,
            max_df=Constants.MAX_DICTIONARY_WORD_COUNT, ngram_range=(1, 1))
        self.document_term_matrix = \
            self.tfidf_vectorizer.fit_transform(self.target_bows)

        vocabulary = self.tfidf_vectorizer.vocabulary_
        num_terms = len(vocabulary)
        self.terms = [""] * num_terms
        for term in vocabulary.keys():
            self.terms[vocabulary[term]] = term

        print "Created document-term matrix of size %d x %d" % (
            self.document_term_matrix.shape[0],
            self.document_term_matrix.shape[1]
        ) 
Developer: melqkiades, Project: yelp, Lines of code: 22, Source: nmf_context_extractor.py
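
For context, here is the same TfidfVectorizer pattern as a standalone sketch; the documents and the plain min_df/max_df values standing in for the project's Constants are hypothetical:

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["The staff was friendly and helpful.",
        "Great food and friendly service.",
        "The food arrived cold."]

vectorizer = TfidfVectorizer(
    stop_words=ENGLISH_STOP_WORDS, lowercase=True, strip_accents="unicode",
    use_idf=True, norm="l2", min_df=1, max_df=1.0, ngram_range=(1, 1))
matrix = vectorizer.fit_transform(docs)
print("Created document-term matrix of size %d x %d" % matrix.shape)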

Example 2: get_stopwords

# Required module: from sklearn.feature_extraction import stop_words [as alias]
# Or alternatively: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
# Assumed source of spacy_stopwords (a module-level name in the original project):
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords


def get_stopwords():
    nltk_stopwords = set(stopwords.words('english'))
    sklearn_stopwords = stop_words.ENGLISH_STOP_WORDS

    all_stopwords = set()
    all_stopwords |= spacy_stopwords
    all_stopwords |= nltk_stopwords
    all_stopwords |= sklearn_stopwords

    return all_stopwords 
Developer: vineetjohn, Project: linguistic-style-transfer, Lines of code: 12, Source: lexicon_helper.py
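
With the imports sketched above in place, the combined set can be inspected directly; the exact size depends on the installed nltk, spaCy, and scikit-learn versions:

all_stops = get_stopwords()
print(len(all_stops))      # union of the three lists; size is version-dependent
print("the" in all_stops)  # True: present in all three source lists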

Example 3: _check_stop_list

# Required module: from sklearn.feature_extraction import stop_words [as alias]
# Or alternatively: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
import six


def _check_stop_list(stop):
    if stop == "english":
        return ENGLISH_STOP_WORDS
    elif isinstance(stop, six.string_types):
        raise ValueError("not a built-in stop list: %s" % stop)
    elif stop is None:
        return None
    else:  # assume it's a collection
        return frozenset(stop) 
Developer: prozhuchen, Project: 2016CCF-sougou, Lines of code: 11, Source: STFIWF.py
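
A quick illustration of the three branches, assuming _check_stop_list and ENGLISH_STOP_WORDS are in scope:

print(_check_stop_list("english") is ENGLISH_STOP_WORDS)  # True: the built-in list
print(_check_stop_list(None))                             # None: no stop-word filtering
print(_check_stop_list(["foo", "bar"]))                   # frozenset({'foo', 'bar'})
# Any other string raises ValueError, e.g. _check_stop_list("german")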

Example 4: __init__

# Required module: from sklearn.feature_extraction import stop_words [as alias]
# Or alternatively: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def __init__(self):

        self.model = None

        # spaCy 1.x/2.x shortcut name; spaCy 3+ would need spacy.load('en_core_web_sm')
        self.spacynlp = spacy.load('en')

        # list() guards against STOP_WORDS being a set (as in spacy.lang.en.stop_words),
        # where '+' would raise a TypeError
        self.stopwords = set(list(STOP_WORDS) +
                             ["n't", "'s", "'m", "ca"] +
                             list(ENGLISH_STOP_WORDS))

        self.punctuations = " ".join(string.punctuation).split(" ") + \
                            ["-----", "---", "...", "'ve"]
Developer: alfredfrancis, Project: ai-chatbot-framework, Lines of code: 14, Source: sklearn_intent_classifer.py

Example 5: wordCount

# Required module: from sklearn.feature_extraction import stop_words [as alias]
# Or alternatively: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def wordCount(text):
    try:
        text = text.lower()
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        txt = regex.sub(" ", text)
        words = [
            w
            for w in txt.split(" ")
            if w not in stop_words.ENGLISH_STOP_WORDS and len(w) > 3
        ]
        return len(words)
    except Exception:
        return 0 
Developer: modin-project, Project: modin, Lines of code: 15, Source: kaggle18.py
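
A hedged usage sketch on a modin DataFrame, in the spirit of the Kaggle kernel this snippet comes from (the same call works with plain pandas); the column name and data are hypothetical:

import modin.pandas as pd

df = pd.DataFrame({"comment_text": ["This is a short example comment!", None]})
df["word_count"] = df["comment_text"].apply(wordCount)
# Counts only tokens longer than three characters that are not stop words;
# the None row triggers the except branch and yields 0.
print(df["word_count"].tolist())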

Example 6: _loadSpecialWords

# Required module: from sklearn.feature_extraction import stop_words [as alias]
# Or alternatively: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def _loadSpecialWords(self):
        ''' Load stop words, number prefixes, news agencies, and protest subject words. '''
        self.S_PREFIX  = ['around', 'up to', 'as many as', 'some', 'many', 'nearly', 'more than', 'about']

        self.P_SUBJ   = {
            'protest': ['protesters', 'protestors', 'demonstrators', 'activists', 'strikers', 'marchers',
                        'signatures', 'counter-demonstrators', 'counter-demonstraters', 'counter-protesters',
                        'counter-protestors', 'counterprotesters', 'counterprotestors']
        }

        self.AGW = ['Agence France-Presse, English Service', 'Associated Press Worldstream, English Service']

        self.SWS = list(stop_words.ENGLISH_STOP_WORDS) 
Developer: MPEDS, Project: mpeds, Lines of code: 15, Source: open_ended_coders.py

Example 7: normalize_text

# Required module: from sklearn.feature_extraction import stop_words [as alias]
# Or alternatively: from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS [as alias]
def normalize_text(raw_text, remove_stop_words=True, only_letters=True, return_list=False, remove_one_char_words=True, **kwargs):
    '''
    Convert raw text into a clean, normalized text string.
    Method modified from code available at:
    https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
    Args:
        raw_text: Original text to clean and normalize
        remove_stop_words: Boolean value to trigger removal of stop words
        only_letters: Boolean value to trigger removal of characters that are not letters
        return_list: Boolean value to trigger return value as a list of words
        remove_one_char_words: Boolean value to trigger removal of words that are only a single character
    Returns:
        clean_text: Either a string or a list of words, filtered based on the function parameters.
    '''
    # Remove web links (link_re and the other module-level names used below are
    # defined elsewhere in the source file; see the sketch after this example)
    clean_text = link_re.sub('', raw_text)

    # Remove HTML
    # Suppress UserWarnings from BeautifulSoup due to text with tech info (ex: code, directory structure)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        clean_text = BeautifulSoup(clean_text, "lxml").get_text()

    # Only keep letters or keep letters and numbers
    if only_letters:
        clean_text = letter_re.sub(" ", clean_text)
    else:
        clean_text = letter_number_re.sub(" ", clean_text)

    # Convert to lower case, split into individual words
    clean_text = clean_text.lower().split()

    # If numbers are allowed in words, remove candidate words that only contain numbers
    if not only_letters:
        clean_text = [w for w in clean_text if not all(i.isdigit() for i in w)]

    # Remove stop words
    if remove_stop_words:
        clean_text = [w for w in clean_text if w not in python_stop_words]
        clean_text = [w for w in clean_text if w not in ENGLISH_STOP_WORDS]

    # Remove words that are only a single character in length
    if remove_one_char_words:
        clean_text = [w for w in clean_text if len(w) > 1]

    # Return as string or list based on parameters
    if return_list:
        return clean_text
    else:
        return " ".join(clean_text) 
Developer: Lab41, Project: altair, Lines of code: 52, Source: normalize_text.py
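
normalize_text relies on several module-level names defined elsewhere in normalize_text.py. The following is a hedged sketch of plausible definitions so the example can run standalone; the regex patterns and the python_stop_words set are assumptions, not the original project's values:

import re
import warnings

from bs4 import BeautifulSoup
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

link_re = re.compile(r"https?://\S+")           # web links
letter_re = re.compile(r"[^a-zA-Z]")            # anything that is not a letter
letter_number_re = re.compile(r"[^a-zA-Z0-9]")  # anything that is not a letter or digit
python_stop_words = {"def", "class", "import", "return", "self"}  # hypothetical list

print(normalize_text("Visit https://example.com <b>for the BEST tips</b> 2024!"))
# -> "visit best tips"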


Note: The sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects and remain the copyright of their original authors; refer to each project's license before using or redistributing the code, and do not republish without permission.