This article collects typical usage examples of the Python method nltk.corpus.stopwords.words. If you are struggling with questions such as how exactly to use stopwords.words, what it is good for, or how to call it in practice, the hand-picked code examples below may help. You can also explore the containing module, nltk.corpus.stopwords, for further usage examples.
Fifteen code examples of the stopwords.words method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
Example 1: pz_selective_sampling
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def pz_selective_sampling(self, pz_proba):
    """
    Selective sampling of pz (do max-sampling but prevent repeated words)
    """
    pz_proba = pz_proba.data
    z_proba, z_token = torch.topk(pz_proba, pz_proba.size(0), dim=2)
    z_token = z_token.transpose(0, 1)  # [B,Tz,top_Tz]
    all_sampled_z = []
    for b in range(z_token.size(0)):
        sampled_z = []
        for t in range(z_token.size(1)):
            for i in range(z_token.size(2)):
                if z_token[b][t][i] not in sampled_z:
                    sampled_z.append(z_token[b][t][i])
                    break
        all_sampled_z.append(sampled_z)
    return all_sampled_z
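The method above expects a PyTorch probability tensor shaped [Tz, B, V] and normally lives on a model class; since self is never used inside it, a rough standalone sketch on random data could look like this (the shapes and the placeholder self are assumptions for illustration only):

import torch

pz_proba = torch.softmax(torch.randn(5, 2, 100), dim=2)  # assumed Tz=5, B=2, V=100
sampled = pz_selective_sampling(object(), pz_proba)       # placeholder stands in for self
print(len(sampled))     # B lists
print(len(sampled[0]))  # up to Tz de-duplicated token ids per batch element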
Example 2: collocations
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        # print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
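This is the same pattern used by nltk.text.Text.collocations; assuming the NLTK stopwords and genesis corpora have been downloaded, it can be exercised roughly like this:

from nltk.corpus import genesis
from nltk.text import Text

# Requires nltk.download('genesis') and nltk.download('stopwords').
text = Text(genesis.words('english-web.txt'))
text.collocations(num=10)  # prints the 10 strongest stopword-free collocations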
Example 3: __init__
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def __init__(self,
             w=20,
             k=10,
             similarity_method=BLOCK_COMPARISON,
             stopwords=None,
             smoothing_method=DEFAULT_SMOOTHING,
             smoothing_width=2,
             smoothing_rounds=1,
             cutoff_policy=HC,
             demo_mode=False):
    if stopwords is None:
        from nltk.corpus import stopwords
        stopwords = stopwords.words('english')
    self.__dict__.update(locals())
    del self.__dict__['self']
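This initializer matches nltk.tokenize.TextTilingTokenizer: when no stopword list is supplied, the English list from nltk.corpus.stopwords is used, and all constructor arguments are stored on the instance via locals(). A rough usage sketch, following the corpus choice of NLTK's own texttiling demo (corpus downloads assumed):

from nltk.corpus import brown
from nltk.tokenize import TextTilingTokenizer

# Requires nltk.download('brown') and nltk.download('stopwords').
tt = TextTilingTokenizer(demo_mode=False)
segments = tt.tokenize(brown.raw()[:10000])  # split a long text into topical segments
print(len(segments))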
Example 4: from_words
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence. When window_size > 2, count non-contiguous bigrams, in the
    style of Church and Hanks's (1990) association ratio.
    """
    wfd = FreqDist()
    bfd = FreqDist()
    if window_size < 2:
        raise ValueError("Specify window_size at least 2")
    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        wfd[w1] += 1
        for w2 in window[1:]:
            if w2 is not None:
                bfd[(w1, w2)] += 1
    return cls(wfd, bfd, window_size=window_size)
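In NLTK this is the classmethod behind BigramCollocationFinder.from_words; a typical combination with stopwords.words looks like the following (the webtext corpus is just an illustrative choice):

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import stopwords, webtext

# Requires nltk.download('webtext') and nltk.download('stopwords').
ignored = set(stopwords.words('english'))
tokens = [w.lower() for w in webtext.words('grail.txt')]

finder = BigramCollocationFinder.from_words(tokens, window_size=2)
finder.apply_word_filter(lambda w: len(w) < 3 or w in ignored)
finder.apply_freq_filter(3)
print(finder.nbest(BigramAssocMeasures.pmi, 10))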
Example 5: extract_unigram_feats
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def extract_unigram_feats(document, unigrams, handle_negation=False):
    """
    Populate a dictionary of unigram features, reflecting the presence/absence in
    the document of each of the tokens in `unigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of words/tokens whose presence/absence has to be
        checked in `document`.
    :param handle_negation: if `handle_negation == True` apply `mark_negation`
        method to `document` before checking for unigram presence/absence.
    :return: a dictionary of unigram features {unigram : boolean}.

    >>> words = ['ice', 'police', 'riot']
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_unigram_feats(document, words).items())
    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
    """
    features = {}
    if handle_negation:
        document = mark_negation(document)
    for word in unigrams:
        features['contains({0})'.format(word)] = word in set(document)
    return features
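With handle_negation=True the document is first passed through the mark_negation helper (in NLTK, nltk.sentiment.util.mark_negation), which appends a _NEG suffix to tokens inside a negated scope, so negated variants can be listed as separate unigrams. A small hedged illustration:

from nltk.sentiment.util import extract_unigram_feats

document = "I did not enjoy this movie . it was dull".split()
feats = extract_unigram_feats(document, ['enjoy', 'enjoy_NEG', 'dull'], handle_negation=True)
print(sorted(feats.items()))
# Expected, assuming mark_negation tags tokens after "not" up to the next clause punctuation:
# [('contains(dull)', True), ('contains(enjoy)', False), ('contains(enjoy_NEG)', True)]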
Example 6: extract_bigram_feats
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
     ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
    return features

#////////////////////////////////////////////////////////////
#{ Helper Functions
#////////////////////////////////////////////////////////////
Example 7: preprocess
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def preprocess(text):
    min_length = 3
    text = re.sub(r'\d+', '#', text)
    text = re.sub(r'\.', ' eos ', text)
    # Tokenize
    words = map(lambda word: word.lower(), word_tokenize(text))
    tokens = words
    # Remove non-characters
    p = re.compile('[a-zA-Z#]+')
    # Keep alphabetic tokens of at least min_length that are not stopwords
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length
                                  and (token not in english_stopwords), tokens))
    # Encode to ascii
    filtered_tokens = [token.encode('ascii', 'ignore') for token in filtered_tokens]
    return filtered_tokens

# Modify this path
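The function above assumes a module-level english_stopwords collection plus re and word_tokenize imports; a minimal sketch of that surrounding setup (using a set for the stopwords is an assumption, chosen to keep the membership test fast):

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  # needs the 'punkt' tokenizer models

english_stopwords = set(stopwords.words('english'))  # assumed module-level global

print(preprocess("The 3 glaciers are melting. Scientists warned us in 2019."))
# e.g. [b'glaciers', b'melting', b'eos', b'scientists', b'warned', b'eos']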
Example 8: convert_string
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def convert_string(text):
    """Convert a string of text into a tokenized and filtered list of words.

    Parameters
    ----------
    text : str
        Text as one long string.

    Returns
    -------
    words_cleaned : list of str
        List of tokenized words, after processing.

    Notes
    -----
    This function sets text to lower case, and removes stopwords and punctuation.
    """
    words = word_tokenize(text)
    words_cleaned = [word.lower() for word in words if (
        (not word.lower() in stopwords.words('english')) & word.isalnum())]
    return words_cleaned
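Assuming word_tokenize and stopwords are imported at module level as the snippet implies, a quick call looks like this (note that stopwords.words('english') is re-read for every token, so callers with long texts may want to hoist it into a set first):

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  # needs the 'punkt' tokenizer models

print(convert_string("The quick brown fox, they said, jumps over the lazy dog!"))
# e.g. ['quick', 'brown', 'fox', 'said', 'jumps', 'lazy', 'dog']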
Example 9: _tokenize
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def _tokenize(self, row):
    '''
    Clean texts by removing special and non-chars,
    stem each word and remove stop words;
    return a list of tokenized words for each row.
    '''
    chars = re.sub(r'-|"|&', ' ', row)  # replace dashes, quotes, and ampersands
    chars = self.regex.sub('', chars)   # remove non-chars
    wordlist = str(chars).split()
    if self.stemming:
        # stem and remove stopwords
        wordlist = [self.stemmer.stem(word.lower()) for word in wordlist if word.lower() not in self.stop]
    else:
        wordlist = [word.lower() for word in wordlist if word.lower() not in self.stop]
    # create bigrams if enabled
    if self.bigrams:
        bigrams = []
        for i in range(len(wordlist) - 1):
            bigrams.append(wordlist[i] + " " + wordlist[i + 1])
        wordlist = wordlist + bigrams
    return wordlist
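The method relies on several attributes of self (a compiled regex, a stemmer, a stopword set, and the stemming/bigrams flags); a minimal sketch of a host class that supplies them (the class name, the regex, and the choice of SnowballStemmer are assumptions):

import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

class SimpleTokenizer:
    def __init__(self, stemming=True, bigrams=False):
        self.regex = re.compile(r'[^a-zA-Z ]')      # assumed: drop non-alphabetic chars
        self.stemmer = SnowballStemmer('english')   # assumed stemmer
        self.stop = set(stopwords.words('english'))
        self.stemming = stemming
        self.bigrams = bigrams

SimpleTokenizer._tokenize = _tokenize  # attach the method shown above

tok = SimpleTokenizer(bigrams=True)
print(tok._tokenize('Fast & reliable "word-stemming" demo'))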
Example 10: lightStemAr
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def lightStemAr(word_list):
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        word = arstemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in arstemmer.stop_words:    # exclude stop words from further processing
            word = arstemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = arstemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = arstemmer.waw(word)          # remove connective 'و' if it precedes a word beginning with 'و'
            word = arstemmer.norm(word, num=2)  # normalize the initial hamza to bare alif
        result.append(word)
    return ' '.join(result)

###################################################################################
# Combine rooting and light stemming: if the light stemming algorithm manages to reduce the word form, the light stem is returned; otherwise, the root is returned.
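ISRIStemmer ships with NLTK, so the light stemmer can be tried directly on a few Arabic tokens (the sample words are illustrative):

from nltk.stem.isri import ISRIStemmer  # already required by lightStemAr

# Roughly: "and the students", "the library"
print(lightStemAr(['والطلاب', 'المكتبة']))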
Example 11: distance_based_helper
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def distance_based_helper(self, P, Q, A):
    res = []
    U = set(stopwords.words('arabic')) & set(P)
    SQ = list(set(P) & set(Q) - U)
    for i in range(0, len(A)):
        SA = list(((set(A[i]) & set(P)) - set(Q)) - U)
        d = len(P) + 1
        if len(SQ) == 0 or len(SA) == 0:
            d = 1
        else:
            for q in SQ:
                for a in SA:
                    d = min(d, self.dist(P, q, a))
        d *= 1 / (len(P) - 1)
        res.append(d)
    return res
Example 12: get_last_words_from_parsed_title
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def get_last_words_from_parsed_title(s):
    words = s.split()
    if len(words) == 0:
        last_word = ""
        word_before_last = ""
        word2_before_last = ""
    else:
        last_word = words[len(words) - 1]
        word_before_last = ""
        word2_before_last = ""
        if len(words) > 1:
            word_before_last = words[len(words) - 2]
            if word_before_last == "and":
                word_before_last = ""
        if len(words) > 2 and word_before_last != "and":
            word2_before_last = words[len(words) - 3]
            if word2_before_last == "and":
                word2_before_last = ""
    return last_word, word_before_last, word2_before_last
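A quick illustration of the return value (the sample title is made up): the helper walks back from the end of the title and blanks out a bare "and":

print(get_last_words_from_parsed_title("metal deck and porch railing"))
# ('railing', 'porch', '')  -- the 'and' two positions from the end is blanked out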
Example 13: _use_stopwords
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def _use_stopwords(self, x):
    words = tokenizer.tokenize(x)
    words = [w for w in words if w not in eng_stopwords]
    x = " ".join(words)
    return x
Example 14: _apostrophes
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def _apostrophes(self, x):
    words = tokenizer.tokenize(x)
    words = [APOSTROPHES_WORDS[word] if word in APOSTROPHES_WORDS else word for word in words]
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if w not in eng_stopwords]
    x = " ".join(words)
    return x
Example 15: _lemmatize
# Required import: from nltk.corpus import stopwords [as alias]
# Or: from nltk.corpus.stopwords import words [as alias]
def _lemmatize(self, sent):
    words = [wn.lemmatize(word) for word in sent.split()]
    return ' '.join(words)
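Examples 13-15 assume a shared module-level setup (a tokenizer, an English stopword set, lemmatizers named lem and wn, and an APOSTROPHES_WORDS expansion map); a hedged sketch of what that context might look like, with the tokenizer choice and dictionary contents being assumptions:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# Requires nltk.download('stopwords') and nltk.download('wordnet').
tokenizer = TweetTokenizer()                      # assumed tokenizer
eng_stopwords = set(stopwords.words('english'))
lem = WordNetLemmatizer()                         # used as lem.lemmatize(word, "v")
wn = WordNetLemmatizer()                          # used as wn.lemmatize(word)
APOSTROPHES_WORDS = {"don't": "do not", "can't": "cannot", "it's": "it is"}  # illustrative subset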