当前位置: 首页>>代码示例>>Python>>正文

Python stopwords.words方法代码示例

本文整理汇总了Python中nltk.corpus.stopwords.words方法的典型用法代码示例。如果您正苦于以下问题：Python stopwords.words方法的具体用法？Python stopwords.words怎么用？Python stopwords.words使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk.corpus.stopwords的用法示例。


示例1: pz_selective_sampling

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def pz_selective_sampling(self, pz_proba):
        """Selective sampling of pz (do max-sampling but prevent repeated words).

        For every batch element and every step, the candidates are visited
        from most to least probable and the first one that has not been picked
        yet for that batch element is kept.

        :param pz_proba: score tensor; after ``topk`` on dim 2 and
            ``transpose(0, 1)`` the candidates are indexed as [B, Tz, top_Tz],
            so the input is indexed as [Tz, B, vocabulary]. Its last dimension
            must hold at least ``pz_proba.size(0)`` entries for ``topk``.
        :return: one list per batch element holding the selected token ids
            (as 0-dim tensors), one per step.
        """
        # NOTE(review): the innermost branch body and the per-batch append were
        # missing in the reviewed copy; they are restored here from the
        # docstring's contract -- confirm against the upstream project.
        pz_proba = pz_proba.data
        # Keeping Tz candidates per step guarantees an unused one always
        # exists, because at most Tz - 1 tokens were picked before any step.
        _, z_token = torch.topk(pz_proba, pz_proba.size(0), dim=2)
        z_token = z_token.transpose(0, 1)  # [B,Tz,top_Tz]
        all_sampled_z = []
        for b in range(z_token.size(0)):
            sampled_z = []
            for t in range(z_token.size(1)):
                for i in range(z_token.size(2)):
                    if z_token[b][t][i] not in sampled_z:
                        sampled_z.append(z_token[b][t][i])
                        break  # best unused candidate found for this step
            all_sampled_z.append(sampled_z)
        return all_sampled_z

示例2: collocations

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        The collocation list is cached on the instance and rebuilt only when
        it has not been computed yet or when ``num`` / ``window_size`` differ
        from the values used for the cached list.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            from nltk.corpus import stopwords
            # The filter below runs once per token, so use a set for O(1)
            # membership tests instead of scanning the stopword list each time.
            ignored_words = set(stopwords.words('english'))
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))

示例3: __init__

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def __init__(self,
        # NOTE(review): the remainder of the parameter list and most of the
        # body are missing from this excerpt; only the stopword fallback and
        # the final cleanup are visible. Restore from the upstream source.

        if stopwords is None:
            # No stopword list supplied: fall back to NLTK's English list.
            from nltk.corpus import stopwords
            stopwords = stopwords.words('english')
        # Removes the 'self' entry from the instance dict; presumably it was
        # put there by code not shown here (e.g. copying locals) -- TODO confirm.
        del self.__dict__['self'] 

示例4: from_words

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.

        :param words: the sequence of tokens to count.
        :param window_size: the number of tokens spanned by a bigram window.
        :return: ``cls(wfd, bfd, window_size=window_size)`` where ``wfd``
            counts window-initial words and ``bfd`` counts (w1, w2) pairs.
        :raises ValueError: if ``window_size`` is smaller than 2.
        """
        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        wfd = FreqDist()
        bfd = FreqDist()

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                # Nothing to count for a window that starts with a None slot.
                continue
            wfd[w1] += 1
            for w2 in window[1:]:
                # None marks a padding slot at the right edge; skip it.
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size)

示例5: extract_unigram_feats

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def extract_unigram_feats(document, unigrams, handle_negation=False):
    """
    Populate a dictionary of unigram features, reflecting the presence/absence in
    the document of each of the tokens in `unigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of words/tokens whose presence/absence has to be
        checked in `document`.
    :param handle_negation: if `handle_negation == True` apply `mark_negation`
        method to `document` before checking for unigram presence/absence.
    :return: a dictionary of unigram features {unigram : boolean}.

    >>> words = ['ice', 'police', 'riot']
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_unigram_feats(document, words).items())
    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
    """
    if handle_negation:
        document = mark_negation(document)
    # Build the lookup set once instead of once per unigram; this also keeps
    # the result correct when `document` is a one-shot iterable.
    document_tokens = set(document)
    return {
        'contains({0})'.format(word): word in document_tokens
        for word in unigrams
    }

示例6: extract_bigram_feats

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    # Materialise the document's bigrams once instead of regenerating them
    # for every candidate. A list (not a set) keeps the original equality
    # semantics, including for unhashable candidates.
    document_bigrams = list(nltk.bigrams(document))
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in document_bigrams
    return features

#{ Helper Functions

示例7: preprocess

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def preprocess(text):
	"""Normalise ``text`` and return its filtered tokens.

	Digit runs are replaced by ``#`` and every full stop by an ``eos`` marker
	before tokenizing. Tokens are lowercased and kept only when they start
	with a letter or ``#``, are at least three characters long and are not in
	``english_stopwords``.

	:param text: the raw text to process.
	:return: list of the kept tokens, each passed through
		``encode('ascii', 'ignore')``.
	"""
	min_length = 3
	# Raw strings keep the regex escapes from being read as (invalid) string escapes.
	text = re.sub(r'\d+', '#', text)
	text = re.sub(r'\.', ' eos ', text)
	# Tokenize and lowercase
	tokens = (word.lower() for word in word_tokenize(text))
	# Tokens must begin with letters or the '#' digit placeholder
	p = re.compile('[a-zA-Z#]+')
	# Filter tokens: drop short tokens, non-matching tokens and stopwords
	filtered_tokens = [
		token for token in tokens
		if p.match(token) and len(token) >= min_length and token not in english_stopwords
	]
	# Encode to ascii, silently dropping characters that cannot be represented
	return [token.encode('ascii', 'ignore') for token in filtered_tokens]

# Modify this path 

示例8: convert_string

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def convert_string(text):
    """Convert a str of text into tokenized and selected list of words.

    Parameters
    ----------
    text : str
        Text as one long string.

    Returns
    -------
    words_cleaned : list of str
        List of tokenized words, after processing.

    Notes
    -----
    This function sets text to lower case, and removes stopwords and punctuation.
    """

    # Load the stopword list a single time; calling stopwords.words() inside
    # the comprehension re-fetched the whole list for every token.
    english_stopwords = set(stopwords.words('english'))

    words_cleaned = [
        word.lower() for word in word_tokenize(text)
        if word.isalnum() and word.lower() not in english_stopwords
    ]

    return words_cleaned

示例9: _tokenize

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def _tokenize(self, row):
        """
        clean texts by removing special and non-chars
        stems each word and removes stop words
        return list of tokenized words for each row

        :param row: the raw text of one row.
        :return: list of lowercased tokens with stopwords removed (stemmed
            when ``self.stemming`` is set), followed by the space-joined
            adjacent pairs when ``self.bigrams`` is set.
        """
        chars = re.sub(r'-|&quot;|&amp;', ' ', row)  # replace dashes, quotes, and ampersands
        chars = self.regex.sub('', chars)  # remove nonchars
        # Lowercase once, then drop stopwords on both the stemming and the
        # non-stemming path (the two paths are mutually exclusive).
        lowered = [word.lower() for word in str(chars).split()]
        wordlist = [word for word in lowered if word not in self.stop]
        if self.stemming:
            wordlist = [self.stemmer.stem(word) for word in wordlist]
        # create bigrams if enabled
        if self.bigrams:
            bigrams = [first + " " + second for first, second in zip(wordlist, wordlist[1:])]
            wordlist = wordlist + bigrams
        return wordlist

示例10: lightStemAr

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def lightStemAr(word_list):
	"""Light-stem every word of ``word_list`` with the ISRI stemmer.

	Stop words are only normalised (diacritics removed) and otherwise passed
	through unchanged; every other word additionally has its prefixes,
	suffixes and connective waw stripped.

	:param word_list: iterable of words to stem.
	:return: the processed words joined by single spaces.
	"""
	result = []
	arstemmer = ISRIStemmer()
	for word in word_list:
		word = arstemmer.norm(word, num=1)  # remove diacritics which represent Arabic short vowels
		if word not in arstemmer.stop_words:   # exclude stop words from being processed
			word = arstemmer.pre32(word)        # remove length three and length two prefixes in this order
			word = arstemmer.suf32(word)        # remove length three and length two suffixes in this order
			word = arstemmer.waw(word)          # remove connective ‘و’ if it precedes a word beginning with ‘و’
			word = arstemmer.norm(word, num=2)       # normalize initial hamza to bare alif
		# Collect every word (stop words included); without this the function
		# always returned an empty string.
		result.append(word)
	return ' '.join(result)


# combine rooting and light stemming: if the light stemming algorithm manages to reduce the word form, then the light stem is returned; otherwise, the root is returned

示例11: distance_based_helper

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def distance_based_helper(self, P, Q, A):
        """Compute one normalised distance score per candidate in ``A``.

        Arabic stopwords are ignored. For each candidate, the score is the
        smallest ``self.dist(P, q, a)`` over the words ``q`` shared by ``P``
        and ``Q`` and the words ``a`` shared by ``P`` and the candidate but
        absent from ``Q``; it is 1 when either word set is empty. The score is
        then scaled by ``1 / (len(P) - 1)``.

        :param P: sequence of tokens (the text the distances are measured in).
        :param Q: sequence of tokens compared against ``P``.
        :param A: sequence of candidates, each a sequence of tokens.
        :return: list with one scaled score per candidate, in order.
        :raises ZeroDivisionError: if ``P`` holds exactly one token.
        """
        # NOTE(review): the ``else`` branch and the ``res.append`` were missing
        # in the reviewed copy (the distance loop only ran when it had nothing
        # to iterate and the result list stayed empty) -- confirm upstream.
        passage_words = set(P)
        U = set(stopwords.words('arabic')) & passage_words
        SQ = list((passage_words & set(Q)) - U)
        res = []
        for candidate in A:
            SA = list(((set(candidate) & passage_words) - set(Q)) - U)
            if not SQ or not SA:
                d = 1
            else:
                # len(P) + 1 acts as an upper bound for the minimum search.
                d = len(P) + 1
                for q in SQ:
                    for a in SA:
                        d = min(d, self.dist(P, q, a))
            d *= 1 / (len(P) - 1)
            res.append(d)
        return res

示例12: get_last_words_from_parsed_title

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def get_last_words_from_parsed_title(s):
    # NOTE(review): only the branch headers survive in this excerpt; the
    # statements that define `words`, `last_word`, `word_before_last` and
    # `word2_before_last` are not visible, so the block cannot run as shown.
    # Restore the missing bodies from the upstream source before use.
    if len(words)==0:
        if len(words)>1:
            if word_before_last=="and":
            if len(words)>2 and word_before_last!="and":
                if word2_before_last=="and":
    # Returns the three names as a tuple, in this order.
    return last_word, word_before_last, word2_before_last 

示例13: _use_stopwords

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def _use_stopwords(self, x):
        """Return ``x`` re-joined with single spaces after dropping stopwords.

        ``x`` is split with the module-level ``tokenizer``; tokens found in
        ``eng_stopwords`` are discarded.
        """
        kept_tokens = []
        for token in tokenizer.tokenize(x):
            if token in eng_stopwords:
                continue
            kept_tokens.append(token)
        return " ".join(kept_tokens)

示例14: _apostrophes

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def _apostrophes(self, x):
        """Expand apostrophe forms, lemmatize and strip stopwords from ``x``.

        Each token produced by the module-level ``tokenizer`` is replaced by
        its ``APOSTROPHES_WORDS`` entry when one exists, lemmatized with the
        ``"v"`` argument (presumably the verb part of speech -- confirm), and
        kept only if the result is not in ``eng_stopwords``. The kept tokens
        are joined with single spaces.
        """
        kept_tokens = []
        for token in tokenizer.tokenize(x):
            expanded = APOSTROPHES_WORDS[token] if token in APOSTROPHES_WORDS else token
            lemma = lem.lemmatize(expanded, "v")
            if lemma not in eng_stopwords:
                kept_tokens.append(lemma)
        return " ".join(kept_tokens)

示例15: _lemmatize

# 需要导入模块: from nltk.corpus import stopwords [as 别名]
# 或者: from nltk.corpus.stopwords import words [as 别名]
def _lemmatize(self, sent):
        """Lemmatize every whitespace-separated token of ``sent``.

        Tokens are passed one by one to ``wn.lemmatize`` and the results are
        joined back together with single spaces.
        """
        lemmas = []
        for token in sent.split():
            lemmas.append(wn.lemmatize(token))
        return ' '.join(lemmas)
