This article collects typical usage examples of the Python function nltk.corpus.stopwords.words. If you have been wondering what exactly the words function does, how to call it, or where to find usage examples, the hand-picked code samples below should help.
The following presents 15 code examples of the words function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
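Before the examples, here is a minimal sketch of the basic API (illustrative only, not taken from any of the examples below): the stopword corpus has to be downloaded once, stopwords.fileids() lists the available languages, and stopwords.words(lang) returns a plain Python list of lowercase words.

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')                        # one-time download of the stopword corpus
print(stopwords.fileids())                        # available languages, e.g. 'english', 'dutch', 'portuguese'
english_stops = set(stopwords.words('english'))   # words() returns a list; a set gives fast lookups
print('the' in english_stops)                     # True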
Example 1: get_stopwords
def get_stopwords(include_trectext_syntax=True):
    ignore_words = ['<doc>', '</doc>', '<text>', '</text>']
    ignore_words.extend(stopwords.words('english'))
    ignore_words.extend(stopwords.words('dutch'))
    return set(ignore_words)
Example 2: find_opinions
def find_opinions(tokens, feature, feat, id):
    fg = 0
    for opinion in tokens:
        if opinion[0] == 'advmod' or opinion[0] == 'neg':
            if opinion[3].lower() in stopwords.words('english'):
                continue
            # endif
            if feature[1:3] == opinion[1:3]:
                fg = 1
                modifier_set.add(opinion[3])
                if id != -1:
                    mods[id].append(opinion[3])
                feat.write(
                    feature[3] + ' ' + feature[1] + ' ' +
                    opinion[3] + '\n')
            # endif
        # endif
        elif opinion[0] == 'dep':
            if opinion[3].lower() in stopwords.words('english'):
                continue
            # endif
            if feature[1:3] == opinion[1:3]:
                opinions_set.add(opinion[3])
                find_opinions(
                    tokens, ['nsubj', opinion[3], opinion[4], feature[3],
                             feature[4]], feat, -1)
        # endelif
    # endfor
    if fg == 0:
        feat.write(feature[3] + ' ' + feature[1] + '\n')
Example 3: find_features
def find_features(tokens, feat):
    i = 0
    for feature in tokens:
        if feature[0] == 'nsubj':
            if feature[3].lower() in stopwords.words('english'):
                continue
            if feature[1].lower() in stopwords.words('english'):
                continue
            if not valid_feature(tokens, feature):
                continue
            # endif
            mods.append([])
            features_set.add(feature[3])
            opinions_set.add(feature[1])
            find_opinions(tokens, feature, feat, len(mods) - 1)
            if i != 0:
                if tokens[i - 1][0] == 'nsubj' and tokens[i - 1][3:5] == feature[3:5]:
                    for mod in mods[len(mods) - 2]:
                        if mod not in mods[len(mods) - 1]:
                            mods[len(mods) - 1].append(mod)
                            feat.write(
                                feature[3] + ' ' + feature[1] + ' ' + mod + '\n')
            # endif
        i = i + 1
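Note that Examples 2 and 3 call stopwords.words('english') once per token inside their loops; the call returns a list, and an in test against a list is a linear scan. A minimal variant (the helper name is invented for illustration) caches the words in a set built once:

from nltk.corpus import stopwords

ENGLISH_STOPS = set(stopwords.words('english'))   # built a single time

def is_stopword(token):
    # constant-time membership test against the cached set
    return token.lower() in ENGLISH_STOPS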
Example 4: extract_bigrams
def extract_bigrams(self, text):
    text = self.remove_return_lines_and_quotes(text)
    bigrams = []
    st = PorterStemmer()
    stop = stopwords.words('english')
    more_stop_words = [
        '(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...']
    stop = stop + more_stop_words
    tokens = st.stem(text)
    tokens = nltk.word_tokenize(tokens.lower())
    tokens = [i for i in tokens if i not in stop]
    tokens = [word for word in tokens if len(word) > 2]
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(2)
    top_bigrams = finder.nbest(bigram_measures.pmi, 1000)
    for bg in top_bigrams:
        bg = " ".join(bg)
        tag = nltk.pos_tag([bg])[0]
        if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']:
            bigrams.append(tag[0])
    return bigrams
Example 5: clean
def clean(self, raw):
    letters_only = re.sub("[^a-zA-Z#@]", " ", raw)
    words = letters_only.split()
    for i in range(0, len(words)):
        if "#" in words[i]:
            s = words[i].split('#')
            words[i] = '# '.join(s)
        if "@" in words[i]:
            s = words[i].split('@')
            words[i] = '@ '.join(s)
        if "http" in words[i]:
            s = words[i].split('http')
            words[i] = "http".join(s)
    total_stop_words = set(stopwords.words("english"))
    removed_stop_words = set(stopwords.words("english")[0:20])
    stop_words = total_stop_words - removed_stop_words
    content_words = [w for w in words if not w in stop_words]
    return " ".join(content_words)
Example 6: removeStopWords
def removeStopWords(tokens, lang):
    filteredToken = tokens
    if lang == 'en':
        filteredToken = [w for w in tokens if not w in stopwords.words('english')]
    elif lang == 'es':
        filteredToken = [w for w in tokens if not w in stopwords.words('spanish')]
    return filteredToken
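A quick usage sketch for the helper above (the input tokens are invented for illustration):

tokens = ['this', 'is', 'a', 'simple', 'example']
print(removeStopWords(tokens, 'en'))   # expected: ['simple', 'example']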
Example 7: frequencounting4Up
def frequencounting4Up(Listings):
    """
    Get the keyword counts and the rank of the keywords.
    :param Listings: the input list of tweets
    :return: a list of tuples ranked by word count
    """
    MyCounter = Counter()
    chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&',
             '*', '(', ')', ' - ', '_', '+', '=', '@', ':', '\\', ',',
             ';', '~', '`', '<', '>', '|', '[', ']', '{', '}', '-', '"', '&', 'rt']
    UpdatingChars = ['&', 'rt', '', '#dctraffic', '#mdtraffic', '#vatraffic', 'amp', '-']
    # The section below filters common English words and punctuation out of the target tweets.
    for line in Listings:
        if type(line) is str:
            for word in line.strip().lower().split():
                if PunkRemovement(word.strip().lower()) not in UpdatingChars + stopwords.words(
                        'english') and not word.isdigit():
                    if len(word) > 1:
                        MyCounter[PunkRemovement(word.strip().lower())] += 1
        else:
            for word in line.text.decode('UTF-8').strip().lower().split():
                if PunkRemovement(word.strip().lower()) not in chars + stopwords.words('english'):
                    MyCounter[PunkRemovement(word.strip().lower())] += 1
    return MyCounter.most_common()
Example 8: freqgen_word
def freqgen_word(word):
    connect(word)
    # get english and french stopwords
    stopen = stopwords.words('english')
    stopfr = stopwords.words('french')
    #stopsp = stopwords.words('spanish')
    query = {}
    projection = {"text": 1}
    cursor = db.Tweetfind.find(query, projection)
    texts = pandas.Series(list(cursor))
    tokens = []
    for text in texts.values:
        tokens.extend([word.lower().strip(':;,#."-\'!') for word in text['text'].split()])
    filtered_tokens = []
    st = ['&', ' ', 'it\'s', 'haven\'t', 'can\'t', 'don\'t', 'i\'m', 'i\'ve', 'i\'ll', 'i\'d',
          '#', 'e', '@', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
          '14', '15', '16', '17', '18', '19', '20', 'rt', '(', ')']
    for word in tokens:
        try:
            if (not word.decode('utf-8') in stopen) and (not word.decode('utf-8') in stopfr):
                if not word in st:
                    filtered_tokens.append(word.decode('utf-8'))
        except:
            pass
    freq_dist = nltk.FreqDist(filtered_tokens)
    print type(freq_dist)
    #print freq_dist.plot(25)
    return freq_dist
Example 9: pre_process
def pre_process(self, text):
    for i in range(len(text)):
        text[i] = text[i].replace("-", " ")
        word_list = text[i].encode('ascii', 'ignore').lower().split(" ")
        processed_text = []
        count = 0
        for word in word_list:
            if word in stopwords.words('english'):
                continue
            if re.match('@\w+', word):
                continue
            if re.match('#\w+', word):
                continue
            word = re.sub('[0-9]+', 'gotNumber', word)
            word = re.sub('http(s)?.+', 'gotURL', word)
            word = re.sub('[^a-zA-Z0-9]', ' ', word)
            words = word.split(' ')
            for w in words:
                if w is not ' ' and len(w) > 1 and w not in stopwords.words('english'):
                    w = self.sno.stem(w)
                    processed_text.append(w)
                    count += 1
                    print '. ',
                    if count == 11:
                        print ''
                        count = 0
        text[i] = processed_text
        print ''
    return text
Example 10: fuzzer
def fuzzer(localstring, dbpstring):
    lwl = localstring.replace('-', '').replace(',.', '').split()
    lfwl = [w for w in lwl if not w in stopwords.words('english')]
    dwl = dbpstring.replace('-', '').split()
    dfwl = [w for w in dwl if not w in stopwords.words('english')]
    ratio = fuzz.token_sort_ratio(str(lfwl), str(dfwl))
    return ratio
Example 11: clean_total_words
def clean_total_words(data):
    all_text = list()
    for i in range(len(data)):
        all_text.append(data[i]['text'])
    words = list()
    for i in range(len(all_text)):
        words.append(nltk.word_tokenize(all_text[i]))
    wordss = list(itertools.chain.from_iterable(words))
    word_after_clean = list()
    for i in range(len(words)):
        wordss[i] = wordss[i].lower()
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
    for i in range(len(wordss)):
        if wordss[i] not in stop_words:
            word_after_clean.append(wordss[i])
    word_clean = list()
    for i in range(len(word_after_clean)):
        if word_after_clean[i].isalpha() == True:
            word_clean.append(word_after_clean[i])
    word_clea = list()
    for i in range(len(word_clean)):
        word_clea.append(word_clean[i].lower())
    stop_words = set(stopwords.words('english'))
    word_c = list()
    for i in range(len(word_clea)):
        if word_clea[i] not in stop_words:
            word_c.append(word_clea[i])
    return word_c
Example 12: extract_features
def extract_features(self, article, feats, threegram_sent_ppl, fourgram_sent_ppl,
                     fivegram_sent_ppl, sixgram_sent_ppl, index=None):
    featureSet = {}
    articleWords = article.replace("<s>", "").replace("</s>", "").split()
    featureSet["articlelen"] = len(articleWords)
    fx_words = [word for word in articleWords if word.lower() in stopwords.words('english')]
    featureSet["fxwordcount"] = len(fx_words) / len(articleWords)
    non_words = [word for word in articleWords if word.isalpha() != True]
    featureSet["nonwordcount"] = len(non_words) / len(articleWords)
    content_words = [word for word in articleWords if word.lower() not in stopwords.words('english')]
    featureSet["contentwordcount"] = len(content_words) / len(articleWords)
    featureSet["uniquewords"] = len(set(articleWords)) / len(articleWords)
    featureSet.update(feats)
    try:
        sents = [x for x in article.split("\n") if len(x) > 1]
        ppl_five = ppl_wrangling(sents, fivegram_sent_ppl)
        ppl_six = ppl_wrangling(sents, sixgram_sent_ppl)
        ppl_three = ppl_wrangling(sents, threegram_sent_ppl)
        ppl_four = ppl_wrangling(sents, fourgram_sent_ppl)
        featureSet["ppl-5"] = ppl_five
        featureSet["ppl-6"] = ppl_six
        featureSet["ppl-3"] = ppl_three
        featureSet["ppl-4"] = ppl_four
    except:
        pass
    featureSet.update(self.posTags(index, article))
    return featureSet
Example 13: evaluate_html
def evaluate_html(content, html_conf):
    fdist = FreqDist()
    if html_conf['usehtml'] == False:
        logging.info('Discarding HTML tags')
        return fdist
    logging.info("\tEvaluating HTML")
    # try with TITLE tag
    titles = re.findall("<title>[A-Za-z0-9 ]+</title>", content)
    for title in titles:
        root = etree.fromstring(title)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)
        for i in range(html_conf['title']):
            fdist.update(stems)
    # try with H1 tag
    headers = re.findall("<h1>[A-Za-z0-9 ]+</h1>", content)
    for header in headers:
        root = etree.fromstring(header)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)
        for i in range(html_conf['h1']):
            fdist.update(stems)
    return fdist
Example 14: palavrasChaves
def palavrasChaves(self):
    # NLTK call that returns the stopwords for English
    stopE = stopwords.words('english')
    # NLTK call that returns the stopwords for Portuguese
    stop = stopwords.words('portuguese')
    stopS = stopwords.words('spanish')
    palavrasChaves = []
    textoArtigo = []
    # strip punctuation from the title and split it into words
    for i in self.titulo.lower().replace(',', '').replace('.', '').replace('-', '').replace('(', '').replace(')', '').split():
        # drop Portuguese stopwords from the text of the article being presented
        if i not in stop:
            # drop English stopwords from the text of the article being presented
            if i not in stopE:
                # drop Spanish stopwords as well
                if i not in stopS:
                    # ignore words shorter than 3 characters, e.g. the verb "é"
                    if len(i) > 2:
                        textoArtigo.append(i)
    # frequency of each word in the body of the article
    freq = FreqDist(textoArtigo)
    # take the four most frequent words
    items = freq.items()[:4]
    # put the most frequent words into palavrasChaves
    for i in range(0, len(items)):
        palavrasChaves.append(items[i][0])
    return palavrasChaves
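Example 14 relies on the Python 2-era NLTK behaviour in which FreqDist.items() came back sorted by decreasing frequency, so freq.items()[:4] picked the four most frequent words. Under NLTK 3, where FreqDist subclasses collections.Counter, the equivalent is most_common. A minimal standalone sketch (the sample title string is invented for illustration):

from nltk import FreqDist
from nltk.corpus import stopwords

stops = set(stopwords.words('portuguese')) | set(stopwords.words('english')) | set(stopwords.words('spanish'))
titulo = "um exemplo de titulo de artigo sobre processamento de linguagem natural"  # invented sample title
tokens = [w for w in titulo.lower().split() if w not in stops and len(w) > 2]
freq = FreqDist(tokens)
keywords = [word for word, count in freq.most_common(4)]   # NLTK 3 replacement for freq.items()[:4]
print(keywords)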
Example 15: word_standardize
def word_standardize(sentences):
    tokens = []
    sentences_st = []
    for sent in sentences:
        tokens.extend(word_tokenize(sent))
        sentences_st.append(word_tokenize(sent))
    words = tokens
    st = LancasterStemmer()
    words = [w.lower() for w in words]
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
    st_words = [st.stem(w) for w in words]
    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if not w in stopwords.words('english')]
        sent = [w for w in sent if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        sent_result.append(sent)
    return st_words, sent_result