

Python stop_words.get_stop_words Function Code Examples

This article collects and summarizes typical usage examples of the stop_words.get_stop_words function in Python. If you are unsure exactly what get_stop_words does, how to call it, or what it looks like in real code, the hand-picked examples below should help.


A total of 15 code examples of get_stop_words are shown below, sorted by popularity by default.
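
Before the examples, here is a minimal usage sketch of the underlying API. It is not taken from any of the projects below; it assumes the stop-words package from PyPI (the python-stop-words project that also appears in Examples 2 and 4) is installed with pip install stop-words, and safe_get_stop_words is that package's fallback helper for unsupported languages:

from stop_words import get_stop_words, safe_get_stop_words

stop_words = get_stop_words('en')         # ISO language code
stop_words = get_stop_words('english')    # the full language name works as well

text = "this is a short example sentence"
content_words = [w for w in text.split() if w not in stop_words]
print(content_words)                      # e.g. ['short', 'example', 'sentence']

# safe_get_stop_words returns an empty list instead of raising
# an error when the language is not supported
print(safe_get_stop_words('klingon'))     # []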

Example 1: load_stoplist

def load_stoplist(topic_words=False, lang="en"):
    try:
        if lang == "en":
            if topic_words: return set(get_stop_words("en") + STOP_LIST + get_topic_stoplist())
            else: return set(get_stop_words("en") + STOP_LIST + stopwords.words('english'))
        elif lang == "nl":
            return set(get_stop_words("nl") + stopwords.words('dutch') + STOP_LIST_NL)
    except:
        print "warning: no stopwords were downloaded. check nltk corpora"
        print format_exc()
        return set()
Author: anukat2015, Project: taxi, Lines: 11, Source: morph.py

Example 2: test_filters

    def test_filters(self):
        language = 'en'
        before = get_stop_words(language, False)
        letter = random.choice(random.choice(before))

        def remove_letter(stopwords, language):
            return [word for word in stopwords if letter not in word]
        stop_words.add_filter(remove_letter)
        after = get_stop_words(language, False)
        for stopword in after:
            self.assertFalse(letter in stopword)
        self.assertTrue(stop_words.remove_filter(remove_letter))
Author: Alir3z4, Project: python-stop-words, Lines: 12, Source: tests.py

Example 3: get_most_freq

def get_most_freq(all_comments):
    APP_ROOT = os.path.dirname(os.path.abspath(__file__))
    APP_STATIC = os.path.join(APP_ROOT, 'static')
    file_name = os.path.join(APP_STATIC, 'freq_portugues.p')
    dict_freq = pickle.load(open(file_name, "rb" ) )

    web_stopWords = ["q","vc","vcs","tipo","ta","pra","pq","ne","sobre","ser","cara","la"]

    all_comments = remove_accents(all_comments)
    tokens = all_comments.split()

    #build token dictionary
    dict_tokens = {}
    for token in tokens:
        if token in dict_tokens:
            dict_tokens[token] += 1
        else:
            dict_tokens[token] = 1

    #build the combined stop word list
    stopWords = get_stop_words('portuguese', cache=True)
    stopWords += get_stop_words('english', cache=True)
    stopWords += web_stopWords

    #remove stop words
    for word in stopWords:
        dict_tokens.pop(remove_accents(word), None)

    #for word in dict_tokens:
    #    print(dict_tokens[token])
    #    dict_tokens[token] = 1+math.log(dict_tokens[token])

    #sorted by frequency
    sorted_tokens = sorted(dict_tokens.items(), key=operator.itemgetter(1),reverse=True)
    num_tokens = int(min(len(sorted_tokens)/2, 1000))

    sorted_tokens = sorted_tokens[0:num_tokens]

    #normalize by frequency
    standart_frequency = dict_freq["acelga"]
    for i in range(len(sorted_tokens)):
        (token,value) = sorted_tokens[i]
        if token in dict_freq:
            sorted_tokens[i] = (token, math.log(value/dict_freq[token]))
        else:
            sorted_tokens[i] = (token,math.log(value/standart_frequency))

    sorted_tokens_after = sorted(sorted_tokens,key=operator.itemgetter(1), reverse=True)
    max_num_words = 100
    sorted_tokens_after = sorted_tokens_after[0:max_num_words]

    return sorted_tokens_after
Author: FaceBattle, Project: FaceBattle-TLDR, Lines: 52, Source: text_analise.py

Example 4: test_get_stop_words_cache

    def test_get_stop_words_cache(self):
        self.assertFalse('french' in stop_words.STOP_WORDS_CACHE)
        sw = get_stop_words('fr')
        self.assertTrue('french' in stop_words.STOP_WORDS_CACHE)
        original_stop_words_dir = stop_words.STOP_WORDS_DIR
        stop_words.STOP_WORDS_DIR = 'not-existing-directory'
        self.assertEqual(sw, get_stop_words('french'))
        stop_words.STOP_WORDS_DIR = original_stop_words_dir
        try:
            get_stop_words('klingon')
        except:
            pass
        self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE)
Author: Alir3z4, Project: python-stop-words, Lines: 13, Source: tests.py

Example 5: word_list

def word_list(text):

    list = {}
    words = text.split()
    stop_words = get_stop_words('en')          # stop words is a list of common words used in English
    stop_words = get_stop_words('english')     

    words = [word for word in words if word not in stop_words]    #removing stop words

    for i in words:
        if all(j.isdigit() for j in i):     # classifying token as number feature
            if list.has_key("NUMBER"):
                list["NUMBER"]+=1
            else:
                list["NUMBER"]=1

        elif len(i) >= 4 and i[0] == 'h' and i[1] == 't' and i[2] == 't' and i[3] == 'p':
            if list.has_key("LINKS"):     # classifying token as link feature
                list["LINKS"]+=1
            else:
                list["LINKS"]=1

        elif all(j in string.punctuation for j in i):
            if list.has_key("PUNCTUATION"):        # classifing token as punctuation feature
                list["PUNCTUATION"]+=1
            else:
                list["PUNCTUATION"]=1

        elif len(i.translate(None,string.punctuation)) < 3:
            continue

        elif i.upper()==i:
            if list.has_key("CAPSLOCK"):        # classifing token as capital word feature
                list["CAPSLOCK"]+=1
            else:
                list["CAPSLOCK"]=1
        
        else:
            j = i.translate(None,string.punctuation).lower()
            if list.has_key(j):
                list[j]+=1
            else:
                list[j]=1
            
    
    
    return list
Author: saurabhanand1995, Project: Spam-Filter, Lines: 48, Source: spamFilter.py

Example 6: lemmatization_intern

def lemmatization_intern(lang, rss, result, doc):
    # Build and configure the wrapper
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=lang, TAGDIR=treetagger_path,
                                          TAGINENC='utf-8', TAGOUTENC='utf-8')

    # Usage
    tags = tagger.TagText(rss)
    data = formatTTG(tags, tagger, stop_words.get_stop_words(language=lang))

    for k in [1, 2, 3]:
        i = 0
        liste = []
        while i <= len(data) - k:
            lemma = getLemma(data[i])

            for j in range(k - 1):
                lemma += " " + getLemma(data[i + j + 1])
            if lemma not in result[k-1]:
                result[k-1][lemma] = 0
                doc[k-1][lemma] = 1
                liste += [lemma]
            elif lemma not in liste:
                doc[k-1][lemma] += 1
                liste += [lemma]

            result[k-1][lemma] += 1
            i += 1
    return result, doc
Author: Flasheur111, Project: SEO, Lines: 28, Source: lemmatization.py

Example 7: get_stopset

def get_stopset():
    """
    Gets a set of stopwords
    """
    stopset = set(get_stop_words('en'))

    # get those contractions
    add_stops = nltk.word_tokenize(' '.join(stopset))
    stopset.update(add_stops)

    # make sure to get contractions without punctuation, so that
    # order of operations doesn't matter later
    add_stops = [stopword.strip(string.punctuation)
                 for stopword in stopset]
    stopset.update(add_stops)

    # custom stop words
    add_stops = [u'lp', u'ep',
                 u'record', u'records', u'recorded',
                 u'label', u'labels',
                 u'release', u'releases', u'released',
                 u'listen', u'listens', u'listened', u'listener',
                 u'version', u'versions',
                 u'album', u'albums',
                 u'song', u'songs',
                 u'track', u'tracks',
                 u'sound', u'sounds',
                 u'thing', u'things', u'something',
                 u'music']
    stopset.update(add_stops)
    return stopset
Author: lwoloszy, Project: albumpitch, Lines: 31, Source: text_preprocess.py

Example 8: get_frequency

    def get_frequency(self):

        # Selecting all the text in the database
        cursor = self.select_content('Content')

        # Initialising variables
        words = []
        count_handle = Counter()

        # Build the list of common (stop) words to be removed before generating keywords
        sw = stop_words.get_stop_words("english")

        # Extracting all words from the given database
        for row in cursor:
            words += re.compile('\w+').findall(row[1])

        #Remove stop words from 'words' list
        words = [w.lower() for w in words if w.lower() not in sw]

        # Calculating the frequency of all words in the given database
        for w in words:
            count_handle[w] += 1

        # Writing the keywords returned into the file = category+ "_keyword.txt"
        with open(self.out, 'w') as file_name:
            for word in count_handle.most_common(self.limit):
                file_name.write(word[0]+"\t"+str(word[1])+"\n")
Author: jadeseeker, Project: Domain-Analyzer, Lines: 27, Source: getKeywords.py

Example 9: issue_analysis

def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    Issue_List=[]
    for i in range(0,50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_index(by='count', ascending=False).ix[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')    # set tokenize Reg
    en_stop = get_stop_words('en')         # create English stop words list
    p_stemmer = PorterStemmer()            # Create p_stemmer of class PorterStemmer
    texts = []                             # list for tokenized documents in loop
    text_view = ''
                                                                
    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
       
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8,6))
    fig1 = fig.add_subplot(1,1,1)
    fig1.set_title("Top issued words", fontdict={'fontsize':25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')
    
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word = dictionary)
    LDAText =  ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)
       
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
Author: choi-junhwan, Project: ConsumerComplaintsDataProject, Lines: 60, Source: Complaints_TextAnalysis.py

Example 10: cal_idf_overlap

def cal_idf_overlap():
    list_subj = utils.list_subject

    ls_distance_final = []
    ls_distance_row = []
    #print len(list_att)
    stop_words = get_stop_words('en')
    tmp_corpus = []
    for i in range(len(list_subj)):
        item = str(list_subj[i]).split(" ")
        for token in item:
            if token in stop_words:
                pass
            else:
                tmp_corpus.append(token)
    #print "corpus", corpus

    length = len(list_subj)
    for i in range(0, length):
        if i == 500 or i == 1000 or i == 1500:
            print i
        for j in range(0, length):
            print i, j
            idf_instance = IDF.IDF(str(list_subj[i]),str(list_subj[j]), tmp_corpus)
            distance = idf_instance.cal_overlap()
            ls_distance_row.append(distance)
        ls_distance_final.append(ls_distance_row)
        ls_distance_row = []

    myarray = np.asarray(ls_distance_final)
    print myarray
    Z = linkage(myarray, "ward")
    thefile = open('/Users/Aaron/test.txt', 'w')
    for item in Z:
        thefile.write("%s\n" % item)

    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
         Z,
         leaf_rotation=90.,  # rotates the x axis labels
         leaf_font_size=8.,  # font size for the x axis labels
     )
    plt.show()

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        truncate_mode='lastp',  # show only the last p merged clusters
        p=30,  # show only the last p merged clusters
        show_leaf_counts=True,  # otherwise numbers in brackets are counts
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,  # to get a distribution impression in truncated branches
    )
    plt.show()
Author: ycraaron, Project: CanonicalizationOKB, Lines: 60, Source: hac_idf_overlap.py

Example 11: getWordVector

def getWordVector(inputString):
    tokenizer = RegexpTokenizer(r'\w+\'?\w+')

    # default English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    # It is considered to be the best for finding word roots
    p_stemmer = PorterStemmer()

    raw = inputString.lower() 
    tokens = tokenizer.tokenize(raw)    

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # now POS words which are nouns, adjectives, adverbs and verbs
    pos_tagged = nltk.pos_tag(stopped_tokens)
        
    # stem tokens
    # p_stemmer.stem(i[0]) and other additions in if condition - or i[1][0] == 'R' or i[1][0] == 'V' 

    stemmed_tokens = [i[0]
                        for i in pos_tagged
                        if i[1][0] == 'N'] # or i[1][0] == 'J']

    return stemmed_tokens
Author: pralhadsapre, Project: Yelp-Project, Lines: 27, Source: TopicModeler.py

Example 12: lda_approach_one

def lda_approach_one():
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()
    # doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
    # doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
    # doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
    # doc_e = "Health professionals say that brocolli is good for your health."
    # doc_set = [doc_a, doc_b, doc_c, doc_e]
    print db.find().count()
    doc_set = [i['abstract'] for i in db.find()]
    texts = []
    for i in doc_set:
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if not i in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=4,
        id2word=dictionary,
        passes=20
    )
    print ldamodel.print_topics(10)
Author: nikhil2kulkarni, Project: biz, Lines: 27, Source: general_lda_word2vec.py

Example 13: get_corpus

def get_corpus():
    db_conn = MySQLdb.connect(host="localhost", port=8889, db="linked_reverb", user="root", passwd="root")
    cursor = db_conn.cursor()
    cursor.execute("select argument1, argument2 from linked_entity80_a")

    ls_result = []
    ls_corpus = []

    row_count = int(cursor.rowcount)
    for i in range(0, row_count):
        row = cursor.fetchone()
        ls_result.append(row)

    stop_words = get_stop_words('en')

    for i in range(len(ls_result)):
        for item in ls_result[i][0].split(" "):
            if item in stop_words:
                pass
            else:
                ls_corpus.append(item)
        for item in ls_result[i][1].split(" "):
            if item in stop_words:
                pass
            else:
                ls_corpus.append(item)

                #
                # ls_corpus.append(ls_result[i][0].split(" "))
                # ls_corpus.append(ls_result[i][1].split(" "))

    db_conn.close()
    return ls_corpus
Author: ycraaron, Project: CanonicalizationOKB, Lines: 33, Source: main.py

Example 14: convert_amazon_to_dict

def convert_amazon_to_dict(dict_field, is_text, in_fname, out_fname):
	id = 0
	num_entries = 0
	field_dict = {'':0}
	stop_words = get_stop_words('en')

	for entry in parse_amazon(in_fname):
		if entry.has_key(dict_field):
			num_entries += 1
			# if text field, parse and populate.
			if is_text:
				words = entry[dict_field].split()
				for word in words:
					stemmed_word = stem(word)
					if stemmed_word not in stop_words and stemmed_word not in field_dict:
						id += 1
						field_dict[stemmed_word] = id
			else:
				if entry[dict_field] not in field_dict:
					id += 1
					field_dict[entry[dict_field]] = id
				#printf('%s -> %d\n', entry[dict_field], id)
				#if id > 100:
				#	break
	print "num_entries:", num_entries
	print "length of field_dict:", len(field_dict)
	with open(out_fname, 'wb') as outf:
		pickle.dump(field_dict, outf)
Author: fruitfly1026, Project: tensors, Lines: 28, Source: process.py

Example 15: load_dataset

def load_dataset(dataset_file):
    """
    It is more efficient (O(1) vs. O(n)) to search a dictionary or a set
    compared to a list, as they are implemented with a hash table.
    Therefore, the dataset is kept with 2 dictionaries where
    the values are sets.
    """
    items_original_form = defaultdict(set)
    items_by_keyword_start = defaultdict(set)
    items_by_id = defaultdict(set)

    stop_words = get_stop_words('english')

    with open(dataset_file) as f:
        lines = csv.reader(f, delimiter=',')
        for line in lines:

            item_id, *descriptors = line

            # save original form (3 separate fields:
            # id, description, company name) for output
            items_original_form[item_id] = descriptors

            # create 2 dictionaries for searching:
            # 1. Key: 3 lower-case first letters of each
            # word of item descriptors. Value: item ids.
            # 2. Key: item id. Value: item descriptors in lower-case.
            descriptors_set = set(" ".join(descriptors).lower().split())
            for d in descriptors_set:
                if d not in stop_words:
                    items_by_keyword_start[d[:3]].add(item_id)
            items_by_id[item_id] = descriptors_set

    return (items_by_keyword_start, items_by_id, items_original_form)
Author: noamba, Project: query-dataset, Lines: 34, Source: search_app_improved.py


Note: The stop_words.get_stop_words examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are taken from open-source projects contributed by their respective authors; copyright remains with the original authors, and distribution and use are governed by each project's license. Do not reproduce without permission.