This article collects and summarizes typical usage examples of the stem function from the Python module stemming.porter2. If you have been wondering how exactly stem is used in Python, or what it looks like in practice, the hand-picked code examples below should help.
A total of 15 code examples of the stem function are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
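All of the examples assume the function has been imported from the stemming package, typically with "from stemming.porter2 import stem" (Example 1 instead imports the porter2 module and calls porter2.stem). As a minimal sketch of the call pattern, with illustrative words of our own choosing rather than taken from any example:

from stemming.porter2 import stem

print(stem("running"))   # "run"
print(stem("stemming"))  # "stem"
print(stem("cats"))      # "cat"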
Example 1: get_pmi
def get_pmi(self, word0, word1):
    """Return the pointwise mutual information, a measure of word
    association within a window, for two words. This is normalized
    using Bouma (2009) to avoid infinite values for OOV terms.
    """
    word0 = word0.lower()
    word1 = word1.lower()
    if self.stemming:
        word0 = porter2.stem(word0)
        word1 = porter2.stem(word1)
    if word0 not in self.word_counts or word1 not in self.word_counts:
        return -1
    # Pair counts are assumed to be stored keyed by the lexicographically
    # smaller word first.
    if word0 < word1:
        pair_counts = self.word_pair_counts[word0][word1]
    else:
        pair_counts = self.word_pair_counts[word1][word0]
    if pair_counts == 0:
        return -1
    # 'anyword' is presumably a sentinel key (defined elsewhere in the class)
    # under which the total token count is stored.
    num_words = self.word_counts[anyword]
    # TODO: confirm normalization. Currently assuming words are
    # normalized by num_words and pairs by num_words^2.
    ratio = pair_counts / (self.word_counts[word0] *
                           self.word_counts[word1])
    pmi = np.log(ratio)
    normalized_pmi = -pmi / np.log(pair_counts / (num_words * num_words))
    return normalized_pmi
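For reference, the Bouma (2009) normalization mentioned in the docstring divides PMI by the negative log of the joint probability, which bounds the score in [-1, 1]:

    npmi(x, y) = pmi(x, y) / (-log p(x, y)),  where pmi(x, y) = log(p(x, y) / (p(x) * p(y)))

In the snippet, p(x, y) is estimated as pair_counts / num_words^2, which is exactly the argument passed to np.log in the final normalization line.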
Example 2: find_collocations
def find_collocations(file_name, data, popular_word):
    text_file = open(file_name, 'r')
    file_content = text_file.read()
    most_common_words = find_most_common_words(file_content, popular_word)
    second_word = None
    third_word = None
    collocations = data
    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = trim_word(word)
            if (first_word not in most_common_words and second_word not in most_common_words) and \
                    (first_word and first_word[0].islower() and second_word and second_word[0].islower()):
                count_collocations(collocations, stem(first_word.lower()), stem(second_word.lower()))
    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    count_collocations(collocations, first_word, second_word)
    collocations = find_whole_collocations_from_stems(collocations, file_content)
    return collocations, most_common_words, file_content
Example 3: calculateScore
def calculateScore(query, qID):
    sfile = open('../AP_DATA/stoplist.txt', 'r')
    sList = sfile.read().split('\n')
    query = query.lower()
    qList = re.findall("\w+[\.?\w+]*", query)
    temp = list()
    for term in qList:
        if term.endswith('.') and term.count('.') == 1 and len(term) > 1:
            term = term.replace('.', '')
        if term.startswith('_') and term.count('_') == 1 and len(term) > 1:
            term = term.replace('_', '')
        temp.append(term)
    qList = temp
    #print index_num
    if index_num == '4':
        #print 123
        qList = [i for i in temp if i not in sList]
        temp = list()
        for term in qList:
            term = stem(term)
            temp.append(term)
        qList = temp
    if index_num == '3':
        temp = list()
        for term in qList:
            term = stem(term)
            temp.append(term)
        qList = temp
    if index_num == '2':
        qList = [i for i in temp if i not in sList]
Example 4: find_collocations_tri
def find_collocations_tri(filename):
    text_file = open(filename, 'r')
    most_common_words = find_most_common_words(text_file, 100)
    second_word = None
    third_word = None
    fourth_word = None
    collocations = dict()
    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = fourth_word
            fourth_word = trim_word(word)
            if (first_word not in most_common_words and second_word not in most_common_words and third_word not in most_common_words) and \
                    (first_word and first_word[0].islower() and second_word and second_word[0].islower() and third_word and third_word[0].islower()):
                count_collocations_tri(collocations, stem(first_word.lower()), stem(second_word.lower()), stem(third_word.lower()))
    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    third_word = fourth_word
    count_collocations_tri(collocations, first_word, second_word, third_word)
    sort_collocations_tri(collocations)
Example 5: find_collocations_penta
def find_collocations_penta(text, data, popular_word):
    most_common_words = find_most_common_words(text, popular_word)
    second_word = None
    third_word = None
    fourth_word = None
    fifth_word = None
    sixth_word = None
    collocations = data
    for word in text.split():
        first_word = second_word
        second_word = third_word
        third_word = fourth_word
        fourth_word = fifth_word
        fifth_word = sixth_word
        sixth_word = trim_word(word)
        if (first_word not in most_common_words and second_word not in most_common_words and third_word not in most_common_words and fourth_word not in most_common_words and fifth_word not in most_common_words) and \
                (first_word and first_word[0].islower() and second_word and second_word[0].islower() and third_word and third_word[0].islower() and fourth_word and fourth_word[0].islower() and fifth_word and fifth_word[0].islower()):
            count_collocations_penta(collocations, stem(first_word.lower()), stem(second_word.lower()), stem(third_word.lower()), stem(fourth_word.lower()), stem(fifth_word.lower()))
    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    third_word = fourth_word
    fourth_word = fifth_word
    fifth_word = sixth_word
    count_collocations_penta(collocations, first_word, second_word, third_word, fourth_word, fifth_word)
    return collocations, most_common_words
Example 6: tokenize_porter
def tokenize_porter(title, body):
    """Break text into words and stem them using the Porter stemmer."""
    # break up words & remove stopwords
    title_break = stopWords(nltk.word_tokenize(title), lower_case=True)
    body_break = stopWords(nltk.word_tokenize(body), lower_case=True)
    # print title_break
    return ["title:" + stem(title) for title in title_break] + ["body:" + stem(body) for body in body_break]
Example 7: ngram_in_collection
def ngram_in_collection(ngram, coll):
    """
    Check whether any of the ngram's stemmed components appear in the (stemmed) collection.
    """
    s1 = set([stem(word) for word in ngram.split(' ')])
    s2 = set([stem(word) for word in coll])
    return (len(s1.intersection(s2)) > 0)
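A quick illustration of this helper; the ngram and collection below are made-up inputs, and the result depends only on whether any stems overlap:

print(ngram_in_collection("running dogs", ["dog", "cat"]))  # True  ("dogs" stems to "dog")
print(ngram_in_collection("blue sky", ["dog", "cat"]))      # False (no shared stems)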
Example 8: tokenize
def tokenize(self):
    punc = """\\.!?,(){}[]"'"""
    wordarray = []
    for c in self.document.lower().split():
        # Note: the stop-word check stems the raw token, while the stored
        # stem has surrounding punctuation stripped first.
        if stem(c.strip()) not in self.corpus.stopwords:
            wordarray.append(stem(c.strip(punc)))
    return wordarray
Example 9: cleanText
def cleanText(text, entities, category):
    cleanText = text
    hashtags = entities.get('hashtags', [])
    ranges = []
    for hashtag in hashtags:
        if hashtag.get('text', '').lower() == category:
            indices = hashtag.get('indices')
            ranges.append(indices)
    urls = entities.get('urls', [])
    urls.reverse()
    ranges.extend([v for url in urls for k, v in url.iteritems() if k == 'indices'])
    media = entities.get('media', [])
    media.reverse()
    ranges.extend([v for medium in media for k, v in medium.iteritems() if k == 'indices'])
    ranges = sorted(ranges, key=lambda x: x[0], reverse=True)
    for r in ranges:
        cleanText = cleanText[:r[0]] + cleanText[r[1] + 1:]
    category_stem = stem(category).lower()
    cleanTextList = cleanText.split(' ')
    cleanText = []
    for word in cleanTextList:
        if category_stem not in stem(word).lower() and stem(word).lower() not in category_stem:
            cleanText.append(word)
    cleanText = " ".join(cleanText)
    return cleanText
Example 10: read
def read(self, publication_keyword, publication_data):
    words = open(publication_keyword, 'r').readlines()
    for i in range(0, self.topic_number):
        s = stem(words[i].split('\t')[0])
        self.topics[s] = dict()
        self.stemword_dict[s] = words[i].split('\t')[0]
    content = open(publication_data, 'r').readlines()
    counter = 0
    year = ''
    for i in content:
        # three lines together represent one publication
        if counter % 3000 == 0:
            print(counter / 3)
        # record the year of this publication
        if counter % 4 == 1:
            year = int(i.strip())
        # parse the keywords of this publication
        elif counter % 4 == 3:
            keywords = i.strip().split(' ')
            for j in keywords:
                j = stem(j)
                if j in self.topics:
                    if year in self.topics[j]:
                        self.topics[j][year] += 1
                    else:
                        self.topics[j][year] = 1
        counter = counter + 1
Example 11: makeFreqDictionaryOfSentenceWords
def makeFreqDictionaryOfSentenceWords(s1):
    words1 = s1.split()
    dt1 = {}
    for w in words1:
        if w.lower() not in stopwords:
            dt1[stem(w.lower())] = dt1.get(stem(w.lower()), 0) + 1
    return dt1
Example 12: sentence_matches
def sentence_matches(self, sentence_text):
    """Returns true iff the sentence contains this mention's upstream
    and downstream participants, and if one of the stemmed verbs in
    the sentence is the same as the stemmed action type."""
    has_upstream = False
    has_downstream = False
    has_verb = False
    # Get the first word of the action type and assume this is the verb
    # (e.g. get "depends" for "depends on")
    actiontype_words = word_tokenize(self.mention.actiontype)
    actiontype_verb_stemmed = stem(actiontype_words[0])
    words = word_tokenize(sentence_text)
    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.upstream.lower()):
        has_upstream = True
    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.downstream.lower()):
        has_downstream = True
    for word in words:
        if actiontype_verb_stemmed == stem(word):
            has_verb = True
    return has_upstream and has_downstream and has_verb
Example 13: getVocabularyStem
def getVocabularyStem(content):
    vocabulary = {}
    index = 0
    for i in range(len(content)):
        if stem(content[i]) not in vocabulary:
            vocabulary[stem(content[i])] = index
            index = index + 1
    return vocabulary
Example 14: main
def main():
    nlp_file = open(sys.argv[1], "r")
    for line in nlp_file:
        words = line.strip().split(" ")
        for word in words:
            print(stem(word))
    nlp_file.close()
Example 15: getSentTf
def getSentTf(sent, stopwords):
    doc = dict()
    for word in re.split("[^a-zA-Z0-9]", sent):
        word = word.lower()
        if word != "" and word != "'" and stem(word) not in stopwords:
            if doc.get(stem(word), 0) == 0:
                doc[stem(word)] = 1
            else:
                doc[stem(word)] = doc[stem(word)] + 1
    return doc
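A small, hypothetical call to illustrate the behaviour; the sentence and the stop-word set are made up, and note that stop words are matched against stems:

print(getSentTf("The cats see the running cat", {"the"}))
# expected: {'cat': 2, 'see': 1, 'run': 1}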