

Python PorterStemmer.stem_word Method Code Examples

This article collects typical usage examples of the Python method nltk.stem.porter.PorterStemmer.stem_word. If you are wondering what PorterStemmer.stem_word does, how to use it, or where to find real-world examples, the curated code samples below should help. You can also explore further usage examples of nltk.stem.porter.PorterStemmer, the class this method belongs to.


The following shows 15 code examples of PorterStemmer.stem_word, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
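Before diving into the examples, here is a minimal sketch of the method itself. Note that stem_word belongs to the older NLTK 2.x API; in NLTK 3.x it was removed in favor of stem, so adjust the call on a modern install.

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
print stemmer.stem_word('running')  # -> 'run' (NLTK 2.x API)
# On NLTK 3.x, call stemmer.stem('running') instead.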

Example 1: __init__

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
	def __init__(self, words, is_binary=False):
		self._keywords = words
		self._stemmed_keywords = []
		stemmer = PorterStemmer()
		for word in words:
			self._stemmed_keywords.append(stemmer.stem_word(word))
		self._is_binary = is_binary
Author: sebschu, Project: cs224u-fp, Lines: 9, Source: learning2.py
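Example 1 stems the keyword list once at construction time, so later comparisons can be done directly on stemmed tokens. A self-contained sketch of the same pattern (the KeywordMatcher name and the sample words are illustrative, not from the original project):

from nltk.stem.porter import PorterStemmer

class KeywordMatcher(object):
    def __init__(self, words):
        stemmer = PorterStemmer()
        # stem every keyword up front so matching is a simple set lookup
        self._stemmed_keywords = set(stemmer.stem_word(w) for w in words)

matcher = KeywordMatcher(['running', 'jumps'])
print 'run' in matcher._stemmed_keywords  # True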

Example 2: extract

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
	def extract(self, line):
		"""
		Find co-occurring word pairs and count how many word pairs in the line sit at the minimum distance.
		"""
		words = self.tokenize(line.lower())
		count = 0.0
		stemmer = PorterStemmer()
		bad_indices = [] 
		you_indices = [] 
		for i in range(len(words)):
			word = words[i] 
			if word in self._youwords: 
				you_indices.append(i)
			word = stemmer.stem_word(word)
			if word in self._stemmed_badwords or self.isWordPartOf(word, self._badwords):
				bad_indices.append(i)

		if not bad_indices or not you_indices: 
			return [-1]
		else: 
			#print line 
			#print bad_indices
			#print you_indices
			distances = [] 
			for bindex in bad_indices:
				for yindex in you_indices: 
					distances.append(abs(bindex - yindex))
			#print distances
			mn = min(distances)
			count = sum([1 for d  in distances if d == mn])
			#return [(count *1.0)* mn/len(line)]		
			return [1]
Author: sebschu, Project: cs224u-fp, Lines: 35, Source: learning2.py
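The core of Example 2 is measuring the minimum distance between two lists of token positions. That step in isolation, with made-up index lists:

bad_indices = [2, 9]
you_indices = [4, 5]

distances = [abs(b - y) for b in bad_indices for y in you_indices]
mn = min(distances)                           # smallest separation: 2
count = sum(1 for d in distances if d == mn)  # pairs achieving it: 1
print mn, count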

Example 3: get_stemmed_terms_list

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
def get_stemmed_terms_list(doc, stem_words_map = None, stem_bigrams_map = None):
    ps = PorterStemmer()
    local_map = dict()
    word_list = []

    clean_doc = [(w.strip()).lower() for w in doc.split() if len(w) in range(3,16)]
    filtered_words = [w.strip('.,;?!:)(#') for w in clean_doc if not w.strip('.,;?!:)(#') in stopwords.words('english')]

    for w in filtered_words:
        if w.isalpha():
            w_temp = ps.stem_word(w)
            # always record a surface form so the bigram lookup below cannot KeyError
            local_map[w_temp] = w
            if stem_words_map is not None:
                if w_temp not in stem_words_map:
                    stem_words_map[w_temp] = dict()
                stem_words_map[w_temp][w] = stem_words_map[w_temp].get(w, 0)+1
            word_list.append(w_temp)

    bigrams = nltk.bigrams(word_list)
    for b in bigrams:
        bigram_org = (local_map[b[0]],local_map[b[1]])
        if stem_bigrams_map is not None:
            if b not in stem_bigrams_map:
                stem_bigrams_map[b] = dict()
            stem_bigrams_map[b][bigram_org] = stem_bigrams_map[b].get(bigram_org, 0)+1

    return word_list, bigrams
Author: Sandy4321, Project: CrisisLex, Lines: 29, Source: read.py
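Example 3 threads optional dictionaries through the call that map each stem back to the surface forms (with counts) that produced it. A small standalone illustration of that data structure:

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
stem_words_map = {}
for w in ['running', 'runs', 'runner']:
    s = ps.stem_word(w)
    stem_words_map.setdefault(s, {})
    stem_words_map[s][w] = stem_words_map[s].get(w, 0) + 1
print stem_words_map  # e.g. {'run': {'running': 1, 'runs': 1}, 'runner': {'runner': 1}}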

Example 4: _getFeatures

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
 def _getFeatures(self, corpus):
     stemmer = PorterStemmer()
     tokens = corpus.split(" ")
     features = filter(lambda x: len(x) > 1, tokens)
     
     finalList = [] 
     for feature in features :
         feature = re.sub("[^a-zA-Z0-9']", "", feature.lower())
         finalList.append(stemmer.stem_word(feature))
         
     return finalList
Author: dannieb, Project: Common-Substring, Lines: 13, Source: __init__.py

Example 5: extract

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
	def extract(self, line):
		
		words = self.tokenize(line.lower())
		count = 0.0
		stemmer = PorterStemmer()
		for word in words:
			word = stemmer.stem_word(word)
			if word in self._stemmed_keywords:
				count += 1
		if self._is_binary:
			return [1] if count > 0 else [0]
		else:
			return [count]
Author: sebschu, Project: cs224u-fp, Lines: 15, Source: learning3.py

Example 6: getFeatures

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
 def getFeatures(self, corpus):
     stemmer = PorterStemmer()
     stems = FreqDist()
     onlyLettersNumbers = re.compile('[^a-zA-Z0-9%!]')
     corpus = onlyLettersNumbers.sub(' ', corpus.lower())
     corpus = TreebankWordTokenizer().tokenize(corpus)
     
     count = 0
     for word in corpus :
         if not stopwords.STOP_WORDS.get(word) and len(word.strip()) > 1 :
             stems.inc(stemmer.stem_word(word))
             count += 1
             if self.__maxFeatures > 0 and count >= self.__maxFeatures :
                 break
             
     features = stems.samples()
     
     return features
Author: artscoop, Project: django-classifier, Lines: 20, Source: __init__.py
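Example 6 depends on the NLTK 2.x FreqDist API (FreqDist.inc and FreqDist.samples were removed in NLTK 3.x). On a current NLTK, the same stem counting can be sketched with collections.Counter (the original's stopword filtering is omitted for brevity):

from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer

stemmer = PorterStemmer()
tokens = TreebankWordTokenizer().tokenize('dogs barking at other dogs')
stems = Counter(stemmer.stem(w) for w in tokens if len(w.strip()) > 1)
print stems.most_common(3)  # ('dog', 2) ranks first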

Example 7: __init__

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]

#......... part of the code omitted .........
            if self.isVariable(tag_frame[x]):
                if tag_frame[x] in groundings.keys():
                    output[x] = groundings[tag_frame[x]]

        return output
    
    def matchTemplate(self, rule_set, callback):
        output_stack = []
        for key, value in rule_set.items():
            match_rule = callback(value)
            if match_rule[0]:
                output = self.replaceOutput(match_rule, value)
                output_stack.append((key, output))
                
            self.reset()
                
        return output_stack
        
    def matchRuleSet(self, ruleSet):
        output_stack = []
        for key, value in ruleSet.items():
            match_list = self.matchRule(value)
            if match_list:
                output = self.replaceOutput(match_list, value)
                output_stack.append((key, output))
                
            self.reset()
            
        return output_stack
    
    def lemma(self, word, grounding):
        if self.isVariable(word) and self.Grounds.isGround(word):
            groundedWord = self.Grounds.getGrounding(word)
            stemmed = self.stemmer.stem_word(groundedWord)
            self.Grounds.groundVariable(grounding, stemmed)
            
        elif not self.isVariable(word):
            stemmed = self.stemmer.stem_word(word)
            self.Grounds.groundVariable(grounding, stemmed)

    def compareRule(self, tag, var_1, var_2):
        ## check to see if we are ground and it is a var
        if self.isVariable(var_1) and self.Grounds.isGround(var_1):
            out = self.compareGround(var_1, 0)
            if not out:
                return False
            
        ## otherwise we need to ground it
        elif self.isVariable(var_1) and not self.Grounds.isGround(var_1):
            self.Grounds.groundVariable(var_1, self.current[0])
     
        if self.isVariable(var_2) and self.Grounds.isGround(var_2):
            out = self.compareGround(var_2, 2)
            if not out:
                return False
            
        elif self.isVariable(var_2) and not self.Grounds.isGround(var_2):
            self.Grounds.groundVariable(var_2, self.current[2])
            
        if tag == '$prep':
            ## XXX: bad way of doing this
            return True
        
        if self.isVariable(tag) and self.Grounds.isGround(tag):
            self.compareGround(tag, 1)
                
Author: bluemoon, Project: Godel, Lines: 69, Source: rule_engine.py

Example 8: get_word_counts

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
def get_word_counts( solr, fq, query, num_words, field='sentence' ) :
    print query

    print str(time.asctime())

    start_time = time.time()

    function_start_time = start_time
    
    results = fetch_all( solr, fq, query, 'sentence' )
    print "got " + query
    print len( results )
    print time.asctime()

    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )

    start_time = end_time

    print 'converting to utf8 and lowercasing'
    sentences = [ result['sentence'].lower() for result in results ]

    results = None

    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )

    start_time = end_time


    print 'calculating non_stemmed_wordcounts'
    term_counts = non_stemmed_word_count( sentences )

    if '' in term_counts:
        del term_counts['']

    print "Returned from non_stemmed_word_count"
    print time.asctime()
    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )

    start_time = end_time
    print "freeing sentences "
    sentences = None
    
    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )


    start_time = end_time

    print 'stemming and counting'

    stem_counts = collections.Counter()

    st = PorterStemmer()
    for term in term_counts.keys():
        #ipdb.set_trace()
        stem = st.stem_word( term )
        stem_counts[ stem ] += term_counts[ term ]


    end_time = time.time()
    print "done stemming and counting "
    print "time {}".format( str(end_time - start_time) )

    start_time = end_time

    print 'calculating stem to term map'
    stem_to_terms = {}
    for term in term_counts.keys():
        stem = st.stem_word( term )
        if stem not in stem_to_terms:
            stem_to_terms[ stem ] = []

        stem_to_terms[stem].append( term )

    print "done calcuating stem to term map "
    print "time {}".format( str(end_time - start_time) )


    counts = stem_counts.most_common( num_words )

    ret = [ ]
    for stem, count in counts:
        if len( stem_to_terms[ stem ] ) < 2:
            term = stem_to_terms[ stem][0]
        else:
            best_count = 0
            for possible_best in stem_to_terms[ stem ] :
                if term_counts[ possible_best ] > best_count:
                    term = possible_best
                    best_count = term_counts[ possible_best ]

        ret.append( 
            { 'stem': stem, 
              'term': term,
              'count': count
              } )

#......... part of the code omitted .........
Author: Spencerx, Project: mediacloud, Lines: 103, Source: solr_in_memory_wordcount_stemmed.py
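Example 8 counts by stem but reports the most frequent original term for each stem, which keeps the output human-readable. The selection step in isolation, with toy counts:

from nltk.stem.porter import PorterStemmer

term_counts = {'running': 5, 'runs': 2, 'cat': 3}
st = PorterStemmer()

stem_to_terms = {}
for term in term_counts:
    stem_to_terms.setdefault(st.stem_word(term), []).append(term)

for stem, terms in stem_to_terms.items():
    best = max(terms, key=lambda t: term_counts[t])  # most common surface form
    print stem, '->', best  # run -> running, cat -> cat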

Example 9: Namespace

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]

#......... part of the code omitted .........
                compdict[uri] = label.toPython().strip().lower()
        if (None, URIRef("http://www.w3.org/2000/01/rdf-schema#label"), None) in graph:
            for uri, label in graph.subject_objects(
                   URIRef("http://www.w3.org/2000/01/rdf-schema#label")):
                compdict[uri] = label.toPython().strip().lower() 
        return compdict

    def removeDiacritics(self, label):
        """
        This method uses unicodedata() to remove diacritics from a string
        TODO: Does this work without unicodedata?
        param: string
        return: string
        """
        label = ''.join((c for c in unicodedata.normalize('NFD', unicode(label)) if unicodedata.category(c) != 'Mn'))
        return label

    def removePunctuation(self, label):
        """This method removes punctuations. Right now, it will remove
        '!"#$%&\'()*+,-./:;<=>[email protected][\\]^_`{|}~'
        param: string
        return: string
        """
        for punct in string.punctuation:
            label = label.replace(punct," ")
        return label

    def stemWords(self, label):
        """This method stems a single word or a phrase
        param: string
        return: string
        """ 
        
        label = " ".join(self.porter.stem_word(word) for word in label.split(" "))
        return label

    def __broaders(self, uri, graph):
        """This "private" method is a generator over broaderTerms of a given URI in
        a given ConjunctiveGraph. Use getParents() to obtain the list of broaderTerms
        """
        for n in graph.transitive_objects(URIRef(uri), URIRef('http://www.w3.org/2004/02/skos/core#broader')):
            if (uri==n):
                continue
            yield n

    def __narrowers(self, uri, graph):
        """This "private" method is a generator over narrowerTerms of a given URI in
        a given ConjunctiveGraph. Use getchildren() to obtain the list of narrowerTerms
        """
        for n in graph.transitive_objects(URIRef(uri), URIRef('http://www.w3.org/2004/02/skos/core#narrower')):
            if (uri==n):
                continue
            yield n

    def getParents(self, uri, graph):
        list = []
        for n in self.__broaders(uri, graph):
            for label in graph.objects(n, URIRef("http://www.w3.org/2004/02/skos/core#prefLabel")):
                list.append(label.toPython().strip().lower())
        return list

    def getChildren(self, uri, graph):
        list = []
        for n in self.__narrowers(uri, graph):
            for label in graph.objects(n, URIRef("http://www.w3.org/2004/02/skos/core#prefLabel")):
                list.append(label.toPython().strip().lower())
Author: Gregsen, Project: skotheme, Lines: 70, Source: skotheme.py

Example 10: set

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
wouldn't
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves""".translate(table,punc).split())


for x in nltk.corpus.words.words():
    stopwords.add(x.translate(table,punc))

stopwords = set([stemm.stem_word(x) for x in stopwords])

#get a list of the normalized words from the tweet
def split_tweet(m):
    text = str(m['text']).translate(table, punc)    
    #repattern.sub('', 
    return set([x for x in [stemm.stem_word(x.lower()) for x in text.split() if x.lower().isalpha() and len(x) > 2 and  not '#' in x and  not 'http' in x] ])

#get hashtags
def get_tags(m):
    return [stemm.stem_word(x['text'].lower()) for x in m['entities']['hashtags']]

english_vocab = set(stemm.stem_word(w.lower()) for w in nltk.corpus.words.words()) 
print 'english parsed'
#text_vocab = set(w.lower() for w in text if w.lower().isalpha()) 
#loop over all tweets found in files on @path
Author: darkraft, Project: homespace, Lines: 33, Source: twitter.py
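Example 10 stems the stopword list itself before use. That detail matters: a stemmed token only matches a stopword set that was stemmed the same way. A compact illustration:

from nltk.stem.porter import PorterStemmer

stemm = PorterStemmer()
stopwords = set(['yourselves', 'ourselves'])
stemmed_stops = set(stemm.stem_word(w) for w in stopwords)

token = stemm.stem_word('yourselves')  # 'yourselv'
print token in stopwords      # False: the raw list no longer matches
print token in stemmed_stops  # True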

Example 11: DateExtractor

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]

#......... part of the code omitted .........
        return result

    def _extend_to_left(self, expression, tagged_sentence):
        """Try to complete the first numeral. It's a healing method for the next scenario:
        seventy two (72) days.
        """
        num_text = self._extract_number_from_expression(expression)
        if not num_text:
            return expression

        if expression.startswith(num_text):
            # Expression is correctly formatted
            return expression

        # If the text would have contained '-' the expression would have been correct.
        num_words = num_text.split('-')

        if not expression.lower().startswith(num_words[-1]):
            print 'Expression %s does not start with the expected word %s' % (expression,
                                                                              num_words[-1])
            return expression

        sentence = join_sentence([t[0] for t in tagged_sentence])

        # Last word is in the expression, the other words we expect to find in the subsentence.
        # The fact that there can be more such expressions in a sentence was considered but we
        # should not do anything special for them because agreements have a pseudo structure which
        # leads to consistency in terms of formats.
        idx = sentence.find(expression)

        subsentence = sentence[:idx].strip().split()
        subsentence.reverse()

        num_words = num_words[:-1]
        num_words.reverse()

        idx = 0
        wc = len(num_words)
        while idx < wc and num_words[idx] == subsentence[idx]:
            expression = '%s %s' % (num_words[idx], expression)
            idx += 1

        return expression

    def _extract_number_from_expression(self, expression):
        number_search = re.search('\(([0-9]+)\)', expression)
        if not number_search:
            return None

        try:
            number = int(number_search.groups()[0])
        except:
            print "Could not extract number from expression: %s" % expression
            return None

        return num2words(number)

    def _extract_data_from_tree(self, tree):
        expressions = []
        for subtree in tree.subtrees():
            if not subtree.label() == 'DATE':
                continue

            expressions.append(join_sentence([t[0] for t in subtree.leaves()]))

        return expressions

    def _get_sentences(self, text):
        sentences = nltk.sent_tokenize(text)
        # Remove new lines
        sentences = [s.replace('\r\n', ' ') for s in sentences]

        # Collapse whitespaces
        rex = re.compile(r'[ \t]+')
        sentences = [rex.sub(' ', s) for s in sentences]

        sentences = [nltk.word_tokenize(sent) for sent in sentences]
        return sentences

    def _is_false_positive(self, expression, tagged_sentence):
        # The last token should be either time unit or 'period'
        time_unit = expression.split()[-1]

        stem = self.stemmer.stem_word(time_unit)
        if stem not in ALLOWED_STEMS:
            return True

        if stem == YEAR_STEM:
            # Check if the sentence represents an age expression (it is followed by "of age"
            # or "old").
            sentence = join_sentence([t[0] for t in tagged_sentence])
            idx = sentence.find(expression) + len(expression)
            subsentence = sentence[idx:].strip()
            # Note: This check may fail if in the same sentence there are both age expressions and
            # date expressions in years. This should not be a problem since no reviewed document
            # has this case (it also doesn't make sense in the pseudo-structure of agreements).
            if any(subsentence.startswith(expr) for expr in AGE_EXPRESSIONS):
                return True

        return False
Author: tiriplicamihai, Project: information_extraction, Lines: 104, Source: date_extractor.py
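Example 11 validates a candidate expression by stemming its final token and testing membership in a set of allowed time-unit stems. ALLOWED_STEMS and YEAR_STEM are module-level constants not shown in the snippet; a hypothetical version of the check looks like this:

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
# illustrative values only; the original module's constants are not shown above
ALLOWED_STEMS = set(['day', 'week', 'month', 'year', 'period'])

for unit in ['days', 'weeks', 'decades']:
    stem = stemmer.stem_word(unit)
    print unit, '->', stem, stem in ALLOWED_STEMS  # days/weeks pass, decades does not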

Example 12: PorterStemmer

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
        if(word!=False):
            fullSearchURL=baseSearchURL+word
            f = urllib.urlopen(fullSearchURL)
            found=False
            for line in f.read().split("\n"):
                if("video src" in line):
                    endURL=line[12:]
                    endURL=endURL[:endURL.index("\"")]
                    fullURL=baseURL+endURL
                    print 'getting video from'+fullURL
                    found=True
                    break
            if(found):
                urllib.urlretrieve(fullURL,word+'.mp4')
        else:
            p = PorterStemmer()
            word=PorterStemmer.stem_word(p,origword)
            fullSearchURL=baseSearchURL+word
            f = urllib.urlopen(fullSearchURL)
            found=False
            for line in f.read().split("\n"):
                if("video src" in line):
                    endURL=line[12:]
                    endURL=endURL[:endURL.index("\"")]
                    fullURL=baseURL+endURL
                    print 'getting video from'+fullURL
                    found=True
                    break
            if(found):
                urllib.urlretrieve(fullURL,word+'.mp4')
Author: VWang1111, Project: ASLVideoCombiner, Lines: 32, Source: ASLVideoCombiner.py

Example 13: glossOverlap

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
def glossOverlap(gloss1, gloss2):
	# stopws = stopwords.words('english')
	stopws = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 
	'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 
	'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 
	'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 
	'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 
	'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 
	'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 
	'in', 'out', 'on', 'off', 'over', 'under', 'then',  
	'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
	'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 
	'than', 's', 't', 'just', 'don']

	stemmer = PorterStemmer()

	# print "-"*50
	# for x in stopws:
	# 	if stemmer.stem_word(x) != x:
	# 		print x, stemmer.stem_word(x)

	def longestOverlap(a, b):
		now = [0]*len(b)
		bestOverlap = 0
		aStart = 0
		bStart = 0

		nextNonStopWord = [-1]*(len(a)+1)
		for i in range(len(a)-1, 0, -1):
			if a[i] not in stopws:
				nextNonStopWord[i] = i
			else:
				nextNonStopWord[i] = nextNonStopWord[i+1]

		for i in range(1, len(a)):
			prev = now
			now = [0]*len(b)
			if a[i] == '#':
				continue
			for j in range(1, len(b)):
				if b[j] == '#':
					continue
				if a[i] == b[j]:
					now[j] = max(now[j], prev[j-1] + 1)
					if a[i] in stopws:
						continue

					overlap = now[j]
					start = i - overlap + 1
					start = nextNonStopWord[start]
					overlap = i - start + 1
					if bestOverlap < overlap:
						bestOverlap = overlap
						aStart = i - overlap + 1
						bStart = j - overlap + 1

		return (bestOverlap, aStart, bStart)


	regex = ',|\.|\s|\?|\'|\"|!|;|-'
	#maybe check what happens if we don't stem the glosses
	a1 = ['#'] + [stemmer.stem_word(x.lower()) for x in re.split(regex, gloss1) if x]
	a2 = ['#'] + [stemmer.stem_word(x.lower()) for x in re.split(regex, gloss2) if x]

	score = 0
	(overlap, start1, start2) = longestOverlap(a1, a2)
	while overlap > 0:
		# print overlap
		# print a1[start1:start1+overlap]
		# print a2[start2:start2+overlap]
		a1[start1:start1+overlap] = ['#']
		a2[start2:start2+overlap] = ['#']
		score += overlap**2
		(overlap, start1, start2) = longestOverlap(a1, a2)

	return score
Author: m1ha1f, Project: disambiguation, Lines: 78, Source: overlap.py
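Assuming the glossOverlap function above (and its imports) are in scope, a usage sketch with toy glosses; the score is the sum of squared lengths of the maximal phrases the two glosses share after stemming:

gloss1 = 'a domesticated animal kept for companionship'
gloss2 = 'an animal kept for companionship or amusement'
print glossOverlap(gloss1, gloss2)  # squared length of the longest shared phrase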

Example 14: BooleanSearch

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
class BooleanSearch(object):
    """
    This class handles the parsing and execution of queries from a provided query file based on
    the index and postings file provided upon initialization. Results are saved into an output file.
    Parsing of queries is handled by converting the boolean expressions into an equivalent python
    expression that is executable.
    """
    eval_index_local = "index"
    eval_globals = {"__builtins__": None}
    replacements = [("AND NOT", "-"), ("AND", "&"), ("OR", "|"), ("NOT", "~"), ("(", " ( "), (")", " ) ")]
    exprs = set(["-", "&", "|", "~", "(", ")"])
    expr_postings_ref = "index[\"%s\"]"

    def __init__(self, index_filename, postings_filename):
        """
        index_filename refers to the dictionary file.
        postings_filename refers to the postings file.
        """
        self.stemmer = PorterStemmer()
        self.index = Index(index_filename, postings_filename)
        self.eval_locals = {self.eval_index_local: self.index}

    def _to_python_expression(self, query):
        """
        Parses a boolean expression by converting the boolean operator keywords into python's bitwise operators,
        and converts the terms into their respective index calls that return SkipList objects.
        The resulting expression is an executable python expression.

        WARNING: NOT SAFE FOR PRODUCTION SYSTEMS. FOR ACADEMIC PURPOSES ONLY.
        """
        query = reduce(lambda q,args: q.replace(*args), self.replacements, query)
        query_list = [x not in self.exprs and self.expr_postings_ref % self.stemmer.stem_word(x.lower()) or x for x in query.split()]
        return " ".join(query_list)

    def _execute_query(self, query):
        """
        Executes the provided query and returns the result

        WARNING: NOT SAFE FOR PRODUCTION SYSTEMS. FOR ACADEMIC PURPOSES ONLY.
        """
        expression = self._to_python_expression(query)
        try:
            result = eval(expression, self.eval_globals, self.eval_locals)
        except SyntaxError as se:
            return "Syntax Error occurred, possible malformed expression during conversion: %s" % expression
        except NameError as ne:
            return "Name Error occured, possible invalid object reference in query: %s" % expression
        else:
            return result

    def process_queries(self, query_filename, output_filename):
        """
        This method takes in a query filename and output filename.
        For every query, it writes the output into a new line.
        """
        try:
            with open(query_filename, 'r') as query_file, open(output_filename, 'w') as output_file:
                for row in query_file:
                    result = self._execute_query(row)
                    output_file.write(str(result) + "\n")
        except IOError as error:
            print "IO Error occured while attempting to run BooleanSearch"
            sys.exit(error.args[1])
Author: cwma, Project: School-Projects, Lines: 65, Source: search.py
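Example 14's _to_python_expression is the interesting step: boolean keywords become Python bitwise operators and every term becomes an index lookup, so the query can be evaluated with eval. The string rewriting on its own (replacements copied from the class above; Python 2, where reduce is a builtin):

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
replacements = [("AND NOT", "-"), ("AND", "&"), ("OR", "|"), ("NOT", "~"), ("(", " ( "), (")", " ) ")]
exprs = set(["-", "&", "|", "~", "(", ")"])

query = reduce(lambda q, args: q.replace(*args), replacements, "cats AND NOT dogs")
tokens = [t if t in exprs else 'index["%s"]' % stemmer.stem_word(t.lower()) for t in query.split()]
print " ".join(tokens)  # index["cat"] - index["dog"]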

Example 15: len

# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
  db = c['haiku']

  # Open connection to Twitter's public timeline, build haikus
  failed = True
  while failed:
    failed = False
    try:
      with tweetstream.SampleStream(USER,PASS) as stream:
        for tweet in stream:
          if 'text' in tweet and len(tweet['text'])>0:
            screen_name = tweet['user']['screen_name']
            hashes = [j for j in set([i for i in tweet['text'].split() if i.startswith('#')])]
            # Strip out urls, punctuation, RTs, and @'s
            tweet_stripped = urlre.sub('',tweet['text'])
            tweet_stripped = punctre.sub('',tweet_stripped)
            tweet_stemmed = [porter_stemmer.stem_word(i.lower()) for i in tweet_stripped.split()]
            # Keep unstemmed, stripped tweet for either storage or retweeting
            tweet_outgoing = [i.lower() for i in tweet_stripped.split()]
            # hack to make sure that only coherent tweets are passed through
            temp_tweet = [i.lower() for i in tweet_stemmed if not i.lower().startswith('rt')]
            tweet_for_topic = [i.lower() for i in tweet_stemmed if not i.lower().startswith('rt') and nsyl(i)>0 and i.lower() not in stopwords]
            if tweet_for_topic==temp_tweet and len(tweet_for_topic)>0:
              print 'Iteration '+str(counter)
              #Assign this tweet a topic
              docset = []
              docset.append(' '.join(i for i in tweet_for_topic))
              print 'Tweet: '+docset[0]
              (gamma, bound) = olda.update_lambda(docset)
              counter+=1
              if (counter % 100 == 0):
                numpy.savetxt('lambdas/lambda-%d.dat' % counter, olda._lambda)
Author: coreylynch, Project: PoetsThatDontKnowIt, Lines: 33, Source: ptdki.py


Note: The nltk.stem.porter.PorterStemmer.stem_word examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. Please consult each project's license before redistributing or using the code; do not reproduce without permission.