当前位置: 首页>>代码示例>>Python>>正文


Python WordNetLemmatizer.lemmatize方法代码示例

本文整理汇总了Python中nltk.WordNetLemmatizer.lemmatize方法的典型用法代码示例。如果您正苦于以下问题:Python WordNetLemmatizer.lemmatize方法的具体用法?Python WordNetLemmatizer.lemmatize怎么用?Python WordNetLemmatizer.lemmatize使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk.WordNetLemmatizer的用法示例。


在下文中一共展示了WordNetLemmatizer.lemmatize方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
	def __init__(self, text, product_name):
		self.candidate_features = []
		self.feature_sentences = []
		self.product_name = product_name.lower().split('-')[0].split('_')
		t = Tokenizer()
		sents = t.sent_tokenize(text.lower())
		p = POSTagger()
		wnl = WordNetLemmatizer()
		for sent in sents:
			tagged_sent = p.nltk_tag(t.word_tokenize(sent))
			feature_sent = {}
			feature_sent['sentence'] = sent
			feature_sent['tags'] = tagged_sent
			feature_sent['nouns'] = []
			feature_sent['noun_phrases'] = []
			for i in range(0, len(tagged_sent)):
				(word, tag) = tagged_sent[i]
				#Don't include proper nouns
				if tag.startswith('N') and tag != 'NNP':
					"""
					Consecutive nouns might form a feature phrase. Eg. Picture quality is a phrase.
					Meaningless phrases like 'quality digital' are removed later as their frequeny of occurence is	low. """
					if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1] and feature_sent['sentence'].find(feature_sent['nouns'][-1] + ' ' + word) > -1:
						feature_sent['noun_phrases'].append(wnl.lemmatize(feature_sent['nouns'].pop() + ' ' + word))
					else:
						feature_sent['nouns'].append(wnl.lemmatize(word))
					
			self.feature_sentences.append(feature_sent)
开发者ID:SimonAtGitHub,项目名称:OpinionMiner,代码行数:30,代码来源:FeatureExtractor.py

示例2: init_feature_sentences

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
    def init_feature_sentences(self, total_content):
        t = Tokenizer()
        p = POSTagger()
        wnl = WordNetLemmatizer()

        sentences = t.sent_tokenize(total_content.lower())

        for sentence in sentences:
            tagged_sentence = p.ntlk_tag(t.word_tokenize(sentence))

            #Initializing Feature Sentence dictionary
            feature_sentence = {}
            feature_sentence['sentence'] = sentence
            feature_sentence['tags'] = tagged_sentence
            feature_sentence['nouns'] = []
            feature_sentence['noun_phrases'] = []

            #Finding the Nouns/Noun Phrases in the tagged sentence
            for i in range(0,len(tagged_sentence)):
                (word, tag) = tagged_sentence[i]

                #Chunking
                if tag.startswith('N') and tag != 'NNP':
                    if i > 0 and len(feature_sentence['nouns']) > 0 and tagged_sentence[i - 1][0] == feature_sentence['nouns'][-1] and feature_sentence['sentence'].find(feature_sentence['nouns'][-1] + ' ' + word) > -1:
                        feature_sentence['noun_phrases'].append(wnl.lemmatize(feature_sentence['nouns'].pop() + ' ' + word))
                    else:
                        feature_sentence['nouns'].append(wnl.lemmatize(word))

            self.feature_sentences.append(feature_sentence)
开发者ID:sgudla,项目名称:OpninionMining,代码行数:31,代码来源:FeatureExtractor.py

示例3: write_clean_turian_unigrams

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def write_clean_turian_unigrams():
    """
    Extracts unigram embeddings from Socher's binary distribution. These can be used by other composers.

    There are only 50k embeddings (presumably for the most frequent tokens in the corpus). The words have not
    been processed- there are punctuation-only tokens, uppercased words and non-lemmatized words. There isn't
    any PoS tag filtering either- words like "to", "while" and "there".

    I remove punctuation, then lowercase and lemmatize each entry. Multiple entries may map to the
    same canonical form. I select the shortest original entry (ties are broken by giving preference to
    words that are already lowercased). This could have been done better.
    Only vectors for the selected entries are kept. There's 33k canonical
    forms left, many of which are not nouns/adjs/verbs.

    We don't have a PoS tag for the canonical forms. I get around the problem by creating 3 copies of each
    canonical form and expand "cat" to cat/N, cat/J and cat/V, which all share the same vector.
    """
    logging.info('Writing Turian unigrams to %s', turian_unigram_vectors_file)
    mat = loadmat(socher_unigram_embedding_matlab)
    words = [w[0] for w in mat['words'].ravel()]
    df = pd.DataFrame(mat['We'].T, index=words)

    lmtzr = WordNetLemmatizer()
    clean_to_dirty = defaultdict(list)  # canonical -> [non-canonical]
    dirty_to_clean = dict()  # non-canonical -> canonical
    to_keep = set()  # which non-canonical forms forms we will keep
    #  todo this can be done based on frequency or something

    for w in words:
        if set(w).intersection(set(string.punctuation).union(set('0123456789'))):
            # not a real word- contains digits or punctuation
            continue

        lemma = lmtzr.lemmatize(w.lower())
        clean_to_dirty[lemma].append(w)
        dirty_to_clean[w] = lemma

    # decide which of possibly many non-canonical forms with the same lemma to keep
    # prefer shorter and lowercased non-canonical forms
    for lemma, dirty_list in clean_to_dirty.items():
        if len(dirty_list) > 1:
            best_lemma = min(dirty_list, key=lambda w: (len(w), not w.islower()))
        else:
            best_lemma = dirty_list[0]
        to_keep.add(best_lemma)

    # remove non-canonical forms we don't want
    idx_to_drop = [i for i, w in enumerate(df.index) if w not in to_keep]
    ddf = df.drop(df.index[idx_to_drop])
    # canonicalize whatever is left
    ddf.index = [lmtzr.lemmatize(w.lower()) for w in ddf.index]

    # we don't know what the PoS tags of the canonical forms are, so make them all of the same tag
    # e.g. expand "cat" to cat/N, cat/J and cat/V, which all share the same vector
    new_index = ['%s/%s'%(w, pos) for pos in 'NJV' for w in ddf.index]
    new_data = np.vstack([ddf.values] * 3)
    ddf = pd.DataFrame(new_data, index= new_index)
    dv = DenseVectors(ddf, allow_lexical_overlap=True)
    dv.to_tsv(turian_unigram_vectors_file)
    logging.info('Done')
开发者ID:mbatchkarov,项目名称:vector_builder,代码行数:62,代码来源:socher_vectors.py

示例4: lemmatize

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def lemmatize(tokens): 
	# lemmatize words. try both noun and verb lemmatizations 
	lmtzr = WordNetLemmatizer() 
	for i in range(0,len(tokens)): 
		res = lmtzr.lemmatize(tokens[i]) 
		if res == tokens[i]: 
			tokens[i] = lmtzr.lemmatize(tokens[i], 'v') 
		else: 
			tokens[i] = res 
	return tokens
开发者ID:fengshikun,项目名称:Webpage-classification-Minor-Project,代码行数:12,代码来源:functions.py

示例5: Lemmatizer

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
class Lemmatizer():
	def __init__(self):
		self.lemmatizer = WordNetLemmatizer()
		self.stemmer = SnowballStemmer("english", ignore_stopwords=True)

	'''
	Lemmatizes every word in a sentence and then tokenizes it.	
		sentence: str
	'''
	def lemmatize(self, sentence):
		tokens = word_tokenize(sentence)
		lemmas = self.lemmatizeTokens(tokens)
		return " ".join(lemmas)
		
	'''
	Turns phrase tokens into lemmatized tokens, which means into some standard format
	as determined by the nltk lemmatizer. "Dogs" to "dog", "went" to "go", etc.	 
		tokens: list of str
	'''
	def lemmatizeTokens(self, tokens):
		tokens_tagged = pos_tag(tokens)
		#Get simple POS tags.
		tokens_simpleTags = [(word, map_tag('en-ptb', 'universal', tag)) 
			for word, tag in tokens_tagged]
		
		#Actually lemmatize.
		lemmas = []
		for token, tag in tokens_simpleTags:
			lemmatized = ""
			if tag == "VERB":
				lemmatized = self.lemmatizer.lemmatize(token, pos='v')
			elif tag == "ADJ":
				lemmatized = self.lemmatizer.lemmatize(token, pos='a')
			elif tag == "ADV":
				lemmatized = self.lemmatizer.lemmatize(token, pos='r')
			else:
				lemmatized = self.lemmatizer.lemmatize(token) #pos = 'n'
			lemmas.append(lemmatized.encode("utf-8"))
		return lemmas

	'''
	Reduce this word down to its most basic form by removing suffixes or common ending
	and finding the "root" or "stem" of the word.

	Example: "response," "responsive," and "responsivity" all stem from "respons," or 
	something similar.
	'''
	def stem(self, tokens):
		stemmed = []
		for token in tokens:
			stem = self.stemmer.stem(token)
			stemmed.append(stem.encode("utf-8"))
		return stemmed
开发者ID:LukeLindsey,项目名称:WhistleblowerAnalysis,代码行数:55,代码来源:Lemmatizer.py

示例6: lemmstem

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def lemmstem(sentences):
    ''' This function is responsible for perfoming 
        the lemmarization and stemming of the words
        Input: A list of trees containing the sentences.
                All words are classificated by their NE type
        Output: Lemmatized/Stemmized sentences
    '''
    
    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    
    dic = {'VB' :wordnet.VERB,
            'NN': wordnet.NOUN,
            'JJ':wordnet.ADJ,
            'RB':wordnet.ADV }
    
    for sent in sentences:
      
        lvsidx=sent.treepositions('leaves') 
       
        for pos in lvsidx:
            word=sent[pos][0]
            tag = sent[pos][1]
            rtag = tag[0:2]
            if rtag in dic:
                lemm=lmtzr.lemmatize( word, dic[rtag] )
                stem=st.stem(lemm)
                #print word, lemm, stem #Linia maldita
                sent[pos]=(word, tag, stem)
            else:
                sent[pos]=(word, tag, word)
    
    return sentences
开发者ID:picarus,项目名称:MAI-INLP-ALB5,代码行数:35,代码来源:preprocessing_functions.py

示例7: text2sents

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def text2sents(text, lemmatize=False, stemmer=None):
    """
    converts a text into a list of sentences consisted of normalized words
    :param text: list of string to process
    :param lemmatize: if true, words will be lemmatized, otherwise -- stemmed
    :param stemmer: stemmer to be used, if None, PortedStemmer is used. Only applyed if lemmatize==False
    :return: list of lists of words
    """
    sents = sent_tokenize(text)

    tokenizer = RegexpTokenizer(r'\w+')

    if lemmatize:
        normalizer = WordNetLemmatizer()
        tagger = PerceptronTagger()
    elif stemmer is None:
        normalizer = PorterStemmer()
    else:
        normalizer = stemmer

    sents_normalized = []

    for sent in sents:
        sent_tokenized = tokenizer.tokenize(sent)
        if lemmatize:
            sent_tagged = tagger.tag(sent_tokenized)
            sent_normalized = [normalizer.lemmatize(w[0], get_wordnet_pos(w[1])) for w in sent_tagged]
        else:
            sent_normalized = [normalizer.stem(w) for w in sent_tokenized]

        sents_normalized.append(sent_normalized)
    return sents_normalized
开发者ID:Dolorousrtur,项目名称:KeywordClassifier,代码行数:34,代码来源:text_processing.py

示例8: preprocess

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def preprocess(original_str):
	# stemmer
	wnl = WordNetLemmatizer()
	# pos
	original_str = unicode(original_str, errors='ignore')
	print type(original_str)
	article_tok = pos_tag(word_tokenize(original_str))
	print type(article_tok)
	print "token: "
	print article_tok

	# choose Noun
	str_noun = ''
	for word, tag in article_tok:
		if ("NN" in tag) or ("JJ" in tag):
			# print(word,":",tag)
			# print(wnl.lemmatize(word))
			try:
				stemming_word = wnl.lemmatize(word)
				print stemming_word
				if len(word) > 1:
					str_noun = str_noun + stemming_word + " "
			except UnicodeDecodeError as e:
				print "error: " + word
			# end if



	# result
	# final_doc.append(str_noun)
	# print "return_preprocess : " + str_noun

	return str_noun
开发者ID:kkfighter2,项目名称:test,代码行数:35,代码来源:pos.py

示例9: feature_extractor_top_words_weights

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def feature_extractor_top_words_weights(data):
    data = data.decode('utf-8')
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort', 
                 'good', 'cebu', 'island']
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    for word in words:
        if word not in stop_words:
            if word in features:
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1

    return features
开发者ID:jedijulia,项目名称:nlp-tourism,代码行数:27,代码来源:classifier.py

示例10: returnKeywordFromList

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def returnKeywordFromList(convertpath):
    token_dict = {}
    i=0

    #nltk.download()
    wnl = WordNetLemmatizer()
    fileName = {}
    #print file
    #print str(i)+ file
    #file_path = subdir + os.path.sep + file
    shakes = open(convertpath, 'r')
    text = shakes.read()
    lowers = "".join(map(lambda l:l.decode('unicode_escape').encode('ascii','ignore'),text))
    no_punctuation = re.sub(r'[?|$|.|!0-9()=+-\/\'\"\|]',r'',lowers)
    d = {v:True for v in no_punctuation.split()}
    for token in d.keys():
        no_punctuation = no_punctuation.replace(token, wnl.lemmatize(token))
    fileName[i] = file
    token_dict[i] = no_punctuation.replace("\n"," ").replace("\r","")
    #break

    #this can take some time
    ##print token_dict.values()
    tfidf_vect = TfidfVectorizer(stop_words =stops, ngram_range=(1, 2))
    # #
    # count_vect.stop_words = stops
    #
    X_train_counts = tfidf_vect.fit_transform(token_dict.values())
    #print tfidf_vect.get_feature_names()
    #print(sortSparseMatrix(X_train_counts.getrow(0),rev=False, only_indices=False))
    sortedMatrix = sortSparseMatrix(X_train_counts.getrow(0),rev=True, only_indices=False)[0]
    x = map(lambda (x,y):x,sortedMatrix)
    result = getKeywordAlgorithms(1,sortedMatrix)
    return map(lambda key:tfidf_vect.get_feature_names()[key],result)
开发者ID:inatnunz,项目名称:cv-recommendsys-api,代码行数:36,代码来源:KeywordProcessor.py

示例11: feature_extractor_tripadvisor_top_words_weights

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def feature_extractor_tripadvisor_top_words_weights(data):
    data = data.decode('utf-8')

    top_file = open('scraper/top_words.txt', 'r')
    top_words = [word.replace('\n', '') for word in top_file]
    places_file = open('scraper/places.txt', 'r')

    for place in places_file:
        place = place.replace('\n', '')
        for word in place.split(' '):
            if word != '-':
                top_words.append(word)

    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    for word in words:
        if word not in stop_words:
            if word in features:
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1

    return features
开发者ID:jedijulia,项目名称:nlp-tourism,代码行数:35,代码来源:classifier.py

示例12: lemmatizing

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def lemmatizing(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Iterates over all terms in lines, lemmatize them using WordNetLemmatizer()

    Return: lemmatized_list (list of strings(terms that stemmed))
    """
    lemmatized_list = []
    lemmatizer = WordNetLemmatizer()
    for i, line in enumerate(line_list):
        # linercase
        line = line.lower()
        # remove punctuation
        # below method will simply remove punctuation, but mistakes such as amazon.com => amazoncom
        # nopunct_line = ''.join([c for c in line 
                                            # if re.match("[a-z\-\' \n\t]", c)])
        # this solve the problem above:
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)                                            
        # tokenize
        line_token = wt(nopunct_line)
        # stemming
        lemmatized_line = []
        for term in line_token:
            term = lemmatizer.lemmatize(term)
            lemmatized_line.append(term)
        # back to sentence as a string
        lemmatized_sentence = ' '.join(lemmatized_line)
        lemmatized_list.append(lemmatized_sentence)
    return lemmatized_list
开发者ID:YuanhaoSun,项目名称:PPLearn,代码行数:32,代码来源:ml_feature_engineering.py

示例13: feature_extractor_top_words_weights

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def feature_extractor_top_words_weights(data):
    """
     Extract features using the top words with weights method
     parameter: data (tweet)
     returns: returns features of the given data
    """
    data = data.decode('utf-8')
    # top 15 frequently-ocurring words from the tourism-related twitter corpus
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort', 
                 'good', 'cebu', 'island']
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    # preprocessing: tokenize, convert to lowercase and lemmatize words
    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    # remove stop words and add words and their frequencies as features
    for word in words:
        if word not in stop_words:
            if word in features:
                # if word is found in the top words list, increase by 1.5 or preferred weight
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1

    return features
开发者ID:jedijulia,项目名称:nlp-tourism,代码行数:36,代码来源:classifier.py

示例14: Check

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def Check(mArray):
  
  # what am I checking?
  item = mArray[1]
  lmtzr = WordNetLemmatizer()
  item = lmtzr.lemmatize(item)
  
  # converts to a string
  return ''.join(item)
开发者ID:3009420,项目名称:mg-game,代码行数:11,代码来源:DictStem.py

示例15: word_extractor2

# 需要导入模块: from nltk import WordNetLemmatizer [as 别名]
# 或者: from nltk.WordNetLemmatizer import lemmatize [as 别名]
def word_extractor2(text):
	wordlemmatizer = WordNetLemmatizer()
	text = re.sub(r'([a-z])\1+', r'\1\1',text)#substitute multiple letter by two
	words = ""
	wordtokens = [ wordlemmatizer.lemmatize(word.lower()) \
	for word in word_tokenize(text.decode('utf-8', 'ignore')) ]
	for word in wordtokens:
		words+=" "+word
	return words
开发者ID:Paulinyta,项目名称:Tarea3_AID,代码行数:11,代码来源:pregunta2_nonstop.py


注:本文中的nltk.WordNetLemmatizer.lemmatize方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。