

Python WordNetLemmatizer.lemmatize Method Code Examples

This article collects typical usage examples of the Python method nltk.stem.wordnet.WordNetLemmatizer.lemmatize. If you have been wondering how exactly WordNetLemmatizer.lemmatize is used, what it is for, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the class the method belongs to, nltk.stem.wordnet.WordNetLemmatizer.


Below are 15 code examples of the WordNetLemmatizer.lemmatize method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
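Before diving into the examples, here is a minimal usage sketch (assuming the WordNet data has already been downloaded, e.g. via nltk.download('wordnet')). lemmatize takes a word and an optional WordNet part-of-speech tag ('n', 'v', 'a', 'r', 's'), defaulting to noun:

from nltk.stem.wordnet import WordNetLemmatizer

lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize("cars"))          # 'car'  -- default POS is noun ('n')
print(lmtzr.lemmatize("running", "v"))  # 'run'  -- lemmatize as a verb
print(lmtzr.lemmatize("better", "a"))   # 'good' -- lemmatize as an adjective

As many of the examples below show, passing the right POS tag matters: with the default noun POS, "running" would be returned unchanged because it is also a valid noun form.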

Example 1: get_cooc

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def get_cooc(chunk_trees,stoplist=True):
  triples, simple_trees = [], []
  lmtzr = WordNetLemmatizer()
  for t in chunk_trees:
    entities = []
    for chunk in t[:]:
      if isinstance(chunk,Tree) and chunk.node == 'NP':
        # getting a tree for later processing of triples from the simple noun 
        # phrases (if present)
        simple_trees.append(parser_smp.parse(chunk.leaves()))
        words = []
        for word, tag in chunk[:]:
          # stem/discard elements and construct an argument
          if (stoplist and word in STOPLIST) or \
          (len([x for x in word if x.isalnum()]) == 0):
            # do not process stopwords for simple trees, do not process purely 
            # non alphanumeric characters
            continue
          if tag.startswith('N'):
            words.append(lmtzr.lemmatize(word,'n'))
          elif tag.startswith('J'):
            words.append(lmtzr.lemmatize(word,'a'))
          else:
            words.append(word)
        if len(words) > 0:
          entities.append(SEP.join(words))
    for e1, e2 in combinations(entities,2):
      triples.append((e1,util.COOC_RELNAME,e2))
      triples.append((e2,util.COOC_RELNAME,e1))
  return triples, simple_trees
Developer: vitnov, Project: SKIMMR, Lines of code: 32, Source file: extr.py

Example 2: MakeLemmaList

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def MakeLemmaList(tagged):
    # n noun
    # v verb
    # a adjective
    # r adverb
    # m,w,.. something else

    noun_op, adj_op, adv_op, verb_op, other_op = [], [], [], [], []

    lm = WordNetLemmatizer()
    for i in tagged:
        # print i, i[0], i[1][0:2]
        if cmp(i[1][0:1], "N") == 0:
            noun_op.append(lm.lemmatize(i[0], "n"))
        elif cmp(i[1][0:1], "V") == 0:
            asd = lm.lemmatize(i[0], "v")
            if asd != "be" and asd != "have" and asd != "do" and asd != "done" and asd != "should":
                verb_op.append(asd)
        elif cmp(i[1][0:1], "J") == 0:
            adj_op.append(lm.lemmatize(i[0], "a"))
        elif cmp(i[1][0:1], "R") == 0:
            adv_op.append(lm.lemmatize(i[0], "r"))
        else:
            # print lm.lemmatize(i[0])+ " "
            pass
    final_op = noun_op + verb_op + other_op + adj_op + adv_op
    return final_op
Developer: ecntrk, Project: Gardener, Lines of code: 29, Source file: input_sanitizer.py

Example 3: decompose

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def decompose(text, keepOriginal):
    if text:
        # Case-folding
        text = text.lower();
        
        # Expand all contractions like "isn't" to "is not"
        text = expandContractions(text);
        
        # Remove punctuation
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        text = regex.sub('', text)
        
        # Remove stop words (just add words to the list you think also have to be removed)
        stopWords = ['the','this','that','those','these','to','as','there','has','and','or',
                     'is','not','a','an','of','but','in','by','on','are','it','if'];
        words = text.split();
        text = ' '.join([i for i in words if i not in stopWords]);
        
        # Lemmatization
        lemmatizer = WordNetLemmatizer();
        words = text.split();
        if keepOriginal:
            text = ' '.join([i + " " + lemmatizer.lemmatize(i) for i in words]);
        else:            
            text = ' '.join([lemmatizer.lemmatize(i) for i in words]);
        
        # Remove duplicate words
        text = ' '.join(OrderedDict((word,word) for word in text.split()).keys());
    return text
Developer: jreijrink, Project: eventbook, Lines of code: 31, Source file: decomposition.py

Example 4: stemming

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def stemming():
    lmtzr = WordNetLemmatizer()
    with open('date_gone.out', 'rb') as fin:
        with open('stemmed.out', 'w') as fout:
            i = 0
            for line in fin:
                #i+=1
                new_data = []
                row = line.split('\t')
                #print(i)
                l = len(row)
                if l > 5:
                    data = row[5]
                    words = data.split(' ')
                    for word in words:
                        new_word = lmtzr.lemmatize(word)
                        new_data.append(new_word)
                    row[5] = ' '.join(new_data)
                if l > 6:
                    data = row[6]
                    words = data.split(' ')
                    for word in words:
                        new_word = lmtzr.lemmatize(word)
                        new_data.append(new_word)
                    row[6] = ' '.join(new_data)
                fout.write('\t'.join(row))
Developer: esmeagol, Project: ReviewSpamDetection, Lines of code: 28, Source file: replace_product_freq.py

Example 5: firstDef

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def firstDef(mwe,definition):
    # this is the approach of using only the first definition
    if definition=='':
        return([1,1])
    definition = definition.split('\n')[0]
    definition = definition.replace(mwe,'')
    definition = definition.replace('(','')
    definition = definition.replace(')','')
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
    defineArr = tokenizer.tokenize(definition)
    
    lmtzr = WordNetLemmatizer()
    for i in range(0,len(defineArr)):
        defineArr[i] = lmtzr.lemmatize(defineArr[i])

        
    words = mwe.split()
    for i in range(0,len(words)):
        words[i] = lmtzr.lemmatize(words[i])
    



    if words[0] in defineArr and words[1] in defineArr:
        return([1,1])
        
    elif words[0] in defineArr:
        return([1,0])
        
    elif words[1] in defineArr:
        return([0,1])
    else:
        return([0,0])
Developer: bsalehi, Project: wiktionary_MWE_compositionality, Lines of code: 35, Source file: Compute.py

Example 6: convert_speeches_into_matrix

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def convert_speeches_into_matrix(features,speech_list,label):    
    sample_matrix = []
    label_vector  = []
    #print len(features)
    for speech in speech_list:
        sample = []
        speech = re.sub('http://[a-zA-Z0-9|/|.]*',' ',speech)
        speech = re.sub('%[0-9|.]*', ' ', speech)
        speech = re.sub('\$[0-9|.]*',' ', speech)
        for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
            speech = speech.replace(ch,' ')

        tokens = speech.split()
        
        #word lemmatization
        lmtzr = WordNetLemmatizer()
        tokens = [lmtzr.lemmatize(token) for token in tokens]
        tokens = [lmtzr.lemmatize(token,'v') for token in tokens]

        #tokens = bigrams(tokens)                    # uncomment this line to use bigrams as features
        unique_tokens_dict = collections.Counter(tokens)

        for fea in features:
            if fea in unique_tokens_dict:
                sample.append(unique_tokens_dict[fea])
            else:
                sample.append(0)
       
        #print(sample)
        sample_matrix.append(sample)
        label_vector.append(label)
    
    return sample_matrix,label_vector
Developer: Chunpai, Project: cs200, Lines of code: 35, Source file: svm.py

Example 7: data_preprocessing

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def data_preprocessing(file_path):
    f = open(file_path,'r')
    speech_list = f.read().split("###")   # read speeches, split with ###, and save them into list.
    del speech_list[-1]
    f.close()
    #print len(speech_list)
    f = open(file_path,'r')
    speeches = f.read().lower()    #set all letters lower case
    speeches = re.sub('http://[a-zA-Z0-9|/|.]*',' ',speeches)
    speeches = re.sub('%[0-9|.]*', ' ', speeches)
    speeches = re.sub('\$[0-9|.]*',' ', speeches)
    #speeches = re.sub('\\\\xe2\\\\x80\\\\x[a-zA-Z0-9]*',' ',speeches)
    #print speeches
    for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
        speeches = speeches.replace(ch,' ')

    tokens = speeches.split()
    
    #word lemmatization
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(token) for token in tokens]
    tokens = [lmtzr.lemmatize(token,'v') for token in tokens]

    #tokens = bigrams(tokens)                    # uncomment this line to use bigrams as features

    total_tokens_count = len(tokens)
    unique_tokens_dict = collections.Counter(tokens)   #key is word, value is the count,
                                                       #also default value 0 for non-existent keys.

    result = [ speech_list, unique_tokens_dict, total_tokens_count ]
    return result
Developer: Chunpai, Project: cs200, Lines of code: 33, Source file: data_preprocessing.py

Example 8: getpurpose

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def getpurpose(matched,classname):
	lmtzr = WordNetLemmatizer()
	if classname=='class4' or classname=='class6' or classname=='class3':
		exp='\w*?ing NN\w*?'
		match=re.search(exp,matched)
		purpose_text=match.group().split()
		purpose=lmtzr.lemmatize(purpose_text[0],'v')
		return purpose
	if classname=='class2':
		exp='\w*? VB\w*?'
		match=re.search(exp,matched)
		purpose_text=match.group().split()
		purpose=lmtzr.lemmatize(purpose_text[0],'v')
		return purpose
	if classname=='class5' or classname=='class7':
		exp='for IN \w*? NN\w*?';
		match=re.search(exp,matched)
		purpose_text=match.group().split()
		purpose=lmtzr.lemmatize(purpose_text[2],'v')
		return purpose
	if classname=='class1' or classname=='class9':
		exp='\w*? IN \w*? VBG'
		match=re.search(exp,matched)
		if match:
			purpose_text=match.group().split()
			purpose=lmtzr.lemmatize(purpose_text[2],'v')
			return purpose
	if classname=='class1':
		exp='\w*? TO \w*? VB\w*? \w*? NN\w*?'
		match=re.search(exp,matched)
		if match:
			purpose_text=match.group().split()
			purpose=lmtzr.lemmatize(purpose_text[2],'v')
			return purpose
	return None
Developer: aditya-sanka, Project: Projects, Lines of code: 37, Source file: PURPOSE_EXTRACTION_TOOL.py

Example 9: parseLine

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def parseLine(line, stopWords_, wordInd, currWrd):
    """ Removes stop words and lemmas using nltk and punctuations 
    using re. Returns a list with valid words in the line. currWrd is
    the index of next word occurring for the first time
    """
    lineWords = []
    # Hyphens in hyphenated words are removed, e.g. wi-fi ==> wifi.
    line = re.sub('(\w)-(\w)',r'\1\2',line)
    # replace underscore with space     
    line = re.sub('(\w)_(\w)',r'\1 \2',line)    
    # Remove punctuation marks.
    line = re.sub("[',~`@#$%^&*|<>{}[\]\\\/.:;?!\(\)_+\"-]",r'',line)
    wnLmtzr = WordNetLemmatizer()    
    for word in line.split():
        # Get index of word from wordInd. If it is seen for the first 
        # time assign an index to the word.
        word = word.lower()    # case of words is ignored
        # Lemmatize word using word net function
        word = wnLmtzr.lemmatize(word, 'n')    # with noun
        word1 = wnLmtzr.lemmatize(word, 'v')    # with verb
        if len(word1) < len(word):    # select smaller of two
            word = word1                
        # Ignore stop words and numbers.
        if word in stopWords_ or \
                re.match('^\d+x?\d*$',word) is not None:
            continue
        # Update wordInd with number of occurrences of word.
        if word not in wordInd:                
            wordInd[word] = currWrd[0]
            currWrd[0] += 1
        # Update lineWords with word.
        lineWords.append(word)
    return lineWords
Developer: mnandan, Project: exercises, Lines of code: 35, Source file: task2.py

Example 10: stemWordMatch

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def stemWordMatch(question,sentence):

    lmtzr = WordNetLemmatizer()

    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens=set(nltk.word_tokenize(sentence))

    count=0
    '''for i in sentence_tokens:
        #Finding the exact word match
        if lmtzr.lemmatize(i, 'v').lower() in  [lmtzr.lemmatize(x, 'v').lower() for x in question_tokens]:
            #print  'matching word is:',i
            count=count+6
        elif i.lower() in [x.lower() for x in question_tokens]:
            print 'i is :',i
            count=count+3
    #print 'Exact word match count is :',count'''

    for i in sentence_tokens:
        #Finding the exact word match

        if i.lower() in [x.lower() for x in question_tokens]:
            #print 'i is :',i
            count=count+3
        elif lmtzr.lemmatize(i, 'v').lower() in  [lmtzr.lemmatize(x, 'v').lower() for x in question_tokens]:
            #print  'matching word is:',i
            count=count+6

    #print 'Exact word match count is :',count


    return count
Developer: young1205, Project: Natural-Language-Processing-Fall-2015, Lines of code: 34, Source file: WM.py

Example 11: getlemmas

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def getlemmas(tokens):
    lemmas = []
    for token in tokens:
        if len(token) < 2 or not isWord(token) or token == "the":
            lemmas.append({})
            continue
        
        tokenLemmas = {}
        #Synonyms
        for syn in wn.synsets(token):
            #Derived Forms and their Syns
            for lemma in syn.lemmas():
                for df in lemma.derivationally_related_forms():
                    for ln in df.synset().lemma_names():
                        tokenLemmas[ln] = 4
                    tokenLemmas[df.name()] = 3
            for lname in syn.lemma_names():
                tokenLemmas[lname] = 2
        
        #Wordnet lemmas
        l = WordNetLemmatizer()
        for x in ('v','a','s','r','n'):
            tmp = l.lemmatize(token, x)
            tokenLemmas[tmp] = 1
            tmp = l.lemmatize(tmp, x)
            tokenLemmas[tmp] = 1
        
        #Exact
        tokenLemmas[token] = 1
        
        lemmas.append(tokenLemmas)
    
    return lemmas
Developer: mharwani, Project: AMR-parsing, Lines of code: 35, Source file: wordmatch.py

Example 12: get_dante_answers

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def get_dante_answers(senseval_data):
    # TODO: implement probability based inference of accuracy, i.e. POS adds prob, colloc adds prob, phrase adds prob
    #  - must find values for probs first. for colloc - adjacency affects it. for phrase - order affects it
    # Or, just test adjacency, presence of colloc and phrase words in the sentence (test both lemmatized and not)
    # Methods: Set arbitrary values and adjust manually
    #          Use a learning algorithm to find the best mix of values
    DanteAPI.initialize()
    dante = DanteAPI.get_all_word_meanings()
    print "\nDANTE parsing completed"
    dante_answers = {}
    lemmatizer = WordNetLemmatizer()
    for sentence_data in senseval_data:
        for phrase in sentence_data["test_phrases"]:
            word_id, raw_word = phrase["headword"]
            word = lemmatizer.lemmatize(raw_word)
            phrase_meaning = _answer_phrase(word, sentence_data, dante)
            if phrase_meaning is not None:
                dante_answers[word_id] = phrase_meaning
            else:
                dante_answers[word_id] = _answer_word(word, sentence_data, dante)

        for word_id, raw_word in sentence_data["test_words"].iteritems():
            word = lemmatizer.lemmatize(raw_word)
            dante_answers[word_id] = _answer_word(word, sentence_data, dante)
    return dante_answers
Developer: tgrant59, Project: pydante, Lines of code: 27, Source file: danteanswers.py

Example 13: extract_cooking_methods

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def extract_cooking_methods(input_steps, title):
    steps = copy.deepcopy(input_steps)
    steps.append(title)
    tk_steps = [pos_tag(word_tokenize(w.lower())) for w in steps]

    methods = []
    for step in tk_steps:
        # methods += [wordnet_lemmatizer.lemmatize(w, pos='v').encode('ascii', 'ignore') for (w, pos) in step if 'VB' in pos]
        methods += [w.encode('ascii', 'ignore') for (w, pos) in step if 'VB' in pos]

    for step in steps:
        if 'preheat' in step:
            methods += ['preheat', 'preheating']
        if 'microwav' in step:
            methods += ['microwave', 'microwaving']
        if 'place' in step:
            methods.append('place')
        if 'form' in step:
            methods.append('form')
        if 'sprinkle' in step:
            methods.append('sprinkle')

    wordnet_lemmatizer = WordNetLemmatizer()
    discard = ['be', 'use', 'need', 'should', 'allow', 'pink', 'turn', 'reserve']
    methods =  [m for m in methods if wordnet_lemmatizer.lemmatize(m, pos='v') not in discard and len(m) > 2]
    stems = [wordnet_lemmatizer.lemmatize(w, pos='v') for w in methods]
    gerunds = [w[:-1] + 'ing' for w in stems if w[-1] == 'e']
    gerunds +=  [w + 'ing' for w in stems if w[-1] != 'e']
    methods = list(set(methods + stems + gerunds))
    return methods
Developer: collinbarnwell, Project: recipe-remix, Lines of code: 32, Source file: extract_cooking_methods.py

Example 14: LexicalBigramUnigramAnalyzer

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
class LexicalBigramUnigramAnalyzer(object):   
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()    
        self.tb = Blobber(pos_tagger=PerceptronTagger())
        self.sentencer = SentenceTokenizer()
    def __call__(self, doc):   
        tokens = []     
        for sent in self.sentencer.tokenize(doc.decode('ascii','ignore')):
            tagged = self.tb(sent.lower()).tags    
            
            tagged = [(t[0], penn_to_wn(t[1])) for t in tagged]
            tagged = [(t[0], t[1]) for t in tagged if t[0] not in stopwords.words('english')]
            ng = zip(tagged, tagged[1:])
            rule1 = [(t[0],t[1]) for t in ng if t[0][1]== wn.ADJ and t[1][1]== wn.NOUN]
            rule2 = [(t[0],t[1]) for t in ng if (t[0][1]== wn.ADV and t[1][1]== wn.VERB) or (t[0][1]== wn.VERB and t[1][1]== wn.ADV)]
            rule3 = [(t[0],t[1]) for t in ng if t[0][1]== wn.VERB and t[1][1]== wn.VERB]
            rule4 = [(t[0],t[1]) for t in ng if t[0][1]== wn.NOUN and t[1][1]== wn.NOUN]
            
            filtered_list = rule1 + rule2 + rule3 + rule4
                             
                    
            # Lemmatize
            filtered_bigrams = [self.lemmatizer.lemmatize(t[0][0], t[0][1]) + ' ' + self.lemmatizer.lemmatize(t[1][0], t[1][1]) for t in filtered_list]
            filtered_unigrams = [self.lemmatizer.lemmatize(w[0], w[1]) for w in tagged]
            for bigram in filtered_bigrams:
                tokens.append(bigram)
            for unigram in filtered_unigrams:
                tokens.append(unigram)
        return tokens
Developer: DirkBrand, Project: Comment-Classification, Lines of code: 31, Source file: mainExtractor.py

Example 15: single_master_list

# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# lemmatize is then called as a method on a WordNetLemmatizer instance
def single_master_list(data):
    my_vocab = deepcopy(init_to_zero_vocab)
    data = data.lower()
    data = re.sub("\[email protected]\S", " EMAILREPLACED ", data)
    data = re.sub("\d+", " NUMBERREPLACED ", data)
    data = re.sub("\s?http:s?\/\/\w{0,3}\.\w+\.\w{0,3}\S?|w{0,3}\.\w+\.\w{0,3}\S?", " URLREPLACED ", data)
    for punct in string.punctuation:
        data = data.replace(punct," ")
    format_data = data.split()
    no_stop_words = []
    l = WordNetLemmatizer()
    for word in format_data:
        if (stop):
            if word not in stopwords.words('english'):
                if (lem):
                    no_stop_words.append(l.lemmatize(word))
                else:
                    no_stop_words.append(word)
        else:
            if (lem):
                no_stop_words.append(l.lemmatize(word))
            else:
                no_stop_words.append(word)
            
    for element in no_stop_words:
        if(element in my_vocab):
            my_vocab[element] += 1

    return my_vocab
Developer: jlyons871, Project: EmailClassify, Lines of code: 31, Source file: perEmail_masterlist.py


Note: The nltk.stem.wordnet.WordNetLemmatizer.lemmatize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers, and copyright remains with the original authors. Before redistributing or reusing the code, please consult the license of the corresponding project; do not reproduce without permission.