Python nltk.WordNetLemmatizer类代码示例

本文整理汇总了Python中nltk.WordNetLemmatizer类的典型用法代码示例。如果您正苦于以下问题：Python WordNetLemmatizer类的具体用法？Python WordNetLemmatizer怎么用？Python WordNetLemmatizer使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了WordNetLemmatizer类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: text2sents

def text2sents(text, lemmatize=False, stemmer=None):
    """
    Split a text into sentences and normalize the words of each sentence.

    Sentences are produced by ``sent_tokenize``; the words of a sentence are
    the runs of word characters found by a ``RegexpTokenizer``.

    :param text: the text to process (handed to ``sent_tokenize`` as is)
    :param lemmatize: if true, words are POS-tagged and lemmatized,
        otherwise they are stemmed
    :param stemmer: stemmer to use; if None, a PorterStemmer is created.
        Only applied when lemmatize is false
    :return: list of lists of normalized words, one inner list per sentence
    """
    sentences = sent_tokenize(text)
    word_tokenizer = RegexpTokenizer(r'\w+')

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        pos_tagger = PerceptronTagger()

        def normalize(tokens):
            # Tag first so that every word is lemmatized with its own POS.
            return [lemmatizer.lemmatize(token, get_wordnet_pos(tag))
                    for token, tag in pos_tagger.tag(tokens)]
    else:
        active_stemmer = PorterStemmer() if stemmer is None else stemmer

        def normalize(tokens):
            return [active_stemmer.stem(token) for token in tokens]

    return [normalize(word_tokenizer.tokenize(sentence))
            for sentence in sentences]

开发者ID:Dolorousrtur，项目名称:KeywordClassifier，代码行数:32，代码来源:text_processing.py

示例2: lemmatizing

def lemmatizing(line_list):
    """
    Lemmatize every term of every line with WordNetLemmatizer().

    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Each line is lowercased, every run of non-alphanumeric characters is
    replaced by a single space (so "amazon.com" becomes "amazon com" instead
    of "amazoncom"), the result is tokenized with wt() and every token is
    lemmatized with the lemmatizer's default arguments.

    Return: list of strings, one per input line, each holding that line's
            lemmatized terms joined by single spaces
    """
    wnl = WordNetLemmatizer()

    def _lemmatize_line(raw_line):
        # lowercase, then turn punctuation into separators
        cleaned = re.sub('[^A-Za-z0-9]+', ' ', raw_line.lower())
        # tokenize, lemmatize, and glue back into one string
        return ' '.join(wnl.lemmatize(token) for token in wt(cleaned))

    return [_lemmatize_line(raw_line) for raw_line in line_list]

开发者ID:YuanhaoSun，项目名称:PPLearn，代码行数:30，代码来源:ml_feature_engineering.py

示例3: feature_extractor_tripadvisor_top_words_weights

def feature_extractor_tripadvisor_top_words_weights(data):
    """
    Extract weighted word-frequency features from a piece of text.

    The "top words" are read from 'scraper/top_words.txt' (one per line) and
    extended with every space-separated word of every line of
    'scraper/places.txt', except the '-' separator. The text is tokenized,
    lowercased and lemmatized; every non-stop-word occurrence adds 1 to that
    word's feature, or 1.5 when the word is one of the top words.

    parameter: data (UTF-8 encoded byte string; it is decoded here)
    returns: dict mapping each kept word to its accumulated weight
    """
    data = data.decode('utf-8')

    # Context managers close both files; they used to stay open.
    with open('scraper/top_words.txt', 'r') as top_file:
        top_words = {line.replace('\n', '') for line in top_file}
    with open('scraper/places.txt', 'r') as places_file:
        for place in places_file:
            for word in place.replace('\n', '').split(' '):
                if word != '-':
                    top_words.add(word)

    lemmatizer = WordNetLemmatizer()
    # Sets make the per-word membership tests below constant time.
    stop_words = set(stopwords.words('english'))

    features = {}
    for token in word_tokenize(data):
        word = lemmatizer.lemmatize(token.lower())
        if word in stop_words:
            continue
        weight = 1.5 if word in top_words else 1
        features[word] = features.get(word, 0) + weight

    return features

开发者ID:jedijulia，项目名称:nlp-tourism，代码行数:33，代码来源:classifier.py

示例4: returnKeywordFromList

def returnKeywordFromList(convertpath):
    """
    Return the keywords of the text file at ``convertpath``.

    The file content is reduced to ASCII, stripped of the punctuation and
    digits matched by the pattern below, lemmatized word by word and fed to
    a TF-IDF vectorizer (unigrams and bigrams, stop words taken from the
    module-level ``stops``) as a one-document corpus. The row for that
    document is ordered by ``sortSparseMatrix`` and handed to
    ``getKeywordAlgorithms``, whose result is treated as a sequence of
    feature indices.

    :param convertpath: path of the text file to analyse
    :return: list of feature names (terms) selected as keywords
    """
    wnl = WordNetLemmatizer()

    # The context manager closes the handle, which used to be leaked.
    with open(convertpath, 'r') as shakes:
        text = shakes.read()

    # Character by character: resolve escapes, then drop non-ASCII output.
    # NOTE(review): str.decode only exists on Python 2 byte strings --
    # confirm the target interpreter before porting this line.
    ascii_text = "".join(
        l.decode('unicode_escape').encode('ascii', 'ignore') for l in text)
    no_punctuation = re.sub(r'[?|$|.|!0-9()=+-\/\'\"\|]', r'', ascii_text)

    # Lemmatize each whitespace-delimited token in place. The earlier
    # str.replace(token, lemma) pass also rewrote longer words that merely
    # contained a token, and its result depended on dict iteration order.
    no_punctuation = re.sub(
        r'\S+', lambda match: wnl.lemmatize(match.group(0)), no_punctuation)

    # One-document corpus, keyed 0 as before.
    token_dict = {0: no_punctuation.replace("\n", " ").replace("\r", "")}

    # this can take some time
    tfidf_vect = TfidfVectorizer(stop_words=stops, ngram_range=(1, 2))
    X_train_counts = tfidf_vect.fit_transform(token_dict.values())
    sortedMatrix = sortSparseMatrix(
        X_train_counts.getrow(0), rev=True, only_indices=False)[0]
    result = getKeywordAlgorithms(1, sortedMatrix)

    # Fetch the vocabulary once instead of once per keyword.
    feature_names = tfidf_vect.get_feature_names()
    return [feature_names[key] for key in result]

开发者ID:inatnunz，项目名称:cv-recommendsys-api，代码行数:34，代码来源:KeywordProcessor.py

示例5: feature_extractor_top_words_weights

def feature_extractor_top_words_weights(data):
    """
     Build weighted word-frequency features for a tweet.

     The text is tokenized, lowercased and lemmatized. Every occurrence of
     a word that is not an English stop word adds 1 to that word's feature,
     or 1.5 when the word belongs to the hard-coded top-word list.

     parameter: data (tweet, as a UTF-8 encoded byte string)
     returns: dict mapping each kept word to its accumulated weight
    """
    data = data.decode('utf-8')
    # frequently-occurring words from the tourism-related twitter corpus
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort',
                 'good', 'cebu', 'island']
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    features = {}
    for token in word_tokenize(data):
        # preprocessing: lowercase and lemmatize
        word = lemmatizer.lemmatize(token.lower())
        if word in stop_words:
            continue
        # top words count 1.5 per occurrence, everything else counts 1
        if word in top_words:
            features[word] = features.get(word, 0) + 1.5
        else:
            features[word] = features.get(word, 0) + 1

    return features

开发者ID:jedijulia，项目名称:nlp-tourism，代码行数:34，代码来源:classifier.py

示例6: init_feature_sentences

    def init_feature_sentences(self, total_content):
        """
        Fill ``self.feature_sentences`` with one record per sentence.

        The lowercased content is split into sentences; each sentence is
        word-tokenized and POS-tagged. Every record is a dict with the keys
        'sentence', 'tags', 'nouns' and 'noun_phrases'. Tags starting with
        'N', except 'NNP', are collected as lemmatized nouns. When such a
        word directly follows the most recently collected noun and the pair
        occurs in the sentence text, that noun is removed from 'nouns' and
        the lemmatized two-word string is added to 'noun_phrases' instead.
        """
        tokenizer = Tokenizer()
        tagger = POSTagger()
        lemmatizer = WordNetLemmatizer()

        for sentence in tokenizer.sent_tokenize(total_content.lower()):
            tagged_sentence = tagger.ntlk_tag(tokenizer.word_tokenize(sentence))

            nouns = []
            noun_phrases = []
            feature_sentence = {
                'sentence': sentence,
                'tags': tagged_sentence,
                'nouns': nouns,
                'noun_phrases': noun_phrases,
            }

            for position, (word, tag) in enumerate(tagged_sentence):
                # Only non-proper nouns take part in the chunking.
                if not tag.startswith('N') or tag == 'NNP':
                    continue

                # The stored noun is a lemma, so this only matches when the
                # previous raw word equals its own lemma.
                extends_previous_noun = (
                    position > 0
                    and len(nouns) > 0
                    and tagged_sentence[position - 1][0] == nouns[-1]
                    and sentence.find(nouns[-1] + ' ' + word) > -1
                )
                if extends_previous_noun:
                    noun_phrases.append(lemmatizer.lemmatize(nouns.pop() + ' ' + word))
                else:
                    nouns.append(lemmatizer.lemmatize(word))

            self.feature_sentences.append(feature_sentence)

开发者ID:sgudla，项目名称:OpninionMining，代码行数:29，代码来源:FeatureExtractor.py

示例7: write_clean_turian_unigrams

def write_clean_turian_unigrams():
    """
    Extract unigram embeddings from Socher's binary distribution so that
    other composers can use them.

    The source holds about 50k embeddings whose words are unprocessed:
    punctuation-only tokens, uppercased and non-lemmatized words, and no PoS
    filtering (so words like "to", "while" and "there" are present).

    Entries containing punctuation or digits are discarded; the remaining
    ones are lowercased and lemmatized. Several entries may share a
    canonical form; of those, the shortest original entry is kept, with
    ties going to an entry that is already lowercase. Only the vectors of
    the kept entries survive (about 33k canonical forms, many of which are
    not nouns/adjectives/verbs).

    No PoS tag is known for a canonical form, so each one is written three
    times, e.g. "cat" as cat/N, cat/J and cat/V, all sharing one vector.
    """
    logging.info('Writing Turian unigrams to %s', turian_unigram_vectors_file)
    mat = loadmat(socher_unigram_embedding_matlab)
    words = [entry[0] for entry in mat['words'].ravel()]
    df = pd.DataFrame(mat['We'].T, index=words)

    lmtzr = WordNetLemmatizer()
    # characters that disqualify an entry from being a real word
    non_word_chars = set(string.punctuation).union(set('0123456789'))

    clean_to_dirty = defaultdict(list)  # canonical -> [non-canonical]
    dirty_to_clean = dict()  # non-canonical -> canonical
    for original in words:
        if set(original).intersection(non_word_chars):
            continue
        canonical = lmtzr.lemmatize(original.lower())
        clean_to_dirty[canonical].append(original)
        dirty_to_clean[original] = canonical

    # For every canonical form keep a single original entry: the shortest,
    # preferring one that is already lowercase.
    #  todo this can be done based on frequency or something
    to_keep = set()  # which non-canonical forms we will keep
    for dirty_list in clean_to_dirty.values():
        chosen = (min(dirty_list, key=lambda w: (len(w), not w.islower()))
                  if len(dirty_list) > 1 else dirty_list[0])
        to_keep.add(chosen)

    # remove the entries that were not selected
    idx_to_drop = [pos for pos, label in enumerate(df.index) if label not in to_keep]
    ddf = df.drop(df.index[idx_to_drop])
    # canonicalize whatever is left
    ddf.index = [lmtzr.lemmatize(label.lower()) for label in ddf.index]

    # one copy of every canonical form per tag; rows are grouped by tag
    new_index = []
    for pos_tag_letter in 'NJV':
        for label in ddf.index:
            new_index.append('%s/%s' % (label, pos_tag_letter))
    new_data = np.vstack([ddf.values] * 3)
    ddf = pd.DataFrame(new_data, index=new_index)

    dv = DenseVectors(ddf, allow_lexical_overlap=True)
    dv.to_tsv(turian_unigram_vectors_file)
    logging.info('Done')

开发者ID:mbatchkarov，项目名称:vector_builder，代码行数:60，代码来源:socher_vectors.py

示例8: feature_extractor_top_words_weights

def feature_extractor_top_words_weights(data):
    """
    Build weighted word-frequency features for a UTF-8 encoded text.

    Tokens are lowercased and lemmatized; English stop words are skipped.
    Each remaining occurrence contributes 1.5 when the word is in the
    hard-coded top-word list and 1 otherwise.

    Returns a dict mapping word -> accumulated weight.
    """
    data = data.decode('utf-8')
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort',
                 'good', 'cebu', 'island']
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    lemmas = [lemmatizer.lemmatize(token.lower()) for token in word_tokenize(data)]

    for lemma in lemmas:
        if lemma in stop_words:
            continue
        # weight of a single occurrence of this word
        increment = 1.5 if lemma in top_words else 1
        if lemma in features:
            features[lemma] += increment
        else:
            features[lemma] = increment

    return features

开发者ID:jedijulia，项目名称:nlp-tourism，代码行数:25，代码来源:classifier.py

示例9: preprocess

def preprocess(original_str):
    """
    Return the lemmatized nouns and adjectives of a text as one string.

    The input is converted to unicode (undecodable bytes are ignored),
    tokenized and POS-tagged. Every word whose tag contains "NN" or "JJ" is
    lemmatized; lemmas of words longer than one character are appended to
    the result, each followed by a single space. Debug output is printed
    along the way.

    The print calls are written with a single parenthesized argument so
    that they parse on Python 3 and print the same on Python 2.
    NOTE(review): ``unicode`` is a Python 2 builtin -- confirm the target
    interpreter before running this on Python 3.

    :param original_str: byte string to process
    :return: space-terminated lemmas joined into one string ('' if none)
    """
    # lemmatizer
    wnl = WordNetLemmatizer()
    # pos
    original_str = unicode(original_str, errors='ignore')
    print(type(original_str))
    article_tok = pos_tag(word_tokenize(original_str))
    print(type(article_tok))
    print("token: ")
    print(article_tok)

    # choose nouns and adjectives
    str_noun = ''
    for word, tag in article_tok:
        if ("NN" in tag) or ("JJ" in tag):
            try:
                stemming_word = wnl.lemmatize(word)
                print(stemming_word)
                # the length check is on the original word, not the lemma
                if len(word) > 1:
                    str_noun = str_noun + stemming_word + " "
            except UnicodeDecodeError:
                # best effort: report the offending word and keep going
                print("error: " + word)

    return str_noun

开发者ID:kkfighter2，项目名称:test，代码行数:33，代码来源:pos.py

示例10: lemmstem

def lemmstem(sentences):
    ''' Lemmatize and stem the words of the given sentences.
        Input: A list of trees containing the sentences. Every leaf is
                indexed as (word, tag, ...)
        Output: The same list; each leaf is replaced in place by
                (word, tag, normalized). For tags starting with VB, NN, JJ
                or RB the normalized form is the Lancaster stem of the
                WordNet lemma, otherwise it is the word itself
    '''
    lemmatizer = WordNetLemmatizer()
    stemmer = LancasterStemmer()

    # first two characters of the tag -> WordNet part of speech
    wordnet_pos_by_prefix = {
        'VB': wordnet.VERB,
        'NN': wordnet.NOUN,
        'JJ': wordnet.ADJ,
        'RB': wordnet.ADV,
    }

    for tree in sentences:
        for leaf_position in tree.treepositions('leaves'):
            leaf = tree[leaf_position]
            word, tag = leaf[0], leaf[1]
            prefix = tag[0:2]

            if prefix in wordnet_pos_by_prefix:
                lemma = lemmatizer.lemmatize(word, wordnet_pos_by_prefix[prefix])
                normalized = stemmer.stem(lemma)
            else:
                normalized = word

            tree[leaf_position] = (word, tag, normalized)

    return sentences

开发者ID:picarus，项目名称:MAI-INLP-ALB5，代码行数:33，代码来源:preprocessing_functions.py

示例11: __init__

	def __init__(self, text, product_name):
		"""
		Prepare the per-sentence noun records for ``text``.

		``product_name`` is lowercased, cut at the first '-' and split on
		'_'. The lowercased text is split into sentences; each sentence is
		word-tokenized and POS-tagged, and a dict with the keys 'sentence',
		'tags', 'nouns' and 'noun_phrases' is appended to
		``self.feature_sentences``.
		"""
		self.candidate_features = []
		self.feature_sentences = []
		self.product_name = product_name.lower().split('-')[0].split('_')

		tokenizer = Tokenizer()
		sentences = tokenizer.sent_tokenize(text.lower())
		tagger = POSTagger()
		lemmatizer = WordNetLemmatizer()

		for sentence in sentences:
			tagged_sent = tagger.nltk_tag(tokenizer.word_tokenize(sentence))
			nouns = []
			noun_phrases = []
			feature_sent = {
				'sentence': sentence,
				'tags': tagged_sent,
				'nouns': nouns,
				'noun_phrases': noun_phrases,
			}

			for position, (word, tag) in enumerate(tagged_sent):
				# Don't include proper nouns
				if not tag.startswith('N') or tag == 'NNP':
					continue

				# Consecutive nouns might form a feature phrase, e.g.
				# "picture quality". Meaningless phrases like
				# 'quality digital' are removed later because their
				# frequency of occurrence is low.
				follows_last_noun = (
					position > 0
					and len(nouns) > 0
					and tagged_sent[position - 1][0] == nouns[-1]
					and sentence.find(nouns[-1] + ' ' + word) > -1
				)
				if follows_last_noun:
					noun_phrases.append(lemmatizer.lemmatize(nouns.pop() + ' ' + word))
				else:
					nouns.append(lemmatizer.lemmatize(word))

			self.feature_sentences.append(feature_sent)

开发者ID:SimonAtGitHub，项目名称:OpinionMiner，代码行数:28，代码来源:FeatureExtractor.py

示例12: Check

def Check(mArray):
    """Lemmatize the second element of ``mArray`` and return it as a string."""
    # the item to check sits at index 1
    candidate = mArray[1]
    lemma = WordNetLemmatizer().lemmatize(candidate)

    # join the lemma's pieces into one string
    return ''.join(lemma)

开发者ID:3009420，项目名称:mg-game，代码行数:9，代码来源:DictStem.py

示例13: word_extractor2

def word_extractor2(text):
    """
    Return the lemmatized, lowercased tokens of ``text`` as one string.

    Runs of a repeated lowercase letter are first shortened to two letters.
    The text is then decoded as UTF-8 (undecodable bytes are ignored),
    tokenized, lowercased and lemmatized. Every token in the result is
    preceded by a single space, so a non-empty result starts with a space.
    """
    wordlemmatizer = WordNetLemmatizer()
    # substitute multiple letters by two
    text = re.sub(r'([a-z])\1+', r'\1\1', text)
    wordtokens = [wordlemmatizer.lemmatize(word.lower())
                  for word in word_tokenize(text.decode('utf-8', 'ignore'))]
    return "".join(" " + token for token in wordtokens)

开发者ID:Paulinyta，项目名称:Tarea3_AID，代码行数:9，代码来源:pregunta2_nonstop.py

示例14: Check

def Check(mArray):
    """
    Lemmatize the second element of ``mArray`` and return it as a string.

    Index 1 is used because, per the original note, the caller puts the
    file path at index 0. The part of speech passed to the lemmatizer is
    whatever ``get_wordnet_pos`` returns for the item.
    """
    candidate = mArray[1]
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(candidate, get_wordnet_pos(candidate))

    # join the lemma's pieces into one string
    return ''.join(lemma)

开发者ID:etaiklein，项目名称:Spelling，代码行数:10，代码来源:DictStem.py

示例15: lemmatize

def lemmatize(tokens):
    """
    Lemmatize ``tokens`` in place and return the same list.

    Each token is first lemmatized with the lemmatizer's default arguments
    (the noun lemmatization, per the original comment). If that leaves the
    token unchanged, the verb lemmatization ('v') is used instead.
    """
    lmtzr = WordNetLemmatizer()
    for index, token in enumerate(tokens):
        default_lemma = lmtzr.lemmatize(token)
        tokens[index] = (lmtzr.lemmatize(token, 'v')
                         if default_lemma == token else default_lemma)
    return tokens

开发者ID:fengshikun，项目名称:Webpage-classification-Minor-Project，代码行数:10，代码来源:functions.py

注：本文中的nltk.WordNetLemmatizer类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。