

Python WordNetLemmatizer.lemmatize Method Code Examples

This article collects typical usage examples of the nltk.stem.WordNetLemmatizer.lemmatize method in Python. If you are wondering what exactly WordNetLemmatizer.lemmatize does, how to call it, or what it looks like in real code, the curated examples below should help. You can also read more about the class it belongs to, nltk.stem.WordNetLemmatizer.


The sections below show 15 code examples of the WordNetLemmatizer.lemmatize method, sorted by popularity by default.
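Before diving into the examples, here is a minimal usage sketch (assuming the WordNet data has already been downloaded with nltk.download('wordnet')). lemmatize() treats its input as a noun by default; the optional pos argument ('n', 'v', 'a', 'r') selects the part of speech, which is the pattern most of the examples below rely on:

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize("cars"))              # 'car'      (default pos='n')
print(wnl.lemmatize("running"))           # 'running'  (treated as a noun)
print(wnl.lemmatize("running", pos="v"))  # 'run'
print(wnl.lemmatize("better", pos="a"))   # 'good'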

Example 1: getBoW

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
    def getBoW(self, instance):
        bowFeatures = {}

        # tokens in the third position
        tokens = instance[3]
        # pos tag
        wordnet_lemmatizer = WordNetLemmatizer()
        tagged = nltk.pos_tag(tokens)
        i = 0
        for tag in tagged:
            if instance[2] == i:
                i +=1
                continue
                #sys.stderr.write('remove target word (%s)\n' % tag[0])
            elif tag[0] in stopwords.words("english"):
                i +=1
                continue
                #sys.stderr.write('stopword (%s)\n' % tag[0])
            elif re.match("N.*", tag[1]):
                bowFeatures['bow(%s)' %  wordnet_lemmatizer.lemmatize(tag[0], pos="n")] = True
            elif re.match("V.*", tag[1]):
                bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="v")] = True
            elif re.match("R.*", tag[1]):
                bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="r")] = True
            elif re.match("J.*", tag[1]):
                bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="a")] = True
            i += 1
        return bowFeatures
Author: oierldl, Project: hap-lap, Lines: 30, Source file: FeatureExtractor.py
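Example 1 maps Penn Treebank tags to WordNet POS codes by matching the first letter of each tag (N -> 'n', V -> 'v', R -> 'r', J -> 'a'). If you want to factor that mapping out, a small helper along the following lines would do; the name get_wordnet_pos is hypothetical and not part of the original code:

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to a WordNet POS code, defaulting to noun."""
    mapping = {'N': 'n', 'V': 'v', 'R': 'r', 'J': 'a'}
    return mapping.get(treebank_tag[:1], 'n')

# e.g. wordnet_lemmatizer.lemmatize(tag[0], pos=get_wordnet_pos(tag[1]))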

Example 2: get_words_list

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
def get_words_list(dataset):
    '''
    Load the dataset and read its contents; tokenize the text and lemmatize the words.
    '''

    # join the path and file name together
    spam_path = 'data/enron/pre/' + dataset + '/spam/'
    ham_path = 'data/enron/pre/' + dataset + '/ham/'
    spam_npl = [i[-1] for i in os.walk(spam_path)][0]
    ham_npl = [i[-1] for i in os.walk(ham_path)][0]

    spam_fl = (open(os.path.join(spam_path, j)).read().lower() for j in spam_npl)
    ham_fl = (open(os.path.join(ham_path, j)).read().lower() for j in ham_npl)

    splitter = re.compile(r"\W*")
    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # tokenize the files into words
    spam_wl = [None] * len(spam_npl)
    for i, f in enumerate(spam_fl):
        spam_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                      if w not in english_stops and w.isalpha())
                      if len(word) > 2 and len(word) < 20]

    ham_wl = [None] * len(ham_npl)
    for i, f in enumerate(ham_fl):
        ham_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                     if w not in english_stops and w.isalpha())
                     if len(word) > 2 and len(word) < 20]

    return spam_wl, ham_wl
Author: alifars, Project: training-classifier-in-python, Lines: 32, Source file: svm_clf.py

Example 3: bow_score

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
def bow_score(hypothesis_list, text_list):
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_word_list = ['a', 'an', 'the', ',', '.', ';', ':']
    i = 0
    while i < len(hypothesis_list):
        if hypothesis_list[i] in stop_word_list:
            del hypothesis_list[i]
            i = i - 1
        i = i + 1
    if len(hypothesis_list) == 0:
        return 0
    i = 0
    while i < len(text_list):
        if text_list[i] in stop_word_list:
            del text_list[i]
            i = i - 1
        i = i + 1
    if len(text_list) == 0:
        return 0
    ## Stop words removed up until here

    score = 0
    for word_text in text_list:
        lemma_text = wordnet_lemmatizer.lemmatize(word_text)
        for word_hypothesis in hypothesis_list:
            lemma_hypothesis = wordnet_lemmatizer.lemmatize(word_hypothesis)
            print(lemma_hypothesis)
            print(lemma_text)
            score += lexical_compare(lemma_text, lemma_hypothesis)
            print(score)
    return score
Author: racoder, Project: question-answering-nlp, Lines: 33, Source file: BOW_1.py
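Note that Example 3 calls lexical_compare, which is defined elsewhere in the source project (BOW_1.py). To experiment with bow_score on its own, a minimal stand-in that simply scores exact lemma matches could look like this (a hypothetical placeholder, not the project's actual comparison logic):

def lexical_compare(lemma_text, lemma_hypothesis):
    # Placeholder: the real project may use a richer lexical similarity measure.
    return 1 if lemma_text == lemma_hypothesis else 0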

Example 4: possibility

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
def possibility():
    wnl = WordNetLemmatizer()
    verb = wnl.lemmatize(verbs[random.randrange(0, len(verbs))])
    noun = wnl.lemmatize(nouns[random.randrange(0, len(nouns))])

    article = "a"
    if noun[0] in ["a", "e", "i", "o", "u"]:
        article = "an"

    if random.randrange(0, 100) < chance_quantity:
        quantity_word = quantity_adverbs[random.randrange(0, len(quantity_adverbs))]
        if not noun.endswith("s") and not noun.endswith("y") and not quantity_word == "numerous":
            noun += "s"
        possibility = verb + " " + quantity_word + " of the " + noun

    elif random.randrange(0, 100) < chance_location:
        location_word = location_adverbs[random.randrange(0, len(location_adverbs))]
        possibility = (
            verb
            + " "
            + article
            + " "
            + noun
            + " "
            + location_word
            + " the "
            + wnl.lemmatize(nouns[random.randrange(0, len(nouns))])
        )

    else:
        possibility = verb + " " + article + " " + noun

    return possibility
Author: jeffThompson, Project: WouldYouRatherBot, Lines: 35, Source file: WouldYouRatherBot.py

Example 5: get_clean_text

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
def get_clean_text(list_filenames, path_to_file):
    '''
    Parameters:
    -----------
    list_filenames: LST, a list of filenames as STR
    path_to_file: STR, the path to the directory containing the movie scripts
    --> such that path_to_file/filename.txt is the file to open

    Returns:
    --------
    list of lists of words (lemmatized, lowercased) per text (order preserved)
    '''
    wnl = WordNetLemmatizer()
    list_texts_as_words = []
    for filename in list_filenames:
        path_file = path_to_file+"/"+filename+".txt"
        with open(path_file) as f:
            text = f.readlines()
            lines = [line.strip() for line in text if line.strip()]
            string_words = []
            for line in lines:
                words = [wnl.lemmatize(word.lower()) for word in line.split(' ') if wnl.lemmatize(word.lower())]
                string_words += words
        list_texts_as_words.append(string_words)
    return list_texts_as_words
Author: AnnaVM, Project: Project_Plotline, Lines: 27, Source file: emotions_script.py

Example 6: TweetsLemmatizedVectorizer

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
class TweetsLemmatizedVectorizer(TweetsTruncatedVectorizer):
  def __init__(self):
    self.vectorizer = TfidfVectorizer(stop_words='english',min_df=5) #, sublinear_tf=True)
    self.wordnet = WordNetLemmatizer()

  def fit_transform(self, users):
    join_tweets = []
    
    for user in users:
      timeline = [''.join(remove_tweet_noise(tweet.text)) for tweet in user.twitter]
      #timeline_insta = [''.join(remove_tweet_noise(insta.text)) for insta in user.instagram]
      #print timeline_insta
      #timeline = timeline + timeline_insta
      lemmatized = []
      for tweet in timeline:
        lemma = [self.wordnet.lemmatize(word) for word in tweet.split()]
        lemmatized.append(' '.join(lemma))
      
      join_tweets.append(''.join(lemmatized))

    return self.vectorizer.fit_transform([usertweets for usertweets in join_tweets])

  def transform(self, users):
    join_tweets = []
    
    for user in users:
      timeline = [''.join(remove_tweet_noise(tweet.text)) for tweet in user.twitter]
      lemmatized = []
      for tweet in timeline:
        lemma = [self.wordnet.lemmatize(word) for word in tweet.split()]
        lemmatized.append(' '.join(lemma))
      
      join_tweets.append(''.join(lemmatized))

    return self.vectorizer.transform([usertweets for usertweets in join_tweets])
Author: hanveiga, Project: master-thesis, Lines: 37, Source file: models.py

Example 7: createCorpus

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
def createCorpus(data,i, binaryX="False", stopWords=None, lemmatize="False", tfidf= "False", useidf="True"):  # will vectorize BOG using frequency as the parameter and will return the required arrays
    X_train =[]
    X_test=[]
    Y_train=[]
    Y_test=[]

    for key in data:
        if key in i:

            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port =  WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k,"v") for k in text.split()])
                X_test.append(text)
                Y_test.append(data[key][filename][1])
        else:
            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port =  WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k,"v") for k in text.split()])
                X_train.append(text)
                Y_train.append(data[key][filename][1])
    if tfidf == "False":
        vectorizer = CountVectorizer(min_df=1, binary= binaryX, stop_words=stopWords)
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)
        return X_train_ans, Y_train, X_test_ans,Y_test
    elif tfidf == "True":
        vectorizer = TfidfVectorizer(min_df=1, use_idf=useidf)
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)

        return X_train_ans, Y_train, X_test_ans,Y_test
Author: aayushmudgal, Project: CS771-MLT, Lines: 37, Source file: knn-result.py

Example 8: negator

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
    def negator(self,wordVec):
        negation = False
        negated_doc = []
        lemmatizer = WordNetLemmatizer()
        for w,p in wordVec:
            w_out = ""
            if (p[:2] == "NN"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.NOUN)
            elif (p[:2] == "JJ"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.ADJ)
            elif (p[:2] == "VB"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.VERB)
            elif (p[:2] == "RB"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.ADV)
            if(w_out == "not" or w_out == "n't" ):
                #print "blah"
                negation = not negation
                #rint negation
            elif(w_out in string.punctuation and w_out != ''):

                negation = False
            elif(negation):
                #print negation
                w_out = "NOT_"+w_out
            negated_doc.append((w_out,p))
        #print negated_doc
        return negated_doc
Author: akshaynavada, Project: NLP, Lines: 29, Source file: NB.py

Example 9: tokenize3

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
def tokenize3(text):
	wordnet_lemmatizer = WordNetLemmatizer()
	tokens             = word_tokenize(text)
	tokens             = [wordnet_lemmatizer.lemmatize(token, NOUN) for token in tokens]
	tokens             = [wordnet_lemmatizer.lemmatize(token, VERB) for token in tokens]
	tokens             = [wordnet_lemmatizer.lemmatize(token, ADJ) for token in tokens]
	return tokens
Author: SJinping, Project: WordProc, Lines: 9, Source file: wordProcBase.py
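Example 9 assumes that NOUN, VERB and ADJ are already in scope. In NLTK these single-character POS codes ('n', 'v', 'a') are defined in nltk.corpus.reader.wordnet, so a self-contained call might look like the sketch below (word_tokenize additionally needs the 'punkt' tokenizer data):

from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ  # 'n', 'v', 'a'
from nltk.tokenize import word_tokenize

tokens = tokenize3("The striped bats were hanging on their feet")
# each token is lemmatized successively as a noun, then a verb, then an adjective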

Example 10: pos_analysis

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
def pos_analysis(tags, stoplist):
    wordnet_lemmatizer = WordNetLemmatizer()
    nouns = [wordnet_lemmatizer.lemmatize(word) for word, tag in tags if tag=='NN']
    display_freq(nouns, 'Nouns', top=50)
    adjectives = [wordnet_lemmatizer.lemmatize(word) for word, tag in tags if tag=='JJ']
    display_freq(adjectives, 'Adjectives', top=50)
    verbs = [wordnet_lemmatizer.lemmatize(word, pos='v') for word, tag in tags if tag[:2] in ('VB') and word not in stoplist]
    display_freq(verbs, 'Verbs', top=50)
Author: hsenot, Project: parliament_of_australia, Lines: 10, Source file: utils.py

Example 11: build_analyzer

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
 def build_analyzer(self):
     try:
         english_lemmatizer = WordNetLemmatizer()
         analyzer = super(ProcessCountVectorizer, self).build_analyzer()
         return lambda doc: (english_lemmatizer.lemmatize(english_lemmatizer.lemmatize(w, "v"), "n")
                             for w in analyzer(doc) if not w.endswith("ly") and len(w) > 4)
     except Warning:
         pass
Author: sanekas, Project: Sammerfield-Edu, Lines: 10, Source file: ProcessCountVectorizer.py

Example 12: LemmaTokenizer

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        doc = doc.lower()
        doc = re.sub("[^a-z]", " ", doc) #replace punctuation with spaces
        # doc = re.sub("thanks", "thank", doc)
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if len(self.wnl.lemmatize(t)) > 2]
Author: CharlieDaniels, Project: Rally, Lines: 10, Source file: term_frequency.py
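Because LemmaTokenizer is a callable object, it can be plugged straight into a scikit-learn vectorizer as a custom tokenizer. A minimal usage sketch (the sample sentences are made up):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer())
X = vectorizer.fit_transform(["Thanks for the documents!",
                              "The documents were processed."])
# X is a sparse TF-IDF matrix over the lemmatized, length-filtered tokens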

Example 13: __init__

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
 def __init__(self, data, label=None, *args, **kwargs):
     lem = WordNetLemmatizer()
     if data and not label:
         # Data is assumed to be NLTK-style (word, tag) pairs.
         # If you'd like to collapse the tag set, this is the place.
         label = [re.sub(r'[{}]+'.format(punctuation),'PUN',tag) for word, tag in data] # e.g., tag[0]
         data = [re.sub(r'[{}]+'.format(punctuation),'PUN', lem.lemmatize(word.lower())) for word, tag in data]
         data = [re.sub(r'[0-9]+','NUM', lem.lemmatize(word.lower())) for word in data]
     super(TaggedSentence, self).__init__(data, label, *args, **kwargs)
Author: KechenQin, Project: Hidden_Markov_Model, Lines: 11, Source file: test_hmm.py

Example 14: tokenize4

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
def tokenize4(text):
	wordnet_lemmatizer = WordNetLemmatizer()
	tokens             = word_tokenize(text)
	wordset            = set(words.words())
	tokens             = [wordnet_lemmatizer.lemmatize(token, NOUN) for token in tokens]
	tokens             = [wordnet_lemmatizer.lemmatize(token, VERB) for token in tokens]
	tokens             = [wordnet_lemmatizer.lemmatize(token, ADJ) for token in tokens]
	tokens             = [token for token in tokens if token in wordset]
	return tokens
Author: SJinping, Project: WordProc, Lines: 11, Source file: wordProcBase.py

Example 15: stopWordRemoval

# Required import: from nltk.stem import WordNetLemmatizer [as alias]
# Or: from nltk.stem.WordNetLemmatizer import lemmatize [as alias]
def stopWordRemoval():

    f = open('repos', 'r')
    strn = f.read()
    lst = strn.split('\n')

    i = 0
    while i < (len(lst) - 1):

        name = lst[i].split("/")

        dummyFile = 'filteredData/' + name[1] + '/dummy.txt'
        dr = os.path.dirname(dummyFile)

        if not os.path.exists(dr):
            os.makedirs(dr)

        ft = open('data/' + name[1] + '/title.txt')
        st = ft.read().lower()

        fd = open('data/' + name[1] + '/description.txt')
        sd = fd.read().lower()

        fc = open('data/' + name[1] + '/content.txt')
        sc = fc.read().lower()

        tokenizer = RegexpTokenizer(r'\w+')

        wordArrTitle = tokenizer.tokenize(st)
        wordArrDesc = tokenizer.tokenize(sd)
        wordArrData = tokenizer.tokenize(sc)

        filteredWordsTitle = [w for w in wordArrTitle if not w in stopwords.words('english')]
        filteredWordsDesc = [w for w in wordArrDesc if not w in stopwords.words('english')]
        filteredWordsData = [w for w in wordArrData if not w in stopwords.words('english')]

        wordnet_lem = WordNetLemmatizer()

        ftf = open('filteredData/' + name[1] + '/title.lst', 'w')
        for w in filteredWordsTitle:
            #print w
            ftf.write(wordnet_lem.lemmatize(w) + '\n')

        fdf = open('filteredData/' + name[1] + '/description.lst', 'w')
        for w in filteredWordsDesc:
            #print w
            fdf.write(wordnet_lem.lemmatize(w) + '\n')

        fcf = open('filteredData/' + name[1] + '/content.lst', 'w')
        for w in filteredWordsData:
            print(w + '\n')
            fcf.write(wordnet_lem.lemmatize(w) + '\n')

        i = i + 2
Author: g31pranjal, Project: git-analysis, Lines: 59, Source file: script.py


Note: the nltk.stem.WordNetLemmatizer.lemmatize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective authors, and the copyright of the source code remains with those authors; please refer to each project's license before redistributing or using it. Do not reproduce this article without permission.