

Python stem.WordNetLemmatizer Class Code Examples

This article collects typical usage examples of the nltk.stem.WordNetLemmatizer class in Python. If you are wondering what the WordNetLemmatizer class does, how to use it, or where to find usage examples, the hand-picked class code examples below should help.


The following shows 15 code examples of the WordNetLemmatizer class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
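Before looking at the project examples, here is a minimal usage sketch (not taken from any of the projects below) showing the basic API. It assumes the WordNet corpus has already been downloaded, e.g. via nltk.download('wordnet'). lemmatize() treats its input as a noun by default; passing a pos argument ('n', 'v', 'a' or 'r') changes the result.

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats"))              # 'cat'     (default pos='n')
print(lemmatizer.lemmatize("running"))           # 'running' (treated as a noun)
print(lemmatizer.lemmatize("running", pos="v"))  # 'run'     (verb)
print(lemmatizer.lemmatize("better", pos="a"))   # 'good'    (adjective)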

Example 1: bow_score

def bow_score(hypothesis_list,text_list):
	wordnet_lemmatizer = WordNetLemmatizer()
	stop_word_list = ['a', 'an', 'the', ',', '.', ';', ':' ]
	i = 0
	while i < len(hypothesis_list):
		if hypothesis_list[i] in stop_word_list:
			del hypothesis_list[i]
			i = i - 1
		i = i  + 1
	if len(hypothesis_list) == 0:
		return 0
	i = 0	
	while i < len(text_list):
		if text_list[i] in stop_word_list:
			del text_list[i]
			i = i - 1
		i = i + 1
	if len(text_list) == 0:
		return 0
	## Stop words removed up until here

	score = 0	
	for word_text in text_list:
		lemma_text = wordnet_lemmatizer.lemmatize(word_text)
		for word_hypothesis in hypothesis_list:
			lemma_hypothesis = wordnet_lemmatizer.lemmatize(word_hypothesis)
			print(lemma_hypothesis)
			print(lemma_text)
			score += lexical_compare(lemma_text, lemma_hypothesis)
			print(str(score))
	return score
Developer: racoder | Project: question-answering-nlp | Lines of code: 31 | Source file: BOW_1.py

Example 2: labelBasedEntry

def labelBasedEntry(term,uri):
    wnl = WordNetLemmatizer()
    hm = {}
    sparql = Sparql.Connection()
    if " " in term:
        term = term.split(" ")[1]
        
    stem = wnl.lemmatize(term)
    wiktionary_informations = sparql.getWiktionaryInformationsNEW(stem)
    for x in wiktionary_informations:
        if " + " in x[0] and "," not in x[0] and "*" not in x[0]:
            tmp = x[0].split(" + ")[0]
            if "Adjective" in x[1]:
                hm[LexiconGenerator.AdjectivePPFrame(tmp, uri,{})] = ""
            if "Verb" in x[1]:
                hm[LexiconGenerator.TransitiveFrame(tmp, uri,{})] = ""
            if "Noun" in x[1]:
                hm[LexiconGenerator.NounPPFrame(tmp,uri,{})] = ""
        elif "," not in x[0] and "*" not in x[0]:
            if "Adjective" in x[1]:
                hm[LexiconGenerator.AdjectivePPFrame(term, uri,{})] = ""
            if "Verb" in x[1]:
                hm[LexiconGenerator.TransitiveFrame(term, uri,{})] = ""
            if "Noun" in x[1]:
                hm[LexiconGenerator.NounPPFrame(term,uri,{})] = ""

    if len(wiktionary_informations) == 0:
        hm[LexiconGenerator.TransitiveFrame(stem, uri,{})]  = ""
        hm[LexiconGenerator.NounPPFrame(stem,uri,{})] = ""
        
    entry = []
    for key in hm:
        entry.append(key)
            
    return entry
Developer: swalter2 | Project: knowledgeLexicalisation | Lines of code: 35 | Source file: LabelApproach.py

Example 3: stemming

def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "SnowballStemmer",
        "LancasterStemmer", "WordNetLemmatizer"]
    if type is False or type not in supported_stemmers:
        return words_l
    else:
        l = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "WordNetLemmatizer":  # TODO: context
            wnl = WordNetLemmatizer()
            for word in words_l:
                l.append(wnl.lemmatize(word).encode(encoding))
        return l
Developer: LewkowskiArkadiusz | Project: magisterka | Lines of code: 25 | Source file: preprocessing.py
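As an illustration of how the helper above might be called (this usage example is not part of the original project; on Python 3 the .encode(...) calls make each returned item a bytes object):

words = ["running", "cats", "better", "studies"]
print(stemming(words, type="PorterStemmer"))      # e.g. [b'run', b'cat', b'better', b'studi']
print(stemming(words, type="WordNetLemmatizer"))  # e.g. [b'running', b'cat', b'better', b'study']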

Example 4: getWordCounts

def getWordCounts(WordCloudTweetNo):
    print('Fetching the most commonly used {0} words in the "{1}" feed...'.format(WordCloudTweetNo, ScreenName))
    cur = "DELETE FROM WordsCount;"
    conn.execute(cur)
    conn.commit()
    cur = 'SELECT tweet_text FROM UserTimeline'
    data = conn.execute(cur)
    StopList = stopwords.words('english')
    Lem = WordNetLemmatizer()
    AllWords = ''
    for w in tqdm(data.fetchall(),leave=1):
            try:
                #remove certain characters and strings
                CleanWordList = re.sub(r'http://[\w.]+/+[\w.]+', "", w[0], flags=re.IGNORECASE)
                CleanWordList = re.sub(r'https://[\w.]+/+[\w.]+', "", CleanWordList, flags=re.IGNORECASE)
                CleanWordList = re.sub(r'[@#\[\]\'"$.;{}~`<>:%&^*()-?_!,+=]', "", CleanWordList)
                #tokenize and convert to lower case
                CleanWordList = [words.lower() for words in word_tokenize(CleanWordList) if words not in StopList]
                #lemmatize words
                CleanWordList = [Lem.lemmatize(word) for word in CleanWordList]
                #join words
                CleanWordList =' '.join(CleanWordList)
                AllWords += CleanWordList
            except Exception as e:
                print (e)
                sys.exit(e)
    if AllWords is not None:
        words = [word for word in AllWords.split()]
        c = Counter(words)
        for word, count in c.most_common(WordCloudTweetNo):
            conn.execute("INSERT INTO WordsCount (word, frequency) VALUES (?,?)", (word, count))
            conn.commit()
Developer: keyur9 | Project: Examining-your-presence-on-Twitter | Lines of code: 32 | Source file: Twitter_Scrape.py

Example 5: preprocess

def preprocess(line, is_lmz=False):
    line = wordpunct_tokenize(line.strip())
    if is_lmz:
        lemmatizer = WordNetLemmatizer()
        line = [lemmatizer.lemmatize(word) for word in line]

    return line
Developer: lngvietthang | Project: imageqa | Lines of code: 7 | Source file: quest2num.py

Example 6: create_lexicon

def create_lexicon(pos_file, neg_file):
    lex = []

    # read a file and collect its tokens
    def process_file(_f):
        with open(_f, 'r') as f:
            lex = []
            lines = f.readlines()
            # print(lines)
            for line in lines:
                words = word_tokenize(line.lower())
                lex += words
            return lex

    lex += process_file(pos_file)
    lex += process_file(neg_file)
    # print(len(lex))
    lemmatizer = WordNetLemmatizer()
    lex = [lemmatizer.lemmatize(word) for word in lex]  # lemmatization (cats -> cat)

    word_count = Counter(lex)
    # print(word_count)
    # {'.': 13944, ',': 10536, 'the': 10120, 'a': 9444, 'and': 7108, 'of': 6624, 'it': 4748, 'to': 3940......}
    # Drop very common words like "the", "a", "and", plus very rare words; they contribute nothing to deciding whether a review is positive or negative
    lex = []
    for word in word_count:
        if word_count[word] < 2000 and word_count[word] > 20:  # thresholds are hard-coded; a percentage cutoff would probably work better
            lex.append(word)  # Zipf's law - verifying the Zipf distribution of text with Python: http://blog.topspeedsnail.com/archives/9546
    return lex
Developer: gswyhq | Project: hello-world | Lines of code: 29 | Source file: TensorFlow练习1,对评论进行分类.py
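The frequency filter above uses hard-coded absolute counts; as the original comment notes, a relative (percentage) cutoff might work better. A minimal sketch of that variant, assuming the same word_count Counter as above (the 0.01% / 2% thresholds are illustrative, not taken from the project):

total = sum(word_count.values())
lex = [word for word, count in word_count.items()
       if 0.0001 < count / float(total) < 0.02]  # keep words that are neither too rare nor too common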

Example 7: split_into_words

def split_into_words(text, lemmatize=False, reattach=True, replace_numbers=True, split_off_quotes=True,
                     fix_semicolon_mistakes=True):

    if fix_semicolon_mistakes:
        text = fix_semicolons(text)

    word_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    # get rid of certain characters so that we can use them for special purposes
    tokens = word_tokenizer.tokenize(text)
    if reattach:
        tokens = reattach_clitics(tokens)

    if split_off_quotes:
        tokens = split_off_quote_marks(tokens)

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    if replace_numbers:
        tokens = [re.sub('[0-9]', '#', t) for t in tokens]

    tokens = split_off_final_punctuation(tokens)

    tokens = split_off_commas(tokens)

    return tokens
Developer: dallascard | Project: guac | Lines of code: 28 | Source file: tokenizer.py

Example 8: lemmatize

    def lemmatize(self):
        wnl = WordNetLemmatizer()
        self.lemma_list = []

        for i in self.tokens_no_punct:
            lemmy_word = wnl.lemmatize(i)
            self.lemma_list.append(unicode(lemmy_word))
Developer: mjlavin80 | Project: walker | Lines of code: 7 | Source file: text_process.py

Example 9: get_words_list

def get_words_list(dataset):
    '''
    Load the dataset, read the file contents, tokenize them into words and lemmatize the words.
    '''

    # join the path and file name together
    spam_path = 'data/enron/pre/' + dataset + '/spam/'
    ham_path = 'data/enron/pre/' + dataset + '/ham/'
    spam_npl = [i[-1] for i in os.walk(spam_path)][0]
    ham_npl = [i[-1] for i in os.walk(ham_path)][0]

    spam_fl = (open(os.path.join(spam_path, j)).read().lower() for j in spam_npl)
    ham_fl = (open(os.path.join(ham_path, j)).read().lower() for j in ham_npl)

    splitter = re.compile("\\W*")
    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # tokenize the files into words
    spam_wl = [None] * len(spam_npl)
    for i, f in enumerate(spam_fl):
        spam_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                      if w not in english_stops and w.isalpha())
                      if len(word) > 2 and len(word) < 20]

    ham_wl = [None] * len(ham_npl)
    for i, f in enumerate(ham_fl):
        ham_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                     if w not in english_stops and w.isalpha())
                     if len(word) > 2 and len(word) < 20]

    return spam_wl, ham_wl
Developer: alifars | Project: training-classifier-in-python | Lines of code: 30 | Source file: svm_clf.py

Example 10: preProcessHistogram

def preProcessHistogram(documents):

    """
    preProcessHistogram(listofString) -> listOfString

    consumes a listofSentences and Tokenizes it, and returns a list of lemmatized words.

    """

    paragraph = ""
    for sentence in documents:
        paragraph = paragraph + " " + sentence.lower()
   
    #make all words lowercase and remove all punctuation         
    lowerCaseParagraph = paragraph.translate(maketrans("",""),punctuation) 
    
    words = lowerCaseParagraph.split()  
    
    
    lemmatizer = WordNetLemmatizer()
    # lemmatize every word (if it needs to be lemmatized) and remove words that are too long, because chances are they aren't words.
    words = map(lambda x: lemmatizer.lemmatize(x,'v'), words)
    words = filter(lambda x: len(x) < 10 or x.isdigit() , words)
    
    return words
Developer: t3abdulg | Project: Twitter-Topic-Modelling | Lines of code: 25 | Source file: textanalytics.py

Example 11: text_tokenize

def text_tokenize(sentence):
    #stemmer = SnowballStemmer('english')
    lmtr = WordNetLemmatizer()
    tokens = [x.lower() for x in word_tokenize(sentence) if x.isalpha()]
    tokens_tagged = nltk.pos_tag(tokens)
    tokens_tagged = [(x, get_wordnet_pos(y)) for (x, y) in tokens_tagged if x not in stopwords.words('english')]
    return [lmtr.lemmatize(x, y) if y != '' else x for (x, y) in tokens_tagged]
Developer: chandlerzuo | Project: chandlerzuo.github.io | Lines of code: 7 | Source file: inaugural.py
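Examples 11 and 13 call a helper named get_wordnet_pos() that is not shown in their snippets; example 14 below defines one version inline. For reference, a typical sketch of such a helper (based on the mapping in example 14, and not necessarily the exact code used in these projects) maps Penn Treebank tags to the WordNet POS constants that lemmatize() expects:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank POS tag to a WordNet POS constant; '' means "no mapping".
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return ''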

Example 12: getdata

def getdata():
    """
    retrieves the data from repository table
    removes the special characters and stop words and does the stemming(using nltk package)
    """
    conn=db.getDBConnection()
    cursor = conn.cursor()
    global stopWordSet
    sql = "select id, description from repository"
    rows = db.executeSQL(conn, sql)
    counter=1
    wnl = WordNetLemmatizer()
    for row in rows:
        id = row[0]
        desc= row[1]
        #print desc
        if desc is not None:
            desc=desc.replace('-',' ').replace(',',' ').replace('/',' ').replace('.',' ').replace('_',' ')
            desc = desc.lower()
            desc = re.sub('[^a-z0-9 ]','',desc)
            keywords = desc.split(" ")
            for word in keywords:
                #word = porter.stem(word.strip())
                word=wnl.lemmatize(word.strip())
                if word not in stopWordSet:
                    sql1 = "insert into keywords1 values("+str(counter)+",'"+word+"',"+str(id)+ ',' + str(0) + ")"
                    print(sql1)
                    cursor.execute(sql1)
                    conn.commit()
                    counter = counter+1
Developer: rajuch | Project: Kmeans-Clustering | Lines of code: 30 | Source file: processdata.py

Example 13: lemmatize

def lemmatize(tweets):
	'''
	Lemmatize words in the corpus.
	
	Input:
	------------------
	tweets: List of lists, [[word1OfTweet1, word2OfTweet1,...,word_m1OfTweet1],
						   	[word1OfTweet2, word2OfTweet2,...,word_m2OfTweet2],
						   						. 								
						   						. 
						   						.
						    [word1OfTweetN, word2OfTweetN,...,word_mNOfTweetN]]
	Output:
	-----------------
	tweets: The same list-of-lists structure with every word lemmatized.
	'''
	wordnet_lemmatizer = WordNetLemmatizer()
	pos_tag_tweets = [nltk.pos_tag(t) for t in tweets]
	tweets = []
	i = 0
	for t in pos_tag_tweets:
		tt = []
		for w in t:
			if get_wordnet_pos(w[1]) =='':
				tt.append(w[0])
			else:
				try:
					tt.append(wordnet_lemmatizer.lemmatize(w[0], pos = get_wordnet_pos(w[1])))
				except UnicodeDecodeError:
					pass
		tweets.append(tt)
		i += 1
	return tweets
Developer: EliasJonsson | Project: PGM-Project | Lines of code: 33 | Source file: preprocess.py

Example 14: _lemma_

def _lemma_(token):

	if isinstance(token, str):
		return _stem_(token)
	if isinstance(token, unicode):
		return _stem_(token)
	from nltk.corpus import wordnet

	def get_wordnet_pos(treebank_tag):

		if treebank_tag.startswith('J'):
			return wordnet.ADJ
		elif treebank_tag.startswith('V'):
			return wordnet.VERB
		elif treebank_tag.startswith('N'):
			return wordnet.NOUN
		elif treebank_tag.startswith('R'):
			return wordnet.ADV
		else:
			return ''

	from nltk.stem import WordNetLemmatizer
	wordnet_lemmatizer = WordNetLemmatizer()
	p = get_wordnet_pos(token.pos()[0][1])
	if p!=wordnet.VERB:
		return _stem_(token[0])
	rs = wordnet_lemmatizer.lemmatize(token[0], pos=p)
	return rs
Developer: gkotsis | Project: negation-detection | Lines of code: 28 | Source file: negation_detection.py

Example 15: GetCleanWords

def GetCleanWords(content_string):
    
    # Tokenize the sentences using the Punkt word tokenizer
    tokenized_words = PunktWordTokenizer().tokenize(content_string)
    
    #Now let's remove the stop words
    tokenized_words = [word for word in tokenized_words if word.lower() not in stopwords_list]
    
    # Now let's remove all tokens that are solely punctuation.
    punctuation_list = ['.',',',';',':','!','?']
    tokenized_words = [word for word in tokenized_words if word not in punctuation_list]
    
    # Finally let's get rid of the punctuation at the end of each word
    cleaned_words = []
    for word in tokenized_words:
        if word[-1] in punctuation_list:
            cleaned_words.append(word[:-1])
        else:
            cleaned_words.append(word)
    
    # Now let's lemmatize each of the words to lower our word count
    wnl = WordNetLemmatizer()
    clean_and_stemmed_words = [wnl.lemmatize(cleaned_word) for cleaned_word in cleaned_words] 
    
    return clean_and_stemmed_words
Developer: jellis505 | Project: NLPtools | Lines of code: 25 | Source file: CreateDict.py


Note: The nltk.stem.WordNetLemmatizer class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors, and distribution and use should follow the license of the corresponding project. Do not reproduce without permission.