当前位置: 首页>>代码示例>>Python>>正文


Python LancasterStemmer.stem方法代码示例

本文整理汇总了Python中nltk.stem.lancaster.LancasterStemmer.stem方法的典型用法代码示例。如果您正苦于以下问题:Python LancasterStemmer.stem方法的具体用法?Python LancasterStemmer.stem怎么用?Python LancasterStemmer.stem使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk.stem.lancaster.LancasterStemmer的用法示例。


在下文中一共展示了LancasterStemmer.stem方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: simplify_old

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def simplify_old(s):
    """Replace eligible content words of *s* with more frequent
    distributional neighbours and return the simplified sentence.

    Relies on module-level globals assumed defined elsewhere in this
    file: ``model`` (word-embedding model with ``most_similar``),
    ``fdist`` (word-frequency distribution) and ``f`` (helpers
    ``checkPos``, ``freq_diff``, ``samePos``) -- TODO confirm.
    """
    st = LancasterStemmer()
    # Collect output words in a list and join once at the end instead of
    # the original quadratic `res = res + word + ' '` concatenation.
    parts = []

    text = nltk.word_tokenize(s)
    tags = nltk.pos_tag(text)

    for word, pos in tags:
        # Only substitute words with an eligible POS that the model knows.
        if f.checkPos(pos) and word in model:
            word_stem = st.stem(word)
            top_words = model.most_similar(positive=[word], topn=20)
            candidate_list = [w[0] for w in top_words]
            freq_list = [fdist[w] for w in candidate_list]
            # Most frequent candidates first.
            ordered_list = sorted(zip(candidate_list, freq_list),
                                  key=lambda cf: cf[1], reverse=True)
            word_freq = fdist[word]
            # (a previous version also consulted WordNet synonyms here)
            for cand, cand_freq in ordered_list:
                # Stop once a candidate no longer exceeds the original
                # word's frequency by the required threshold.
                if not f.freq_diff(word_freq, cand_freq):
                    break
                # Exclude morphological derivations of the same stem and
                # require matching POS.
                if st.stem(cand) != word_stem and f.samePos(word, cand):
                    word = cand

        parts.append(word)

    # Original emitted a trailing space after every word; preserve that.
    return ' '.join(parts) + ' ' if parts else ''
开发者ID:wufei523,项目名称:SimpleTestUmb,代码行数:36,代码来源:utils.py

示例2: getstems

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def getstems(dict):
    """Map each word in *dict* to its Lancaster stem.

    Words listed in ``dicts.irregforms`` are first normalised to their
    base form before stemming.  (NB: the parameter shadows the builtin
    ``dict``; the name is kept for interface compatibility.)
    """
    stemmer = LancasterStemmer()
    stems = {}
    for term in dict:
        base = dicts.irregforms[term] if term in dicts.irregforms else term
        stems[term] = stemmer.stem(base)
    return stems
开发者ID:stde,项目名称:texcondensator,代码行数:11,代码来源:getwordpriority.py

示例3: mapper

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def mapper(shard, doc_counter):
    """Build inverted-index postings for one JSON shard of articles.

    Each posting is ``(term, (doc_id, field))`` where *field* is
    'a' (author, raw), 't' (title, stemmed) or 'w' (body, stemmed).
    *doc_counter* is incremented by the number of articles in the shard.
    Depends on a module-level ``alphabet`` compiled regex -- TODO confirm.
    """
    st = LancasterStemmer()
    output_values = []
    with open(shard, "r") as f:
        # idiomatic replacement for json.JSONDecoder().decode(f.read())
        ohsu = json.load(f)
        doc_counter.add(len(ohsu))
        for article in ohsu:
            doc_id = article[".I"]
            # author names are indexed verbatim
            output_values += [(w, (doc_id, 'a')) for w in article[".A"]]
            # title tokens are lowercased and stemmed
            output_values += [(st.stem(w), (doc_id, 't'))
                              for w in alphabet.findall(article[".T"].lower())]
            # body field .W is optional
            if article.get('.W') is not None:
                output_values += [(st.stem(w), (doc_id, 'w'))
                                  for w in alphabet.findall(article[".W"].lower())]
    return output_values
开发者ID:el-san59,项目名称:Course_Work,代码行数:15,代码来源:indexer.py

示例4: poss_train

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def poss_train(train_file,train_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(train_file)
    reader = csv.reader(f)

    t = open(train_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    #stopwords = sw  # use nltk stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    print "停顿词表长度",len(stopwords)
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:
        if a%10000 == 0:
            print a    
        a += 1
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        
        #remove stopwords
        body = filter(g,body)
        title = filter(g,title)

        #light stem
        st = LancasterStemmer()
        title = set([st.stem(word) for word in title])
        body = set(body)
        body = set([st.stem(word) for word in body])

        # list to string
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s","%s"\n'%(row[0], title,body,row[3]))
开发者ID:rve,项目名称:keyword,代码行数:52,代码来源:stem.py

示例5: stemming

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def stemming(words):
    """Return a list with the Lancaster stem of every word in *words*."""
    stemmer = LancasterStemmer()
    return [stemmer.stem(token) for token in words]
开发者ID:12190143,项目名称:test,代码行数:9,代码来源:textSolve.py

示例6: score_sentence

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def score_sentence(sentence, weights, stop_words):
	"""Return the summed weight of the content tokens of *sentence*.

	Each token is reduced to a root via WordNet lemmatisation followed
	by Lancaster stemming; tokens whose surface form or root appears in
	*stop_words* contribute nothing.

	Parameters: sentence: str, weights: Counter, stop_words: container.
	"""
	lemmatizer = WordNetLemmatizer()
	stemmer = LancasterStemmer()
	sentence = strip_punc(sentence)
	tokens = word_tokenize(sentence)
	# NOTE(review): the original computed the score twice -- a loop whose
	# result was immediately overwritten by an equivalent sum() -- and the
	# sum stemmed every token twice.  One pass, one stem per token.
	score = 0
	for token in tokens:
		root = stemmer.stem(lemmatizer.lemmatize(token))
		if token not in stop_words and root not in stop_words:
			score += weights[root]
	return score
开发者ID:skyli42,项目名称:RiceKrispies,代码行数:18,代码来源:summarize.py

示例7: readText

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def readText(textFile):			
	examples = []
	count = 0
	lexicon_en = {}
	lexicon_ge = {}
	stem_en = LancasterStemmer()
	stem_ge = nltk.stem.snowball.GermanStemmer()
	for line in open(textFile):
		count+=1
		if count % 1000 == 0:
			print count
		lans = line.lower().strip().split("|||")
		#german = [stem_ge.stem(x.decode('utf-8')) for x in lans[0].strip().split(" ")]
		german = lans[0].strip().split(" ")
		german = process(german)
		for wordx in german:
			for word in wordx:
				if word not in lexicon_ge:
					lexicon_ge[word]=1
				else:
					lexicon_ge[word]+=1
		eng = [stem_en.stem(x.decode('utf-8')) for x in lans[1].strip().split(" ")]
		#parse_en = pattern.en.parse(" ".join(eng))
		eng = lans[1].strip().split(" ")
		for word in eng:
			if word not in lexicon_en:
				lexicon_en[word]=1
			else:
				lexicon_en[word]+=1
		examples.append(Example(german,eng))
	return examples, lexicon_en, lexicon_ge
开发者ID:frederick0329,项目名称:sp2016.11-731,代码行数:33,代码来源:align-compound.py

示例8: word_stem_example

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def word_stem_example(word="Amevive"):
    """Compare four nltk stemming algorithms on a single *word*.

    [EN]Read: http://www.nltk.org/book/ch03.html, 3.6 Normalizing Text
    (Per the NLTK book, the Porter algorithm is the most robust and is
    the recommended choice.)
    """
    print("Lancaster [%s => %s]" % (word, LancasterStemmer().stem(word)))

    # <=== recommended algorithm
    print("Porter [%s => %s]" % (word, PorterStemmer().stem(word)))

    print("Regexp [%s => %s]" % (word, RegexpStemmer('ing$|s$|e$', min=4).stem(word)))

    # Snowball requires choosing a language explicitly
    print("Snowball [%s => %s]" % (word, SnowballStemmer('english').stem(word)))
开发者ID:MacHu-GWU,项目名称:NLTK-hands-on-skills,代码行数:18,代码来源:tt02_word_stemming.py

示例9: preprocess

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def preprocess(reviews):
	"""Tokenize, lowercase, stop-word/punctuation-filter and stem reviews.

	Parameters: reviews -- iterable of utf-8 encoded review byte strings
	(Python 2 style, hence .decode('utf-8')).
	Returns a list of token lists, one per review.
	"""
	import nltk
	from nltk.tokenize import word_tokenize

	review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))] for review in reviews]
	#print "review tokenize done"

	#remove stop words -- set gives O(1) membership vs the original list scan
	from nltk.corpus import stopwords
	english_stopwords = set(stopwords.words('english'))
	review_filterd_stopwords = [[word for word in review if word not in english_stopwords] for review in review_tokenized]
	#print 'remove stop words done'

	#remove punctuations -- set for the same reason
	english_punctuations = {',','.',':',';','?','(',')','&','!','@','#','$','%'}
	review_filtered = [[word for word in review if word not in english_punctuations] for review in review_filterd_stopwords]
	#print 'remove punctuations done'

	#stemming
	from nltk.stem.lancaster import LancasterStemmer
	st = LancasterStemmer()
	review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]
	#print 'stemming done'

	return review_stemmed
开发者ID:anirudhreddy92,项目名称:DataMining_Capstone,代码行数:27,代码来源:task3.1.py

示例10: predict_category_subcategory

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def predict_category_subcategory(book_name):
	"""Predict [category_id, subcategory_id] for a book title using the
	pre-trained classifiers stored under BASE_DIR/learners/.
	"""
	title_series = pandas.Series(book_name.encode('ascii'))

	# Preprocessing: drop empty entries, lowercase everything
	title_series = title_series.dropna(axis=0,how='any')
	title_series = title_series.str.lower()

	# Manually-curated tokens stripped from titles
	remove_list = ['edition','ed','edn', 'vol' , 'vol.' , '-' ,'i']

	title_series[0] = ' '.join(
		tok for tok in title_series[0].split() if tok not in remove_list)

	# Drop digit-bearing tokens, parenthesised text, non-alphanumerics
	title_series = title_series.apply(lambda x :re.sub(r'\w*\d\w*', '', x).strip())
	title_series = title_series.apply(lambda x :re.sub(r'\([^)]*\)', ' ', x))
	title_series = title_series.apply(lambda x :re.sub('[^A-Za-z0-9]+', ' ', x))

	# Stem the remaining title words
	stemmer = LancasterStemmer()
	title_series[0] = " ".join(stemmer.stem(tok) for tok in title_series[0].split())

	category_clf = joblib.load(os.path.join(BASE_DIR+"/learners/",'category_predict.pkl'))
	category = category_clf.predict(title_series)
	subcategory_clf = joblib.load(os.path.join(BASE_DIR+"/learners/",'subcategory_predict.pkl'))
	subcategory = subcategory_clf.predict(title_series)
	return [category[0], subcategory[0]]
开发者ID:dsaumyajit007,项目名称:textnook,代码行数:30,代码来源:views.py

示例11: word_standardize

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def word_standardize(sentences):
    """Tokenize *sentences* and return (st_words, sent_result).

    st_words: all tokens across sentences, lowercased, stop-word- and
    punctuation-filtered, Lancaster-stemmed.
    sent_result: per-sentence token lists with the same filtering but
    no stemming.
    """
    # NOTE(review): the scraped source contained an email-protection
    # artifact ("[email protected]") inside this punctuation literal;
    # reconstructed as the standard ASCII punctuation run -- confirm
    # against the original repository.  Kept as a string so membership
    # semantics (single chars / substrings) match the original.
    punct = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    # Hoisted: the original re-fetched stopwords.words('english') for
    # every filter pass; build the set once.
    stop_set = set(stopwords.words('english'))

    tokens = []
    sentences_st = []

    for sent in sentences:
        toks = word_tokenize(sent)
        tokens.extend(toks)
        sentences_st.append(toks)

    st = LancasterStemmer()

    words = [w.lower() for w in tokens]
    words = [w for w in words if not w in stop_set]
    words = [w for w in words if not w in punct]
    st_words = [st.stem(w) for w in words]

    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if not w in stop_set]
        sent = [w for w in sent if not w in punct]
        sent_result.append(sent)

    return st_words, sent_result
开发者ID:chqsark,项目名称:hightext,代码行数:27,代码来源:pullData.py

示例12: LemmaTokenizer

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
class LemmaTokenizer(object):
	"""Callable tokenizer (e.g. for sklearn vectorizers) that Lancaster-
	stems every token starting with an ASCII letter."""
	# Precompiled once instead of re-looking up the pattern per token.
	_word_re = re.compile(r'[a-z]+', re.M|re.I)

	def __init__(self):
		#self.wnl = WordNetLemmatizer()
		self.stemmer = LancasterStemmer()

	def __call__(self, doc):
		#return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if re.match(r'[a-z]+', t, re.M|re.I)]
		return [self.stemmer.stem(t) for t in word_tokenize(doc)
				if self._word_re.match(t)]
开发者ID:jerryli1981,项目名称:token-image,代码行数:9,代码来源:extractKeywords.py

示例13: stem_text

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def stem_text(text):
    """Lancaster-stem each whitespace-separated token of *text* and
    return the tokens re-joined with single spaces."""
    stemmer = LancasterStemmer()
    return " ".join(stemmer.stem(token) for token in text.split())
开发者ID:uml-cs-nlp-sentence-completion,项目名称:Sherlock,代码行数:9,代码来源:process_file.py

示例14: filt

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
	def filt(string):
		"""Normalise *string* for term counting.

		Drops punctuation, turns hyphens into spaces, removes stop
		words and one-character tokens, lowercases, and Lancaster-stems
		each remaining word.  Returns the stems joined by spaces (with
		a trailing space); words that raise UnicodeDecodeError while
		stemming are skipped.  Uses outer-scope ``punctuation`` and
		``allStopWords`` -- TODO confirm their definitions.
		"""
		#	Strip punctuation characters, then treat hyphens as spaces
		cleaned = string
		for p in punctuation:
			cleaned = cleaned.replace(p, '')
		cleaned = cleaned.replace('-', ' ')

		#	Keep lowercased words that are neither stop words nor
		#	single characters
		kept = ""
		for word in cleaned.split():
			if (word not in allStopWords) and len(word) > 1:
				kept += word.lower() + " "

		st = LancasterStemmer()
		steamed = ""

		for word in kept.split():
			try:
				steamed += str(st.stem(word)) + " "
			except UnicodeDecodeError:
				pass

		return steamed
开发者ID:mitzelu,项目名称:lexical_analysis_tex,代码行数:33,代码来源:mrtitlefreq.py

示例15: preprocess

# 需要导入模块: from nltk.stem.lancaster import LancasterStemmer [as 别名]
# 或者: from nltk.stem.lancaster.LancasterStemmer import stem [as 别名]
def preprocess(content):
	"""Return a stem -> frequency dict for the nouns of an HTML snippet.

	Lowercases, replaces tags/punctuation with spaces, keeps NN/NNS
	tokens only, drops stopwords and words of <= 2 characters, then
	accumulates frequencies per Lancaster stem.
	"""
	stopset = set(stopwords.words('english'))
	#replace punctuation and tag with space
	tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower()))
	pos_list = pos_tag(tokens)

	#noun only (a previous variant also kept verbs:
	#['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
	s_tokens = [pos[0] for pos in pos_list if pos[1] in ['NN', 'NNS']]

	wordfreq = FreqDist(s_tokens)
	stemfreq = dict()
	st = LancasterStemmer()
	# NOTE(review): the original deleted entries from wordfreq while
	# iterating wordfreq.items() -- a RuntimeError on Python 3, and
	# pointless since wordfreq is discarded.  Skipped words are now
	# simply not accumulated.
	for word, freq in wordfreq.items():
		#stopwords and tiny words contribute nothing
		if word in stopset or len(word) <= 2:
			continue
		stem = st.stem(word)
		stemfreq[stem] = stemfreq.get(stem, 0) + freq
	return stemfreq
开发者ID:TorchmanX,项目名称:TARS,代码行数:35,代码来源:nc.py


注:本文中的nltk.stem.lancaster.LancasterStemmer.stem方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。