This article collects typical usage examples of the Python class nltk.stem.lancaster.LancasterStemmer. If you have been wondering what the LancasterStemmer class is for and how to use it, the selected class code examples below may help.
The following shows 15 code examples of the LancasterStemmer class, sorted by popularity by default.
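Before the examples, here is a minimal sketch of the class's basic use on its own; the sample words are illustrative and not taken from the examples below.

from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()
# The Lancaster (Paice/Husk) algorithm applies its suffix-stripping rules
# iteratively, so it tends to produce shorter stems than Porter or Snowball.
for word in ["running", "maximum", "friendships", "connectivity"]:
    print(word, "->", stemmer.stem(word))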
Example 1: stem_tweet
# Imports assumed by this excerpt:
import string
import nltk
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer, PorterStemmer, RegexpStemmer

def stem_tweet(tweet, stemmer_type="lancaster"):
    """
    :param tweet: string representing tweet
    :param stemmer_type: type of stemmer used (default value is lancaster)
    :return: stemmed tweet
    :type tweet: str
    :type stemmer_type: str
    """
    tokens = nltk.word_tokenize(tweet)
    stemmed_tokens = []
    if stemmer_type == "lancaster":
        stemmer = LancasterStemmer()
    elif stemmer_type == "snowball":
        stemmer = SnowballStemmer("english")
    elif stemmer_type == "porter":
        stemmer = PorterStemmer()
    elif stemmer_type == "regexp":
        # RegexpStemmer takes a regular expression of suffixes to strip,
        # not a language name.
        stemmer = RegexpStemmer("ing$|s$|e$|able$", min=4)
    else:
        return None
    for token in tokens:
        stemmed_tokens.append(stemmer.stem(token))
    # Rejoin tokens, attaching punctuation and contractions without a leading space.
    ret_tw = "".join([" " + i if not i.startswith("'") and i not in string.punctuation else i for i in stemmed_tokens]).strip()
    return ret_tw
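As a quick illustration, one might call this helper with different stemmer_type values; the tweet text here is made up, and the imports listed above the function are assumed.

sample = "Loving these new running shoes, they're amazing!"
for kind in ("lancaster", "snowball", "porter"):
    # Each stemmer family produces slightly different stems for the same tweet.
    print(kind, ":", stem_tweet(sample, stemmer_type=kind))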
Example 2: stem_text
from nltk.stem.lancaster import LancasterStemmer

def stem_text(text):
    stm = LancasterStemmer()
    tokens = text.split()
    words = [stm.stem(w) for w in tokens]
    snt = " ".join(words)
    return snt
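A hypothetical call, just to show the whitespace-based tokenization this variant relies on:

print(stem_text("The runners were running happily"))
# Each whitespace-separated token is replaced by its Lancaster stem.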
Example 3: lemmatizer_newsheadlines
def lemmatizer_newsheadlines():
    lancaster_stemmer = LancasterStemmer()
    frl = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemma1.csv", "rU")
    fr = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/sample.csv", "rU")
    fw = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemmaheadlines.csv", "w")
    for headline in fr:
        if len(headline) > 0:
            headlinelist = headline.split(",")
            if len(headlinelist) == 3:
                headlinewords = headlinelist[1].split(" ")
                print(headlinewords)
                for word in headlinewords:
                    wordcor = ((word.replace("?", "")).replace(":", "")).replace("\"", "")
                    headlineword = (lancaster_stemmer.stem(wordcor)).lower()
                    print(headlineword)
                    # for line in frl:
                    #     crimelist = line.split(",")
                    #     crimeword = ((crimelist[1].replace("\"", "")).strip()).lower()
                    #     print(crimeword + str(i))
                    #     i += 1
                    dictcrime = lemmadict()
                    if headlineword in dictcrime:
                        print(headlineword + "yipee")
                        fw.write(headlineword + "," + headlinelist[0] + "," + headlinelist[1] + "\n")
                        break
    frl.close()
    fw.close()
    fr.close()
Example 4: simplify_old
def simplify_old(s):
    res = ''
    st = LancasterStemmer()
    text = nltk.word_tokenize(s)
    tags = nltk.pos_tag(text)
    for tag in tags:
        word = tag[0]
        if f.checkPos(tag[1]):
            if word in model:
                word_stem = st.stem(word)
                top_words = model.most_similar(positive=[word], topn=20)
                candidate_list = [w[0] for w in top_words]
                freq_list = [fdist[w] for w in candidate_list]
                c_f_list = zip(candidate_list, freq_list)
                ordered_list = sorted(c_f_list, key=lambda pair: pair[1], reverse=True)
                word_freq = fdist[word]
                # synonmys = f.getSynonmys(word)  ## get synonyms from wordnet
                # print synonmys
                for w in ordered_list:
                    if not f.freq_diff(word_freq, w[1]):
                        ## break the loop if the candidate word frequency does not
                        ## exceed the word frequency by a threshold
                        break
                    if st.stem(w[0]) != word_stem and f.samePos(word, w[0]):
                        ## exclude morphological derivations and require the same POS
                        word = w[0]  ### do not use wordnet
                        # if w[0] in synonmys:
                        #     word = w[0]
                        # else:
                        #     for syn in synonmys:
                        #         if st.stem(w[0]) == st.stem(syn):
                        #             word = w[0]
        res = res + word + ' '
    return res
Example 5: filt
def filt(string):
    ret = string
    # Replace hyphens with spaces first, so hyphenated words are split rather
    # than merged when the punctuation (which includes '-') is stripped below
    ret = ret.replace('-', ' ')
    # Filter all punctuation from string
    for p in punctuation:
        ret = ret.replace(p, '')
    oldret = ret
    ret = ""
    # Filter all stop words from string
    for word in oldret.split():
        if (word in allStopWords) or len(word) <= 1:
            pass
        else:
            ret += word.lower() + " "
    st = LancasterStemmer()
    steamed = ""
    for word in ret.split():
        try:
            steamed += str(st.stem(word)) + " "
        except UnicodeDecodeError:
            pass
    return steamed
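This excerpt relies on two module-level names, punctuation and allStopWords, that are not shown. A hypothetical way to supply them and exercise filt:

from string import punctuation
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

# Stand-in definition; the original module may build its stop-word set differently.
allStopWords = set(stopwords.words('english'))
print(filt("The well-known runners were running, surprisingly fast!"))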
Example 6: mapper
def mapper():
    # list of fields in positional order expected in inbound
    # forum node data.
    fieldnames = ['id', 'title', 'tag_names', 'author_id', 'body',
                  'node_type', 'parent_id', 'abs_parent_id',
                  'added_at', 'score', 'state_string', 'last_edited_id',
                  'last_activity_by_id', 'last_activity_at',
                  'active_revision_id', 'extra', 'extra_ref_id',
                  'extra_count', 'marked']
    reader = csv.DictReader(sys.stdin, delimiter='\t', fieldnames=fieldnames)
    stemmer = LancasterStemmer()
    stopw = stopwords.words('english')
    split_pattern = re.compile('[\W.!?:;"()<>[\]#$=\-/]')
    for line in reader:
        pid = line['id']
        body = line['body']
        # split body into words
        words = split_pattern.split(body)
        # map the stemmer function across all the words
        # and use the Counter to create a dict
        # of counted stems. Remove english stopwords.
        stem_counts = Counter((stemmer.stem(x) for x in words if x not in stopw))
        # emit the stem, count and node id
        # for reduction into the reverse index
        for stem, count in stem_counts.items():
            print "{stem}\t{node_id}\t{count}".format(stem=stem, node_id=pid, count=count)
Example 7: preprocess
def preprocess(reviews):
    import nltk
    from nltk.tokenize import word_tokenize
    review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))] for review in reviews]
    # print "review tokenize done"
    # remove stop words
    from nltk.corpus import stopwords
    english_stopwords = stopwords.words('english')
    review_filterd_stopwords = [[word for word in review if not word in english_stopwords] for review in review_tokenized]
    # print 'remove stop words done'
    # remove punctuations
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '&', '!', '@', '#', '$', '%']
    review_filtered = [[word for word in review if not word in english_punctuations] for review in review_filterd_stopwords]
    # print 'remove punctuations done'
    # stemming
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]
    # print 'stemming done'
    return review_stemmed
Example 8: preprocess
def preprocess(content):
    stopset = set(stopwords.words('english'))
    # replace punctuation and tags with spaces
    tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower()))
    pos_list = pos_tag(tokens)
    s_tokens = list()
    # noun and verb only
    for pos in pos_list:
        # print pos[1]
        # if pos[1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        if pos[1] in ['NN', 'NNS']:
            s_tokens.append(pos[0])
    wordfreq = FreqDist(s_tokens)
    stemfreq = dict()
    st = LancasterStemmer()
    for word, freq in wordfreq.items():
        # stopwords
        if word in stopset:
            del wordfreq[word]
            continue
        # tiny words
        if len(word) <= 2:
            del wordfreq[word]
            continue
        # stemmer
        stem = st.stem(word)
        try:
            stemfreq[stem] += freq
        except:
            stemfreq[stem] = freq
    return stemfreq
Example 9: processRawData
def processRawData(self, inputPath, outputPath):
    raw = pickle.load(open(inputPath, "r"))
    data = []
    genres = set([])
    count = 0
    st = LancasterStemmer()
    for key in raw.keys():
        movie = raw[key]
        # if no genre or synopsis data
        if 'genres' not in movie or 'synopsis' not in movie: continue
        if len(movie['genres']) == 0 or movie['synopsis'] == '': continue
        temp = {}
        temp['genres'] = movie['genres']
        for g in temp['genres']:
            genres.add(g)
        # trim out the punctuation and transform to lowercase
        # replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
        s = str(movie['synopsis'])
        s = s.translate(string.maketrans("", ""), string.punctuation)
        s = re.sub(' +', ' ', s).strip()
        s = " ".join(st.stem(word) for word in s.split(" "))
        temp['synopsis'] = s.lower()
        data.append(temp)
        count += 1
    # output as a pickle file
    file = open(outputPath, 'wb')
    pickle.dump(data, file)
    print 'processed ' + str(count) + ' movies'
    return genres
Example 10: parse_validation
def parse_validation(validation_path):
    validation_list = []
    with open(validation_path) as f:
        for line in f:
            strs = line.split('|')
            word_dict = {}
            validation_list.append(word_dict)
            word_dict["word"] = strs[0].strip()
            word_dict["real_sense"] = int(strs[1])
            sentence_list = []
            word_dict["sentence"] = sentence_list
            lmtzr = WordNetLemmatizer()
            ls = LancasterStemmer()
            single_words = re.findall("(\w+|%%)", strs[2])
            double_mod_found = False
            word_count = 0
            for single_word in single_words:
                if single_word == "%%":
                    if not double_mod_found:
                        word_dict["target_word_idx"] = word_count + 1
                        double_mod_found = True
                    continue
                lemmed = lmtzr.lemmatize(single_word)
                stemmed = ls.stem(lemmed)
                if stemmed not in glob_Lucene:
                    sentence_list.append(stemmed)
                    word_count += 1
    return validation_list
Example 11: getMaybeWords
def getMaybeWords(self, text_ls):
    ignoreWords = ["", "have", "her", "there", "the", "be", "to", "of", "and", "a", "in", "that", "it", "for", "on", "with", "as", "at", "this", "but", "his", "by", "from", "they", "or", "an", "will", "would", "so", "even", "is", "be", "am", "are"]
    word_ls = []
    for text in text_ls:
        word_ls += wordpunct_tokenize(text)
    frequencies = {}
    st = LancasterStemmer()
    for word in word_ls:
        if not word[0].isalpha():
            continue
        if word in ignoreWords:
            continue
        word_stem = st.stem(word)
        if word_stem in frequencies:
            frequencies[word_stem] += 1
        else:
            frequencies[word_stem] = 1
    sorted_frequencies = sorted(frequencies.iteritems(), key=operator.itemgetter(1), reverse=True)
    # print sorted_frequencies
    max_words = 30
    if len(sorted_frequencies) < max_words:
        max_words = len(sorted_frequencies)
    word_tuples = sorted_frequencies[0:max_words]
    words = [tuple[0] for tuple in word_tuples]
    print words
    return words
Example 12: build_analyzer
def build_analyzer(self):
    """
    Return a callable that handles preprocessing and tokenization
    """
    preprocess = self.build_preprocessor()
    tokenize = self.build_tokenizer()
    stemmer = LancasterStemmer()
    filter_meta = lambda doc: ' '.join([w for w in doc.split() if not w.startswith('~')])
    parse_words = lambda doc: tokenize(preprocess(filter_meta(self.decode(doc))))
    stem_words = lambda doc: [stemmer.stem(t) for t in parse_words(doc)]
    meta_func = lambda prefix: lambda doc: (t for t in self.decode(doc).split() if t.startswith(prefix))
    feat_func_map = {
        'word': lambda doc: self._word_ngrams(parse_words(doc), self.get_stop_words()),
        'stem': lambda doc: self._word_ngrams(stem_words(doc), self.get_stop_words()),
        '1st': lambda doc: ('~T:1st' for i in parse_words(doc) if i in first_person_words),
        '3rd': lambda doc: ('~T:3rd' for i in parse_words(doc) if i in third_person_words),
        'tag': lambda doc: self._word_ngrams([t[1] for t in nltk.pos_tag(parse_words(doc))]),
        'length': lambda doc: ['~L:%d' % (len(parse_words(doc)) / 5)],
        'genre': meta_func('~G'),
        'rating': meta_func('~Ra'),
        'votes': meta_func('~V'),
        'lang': meta_func('~La'),
        'country': meta_func('~Co'),
        'year': meta_func('~Y'),
        'runtime': meta_func('~Rt'),
        'type': meta_func('~T')
    }
    func_list = [feat_func_map.get(flag.strip()) for flag in self.analyzer.split(':')] \
        if type(self.analyzer) is str else None
    if not func_list:
        raise ValueError('%s is not a valid tokenization scheme/analyzer' % self.analyzer)
    else:
        return lambda doc: itertools.chain.from_iterable(f(doc) for f in func_list if callable(f))
Example 13: readText
def readText(textFile):
    examples = []
    count = 0
    lexicon_en = {}
    lexicon_ge = {}
    stem_en = LancasterStemmer()
    stem_ge = nltk.stem.snowball.GermanStemmer()
    for line in open(textFile):
        count += 1
        if count % 1000 == 0:
            print count
        lans = line.lower().strip().split("|||")
        # german = [stem_ge.stem(x.decode('utf-8')) for x in lans[0].strip().split(" ")]
        german = lans[0].strip().split(" ")
        german = process(german)
        for wordx in german:
            for word in wordx:
                if word not in lexicon_ge:
                    lexicon_ge[word] = 1
                else:
                    lexicon_ge[word] += 1
        eng = [stem_en.stem(x.decode('utf-8')) for x in lans[1].strip().split(" ")]
        # parse_en = pattern.en.parse(" ".join(eng))
        eng = lans[1].strip().split(" ")  # note: this overwrites the stemmed tokens built just above
        for word in eng:
            if word not in lexicon_en:
                lexicon_en[word] = 1
            else:
                lexicon_en[word] += 1
        examples.append(Example(german, eng))
    return examples, lexicon_en, lexicon_ge
Example 14: prepare_corpus
def prepare_corpus(raw_documents):
    # remove punctuation
    print "Removing Punctuation"
    import string
    exclude = set(string.punctuation)
    raw_documents = [''.join(ch for ch in s if ch not in exclude) for s in raw_documents]
    # remove common words
    print "Calculating Stoplist"
    stoplist = set([x.rstrip() for x in codecs.open("stop_list.txt", encoding='utf-8') if not x.startswith("#")])
    stoplist = stoplist.union(set(nltk.corpus.stopwords.words("english")))
    # print stoplist
    print "Removing Stoplist and Stemming"
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    texts = [[st.stem(word) for word in document.lower().split() if word not in stoplist]
             for document in raw_documents]
    # remove words that appear only once
    print "Removing Single Variables"
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in texts]
    return texts
Example 15: tokenize_rest
def tokenize_rest(text):
    wnl = WordNetLemmatizer()
    st = LancasterStemmer()
    words = nltk.word_tokenize(text)
    postag = nltk.pos_tag(words)
    tokens = []
    whfound = False
    for word in words:
        if word[0:2].lower() == 'wh' and not whfound:
            tokens.append({word.lower(): 'wh'})
            whfound = True
            continue
        elem = wnl.lemmatize(word)
        stem = st.stem(elem)
        synd = wn.synsets(stem)
        if not synd:
            stem = stemmer(elem)
            synd = wn.synsets(stem)
        if not synd:
            stem = elem
            synd = wn.synsets(stem)
        dbelement = detect(stem)
        if dbelement:
            for every_elem in dbelement:
                tokens.append({word: every_elem})
    print "\n Rest of possible Tokens"
    print tokens
    return tokens