This article collects typical usage examples of the nltk.stem.snowball.SnowballStemmer class in Python. If you are wondering what SnowballStemmer does, how to use it, or what code that uses it looks like in practice, the hand-picked examples below should help.
The following sections present 15 code examples of the SnowballStemmer class, sorted by popularity by default.
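Before the numbered examples, here is a minimal usage sketch of the class itself (my own illustration, not taken from any of the examples below). The stemmer is constructed with a language name, reduces a word to its stem via stem(), and lists the supported languages in the class attribute SnowballStemmer.languages; an optional ignore_stopwords flag leaves that language's stop words unstemmed.

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
print(stemmer.stem("running"))    # -> "run"
print(stemmer.stem("cats"))       # -> "cat"
print(SnowballStemmer.languages)  # tuple of supported language names

The 15 examples that follow embed this same stem() call in larger tokenization and text-cleaning pipelines.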
Example 1: _text_to_words
def _text_to_words(self, text):
    '''
    Process a text and return a list of words.
    The processing performs the following steps:
    - lowercase the text
    - tokenization
    - stop-word removal
    - stemming of the words
    '''
    # Lowercase the text
    text = text.lower().strip()
    # Tokenization
    tokens = word_tokenize(text, language="english")
    # Drop tokens that start with an apostrophe
    # (tokenization turns "I'd like" into ["I", "'d", "like"],
    # and "'d" can safely be discarded)
    tokens = [token for token in tokens if not token.startswith("'")]
    # Stop words
    # Remove stop words from the token list.
    # In addition to the stop words supplied with the collection, add NLTK's common
    # English stop words and punctuation (except parentheses, which are useful for
    # boolean queries)
    stop_words = self.stop_words + list(string.punctuation) + stopwords.words("english")
    tokens = [token for token in tokens if token not in stop_words]
    # Stemming
stemmer = SnowballStemmer(language="english")
tokens = [stemmer.stem(word) for word in tokens]
return tokens
Example 2: get_stemm_tags
def get_stemm_tags(self, tags):
stemm_tags = []
current_stemmer = SnowballStemmer('english')
for tag in self.tags:
stemm_tags.append(current_stemmer.stem(tag.lower()))
return stemm_tags
Example 3: parseOutText
def parseOutText(f):
""" given an opened email file f, parse out all text below the
metadata block at the top
(in Part 2, you will also add stemming capabilities)
and return a string that contains all the words
in the email (space-separated)
example use case:
f = open("email_file_name.txt", "r")
text = parseOutText(f)
"""
f.seek(0) ### go back to beginning of file (annoying)
all_text = f.read()
### split off metadata
content = all_text.split("X-FileName:")
words = ""
stemmer = SnowballStemmer("english")
if len(content) > 1:
text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
split = text_string.split()
text = [stemmer.stem(word) for word in split]
words = ' '.join(text)
f.close()
return words.strip()
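One caveat worth flagging: the content[1].translate(string.maketrans("", ""), string.punctuation) idiom used here (and in Examples 4 and 14) works only in Python 2; in Python 3, string.maketrans no longer exists in that form and str.translate takes a single table. A rough Python 3 equivalent of the punctuation stripping, assuming the same content variable, would be:

import string

# Python 3: str.maketrans's third argument lists the characters to delete
text_string = content[1].translate(str.maketrans("", "", string.punctuation))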
Example 4: tokenize
def tokenize(s, stem=True, digit=False, stop=True, use_re=False):
"""
:type s: str
:type stem: bool
:type use_re: bool
:rtype: set(str)
"""
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')
wordnet = WordNetLemmatizer()
table = string.maketrans("","")
if use_re:
s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
if digit:
tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation + string.digits)))
else:
tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation)))
if stop:
tokens = set(word for word in tokens if word not in stop_words)
if stem:
tokens = set(stemmer.stem(word) for word in tokens)
return tokens
Example 5: tokenize
def tokenize(self, document):
"""
Break text into sentences and each sentence into a list of single words
Ignore any token that falls into the stopwords set.
"""
# use sentence tokenizer sent_tokenize from nltk package
sentences = sent_tokenize(utils.to_unicode(document.lower()))
# create stemmer of class SnowballStemmer
stemmer = SnowballStemmer("english")
for sentence in sentences:
        words = list(utils.tokenize(self.cleanse_text(sentence)))
if self.remove_stopwords:
words = [
word for word in words
if word not in self.en_stopwords
]
if self.stemming:
words = [stemmer.stem(t) for t in words]
yield words
Example 6: pos_tokenizer
def pos_tokenizer(s): #define a tokenizer that uses POS tagging
texts=nltk.word_tokenize(s)
texts=[word for word in texts if len(word)>2]
# PULL OUT NOUN AND VERB PHRASES
chunktext=nltk.pos_tag(texts)
patterns="""
VP:{<V.*><DT>?<JJ.*>?<NN.*>}
NP:{<DT>?<JJ>*<NN.*>}
N:{<NN.*>}
"""
NPchunker=nltk.RegexpParser(patterns)
from nltk.stem.snowball import SnowballStemmer
st=SnowballStemmer('english')
#print text
temp=[]
result=NPchunker.parse(chunktext)
#print result
for phrase in result:
try:
phrase.label()
string=''
m=0
for word in phrase:
if m==0:
string+=st.stem(word[0])
m+=1
else: string+=' '+st.stem(word[0])
temp.append(string)
        except:
            # plain (word, tag) leaves have no label(); skip them
            pass
return temp
Example 7: text_cleaner_and_tokenizer
def text_cleaner_and_tokenizer(texts):
"""
takes a list of sentences, removes punctuation, numbers, stopwords and stems.
Then joins everything back together and returns the filtered texts as a list of unicode strings
:param texts: list of unprocessed strings
:return: list of unicode strings
"""
i = 0
stopword_list = set(stopwords.words('danish'))
stemmer = SnowballStemmer("danish", ignore_stopwords=False)
filtered_texts = []
for sentence in texts:
for symbol in punctuation:
sentence = sentence.replace(symbol,'')
for num in numbers:
sentence = sentence.replace(str(num),'')
sentence = sentence.decode('utf-8').lower()
words_in_sentence = word_tokenize(sentence, language='danish')
filtered_sentence = []
for word in words_in_sentence:
if word not in stopword_list:
stem_word = stemmer.stem(word)
filtered_sentence.append(stem_word)
sentence = ' '.join(filtered_sentence)
filtered_texts.append(sentence)
i = i +1
if i % 1000 == 0:
print(i)
print('Done :D!')
return filtered_texts
Example 8: tokenize
def tokenize(text):
tokens = nltk.word_tokenize(text)
stems = []
stemmer = SnowballStemmer("english", ignore_stopwords=True)
for item in tokens:
stems.append(stemmer.stem(item))
return stems
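This example passes ignore_stopwords=True, which makes the stemmer return words from NLTK's stop-word list for that language unchanged instead of stemming them. A quick illustration (my own sketch, not from the project above):

from nltk.stem.snowball import SnowballStemmer

plain = SnowballStemmer("english")
keep = SnowballStemmer("english", ignore_stopwords=True)
print(plain.stem("having"))  # -> "have"
print(keep.stem("having"))   # -> "having" (stop words pass through unchanged)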
Example 9: read_corpus
def read_corpus(corpus_file, use_sentiment):
"Reads in the corpus and returns the documents and labels"
documents = []
labels = []
with open(corpus_file, encoding='utf-8') as f:
for line in f:
tokens = line.strip().split()
use_stopword = False
if use_stopword:
stopwordfile = open('stopwords.txt', 'r')
stopwords = []
for line in stopwordfile:
if len(line) > 0:
splitline = line.split(',')
for word in splitline:
stopwords.append(word)
tokenlist = [token for token in tokens[3:] if token not in stopwords]
documents.append(find_ngrams(tokenlist, 2))
else:
snowballstemmer = SnowballStemmer('english')
stemmedtokens = [snowballstemmer.stem(word) for word in tokens[3:]]
#documents.append(stemmedtokens)
documents.append(find_ngrams(stemmedtokens, 2))
if use_sentiment:
# 2-class problem: positive vs negative
labels.append( tokens[1] )
else:
# 6-class problem: books, camera, dvd, health, music, software
labels.append( tokens[0] )
return documents, labels
Example 10: clean_data
def clean_data(data):
'''
Stems and removes stop words from training and test data
'''
stemmer = SnowballStemmer('english')
stop = stopwords.words('english')
for column_name in ['query', 'product_title', 'product_description']:
for index, row in data.iterrows():
warnings.filterwarnings('error')
try:
extracted_data = (' ').join(
[i for i in BeautifulSoup(row[column_name], 'lxml')
.get_text(' ')
.split(' ')
])
except UserWarning:
pass
cleaned_data = re.sub('[^a-zA-Z0-9]',' ', extracted_data)
stemmed_data = (' ').join(
[stemmer.stem(i) for i in cleaned_data.split(' ')
])
            # join words (not characters) back together, dropping stop words
            remove_stop_words = (' ').join(
                [i for i in stemmed_data.split(' ') if i not in stop]
            )
data.set_value(index, column_name, unicode(remove_stop_words))
return data
Example 11: get_core_words
def get_core_words( text ):
#TOKENIZATION
b = word_tokenize(text)
#KEEP ONLY NOUNS
b = [noun for noun, pos in pos_tag(b) if pos.startswith('N')]
#CONVERT INTO LOWER CASE
looper = 0
for token in b:
b[looper] = token.lower()
looper+=1
#REMOVE THE STOPWORDS FROM THE FILE
minlength = 2
c = [token for token in b if (not token in stopwords.words('english')) and len(token) >= minlength]
#STEMMING THE WORDS TO ITS BASE FORM
stemmer = SnowballStemmer("english")
looper1 = 0
for token in c:
c[looper1] = stemmer.stem(token.decode("utf8"))
looper1 +=1
return c
Example 12: prune
def prune(doc, stoplist = None, stem = True, english_dictionary_words = False):
"""This takes a single document and tokenizes the words, removes
undesirable elements, and prepares it to be loaded into a dictionary.
"""
# Tokenize the document and make it lowercase
temp = utils.simple_preprocess(doc.lower())
# Remove freestanding punctuation and punctuation in words
temp = [w for w in temp if w not in string.punctuation]
temp = [rmPunct(w) for w in temp]
# Remove words in passed stoplist
if stoplist:
temp = [w for w in temp if w not in stoplist]
# Remove specific tokens
temp = [w for w in temp if w not in set(['[', ']', "'", '\n', 'com'])]
# Remove stopwords
temp = [w for w in temp if w not in stopwords.words('english')]
# Stem the remaining words
if stem:
stemmer = SnowballStemmer('english')
temp = [stemmer.stem(w) for w in temp]
if english_dictionary_words:
d = enchant.Dict("en_US")
temp = [w for w in temp if d.check(w)]
return temp
Example 13: processFile
def processFile(fh):
with gzip.open(fh, 'rb') as f:
tree = etree.parse(f)
root = tree.getroot()
r = re.compile('^[a-zA-Z]+$')
s = SnowballStemmer("english")
paragraphs = root.xpath('DOC[@type="story"]/TEXT/P')
for p in paragraphs:
try:
sentences = PunktSentenceTokenizer().sentences_from_text(p.text)
for sentence in sentences:
tokens = TreebankWordTokenizer().tokenize(sentence)
#Filter by alphabetic only
alphabetic = filter(r.match, tokens)
#Filter by stopwords & stem all leftover tokens
stop_filtered = [s.stem(w) for w in alphabetic if w.lower() not in stopwords.words('english')]
print (" ").join(stop_filtered).upper()
except:
continue
return True
Example 14: parseOutText
def parseOutText(f):
""" given an opened email file f, parse out all text below the
metadata block at the top
example use case:
f = open("email_file_name.txt", "r")
text = parseOutText(f)
"""
stemmer = SnowballStemmer("english")
f.seek(0) ### go back to beginning of file (annoying)
all_text = f.read()
### split off metadata
content = all_text.split("X-FileName:")
words = ""
if len(content) > 1:
### remove punctuation
text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
### split the text string into individual words, stem each word,
### and append the stemmed word to words (make sure there's a single
### space between each stemmed word)
words = ' '.join([stemmer.stem(word) for word in text_string.split()])
return words
Example 15: __init__
def __init__(self, df, column, n):  # gets the most frequent words in a document
texto = " ".join(str(x) for x in df[column].values)
tokens = texto.split()
tokens=[x.lower() for x in tokens]
#stopset = set(stopwords.words('english')) # dictionary of stop words
#tokens = [w for w in tokens if not w in stopset]
stemmer=SnowballStemmer("english")
stemm_words=[]
tokens_clean=[]
for j in tokens:
sa=re.sub('[^A-Za-z]+', '', j)
tokens_clean.append(sa)
#print tokens_clean
for s in tokens_clean:
try:
stem= stemmer.stem(s)
if s!='':
stemm_words.append(str(stem))
except:
pass
cuenta = len(tokens_clean)
largo = Counter(stemm_words).most_common(n)
topdic = dict(largo)
asortado = Series(topdic)
    # sort the stem counts in descending order (Series.order() was removed in
    # newer pandas; sort_values() is the equivalent)
    ordenado = asortado.sort_values(ascending=False)
ordenadolist= topdic.keys() #+stemm_words
self.top=ordenadolist
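For reference, the counting-and-ranking part of this example can be written much more compactly. The standalone sketch below is my own (the function name top_stems is hypothetical); it takes a list of strings and returns roughly the same kind of top-n stem list that this example stores in self.top.

from collections import Counter
import re

from nltk.stem.snowball import SnowballStemmer

def top_stems(texts, n):
    """Return the n most common English stems across a list of strings (a sketch)."""
    stemmer = SnowballStemmer("english")
    # lowercase each whitespace token and strip everything that is not a letter
    tokens = [re.sub('[^A-Za-z]+', '', t).lower() for t in " ".join(texts).split()]
    # stem the surviving tokens and keep the n most frequent stems
    stems = [stemmer.stem(t) for t in tokens if t]
    return [stem for stem, count in Counter(stems).most_common(n)]

Called as top_stems(df[column].astype(str).tolist(), n), it yields the top-n stems without the intermediate pandas Series bookkeeping used above.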