This article collects typical usage examples of the stemmer function from the Python snowballstemmer module. If you are wondering what the Python stemmer function does, how to call it, or what real-world uses of it look like, the curated examples below should help.
A total of 15 code examples of the stemmer function are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
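Before the project snippets, here is a minimal, self-contained sketch of the snowballstemmer.stemmer API that all of the examples below rely on. The sample words and the stems shown in the comments are only illustrative:

import snowballstemmer

# stemmer() takes a language/algorithm name and returns a stemmer object.
stemmer = snowballstemmer.stemmer('english')

# stemWord() stems a single token; stemWords() stems a list of tokens.
print(stemmer.stemWord('running'))              # expected: 'run'
print(stemmer.stemWords(['cats', 'stemming']))  # expected: ['cat', 'stem']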
Example 1: clean
def clean(text, stemmer='snowball'):
    """Normalize, split, and clean text.

    Parameters:
    -----------
    text : str
        Block of text to clean and prepare.
    stemmer : str, optional
        Stemmer to use: ['snowball', 'five', 'simple', 'none']

    Returns:
    --------
    text : list of str
        Cleaned and prepared tokens.
    """
    if stemmer not in ['snowball', 'five', 'simple', 'none']:
        raise ValueError("Stemmer choice not available.")
    # Lowercase, replace punctuation with spaces, and split on whitespace
    text = re.sub("[{}]".format(string.punctuation), " ", text.lower())
    text = text.split()
    # five_stemmer, simple_stem, and STOP_WORDS are defined elsewhere in the module
    if stemmer == 'five':
        text = [five_stemmer(item) for item in text]
    elif stemmer == 'snowball':
        stemmer = snowballstemmer.stemmer('english')
        text = stemmer.stemWords(text)
    elif stemmer == 'simple':
        text = [simple_stem(item) for item in text]
    else:
        pass
    text = [item for item in text if item not in STOP_WORDS]
    return text
Example 2: preprocess_document
def preprocess_document(data):
    # Step 1: strip punctuation
    data = data.lower()
    punctuation = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']',
                   '{', '}', '#', '\\', '/', '@', '\xa0', '\n', '&', '$', '‘', '…', '•', '-']
    for punc in punctuation:
        data = data.replace(punc, '')
    # Step 2: tokenize
    data = list(nltk.word_tokenize(data))
    # Step 3: strip stopwords
    stop = set(stopwords.words('english'))
    extra_stopwords = ['ok', 'oh', 'via', 'bc', 'gon', 'na']  # add any additional stopwords we want to use here
    stop.update(extra_stopwords)
    stop.update(list(string.ascii_lowercase))  # remove all single letters
    data = [i for i in data if i not in stop]  # remove stopwords
    # Step 4: stemming
    stemmer = snowballstemmer.stemmer('english')
    data = stemmer.stemWords(data)
    # Step 5: remove words not in the NLTK english corpus
    # (a list comprehension avoids removing items from the list while iterating over it)
    words = set(nltk.corpus.words.words())
    data = [w for w in data if w in words]
    return data
Example 3: stemming
def stemming(lang, input, output, encoding, pretty):
    result = []
    stemmer = snowballstemmer.stemmer(lang)
    for original in codecs.open(input, "r", encoding).readlines():
        original = original.strip()
        # Convert only ASCII letters to lowercase, to match C behavior
        # (lower_() is a helper defined elsewhere in the script)
        original = ''.join((lower_(c) if 'A' <= c <= 'Z' else c for c in original))
        stemmed = stemmer.stemWord(original)
        if result:
            result.append('\n')
        if pretty == 0:
            if stemmed != "":
                result.append(stemmed)
        elif pretty == 1:
            result.append(original + " -> " + stemmed)
        elif pretty == 2:
            result.append(original)
            if len(original) < 30:
                result.append(" " * (30 - len(original)))
            else:
                result.append("\n")
                result.append(" " * 30)
            result.append(stemmed)
    outfile = codecs.open(output, "w", encoding)
    outfile.write(''.join(result) + '\n')
    outfile.close()
Example 4: getHighlightingsVariables
def getHighlightingsVariables(self, article, variable_keywords, variable_pages):
    stemmer = snowballstemmer.stemmer("german")
    #goodchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜäöüß'"
    for i in range(0, len(article)):
        for j in range(0, len(article[i])):
            article[i][j] = article[i][j].split(" ")
            for k in range(0, len(article[i][j])):
                #article[i][j][k] = chrtran(article[i][j][k], goodchars, "")
                article[i][j][k] = stemmer.stemWord(article[i][j][k])
    for i in range(0, len(variable_keywords)):
        #variable_keywords[i] = chrtran(variable_keywords[i], goodchars, "")
        variable_keywords[i] = stemmer.stemWord(variable_keywords[i])
    highlight = []
    for i in range(0, len(article)):
        highlight_article = []
        for j in range(0, len(article[i])):
            highlight_variables = []
            for k in range(0, len(variable_keywords)):
                highlight_variables.append(random.random())
            highlight_article.append(highlight_variables)
        highlight.append(highlight_article)
    return highlight
Example 5: turkish
def turkish(sent):
    # No Turkish stemmer in NLTK, so use snowballstemmer instead
    stem = snowballstemmer.stemmer('turkish')
    stop = stopwords.words('turkish')
    tx = word_tokenize(sent)
    mx = stem.stemWords(tx)
    px = [x for x in mx if x not in stop]
    return px
Example 6: __init__
def __init__(self, language=None):
    """Create a new highlighter for the specified language.
    """
    if language:
        self.stem = snowballstemmer.stemmer(language)
    else:
        self.stem = NoStem()
Example 7: aplicarStemmer
def aplicarStemmer(pDictPalabrasArchivos):
    print("aplicando stemming...")
    dictRaices = {}
    stemmer = snowballstemmer.stemmer("spanish")
    for docId, palabras in pDictPalabrasArchivos.items():
        raices = stemmer.stemWords(palabras)
        dictRaices[docId] = raices
    ## archivo.archivo.crearCSVDict(".\stemming.csv", dictRaices)
    return dictRaices
Example 8: __init__
def __init__(self, xml):
    self.dest = xml.get("dest")
    if self.dest is None:
        raise ValueError()
    self.verbose = xml.get("verbose")
    if self.verbose is None:
        self.verbose = False
    else:
        self.verbose = True
    self.stemmer = snowballstemmer.stemmer('english')
Example 9: aplicarStemmerConsulta
def aplicarStemmerConsulta(pLista):
    #print(pLista)
    print("aplicando stemming...")
    lista = []
    stemmer = snowballstemmer.stemmer('spanish')
    for i in pLista:
        #print(i[0])
        raiz = stemmer.stemWords([i[0]])[0]
        lista.append([raiz, i[1]])
        #print(i[0])
    #print(lista)
    return lista
Example 10: __init__
def __init__(self, samples=None, stopwords="english", limit=20, logging=False):
    """
    Create a vocabulary which is a mapping from bucket names to lists of
    synonyms that fall into their bucket. Stopwords is a list of words that
    are ignored for the vocabulary and defaults to a built-in english
    stopword list.
    """
    self.stopwords = stopwords
    self.stemmer = snowballstemmer.stemmer("english")
    self.tokens = re.compile(r"[A-Z]?[a-z]{2,}")
    self.logging = logging
    if samples:
        self._generate_vocabulary(samples, limit)
Example 11: create_search_terms
def create_search_terms(string_terms):
    '''Creates search terms by stemming every word within the parameter passed.
    Returns all search terms in one string separated by spaces.'''
    stemmer = snowballstemmer.stemmer('english')
    terms = stemmer.stemWords(string_terms.split())
    search_term = list()
    for term in terms:
        lower_term = term.lower()
        if lower_term not in _STOP_WORDS:
            search_term.append(lower_term)
    return " ".join(search_term)
Example 12: search_result
def search_result(request):
    query = request.POST.get('query')
    q_words = query.split()
    stemmed_words = []
    for word in q_words:
        lng = detect(word)
        if lng in LANGUAGES:
            lng = LANGUAGES[lng]
            stemmed_words.append(snowballstemmer.stemmer(lng).stemWord(word))
        else:
            stemmed_words.append(word)
    return render(request, 'searchres/search_result.html', {})
Example 13: getPalabras
def getPalabras():
    file = "dicc.txt"
    arc = open(file, 'r')
    stemmer = snowballstemmer.stemmer('spanish')
    words = {}
    for i in arc:
        i = i.rstrip()
        i = stemmer.stemWord(i)
        words[i] = "word"
    for i in words.items():  # Python 2 print statements, as in the original project
        print i
    print len(words)
Example 14: get_coursed_and_create_matrix
def get_coursed_and_create_matrix():  # Python 2 code, as in the original project
    results = [course for course in modulestore().get_courses() if course.scope_ids.block_type == "course"]
    new_matrix = TfidMatrixAllCourses.objects.all().first() or TfidMatrixAllCourses()
    print new_matrix.matrix.shape[0] != len(results)
    if new_matrix.matrix.shape[0] != len(results):
        all_courses = [re.sub("<[^>]*>", "", CourseDetails.fetch_about_attribute(x.id, "overview")) for x in results]
        MatrixEdxCoursesId.objects.all().delete()
        map(lambda x: MatrixEdxCoursesId.objects.create(course_key=x.id, course_index=results.index(x)), results)
        stemmer = snowballstemmer.stemmer("english")
        courses_stem = [" ".join(stemmer.stemWords(x.split())) for x in all_courses]
        vect = TfidfVectorizer(stop_words=get_stop_words(), lowercase=True, dtype=np.float32)
        matrix = vect.fit_transform(courses_stem)
        new_matrix.matrix = matrix
        new_matrix.save()
Example 15: identify_language
def identify_language(self, text):  # Python 2 code, as in the original project
    self.lang = lang_mapping[langid.classify(text)[0]]
    if self.debug: print "LANG", self.lang  # , "stemmer", self.stem
    if self.lang == "greek":
        from stemmers.greek import stem, stopwords
        self.stem = stem
        self.legal_token = partial(self.legal_token, exclude_list=stopwords)
    elif self.lang == "turkish":  # unfortunately, the Turkish stemmer isn't included in NLTK
        import snowballstemmer
        from stemmers.turkish import stopwords
        self.stem = snowballstemmer.stemmer("turkish").stemWord
        self.legal_token = partial(self.legal_token, exclude_list=stopwords)
    else:
        from nltk.stem import SnowballStemmer
        from nltk.corpus import stopwords
        self.stem = SnowballStemmer(self.lang).stem
        self.legal_token = partial(self.legal_token, exclude_list=stopwords.words(self.lang))
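The examples above construct stemmers for english, spanish, german, and turkish. If you are unsure which language names your installed snowballstemmer release accepts, a small sketch like the following lists them; the exact output depends on the installed version:

import snowballstemmer

# algorithms() returns the list of language/algorithm names accepted by stemmer().
print(snowballstemmer.algorithms())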