

Python snowballstemmer.stemmer Function Code Examples

This article collects typical usage examples of the snowballstemmer.stemmer function in Python. If you have been wondering how exactly the Python stemmer function works, how to call it, or what real-world usage looks like, the curated code examples below should help.


The following shows 15 code examples of the stemmer function, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
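Before the examples, here is a minimal sketch of the API that every snippet below relies on: snowballstemmer.stemmer(lang) returns a stemmer object exposing stemWord for a single token and stemWords for a list of tokens. The sample words and the expected outputs in the comments are illustrative assumptions, not taken from the library's documentation.

import snowballstemmer

# Build a stemmer for a named algorithm; an unknown name raises an error.
stemmer = snowballstemmer.stemmer('english')

# Stem a single token.
print(stemmer.stemWord('running'))  # expected: 'run'

# Stem a list of tokens in one call.
print(stemmer.stemWords(['caresses', 'ponies', 'meeting']))  # expected: ['caress', 'poni', 'meet']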

Example 1: clean

import re
import string

import snowballstemmer

def clean(text, stemmer='snowball'):
    """Normalize, split, and clean text.

    Parameters
    ----------
    text : str
        Block of text to clean and prepare.
    stemmer : str, optional
        Stemmer to use: one of 'snowball', 'five', 'simple', or 'none'.

    Returns
    -------
    text : list of str
        Cleaned and prepared tokens.
    """

    if stemmer not in ['snowball', 'five', 'simple', 'none']:
        raise ValueError("Stemmer choice not available.")

    # Lowercase, turn punctuation into spaces, and split on whitespace.
    text = re.sub("[{}]".format(string.punctuation), " ", text.lower())
    text = text.split()

    if stemmer == 'five':
        text = [five_stemmer(item) for item in text]
    elif stemmer == 'snowball':
        stemmer = snowballstemmer.stemmer('english')
        text = stemmer.stemWords(text)
    elif stemmer == 'simple':
        text = [simple_stem(item) for item in text]

    # five_stemmer, simple_stem, and STOP_WORDS are defined elsewhere in corpus_stats.py.
    text = [item for item in text if item not in STOP_WORDS]

    return text
Developer ID: justincely, Project: classwork, Lines of code: 35, Source: corpus_stats.py

Example 2: preprocess_document

import string

import nltk
from nltk.corpus import stopwords
import snowballstemmer

def preprocess_document(data):
    # Step 1: strip punctuation
    data = data.lower()
    punctuation = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']',
                   '{', '}', '#', '\\', '/', '@', '\xa0', '\n', '&', '$', '‘', '…', '•', '-']
    for punc in punctuation:
        data = data.replace(punc, '')

    # Step 2: tokenize
    data = list(nltk.word_tokenize(data))

    # Step 3: strip stopwords
    stop = set(stopwords.words('english'))
    extra_stopwords = ['ok', 'oh', 'via', 'bc', 'gon', 'na']  # add any additional stopwords we want to use here
    stop.update(extra_stopwords)
    stop.update(list(string.ascii_lowercase))  # remove all single letters
    data = [i for i in data if i not in stop]  # remove stopwords

    # Step 4: stemming
    stemmer = snowballstemmer.stemmer('english')
    data = stemmer.stemWords(data)

    # Step 5: remove words not in the NLTK English corpus
    # (a comprehension avoids mutating the list while iterating over it)
    words = set(nltk.corpus.words.words())
    data = [w for w in data if w in words]
    return data
Developer ID: ZanW, Project: Python, Lines of code: 27, Source: News_pre_l.py

Example 3: stemming

import codecs

import snowballstemmer

def stemming(lang, input, output, encoding, pretty):
    result = []
    stemmer = snowballstemmer.stemmer(lang)
    for original in codecs.open(input, "r", encoding).readlines():
        original = original.strip()
        # Convert only ASCII letters to lowercase, to match C behavior
        # (lower_ is a helper defined elsewhere in stemwords.py).
        original = ''.join(lower_(c) if 'A' <= c <= 'Z' else c for c in original)
        stemmed = stemmer.stemWord(original)
        if result:
            result.append('\n')
        if pretty == 0:
            if stemmed != "":
                result.append(stemmed)
        elif pretty == 1:
            # list.append takes a single argument, so join the pieces first
            result.append(original + " -> " + stemmed)
        elif pretty == 2:
            result.append(original)
            if len(original) < 30:
                result.append(" " * (30 - len(original)))
            else:
                result.append("\n")
                result.append(" " * 30)
            result.append(stemmed)
    outfile = codecs.open(output, "w", encoding)
    outfile.write(''.join(result) + '\n')
    outfile.close()
Developer ID: xjzhou, Project: snowball, Lines of code: 26, Source: stemwords.py

Example 4: getHighlightingsVariables

    def getHighlightingsVariables(self, article, variable_keywords, variable_pages):
        stemmer = snowballstemmer.stemmer("german")
        #goodchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜäöüß'"
        # Stem every word of every article segment in place.
        for i in range(0, len(article)):
            for j in range(0, len(article[i])):
                article[i][j] = article[i][j].split(" ")
                for k in range(0, len(article[i][j])):
                    #article[i][j][k] = chrtran(article[i][j][k], goodchars, "")
                    article[i][j][k] = stemmer.stemWord(article[i][j][k])

        for i in range(0, len(variable_keywords)):
            #variable_keywords[i] = chrtran(variable_keywords[i], goodchars, "")
            variable_keywords[i] = stemmer.stemWord(variable_keywords[i])

        highlight = []

        # Build a random highlighting score per article segment and keyword.
        for i in range(0, len(article)):
            highlight_article = []

            for j in range(0, len(article[i])):
                highlight_variables = []
                for k in range(0, len(variable_keywords)):
                    highlight_variables.append(random.random())
                highlight_article.append(highlight_variables)

            highlight.append(highlight_article)

        return highlight
Developer ID: Institute-Web-Science-and-Technologies, Project: westcat, Lines of code: 31, Source: Highlighter_Articles.py

Example 5: turkish

def turkish(sent):
    # No turkish stemmer in NLTK
    stem = snowballstemmer.stemmer('turkish')
    stop = stopwords.words('turkish')
    tx = word_tokenize(sent)
    mx = stem.stemWords(tx)
    px = [x for x in mx if x not in stop]
    return px
Developer ID: Jiannan28, Project: stemtokstop, Lines of code: 8, Source: stemtokstop.py

Example 6: __init__

 def __init__(self, language=None):
     """Create a new highlighter for the specified language.
     
     """
     if language:
         self.stem = snowballstemmer.stemmer(language)
     else:
         self.stem = NoStem()
Developer ID: flaxsearch, Project: highlighter, Lines of code: 8, Source: highlight.py

Example 7: aplicarStemmer

def aplicarStemmer(pDictPalabrasArchivos):
    print("aplicando stemming...")
    dictRaices = {}
    stemmer = snowballstemmer.stemmer("spanish")
    for docId, palabras in pDictPalabrasArchivos.items():
        raices = stemmer.stemWords(palabras)
        dictRaices[docId] = raices
    ##    archivo.archivo.crearCSVDict(".\stemming.csv",dictRaices)
    return dictRaices
Developer ID: 201265615, Project: TP2_RIT_II15_PY, Lines of code: 9, Source: archivo_invertido.py

Example 8: __init__

 def __init__(self, xml):
     self.dest = xml.get("dest")
     if self.dest is None:
         raise ValueError()
     self.verbose = xml.get("verbose")
     if self.verbose is None:
         self.verbose = False
     else:
         self.verbose = True
     self.stemmer = snowballstemmer.stemmer('english')
Developer ID: Sentimentron, Project: Nebraska-public, Lines of code: 10, Source: stemmer.py

Example 9: aplicarStemmerConsulta

def aplicarStemmerConsulta(pLista):
    #print(pLista)
    print("aplicando stemming...")
    lista = []
    stemmer = snowballstemmer.stemmer('spanish')
    for i in pLista:
        #print(i[0])
        raiz = stemmer.stemWords([i[0]])[0]
        lista.append([raiz,i[1]])
        #print(i[0])
    #print(lista)
    return lista
Developer ID: 201265615, Project: TP2_RIT_II15_PY, Lines of code: 12, Source: consultas.py

Example 10: __init__

 def __init__(self, samples=None, stopwords="english", limit=20, logging=False):
     """
     Create a vocabulary which is a mapping from bucket names to lists of
     synonyms that fall into their bucket. Stopwords is a list of words that
     are ignored for the vocabulary and defaults to a built-in english
     stopword list.
     """
     self.stopwords = stopwords
     self.stemmer = snowballstemmer.stemmer("english")
     self.tokens = re.compile(r"[A-Z]?[a-z]{2,}")
     self.logging = logging
     if samples:
         self._generate_vocabulary(samples, limit)
Developer ID: janukobytsch, Project: wikimedia-image-classification, Lines of code: 13, Source: words.py

Example 11: create_search_terms

def create_search_terms(string_terms):
  ''' Creates search terms by stemming every word within the parameter passed.
  Returns all search terms in one string separated by space'''
  stemmer = snowballstemmer.stemmer('english')
  terms = stemmer.stemWords(string_terms.split())

  search_term = list()
  for term in terms:
    lower_term = term.lower()
    if lower_term not in _STOP_WORDS:
      search_term.append(lower_term)

  return " ".join(search_term)
Developer ID: Trekafe, Project: trekafe_web, Lines of code: 13, Source: search_terms.py

Example 12: search_result

def search_result(request):
    query = request.POST.get('query')
    q_words = query.split()
    stemmed_words = []
    for word in q_words:
        lng = detect(word)
        if lng in LANGUAGES:
            lng = LANGUAGES[lng]
            stemmed_words.append(snowballstemmer.stemmer(lng).stemWord(word))
        else:
            stemmed_words.append(word)

    return render(request, 'searchres/search_result.html', {})
Developer ID: alehat, Project: searchengine, Lines of code: 13, Source: views.py

Example 13: getPalabras

def getPalabras():
    file = "dicc.txt"

    arc = open(file, 'r')
    stemmer = snowballstemmer.stemmer('spanish')

    words = {}
    for i in arc:
        i = i.rstrip()
        i = stemmer.stemWord(i)
        words[i] = "word"
    arc.close()

    for i in words.items():
        print(i)

    print(len(words))
Developer ID: andoniVT, Project: OpinionMiningProject, Lines of code: 17, Source: Utils.py

Example 14: get_coursed_and_create_matrix

def get_coursed_and_create_matrix():
    results = [course for course in modulestore().get_courses() if course.scope_ids.block_type == "course"]
    new_matrix = TfidMatrixAllCourses.objects.all().first() or TfidMatrixAllCourses()
    print(new_matrix.matrix.shape[0] != len(results))
    if new_matrix.matrix.shape[0] != len(results):
        all_courses = [re.sub("<[^>]*>", "", CourseDetails.fetch_about_attribute(x.id, "overview")) for x in results]

        MatrixEdxCoursesId.objects.all().delete()
        # An explicit loop instead of map(): in Python 3, map() is lazy, so its
        # side effects (the .create() calls) would never execute.
        for x in results:
            MatrixEdxCoursesId.objects.create(course_key=x.id, course_index=results.index(x))

        stemmer = snowballstemmer.stemmer("english")
        courses_stem = [" ".join(stemmer.stemWords(x.split())) for x in all_courses]

        vect = TfidfVectorizer(stop_words=get_stop_words(), lowercase=True, dtype=np.float32)
        matrix = vect.fit_transform(courses_stem)
        new_matrix.matrix = matrix
        new_matrix.save()
Developer ID: vz10, Project: edx_telegram_bot, Lines of code: 17, Source: prediction.py

Example 15: identify_language

 def identify_language(self, text):
     self.lang = lang_mapping[langid.classify(text)[0]]
     if self.debug:
         print("LANG", self.lang)  # , "stemmer", self.stem

     if self.lang == "greek":
         from stemmers.greek import stem, stopwords
         self.stem = stem
         self.legal_token = partial(self.legal_token, exclude_list=stopwords)
     elif self.lang == "turkish":  # unfortunately, the Turkish stemmer isn't included in NLTK
         import snowballstemmer
         from stemmers.turkish import stopwords
         self.stem = snowballstemmer.stemmer("turkish").stemWord
         self.legal_token = partial(self.legal_token, exclude_list=stopwords)
     else:
         from nltk.stem import SnowballStemmer
         from nltk.corpus import stopwords
         self.stem = SnowballStemmer(self.lang).stem
         self.legal_token = partial(self.legal_token, exclude_list=stopwords.words(self.lang))
Developer ID: hymloth, Project: pyredise, Lines of code: 18, Source: index_base.py


Note: The snowballstemmer.stemmer examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by many developers; copyright in the source code remains with the original authors, and distribution or use should follow the corresponding project's license. Do not reproduce without permission.