

Python snowball.SnowballStemmer Method Code Examples

This article compiles typical usage examples of the Python method nltk.stem.snowball.SnowballStemmer. If you are wondering what snowball.SnowballStemmer does, how to use it, or want to see working examples, the curated code samples below may help. You can also explore further usage examples from the nltk.stem.snowball module.


The following presents 15 code examples of snowball.SnowballStemmer, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
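Before diving into the examples, here is a minimal sketch of the basic SnowballStemmer API; the stems in the comments are typical outputs of NLTK's English Snowball stemmer:

from nltk.stem.snowball import SnowballStemmer

# the class lists the languages it supports
print(SnowballStemmer.languages)    # ('arabic', 'danish', ..., 'turkish')

stemmer = SnowballStemmer("english")
print(stemmer.stem("running"))      # 'run'
print(stemmer.stem("generously"))   # 'generous'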

Example 1: clean_resume

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def clean_resume(resume_text):

  cleaned_resume = []

  # replace tabs, newlines, and punctuation with spaces
  resume_text = resume_text.replace('\t', ' ').replace('\n', ' ')
  for punctuation in string.punctuation:
    resume_text = resume_text.replace(punctuation, ' ')
  resume_text = resume_text.split()

  # remove stop words and digits; the stemmer is created, but the
  # stemming call itself is left disabled (see the commented-out call below)
  stemmer = SnowballStemmer("english")
  stop_words = set(stopwords.words('english'))
  for word in resume_text:
    if word not in stop_words and not word.isdigit():
      cleaned_resume.append(word.lower())  # stemmer.stem(word)

  cleaned_resume = ' '.join(cleaned_resume)
  return cleaned_resume
Developer ID: skcript, Project: cvscan, Lines: 20, Source file: language_parser.py
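
A minimal usage sketch of clean_resume (the resume text below is invented for illustration):

raw = "Senior Engineer\n5+ years of Python, NLP, and machine-learning experience."
print(clean_resume(raw))
# -> 'senior engineer years python nlp machine learning experience'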

Example 2: conversion

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def conversion(source, dest):
    """
    :param source: the unit of measure you have
    :param dest: the unit of measure to convert to
    :return: the conversion factor (or None), plus the stemmed source and dest
    """
    stemmer = SnowballStemmer('english')
    source = stemmer.stem(source)
    dest = stemmer.stem(dest)

    try:
        units = conv_dict.get(source).get('Units')[
            conv_dict.get(source).get('Destination').index(dest)
        ]
    except (AttributeError, ValueError):
        # AttributeError: unknown source unit; ValueError: unknown destination unit
        units = None

    return units, source, dest
Developer ID: foxbook, Project: atap, Lines: 20, Source file: converter.py
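
conv_dict is defined elsewhere in the converter module and is not shown here; judging from the lookups above, a compatible (hypothetical) shape keys parallel Destination/Units lists by the stemmed source unit:

# hypothetical data -- the real conv_dict lives elsewhere in the project
conv_dict = {
    'inch': {'Destination': ['centimet', 'foot'], 'Units': [2.54, 1.0 / 12]},
}

print(conversion('inches', 'centimeters'))   # (2.54, 'inch', 'centimet')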

Example 3: tokenize

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def tokenize(text):
    """
    Tokenizes sequences of text and stems the tokens.
    :param text: String to tokenize
    :return: List with stemmed tokens
    """
    tokens = nltk.WhitespaceTokenizer().tokenize(text)
    tokens = list(set(re.sub(r"[^a-zA-Z']", "", token) for token in tokens))
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    tokens = list(set(re.sub(r"[^a-zA-Z]", "", token) for token in tokens))
    stems = []
    stemmer = SnowballStemmer("english")
    for token in tokens:
        token = stemmer.stem(token)
        if token != "":
            stems.append(token)
    return stems
Developer ID: thomhopmans, Project: themarketingtechnologist, Lines: 19, Source file: run.py
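
A usage sketch; note that the set() deduplication makes the token order nondeterministic:

print(tokenize("Runners running faster"))
# e.g. ['runner', 'run', 'faster'] -- order varies because set() deduplicates tokens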

Example 4: stemming_message_snowball

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def stemming_message_snowball(message, stemmings_to_words=None):
    from nltk.stem.snowball import SnowballStemmer
    from nltk.tokenize import casual_tokenize
    stemmer = SnowballStemmer('finnish')

    # avoid a mutable default argument: a shared dict would leak state across calls
    if stemmings_to_words is None:
        stemmings_to_words = dict()

    if message is None:
        return '', stemmings_to_words

    # str.replace returns a new string, so the result must be reassigned
    message = message.replace('#', '')

    stemmed_message = []

    for word in casual_tokenize(message):

        stemmed_word = stemmer.stem(word.lower())
        stemmed_message.append(stemmed_word)
        stemmings_to_words[stemmed_word] = word

    stemmed_message = ' '.join(stemmed_message)

    return stemmed_message, stemmings_to_words
Developer ID: futurice, Project: spice-hate_speech_detection, Lines: 23, Source file: texttools.py
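
A usage sketch (the Finnish message is invented for illustration; exact stems depend on the Snowball Finnish rules):

stemmed, mapping = stemming_message_snowball('Hyvää huomenta #Helsinki')
print(stemmed)    # the lowercased, stemmed message as a single string
print(mapping)    # maps each stem back to the last original word that produced it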

Example 5: process

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer 
    stemmer = SnowballStemmer('english')

    # Get the list of stop words 
    stop_words = stopwords.words('english')
    
    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words 
    tokens = [x for x in tokens if x not in stop_words]
    
    # Perform stemming on the tokenized words 
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed 
Developer ID: PacktPublishing, Project: Artificial-Intelligence-with-Python, Lines: 22, Source file: topic_modeler.py
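
A usage sketch showing the full pipeline of tokenization, stop-word removal, and stemming:

print(process("The quick brown foxes were running quickly"))
# -> ['quick', 'brown', 'fox', 'run', 'quick']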

Example 6: __init__

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def __init__(self, input_directory, language):
        """
        Args:
            input_directory (str): the directory from which text documents to
              be summarized are loaded.
            language (str): the language of the input documents, used for
              stop-word lookup and stemming.
        """
        self.input_directory = input_directory
        self.sentences = []
        self.weights = {}
        self.c2s = defaultdict(set)
        self.concept_sets = defaultdict(frozenset)
        self.LANGUAGE = language
        # type: str

        self.stoplist = set(stopwords.words(self.LANGUAGE))
        self.stemmer = SnowballStemmer(self.LANGUAGE)

        self.word_frequencies = defaultdict(int)
        self.w2s = defaultdict(set) 
Developer ID: UKPLab, Project: acl2017-interactive_summarizer, Lines: 24, Source file: concept_based.py
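
The excerpt only shows __init__; assuming the enclosing summarizer class is imported as, say, Summarizer (a hypothetical name, not shown in this excerpt), instantiation would look like:

# 'Summarizer' is a placeholder for the project's actual class name
summarizer = Summarizer('/path/to/documents', 'english')
print(len(summarizer.stoplist))   # number of English stop words loaded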

Example 7: test_russian

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Developer ID: rafasashi, Project: razzy-spinner, Lines: 8, Source file: test_stem.py

Example 8: test_german

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen' 
Developer ID: rafasashi, Project: razzy-spinner, Lines: 11, Source file: test_stem.py

Example 9: test_spanish

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Developer ID: rafasashi, Project: razzy-spinner, Lines: 9, Source file: test_stem.py

Example 10: test_short_strings_bug

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Developer ID: rafasashi, Project: razzy-spinner, Lines: 5, Source file: test_stem.py

Example 11: __init__

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def __init__(self, bigrams=True, min_df=3, stemming=True, tfidf=True):
        self.regex = re.compile('[^a-zA-Z ]')
        self.stop = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")
        self.bigrams = bigrams
        self.min_df = min_df
        self.stemming = stemming
        self.tfidf = tfidf 
Developer ID: iamshang1, Project: Projects, Lines: 10, Source file: preprocessing.py

Example 12: cleaned_tokens

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def cleaned_tokens(tokens):
    """Clean the tokens by lowercasing them and removing stop words and
    digits (the stemming step below is left disabled)."""
    # stemmer = SnowballStemmer("english")
    # stemmed = [stemmer.stem(token) for token in tokens]
    s = set(stopwords.words('english'))
    tokens = [x.lower() for x in tokens if not x.isdigit()]
    return [w for w in tokens if w not in s]
Developer ID: 9b, Project: chirp, Lines: 9, Source file: helpers.py
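
A usage sketch (with the list-returning rewrite above):

print(cleaned_tokens(['The', 'Quick', 'Brown', 'Fox', '42']))
# -> ['quick', 'brown', 'fox']   ('the' is a stop word, '42' is a digit)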

Example 13: build_analyzer

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)]) 
Developer ID: karolzak, Project: support-tickets-classification, Lines: 6, Source file: 2_train_and_eval_model.py
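
The snippet above overrides build_analyzer on a CountVectorizer subclass; a self-contained sketch of how such a class is typically defined and used with scikit-learn:

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        # wrap the stock analyzer so every token is stemmed before counting
        analyzer = super().build_analyzer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]

vectorizer = StemmedCountVectorizer(stop_words='english')
X = vectorizer.fit_transform(["tickets were opened", "opening a ticket"])
print(vectorizer.get_feature_names_out())   # stems such as 'open' and 'ticket'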

Example 14: __init__

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def __init__(self):
        # Create a regular expression tokenizer
        self.tokenizer = RegexpTokenizer(r'\w+')

        # get the list of stop words 
        self.stop_words_english = stopwords.words('english')

        # Create a Snowball stemmer 
        self.stemmer = SnowballStemmer('english')
        
    # Tokenizing, stop word removal, and stemming 
Developer ID: PacktPublishing, Project: Python-Machine-Learning-Cookbook-Second-Edition, Lines: 13, Source file: topic_modeling.py

Example 15: tokenize_and_stem

# Required module: from nltk.stem import snowball [as alias]
# Or: from nltk.stem.snowball import SnowballStemmer [as alias]
def tokenize_and_stem(text):
    stemmer = SnowballStemmer("english")
    # strip leading, embedded, and trailing runs of digits
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    # first tokenize by sentence, then by word, to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
################################################################################ 
Developer ID: AutoViML, Project: Auto_ViML, Lines: 15, Source file: Auto_NLP.py
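
A function like tokenize_and_stem is typically passed as a custom tokenizer to a scikit-learn vectorizer (a usage sketch, not part of the original project):

from sklearn.feature_extraction.text import TfidfVectorizer

# token_pattern=None silences the warning about the unused default pattern
tfidf = TfidfVectorizer(tokenizer=tokenize_and_stem, token_pattern=None)
X = tfidf.fit_transform(["Cats are running", "A cat ran 42 times"])
print(tfidf.get_feature_names_out())   # stems such as 'cat', 'run', 'ran', 'time' (stop words are not removed here)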


Note: The nltk.stem.snowball.SnowballStemmer examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by many developers, and copyright remains with the original authors. Please refer to each project's license before distributing or using the code; do not repost without permission.