当前位置: 首页>>代码示例>>Python>>正文


Python snowball.SnowballStemmer方法代码示例

本文整理汇总了Python中nltk.stem.snowball.SnowballStemmer方法的典型用法代码示例。如果您正苦于以下问题:Python snowball.SnowballStemmer方法的具体用法?Python snowball.SnowballStemmer怎么用?Python snowball.SnowballStemmer使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk.stem.snowball的用法示例。


在下文中一共展示了snowball.SnowballStemmer方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: clean_resume

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def clean_resume(resume_text):
  """Normalize raw resume text into a cleaned, space-separated string.

  Tabs, newlines, and punctuation are replaced with spaces; stop words and
  purely numeric tokens are dropped; surviving words are lowercased.

  :param resume_text: raw resume text
  :return: cleaned text as a single space-separated string
  """
  # Replace control whitespace and all punctuation with spaces in a single
  # C-level pass instead of N chained .replace() calls.
  table = {ord(c): ' ' for c in '\t\n' + string.punctuation}
  words = resume_text.translate(table).split()

  # Build the stop-word set once: the original called stopwords.words()
  # and scanned the resulting list for every single word.
  stop_words = set(stopwords.words('english'))

  # Stemming was deliberately disabled in the original (stemmer.stem was
  # commented out); keep that behavior and only lowercase the tokens.
  # NOTE(review): the stop-word test uses the original-case word, so
  # capitalized stop words like "The" survive — confirm if intended.
  cleaned_resume = [
      word.lower()
      for word in words
      if word not in stop_words and not word.isdigit()
  ]
  return ' '.join(cleaned_resume)
开发者ID:skcript,项目名称:cvscan,代码行数:20,代码来源:language_parser.py

示例2: conversion

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def conversion(source, dest):
    """
    Look up the conversion factor between two units of measure.

    Both unit names are stemmed first so that, e.g., "meters" and "meter"
    resolve to the same ``conv_dict`` entry.

    :param source: the unit of measure you have
    :param dest: the unit of measure need to convert to
    :return: tuple ``(units, source, dest)`` where ``units`` is the
        conversion factor, or ``None`` when the pair is unknown
    """
    stemmer = SnowballStemmer('english')
    source = stemmer.stem(source)
    dest = stemmer.stem(dest)

    # An unknown source key makes .get() return None (AttributeError on the
    # chained .get), a missing 'Units'/'Destination' entry gives None
    # (TypeError on indexing), and an unknown destination raises ValueError
    # from .index(). Catch exactly those instead of a bare except that
    # would also hide genuine bugs (e.g. NameError).
    try:
        entry = conv_dict.get(source)
        units = entry.get('Units')[entry.get('Destination').index(dest)]
    except (AttributeError, TypeError, ValueError):
        units = None

    return units, source, dest
开发者ID:foxbook,项目名称:atap,代码行数:20,代码来源:converter.py

示例3: tokenize

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def tokenize(text):
        """
        Tokenizes sequences of text and stems the tokens.
        :param text: String to tokenize
        :return: List with stemmed tokens
        """
        # Build the stop-word set once: the original re-evaluated
        # stopwords.words('english') for every token inside the filter.
        stop_words = set(stopwords.words('english'))

        tokens = nltk.WhitespaceTokenizer().tokenize(text)
        # Keep letters and apostrophes, dedupe via set (raw string avoids
        # the needless \' escape), then drop stop words.
        tokens = list(set(re.sub(r"[^a-zA-Z']", "", token) for token in tokens))
        tokens = [word for word in tokens if word not in stop_words]
        # Strip remaining apostrophes and dedupe once more.
        tokens = list(set(re.sub(r"[^a-zA-Z]", "", token) for token in tokens))

        # Stem and discard tokens that stem to the empty string.
        stemmer = SnowballStemmer("english")
        stems = [stemmer.stem(token) for token in tokens]
        return [stem for stem in stems if stem != ""]
开发者ID:thomhopmans,项目名称:themarketingtechnologist,代码行数:19,代码来源:run.py

示例4: stemming_message_snowball

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def stemming_message_snowball(message, stemmings_to_words=None):
    """Stem a Finnish message and record which word produced each stem.

    :param message: text to stem, or ``None``
    :param stemmings_to_words: optional dict accumulating stem -> original
        word mappings; a fresh dict is created when omitted
    :return: tuple ``(stemmed_message, stemmings_to_words)``
    """
    from nltk.stem.snowball import SnowballStemmer
    from nltk.tokenize import casual_tokenize

    # Avoid the shared-mutable-default pitfall: the original default
    # dict() would be reused (and silently accumulate) across calls.
    if stemmings_to_words is None:
        stemmings_to_words = dict()

    # The original tested `type(message) == None`, which is never true
    # (type() always returns a type object); test the value itself.
    if message is None:
        return '', stemmings_to_words

    # str.replace returns a new string — the original discarded the
    # result, so '#' characters were never actually removed.
    message = message.replace('#', '')

    stemmer = SnowballStemmer('finnish')
    stemmed_words = []
    for word in casual_tokenize(message):
        stemmed_word = stemmer.stem(word.lower())
        stemmed_words.append(stemmed_word)
        stemmings_to_words[stemmed_word] = word

    return ' '.join(stemmed_words), stemmings_to_words
开发者ID:futurice,项目名称:spice-hate_speech_detection,代码行数:23,代码来源:texttools.py

示例5: process

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def process(input_text):
    """Tokenize, remove English stop words from, and stem *input_text*.

    :param input_text: raw input string
    :return: list of stemmed tokens
    """
    # Shared resources: \w+ word tokenizer, English Snowball stemmer,
    # and the English stop-word list.
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer('english')
    stop_words = stopwords.words('english')

    # Lowercase, tokenize, filter stop words, and stem in one pipeline.
    tokens = tokenizer.tokenize(input_text.lower())
    return [stemmer.stem(token) for token in tokens if token not in stop_words]
开发者ID:PacktPublishing,项目名称:Artificial-Intelligence-with-Python,代码行数:22,代码来源:topic_modeler.py

示例6: __init__

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def __init__(self, input_directory, language):
        """Set up summarizer state for one input corpus.

        Args:
            input_directory (str): the directory from which text documents to
              be summarized are loaded.

        @type language: str

        """
        self.input_directory = input_directory
        self.LANGUAGE = language
        # type: str

        # Sentence store and concept bookkeeping.
        self.sentences = []
        self.weights = {}
        self.c2s = defaultdict(set)
        self.concept_sets = defaultdict(frozenset)

        # Language-dependent resources: stop-word set and stemmer.
        self.stoplist = set(stopwords.words(self.LANGUAGE))
        self.stemmer = SnowballStemmer(self.LANGUAGE)

        # Word statistics and word -> sentence index.
        self.word_frequencies = defaultdict(int)
        self.w2s = defaultdict(set)
开发者ID:UKPLab,项目名称:acl2017-interactive_summarizer,代码行数:24,代码来源:concept_based.py

示例7: test_russian

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def test_russian(self):
        """Russian words written in Cyrillic or in Roman transliteration
        should both be stemmable."""
        stemmer = SnowballStemmer("russian")
        assert stemmer.stem("авантненькая") == "авантненьк"
        assert stemmer.stem("avenantnen'kai^a") == "avenantnen'k"
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:8,代码来源:test_stem.py

示例8: test_german

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def test_german(self):
        """German stemming with and without stop-word protection."""
        plain = SnowballStemmer("german")
        keep_stops = SnowballStemmer("german", ignore_stopwords=True)

        # Ordinary vocabulary stems identically either way.
        assert plain.stem("Schr\xe4nke") == 'schrank'
        assert keep_stops.stem("Schr\xe4nke") == 'schrank'

        # A stop word ("keinen") is preserved only when ignore_stopwords
        # is enabled.
        assert plain.stem("keinen") == 'kein'
        assert keep_stops.stem("keinen") == 'keinen'
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:11,代码来源:test_stem.py

示例9: test_spanish

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def test_spanish(self):
        """Spanish stemming, including a short-word regression case."""
        stemmer_es = SnowballStemmer('spanish')

        assert stemmer_es.stem("Visionado") == 'vision'

        # Regression: stemming "algue" used to raise an IndexError.
        assert stemmer_es.stem("algue") == 'algu'
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:9,代码来源:test_stem.py

示例10: test_short_strings_bug

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def test_short_strings_bug(self):
        """Regression: very short possessive forms must stem cleanly."""
        assert SnowballStemmer('english').stem("y's") == 'y'
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:5,代码来源:test_stem.py

示例11: __init__

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def __init__(self,bigrams=True,min_df=3,stemming=True,tfidf=True):
        """Configure the preprocessing pipeline.

        :param bigrams: include bigram features when True
        :param min_df: minimum document frequency for a term to be kept
        :param stemming: apply Snowball stemming when True
        :param tfidf: use tf-idf weighting when True
        """
        # Text-normalization resources shared by the pipeline.
        self.regex = re.compile('[^a-zA-Z ]')
        self.stop = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")

        # Feature-extraction options.
        self.bigrams = bigrams
        self.min_df = min_df
        self.stemming = stemming
        self.tfidf = tfidf
开发者ID:iamshang1,项目名称:Projects,代码行数:10,代码来源:preprocessing.py

示例12: cleaned_tokens

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def cleaned_tokens(tokens):
    """Clean the tokens by removing stop words and stemming."""
    # Stemming is currently disabled; only digit removal, lowercasing,
    # and stop-word filtering are applied.
    stop_words = set(stopwords.words('english'))
    lowered = [token.lower() for token in tokens if not token.isdigit()]
    return filter(lambda word: word.lower() not in stop_words, lowered)
开发者ID:9b,项目名称:chirp,代码行数:9,代码来源:helpers.py

示例13: build_analyzer

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def build_analyzer(self):
        """Wrap the parent analyzer so every produced token is stemmed."""
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)

        def analyzer(doc):
            return [stemmer.stem(word) for word in base_analyzer(doc)]

        return analyzer
开发者ID:karolzak,项目名称:support-tickets-classification,代码行数:6,代码来源:2_train_and_eval_model.py

示例14: __init__

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def __init__(self):
        """Prepare the tokenizer, stop-word list, and stemmer."""
        # Word-level tokenizer based on a \w+ regular expression.
        self.tokenizer = RegexpTokenizer(r'\w+')

        # English stop words to filter out during processing.
        self.stop_words_english = stopwords.words('english')

        # Snowball stemmer for English.
        self.stemmer = SnowballStemmer('english')
        
    # Tokenizing, stop word removal, and stemming 
开发者ID:PacktPublishing,项目名称:Python-Machine-Learning-Cookbook-Second-Edition,代码行数:13,代码来源:topic_modeling.py

示例15: tokenize_and_stem

# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def tokenize_and_stem(text):
    """Tokenize *text* into words and return their Snowball stems.

    Standalone numbers are stripped first; tokens containing no letters
    (pure punctuation or numerics) are discarded before stemming.

    :param text: input string
    :return: list of stemmed tokens
    """
    stemmer = SnowballStemmer("english")
    # Raw strings: "\d"/"\s" in a plain string are invalid escape
    # sequences (DeprecationWarning, future SyntaxError).
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    filtered_tokens = [t for t in tokens if re.search('[a-zA-Z]', t)]
    return [stemmer.stem(t) for t in filtered_tokens]
################################################################################ 
开发者ID:AutoViML,项目名称:Auto_ViML,代码行数:15,代码来源:Auto_NLP.py


注:本文中的nltk.stem.snowball.SnowballStemmer方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。