当前位置: 首页>>代码示例>>Python>>正文


Python stem.PorterStemmer类代码示例

本文整理汇总了Python中nltk.stem.PorterStemmer类的典型用法代码示例。如果您正苦于以下问题:Python stem.PorterStemmer类的具体用法?Python stem.PorterStemmer怎么用?Python stem.PorterStemmer使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块nltk.stem的用法示例。


在下文中一共展示了stem.PorterStemmer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __repr__

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __repr__(self):
        """Return the fixed display string ``'<PorterStemmer>'``."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function 
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:21,代码来源:porter.py

示例2: __repr__

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __repr__(self):
        """Return the fixed display string ``'<PorterStemmer>'``."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            infile = open(f, 'r')
#            while 1:
#                w = infile.readline()
#                if w == '':
#                    break
#                w = w[:-1]
#                print p.stem(w)

##--NLTK--
## Added a demo() function 
开发者ID:blackye,项目名称:luscan-devel,代码行数:21,代码来源:porter.py

示例3: plot_term_kdes

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def plot_term_kdes(self, words, **kwargs):

        """
        Plot one kernel density estimate per term, then call ``plt.show()``.

        Every term is Porter-stemmed before being passed to ``self.kde``;
        extra keyword arguments are forwarded to ``self.kde`` unchanged.

        Args:
            words (list): A list of unstemmed terms.
        """

        porter = PorterStemmer()

        for term in words:
            plt.plot(self.kde(porter.stem(term), **kwargs))

        plt.show()
开发者ID:davidmcclure,项目名称:textplot,代码行数:18,代码来源:text.py

示例4: tokenize

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def tokenize(text):

    """
    Lazily split a text into lowercase alphabetic tokens.

    Args:
        text (str): The original text.

    Yields:
        dict: One mapping per token with the keys 'stemmed' (the Porter
        stem), 'unstemmed' (the lowercased match) and 'offset' (the
        0-based position of the token in the token sequence).
    """

    porter = PorterStemmer()

    # Lowercase first so the [a-z]+ pattern catches every ASCII letter run.
    matches = re.finditer('[a-z]+', text.lower())

    for position, hit in enumerate(matches):
        raw = hit.group(0)
        token = {
            'stemmed': porter.stem(raw),
            'unstemmed': raw,
            'offset': position,
        }
        yield token
开发者ID:davidmcclure,项目名称:textplot,代码行数:27,代码来源:utils.py

示例5: preprocess

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def preprocess(string):
    """
    Clean a text: strip punctuation, drop stopwords, lowercase and stem.

    Args:
        string (str): The raw text.

    Returns:
        str: The lowercased, Porter-stemmed tokens that are not stopwords,
        joined by spaces in their original order.
    """
    stemmer = PorterStemmer()
    # Remove any punctuation character
    removed_punc = ''.join(char for char in string if char not in punctuation)

    cleaned = []
    # Remove any stopword
    for word in removed_punc.split(' '):
        lowered = word.lower()
        # Test the lowercased form as well as the raw one: checking only the
        # raw word let capitalised stopwords ("The", "And") slip through and
        # reach the output as "the", "and".
        # NOTE(review): assumes `stops` holds lowercase entries, as NLTK's
        # English stopword list does -- confirm against its definition.
        if word in stops or lowered in stops:
            continue
        cleaned.append(stemmer.stem(lowered))
    return ' '.join(cleaned)




# Shuffle 
开发者ID:PacktPublishing,项目名称:Hands-On-Ensemble-Learning-with-Python,代码行数:19,代码来源:data_cleaning.py

示例6: _create_frequency_table

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def _create_frequency_table(text_string) -> dict:
    """
    Build a word-frequency table for a text, leaving out stop words.

    Each token is reduced to its Porter stem so that inflected forms of a
    word share one entry.

    :param text_string: the text to analyse
    :return: mapping of stemmed word -> number of occurrences
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        # Check the raw token first: stemming mangles many stop words
        # (e.g. "this" -> "thi"), so testing only the stem let them into
        # the table.
        if word.lower() in stopWords:
            continue

        stemmed = ps.stem(word)
        # Keep the original post-stemming check too, so anything that was
        # filtered before is still filtered.
        if stemmed in stopWords:
            continue

        freqTable[stemmed] = freqTable.get(stemmed, 0) + 1

    return freqTable
开发者ID:akashp1712,项目名称:nlp-akash,代码行数:26,代码来源:Word_Frequency_Summarization.py

示例7: tagFilterAndStemming

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def tagFilterAndStemming(originalTag):
    """
    Split a raw tag into stemmed tokens, dropping English stopwords.

    :param originalTag: the raw tag string
    :return: list of Porter-stemmed tokens, in order of appearance, whose
        stem is not an English stopword; empty if nothing usable remains
    """

    # Replace every character that is not an ASCII letter or digit by a space
    processedTag = re.sub("[^a-zA-Z0-9]", " ", originalTag)

    # split() without an argument collapses runs of spaces and ignores
    # leading/trailing ones. The previous split(" ") produced '' tokens for
    # input such as "rock!" or "", and those ended up in the result.
    processedTag = processedTag.split()

    stopwords_set = set(stopwords.words('english'))

    stemmer = PorterStemmer()

    result = []

    for tag in processedTag:

        tag_stemmed = stemmer.stem(tag)

        if tag_stemmed not in stopwords_set:
            result.append(tag_stemmed)

    return result
开发者ID:MaurizioFD,项目名称:RecSys2019_DeepLearning_Evaluation,代码行数:24,代码来源:TagPreprocessing.py

示例8: demo

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def demo():
    """
    Show the Porter stemmer at work on a sample of the Penn Treebank corpus.

    The words of the first three treebank files are printed twice under a
    starred banner: first as they are, then stemmed.
    """

    from nltk.corpus import treebank
    from nltk import stem

    porter = stem.PorterStemmer()

    # Collect every word of the first three files; the POS tag is not used.
    source_words = [
        word
        for fileid in treebank.files()[:3]
        for (word, _tag) in treebank.tagged_words(fileid)
    ]
    stemmed_words = [porter.stem(word) for word in source_words]

    def wrap(words):
        # Join into one string, then turn the whitespace that follows each
        # run of at most 70 characters into a line break.
        return re.sub(r"(.{,70})\s", r'\1\n', ' '.join(words) + ' ').rstrip()

    def banner(label):
        # Centre the label in 70 columns, pad with '*', and turn the dashes
        # around the label into spaces.
        return label.center(70).replace(' ', '*').replace('-', ' ')

    print(banner('-Original-'))
    print(wrap(source_words))
    print(banner('-Results-'))
    print(wrap(stemmed_words))
    print('*'*70)

##--NLTK-- 
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:36,代码来源:porter.py

示例9: preprocessing

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def preprocessing(text):
    """
    Normalise a raw text into a space-separated string of processed tokens.

    Steps, in order: punctuation is replaced by spaces, the text is
    tokenized, tokens are lowercased, English stopwords and tokens shorter
    than three characters are dropped, the rest is Porter-stemmed,
    POS-tagged and finally lemmatized (as a verb for verb tags, as a noun
    otherwise).

    :param text: the raw text
    :return: the processed tokens joined by single spaces
    """
    # Punctuation becomes a space; split()/join collapses whitespace runs.
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    # A set gives O(1) membership tests; the stopword list was previously
    # scanned linearly for every token.
    stopwds = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        # Verb tags lemmatize as verbs. Noun tags and every other tag
        # lemmatize as nouns, so no separate noun branch is needed.
        return lemmatizer.lemmatize(token, 'v' if tag in Verb_tags else 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
开发者ID:PacktPublishing,项目名称:Natural-Language-Processing-with-Python-Cookbook,代码行数:36,代码来源:9.5 Skipgram_Keras.py

示例10: preprocessing

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def preprocessing(text):
    """
    Normalise a raw text into a space-separated string of processed tokens.

    Steps, in order: punctuation is replaced by spaces, the text is
    tokenized, tokens are lowercased, English stopwords and tokens shorter
    than three characters are dropped, the rest is Porter-stemmed (best
    effort), POS-tagged and finally lemmatized (as a verb for verb tags, as
    a noun otherwise).

    :param text: the raw text
    :return: the processed tokens joined by single spaces
    """
    # Punctuation becomes a space; split()/join collapses whitespace runs.
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    # A set gives O(1) membership tests; the stopword list was previously
    # scanned linearly for every token.
    stopwds = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]
    except Exception:
        # Deliberate best effort: if the stemmer fails on any token, carry
        # on with the unstemmed tokens (`tokens` has not been rebound).
        # Catching Exception instead of using a bare `except:` lets
        # KeyboardInterrupt and SystemExit propagate.
        pass

    tagged_corpus = pos_tag(tokens)

    Verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        # Verb tags lemmatize as verbs. Noun tags and every other tag
        # lemmatize as nouns, so no separate noun branch is needed.
        return lemmatizer.lemmatize(token, 'v' if tag in Verb_tags else 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
开发者ID:PacktPublishing,项目名称:Natural-Language-Processing-with-Python-Cookbook,代码行数:40,代码来源:9.2 Email_Classification.py

示例11: __init__

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __init__(self):
        """Create a stemmer instance and keep it on ``self.stemmer``."""
        # NOTE(review): NltkPorterStemmer is not defined in this snippet;
        # presumably an import alias for nltk.stem.PorterStemmer -- confirm.
        self.stemmer = NltkPorterStemmer()

    #overrides 
开发者ID:plasticityai,项目名称:magnitude,代码行数:6,代码来源:word_stemmer.py

示例12: __init__

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __init__(self):
        """Create a stemmer instance and keep it on ``self.stemmer``."""
        # NOTE(review): NltkPorterStemmer is not defined in this snippet;
        # presumably an import alias for nltk.stem.PorterStemmer -- confirm.
        self.stemmer = NltkPorterStemmer() 
开发者ID:jcyk,项目名称:gtos,代码行数:4,代码来源:word_stemmer.py

示例13: demo

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.

    The words of the first three treebank files are printed under an
    "Original" banner, followed by their stems under a "Results" banner.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.files()[:3]:
        # The POS tag that comes with each word is not needed here.
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()

    # Print the results.
    # Single-argument print(...) produces the same output under Python 2's
    # print statement and Python 3's print function; the former bare
    # `print x` statements were a SyntaxError on Python 3.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK-- 
开发者ID:blackye,项目名称:luscan-devel,代码行数:36,代码来源:porter.py

示例14: __init__

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __init__(self, pos_tagged, lang, stem=False, min_word_len=3):
        """
        Store a normalised copy of a POS-tagged text and its filtered view.

        :param pos_tagged: List of list : Text pos_tagged as a list of sentences
        where each sentence is a list of tuple (word, TAG).
        :param lang: Language code; for 'fr' and 'de' every tag is passed
        through ``convert``.
        :param stem: If we want to apply stemming on the text (otherwise the
        words are only lowercased).
        :param min_word_len: Words shorter than this get the tag 'LESS'.
        """
        self.min_word_len = min_word_len
        self.considered_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ'}
        self.pos_tagged = []
        self.filtered_pos_tagged = []
        self.isStemmed = stem
        self.lang = lang

        # Step 1: normalise the word of every (word, tag) pair.
        if stem:
            porter = PorterStemmer()
            sentences = [[(porter.stem(pair[0]), pair[1]) for pair in sent]
                         for sent in pos_tagged]
        else:
            sentences = [[(pair[0].lower(), pair[1]) for pair in sent]
                         for sent in pos_tagged]

        # Step 2: words below the minimum length keep their text but are
        # re-tagged 'LESS'.
        sentences = [
            [(word, tag if len(word) >= min_word_len else 'LESS') for word, tag in sent]
            for sent in sentences
        ]

        # Step 3: convert some language-specific tag (NC, NE to NN) or
        # ADJA -> JJ; see the convert function.
        if lang in ('fr', 'de'):
            sentences = [[(word, convert(tag)) for word, tag in sent]
                         for sent in sentences]

        self.pos_tagged = sentences

        # Step 4: keep only the pairs accepted by is_candidate, lowercased.
        self.filtered_pos_tagged = [
            [(pair[0].lower(), pair[1]) for pair in sent if self.is_candidate(pair)]
            for sent in self.pos_tagged
        ]
开发者ID:swisscom,项目名称:ai-research-keyphrase-extraction,代码行数:39,代码来源:input_representation.py

示例15: __init__

# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __init__(self, mode=NLTK_EXTENSIONS):
        """
        Configure the stemmer for one of its three modes.

        :param mode: one of ``NLTK_EXTENSIONS``, ``MARTIN_EXTENSIONS`` or
            ``ORIGINAL_ALGORITHM`` (class attributes).
        :raises ValueError: if ``mode`` is none of the three.

        In ``NLTK_EXTENSIONS`` mode a lookup table ``self.pool`` mapping
        each irregular word form to its stem is also built.
        """
        supported_modes = (
            self.NLTK_EXTENSIONS,
            self.MARTIN_EXTENSIONS,
            self.ORIGINAL_ALGORITHM,
        )
        if mode not in supported_modes:
            raise ValueError(
                "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
                "PorterStemmer.MARTIN_EXTENSIONS, or "
                "PorterStemmer.ORIGINAL_ALGORITHM"
            )

        self.mode = mode

        if self.mode == self.NLTK_EXTENSIONS:
            # This is a table of irregular forms. It is quite short,
            # but still reflects the errors actually drawn to Martin
            # Porter's attention over a 20 year period!
            irregular_forms = {
                "sky": ["sky", "skies"],
                "die": ["dying"],
                "lie": ["lying"],
                "tie": ["tying"],
                "news": ["news"],
                "inning": ["innings", "inning"],
                "outing": ["outings", "outing"],
                "canning": ["cannings", "canning"],
                "howe": ["howe"],
                "proceed": ["proceed"],
                "exceed": ["exceed"],
                "succeed": ["succeed"],
            }

            # Invert the table: every listed surface form points at its stem.
            self.pool = {
                form: root
                for root, forms in irregular_forms.items()
                for form in forms
            }

        self.vowels = frozenset('aeiou')
开发者ID:V1EngineeringInc,项目名称:V1EngineeringInc-Docs,代码行数:41,代码来源:porter.py


注:本文中的nltk.stem.PorterStemmer方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。