當前位置: 首頁>>代碼示例>>Python>>正文


Python stem.PorterStemmer方法代碼示例

本文整理匯總了Python中nltk.stem.PorterStemmer方法的典型用法代碼示例。如果您正苦於以下問題:Python stem.PorterStemmer方法的具體用法?Python stem.PorterStemmer怎麽用?Python stem.PorterStemmer使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在nltk.stem的用法示例。


在下文中一共展示了stem.PorterStemmer方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。

示例1: __repr__

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def __repr__(self):
        """Debug representation; the stemmer carries no distinguishing state."""
        label = '<PorterStemmer>'
        return label

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function 
開發者ID:rafasashi,項目名稱:razzy-spinner,代碼行數:21,代碼來源:porter.py

示例2: __repr__

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def __repr__(self):
        """Return a constant repr string; every instance looks the same."""
        representation = '<PorterStemmer>'
        return representation

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            infile = open(f, 'r')
#            while 1:
#                w = infile.readline()
#                if w == '':
#                    break
#                w = w[:-1]
#                print p.stem(w)

##--NLTK--
## Added a demo() function 
開發者ID:blackye,項目名稱:luscan-devel,代碼行數:21,代碼來源:porter.py

示例3: plot_term_kdes

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def plot_term_kdes(self, words, **kwargs):

        """
        Plot kernel density estimates for multiple words on one figure.

        Args:
            words (list): A list of unstemmed terms; each term is stemmed
                before its KDE is computed.
        """

        stemmer = PorterStemmer()

        for term in words:
            # kde() expects the stemmed form of the term.
            plt.plot(self.kde(stemmer.stem(term), **kwargs))

        plt.show()
開發者ID:davidmcclure,項目名稱:textplot,代碼行數:18,代碼來源:text.py

示例4: tokenize

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def tokenize(text):

    """
    Yield tokens.

    Args:
        text (str): The original text.

    Yields:
        dict: The next token, with its stemmed form, raw form, and
        0-based position in the token stream.
    """

    stemmer = PorterStemmer()

    # Lowercase first so the [a-z]+ pattern matches every alphabetic run.
    for offset, match in enumerate(re.finditer('[a-z]+', text.lower())):

        raw = match.group(0)

        yield {
            'stemmed':      stemmer.stem(raw),
            'unstemmed':    raw,
            'offset':       offset
        }
開發者ID:davidmcclure,項目名稱:textplot,代碼行數:27,代碼來源:utils.py

示例5: preprocess

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def preprocess(string):
    """
    Clean one document: drop punctuation characters, remove stopwords,
    then lowercase and stem the surviving tokens.
    """
    stemmer = PorterStemmer()

    # Drop every punctuation character.
    no_punct = ''.join(ch for ch in string if ch not in punctuation)

    # Stopword test runs on the raw token (pre-lowercase), matching the
    # original pipeline; kept tokens are lowercased and stemmed.
    kept = [
        stemmer.stem(token.lower())
        for token in no_punct.split(' ')
        if token not in stops
    ]
    return ' '.join(kept)




# Shuffle 
開發者ID:PacktPublishing,項目名稱:Hands-On-Ensemble-Learning-with-Python,代碼行數:19,代碼來源:data_cleaning.py

示例6: _create_frequency_table

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def _create_frequency_table(text_string) -> dict:
    """
    we create a dictionary for the word frequency table.
    For this, we should only use the words that are not part of the stopWords array.

    Removing stop words and making frequency table
    Stemmer - an algorithm to bring words to its root word.
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        # BUG FIX: test the *original* word against the stopword list.
        # Stemming first ("this" -> "thi", "very" -> "veri") meant the
        # stemmed form no longer matched the stopword set, so most
        # stopwords slipped into the frequency table.
        if word.lower() in stopWords:
            continue
        stemmed = ps.stem(word)
        freqTable[stemmed] = freqTable.get(stemmed, 0) + 1

    return freqTable
開發者ID:akashp1712,項目名稱:nlp-akash,代碼行數:26,代碼來源:Word_Frequency_Summarization.py

示例7: tagFilterAndStemming

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def tagFilterAndStemming(originalTag):
    """
    Normalize a tag string into a list of stemmed, stopword-free tokens.

    :param originalTag: Raw tag text; any non-alphanumeric characters are
        treated as separators.
    :return: List of stemmed tokens that are not English stopwords.
    """
    # Replace every non-alphanumeric character with a space, then collapse
    # runs of spaces to a single one.
    processedTag = re.sub("[^a-zA-Z0-9]", " ", originalTag)
    processedTag = re.sub(" +", " ", processedTag)

    stopwords_set = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    result = []

    for tag in processedTag.split(" "):

        # BUG FIX: a leading/trailing separator leaves empty strings in the
        # split; previously '' was stemmed to '' and kept in the result.
        if not tag:
            continue

        tag_stemmed = stemmer.stem(tag)

        if tag_stemmed not in stopwords_set:
            result.append(tag_stemmed)

    return result
開發者ID:MaurizioFD,項目名稱:RecSys2019_DeepLearning_Evaluation,代碼行數:24,代碼來源:TagPreprocessing.py

示例8: demo

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig_words = []
    stemmed_words = []
    for fileid in treebank.files()[:3]:
        for word, _tag in treebank.tagged_words(fileid):
            orig_words.append(word)
            stemmed_words.append(stemmer.stem(word))

    def _wrap(words):
        # Join into one string and word-wrap at roughly 70 columns.
        return re.sub(r"(.{,70})\s", r'\1\n', ' '.join(words) + ' ').rstrip()

    def _banner(title):
        # Centered banner line, padding as '*' and '-' rendered as spaces.
        return title.center(70).replace(' ', '*').replace('-', ' ')

    print(_banner('-Original-'))
    print(_wrap(orig_words))
    print(_banner('-Results-'))
    print(_wrap(stemmed_words))
    print('*' * 70)

##--NLTK-- 
開發者ID:rafasashi,項目名稱:razzy-spinner,代碼行數:36,代碼來源:porter.py

示例9: preprocessing

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def preprocessing(text):
    """
    Normalize raw text: strip punctuation, tokenize, lowercase, drop
    stopwords and short tokens, stem, then lemmatize by POS tag.

    :param text: Raw input string.
    :return: Single space-joined string of processed tokens.
    """
    # Replace punctuation with spaces, then collapse whitespace runs.
    no_punct = "".join(" " if ch in string.punctuation else ch for ch in text)
    text2 = " ".join(no_punct.split())

    # Sentence-split, word-tokenize, and lowercase everything.
    tokens = [
        word.lower()
        for sent in nltk.sent_tokenize(text2)
        for word in nltk.word_tokenize(sent)
    ]

    # Drop English stopwords and tokens shorter than three characters.
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds and len(token) >= 3]

    # Stem before POS tagging (mirrors the original pipeline ordering).
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    tagged_corpus = pos_tag(tokens)

    verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        # Verbs get the verb lemma; nouns and everything else fall back to
        # the noun lemma (identical outcome to the original 3-way branch).
        pos = 'v' if tag in verb_tags else 'n'
        return lemmatizer.lemmatize(token, pos)

    return " ".join(prat_lemmatize(token, tag) for token, tag in tagged_corpus)
開發者ID:PacktPublishing,項目名稱:Natural-Language-Processing-with-Python-Cookbook,代碼行數:36,代碼來源:9.5 Skipgram_Keras.py

示例10: preprocessing

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def preprocessing(text):
    """
    Normalize raw text for classification: strip punctuation, tokenize,
    lowercase, remove stopwords and short tokens, stem (best effort),
    then lemmatize nouns/verbs according to their POS tag.

    :param text: Raw input string.
    :return: Single space-joined string of processed tokens.
    """
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    stemmed = []
    for word in tokens:
        # Best-effort stemming per token.  The original wrapped the whole
        # list in a bare `except:` (which even swallows KeyboardInterrupt)
        # and discarded stemming for *all* tokens when a single one failed.
        try:
            stemmed.append(stemmer.stem(word))
        except Exception:
            stemmed.append(word)
    tokens = stemmed

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        # Nouns -> noun lemma, verbs -> verb lemma, anything else -> noun.
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
開發者ID:PacktPublishing,項目名稱:Natural-Language-Processing-with-Python-Cookbook,代碼行數:40,代碼來源:9.2 Email_Classification.py

示例11: __init__

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def __init__(self):
        """Wrap NLTK's Porter stemmer; all stemming is delegated to it."""
        self.stemmer = NltkPorterStemmer()

    #overrides 
開發者ID:plasticityai,項目名稱:magnitude,代碼行數:6,代碼來源:word_stemmer.py

示例12: __init__

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def __init__(self):
        """Create and hold the NLTK PorterStemmer instance used for stemming."""
        self.stemmer = NltkPorterStemmer() 
開發者ID:jcyk,項目名稱:gtos,代碼行數:4,代碼來源:word_stemmer.py

示例13: demo

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.files()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results.  FIX: converted Python 2 `print` statements to
    # print() calls — the originals are syntax errors under Python 3 and
    # inconsistent with the other demo() in this collection.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*' * 70)

##--NLTK-- 
開發者ID:blackye,項目名稱:luscan-devel,代碼行數:36,代碼來源:porter.py

示例14: __init__

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def __init__(self, pos_tagged, lang, stem=False, min_word_len=3):
        """
        :param pos_tagged: List of list : Text pos_tagged as a list of sentences
        where each sentence is a list of tuple (word, TAG).
        :param lang: Language code of the text (e.g. 'en', 'fr', 'de').
        :param stem: If we want to apply stemming on the text.
        :param min_word_len: Words shorter than this are re-tagged 'LESS'
        so they are never kept as candidates.
        """
        self.min_word_len = min_word_len
        self.considered_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ'}
        self.filtered_pos_tagged = []
        self.isStemmed = stem
        self.lang = lang

        # Normalize every token: stem it, or just lowercase it.
        if stem:
            stemmer = PorterStemmer()
            sentences = [[(stemmer.stem(word), tag) for word, tag in sent]
                         for sent in pos_tagged]
        else:
            sentences = [[(word.lower(), tag) for word, tag in sent]
                         for sent in pos_tagged]

        # Mark too-short tokens with the sentinel tag 'LESS'.
        sentences = [
            [(word, 'LESS') if len(word) < min_word_len else (word, tag)
             for word, tag in sent]
            for sent in sentences
        ]

        # Convert some language-specific tag (NC, NE to NN) or ADJA ->JJ see convert method.
        if lang in ['fr', 'de']:
            sentences = [[(word, convert(tag)) for word, tag in sent]
                         for sent in sentences]

        self.pos_tagged = sentences
        self.filtered_pos_tagged = [
            [(t[0].lower(), t[1]) for t in sent if self.is_candidate(t)]
            for sent in sentences
        ]
開發者ID:swisscom,項目名稱:ai-research-keyphrase-extraction,代碼行數:39,代碼來源:input_representation.py

示例15: __init__

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import PorterStemmer [as 別名]
def __init__(self, mode=NLTK_EXTENSIONS):
        """
        Create a stemmer running in one of the three supported modes.

        :param mode: One of NLTK_EXTENSIONS, MARTIN_EXTENSIONS, or
            ORIGINAL_ALGORITHM.
        :raises ValueError: If *mode* is not one of the three constants.
        """
        valid_modes = (
            self.NLTK_EXTENSIONS,
            self.MARTIN_EXTENSIONS,
            self.ORIGINAL_ALGORITHM,
        )
        if mode not in valid_modes:
            raise ValueError(
                "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
                "PorterStemmer.MARTIN_EXTENSIONS, or "
                "PorterStemmer.ORIGINAL_ALGORITHM"
            )

        self.mode = mode

        if self.mode == self.NLTK_EXTENSIONS:
            # This is a table of irregular forms. It is quite short,
            # but still reflects the errors actually drawn to Martin
            # Porter's attention over a 20 year period!
            irregular_forms = {
                "sky": ["sky", "skies"],
                "die": ["dying"],
                "lie": ["lying"],
                "tie": ["tying"],
                "news": ["news"],
                "inning": ["innings", "inning"],
                "outing": ["outings", "outing"],
                "canning": ["cannings", "canning"],
                "howe": ["howe"],
                "proceed": ["proceed"],
                "exceed": ["exceed"],
                "succeed": ["succeed"],
            }

            # Invert the table: map every surface form to its stem.
            self.pool = {
                form: root
                for root, forms in irregular_forms.items()
                for form in forms
            }

        self.vowels = frozenset('aeiou')
開發者ID:V1EngineeringInc,項目名稱:V1EngineeringInc-Docs,代碼行數:41,代碼來源:porter.py


注:本文中的nltk.stem.PorterStemmer方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。