

Python PorterStemmer.stem Method Code Examples

This article collects typical usage examples of the nltk.stem.PorterStemmer.stem method in Python. If you are wondering what PorterStemmer.stem does, how to call it, or what it looks like in real code, the selected examples below should help. You can also look further into other usage examples of the containing class, nltk.stem.PorterStemmer.


The following presents 15 code examples of the PorterStemmer.stem method, sorted by popularity by default.
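Before the examples, here is a minimal, self-contained sketch of the basic call pattern (the word list is purely illustrative):

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
for word in ["running", "flies", "cats", "easily"]:
    # stem() reduces a word to its Porter stem (lowercasing it by default),
    # e.g. "running" -> "run", "flies" -> "fli".
    print(word, "->", stemmer.stem(word))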

Example 1: new_lesk

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def new_lesk(context_sentence, ambiguous_word, pos=None, stem=True, hyperhypo=True):
    ps = PorterStemmer()
    max_overlaps = 0; lesk_sense = None
    context_sentence = context_sentence.split()
    for ss in wn.synsets(ambiguous_word):
        # If a POS is specified, skip synsets with a different POS
        # (compare strings with != rather than `is not`).
        if pos and ss.pos != pos:
            continue

        lesk_dictionary = []

        # Includes definition.
        lesk_dictionary+= ss.definition.split()
        # Includes lemma_names.
        lesk_dictionary+= ss.lemma_names

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo:
            lesk_dictionary+= list(chain(*[i.lemma_names for i in ss.hypernyms()+ss.hyponyms()]))

        if stem:  # Matching exact words causes sparsity, so let's match stems.
            lesk_dictionary = [ps.stem(i) for i in lesk_dictionary]
            context_sentence = [ps.stem(i) for i in context_sentence]

        overlaps = set(lesk_dictionary).intersection(context_sentence)

        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
    return lesk_sense
Developer ID: fieryfish, Project: wordSubstitutionTask, Lines of code: 32, Source: lesk.py
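Note that this snippet targets the older NLTK 2.x WordNet API, where definition, lemma_names, and pos were attributes of a Synset. Under NLTK 3.x they are methods, so an adapted sketch of the same algorithm might look like the following (an assumption about the intended behaviour, not the author's code):

from itertools import chain

from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer

def new_lesk_nltk3(context_sentence, ambiguous_word, pos=None, stem=True, hyperhypo=True):
    ps = PorterStemmer()
    max_overlaps, lesk_sense = 0, None
    context_tokens = context_sentence.split()
    for ss in wn.synsets(ambiguous_word):
        if pos and ss.pos() != pos:          # pos() is a method in NLTK 3.x
            continue
        # Signature = definition words + lemma names (methods in NLTK 3.x).
        lesk_dictionary = ss.definition().split() + ss.lemma_names()
        if hyperhypo:
            lesk_dictionary += list(chain(*[i.lemma_names() for i in ss.hypernyms() + ss.hyponyms()]))
        if stem:
            lesk_dictionary = [ps.stem(i) for i in lesk_dictionary]
            context_tokens = [ps.stem(i) for i in context_tokens]
        overlaps = set(lesk_dictionary).intersection(context_tokens)
        if len(overlaps) > max_overlaps:
            lesk_sense, max_overlaps = ss, len(overlaps)
    return lesk_sense

# Example call (requires the WordNet corpus to be downloaded):
# print(new_lesk_nltk3("I went to the bank to deposit money", "bank"))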

Example 2: stem

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def stem(string):
    """Stem a phrase"""
    global stemmer
    if not stemmer:
        stemmer = Stemmer()
    #words = string.split()
    #for i in range(len(words)):
    #    words[i] = self.stemmer.stem(words[i])
    # stemming last word only
    #string = self._reGlue(words)
    #
    #string2 = stemmer.stem(string)
    #if string2 not in stemdict:
    #    stemdict[string2] = string
    # FIX ME
    if string not in stemdict:
        if bad_unicode(string):
            ## added A. Meyers 8/28/15
            temp = stemmer.stem(remove_non_unicode(string))
        else:
            temp = stemmer.stem(string)
        if temp:
            stemdict[string] = temp
        if not temp:
            pass
        elif temp not in unstemdict:
            unstemdict[temp] = [string]
        elif string not in unstemdict[temp]:
            unstemdict[temp].append(string)
    else:
        temp = stemdict[string]
    return temp
Developer ID: AdamMeyers, Project: The_Termolator, Lines of code: 34, Source: Filter.py
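This function depends on module-level state and helpers (Stemmer, stemmer, stemdict, unstemdict, bad_unicode, remove_non_unicode) defined elsewhere in Filter.py. A minimal sketch of what that scaffolding could look like, offered only as an assumption so the function can be exercised:

from nltk.stem import PorterStemmer

# Hypothetical module-level scaffolding; the real Filter.py defines its own versions.
Stemmer = PorterStemmer   # stem() instantiates Stemmer() lazily
stemmer = None            # created on first call
stemdict = {}             # phrase -> stem cache
unstemdict = {}           # stem -> list of phrases that produced it

def bad_unicode(string):
    """Rough stand-in: flag strings that cannot be encoded as ASCII."""
    try:
        string.encode('ascii')
        return False
    except UnicodeError:
        return True

def remove_non_unicode(string):
    """Rough stand-in: drop characters outside the ASCII range."""
    return string.encode('ascii', 'ignore').decode('ascii')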

Example 3: tokenize2

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def tokenize2(str,df_freq):
    #temp map (for getting the local term frequency)
    temp_map={}
    #for a sentence
    str =str.decode('ascii', 'ignore')
    #tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    #tokens=tokenizer.tokenize(str)
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    #small set of stopwords (remove you, are, and, I those kinds of words)
    
    
    last =[]
    #bigram_list=[]
    for d in tokens:
        d = d.split('-')
        for c in d:
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            # regular expression -> strip punctuation
            if c != '':
                try:
                    if int(c):
                        if len(c) != 4 and (int(c) > 2015 or int(c) < 1900):  # keep years
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())

                last.append(c)
                updateDF(temp_map, df_freq, c)
Developer ID: wingsrc, Project: musicRecommendation_topicmodeling, Lines of code: 33, Source: topic_model.py
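updateDF is defined elsewhere in topic_model.py (it is also called in Example 5 below). Judging from its arguments, a per-document map plus a global document-frequency map, a plausible sketch of such a helper, offered only as a guess:

def updateDF(temp_map, df_freq, term):
    # Count each term at most once per document for its document frequency.
    if term not in temp_map:
        temp_map[term] = 1
        df_freq[term] = df_freq.get(term, 0) + 1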

Example 4: tokenizeTags

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def tokenizeTags(str,dict_items):
    #temp map (for getting the local term frequency)
    #for a sentence
    str =str.decode('ascii', 'ignore')
    #tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    #tokens=tokenizer.tokenize(str)
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    #small set of stopwords (remove you, are, and, I those kinds of words)
    last =[]
    #bigram_list=[]
    for d in tokens:
        d = d.split('-')
        for c in d:
                c=re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
                #regular expression -> strip punctuations
                if c!='' and c not in dict_items:
                    try:
                        if int(c):
                            if len(c) != 4 and (int(c) > 2015 or int(c) < 1900):  # keep years
                                c=stemmer.stem('NUM')
                    except Exception:
                        c = stemmer.stem(c.lower())
                        pass
                    #c = stemmer.stem(c.lower())
                    last.append(c)
                    #bigram generation
                #index= len(last)
                #if index>1:
                   # bigram = last[index-2]+' '+last[index-1]
                   # bigram_list.append(bigram)
    return last
Developer ID: wingsrc, Project: musicRecommendation_topicmodeling, Lines of code: 35, Source: topic_model.py

Example 5: tokenize2_bigram

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def tokenize2_bigram(str,df_freq):
    temp_map={}
    #for a sentence
    str =str.decode('ascii', 'ignore')
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    last =[]
    bigram_list=[]
    for d in tokens:
        d = d.split('-')
        for c in d:
                c=re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
                #regular expression -> strip punctuations
                if c!='':
                    try:
                        if int(c):
                            if len(c) != 4 and (int(c) > 2015 or int(c) < 1900):  # keep years
                                c=stemmer.stem('NUM')
                    except Exception:
                        c = stemmer.stem(c.lower())
                        pass
                    
                    #c = stemmer.stem(c.lower())
                    last.append(c)
                    
                    #bigram generation
                index = len(last)  # number of tokens collected so far
                if index > 1:
                    bigram = last[index-2] + ' ' + last[index-1]
                    bigram_list.append(bigram)
                    updateDF(temp_map, df_freq, bigram)
    return bigram_list
Developer ID: wingsrc, Project: musicRecommendation_topicmodeling, Lines of code: 36, Source: topic_model.py

Example 6: openfile

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def openfile(filename,output):
    print(filename)
    #starts run time
    start = timeit.default_timer()
    ps = PorterStemmer()
    file = open(filename,"r")
    tokens = []

    #Used for removing punctuation from the documents
    translate_table = dict((ord(char), None) for char in string.punctuation)

    start2 = timeit.default_timer()
    #splits the lines into words and removes the punctuation
    for line in file:
        tokens += word_tokenize(line.translate(translate_table)  )
    start3 = timeit.default_timer()
    print("tokenize")
    print(start3 - start2)
        
    #creates a set of stop words to be removed later
    stop_words = set(stopwords.words("english"))

    start6 = timeit.default_timer()
    #if a word is not a stop word it adds it to a list 
    filtered_sentence = []
    for w in tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    start7 = timeit.default_timer()
    print("stop word removal")
    print(start7 - start6)

    startw = timeit.default_timer()    
    #stems each word and adds it to the output file in csv form
    f = open(output,'w')
    iterFilSen = iter(filtered_sentence)
    if output == "documents.csv":
        for w in filtered_sentence:
            if w == "I":
                f.write("\n")
            f.write(ps.stem(w))
            f.write(",")
    else:
        for w in iterFilSen:
            if w == "I":
                f.write("\n")
                #removes the I number W
                next(iterFilSen)
                next(iterFilSen)
            else:
                f.write(ps.stem(w))
                f.write(",")
            
        
    #ends run time
    stop = timeit.default_timer()
    print("writing")
    print(stop - startw)
    print("total: "+output)
    print(stop - start)
Developer ID: SomeUserName-ForMe, Project: InvertedIndex, Lines of code: 62, Source: stemmer.py
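For reference, these are the imports the snippet assumes, followed by a hypothetical call (the file names are illustrative):

import string
import timeit

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

openfile("documents.txt", "documents.csv")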

Example 7: StemmedBagOfWordsFeatureGenerator

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
class StemmedBagOfWordsFeatureGenerator(EdgeFeatureGenerator):
    """
    Generates stemmed Bag of Words representation for each sentence that contains
    an edge, using the function given in the argument.

    By default it uses Porter stemmer

    :type feature_set: nala.structures.data.FeatureDictionary
    :type stemmer: nltk.stem.PorterStemmer
    :type stop_words: list[str]
    :type training_mode: bool
    """

    def __init__(self, feature_set, stop_words=[], training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""
        self.stop_words = stop_words
        """a list of stop words"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if self.training_mode:
                for token in sentence:
                    if self.stemmer.stem(
                            token.word
                    ) not in self.stop_words and not token.features['is_punct']:
                        feature_name = '4_bow_stem_' + self.stemmer.stem(
                            token.word) + '_[0]'
                        self.add_to_feature_set(edge, feature_name)
Developer ID: Rostlab, Project: relna, Lines of code: 36, Source: sentence.py

Example 8: IntermediateTokensFeatureGenerator

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
class IntermediateTokensFeatureGenerator(EdgeFeatureGenerator):
    """
    Generate the bag of words representation, masked text, stemmed text and
    parts of speech tag for each of the tokens present between two entities in
    an edge.

    :param feature_set: the feature set for the dataset
    :type feature_set: nala.structures.data.FeatureDictionary
    :param training_mode: indicates whether the mode is training or testing
    :type training_mode: bool
    """
    def __init__(self, feature_set, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of PorterStemmer"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if edge.entity1.head_token.features['id'] < edge.entity2.head_token.features['id']:
                first = edge.entity1.head_token.features['id']
                second = edge.entity2.head_token.features['id']
                for i in range(first+1, second):
                    token = sentence[i]
                    feature_name = '33_fwd_bow_intermediate_'+token.word+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '34_fwd_bow_intermediate_masked_'+token.masked_text(edge.part)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '35_fwd_stem_intermediate_'+self.stemmer.stem(token.word)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '36_fwd_pos_intermediate_'+token.features['pos']+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
            else:
                first = edge.entity2.head_token.features['id']
                second = edge.entity1.head_token.features['id']
                for i in range(first+1, second):
                    token = sentence[i]
                    feature_name = '37_bkd_bow_intermediate_'+token.word+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '38_bkd_bow_intermediate_masked_'+token.masked_text(edge.part)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '39_bkd_stem_intermediate_'+self.stemmer.stem(token.word)+'_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '40_bkd_pos_intermediate_'+token.features['pos']+'_[0]'
                    self.add_to_feature_set(edge, feature_name)

            for i in range(first+1, second):
                token = sentence[i]
                feature_name = '41_bow_intermediate_'+token.word+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '42_bow_intermediate_masked_'+token.masked_text(edge.part)+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '43_stem_intermediate_'+self.stemmer.stem(token.word)+'_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '44_pos_intermediate_'+token.features['pos']+'_[0]'
                self.add_to_feature_set(edge, feature_name)
Developer ID: Rostlab, Project: relna, Lines of code: 61, Source: context.py

Example 9: EntityHeadTokenFeatureGenerator

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
class EntityHeadTokenFeatureGenerator(EdgeFeatureGenerator):
    """
    Calculate the head token for each entity, using a simple heuristic - the
    distance to the root of the sentence.

    If the entity has just one token, then that forms the head token.
    If the entity has multiple tokens, then the token which is closest to the
    root of the sentence forms the entity head.

    :param feature_set: the feature set for the dataset
    :type feature_set: nala.structures.data.FeatureDictionary
    :param training_mode: whether the mode is training or testing, default True
    :type training_mode: bool
    """
    def __init__(self, feature_set, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""

    def generate(self, dataset):
        for edge in dataset.edges():
            entity1 = edge.entity1
            entity2 = edge.entity2

            self.named_entity_count('entity1_', entity1.class_id, edge)
            self.named_entity_count('entity2_', entity2.class_id, edge)

            entity1_stem = self.stemmer.stem(entity1.head_token.word)
            entity1_non_stem = entity1.head_token.word[len(entity1_stem):]
            entity2_stem = self.stemmer.stem(entity2.head_token.word)
            entity2_non_stem = entity2.head_token.word[len(entity2_stem):]

            feature_name_1_1 = '7_entity1_txt_' + entity1.head_token.word + '_[0]'
            feature_name_2_1 = '7_entity2_txt_' + entity2.head_token.word + '_[0]'
            feature_name_1_2 = '8_entity1_pos_' + entity1.head_token.features['pos'] + '_[0]'
            feature_name_2_2 = '8_entity2_pos_' + entity2.head_token.features['pos'] + '_[0]'
            feature_name_1_3 = '9_entity1_stem_' + entity1_stem + '_[0]'
            feature_name_2_3 = '9_entity2_stem_' + entity2_stem + '_[0]'
            feature_name_1_4 = '10_entity1_nonstem_' + entity1_non_stem + '_[0]'
            feature_name_2_4 = '10_entity2_nonstem_' + entity2_non_stem + '_[0]'

            self.add_to_feature_set(edge, feature_name_1_1)
            self.add_to_feature_set(edge, feature_name_2_1)
            self.add_to_feature_set(edge, feature_name_1_2)
            self.add_to_feature_set(edge, feature_name_2_2)
            self.add_to_feature_set(edge, feature_name_1_3)
            self.add_to_feature_set(edge, feature_name_2_3)
            self.add_to_feature_set(edge, feature_name_1_4)
            self.add_to_feature_set(edge, feature_name_2_4)

    def named_entity_count(self, prefix, entity_type, edge):
        entities = edge.part.get_entities_in_sentence(edge.sentence_id, entity_type)
        feature_name = '1_'+prefix+entity_type+'_count_['+str(len(entities))+']'
        self.add_to_feature_set(edge, feature_name)
Developer ID: Rostlab, Project: relna, Lines of code: 59, Source: entityhead.py

Example 10: Indexer

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
class Indexer():
    def __init__(self, rem_punc=True, rem_stop=True):
        self.rem_punc = rem_punc
        self.rem_stop = rem_stop
        self.stoplist = stopwords.words('english')
        self.punctuation = list(string.punctuation)
        self.token_dict = dict()
        self.pst = PorterStemmer()
        self.postings_list = dict()

    def get_pages(self):
        with open('./data/ucl', 'r') as ifile:
            contents = ifile.read()
            for page in contents.split('visited:'):
                self.parse_page(page)

    def parse_page(self, page):
        page = unicode(page, errors='ignore')
        lines = page.strip().split()
        if len(lines) > 2:
            title = lines[1]
            # tokenize and make lowercase
            tokens = [word.lower() for word in word_tokenize(str(lines[2:]))]
            # remove punctuation
            if self.rem_punc:
                tokens = [word for word in tokens if word not in self.punctuation]
            # remove stopwords
            if self.rem_stop:
                tokens = [word for word in tokens if word not in self.stoplist]
            # stem (Porter stemmer)
            tokens = [self.pst.stem(word) for word in tokens]
            # add to dictionary
            self.add_to_token_dict(title, tokens[3:])

    def add_to_token_dict(self, title, tokens):
        if tokens:
            words = dict()
            for token in tokens[1:]:
                key = self.pst.stem(token.lower())
                if key in self.token_dict:
                    self.token_dict[key] += 1
                else:
                    self.token_dict[key] = 1
                if key in words:
                    words[key] += 1
                else:
                    words[key] = 1
            self.postings_list[title] = [(k, v) for k, v in words.iteritems()]
Developer ID: kellino, Project: UCrawL, Lines of code: 50, Source: indexer.py
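The Indexer snippet is Python 2 code (it uses unicode() and dict.iteritems()) and assumes the following imports; a hypothetical run might look like this:

import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

indexer = Indexer(rem_punc=True, rem_stop=True)
indexer.get_pages()   # expects crawled pages in ./data/ucl, separated by 'visited:'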

Example 11: testing

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def testing():
    # - tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))

    # - stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if not w in stop_words]
    print(filtered_sent)

    # - stemming
    ps = PorterStemmer()
    example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pothonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))
Developer ID: gbartusk, Project: coursera_data_science_capstone, Lines of code: 30, Source: capstone.py

Example 12: Stemmer

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
class Stemmer(SentenceProcesser):
    def __init__(self):
        self.stemmer=PorterStemmer()
    def process(self, sentence):
        for word in sentence.words:
            word.stem=self.stemmer.stem(word.content)
        return sentence
Developer ID: ziqiangyeah, Project: pysumm, Lines of code: 9, Source: sentence_analyser.py

Example 13: parseTranscript

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def parseTranscript(transcript):

    assert isinstance(transcript, Transcript), \
        "transcript must be stored in custom namedtuple, not {}".format(type(transcript))

    text = transcript.prepared.append(transcript.QandA)
    id = "{ticker}-{year}-{month}-{day}".format(ticker=transcript.ticker.split(':')[-1],
                                                year=transcript.date.year,
                                                month=transcript.date.month,
                                                day=transcript.date.day)

    tokenizer = wordpunct_tokenize
    stemmer = PorterStemmer()
    index = dict()
    pos = 0

    for row in text:

        for i, token in enumerate(tokenizer(row.lower())):
            token = stemmer.stem(token)
            if token not in index and '|' not in token:
                index[token] = [id, [str(pos + i)]]
            elif '|' not in token:
                index[token][-1].append(str(pos + i))

        try:
            pos += (i + 1)
        except:
            pass

    return index
Developer ID: trevorlindsay, Project: earnings-calls, Lines of code: 33, Source: build_index.py
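Transcript is a namedtuple defined in the surrounding project. Based on the attributes used here (ticker, date, prepared, QandA), a hypothetical construction for trying the parser out might look like the following; the field layout and types are guesses, and prepared/QandA need an append() that returns the combined text (for example, a pandas Series under pandas < 2.0):

import datetime
from collections import namedtuple

import pandas as pd

# Hypothetical field layout; parseTranscript's isinstance check expects
# the project's own Transcript class, so this only sketches the shape.
Transcript = namedtuple('Transcript', ['ticker', 'date', 'prepared', 'QandA'])

transcript = Transcript(
    ticker='NASDAQ:AAPL',
    date=datetime.date(2016, 4, 26),
    prepared=pd.Series(["Good afternoon, and welcome to the earnings call."]),
    QandA=pd.Series(["Our first question comes from the line of an analyst."]),
)

index = parseTranscript(transcript)   # maps each stem to [doc_id, [token positions]]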

Example 14: preprocessing

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def preprocessing(text, debug = False):
    if debug:
        print text

    # lower case
    text = text.lower()
    if debug:
        print text

    # can't -> cannot, bya's -> bya is
    text = replacers.RegexpReplacer().replace(text)
    if debug:
        print text

    # word tokenize
    words = word_tokenize(text)
    if debug:
        print words

    # removing stopwords
    english_stops = set(stopwords.words('english'))
    english_stops_added = english_stops | {'.', ',', ':', ';'}
    words = [word for word in words if word not in english_stops_added]
    if debug:
        print words

    # stemming words
    stemmer = PorterStemmer()
    words_stemmed = list(map(lambda word: stemmer.stem(word), words))
    if debug:
        print words_stemmed

    return words, words_stemmed
Developer ID: Muugii-bs, Project: hommie, Lines of code: 35, Source: utils.py
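replacers.RegexpReplacer is not part of NLTK itself; it follows the contraction-expanding helper popularised by the NLTK Cookbook. A minimal sketch of such a class is shown below (the pattern list is illustrative, not necessarily the project's exact set); note also that the snippet above uses Python 2 print statements:

import re

# Illustrative contraction-expansion patterns (an assumption, not the project's list).
replacement_patterns = [
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
]

class RegexpReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        # Apply each pattern in order, e.g. "can't" -> "cannot".
        for pattern, repl in self.patterns:
            text = pattern.sub(repl, text)
        return text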

Example 15: parseReviews

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def parseReviews(mypath):
  filelist = os.listdir(mypath) 
  wordDict = {}
  negationList = ["no","not","never","can't","won't","cannot","didn't","couldn't"]
  negationFlag = False
  stopwordList = set(stopwords.words("english"))
  stemmer = PorterStemmer()
  for file in filelist:
    with open(mypath + "/" + file,"r") as f:
      word_list = word_tokenize(f.read())
    for word in word_list:
      if word in negationList:
        #double negative
        if negationFlag:
          negationFlag = False
        else:
          negationFlag = True
        continue
      if not word.isalnum():
        negationFlag = False
      if word.isalnum() and word not in stopwordList:
        word = stemmer.stem(word)
        if negationFlag:
          word = "!" + word
          negationFlag = False
        if word not in wordDict:
          wordDict[word] = 1
        else:
          wordDict[word] += 1
  return wordDict
Developer ID: sagardmni, Project: sentiment_classification, Lines of code: 32, Source: train.py
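For reference, the imports this snippet assumes, plus a hypothetical call on a directory of review files:

import os

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

word_counts = parseReviews("./reviews/pos")   # the path is illustrative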


Note: The nltk.stem.PorterStemmer.stem method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers; copyright in the source code remains with the original authors, and distribution or use should follow the corresponding project's license. Do not reproduce without permission.