

Python PorterStemmer.stem_word Method Code Examples

This article collects typical usage examples of the Python method nltk.stem.PorterStemmer.stem_word. If you have been wondering how PorterStemmer.stem_word is used, what it does, or what real-world calls to it look like, the hand-picked code examples below should help. You can also browse further usage examples of nltk.stem.PorterStemmer, the class this method belongs to.


Nine code examples of the PorterStemmer.stem_word method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
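
Before turning to the examples, here is a minimal sketch of the method itself (an illustrative snippet, not drawn from the projects below). stem_word comes from older NLTK releases; current releases expose the equivalent PorterStemmer.stem instead, so substitute accordingly if stem_word raises an AttributeError:

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem_word("running"))   # 'run' (older NLTK releases)
print(stemmer.stem("running"))        # 'run' (current NLTK equivalent)
print(stemmer.stem("quickly"))        # 'quickli' -- stems need not be dictionary words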

Example 1: StemmerTokenizer

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

class StemmerTokenizer:
    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, string):
        # lowercase, tokenize, then stem each token
        tokens = word_tokenize(string.lower())
        return [self.stemmer.stem_word(t) for t in tokens]
Author: ck37, Project: h2o-redditcomments, Lines: 9, Source: feature_library.py
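
Because StemmerTokenizer is callable and returns a list of stems, it can be dropped into a vectorizer's tokenizer hook. A usage sketch, not from the project above (assumes scikit-learn and the NLTK punkt data are installed; substitute stem for stem_word on current NLTK):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=StemmerTokenizer())
X = vectorizer.fit_transform(["Cats are running", "A cat ran"])
print(vectorizer.get_feature_names_out())  # stemmed vocabulary such as 'cat', 'run'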

Example 2: preprocess

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def preprocess(text):
    stemmer = PorterStemmer()
    stop = stopwords.words("english")
    result = word_tokenize(text)
    # note: the filters test the raw token, so capitalized stopwords
    # (e.g. "The") are not removed
    result = [stemmer.stem_word(word.lower()) for word in result if
              word not in stop and
              word not in string.punctuation and
              word not in string.digits]
    return result
Author: mmfrb, Project: pln-projeto, Lines: 11, Source: preprocessor.py
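
An illustrative call (the sentence is not from the project; assumes the NLTK punkt and stopwords data are downloaded). It also shows the casing caveat noted in the comment above:

print(preprocess("The cats are running quickly!"))
# ['the', 'cat', 'run', 'quickli'] -- 'are' is dropped as a stopword,
# but 'The' survives because the filter sees the token before lowercasing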

Example 3: dialogue_act_features

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
import nltk
from collections import defaultdict
from nltk.stem import PorterStemmer

def dialogue_act_features(post):
    words = nltk.word_tokenize(post)
    sentences = nltk.sent_tokenize(post)
    features = {
        'word_diversity': len(words) / len(set(words)),
    }

    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem_word(w) for w in words]

    # words
    for word in set(stemmed_words):
        features['contains(%s)' % word.lower()] = True

    # check for presence/absence of specific words
    check_words = [
        'who', 'what', 'where', 'why', 'how',   # question words
        'love', 'hate', 'despis',               # emotional words (?)
        ]

    for word in check_words:
        features['contains(%s)' % word] = word in stemmed_words

    # punctuation
    for punctuation in ['?', '!', '!!', '?!', '"', '...', '.']:
        features['punctuation_count(%s)' % punctuation] = post.count(punctuation)

    # skip parts of speech for now - slow, not helping much
    # (this early return intentionally disables the POS code below)
    return features

    # get counts for parts of speech
    pos_count = defaultdict(int)
    for sentence in sentences:
        # tokenize the sentence into words and tag parts of speech
        sentence_words = nltk.word_tokenize(sentence)
        # - using the nltk parts-of-speech tagger for now
        #   (other options may be faster/more accurate)
        pos_sentence = nltk.pos_tag(sentence_words)
        for word, pos in pos_sentence:
            pos_count['pos_%s' % pos] += 1

    # include final counts by part of speech in the features
    features.update(pos_count)

    return features
Author: emory-libraries, Project: Twap, Lines: 47, Source: twap_dialog_type.py
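
An illustrative call (the sentences are not from the project), showing how presence flags and punctuation counts land in the feature dictionary:

feats = dialogue_act_features("I love Python. But why do you love it?")
print(feats['contains(love)'])        # True
print(feats['contains(why)'])         # True
print(feats['punctuation_count(?)'])  # 1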

Example 4: __init__

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

class PreProcess:
    def __init__(self):
        self.tokenizer = word_tokenize
        self.stemmer = PorterStemmer()
        self.punct = string.punctuation
        self.digits = string.digits
        self.stop = stopwords.words("english")

    def process_sent(self, snt):
        snt = self.tokenizer(snt)
        # filter on the raw token, then lowercase and stem what survives
        snt = [self.stemmer.stem_word(wrd.lower()) for wrd in snt if
               wrd not in self.stop and
               wrd not in self.digits and
               wrd not in self.punct]
        return snt

    def process(self, snts):
        return [self.process_sent(snt) for snt in snts]
Author: mmfrb, Project: classificador-mineracao, Lines: 20, Source: preprocess.py
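
A short usage sketch (the sentences are illustrative; assumes the NLTK punkt and stopwords data):

pp = PreProcess()
print(pp.process(["The cats are running quickly!", "Dogs bark loudly."]))
# [['the', 'cat', 'run', 'quickli'], ['dog', 'bark', 'loudli']]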

Example 5: stem

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
    def stem(self, input_text):
        # split on runs of whitespace (gaps=True makes the pattern a separator)
        tokenizer = RegexpTokenizer(r'\s+', gaps=True)
        stemmed_text = []
        stemmer = PorterStemmer()
        text = tokenizer.tokenize(str(input_text))
        filtered_text = self.stopword(text)
        for word in filtered_text:
            if word.isalpha():
                # only stem longer words; short ones pass through unchanged
                if len(word) > 4:
                    stemmed_text.append(stemmer.stem_word(word).lower())
                else:
                    stemmed_text.append(word.lower())
        # drop very short tokens; building a new list avoids the bug of
        # removing items from a list while iterating over it
        stemmed_text = [word for word in stemmed_text if len(word) >= 3]
        return stemmed_text
Author: meghanathmacha, Project: Ad-Safe, Lines: 21, Source: contentanalyzer.py

Example 6: ambiguous

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
""" General approach: serach for control-type structures which may be ambigious (with raising)
then search for those verbs to see if they exist in "There[ex] VERB" contexts
e.g we find "John seems to be beside himself today"
    so we search for "/[tT]here/ . (/VB/ < /^(seem)/)"
    if this returns any results, "seem" must be a raising verb
"""

from pdb import set_trace
import runTregex as tx
from nltk.stem import PorterStemmer
ps = PorterStemmer()
treebank_dir = "/home/chase/CompLing/stanford-tregex-2012-03-09/treebank_3/parsed/mrg/wsj/"

unfiltered = set()
# `trees` is assumed to hold the matches from an earlier Treebank query
for t in trees:
    unfiltered.add(ps.stem_word(t.matchTree.leaves()[0]).lower())

# this takes forever and isn't really too effective...
for word in unfiltered:
    pat = "(/[Tt]here/ > EX) . /^%s/" % word
    reload(tx)
    trees = tx.Treebank(treebank_dir, pat)
    trees.run()
    if len(trees) > 0:
        print word

Author: tccorcoran, Project: Ling144Homework, Lines: 27, Source: getControlVerbs.py

Example 7: Counter

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
# Fragment of an XML-processing mapper: `inPage`, `iD`, `pattern`, `stem`
# (a PorterStemmer instance) and `list` are defined earlier in the script.
        True_iD = True
        continue
    if inPage and line.find("<text") != -1:
        inText = True
        continue
    if inPage and True_iD and line.find("<id>") != -1:
        iD.append(line[len("<id>") : -len("</id>")])
        True_iD = False

    if inPage and line.find("/text") != -1:
        inText = False
        # `list` accumulates the page's text lines elsewhere in the script
        # (note that the name shadows the built-in)
        text = ' '.join(list)
        # Tokenizing text for each XML page
        temp = text.decode("utf-8", "ignore")
        temp = temp.replace(u'\ufeff', ' ')
        temp_1 = re.sub(pattern, " ", temp)
        temp_1 = temp_1.lower()
        res = []
        for x in temp_1.split():
            if x not in stopwords.words('english'):
                res.append(x)
        clean_text = " ".join(stem.stem_word(word) for word in res)
        tokens = nltk.word_tokenize(clean_text)
        cnt = Counter(tokens)
        print("[[%s]]\t[[%.0f]]" % (dict(cnt), int(iD[0])))
        list = []
        continue


Author: wangweinan, Project: HW3_CS294, Lines: 29, Source: mapper_3.py

Example 8: set

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
import nltk
from nltk.corpus import state_union, stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

# (example_sentence is defined earlier in the original script;
#  any sample sentence works here)
example_sentence = "This is an example sentence for stop word filtering."

stop_words = set(stopwords.words("english"))

words = word_tokenize(example_sentence)

filter_sentence = [w for w in words if w not in stop_words]

print(filter_sentence)

##### STEMMER EXAMPLE #####

ps = PorterStemmer()

example_words = ["pythone", "pythoner", "pythoning", "pythoned", "pythonly"]

for w in example_words:
    print(ps.stem_word(w))

##### SENTENCE TOKENIZER EXAMPLE #####

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_some_tokenizer = PunktSentenceTokenizer(train_text)

# PunktSentenceTokenizer instances are not callable; use .tokenize()
tokenized = custom_some_tokenizer.tokenize(sample_text)


def proce_content():
    try:
        for w in tokenized:
            words = nltk.word_tokenize(w)
Author: Daniel194, Project: Machine-Learning, Lines: 33, Source: nlp.py

Example 9: __init__

# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
import csv
import json
import sys

from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier


class TitleSim:
    def __init__(self, features_conf, features_deleted):
        print 'Start initialization'

        # initial model training
        features = features_deleted + features_conf
        target = [0 for x in range(len(features_deleted))] +\
                [1 for x in range(len(features_conf))]
        self.classifier = RandomForestClassifier(n_estimators=50,
                                            verbose=2,
                                            n_jobs=1,
                                            min_samples_split=10,
                                            random_state=1)
        self.classifier.fit(features, target)

        # loading relational data which will be used
        paths = json.loads(open("SETTINGS.json").read())
        paper_doc = paths["paper_doc"]
        self.paper = dict([(entry[0], entry[1]) for entry in csv.reader(open(paper_doc))])

        # loading setting file
        self.paths = json.loads(open("SETTINGS.json").read())

        # loading word map of titles
        self.wordmap = self.load_titlemap()

        # do other initializations
        self.stemmer = PorterStemmer()
        print 'End initialization'


    def label_predict(self, fea_dict):
        # fea_dict is a dictionary whose key is 'user id'
        prob_dict = {}
        for key in fea_dict:
            features = [feature[1:] for feature in fea_dict[key]]
            predictions = self.classifier.predict_proba(features)[:,1]
            prob_dict[key] = [(item[0], prob) for item, prob in zip(fea_dict[key], predictions)]
        return prob_dict


    def load_titlemap(self):
        return dict([(entry[0],entry[1]) for entry in \
                csv.reader(open(self.paths["title_wordmap"]))])

    def calsim(self, author_doc, pairs):
        # calculate the similarity between titles
        title_features = []
        for pair in pairs:
            if pair[0] not in author_doc:
                print 'Key error.'
                sys.exit(1)
            title_features.append(self.calpairsim(author_doc[pair[0]], pair[1]))

        return title_features

    def calpairsim(self, doclist, target_doc):
        author_words = {}
        for doc in doclist:
            words = self.paper[doc].lower().split(' ')
            for word in words:
                stemmed_word = self.stemmer.stem_word(word)
                if stemmed_word in self.wordmap:
                    if stemmed_word in author_words:
                        author_words[stemmed_word] += 1
                    else:
                        author_words[stemmed_word] = 1

        doc_words = {}
        words = self.paper[target_doc].lower().split(' ')
        for word in words:
            stemmed_word = self.stemmer.stem_word(word)
            if stemmed_word in self.wordmap:
                if stemmed_word in doc_words:
                    doc_words[stemmed_word] += 1
                else:
                    doc_words[stemmed_word] = 1

        # number of common words
        comm_num = len(set(author_words.keys()) & set(doc_words.keys()))

        # overlap coefficient (the original calls this "pearson", but it is
        # just the common-word count divided by the summed vocabulary sizes)
        if (len(set(author_words.keys())) + len(set(doc_words.keys()))) != 0:
            pearson = comm_num * 1.0 / (len(set(author_words.keys())) + len(set(doc_words.keys())))
        else:
            pearson = 0.0

        return [comm_num, pearson]
Author: anthonylife, Project: kddcup2013, Lines: 90, Source: mkTitleSim.py


Note: the nltk.stem.PorterStemmer.stem_word examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers, and the copyright of the source code belongs to its original authors. For distribution and use, please refer to the license of the corresponding project; do not reproduce without permission.