

Python tokenize.wordpunct_tokenize Method Code Examples

This article collects typical usage examples of the Python method nltk.tokenize.wordpunct_tokenize. If you are wondering what exactly tokenize.wordpunct_tokenize does, how to call it, or what example code looks like, the curated examples below should help. You can also explore further usage examples from the nltk.tokenize module it belongs to.


The following shows 15 code examples of the tokenize.wordpunct_tokenize method, ordered by popularity.
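Before looking at the project examples, here is a minimal standalone sketch (not taken from any of the projects below) showing what wordpunct_tokenize actually does: it splits text into alphabetic and non-alphabetic character runs using the regular expression \w+|[^\w\s]+, so punctuation becomes separate tokens.

from nltk.tokenize import wordpunct_tokenize

# Punctuation is split off from words, so "$3.88" becomes four tokens.
text = "Good muffins cost $3.88 in New York. Please buy me two of them."
print(wordpunct_tokenize(text))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
#  'Please', 'buy', 'me', 'two', 'of', 'them', '.']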

Example 1: generate_vocabulary

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def generate_vocabulary(self, review_summary_file):
        """

        :param review_summary_file:
        :return:
        """
        self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values

        for review,summary in self.rev_sum_pair:
            rev_lst = wordpunct_tokenize(review)
            sum_lst = wordpunct_tokenize(summary)
            self.__add_list_to_dict(rev_lst)
            self.__add_list_to_dict(sum_lst)

        # Store the empty string "" as the last word of the vocabulary,
        # using the same index in both the forward and reverse maps.
        empty_idx = len(self.map)
        self.map[""] = empty_idx
        self.revmap[empty_idx] = "" 
Developer ID: harpribot, Project: deep-summarization, Source file: data2tensor.py

Example 2: text2idx2

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def text2idx2(texts, vocab, dim):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''
    out = -np.ones((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        j = 0
        for word in wordpunct_tokenize(text):
            if word in vocab:
                out[i,j] = vocab[word]
                mask[i,j] = 1.
                j += 1

                if j == dim:
                    break

    return out, mask 
Developer ID: nyu-dl, Project: dl4ir-webnav, Source file: utils.py
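As a hedged illustration only (the toy vocabulary and texts below are invented and not part of the dl4ir-webnav project), text2idx2 from the excerpt above could be exercised like this:

import numpy as np
from nltk.tokenize import wordpunct_tokenize

# Toy vocabulary and inputs, for illustration only.
vocab = {"hello": 0, "world": 1, "!": 2}
texts = ["hello world!", "hello hello"]

# Assumes text2idx2 from the excerpt above is already defined or imported.
out, mask = text2idx2(texts, vocab, dim=4)
# out  -> [[0, 1, 2, -1], [0, 0, -1, -1]]   (-1 marks unused positions)
# mask -> [[1., 1., 1., 0.], [1., 1., 0., 0.]]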

Example 3: augment

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def augment(texts, dic_thes):
    if prm.aug<2:
        return texts

    out = []
    for text in texts:

        words_orig = wordpunct_tokenize(text)
        maxrep = max(2, int(0.1 * len(words_orig)))  # maximum number of words to replace: for now, 10% of the words (at least 2)
        
        for j in range(prm.aug):
            words = list(words_orig) #copy
            for k in range(randint(1,maxrep)):
                idx = randint(0,len(words)-1)
                word = words[idx]
                if word in dic_thes:
                    
                    # Choose the synonym index from a geometric distribution.
                    synonym = min(np.random.geometric(0.5), len(dic_thes[word]) - 1)
                    words[idx] = dic_thes[word][synonym]

            out.append(" ".join(words))

    return out 
Developer ID: nyu-dl, Project: dl4ir-webnav, Source file: utils.py

Example 4: tiny_tokenize

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    # Strip punctuation before tokenizing; .decode() assumes the input is a byte string.
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ',
            text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #                     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #                     not token.isdigit() and not token in stop_words] 
Developer ID: hugochan, Project: KATE, Source file: preprocessing.py

Example 5: parseDocument

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def parseDocument(doc, vocab):
	wordslist = list()
	countslist = list()
	doc = doc.lower()
	tokens = wordpunct_tokenize(doc)

	dictionary = dict()
	for word in tokens:
		if word in vocab:
			wordtk = vocab[word]
			if wordtk not in dictionary:
				dictionary[wordtk] = 1
			else:
				dictionary[wordtk] += 1

	wordslist.append(dictionary.keys())
	countslist.append(dictionary.values())
	return (wordslist[0], countslist[0]) 
Developer ID: qlai, Project: stochasticLDA, Source file: utils.py
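For illustration only (the vocabulary and document below are invented, not part of the stochasticLDA project), parseDocument turns a raw document into parallel collections of vocabulary ids and counts:

# Assumes parseDocument from the excerpt above is already defined or imported.
vocab = {"the": 0, "cat": 1, "sat": 2}
ids, counts = parseDocument("The cat sat. The cat!", vocab)
# ids    -> vocabulary ids found in the document (0, 1, 2 here)
# counts -> the matching occurrence counts       (2, 2, 1 here)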

Example 6: __generate_tensor

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def __generate_tensor(self, is_review, reverse=False):
        """

        :param is_review:
        :param reverse:
        :return:
        """
        seq_length = self.review_max_words if is_review else self.summary_max_words
        total_rev_summary_pairs = self.rev_sum_pair.shape[0]
        data_tensor = np.zeros([total_rev_summary_pairs,seq_length])

        sample = self.rev_sum_pair[0::, 0] if is_review else self.rev_sum_pair[0::, 1]

        for index, entry in enumerate(sample.tolist()):
            index_lst = np.array([self.map[word.lower()] for word in wordpunct_tokenize(entry)])
            # reverse if want to get backward form
            if reverse:
                index_lst = index_lst[::-1]
            # Pad the list
            if len(index_lst) <= seq_length:
                index_lst = np.lib.pad(index_lst, (0,seq_length - index_lst.size), 'constant', constant_values=(0, 0))
            else:
                index_lst = index_lst[0:seq_length]

            data_tensor[index] = index_lst

        return data_tensor 
Developer ID: harpribot, Project: deep-summarization, Source file: data2tensor.py

Example 7: read_wordpunct_block

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks 
Developer ID: rafasashi, Project: razzy-spinner, Source file: util.py

Example 8: __call__

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def __call__(self, sentence):
        return wordpunct_tokenize(sentence) 
Developer ID: hobincar, Project: RecNet, Source file: transform.py

Example 9: BOW2

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i,:len(bow[0])] = bow[0]
        mask[i,:len(bow[1])] = bow[1]

    return out, mask 
Developer ID: nyu-dl, Project: dl4ir-webnav, Source file: utils.py

Example 10: Word2Vec_encode

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def Word2Vec_encode(texts, wemb):
    
    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i,:] += wemb[word]
                n += 1.
        out[i,:] /= max(1.,n)

    return out 
Developer ID: nyu-dl, Project: dl4ir-webnav, Source file: utils.py

Example 11: tag_unigram

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_unigram(self, untagged_string: str):
        """Tag POS with unigram tagger.
        :type untagged_string: str
        :param untagged_string: An untagged, untokenized string of text.
        :return: the tagged text as a list of (token, tag) tuples
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['unigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Developer ID: cltk, Project: cltk, Source file: pos.py

Example 12: tag_bigram

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_bigram(self, untagged_string: str):
        """Tag POS with bigram tagger.
        :type untagged_string: str
        :param untagged_string: An untagged, untokenized string of text.
        :return: the tagged text as a list of (token, tag) tuples
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['bigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Developer ID: cltk, Project: cltk, Source file: pos.py

Example 13: tag_trigram

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_trigram(self, untagged_string: str):
        """Tag POS with trigram tagger.
        :type untagged_string: str
        :param untagged_string: An untagged, untokenized string of text.
        :return: the tagged text as a list of (token, tag) tuples
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['trigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Developer ID: cltk, Project: cltk, Source file: pos.py

Example 14: tag_ngram_123_backoff

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_ngram_123_backoff(self, untagged_string: str):
        """Tag POS with 1-, 2-, 3-gram tagger.
        :type untagged_string: str
        :param untagged_string: An untagged, untokenized string of text.
        :return: the tagged text as a list of (token, tag) tuples
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['ngram_123_backoff']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Developer ID: cltk, Project: cltk, Source file: pos.py

Example 15: tag_ngram_12_backoff

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_ngram_12_backoff(self, untagged_string: str):
        """Tag POS with 1-, 2-gram tagger.
        :type untagged_string: str
        :param untagged_string: An untagged, untokenized string of text.
        :return: the tagged text as a list of (token, tag) tuples
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['ngram_12_backoff']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Developer ID: cltk, Project: cltk, Source file: pos.py


Note: the nltk.tokenize.wordpunct_tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code belongs to the original authors. For distribution and use, please refer to the license of the corresponding project. Do not reproduce without permission.