

Python tokenize.wordpunct_tokenize Method Code Examples

This article collects typical usage examples of the Python method nltk.tokenize.wordpunct_tokenize. If you are wondering what exactly tokenize.wordpunct_tokenize does, how to call it, or what example code looks like, the curated examples below should help. You can also explore further usage examples from the nltk.tokenize module it belongs to.


The following shows 15 code examples of the tokenize.wordpunct_tokenize method, ordered by popularity.
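Before looking at the project examples, here is a minimal standalone sketch (not taken from any of the projects below) showing what wordpunct_tokenize actually does: it splits text into alphabetic and non-alphabetic character runs using the regular expression \w+|[^\w\s]+, so punctuation becomes separate tokens.

from nltk.tokenize import wordpunct_tokenize

# Punctuation is split off from words, so "$3.88" becomes four tokens.
text = "Good muffins cost $3.88 in New York. Please buy me two of them."
print(wordpunct_tokenize(text))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
#  'Please', 'buy', 'me', 'two', 'of', 'them', '.']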

Example 1: generate_vocabulary

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def generate_vocabulary(self, review_summary_file):
        """

        :param review_summary_file:
        :return:
        """
        self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values

        for review,summary in self.rev_sum_pair:
            rev_lst = wordpunct_tokenize(review)
            sum_lst = wordpunct_tokenize(summary)
            self.__add_list_to_dict(rev_lst)
            self.__add_list_to_dict(sum_lst)

        # Store the empty string "" as the last word of the vocabulary,
        # using the same index in both the forward and reverse maps.
        empty_idx = len(self.map)
        self.map[""] = empty_idx
        self.revmap[empty_idx] = "" 
Developer ID: harpribot, Project: deep-summarization, Source file: data2tensor.py

Example 2: text2idx2

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def text2idx2(texts, vocab, dim):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''
    out = -np.ones((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        j = 0
        for word in wordpunct_tokenize(text):
            if word in vocab:
                out[i,j] = vocab[word]
                mask[i,j] = 1.
                j += 1

                if j == dim:
                    break

    return out, mask 
Developer ID: nyu-dl, Project: dl4ir-webnav, Source file: utils.py
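As a hedged illustration only (the toy vocabulary and texts below are invented and not part of the dl4ir-webnav project), text2idx2 from the excerpt above could be exercised like this:

import numpy as np
from nltk.tokenize import wordpunct_tokenize

# Toy vocabulary and inputs, for illustration only.
vocab = {"hello": 0, "world": 1, "!": 2}
texts = ["hello world!", "hello hello"]

# Assumes text2idx2 from the excerpt above is already defined or imported.
out, mask = text2idx2(texts, vocab, dim=4)
# out  -> [[0, 1, 2, -1], [0, 0, -1, -1]]   (-1 marks unused positions)
# mask -> [[1., 1., 1., 0.], [1., 1., 0., 0.]]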

Example 3: augment

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def augment(texts, dic_thes):
    if prm.aug<2:
        return texts

    out = []
    for text in texts:

        words_orig = wordpunct_tokenize(text)
        maxrep = max(2, int(0.1 * len(words_orig)))  # maximum number of words to replace: for now, 10% of the words (at least 2)
        
        for j in range(prm.aug):
            words = list(words_orig) #copy
            for k in range(randint(1,maxrep)):
                idx = randint(0,len(words)-1)
                word = words[idx]
                if word in dic_thes:
                    
                    # Choose the synonym index from a geometric distribution.
                    synonym = min(np.random.geometric(0.5), len(dic_thes[word]) - 1)
                    words[idx] = dic_thes[word][synonym]

            out.append(" ".join(words))

    return out 
Developer ID: nyu-dl, Project: dl4ir-webnav, Source file: utils.py

Example 4: tiny_tokenize

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    # Strip punctuation before tokenizing; .decode() assumes the input is a byte string.
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ',
            text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #                     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #                     not token.isdigit() and not token in stop_words] 
Developer ID: hugochan, Project: KATE, Source file: preprocessing.py

Example 5: parseDocument

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def parseDocument(doc, vocab):
	wordslist = list()
	countslist = list()
	doc = doc.lower()
	tokens = wordpunct_tokenize(doc)

	dictionary = dict()
	for word in tokens:
		if word in vocab:
			wordtk = vocab[word]
			if wordtk not in dictionary:
				dictionary[wordtk] = 1
			else:
				dictionary[wordtk] += 1

	wordslist.append(dictionary.keys())
	countslist.append(dictionary.values())
	return (wordslist[0], countslist[0]) 
Developer ID: qlai, Project: stochasticLDA, Source file: utils.py
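For illustration only (the vocabulary and document below are invented, not part of the stochasticLDA project), parseDocument turns a raw document into parallel collections of vocabulary ids and counts:

# Assumes parseDocument from the excerpt above is already defined or imported.
vocab = {"the": 0, "cat": 1, "sat": 2}
ids, counts = parseDocument("The cat sat. The cat!", vocab)
# ids    -> vocabulary ids found in the document (0, 1, 2 here)
# counts -> the matching occurrence counts       (2, 2, 1 here)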

Example 6: __generate_tensor

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def __generate_tensor(self, is_review, reverse=False):
        """

        :param is_review:
        :param reverse:
        :return:
        """
        seq_length = self.review_max_words if is_review else self.summary_max_words
        total_rev_summary_pairs = self.rev_sum_pair.shape[0]
        data_tensor = np.zeros([total_rev_summary_pairs,seq_length])

        sample = self.rev_sum_pair[0::, 0] if is_review else self.rev_sum_pair[0::, 1]

        for index, entry in enumerate(sample.tolist()):
            index_lst = np.array([self.map[word.lower()] for word in wordpunct_tokenize(entry)])
            # reverse if want to get backward form
            if reverse:
                index_lst = index_lst[::-1]
            # Pad the list
            if len(index_lst) <= seq_length:
                index_lst = np.lib.pad(index_lst, (0,seq_length - index_lst.size), 'constant', constant_values=(0, 0))
            else:
                index_lst = index_lst[0:seq_length]

            data_tensor[index] = index_lst

        return data_tensor 
Developer ID: harpribot, Project: deep-summarization, Source file: data2tensor.py

Example 7: read_wordpunct_block

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks 
Developer ID: rafasashi, Project: razzy-spinner, Source file: util.py

Example 8: __call__

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def __call__(self, sentence):
        return wordpunct_tokenize(sentence) 
Developer ID: hobincar, Project: RecNet, Source file: transform.py

Example 9: BOW2

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i,:len(bow[0])] = bow[0]
        mask[i,:len(bow[1])] = bow[1]

    return out, mask 
Developer ID: nyu-dl, Project: dl4ir-webnav, Source file: utils.py

Example 10: Word2Vec_encode

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def Word2Vec_encode(texts, wemb):
    
    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i,:] += wemb[word]
                n += 1.
        out[i,:] /= max(1.,n)

    return out 
Developer ID: nyu-dl, Project: dl4ir-webnav, Source file: utils.py

Example 11: tag_unigram

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_unigram(self, untagged_string: str):
        """Tag POS with unigram tagger.
        :type untagged_string: str
        :param untagged_string: An untagged, untokenized string of text.
        :return: the tagged text as a list of (token, tag) tuples
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['unigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Developer ID: cltk, Project: cltk, Source file: pos.py

Example 12: tag_bigram

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_bigram(self, untagged_string: str):
        """Tag POS with bigram tagger.
        :type untagged_string: str
        :param untagged_string: An untagged, untokenized string of text.
        :return: the tagged text as a list of (token, tag) tuples
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['bigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Developer ID: cltk, Project: cltk, Source file: pos.py

Example 13: tag_trigram

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_trigram(self, untagged_string: str):
        """Tag POS with trigram tagger.
        :type untagged_string: str
        :param untagged_string: An untagged, untokenized string of text.
        :return: the tagged text as a list of (token, tag) tuples
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['trigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Developer ID: cltk, Project: cltk, Source file: pos.py

Example 14: tag_ngram_123_backoff

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_ngram_123_backoff(self, untagged_string: str):
        """Tag POS with 1-, 2-, 3-gram tagger.
        :type untagged_string: str
        :param untagged_string: An untagged, untokenized string of text.
        :return: the tagged text as a list of (token, tag) tuples
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['ngram_123_backoff']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Developer ID: cltk, Project: cltk, Source file: pos.py

Example 15: tag_ngram_12_backoff

# Required module: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_ngram_12_backoff(self, untagged_string: str):
        """Tag POS with 1-, 2-gram tagger.
        :type untagged_string: str
        :param untagged_string: An untagged, untokenized string of text.
        :return: the tagged text as a list of (token, tag) tuples
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['ngram_12_backoff']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Developer ID: cltk, Project: cltk, Source file: pos.py


Note: the nltk.tokenize.wordpunct_tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code belongs to the original authors. For distribution and use, please refer to the license of the corresponding project. Do not reproduce without permission.