This article collects typical usage examples of the Python function nltk.tokenize.wordpunct_tokenize. If you are wondering what tokenize.wordpunct_tokenize does, how to call it, or what real-world usage looks like, the curated code examples below should help. You can also explore further usage examples of the module nltk.tokenize in which this function is defined.
The following 15 code examples of tokenize.wordpunct_tokenize are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
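As a quick reference before the examples: wordpunct_tokenize splits text into alternating runs of alphanumeric characters and punctuation (the regular expression \w+|[^\w\s]+). A minimal sketch of its behavior (the output in the comment is indicative):

from nltk.tokenize import wordpunct_tokenize

print(wordpunct_tokenize("Good muffins cost $3.88 in New York."))
# -> ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']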
Example 1: generate_vocabulary
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
# Also requires: import pandas as pd
def generate_vocabulary(self, review_summary_file):
    """
    Build the word <-> index vocabulary maps from a CSV of (review, summary) pairs.

    :param review_summary_file: path to a CSV file whose rows hold a review and its summary.
    :return: None; populates self.map and self.revmap in place.
    """
    self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values
    for review, summary in self.rev_sum_pair:
        rev_lst = wordpunct_tokenize(review)
        sum_lst = wordpunct_tokenize(summary)
        self.__add_list_to_dict(rev_lst)
        self.__add_list_to_dict(sum_lst)
    # Now store the "" empty string as the last word of the vocabulary.
    # Capture the index first so that map and revmap stay consistent.
    empty_idx = len(self.map)
    self.map[""] = empty_idx
    self.revmap[empty_idx] = ""
Example 2: text2idx2
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
# Also requires: import numpy as np
def text2idx2(texts, vocab, dim):
'''
Convert a list of texts to their corresponding vocabulary indexes.
'''
out = -np.ones((len(texts), dim), dtype=np.int32)
mask = np.zeros((len(texts), dim), dtype=np.float32)
for i, text in enumerate(texts):
j = 0
for word in wordpunct_tokenize(text):
if word in vocab:
out[i,j] = vocab[word]
mask[i,j] = 1.
j += 1
if j == dim:
break
return out, mask
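A minimal, hypothetical call (the toy vocab and dim=5 below are invented for illustration); out uses -1 for padding positions while mask marks the valid ones:

vocab = {"hello": 0, "world": 1, "!": 2}   # toy vocabulary (assumption)
out, mask = text2idx2(["hello world!"], vocab, dim=5)
# out  -> [[ 0  1  2 -1 -1]]
# mask -> [[1. 1. 1. 0. 0.]]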
Example 3: augment
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
# Also requires: import numpy as np; from random import randint (plus the project's prm config module)
def augment(texts, dic_thes):
if prm.aug<2:
return texts
out = []
for text in texts:
words_orig = wordpunct_tokenize(text)
        maxrep = max(2, int(0.1 * len(words_orig)))  # how many words may be replaced; at most ~10% of the words, with a minimum of 2
for j in range(prm.aug):
words = list(words_orig) #copy
for k in range(randint(1,maxrep)):
idx = randint(0,len(words)-1)
word = words[idx]
if word in dic_thes:
                    synonym = min(np.random.geometric(0.5), len(dic_thes[word]) - 1)  # choose the synonym index from a geometric distribution
#print 'fp',fp,"word", word,"synonym",dic_thes[word][synonym]
words[idx] = dic_thes[word][synonym]
out.append(" ".join(words))
return out
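Usage depends on the project's prm config module (prm.aug controls how many augmented copies are produced per text) and on a thesaurus dict mapping each word to a list of synonyms. A hypothetical call, assuming prm.aug has been set to 3:

dic_thes = {"good": ["good", "nice", "fine", "great"]}   # toy thesaurus (assumption)
augmented = augment(["this is a good movie"], dic_thes)
# -> three copies of the sentence, each with one or two randomly chosen
#    words swapped for a synonym when they appear in dic_thes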
Example 4: tiny_tokenize
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
# Also requires: import re, string; from nltk.stem.snowball import EnglishStemmer
def tiny_tokenize(text, stem=False, stop_words=[]):
    # `text` is expected to be a bytes object; decode it, strip punctuation,
    # then tokenize. Digits and stop words are dropped.
    words = []
    stemmer = EnglishStemmer() if stem else None
    cleaned = re.sub('[%s]' % re.escape(string.punctuation), ' ',
                     text.decode(encoding='UTF-8', errors='ignore'))
    for token in wordpunct_tokenize(cleaned):
        if not token.isdigit() and token not in stop_words:
            if stemmer is not None:
                try:
                    w = stemmer.stem(token)
                except Exception:
                    w = token
            else:
                w = token
            words.append(w)
    return words
# return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
# re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
# not token.isdigit() and not token in stop_words]
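A hypothetical call (note that the function expects bytes, which it decodes as UTF-8, and that the stop-word check is case-sensitive):

print(tiny_tokenize(b"The 3 cats running quickly!", stem=True,
                    stop_words=["the"]))
# Expected (approximately): ['the', 'cat', 'run', 'quick']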
Example 5: parseDocument
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def parseDocument(doc, vocab):
wordslist = list()
countslist = list()
doc = doc.lower()
tokens = wordpunct_tokenize(doc)
dictionary = dict()
for word in tokens:
if word in vocab:
wordtk = vocab[word]
if wordtk not in dictionary:
dictionary[wordtk] = 1
else:
dictionary[wordtk] += 1
    wordslist.append(list(dictionary.keys()))
    countslist.append(list(dictionary.values()))
return (wordslist[0], countslist[0])
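A hypothetical call with a toy vocabulary (word -> integer id); the function lowercases the document, keeps only in-vocabulary tokens, and returns parallel lists of token ids and counts:

vocab = {"apple": 0, "banana": 1}                 # toy vocabulary (assumption)
ids, counts = parseDocument("Apple banana apple!", vocab)
# ids    -> [0, 1]    (order of first occurrence)
# counts -> [2, 1]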
Example 6: __generate_tensor
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
# Also requires: import numpy as np
def __generate_tensor(self, is_review, reverse=False):
"""
:param is_review:
:param reverse:
:return:
"""
seq_length = self.review_max_words if is_review else self.summary_max_words
total_rev_summary_pairs = self.rev_sum_pair.shape[0]
data_tensor = np.zeros([total_rev_summary_pairs,seq_length])
sample = self.rev_sum_pair[0::, 0] if is_review else self.rev_sum_pair[0::, 1]
for index, entry in enumerate(sample.tolist()):
index_lst = np.array([self.map[word.lower()] for word in wordpunct_tokenize(entry)])
# reverse if want to get backward form
if reverse:
index_lst = index_lst[::-1]
# Pad the list
if len(index_lst) <= seq_length:
index_lst = np.lib.pad(index_lst, (0,seq_length - index_lst.size), 'constant', constant_values=(0, 0))
else:
index_lst = index_lst[0:seq_length]
data_tensor[index] = index_lst
return data_tensor
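The pad-or-truncate step is the heart of this method; the same idea in isolation (the helper name and values below are illustrative only):

import numpy as np

def pad_or_truncate(index_lst, seq_length):
    # Right-pad with zeros up to seq_length, or cut off the excess.
    if len(index_lst) <= seq_length:
        return np.pad(index_lst, (0, seq_length - len(index_lst)),
                      'constant', constant_values=0)
    return np.asarray(index_lst[:seq_length])

print(pad_or_truncate([4, 7, 9], 5))   # -> [4 7 9 0 0]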
Example 7: read_wordpunct_block
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def read_wordpunct_block(stream):
toks = []
for i in range(20): # Read 20 lines at a time.
toks.extend(wordpunct_tokenize(stream.readline()))
return toks
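A quick check with an in-memory stream standing in for a corpus file handle:

import io

stream = io.StringIO("Hello, world!\nSecond line.")
print(read_wordpunct_block(stream))
# -> ['Hello', ',', 'world', '!', 'Second', 'line', '.']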
Example 8: __call__
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def __call__(self, sentence):
return wordpunct_tokenize(sentence)
Example 9: BOW2
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
# Also requires: import numpy as np (plus the project's BOW helper)
def BOW2(texts, vocab, dim):
'''
Convert a list of texts to the BoW dense representation.
'''
out = np.zeros((len(texts), dim), dtype=np.int32)
mask = np.zeros((len(texts), dim), dtype=np.float32)
for i, text in enumerate(texts):
bow = BOW(wordpunct_tokenize(text), vocab)
out[i,:len(bow[0])] = bow[0]
mask[i,:len(bow[1])] = bow[1]
return out, mask
Example 10: Word2Vec_encode
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
# Also requires: import numpy as np (plus the project's prm config module)
def Word2Vec_encode(texts, wemb):
out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
for i, text in enumerate(texts):
words = wordpunct_tokenize(text)
n = 0.
for word in words:
if word in wemb:
out[i,:] += wemb[word]
n += 1.
out[i,:] /= max(1.,n)
return out
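This encodes each text as the average of its in-vocabulary word embeddings. A toy illustration of that averaging, assuming prm.dim_emb matches the embedding size (the 2-d vectors below are invented):

import numpy as np

wemb = {"good": np.array([1.0, 0.0]),     # toy 2-d embeddings (assumption)
        "movie": np.array([0.0, 1.0])}
# With prm.dim_emb == 2, Word2Vec_encode(["good movie"], wemb) would
# return [[0.5, 0.5]]: the mean of the two word vectors.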
Example 11: tag_unigram
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_unigram(self, untagged_string: str):
"""Tag POS with unigram tagger.
:type untagged_string: str
:param : An untagged, untokenized string of text.
:rtype tagged_text: str
"""
untagged_tokens = wordpunct_tokenize(untagged_string)
pickle_path = self.available_taggers['unigram']
tagger = open_pickle(pickle_path)
tagged_text = tagger.tag(untagged_tokens)
return tagged_text
Example 12: tag_bigram
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_bigram(self, untagged_string: str):
"""Tag POS with bigram tagger.
:type untagged_string: str
:param : An untagged, untokenized string of text.
:rtype tagged_text: str
"""
untagged_tokens = wordpunct_tokenize(untagged_string)
pickle_path = self.available_taggers['bigram']
tagger = open_pickle(pickle_path)
tagged_text = tagger.tag(untagged_tokens)
return tagged_text
Example 13: tag_trigram
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_trigram(self, untagged_string: str):
"""Tag POS with trigram tagger.
:type untagged_string: str
:param : An untagged, untokenized string of text.
:rtype tagged_text: str
"""
untagged_tokens = wordpunct_tokenize(untagged_string)
pickle_path = self.available_taggers['trigram']
tagger = open_pickle(pickle_path)
tagged_text = tagger.tag(untagged_tokens)
return tagged_text
Example 14: tag_ngram_123_backoff
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_ngram_123_backoff(self, untagged_string: str):
"""Tag POS with 1-, 2-, 3-gram tagger.
:type untagged_string: str
:param : An untagged, untokenized string of text.
:rtype tagged_text: str
"""
untagged_tokens = wordpunct_tokenize(untagged_string)
pickle_path = self.available_taggers['ngram_123_backoff']
tagger = open_pickle(pickle_path)
tagged_text = tagger.tag(untagged_tokens)
return tagged_text
Example 15: tag_ngram_12_backoff
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import wordpunct_tokenize [as alias]
def tag_ngram_12_backoff(self, untagged_string: str):
"""Tag POS with 1-, 2-gram tagger.
:type untagged_string: str
:param : An untagged, untokenized string of text.
:rtype tagged_text: str
"""
untagged_tokens = wordpunct_tokenize(untagged_string)
pickle_path = self.available_taggers['ngram_12_backoff']
tagger = open_pickle(pickle_path)
tagged_text = tagger.tag(untagged_tokens)
return tagged_text