This article collects typical usage examples of the Python method nltk.tokenize.word_tokenize. If you are wondering what exactly tokenize.word_tokenize does, or how to use it, the curated code examples below may help. You can also explore further usage examples of the containing module, nltk.tokenize.
The following shows 15 code examples of tokenize.word_tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
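All of the examples below assume that NLTK is installed and that the Punkt tokenizer models have been downloaded; word_tokenize raises a LookupError without them. A minimal setup sketch (the sample sentence is illustrative):

import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt tokenizer models; download them once before first use.
nltk.download('punkt')

print(word_tokenize("NLTK makes tokenization easy, doesn't it?"))
# ['NLTK', 'makes', 'tokenization', 'easy', ',', 'does', "n't", 'it', '?']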
Example 1: createCorpus
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def createCorpus(t):
    corpus = []
    all_sent = []
    for k in t:
        for p in t[k]:
            corpus.append(st(p))
    for sent in range(len(corpus)):
        for k in corpus[sent]:
            all_sent.append(k)
    for m in range(len(all_sent)):
        all_sent[m] = wt(all_sent[m])

    all_words = []
    for sent in all_sent:
        hold = []
        for word in sent:
            hold.append(word.lower())
        all_words.append(hold)
    return all_words
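A hypothetical usage sketch for the function above. The aliases st and wt are assumed to be sent_tokenize and word_tokenize, and the input t is assumed to be a dict mapping keys to lists of paragraph strings; neither assumption is confirmed by the source.

from nltk.tokenize import sent_tokenize as st, word_tokenize as wt

docs = {"doc1": ["First paragraph. It has two sentences.",
                 "Second paragraph here."]}
corpus_words = createCorpus(docs)
# Each element is one lower-cased, tokenized sentence, e.g.
# [['first', 'paragraph', '.'], ['it', 'has', 'two', 'sentences', '.'], ['second', 'paragraph', 'here', '.']]
print(corpus_words)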
Example 2: get_summary
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def get_summary(self, number_of_sentences=5):
    '''
    generates summary based on weighted word frequencies
    :param number_of_sentences: total number of sentences to return in summary
    :return: string of summary
    '''
    sentence_value = {}
    for sentence in self.__sentence:
        for word in self.__word_freq.keys():
            if word in word_tokenize(sentence.lower()):
                if sentence in sentence_value:
                    sentence_value[sentence] += self.__word_freq.get(word)
                else:
                    sentence_value[sentence] = self.__word_freq.get(word, 0)

    summary_sentences = heapq.nlargest(number_of_sentences, sentence_value, key=sentence_value.get)
    summary = ' '.join(summary_sentences)
    return summary
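The method depends on two attributes built elsewhere in its class: self.__sentence (a list of sentences, presumably from sent_tokenize) and self.__word_freq (a dict of word frequencies). A standalone sketch of the same weighted-frequency scoring, with those inputs built inline (the sample text and names here are illustrative, not from the source):

import heapq
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize

text = "NLTK is a leading platform. NLTK provides tokenizers. Tokenizers split text."
sentences = sent_tokenize(text)
word_freq = Counter(w.lower() for w in word_tokenize(text) if w.isalnum())

sentence_value = {}
for sentence in sentences:
    tokens = word_tokenize(sentence.lower())
    # score a sentence by the summed frequency of the known words it contains
    sentence_value[sentence] = sum(word_freq[w] for w in tokens if w in word_freq)

# keep the two highest-scoring sentences as the "summary"
print(' '.join(heapq.nlargest(2, sentence_value, key=sentence_value.get)))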
Example 3: tokenize_data
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def tokenize_data(data):
    '''
    Tokenize captions, questions and answers
    Also maintain word count if required
    '''
    ques_toks, ans_toks, caption_toks = [], [], []

    print(data['split'])
    print('Tokenizing captions...')
    for i in data['data']['dialogs']:
        caption = word_tokenize(i['caption'])
        caption_toks.append(caption)

    print('Tokenizing questions...')
    for i in data['data']['questions']:
        ques_tok = word_tokenize(i + '?')
        ques_toks.append(ques_tok)

    print('Tokenizing answers...')
    for i in data['data']['answers']:
        ans_tok = word_tokenize(i)
        ans_toks.append(ans_tok)

    return ques_toks, ans_toks, caption_toks
Example 4: createIndex
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def createIndex(self):
    # create index
    print('creating index...')
    imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
    qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
    qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
    max_ques_len = 0
    for ann in self.dataset['annotations']:
        imgToQA[ann['image_id']] += [ann]
        qa[ann['question_id']] = ann
    for ques in self.questions['questions']:
        qqa[ques['question_id']] = ques
        max_ques_len = max(max_ques_len,
                           len(word_tokenize(ques['question'])))
    print('index created!')

    # create class members
    self.qa = qa
    self.qqa = qqa
    self.imgToQA = imgToQA
    self.max_ques_len = max_ques_len
Example 5: filter_by_ques_len
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def filter_by_ques_len(self, max_ques_len):
    print("Filtering the questions by length...")
    keep_ques = {}
    for ques in tqdm(self.questions['questions']):
        if len(word_tokenize(ques['question'])) <= max_ques_len:
            keep_ques[ques['question_id']] = \
                keep_ques.get(ques['question_id'], 0) + 1

    self.dataset['annotations'] = \
        [ann for ann in self.dataset['annotations']
         if keep_ques.get(ann['question_id'], 0) > 0]
    self.questions['questions'] = \
        [ques for ques in self.questions['questions']
         if keep_ques.get(ques['question_id'], 0) > 0]

    self.createIndex()
Example 6: filter_by_ans_len
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def filter_by_ans_len(self, max_ans_len, min_freq=5):
    print("Filtering the answers by length...")
    keep_ques = {}
    for ann in tqdm(self.dataset['annotations']):
        if len(word_tokenize(ann['best_answer'])) <= max_ans_len \
                and ann['best_answer_count'] >= min_freq:
            keep_ques[ann['question_id']] = \
                keep_ques.get(ann['question_id'], 0) + 1

    self.dataset['annotations'] = \
        [ann for ann in self.dataset['annotations']
         if keep_ques.get(ann['question_id'], 0) > 0]
    self.questions['questions'] = \
        [ques for ques in self.questions['questions']
         if keep_ques.get(ques['question_id'], 0) > 0]

    self.createIndex()
Example 7: quora_read
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def quora_read(file_path, bleu_baseline=False):
    """Read the quora dataset"""
    print("Reading quora raw data .. ")
    print("  data path: %s" % file_path)
    with open(file_path) as fd:
        lines = fd.readlines()
    sentence_sets = []
    for l in tqdm(lines):
        p0, p1 = l[:-1].lower().split("\t")
        sentence_sets.append([word_tokenize(p0), word_tokenize(p1)])

    if bleu_baseline:
        print("calculating bleu ... ")
        hypothesis = [s[0] for s in sentence_sets]
        references = [s[1:] for s in sentence_sets]
        bleu = corpus_bleu(references, hypothesis)
        print("bleu on the training set: %.4f" % bleu)
    return sentence_sets
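The helpers used above come from outside nltk.tokenize; the imports presumably look roughly like this (an assumption based on the names, not confirmed by the source):

from tqdm import tqdm                                # progress bar over the raw lines
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu    # corpus-level BLEU for the paraphrase baseline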
Example 8: main
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def main():
    fce = convert_fce(args.fce_dataset_path)
    with open(args.output + "/fce-original.txt", 'w', encoding='utf-8') as out_original, \
            open(args.output + "/fce-applied.txt", 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    if re.search(r"[{>][.?!]$", tokenized_sents[i]):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    regexp = r'{([^{}]*?)=>([^{}]*?)}'
                    original = re.sub(regexp, r"\1", tokenized_sents[i])
                    applied = re.sub(regexp, r"\2", tokenized_sents[i])
                    # filter out nested alerts
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n")
Example 9: clean_text
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def clean_text(text):
    # stop_words = stopwords.words('english')
    stop_words = []
    stop_words.extend(['!', ',', '.', '?', '-s', '-ly', '</s> ', 's'])
    stemmer = WordNetLemmatizer()

    text = remove_short(text)
    text = clean_str(text)
    text = word_tokenize(text)
    text = [word for word in text if word not in stop_words]
    text = [stemmer.lemmatize(word) for word in text]
    return ' '.join(text)
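WordNetLemmatizer comes from nltk.stem and needs the WordNet corpus downloaded, while remove_short and clean_str are project-specific helpers not shown here. A rough sketch of the surrounding setup; the helper bodies below are assumptions for illustration only.

import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('wordnet')   # required by WordNetLemmatizer

def remove_short(text, min_len=3):
    # hypothetical stand-in: drop tokens shorter than min_len characters
    return ' '.join(w for w in text.split() if len(w) >= min_len)

def clean_str(text):
    # hypothetical stand-in: keep letters and basic punctuation, collapse whitespace
    text = re.sub(r"[^A-Za-z.,!?'\s]", ' ', text)
    return re.sub(r'\s+', ' ', text).strip()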
Example 10: build_dataset
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
    if step == "train":
        article_list = get_text_list(train_article_path, toy)
        title_list = get_text_list(train_title_path, toy)
    elif step == "valid":
        article_list = get_text_list(valid_article_path, toy)
    else:
        raise NotImplementedError

    x = [word_tokenize(d) for d in article_list]
    x = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in x]
    x = [d[:article_max_len] for d in x]
    x = [d + (article_max_len - len(d)) * [word_dict["<padding>"]] for d in x]

    if step == "valid":
        return x
    else:
        y = [word_tokenize(d) for d in title_list]
        y = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in y]
        y = [d[:(summary_max_len - 1)] for d in y]
        return x, y
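word_dict is expected to map tokens to integer ids and to contain the special entries "<unk>" and "<padding>". A hypothetical way to build such a dict from the training articles; the special-token set and vocabulary cutoff are illustrative choices, not taken from the source.

from collections import Counter
from nltk.tokenize import word_tokenize

def build_word_dict(text_list, vocab_size=50000):
    counter = Counter(w for line in text_list for w in word_tokenize(line))
    # reserve ids for the special tokens the dataset builder expects
    word_dict = {"<padding>": 0, "<unk>": 1}
    for word, _ in counter.most_common(vocab_size):
        word_dict.setdefault(word, len(word_dict))
    return word_dict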
Example 11: process_line
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def process_line(line):
    tokens = word_tokenize(line)
    output_tokens = []
    for token in tokens:
        if token in INS_PUNCTS:
            output_tokens.append(INS_PUNCTS[token])
        elif token in EOS_PUNCTS:
            output_tokens.append(EOS_PUNCTS[token])
        elif is_number(token):
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())
    return untokenize(" ".join(output_tokens) + " ")
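The lookup tables and helpers are defined elsewhere in that project. Plausible stand-ins consistent with how they are used above; these definitions are assumptions, not the project's actual values.

from nltk.tokenize import word_tokenize

# punctuation mapped to explicit tags: mid-sentence insertions vs. end-of-sentence marks
INS_PUNCTS = {",": ",COMMA", ";": ";SEMICOLON", ":": ":COLON"}
EOS_PUNCTS = {".": ".PERIOD", "?": "?QUESTIONMARK", "!": "!EXCLAMATIONMARK"}
NUM = "<NUM>"

def is_number(token):
    try:
        float(token.replace(",", ""))
        return True
    except ValueError:
        return False

def untokenize(text):
    # hypothetical stand-in: collapse extra whitespace left by joining tokens
    return " ".join(text.split()) + " "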
Example 12: validate
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def validate(self, sess, x_val, y_val, true_val):
    # Calculate BLEU on validation data
    hypotheses_val = []
    references_val = []
    symbol = []
    if self.config['experiment'] == 'qgen':
        symbol.append('?')

    for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate(
            data_utils.get_batches(x_val, y_val, self.batch_size)):
        answer_logits = sess.run(self.inference_logits,
                                 feed_dict={self.input_data: input_batch,
                                            self.source_sentence_length: source_sent_lengths,
                                            self.keep_prob: 1.0})

        for k, pred in enumerate(answer_logits):
            hypotheses_val.append(
                word_tokenize(" ".join([self.decoder_idx_word[i] for i in pred if i not in [self.pad, -1, self.eos]])) + symbol)
            references_val.append([word_tokenize(true_val[batch_i * self.batch_size + k])])

    bleu_scores = eval_utils.calculate_bleu_scores(references_val, hypotheses_val)
    self.epoch_bleu_score_val['1'].append(bleu_scores[0])
    self.epoch_bleu_score_val['2'].append(bleu_scores[1])
    self.epoch_bleu_score_val['3'].append(bleu_scores[2])
    self.epoch_bleu_score_val['4'].append(bleu_scores[3])
Example 13: summonehot
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def summonehot(corpus):
    allwords = []
    annotated = {}
    for sent in corpus:
        for word in wt(sent):
            allwords.append(word.lower())
    print(len(set(allwords)), "unique characters in corpus")

    #maxcorp=int(input("Enter desired number of vocabulary: "))
    maxcorp = int(len(set(allwords)) / 1.1)
    wordcount = Counter(allwords).most_common(maxcorp)
    allwords = []
    for p in wordcount:
        allwords.append(p[0])
    allwords = list(set(allwords))
    print(len(allwords), "unique characters in corpus after max corpus cut")

    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(allwords)

    # one hot
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

    # make look up dict
    for k in range(len(onehot_encoded)):
        inverted = cleantext(label_encoder.inverse_transform([argmax(onehot_encoded[k, :])])[0]).strip()
        annotated[inverted] = onehot_encoded[k]
    return label_encoder, onehot_encoded, annotated
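The encoders and other helpers used above presumably come from imports like the ones below; cleantext is project-specific, so the stand-in here is an assumption for illustration. Note that recent scikit-learn releases renamed OneHotEncoder's sparse= argument to sparse_output=.

import re
from collections import Counter
from numpy import argmax
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from nltk.tokenize import word_tokenize as wt

def cleantext(text):
    # hypothetical stand-in: keep only letters and digits, squeeze the rest into spaces
    return re.sub(r'[^A-Za-z0-9]+', ' ', text)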
Example 14: wordvecmatrix
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def wordvecmatrix(model, data):
    IO_data = {"article": [], "summaries": []}
    i = 1
    for k in range(len(data["articles"])):
        art = []
        summ = []
        for word in wt(data["articles"][k].lower()):
            try:
                art.append(model.wv.word_vec(word))
            except Exception as e:
                print(e)

        for word in wt(data["summaries"][k].lower()):
            try:
                summ.append(onehot[word])
                #summ.append(model.wv.word_vec(word))
            except Exception as e:
                print(e)

        IO_data["article"].append(art)
        IO_data["summaries"].append(summ)
        if i % 100 == 0:
            print("progress: " + str((i * 100) / len(data["articles"])))
        i += 1
    #announcedone()
    print('\007')
    return IO_data
Example 15: _tokenize
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def _tokenize(self, sent):
    return ' '.join(word_tokenize(sent))
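A quick usage note for the one-liner above: word_tokenize follows Penn Treebank conventions, so joining the tokens back with spaces separates punctuation and contractions from the surrounding words.

from nltk.tokenize import word_tokenize

print(' '.join(word_tokenize("Don't tokenize me, bro!")))
# Do n't tokenize me , bro !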