

Python tokenize.word_tokenize Method Code Examples

This article compiles typical usage examples of the nltk.tokenize.word_tokenize method in Python. If you are wondering how exactly tokenize.word_tokenize is used, what it does, or where to find real-world examples of it, the curated code examples below should help. You can also explore further usage examples from the nltk.tokenize module.


Fifteen code examples of the tokenize.word_tokenize method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
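
Before the examples, here is a minimal standalone sketch of how word_tokenize is typically called. It assumes NLTK is installed and that the Punkt tokenizer data is downloaded on first use; the sample sentence and the shown output are illustrative.

# Minimal word_tokenize sketch: split one sentence into word/punctuation tokens.
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # one-time download of the Punkt tokenizer models

tokens = word_tokenize("Mr. Smith isn't here, is he?")
print(tokens)
# Roughly: ['Mr.', 'Smith', 'is', "n't", 'here', ',', 'is', 'he', '?']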

Example 1: createCorpus

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def createCorpus(t):
    # Assumed aliases (not shown in this snippet):
    #   from nltk.tokenize import sent_tokenize as st, word_tokenize as wt
    corpus = []
    all_sent = []
    # Split every paragraph of every document into sentences
    for k in t:
        for p in t[k]:
            corpus.append(st(p))
    # Flatten the per-paragraph sentence lists into a single list of sentences
    for sent in range(len(corpus)):
        for k in corpus[sent]:
            all_sent.append(k)
    # Tokenize each sentence into words
    for m in range(len(all_sent)):
        all_sent[m] = wt(all_sent[m])

    # Lowercase every token
    all_words = []
    for sent in all_sent:
        hold = []
        for word in sent:
            hold.append(word.lower())
        all_words.append(hold)
    return all_words
Developer: DeepsMoseli, Project: Bidirectiona-LSTM-for-text-summarization-, Lines: 21, Source: word2vec.py

Example 2: get_summary

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def get_summary(self, number_of_sentences=5):
        '''
            generates summary based on weighted word frequencies

            :param number_of_sentences: total number of sentences to return in summary
            :return: string of summary
        '''
        sentence_value = {}
        for sentence in self.__sentence:
            for word in self.__word_freq.keys():
                if word in word_tokenize(sentence.lower()):
                    if sentence in sentence_value:
                        sentence_value[sentence] += self.__word_freq.get(word)
                    else:
                        sentence_value[sentence] = self.__word_freq.get(word, 0)
        
        summary_sentences = heapq.nlargest(number_of_sentences, sentence_value, key=sentence_value.get)
        summary = ' '.join(summary_sentences)
        return summary 
Developer: OmkarPathak, Project: Django-Bookworm, Lines: 21, Source: summarize.py
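
The method above depends on class state (self.__sentence and self.__word_freq) that is not shown here. As a rough, self-contained sketch of the same weighted-word-frequency idea, assuming plain NLTK plus the standard library (the function and variable names below are illustrative, not part of the original project):

# Illustrative sketch of frequency-based extractive summarization (not the class above).
import heapq
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize

def simple_summary(text, number_of_sentences=5):
    sentences = sent_tokenize(text)
    # Word frequencies over the whole document (lowercased, alphabetic tokens only)
    word_freq = Counter(w.lower() for w in word_tokenize(text) if w.isalpha())
    # Score each sentence by the summed frequency of the words it contains
    sentence_value = {
        s: sum(word_freq[w] for w in word_tokenize(s.lower()) if w in word_freq)
        for s in sentences
    }
    # Keep the top-scoring sentences as the summary
    top = heapq.nlargest(number_of_sentences, sentence_value, key=sentence_value.get)
    return ' '.join(top)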

Example 3: tokenize_data

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def tokenize_data(data):
    '''
    Tokenize captions, questions and answers
    Also maintain word count if required
    '''
    ques_toks, ans_toks, caption_toks = [], [], []

    print(data['split'])
    print('Tokenizing captions...')
    for i in data['data']['dialogs']:
        caption = word_tokenize(i['caption'])
        caption_toks.append(caption)

    print('Tokenizing questions...')
    for i in data['data']['questions']:
        ques_tok = word_tokenize(i + '?')
        ques_toks.append(ques_tok)

    print('Tokenizing answers...')
    for i in data['data']['answers']:
        ans_tok = word_tokenize(i)
        ans_toks.append(ans_tok)

    return ques_toks, ans_toks, caption_toks 
Developer: jiasenlu, Project: visDial.pytorch, Lines: 26, Source: prepro.py
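
The docstring mentions maintaining a word count, which this snippet does not show. A hedged sketch of how such a tally could be kept over the returned token lists with only the standard library, assuming data has been loaded as elsewhere in prepro.py (the variable names are illustrative):

# Illustrative word-count bookkeeping over the token lists produced above.
from collections import Counter

ques_toks, ans_toks, caption_toks = tokenize_data(data)  # 'data' as loaded elsewhere in prepro.py
word_count = Counter()
for toks in ques_toks + ans_toks + caption_toks:
    word_count.update(toks)
print(word_count.most_common(10))  # ten most frequent tokens across questions, answers and captions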

Example 4: createIndex

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def createIndex(self):
    # create index
    print('creating index...')
    imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
    qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
    qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
    max_ques_len = 0
    for ann in self.dataset['annotations']:
        imgToQA[ann['image_id']] += [ann]
        qa[ann['question_id']] = ann
    for ques in self.questions['questions']:
        qqa[ques['question_id']] = ques
        max_ques_len = max(max_ques_len,
                           len(word_tokenize(ques['question'])))
    print('index created!')

    # create class members
    self.qa = qa
    self.qqa = qqa
    self.imgToQA = imgToQA
    self.max_ques_len = max_ques_len
Developer: DeepRNN, Project: visual_question_answering, Lines: 23, Source: vqa.py

Example 5: filter_by_ques_len

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def filter_by_ques_len(self, max_ques_len):
    print("Filtering the questions by length...")
    keep_ques = {}
    for ques in tqdm(self.questions['questions']):
        if len(word_tokenize(ques['question'])) <= max_ques_len:
            keep_ques[ques['question_id']] = \
                keep_ques.get(ques['question_id'], 0) + 1

    self.dataset['annotations'] = \
        [ann for ann in self.dataset['annotations']
         if keep_ques.get(ann['question_id'], 0) > 0]
    self.questions['questions'] = \
        [ques for ques in self.questions['questions']
         if keep_ques.get(ques['question_id'], 0) > 0]

    self.createIndex()
Developer: DeepRNN, Project: visual_question_answering, Lines: 18, Source: vqa.py

Example 6: filter_by_ans_len

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def filter_by_ans_len(self, max_ans_len, min_freq=5):
    print("Filtering the answers by length...")
    keep_ques = {}
    for ann in tqdm(self.dataset['annotations']):
        if len(word_tokenize(ann['best_answer'])) <= max_ans_len \
                and ann['best_answer_count'] >= min_freq:
            keep_ques[ann['question_id']] = \
                keep_ques.get(ann['question_id'], 0) + 1

    self.dataset['annotations'] = \
        [ann for ann in self.dataset['annotations']
         if keep_ques.get(ann['question_id'], 0) > 0]
    self.questions['questions'] = \
        [ques for ques in self.questions['questions']
         if keep_ques.get(ques['question_id'], 0) > 0]

    self.createIndex()
Developer: DeepRNN, Project: visual_question_answering, Lines: 19, Source: vqa.py

Example 7: quora_read

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def quora_read(file_path, bleu_baseline=False):
  """Read the quora dataset"""
  print("Reading quora raw data .. ")
  print("  data path: %s" % file_path)
  with open(file_path) as fd:
    lines = fd.readlines()
  sentence_sets = []
  for l in tqdm(lines):
    p0, p1 = l[:-1].lower().split("\t")
    sentence_sets.append([word_tokenize(p0), word_tokenize(p1)])

  if(bleu_baseline):
    print("calculating bleu ... ")
    hypothesis = [s[0] for s in sentence_sets]
    references = [s[1:] for s in sentence_sets]
    bleu = corpus_bleu(references, hypothesis)
    print("bleu on the training set: %.4f" % bleu)
  return sentence_sets 
Developer: FranxYao, Project: dgm_latent_bow, Lines: 20, Source: data_utils.py
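
The bleu_baseline branch above relies on corpus_bleu, which comes from NLTK. A brief reminder of the assumed import and call shape (the token lists here are made up for illustration):

# Assumed import for the BLEU baseline above.
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu takes, per sentence, a list of reference token lists plus one hypothesis token list.
references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
print(corpus_bleu(references, hypotheses))  # 1.0 for an exact match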

Example 8: main

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def main():
    fce = convert_fce(args.fce_dataset_path)
    with open(args.output + "/fce-original.txt", 'w', encoding='utf-8') as out_original, \
            open(args.output + "/fce-applied.txt", 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    if re.search(r"[{>][.?!]$", tokenized_sents[i]):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    regexp = r'{([^{}]*?)=>([^{}]*?)}'
                    original = re.sub(regexp, r"\1", tokenized_sents[i])
                    applied = re.sub(regexp, r"\2", tokenized_sents[i])
                    # filter out nested alerts
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n") 
Developer: plkmo, Project: NLP_Toolkit, Lines: 22, Source: prepare_clc_fce_data.py

Example 9: clean_text

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def clean_text(text):
        # stop_words = stopwords.words('english')
        stop_words = []
        stop_words.extend(['!', ',' ,'.' ,'?' ,'-s' ,'-ly' ,'</s> ', 's'])
        stemmer = WordNetLemmatizer()  # despite the name, this is a lemmatizer, not a stemmer

        text = remove_short(text)
        text = clean_str(text)

        text = word_tokenize(text)

        text = [word for word in text if word not in stop_words]

        text = [stemmer.lemmatize(word) for word in text]

        return ' '.join(text) 
Developer: HuangLianzhe, Project: TextLevelGCN, Lines: 18, Source: pre_processing.py
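
clean_text also depends on project helpers not shown here (remove_short, clean_str) and on NLTK's WordNetLemmatizer. A quick reminder of the assumed NLTK setup, including the WordNet data download (the sample words are illustrative):

# Assumed NLTK setup for clean_text above.
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('wordnet')  # WordNetLemmatizer needs the WordNet corpus

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('cars'))             # 'car'
print(lemmatizer.lemmatize('better', pos='a'))  # 'good' when given the adjective POS tag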

Example 10: build_dataset

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
    if step == "train":
        article_list = get_text_list(train_article_path, toy)
        title_list = get_text_list(train_title_path, toy)
    elif step == "valid":
        article_list = get_text_list(valid_article_path, toy)
    else:
        raise NotImplementedError

    x = [word_tokenize(d) for d in article_list]
    x = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in x]
    x = [d[:article_max_len] for d in x]
    x = [d + (article_max_len - len(d)) * [word_dict["<padding>"]] for d in x]
    
    if step == "valid":
        return x
    else:        
        y = [word_tokenize(d) for d in title_list]
        y = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in y]
        y = [d[:(summary_max_len - 1)] for d in y]
        return x, y 
Developer: dongjun-Lee, Project: text-summarization-tensorflow, Lines: 23, Source: utils.py

Example 11: process_line

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def process_line(line):

    tokens = word_tokenize(line)
    output_tokens = []

    for token in tokens:

        if token in INS_PUNCTS:
            output_tokens.append(INS_PUNCTS[token])
        elif token in EOS_PUNCTS:
            output_tokens.append(EOS_PUNCTS[token])
        elif is_number(token):
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())

    return untokenize(" ".join(output_tokens) + " ") 
Developer: ottokart, Project: punctuator2, Lines: 19, Source: dont_run_me_run_the_other_script_instead.py

Example 12: validate

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def validate(self, sess, x_val, y_val, true_val):
        # Calculate BLEU on validation data
        hypotheses_val = []
        references_val = []
        symbol=[]
        if self.config['experiment'] == 'qgen':
            symbol.append('?')
        for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate(
                data_utils.get_batches(x_val, y_val, self.batch_size)):
            answer_logits = sess.run(self.inference_logits,
                                     feed_dict={self.input_data: input_batch,
                                                self.source_sentence_length: source_sent_lengths,
                                                self.keep_prob: 1.0})

            for k, pred in enumerate(answer_logits):
                hypotheses_val.append(
                    word_tokenize(" ".join([self.decoder_idx_word[i] for i in pred if i not in [self.pad, -1, self.eos]])) + symbol)
                references_val.append([word_tokenize(true_val[batch_i * self.batch_size + k])])

        bleu_scores = eval_utils.calculate_bleu_scores(references_val, hypotheses_val)
        self.epoch_bleu_score_val['1'].append(bleu_scores[0])
        self.epoch_bleu_score_val['2'].append(bleu_scores[1])
        self.epoch_bleu_score_val['3'].append(bleu_scores[2])
        self.epoch_bleu_score_val['4'].append(bleu_scores[3]) 
Developer: HareeshBahuleyan, Project: tf-var-attention, Lines: 26, Source: ded_detAttn.py

Example 13: summonehot

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def summonehot(corpus):
    allwords=[]
    annotated={}
    for sent in corpus:
        for word in wt(sent):
            allwords.append(word.lower())
    print(len(set(allwords)), "unique words in corpus")
    #maxcorp=int(input("Enter desired number of vocabulary: "))
    maxcorp=int(len(set(allwords))/1.1)
    wordcount = Counter(allwords).most_common(maxcorp)
    allwords=[]
    
    for p in wordcount:
        allwords.append(p[0])  
        
    allwords=list(set(allwords))
    
    print(len(allwords), "unique words in corpus after max corpus cut")
    #integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(allwords)
    #one hot
    onehot_encoder = OneHotEncoder(sparse=False)  # 'sparse' was renamed to 'sparse_output' in scikit-learn 1.2+
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    #make look up dict
    for k in range(len(onehot_encoded)): 
        inverted = cleantext(label_encoder.inverse_transform([argmax(onehot_encoded[k, :])])[0]).strip()
        annotated[inverted]=onehot_encoded[k]
    return label_encoder,onehot_encoded,annotated 
Developer: DeepsMoseli, Project: Bidirectiona-LSTM-for-text-summarization-, Lines: 32, Source: word2vec.py
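
The one-hot step above uses scikit-learn's LabelEncoder and OneHotEncoder. A small standalone sketch of that encode/invert round trip, written against a recent scikit-learn (the word list is illustrative):

# Illustrative LabelEncoder + OneHotEncoder round trip, mirroring the snippet above.
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

words = ['summary', 'text', 'model', 'text']
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(words).reshape(-1, 1)

onehot_encoder = OneHotEncoder(sparse_output=False)  # use sparse=False on older scikit-learn, as in the snippet
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

# Recover the word from a one-hot row, as summonehot does with argmax + inverse_transform
row = onehot_encoded[0]
print(label_encoder.inverse_transform([int(np.argmax(row))])[0])  # 'summary'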

Example 14: wordvecmatrix

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def wordvecmatrix(model,data):
    IO_data={"article":[],"summaries":[]}
    i=1
    for k in range(len(data["articles"])):
        art=[]
        summ=[]
        for word in wt(data["articles"][k].lower()):
            try:
                art.append(model.wv.word_vec(word))
            except Exception as e:
                print(e)

        for word in wt(data["summaries"][k].lower()):
            try:
                summ.append(onehot[word])  # 'onehot': presumably the one-hot lookup dict built by summonehot
                #summ.append(model.wv.word_vec(word))
            except Exception as e:
                print(e)
        
        IO_data["article"].append(art) 
        IO_data["summaries"].append(summ)
        if i%100==0:
            print("progress: " + str(((i*100)/len(data["articles"]))))
        i+=1
    #announcedone()
    print('\007')  # ring the terminal bell to signal completion
    return IO_data 
Developer: DeepsMoseli, Project: Bidirectiona-LSTM-for-text-summarization-, Lines: 29, Source: word2vec.py

Example 15: _tokenize

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import word_tokenize [as alias]
def _tokenize(self, sent):
        return ' '.join(word_tokenize(sent)) 
Developer: AuCson, Project: SEDST, Lines: 4, Source: metric.py


Note: The nltk.tokenize.word_tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Please consult the corresponding project's license before distributing or using the code, and do not republish without permission.