

Python tokenize.sent_tokenize Method Code Examples

This article collects typical usage examples of the Python method nltk.tokenize.sent_tokenize. If you are wondering what exactly tokenize.sent_tokenize does, how to use it, or what example code looks like, the curated method examples below may help. You can also explore further usage examples from the module it belongs to, nltk.tokenize.


A total of 15 code examples of the tokenize.sent_tokenize method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
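
For orientation, here is a minimal, self-contained usage sketch before the project-specific examples (not taken from any of the projects below; the sample text is invented). It assumes NLTK is installed and the 'punkt' sentence tokenizer model has been downloaded:

# Minimal sketch: basic sent_tokenize / word_tokenize usage
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt', quiet=True)  # one-time download of the sentence tokenizer model

text = "Dr. Smith went to Washington. He arrived at 9 a.m. and it was raining."
sentences = sent_tokenize(text)                  # split the text into sentences
tokens = [word_tokenize(s) for s in sentences]   # split each sentence into word tokens
print(sentences)
print(tokens)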

Example 1: createCorpus

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
# In the source project, st and wt are assumed to be aliases: from nltk.tokenize import sent_tokenize as st, word_tokenize as wt
def createCorpus(t):
    corpus = []
    all_sent = []
    for k in t:                        # t: dict mapping keys to lists of paragraph strings
        for p in t[k]:
            corpus.append(st(p))       # split each paragraph into sentences
    for sent in range(len(corpus)):
        for k in corpus[sent]:
            all_sent.append(k)         # flatten into one list of sentences
    for m in range(len(all_sent)):
        all_sent[m] = wt(all_sent[m])  # tokenize each sentence into words

    all_words = []
    for sent in all_sent:
        hold = []
        for word in sent:
            hold.append(word.lower())  # lowercase every token
        all_words.append(hold)
    return all_words
Developer ID: DeepsMoseli, Project: Bidirectiona-LSTM-for-text-summarization-, Lines: 21, Source: word2vec.py

Example 2: main

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def main():
    fce = convert_fce(args.fce_dataset_path)
    with open(args.output + "/fce-original.txt", 'w', encoding='utf-8') as out_original, \
            open(args.output + "/fce-applied.txt", 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    if re.search(r"[{>][.?!]$", tokenized_sents[i]):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    regexp = r'{([^{}]*?)=>([^{}]*?)}'
                    original = re.sub(regexp, r"\1", tokenized_sents[i])
                    applied = re.sub(regexp, r"\2", tokenized_sents[i])
                    # filter out nested alerts
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n") 
Developer ID: plkmo, Project: NLP_Toolkit, Lines: 22, Source: prepare_clc_fce_data.py

Example 3: find_abr_fullname

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def find_abr_fullname(doc, query, Num):
    """Find the query's (abbreviation's) full name within the document.
       Parameters:
       doc: the document to be searched (specified format)
       query: the abbreviation
       Num: the number of sentences before the query in which to look for the full name
       (here we assume that the full name of the query appears before the query)
    """
    sents = [word_tokenize(t) for t in sent_tokenize(doc)]
    for i, sent in enumerate(sents):
        if query in sent:
            fullname = find_abr_fn(sent, query)
            if fullname != -1:
                return fullname
            else:
                j = 1
                while i - j >= 0 and j <= Num:
                    # search the preceding sentences, up to Num sentences back
                    if find_abr_fn(sents[i - j], query) == -1:
                        j += 1
                    else:
                        return find_abr_fn(sents[i - j], query)

    raise Exception('No query in the document.')
Developer ID: c-amr, Project: camr, Lines: 25, Source: util.py

Example 4: paragraph_to_sentences

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def paragraph_to_sentences(paragraph, term):
    """
    Turns a paragraph into clean, preprocessed sentences
    """
    result = []
    paragraph = re.sub(r"([^ ])([\(\[\"])", r"\1 \2", paragraph)  # Give brackets space to breathe
    paragraph = re.sub(r"([\)\]\"\!\?:])([^ ])", r"\1 \2", paragraph)
    paragraph = re.sub(r"([^. ]{3})\.([^. ]{3}|A |An )", r"\1. \2", paragraph)
    paragraph = re.sub(r" e\.?g\.? ", " _eg_ ", paragraph)  # sent_tokenize improperly splits sentences here
    paragraph = re.sub(r" i\.?e\.? ", " _ie_ ", paragraph)
    sentences = sent_tokenize(paragraph)
    for sentence in sentences:
        sentence = sentence.replace("_eg_", "_e.g._").replace("_ie_", "i.e.")  # reverts edge case
        processed = preprocess_sentence(sentence, term)
        if qualify_sentence(processed):
            result.append(processed)
    return result


# Sentences
######################## 
Developer ID: wordnik, Project: serapis, Lines: 23, Source: preprocess.py

Example 5: summarize

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def summarize(self, text, n):
    """
      Return a list of n sentences 
      which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i,sent in enumerate(word_sent):
      for w in sent:
        if w in self._freq:
          ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)    
    return [sents[j] for j in sents_idx] 
Developer ID: assafelovic, Project: nlp_url_summarizer, Lines: 18, Source: text_summarizer.py

Example 6: _convert_files_to_binary

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def _convert_files_to_binary(input_filenames, output_filename):
  with open(output_filename, 'wb') as writer:
    for filename in input_filenames:
      with open(filename, 'r') as f:
        document = f.read()
    
      document_parts = document.split('\n', 1)
      assert len(document_parts) == 2
    
      title = '<d><p><s>' + document_parts[0] + '</s></p></d>'
      
      # Python 2 string handling: the raw bytes are decoded, cleaned, then re-encoded below
      body = document_parts[1].decode('utf8').replace('\n', ' ').replace('\t', ' ')
      sentences = sent_tokenize(body)
      body = '<d><p>' + ' '.join(['<s>' + sentence + '</s>' for sentence in sentences]) + '</p></d>'
      body = body.encode('utf8')
    
      tf_example = example_pb2.Example()
      tf_example.features.feature['article'].bytes_list.value.extend([body])
      tf_example.features.feature['abstract'].bytes_list.value.extend([title])
      tf_example_str = tf_example.SerializeToString()
      str_len = len(tf_example_str)
      writer.write(struct.pack('q', str_len))
      writer.write(struct.pack('%ds' % str_len, tf_example_str)) 
Developer ID: surmenok, Project: TextSum, Lines: 25, Source: textsum_data_convert.py

Example 7: doc_to_ids

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def doc_to_ids(self, doc, training=True):
        l = []
        words = dict()
        window = 150
#        doc = doc.replace("&ndash;", " ")
#        doc = sent_tokenize(doc)
        for sentence in doc:
            miniArray = []
            for term in sentence:
                id = self.term_to_id(term, training)
                if id is not None:
                    miniArray.append(id)
                    if id not in words:
                        words[id] = 1
                        self.docfreq[id] += 1
            if not len(miniArray):
                continue
            if len(miniArray) > window:
                # xrange is Python 2; use range in Python 3
                l.extend([np.array(miniArray[i:i+window]) for i in xrange(0, len(miniArray), window)])
            else:
                l.append(np.array(miniArray))
        return l 
Developer ID: balikasg, Project: topicModelling, Lines: 24, Source: vocabulary.py

Example 8: split_into_sentences

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def split_into_sentences(text, doc_annotations, tokenizer):
  """Split into sentences and return bookkeeping info."""
  sentences = []
  sentences_starts = []
  sentence_annotations = []
  doc_annotations = sorted(doc_annotations, key=lambda x: x[2])
  annotation_idx = 0
  sentences_text = tokenize.sent_tokenize(text)
  token_idx = 0
  for sentence_text in sentences_text:
    sub_tokens, word_starts = tokenizer.tokenize(sentence_text)
    sentences.append(sub_tokens)
    sentences_starts.append(word_starts)
    sentence_annotations.append([])
    token_idx += len(sentence_text.split(" "))
    while annotation_idx < len(
        doc_annotations) and doc_annotations[annotation_idx][2] < token_idx:
      sentence_annotations[-1].append(doc_annotations[annotation_idx])
      annotation_idx += 1
  return sentences, sentences_starts, sentence_annotations 
Developer ID: google-research, Project: language, Lines: 22, Source: create_pretraining_data.py

Example 9: score_sentences

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def score_sentences(query,
                    doc_json,
                    entity,
                    sentence_scores,
                    max_sentence_len,
                    n=3):
  """Score sentences with respect to the query."""
  sentences = tokenize.sent_tokenize(doc_json['text'])
  query_ngrams = util.get_ngrams(tokenize.word_tokenize(query), n)
  for sentence in sentences:
    sentence_tokens = tokenize.word_tokenize(sentence)
    tokens = tokenize.word_tokenize(
        entity['wikipedia_name']) + [':'] + sentence_tokens[:max_sentence_len]
    sentence_ngrams = util.get_ngrams(tokens, n)
    score = len(set(sentence_ngrams).intersection(query_ngrams)) / max(
        1, len(query_ngrams))
    # list.append takes a single argument: append the ((entity, tokens), score) pair
    sentence_scores.append(((entity, sentence_tokens), score))
Developer ID: google-research, Project: language, Lines: 19, Source: background.py
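
The scoring above depends on util.get_ngrams, which is not shown in this snippet. The following stand-in and worked example are assumptions for illustration only, not the project's actual code:

# Hypothetical stand-in for util.get_ngrams: contiguous n-grams of a token list
from nltk import tokenize

def get_ngrams(tokens, n):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

query = "climate change effects"
sentence = "The effects of climate change are widely studied."
n = 2  # bigrams, to keep the example small
query_ngrams = get_ngrams(tokenize.word_tokenize(query), n)
sentence_ngrams = get_ngrams(tokenize.word_tokenize(sentence), n)
# overlap score, computed the same way as in score_sentences above
score = len(set(sentence_ngrams).intersection(query_ngrams)) / max(1, len(query_ngrams))
print(score)  # 0.5 here: the query bigram ('climate', 'change') appears in the sentence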

Example 10: extractFeatures

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def extractFeatures(self, article, n, customStopWords=None):
        # article is passed in as a tuple (text, title)
        text = article[0]
        # extract the text
        title = article[1]
        # extract the title
        sentences = sent_tokenize(text)
        # split text into sentences
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        # split each sentence into lowercase words
        self._freq = self._compute_frequencies(word_sent, customStopWords)
        # calculate word frequencies using the member function created above
        if n < 0:
            # how many features (words) to return - a negative number means
            # no feature (word) selection, just return all features
            return nlargest(len(self._freq),
                            self._freq, key=self._freq.get)
        else:
            # if the calling function has asked for a subset,
            # return only the 'n' largest features, i.e. the
            # most important words (important == frequent, excluding stopwords)
            return nlargest(n, self._freq, key=self._freq.get)
Developer ID: qalhata, Project: Python-Scripts-Repo-on-Data-Science, Lines: 24, Source: NewsArticleClass.py

Example 11: summarize

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def summarize(self, article, n):
        # article is a (text, title) tuple; only the text is summarized
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sentences_index = nlargest(n, ranking, key=ranking.get)
        return [sentences[j] for j in sentences_index]

##############################################################################
# TEST 
Developer ID: qalhata, Project: Python-Scripts-Repo-on-Data-Science, Lines: 18, Source: NewsArticleClass.py

Example 12: offset_tokenize

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        scaped_tok = re.escape(tok)
        m = re.search(scaped_tok, tail)
        start, end = m.span()
        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the rest
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens 
Developer ID: UKPLab, Project: semeval2017-scienceie, Lines: 19, Source: extras.py
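
A rough, hypothetical usage of the offset_tokenize helper above (the sample string is invented; the function must already be defined or imported, and NLTK's 'punkt' model downloaded):

# Each token is returned with its (start, end) character offsets into the original text
text = "Hello world. Bye."
for tok, (start, end) in offset_tokenize(text):
    print(tok, (start, end), repr(text[start:end]))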

Example 13: convert_to_single_sentence

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def convert_to_single_sentence(doc_str, e1_start, e1_end, e2_start, e2_end, annotation_map):
    offsets = zip(e1_start+e2_start, e1_end+e2_end, [1]*len(e1_start)+[2]*len(e2_start))
    offsets = sorted(offsets, key=lambda tup: tup[0])
    replaced_doc_str = [process_single_annotation(doc_str, 0, s, e, annotation_map, i, ent_id) if i == 0
                        else
                        process_single_annotation(doc_str, offsets[i-1][1], s, e, annotation_map, i, ent_id)
                        for i, (s, e, ent_id) in enumerate(offsets)]

    replaced_doc_str.append(' '.join(doc_str[offsets[-1][1]:]))
    new_doc_str = ''.join(replaced_doc_str)

    ## TODO only works for data with single e1 and e2 mention
    # Python 2: the byte string is decoded to unicode before sentence splitting
    sentences = sent_tokenize(new_doc_str.replace('@@ ', '').decode('utf-8'))
    tokenized_sents = [tokenize(s) for s in sentences]
    chosen_sent = [i for i, s in enumerate(sentences) if s.count(ENTITY_STRING) >= 2]
    if chosen_sent:
        if FLAGS.full_abstract:
            replaced_sent = [annotation_map[w] if w in annotation_map else w for s in tokenized_sents for w in s]
        else:
            idx = chosen_sent[0]
            s_idx = max(0, idx - FLAGS.sentence_window)
            e_idx = min(idx + FLAGS.sentence_window+1, len(tokenized_sents))
            window_sentences = [tokenized_sents[i] for i in (range(s_idx, e_idx))]
            replaced_sent = [annotation_map[w] if w in annotation_map else w for s in window_sentences for w in s]
        return replaced_sent 
Developer ID: patverga, Project: bran, Lines: 27, Source: labled_tsv_to_tfrecords_single_sentences.py

Example 14: get_feature_vectors_2

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def get_feature_vectors_2 (self, data_file):        

        print("Loading unlabeled data from file {}".format(data_file))
        with open(data_file, 'r') as f_data:                                    
            all_sentences_words = []
                 

            # Process all lines in the file
            for line in f_data:
                text = line.strip()                                

                #break the input text into sentences before tokenization
                sentences = sent_tokenize(text)
                
                for sent in sentences:
                    sentence_words = nltk.word_tokenize(sent)                             
                    all_sentences_words.append( tuple(sentence_words) )                                                           
        
        self.n_sentences_all = len(all_sentences_words)   
        print("number of unlabeled examples = {}".format(self.n_sentences_all))
        return self.create_feature_vectors(all_sentences_words)

    ##################################################
    #  get_feature_vectors_1  
    ################################################## 
Developer ID: Azure-Samples, Project: MachineLearningSamples-BiomedicalEntityExtraction, Lines: 27, Source: DataReader.py

Example 15: get_feature_vectors_1

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def get_feature_vectors_1 (self, data_list):        

        print("Reading unlabeled data from dataframe")   
        # list of list of tokens
        all_sentences_words = []           

        # Process all lines in the file
        for line in data_list:
            text = line.strip()                                

            #break the input text into sentences before tokenization
            sentences = sent_tokenize(text)
            
            for sent in sentences:
                sentence_words = nltk.word_tokenize(sent)                             
                all_sentences_words.append( tuple(sentence_words) )                                                                                             
        
        self.n_sentences_all = len(all_sentences_words)        
        print("number of unlabeled examples = {}".format(self.n_sentences_all))
        return self.create_feature_vectors(all_sentences_words)

    ################################################## 
    #   create_feature_vectors
    ################################################## 
Developer ID: Azure-Samples, Project: MachineLearningSamples-BiomedicalEntityExtraction, Lines: 26, Source: DataReader.py


Note: The nltk.tokenize.sent_tokenize method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers, and copyright of the source code belongs to the original authors. For distribution and use, please refer to the corresponding project's license; do not reproduce without permission.