This article collects typical usage examples of the Python method nltk.tokenize.sent_tokenize. If you are unsure what tokenize.sent_tokenize does, how to call it, or what real-world uses look like, the curated code samples below may help. You can also explore further usage examples for the module it belongs to, nltk.tokenize.
The following presents 15 code examples of tokenize.sent_tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
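Before the examples, here is a minimal usage sketch of sent_tokenize itself; it assumes the Punkt sentence model has already been downloaded (e.g. via nltk.download('punkt')).

# Minimal sketch: basic sentence splitting with sent_tokenize.
# Assumes the Punkt model is available (nltk.download('punkt')).
from nltk.tokenize import sent_tokenize

text = "NLTK splits text into sentences. It ships a pre-trained Punkt model. Abbreviations such as Dr. Smith are usually handled."
sentences = sent_tokenize(text)
print(len(sentences))   # typically 3
print(sentences[0])     # "NLTK splits text into sentences."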
Example 1: createCorpus
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def createCorpus(t):
    # st and wt are assumed to be aliases for nltk's sent_tokenize and word_tokenize
    corpus = []
    all_sent = []
    for k in t:
        for p in t[k]:
            corpus.append(st(p))
    for sent in range(len(corpus)):
        for k in corpus[sent]:
            all_sent.append(k)
    for m in range(len(all_sent)):
        all_sent[m] = wt(all_sent[m])
    all_words = []
    for sent in all_sent:
        hold = []
        for word in sent:
            hold.append(word.lower())
        all_words.append(hold)
    return all_words
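A hypothetical usage sketch for the function above: st and wt are assumed to be aliases for sent_tokenize and word_tokenize, and t a dict mapping document keys to lists of paragraph strings (both are assumptions, not shown in the original snippet).

# Hypothetical usage sketch; the aliases and input shape below are assumptions.
from nltk.tokenize import sent_tokenize as st, word_tokenize as wt

docs = {"doc1": ["First paragraph. It has two sentences.", "Second paragraph here."]}
print(createCorpus(docs))
# [['first', 'paragraph', '.'], ['it', 'has', 'two', 'sentences', '.'],
#  ['second', 'paragraph', 'here', '.']]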
Example 2: main
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def main():
    fce = convert_fce(args.fce_dataset_path)
    with open(args.output + "/fce-original.txt", 'w', encoding='utf-8') as out_original, \
            open(args.output + "/fce-applied.txt", 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    if re.search(r"[{>][.?!]$", tokenized_sents[i]):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    regexp = r'{([^{}]*?)=>([^{}]*?)}'
                    original = re.sub(regexp, r"\1", tokenized_sents[i])
                    applied = re.sub(regexp, r"\2", tokenized_sents[i])
                    # filter out nested alerts
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n")
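The core of the example above is the {original=>correction} markup used in the FCE data: substituting group 1 keeps the learner text, substituting group 2 applies the correction. A small self-contained sketch (the annotated sentence is made up):

# Sketch of the {original=>correction} substitution on a made-up annotated sentence.
import re

regexp = r'{([^{}]*?)=>([^{}]*?)}'
sent = "He {go=>went} to the {libary=>library} yesterday."
print(re.sub(regexp, r"\1", sent))  # He go to the libary yesterday.
print(re.sub(regexp, r"\2", sent))  # He went to the library yesterday.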
Example 3: find_abr_fullname
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def find_abr_fullname(doc, query, Num):
    """Find the full name of the query (an abbreviation) within the document.

    Parameters:
    doc: the document to be searched (in the specified format)
    query: the abbreviation
    Num: the number of sentences before the query in which to look for the full name
         (we assume the full name of the query appears before the query)
    """
    sents = [word_tokenize(t) for t in sent_tokenize(doc)]
    for i, sent in enumerate(sents):
        if query in sent:
            fullname = find_abr_fn(sent, query)
            if fullname != -1:
                return fullname
            else:
                j = 1
                while i - j >= 0 and j <= Num:
                    # look for the full name in the preceding sentences
                    if find_abr_fn(sents[i - j], query) == -1:
                        j += 1
                    else:
                        return find_abr_fn(sents[i - j], query)
    raise Exception('No query in the document.')
Example 4: paragraph_to_sentences
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def paragraph_to_sentences(paragraph, term):
    """
    Turns a paragraph into clean, preprocessed sentences
    """
    result = []
    paragraph = re.sub(r"([^ ])([\(\[\"])", r"\1 \2", paragraph)  # Give brackets space to breathe
    paragraph = re.sub(r"([\)\]\"\!\?:])([^ ])", r"\1 \2", paragraph)
    paragraph = re.sub(r"([^. ]{3})\.([^. ]{3}|A |An )", r"\1. \2", paragraph)
    paragraph = re.sub(r" e\.?g\.? ", " _eg_ ", paragraph)  # sent_tokenize improperly splits sentences here
    paragraph = re.sub(r" i\.?e\.? ", " _ie_ ", paragraph)
    sentences = sent_tokenize(paragraph)
    for sentence in sentences:
        sentence = sentence.replace("_eg_", "_e.g._").replace("_ie_", "i.e.")  # reverts edge case
        processed = preprocess_sentence(sentence, term)
        if qualify_sentence(processed):
            result.append(processed)
    return result
# Sentences
########################
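A short sketch of the masking trick in the example above: abbreviations such as "e.g." are replaced with a placeholder before sent_tokenize (which can otherwise split on the embedded period) and restored afterwards. The paragraph below is made up.

# Sketch of masking 'e.g.' before sentence splitting and restoring it afterwards.
import re
from nltk.tokenize import sent_tokenize

paragraph = "We tested several fruits, e.g. Apples were included. Bananas were not."
masked = re.sub(r" e\.?g\.? ", " _eg_ ", paragraph)
sentences = [s.replace("_eg_", "e.g.") for s in sent_tokenize(masked)]
print(sentences)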
Example 5: summarize
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def summarize(self, text, n):
    """
    Return a list of n sentences
    which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sent):
        for w in sent:
            if w in self._freq:
                ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]
Example 6: _convert_files_to_binary
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def _convert_files_to_binary(input_filenames, output_filename):
    with open(output_filename, 'wb') as writer:
        for filename in input_filenames:
            with open(filename, 'r') as f:
                document = f.read()

            document_parts = document.split('\n', 1)
            assert len(document_parts) == 2
            title = '<d><p><s>' + document_parts[0] + '</s></p></d>'
            # Note: the .decode('utf8')/.encode('utf8') calls indicate Python 2-style I/O,
            # where open(..., 'r') yields byte strings.
            body = document_parts[1].decode('utf8').replace('\n', ' ').replace('\t', ' ')
            sentences = sent_tokenize(body)
            body = '<d><p>' + ' '.join(['<s>' + sentence + '</s>' for sentence in sentences]) + '</p></d>'
            body = body.encode('utf8')
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend([body])
            tf_example.features.feature['abstract'].bytes_list.value.extend([title])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
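For completeness, a hedged sketch of how the length-prefixed records written above could be read back; the 'q' length prefix matches the struct.pack calls in the example, but this reader is an assumption and not part of the original project.

# Sketch: iterate over the length-prefixed serialized tf.Example records
# produced by _convert_files_to_binary.
import struct

def read_binary_records(path):
    len_size = struct.calcsize('q')
    with open(path, 'rb') as reader:
        while True:
            len_bytes = reader.read(len_size)
            if not len_bytes:
                break
            str_len = struct.unpack('q', len_bytes)[0]
            yield struct.unpack('%ds' % str_len, reader.read(str_len))[0]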
Example 7: doc_to_ids
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def doc_to_ids(self, doc, training=True):
    l = []
    words = dict()
    window = 150
    # doc = doc.replace("–", " ")
    # doc = sent_tokenize(doc)
    for sentence in doc:
        miniArray = []
        for term in sentence:
            id = self.term_to_id(term, training)
            if id is not None:
                miniArray.append(id)
                if id not in words:
                    words[id] = 1
                    self.docfreq[id] += 1
        if not len(miniArray):
            continue
        if len(miniArray) > window:
            l.extend([np.array(miniArray[i:i + window]) for i in xrange(0, len(miniArray), window)])
        else:
            l.append(np.array(miniArray))
    return l
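The windowing at the end of the example above simply chops the id sequence into chunks of at most 150 tokens (xrange marks the snippet as Python 2; range does the same in Python 3). A tiny sketch with made-up ids:

# Sketch of the chunking logic: split a long id list into windows of at most 150.
import numpy as np

ids = list(range(400))
window = 150
chunks = [np.array(ids[i:i + window]) for i in range(0, len(ids), window)]
print([len(c) for c in chunks])  # [150, 150, 100]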
Example 8: split_into_sentences
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def split_into_sentences(text, doc_annotations, tokenizer):
    """Split into sentences and return bookkeeping info."""
    sentences = []
    sentences_starts = []
    sentence_annotations = []
    doc_annotations = sorted(doc_annotations, key=lambda x: x[2])
    annotation_idx = 0
    sentences_text = tokenize.sent_tokenize(text)
    token_idx = 0
    for sentence_text in sentences_text:
        sub_tokens, word_starts = tokenizer.tokenize(sentence_text)
        sentences.append(sub_tokens)
        sentences_starts.append(word_starts)
        sentence_annotations.append([])
        token_idx += len(sentence_text.split(" "))
        while annotation_idx < len(
            doc_annotations) and doc_annotations[annotation_idx][2] < token_idx:
            sentence_annotations[-1].append(doc_annotations[annotation_idx])
            annotation_idx += 1
    return sentences, sentences_starts, sentence_annotations
Example 9: score_sentences
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def score_sentences(query,
                    doc_json,
                    entity,
                    sentence_scores,
                    max_sentence_len,
                    n=3):
    """Score sentences with respect to the query."""
    sentences = tokenize.sent_tokenize(doc_json['text'])
    query_ngrams = util.get_ngrams(tokenize.word_tokenize(query), n)
    for sentence in sentences:
        sentence_tokens = tokenize.word_tokenize(sentence)
        tokens = tokenize.word_tokenize(
            entity['wikipedia_name']) + [':'] + sentence_tokens[:max_sentence_len]
        sentence_ngrams = util.get_ngrams(tokens, n)
        score = len(set(sentence_ngrams).intersection(query_ngrams)) / max(
            1, len(query_ngrams))
        sentence_scores.append(((entity, sentence_tokens), score))
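The score above is the fraction of query n-grams that also occur in the entity-prefixed sentence. util.get_ngrams is not shown in the snippet; the get_ngrams below is a hypothetical stand-in that returns the contiguous n-grams of a token list.

# Hedged sketch of the n-gram overlap score; get_ngrams is an assumed helper.
from nltk.tokenize import word_tokenize

def get_ngrams(tokens, n):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

n = 3
query = "capital of France"
sentence = "Paris is the capital of France."
query_ngrams = get_ngrams(word_tokenize(query), n)
sentence_ngrams = get_ngrams(word_tokenize(sentence), n)
score = len(set(sentence_ngrams).intersection(query_ngrams)) / max(1, len(query_ngrams))
print(score)  # 1.0 -- the single query trigram also appears in the sentence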
Example 10: extractFeatures
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def extractFeatures(self, article, n, customStopWords=None):
    # pass in article as a tuple (text, title)
    text = article[0]
    # extract the text
    title = article[1]
    # extract the title
    sentences = sent_tokenize(text)
    # split text into sentences
    word_sent = [word_tokenize(a.lower()) for a in sentences]
    # split sentences into words
    self._freq = self._compute_frequencies(word_sent, customStopWords)
    # calculate word freq using the member func created above
    if n < 0:
        # how many features (words) to return - a negative number means
        # no feature (word) selection, just return all features
        return nlargest(len(self._freq.keys()),
                        self._freq, key=self._freq.get)
    else:
        # if the calling func has asked for a subset,
        # return only the 'n' largest features, i.e. the
        # most important words (important == frequent, excluding stopwords)
        return nlargest(n, self._freq, key=self._freq.get)
Example 11: summarize
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def summarize(self, article, n):
    # article is a (text, title) tuple, as in the previous example
    text = article[0]
    title = article[1]
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sentence in enumerate(word_sent):
        for word in sentence:
            if word in self._freq:
                ranking[i] += self._freq[word]
    sentences_index = nlargest(n, ranking, key=ranking.get)
    return [sentences[j] for j in sentences_index]
##############################################################################
# TEST
Example 12: offset_tokenize
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        scaped_tok = re.escape(tok)
        m = re.search(scaped_tok, tail)
        start, end = m.span()
        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the rest
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
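A hypothetical usage sketch for offset_tokenize (the imports below are what the snippet assumes); each token is returned together with its character span in the original string.

# Hypothetical usage sketch; offset_tokenize itself relies on these imports.
import re
from nltk.tokenize import sent_tokenize, word_tokenize

print(offset_tokenize("Hello world. Bye."))
# [('Hello', (0, 5)), ('world', (6, 11)), ('.', (11, 12)),
#  ('Bye', (13, 16)), ('.', (16, 17))]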
Example 13: convert_to_single_sentence
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def convert_to_single_sentence(doc_str, e1_start, e1_end, e2_start, e2_end, annotation_map):
    offsets = zip(e1_start + e2_start, e1_end + e2_end, [1] * len(e1_start) + [2] * len(e2_start))
    offsets = sorted(offsets, key=lambda tup: tup[0])
    replaced_doc_str = [process_single_annotation(doc_str, 0, s, e, annotation_map, i, ent_id) if i == 0
                        else process_single_annotation(doc_str, offsets[i - 1][1], s, e, annotation_map, i, ent_id)
                        for i, (s, e, ent_id) in enumerate(offsets)]
    replaced_doc_str.append(' '.join(doc_str[offsets[-1][1]:]))
    new_doc_str = ''.join(replaced_doc_str)

    ## TODO: only works for data with a single e1 and e2 mention
    sentences = sent_tokenize(new_doc_str.replace('@@ ', '').decode('utf-8'))
    tokenized_sents = [tokenize(s) for s in sentences]
    chosen_sent = [i for i, s in enumerate(sentences) if s.count(ENTITY_STRING) >= 2]
    if chosen_sent:
        if FLAGS.full_abstract:
            replaced_sent = [annotation_map[w] if w in annotation_map else w for s in tokenized_sents for w in s]
        else:
            idx = chosen_sent[0]
            s_idx = max(0, idx - FLAGS.sentence_window)
            e_idx = min(idx + FLAGS.sentence_window + 1, len(tokenized_sents))
            window_sentences = [tokenized_sents[i] for i in range(s_idx, e_idx)]
            replaced_sent = [annotation_map[w] if w in annotation_map else w for s in window_sentences for w in s]
        return replaced_sent
Example 14: get_feature_vectors_2
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def get_feature_vectors_2(self, data_file):
    print("Loading unlabeled data from file {}".format(data_file))
    with open(data_file, 'r') as f_data:
        all_sentences_words = []
        # Process all lines in the file
        for line in f_data:
            text = line.strip()
            # break the input text into sentences before tokenization
            sentences = sent_tokenize(text)
            for sent in sentences:
                sentence_words = nltk.word_tokenize(sent)
                all_sentences_words.append(tuple(sentence_words))
    self.n_sentences_all = len(all_sentences_words)
    print("number of unlabeled examples = {}".format(self.n_sentences_all))
    return self.create_feature_vectors(all_sentences_words)
##################################################
# get_feature_vectors_1
##################################################
Developer: Azure-Samples; Project: MachineLearningSamples-BiomedicalEntityExtraction; Lines of code: 27; Source file: DataReader.py
Example 15: get_feature_vectors_1
# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import sent_tokenize [as alias]
def get_feature_vectors_1(self, data_list):
    print("Reading unlabeled data from dataframe")
    # list of lists of tokens
    all_sentences_words = []
    # Process all lines in the list
    for line in data_list:
        text = line.strip()
        # break the input text into sentences before tokenization
        sentences = sent_tokenize(text)
        for sent in sentences:
            sentence_words = nltk.word_tokenize(sent)
            all_sentences_words.append(tuple(sentence_words))
    self.n_sentences_all = len(all_sentences_words)
    print("number of unlabeled examples = {}".format(self.n_sentences_all))
    return self.create_feature_vectors(all_sentences_words)
##################################################
# create_feature_vectors
##################################################
Developer: Azure-Samples; Project: MachineLearningSamples-BiomedicalEntityExtraction; Lines of code: 26; Source file: DataReader.py