This article collects typical usage examples of the nltk.sent_tokenize method in Python. If you have been wondering what exactly nltk.sent_tokenize does, how to call it, and what real-world examples look like, the curated code examples below may help. You can also explore further usage examples from the nltk module that this method belongs to.
A total of 15 code examples of nltk.sent_tokenize are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
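Before diving into the examples, here is a minimal standalone sketch of nltk.sent_tokenize itself. It assumes the Punkt sentence tokenizer data has been downloaded (depending on your NLTK version the resource is named 'punkt' or 'punkt_tab'; the download call is only needed once per environment).

import nltk

nltk.download('punkt')  # one-time download of the Punkt sentence tokenizer data

text = "NLTK makes sentence splitting easy. It ships with the Punkt tokenizer. Try it out!"
print(nltk.sent_tokenize(text))
# ['NLTK makes sentence splitting easy.', 'It ships with the Punkt tokenizer.', 'Try it out!']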
Example 1: prepareSents
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def prepareSents(wrds):
    # concat_words, strip_chars and match_words are helper functions defined elsewhere in the source module
    valid_sents=[]
    text=''.join(wrd[0] for wrd in wrds)
    sent_list=[[(word,0,'None') for word in sent] for sent in sent_tokenize(text)]
    text=[word for word in wrds if word[0]!=' ']
    sent_list=[[word for word in concat_words(strip_chars(sent)) if word[0]!=' '] for sent in sent_list]
    idx=0
    s_idx=0
    while idx < len(text) and s_idx<len(sent_list):
        if not match_words(sent_list[s_idx],text[idx:idx+len(sent_list[s_idx])]):
            print("NLTK:" + str(sent_list[s_idx]))
            print('MINE:' + str(text[idx:idx+len(sent_list[s_idx])]))
        else:
            valid_sents+=[text[idx:idx+len(sent_list[s_idx])]]
        idx=idx+len(sent_list[s_idx])
        s_idx+=1
    return valid_sents
Example 2: get_story_question_answer_triples
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        # decode back to str so the string operations below also work on Python 3
        line = line.strip().decode("utf8").encode("ascii", "ignore").decode("ascii")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
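The function above expects a tab-separated file with one story, question, answer and correctness flag per line, and skips lines starting with "#". A hypothetical input line and call might look like the following sketch (the file name is an assumption):

# sqa.tsv (hypothetical, tab-separated):
# The cat sat on the mat. It purred.<TAB>Where did the cat sit?<TAB>on the mat<TAB>1

triples = get_story_question_answer_triples("sqa.tsv")
swords, qwords, awords, is_correct = triples[0]
print(qwords, awords, is_correct)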
Example 3: parse_sents
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def parse_sents(self, sents):
    """
    Parse multiple sentences
    If "sents" is a string, it will be segmented into sentences using NLTK.
    Otherwise, each element of "sents" will be treated as a sentence.
    sents (str or Iterable[str] or Iterable[List[str]]): sentences to parse
    Returns: Iter[nltk.Tree]
    """
    if isinstance(sents, STRING_TYPES):
        if self._tokenizer_lang is None:
            raise ValueError(
                "No tokenizer available for this language. "
                "Please split into individual sentences and tokens "
                "before calling the parser."
            )
        sents = nltk.sent_tokenize(sents, self._tokenizer_lang)
    for parse_raw, tags_raw, sentence in self._batched_parsed_raw(self._nltk_process_sents(sents)):
        yield self._make_nltk_tree(sentence, tags_raw, *parse_raw)
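This is a method of a constituency parser class, so calling it requires an instance of that class. A minimal usage sketch, assuming such a parser object has already been constructed (the variable name parser is an assumption):

# hypothetical: `parser` is an instance of the class that defines parse_sents above
text = "The parser segments this text. Then it parses each sentence."
for tree in parser.parse_sents(text):
    tree.pretty_print()  # each yielded item is an nltk.Tree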
Example 4: preprocess
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def preprocess(html):
    """
    Returns a preprocessed document consisting of a list of paragraphs, which
    is a list of sentences, which is a list of tuples, where each tuple is a
    (token, part of speech) pair.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e)))
Example 5: extract_nnp_phrases
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def extract_nnp_phrases(text):
    """
    NNP extractor convenience method.
    :param text:
    :return:
    """
    phrase_list = []
    for sentence in nltk.sent_tokenize(text):
        # Get POS
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)
        # Collect runs of proper nouns into phrases
        phrase = []
        for t, p in pos:
            if p in ["NNP", "NNPS"] or t in [",", "&"]:
                phrase.append(t)
            else:
                if len(phrase) > 1:
                    phrase_list.append(clean_nnp_phrase(phrase))
                phrase = []
    return phrase_list
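A hypothetical call is sketched below. clean_nnp_phrase is a helper defined elsewhere in the source project; here it is stubbed to simply join the tokens so the sketch is self-contained:

def clean_nnp_phrase(phrase):
    # stand-in for the project's real helper, which may do additional cleanup
    return " ".join(phrase)

print(extract_nnp_phrases("Barack Obama met Angela Merkel in Berlin last Tuesday."))
# e.g. ['Barack Obama', 'Angela Merkel']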
Example 6: clean_thread_conversations
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def clean_thread_conversations(sub_str):
    conversations = []
    for mon in ['07', '08', '09', '10', '11', '12']:
        with open('datasets/raw_reddit/reddit_{}_{}_18threads.json'.format(sub_str, mon)) as f:
            data = json.load(f)
        for thread in data:
            new_convo = {}
            new_convo['lines'] = []
            speaker = 0
            for msg in thread:
                text = clean_post(msg['text'])
                if len(text) > 1:
                    sentences = nltk.sent_tokenize(text)
                    for sent in sentences:
                        sent_dict = {}
                        sent_dict['character'] = speaker
                        sent_dict['text'] = sent
                        new_convo['lines'].append(sent_dict)
                    speaker = 1 - speaker
            if len(new_convo['lines']) > 1:
                conversations.append(new_convo)
    return conversations
Example 7: sentence_splitting
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def sentence_splitting(texts, slen=1):
    if len(texts) <= 0:
        return []
    # splitting
    sentences = []
    text_sents = sent_tokenize(texts)
    if (text_sents != [''] and len(text_sents) > 0):
        for sent in text_sents:
            sent = sent.strip().split('\r')  # split strings that contain "\r"
            for sen in sent:
                se = sen.split('. ')
                for s in se:
                    if (NLP_word.words_counting(s) >= slen):
                        sentences.append(s)
    return sentences
# splitting text into Sentences using NLTK tokenization
Example 8: flesch_kincaid_reading_ease
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def flesch_kincaid_reading_ease(text, token_count):
    """
    Takes a text and returns its FK Reading Ease
    :param text: A string text
    :param token_count: the number of tokens in the text
    :return: FK Reading Ease
    """
    # Partly extracted from textstat 0.3.1, which is only available for Python 2 (https://github.com/shivam5992/textstat)
    def avg_syllables_per_word(text, token_count):
        syllable = syllable_count(text)
        if token_count > 0:
            return float(syllable) / float(token_count)
        else:
            return 0

    if len(nltk.sent_tokenize(text)) <= 0 or token_count <= 0:
        return 0
    ASL = float(token_count / len(nltk.sent_tokenize(text)))  # avg sentence length
    ASW = avg_syllables_per_word(text, token_count)
    FKRA = 206.835 - float(1.015 * ASL) - float(84.6 * ASW)
    return FKRA
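A hypothetical call is shown below. syllable_count is defined elsewhere in the source project, so this sketch substitutes a very rough vowel-group approximation purely to be self-contained:

import re
import nltk

def syllable_count(text):
    # crude stand-in: count vowel groups per alphabetic word
    words = [w for w in nltk.word_tokenize(text) if w.isalpha()]
    return sum(max(1, len(re.findall(r'[aeiouy]+', w.lower()))) for w in words)

sample = "The quick brown fox jumps over the lazy dog. It was not amused."
tokens = nltk.word_tokenize(sample)
print(flesch_kincaid_reading_ease(sample, len(tokens)))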
Example 9: parse_gender
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def parse_gender(text):
    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
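count_gender is defined elsewhere in the source project (it tallies gendered words and sentences). The sketch below stubs it with a deliberately simplified pronoun count, purely so the example runs end to end:

MALE_WORDS = {"he", "him", "his"}
FEMALE_WORDS = {"she", "her", "hers"}

def count_gender(sentences):
    # simplified stand-in for the project's real count_gender helper
    sents = {"male": 0, "female": 0}
    words = {"male": 0, "female": 0}
    for sentence in sentences:
        m = sum(1 for w in sentence if w in MALE_WORDS)
        f = sum(1 for w in sentence if w in FEMALE_WORDS)
        words["male"] += m
        words["female"] += f
        sents["male"] += 1 if m else 0
        sents["female"] += 1 if f else 0
    return sents, words

parse_gender("She wrote the report. He reviewed it and sent his notes.")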
Example 10: __init__
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def __init__(self, text, wordlistfolder):
    self.text = text
    self.tokens = nltk.word_tokenize(text)
    self.sentenses = nltk.sent_tokenize(text)
    self.tags = nltk.pos_tag(self.tokens)

    self.featspace = []

    self.psykfeatspace(self.featspace, wordlistfolder)
    self.bigrams(self.featspace)
    self.number_count(self.featspace)
    self.punc_count(self.featspace)
    self.big_word_count(self.featspace)
    self.words_per_sentence(self.featspace)
    self.sentence_count(self.featspace)
    self.countPOS(self.featspace, 'CC')
    self.countPOS(self.featspace, 'NP')
    self.countPOS(self.featspace, 'NNP')
    self.words(self.featspace)
    self.stem(self.featspace)

# Counts a specific POS tag
Example 11: _from_text
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def _from_text(self, text, unit, tokenizer, language):
    if tokenizer is not None:
        if isinstance(tokenizer, str):
            tokens = re.findall(tokenizer, text)
        else:
            tokens = tokenizer.tokenize(text)
    else:
        import nltk

        @requires_nltk_corpus
        def tokenize_text(text):
            if unit == 'word':
                return nltk.word_tokenize(text, language)
            elif unit.startswith('sent'):
                return nltk.sent_tokenize(text, language)
            else:
                raise ValueError(
                    "unit must be either 'word' or 'sentence'")

        tokens = tokenize_text(text)

    for i, t in enumerate(tokens):
        self._elements.append(TextStim(text=t, onset=None, duration=None,
                                       order=i))
Example 12: parse_sgm
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def parse_sgm(self, sgm_path):
    with open(sgm_path, 'r') as f:
        soup = BeautifulSoup(f.read(), features='html.parser')
        self.sgm_text = soup.text

        doc_type = soup.doc.doctype.text.strip()

        def remove_tags(selector):
            tags = soup.findAll(selector)
            for tag in tags:
                tag.extract()

        if doc_type == 'WEB TEXT':
            remove_tags('poster')
            remove_tags('postdate')
            remove_tags('subject')
        elif doc_type in ['CONVERSATION', 'STORY']:
            remove_tags('speaker')

        sents = []
        converted_text = soup.text

        for sent in nltk.sent_tokenize(converted_text):
            sents.extend(sent.split('\n\n'))
        sents = list(filter(lambda x: len(x) > 5, sents))
        sents = sents[1:]

        sents_with_pos = []
        last_pos = 0
        for sent in sents:
            pos = self.sgm_text.find(sent, last_pos)
            last_pos = pos
            sents_with_pos.append({
                'text': sent,
                'position': [pos, pos + len(sent)]
            })

        return sents_with_pos
Example 13: tokenize_sentences
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def tokenize_sentences(targets):
    while True:
        text = (yield)  # (yield) gets an item from an upstream step
        sentences = nltk.sent_tokenize(text)

        for sentence in sentences:
            for target in targets:
                target.send(sentence)  # send() sends data downstream
Author: PacktPublishing, Project: Hands-on-NLP-with-NLTK-and-scikit-learn-, Lines of code: 9, Source file: nlp-6.1-nlp-pipeline.py
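tokenize_sentences is a generator-based coroutine stage: it receives whole texts via send(), splits them into sentences, and pushes each sentence to every downstream target. A minimal wiring sketch; the printer sink and the priming calls are assumptions, not part of the original pipeline:

import nltk

def printer():
    # hypothetical downstream sink that just prints whatever it receives
    while True:
        sentence = (yield)
        print("SENTENCE:", sentence)

sink = printer()
next(sink)  # prime the coroutine so it advances to its first (yield)
splitter = tokenize_sentences(targets=[sink])
next(splitter)

splitter.send("Coroutines form a pipeline. Each stage pushes data downstream.")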
Example 14: extract_wiki_sentences
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def extract_wiki_sentences(title, text, n_sentences, replace_title_mentions=''):
    """
    Extracts the first n_sentences sentences from the text of a wikipedia page corresponding to the title.
    strip_title_mentions and replace_title_mentions control handling of references to the title in text.
    Oftentimes QA models learn *not* to answer entities mentioned in the question so this helps deal with this
    in the domain adaptation case.
    :param title: title of page
    :param text: text of page
    :param n_sentences: number of sentences to use
    :param replace_title_mentions: Replace mentions with the provided string token, by default removing them
    :return:
    """
    # Get simplest representation of title and text
    title = unidecode(title).replace('_', ' ')
    text = unidecode(text)

    # Split on non-alphanumeric
    title_words = re.split('[^a-zA-Z0-9]', title)
    title_word_pattern = '|'.join(re.escape(w.lower()) for w in title_words)

    # Breaking by newline yields paragraphs. Ignore the first since it's always just the title
    paragraphs = [p for p in text.split('\n') if len(p) != 0][1:]
    sentences = []
    for p in paragraphs:
        formatted_text = re.sub(title_word_pattern, replace_title_mentions, p, flags=re.IGNORECASE)
        # Cleanup whitespace
        formatted_text = re.sub(r'\s+', ' ', formatted_text).strip()
        sentences.extend(nltk.sent_tokenize(formatted_text))

    return sentences[:n_sentences]
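A hypothetical call with a stand-in page text (the module is assumed to import re, nltk and unidecode, as the code above implies):

page_text = (
    "Albert Einstein\n"
    "Albert Einstein was a theoretical physicist. He developed the theory of relativity.\n"
    "He was born in Ulm."
)
sents = extract_wiki_sentences("Albert_Einstein", page_text, n_sentences=2,
                               replace_title_mentions='<ENT>')
print(sents)
# e.g. ['<ENT> <ENT> was a theoretical physicist.', 'He developed the theory of relativity.']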
Example 15: tokenize_train
# Required module import: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def tokenize_train(train_directory, tokenized_directory):
    with codecs.open(train_directory, "r", "utf-8") as file:
        with codecs.open(tokenized_directory, "w", "utf-8") as writer:
            new_sens = []
            for line in file:
                sentences = sent_tokenize(line.strip())
                for sen in sentences:
                    sen = word_tokenize(sen.lower())
                    new_sen = ' '.join(sen)
                    new_sens.append(new_sen)
                    writer.write(new_sen)
                    writer.write("\n")

    sentences = gensim.models.word2vec.LineSentence(tokenized_directory)
    return sentences
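The returned LineSentence object streams the tokenized file line by line, so it can be fed straight into gensim's Word2Vec. A usage sketch; the file paths are assumptions and the parameter names follow gensim 4.x (older versions use size instead of vector_size):

import gensim

sentences = tokenize_train("data/train.txt", "data/train.tokenized.txt")
model = gensim.models.Word2Vec(sentences, vector_size=100, window=5, min_count=1)
print(len(model.wv))  # size of the learned vocabulary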