This article collects typical usage examples of the Python method nltk.tokenize.punkt.PunktSentenceTokenizer.sentences_from_text. If you have been wondering what PunktSentenceTokenizer.sentences_from_text does or how to call it, the curated code examples below should help. You can also read more about the containing class, nltk.tokenize.punkt.PunktSentenceTokenizer.
Below are 14 code examples of PunktSentenceTokenizer.sentences_from_text, sorted by popularity by default.
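Before the project excerpts, here is a minimal, self-contained sketch of the basic call pattern (the sample string is made up; a tokenizer built with no training text simply uses the default Punkt parameters). sentences_from_text returns a list of sentence strings, so it can be used much like tokenize:

from nltk.tokenize.punkt import PunktSentenceTokenizer

tokenizer = PunktSentenceTokenizer()  # no training text: default Punkt parameters
text = "This is the first sentence. Here is a second one! Is this the third?"
for sentence in tokenizer.sentences_from_text(text, realign_boundaries=True):
    print(sentence)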
Example 1: sentence_tokenizer
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def sentence_tokenizer(self, untokenized_string, language):
    """Read the language-specific .pickle tokenizer model and split the string into sentences."""
    if language == 'greek':
        pickle_path = os.path.expanduser('~/cltk_data/greek/cltk_linguistic_data/tokenizers/sentence/greek.pickle')
        # Note: sent_end_chars / internal_punctuation are set on the PunktLanguageVars class itself
        language_punkt_vars = PunktLanguageVars
        language_punkt_vars.sent_end_chars = ('.', ';')
        language_punkt_vars.internal_punctuation = (',', '·')
    elif language == 'latin':
        pickle_path = os.path.expanduser('~/cltk_data/latin/cltk_linguistic_data/tokenizers/sentence/latin.pickle')
        language_punkt_vars = PunktLanguageVars
        language_punkt_vars.sent_end_chars = ('.', '?', ':')
        language_punkt_vars.internal_punctuation = (',', ';')
    else:
        print("No sentence tokenizer for this language available.")
        return []
    with open(pickle_path, 'rb') as open_pickle:
        tokenizer = pickle.load(open_pickle)
    tokenizer.INCLUDE_ALL_COLLOCS = True
    tokenizer.INCLUDE_ABBREV_COLLOCS = True
    params = tokenizer.get_params()
    sbd = PunktSentenceTokenizer(params)
    tokenized_sentences = []
    for sentence in sbd.sentences_from_text(untokenized_string,
                                            realign_boundaries=True):
        tokenized_sentences.append(sentence)
    return tokenized_sentences
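The pickle loaded above is a pre-trained Punkt model distributed with CLTK; the same get_params() / PunktSentenceTokenizer(params) hand-off can be reproduced without a pickle by training on your own corpus. A rough sketch (the corpus path is hypothetical):

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

with open('my_corpus.txt', encoding='utf-8') as f:   # hypothetical training corpus
    training_text = f.read()

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True      # consider all period-final word pairs as collocation candidates
trainer.INCLUDE_ABBREV_COLLOCS = True   # also pairs whose first word is an abbreviation
trainer.train(training_text)

sbd = PunktSentenceTokenizer(trainer.get_params())
sentences = sbd.sentences_from_text("Some unseen text. Another sentence.",
                                    realign_boundaries=True)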
Example 2: featureize
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def featureize(F, observation_files):
    word_tokenizer = PunktSentenceTokenizer()   # unused here; token counts use str.split() below
    sent_tokenizer = PunktSentenceTokenizer()
    m = len(observation_files)
    # X is an m x 2 feature matrix
    X = np.zeros((m, 2), dtype=float)
    for (i, filename) in enumerate(observation_files, start=0):
        file_text = read_file(filename).decode('string_escape')  # Python 2 byte-string codec
        try:
            num_sents = len(sent_tokenizer.sentences_from_text(file_text))
        except UnicodeDecodeError:
            num_sents = 2
        #num_tokens = len(word_tokenize(file_text))
        num_tokens = len(file_text.split())
        # Two features per file:
        #   column 0 - number of sentences
        #   column 1 - number of whitespace-separated tokens
        X[i][0] = num_sents
        X[i][1] = num_tokens
    return X
Example 3: tokenize
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def tokenize(self):
    """
    Return a list of tokenized sentences.
    """
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.sentences_from_text(self.text)
    sentences = [sentence.split() for sentence in sentences]
    sentences = [[word.strip(",.?!") for word in sentence] for sentence in sentences]
    return sentences
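For orientation, here is roughly the same pipeline outside a class, with a made-up input string; the exact sentence splits depend on the default Punkt parameters, so the commented output is illustrative only:

from nltk.tokenize.punkt import PunktSentenceTokenizer

text = "Punkt splits sentences. Each sentence becomes a list of words, stripped of trailing punctuation."
sentences = PunktSentenceTokenizer().sentences_from_text(text)
tokens = [[word.strip(",.?!") for word in sentence.split()] for sentence in sentences]
print(tokens)
# Roughly: [['Punkt', 'splits', 'sentences'],
#           ['Each', 'sentence', 'becomes', 'a', 'list', 'of', 'words',
#            'stripped', 'of', 'trailing', 'punctuation']]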
Example 4: preprocess_doc
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def preprocess_doc(doc):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.sentences_from_text(doc)
    tokens = []
    for sentence in sentences:
        #sentence1 = sentence.split()
        sentence1 = neg_scope(sentence)
        tokens.extend(w for w in sentence1 if w.lower() not in stopwords.words("english"))
    # Strip a trailing period left on any token
    for ii in range(len(tokens)):
        if tokens[ii][-1] == '.':
            tokens[ii] = tokens[ii][:-1]
    return tokens
Example 5: GCBlockExtractor
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
class GCBlockExtractor(ExtractionMapper):

    def __init__(self):
        super(GCBlockExtractor, self).__init__(
            extraction_function=self._blocks_from_text)
        self.tokenizer = PunktSentenceTokenizer()

    def _blocks_from_text(self, page):
        blocks = []
        for sentence in self.tokenizer.sentences_from_text(
                page.text.replace('\n', '')):
            if sentence.strip():
                blocks.append(len(sentence))
        # maybe count tokens? or non-spaces?
        return blocks
Example 6: tokenize_sents_latin
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def tokenize_sents_latin(sentences_string):
    """Tokenize a Latin string into sentences."""
    global tokenenized_sentences
    pickle_name = 'latin.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_latin/', pickle_name)
    with open(pickle_path, 'rb') as f:
        train_data = pickle.load(f)
    train_data.INCLUDE_ALL_COLLOCS = True
    train_data.INCLUDE_ABBREV_COLLOCS = True
    params = train_data.get_params()
    sbd = PunktSentenceTokenizer(params)
    tokenenized_sentences = []
    for sentence in sbd.sentences_from_text(sentences_string, realign_boundaries=True):
        tokenenized_sentences.append(sentence)
    #print(tokenenized_sentences)
    return tokenenized_sentences
Example 7: raw_records
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def raw_records(crawl_collection, start):
    # Prepare a naive sentence tokeniser utility
    pst = PunktSentenceTokenizer()
    for rec in crawl_collection.query({'downloaded': True}, field=None, skip=start):
        _id = rec['_id']
        if rec['content'] is None:
            continue
        content = rec['content']['contents']
        # A wiki page may comprise multiple content blocks
        for c in content:
            # Explode a long topic into a list of sentences
            sentences = pst.sentences_from_text(c)
            for s in sentences:
                yield (_id, s)
Example 8: add_sents
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def add_sents(invid=None):
    if invid:
        findObj = {"_id": invid}
    else:
        findObj = {}
    for vd in vdigests.find(findObj):
        if not vd.get("nSentences") and vd.get('alignTrans') and vd.get('alignTrans').get('words'):
            twords = vd['alignTrans']['words']
            twords_len = len(twords)
            trans = " ".join([wrd["word"] for wrd in twords])
            STokenizer = PunktSentenceTokenizer()
            token_sents = STokenizer.sentences_from_text(trans)
            cwct = 0
            sentct = 0
            curword = twords[cwct]
            for tsent in token_sents:
                tswords = tsent.split(" ")
                for wnum, tsword in enumerate(tswords):
                    if tsword == curword["word"]:
                        curword["sentenceNumber"] = sentct
                        cwct += 1
                        if cwct < twords_len:
                            curword = twords[cwct]
                    else:
                        print("warning: not a one-to-one match: ", curword["word"], tsword)
                        if wnum == 0:
                            curword["sentenceNumber"] = sentct - 1
                            cwct += 1
                            if cwct < twords_len:
                                curword = twords[cwct]
                        elif wnum == len(tswords) - 1:
                            curword["sentenceNumber"] = sentct
                        else:
                            ipdb.set_trace()
                sentct += 1
            vd['nSentences'] = len(token_sents)
            # write the separated sentences to file
            ssout_name = "ss-" + vd["_id"]
            outf = open("../ffdata/rawtrans/" + ssout_name, 'w')
            outf.write("\n".join(token_sents))
            outf.close()
            vd['sentSepTransName'] = ssout_name
            vdigests.save(vd)
Example 9: tokenize_greek_sentences
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def tokenize_greek_sentences(sentences_string):
    global tokenenized_sentences
    pickle_name = 'greek.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_greek/', pickle_name)
    with open(pickle_path, 'rb') as f:
        train_data = pickle.load(f)
    train_data.INCLUDE_ALL_COLLOCS = True
    train_data.INCLUDE_ABBREV_COLLOCS = True
    params = train_data.get_params()
    sbd = PunktSentenceTokenizer(params)
    '''
    with open(input_file) as f:
        to_be_tokenized = f.read()
    '''
    tokenenized_sentences = []
    for sentence in sbd.sentences_from_text(sentences_string, realign_boundaries=True):
        tokenenized_sentences.append(sentence)
    #print(tokenenized_sentences)
    return tokenenized_sentences
Example 10: export_crawl_to_text
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def export_crawl_to_text(mineDB):
    # Prepare a naive sentence tokeniser utility
    pst = PunktSentenceTokenizer()
    text_path = os.path.realpath('./mine.txt')
    with codecs.open(text_path, 'w', 'utf-8') as f:
        m = 0
        for wiki in mineDB.query({'downloaded': True}, field=None):
            # Skip empty content or records already added to the graph
            if wiki['content'] is None or 'added_to_graph' in wiki:
                continue
            content = wiki['content']
            # A wiki page may comprise multiple content blocks
            for c in content['contents']:
                # Explode content into sentences
                sentences = pst.sentences_from_text(c)
                print('... content #{} ==> {} sentences extracted.'.format(m, len(sentences)))
                for s in sentences:
                    # Cleanse the sentence
                    s_ = cleanse(s)
                    # Filter out noise by length
                    if len(s_) < 5 or len(s_.split(' ')) < 3:
                        continue
                    f.write(s_.lower() + '\n')
            m += 1
            if m >= args['limit']:
                print(colored('[Ending] Maximum number of topics reached.', 'yellow'))
                break
    return text_path
Example 11: GaleChurchAlignmentDistance
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
class GaleChurchAlignmentDistance(DistanceScorer):

    def __init__(self):
        self.name = "Gale Church Alignment Scorer"
        self.tokenizer = PunktSentenceTokenizer()
        self.sblocks, self.tblocks = [], []

    def _blocks_from_text(self, text):
        blocks = []
        for sentence in self.tokenizer.sentences_from_text(
                text.replace('\n', '')):
            blocks.append(len(sentence))
        # maybe count tokens? or non-spaces?
        return blocks

    def _extract(self, source_corpus, target_corpus):
        for url, page in source_corpus.iteritems():
            self.sblocks.append(self._blocks_from_text(page.text))
        for url, page in target_corpus.iteritems():
            self.tblocks.append(self._blocks_from_text(page.text))

    def _score_pair(self, s_idx, s_page, t_idx, t_page):
        # self.gc (the Gale-Church aligner) is expected to be provided
        # elsewhere in the surrounding project.
        return self.gc.align_score(self.sblocks[s_idx], self.tblocks[t_idx])
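The alignment score above is delegated to self.gc, which the surrounding project supplies. For experimentation, NLTK ships its own Gale-Church implementation in nltk.translate.gale_church; the sketch below is hedged on the assumption that its align_blocks function takes two lists of segment lengths and returns aligned index pairs (check your NLTK version):

from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.translate import gale_church   # availability and signature assumed

tokenizer = PunktSentenceTokenizer()

def block_lengths(text):
    # Character length of each sentence, as in _blocks_from_text above.
    return [len(s) for s in tokenizer.sentences_from_text(text.replace('\n', '')) if s.strip()]

src = "A short sentence. A somewhat longer second sentence follows it."
tgt = "Une phrase courte. Une deuxieme phrase un peu plus longue la suit."

pairs = gale_church.align_blocks(block_lengths(src), block_lengths(tgt))
print(pairs)   # e.g. [(0, 0), (1, 1)] for a clean 1:1 alignment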
Example 12: iter_topic
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def iter_topic(crawl_collection, start):
    # Prepare a naive sentence tokeniser utility
    pst = PunktSentenceTokenizer()
    n = 0
    for wiki in crawl_collection.query({'downloaded': True}, field=None, skip=start):
        # Skip empty content or records already added to the graph
        if wiki['content'] is None or 'added_to_graph' in wiki:
            continue
        m = 0
        content = wiki['content']
        if args['verbose']:
            print(colored('[Extracting wiki] : ', 'cyan'), content['title'])
        # A wiki page may comprise multiple content blocks
        for c in content['contents']:
            # Explode a long topic into a list of sentences
            sentences = pst.sentences_from_text(c)
            for s in sentences:
                m += 1
                yield (content['title'], s.split(' '))
        # After all sentences are processed,
        # mark the current wiki record as 'processed'
        crit = {'_id': wiki['_id']}
        crawl_collection.update(crit, {'$set': {'added_to_graph': True}})
        n += 1
        if args['verbose']:
            print(content['title'] + " processed with {0} nodes.".format(m))
            print(colored("{0} wiki documents processed so far...".format(n), 'blue'))
Example 13: SimhashDistance
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
class SimhashDistance(DistanceScorer):
    CHAR, TOKEN = range(2)

    def __init__(self, source_tokenizer, target_tokenizer, n=2, level=TOKEN):
        self.name = "Simhash Distance Scorer, n=%d" % n
        self.sentence_splitter = PunktSentenceTokenizer()
        self.s_hashes, self.t_hashes = [], []
        self.source_tokenizer = source_tokenizer
        if not source_tokenizer:
            self.source_tokenizer = SpaceTokenizer()
        self.target_tokenizer = target_tokenizer
        if not target_tokenizer:
            self.target_tokenizer = SpaceTokenizer()

        # Feature extractors are defined as closures so they can be bound with
        # functools.partial below. This excerpt is Python 2 style (iteritems, filter).
        def ngrams(n, tokenizer, page):
            result = []
            text = page.text.replace('\n', '')
            for sentence in self.sentence_splitter.sentences_from_text(text):
                if not sentence.strip():
                    continue
                # if '\n' in sentence:
                #     print repr(sentence)
                assert '\n' not in sentence, sentence
                words = tokenizer.process(sentence).strip().split()
                result += [" ".join(words[i:i + n]) for i in
                           range(max(len(words) - n + 1, 1))]
            return result

        def tokens(n, tokenizer, page):
            # 180/1grams
            # words = page.html.split()
            words = filter(None, re.split("[^0-9a-zA-Z]", page.text))
            return [" ".join(words[i:i + n]) for i in
                    range(max(len(words) - n + 1, 1))]

        def chars(n, tokenizer, page):
            s = "".join(page.text.split())
            return [" ".join(s[i:i + n]) for i in
                    range(max(len(s) - n + 1, 1))]

        def html_tokens(n, tokenizer, page):
            # 153/trigrams
            words = page.html.split()
            return [" ".join(words[i:i + n]) for i in
                    range(max(len(words) - n + 1, 1))]

        if level == SimhashDistance.TOKEN:
            self.source_features = partial(tokens, n, self.source_tokenizer)
            self.target_features = partial(tokens, n, self.target_tokenizer)
        elif level == SimhashDistance.CHAR:
            self.source_features = partial(chars, n, self.source_tokenizer)
            self.target_features = partial(chars, n, self.target_tokenizer)
        # self.source_features = partial(ngrams, n, self.source_tokenizer)
        # self.target_features = partial(ngrams, n, self.target_tokenizer)
        # print self.source_features("How are you?\nI am fine. Thanks.")

    def _words_from_text(self, text, tokenizer):
        words = set()
        for line in self.sentence_splitter.sentences_from_text(text):
            for w in tokenizer.process(line).split("\n"):
                words.add(w)
        return words

    def _extract(self, source_corpus, target_corpus):
        for url, page in source_corpus.iteritems():
            self.s_hashes.append(Simhash(self.source_features(page)))
        for url, page in target_corpus.iteritems():
            self.t_hashes.append(Simhash(self.target_features(page)))

    def _score_pair(self, s_idx, s_page, t_idx, t_page):
        return -self.s_hashes[s_idx].distance(self.t_hashes[t_idx])

    def get_features(self, text):
        width = 3
        sentences = self.sentence_splitter.sentences_from_text(text)
        return [sentences[i:i + width] for i in
                range(max(len(sentences) - width + 1, 1))]
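The Simhash class used above comes from the surrounding project. The PyPI simhash package exposes a similar interface (a constructor taking a list of string features and a distance method); assuming that package as a stand-in, a minimal near-duplicate check over token bigrams looks like this:

import re
from simhash import Simhash   # PyPI 'simhash' package, assumed stand-in for the project's class

def token_ngrams(text, n=2):
    # Same shingling idea as the nested tokens() helper above.
    words = [w for w in re.split("[^0-9a-zA-Z]", text) if w]
    return [" ".join(words[i:i + n]) for i in range(max(len(words) - n + 1, 1))]

a = Simhash(token_ngrams("The quick brown fox jumps over the lazy dog."))
b = Simhash(token_ngrams("The quick brown fox jumped over a lazy dog."))
print(a.distance(b))   # smaller Hamming distance means more similar text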
Example 14: len
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
ref_list = " ".join(ref)
line = sent_detector.tokenize(ref_list.strip())
author_name = []
year_of_pub = []
paper_name = []
journal_name = []
year_found = False
req_idx = 1
for i in Reference:
    line = sent_detector.tokenize(i.strip())
    line2 = sent_detector.sentences_from_text(i.strip())
    References.append(line)
    line3 = [x for x in line if x != "."]
    if len(line3) == 4:
        j = 0
        author_name.append(line3[j])
        year_of_pub.append(line3[j + 1])
        paper_name.append(line3[j + 2])
        journal_name.append(line3[j + 3])
    else:
        name_str = []
        regex = re.compile("(\d{4})")
        idx = 0
        req_idx = 1
        while idx < len(line3):
            result = re.findall(regex, line3[idx])