This article collects typical usage examples of the Python function nltk.sent_tokenize. If you are wondering what sent_tokenize does, how to call it, or what real-world uses look like, the curated code samples below should help.
The sections that follow show 15 code examples of the sent_tokenize function, sorted by popularity by default.
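Before the examples, here is a minimal sketch of calling sent_tokenize directly; it assumes the punkt sentence model has already been downloaded with nltk.download('punkt'):

import nltk

# nltk.download('punkt')  # one-time download of the sentence model

text = "Good muffins cost $3.88 in New York. Please buy me two of them. Thanks."
print(nltk.sent_tokenize(text))
# ['Good muffins cost $3.88 in New York.', 'Please buy me two of them.', 'Thanks.']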
Example 1: cosineReadable
def cosineReadable(sentences):
    # FIRST CHECK - we need at least 3 sentences for this method to be worth it
    if len(nltk.sent_tokenize(sentences)) <= 2:
        return sentences
    else:  # we have enough sentences to do a readability overhaul
        wordDimensions = []  # this gives every word an assigned dimension in the vector
        for sent in nltk.sent_tokenize(sentences):
            for word in nltk.word_tokenize(sent):
                if word not in wordDimensions:  # no duplicates
                    wordDimensions.append(word)
        sentlist = nltk.sent_tokenize(sentences)
        firstsent = sentlist[0]
        sentenceVectors = []  # this will be a list of sentVectors for every sent in summary
        for i in range(0, len(sentlist)):  # turn every sentence into a vector
            vec = makeSentVector(sentlist[i], wordDimensions)
            sentenceVectors.append(vec)
        sentScores = {}  # dict keeps track of cosine distance scores for the sentences (in comparison to the first sentence)
        firstSentVec = sentenceVectors[0]
        for x in range(1, len(sentlist)):
            sent = sentlist[x]
            val = spatial.distance.cosine(firstSentVec, sentenceVectors[x])
            sentScores[sent] = val
        sentScores = sorted(sentScores, reverse=True, key=sentScores.get)
        summary = str(sentlist[0]) + "\n"
        for otherSent in sentScores:
            summary += str(otherSent).strip() + "\n"
        summary = summary.strip()
        return summary
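The helper makeSentVector is not included in the listing above. A plausible stand-in, assuming it builds a simple bag-of-words count vector over the wordDimensions list, might look like this:

import nltk

def makeSentVector(sentence, wordDimensions):
    # hypothetical helper for illustration only; the original implementation is not shown
    vec = [0] * len(wordDimensions)
    for word in nltk.word_tokenize(sentence):
        if word in wordDimensions:
            vec[wordDimensions.index(word)] += 1
    return vec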
Example 2: demo
def demo():
    # split a paragraph into sentences using the punkt tokenizer
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(paragraphs)
    # split a sentence into tokens (words + punctuation)
    s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    WordPunctTokenizer().tokenize(s)
    # ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    PunktWordTokenizer().tokenize(s)
    # ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    PunktWordTokenizer().span_tokenize(s)
    # [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
    # split the paragraph into sentences
    nltk.sent_tokenize(s)
    # split a sentence into words and punctuation
    nltk.word_tokenize(s)
    # pos tagging
    nltk.pos_tag(nltk.word_tokenize(s))
Example 3: print_summary
def print_summary(indexes, doc, extract_n, doc_index):
    if len(indexes) < extract_n:
        extract_n = len(indexes)
    reference = "reference/task" + str(doc_index) + "_englishReference" + str(doc_index) + ".txt"
    reference_output = io.open(reference, "w", encoding='utf8')
    tips = sent_tokenize(doc.tip)
    for tip in tips:
        reference_output.write(tip + "\n")
    reference_output.close()
    sentences = sent_tokenize(doc.review)
    #print ""
    ## print "sentences length: " + str(len(sentences))
    #print ""
    #print "indexes: " + str(indexes)
    #print ""
    system = "system/task" + str(doc_index) + "_englishSyssum" + str(doc_index) + ".txt"
    system_output = io.open(system, "w", encoding='utf8')
    for i in range(0, extract_n):
        #print "index: " + str(indexes[i])
        system_output.write(sentences[indexes[i]] + "\n")
    system_output.close()
Example 4: refineText
def refineText(infp, outfp):
    stringlist = []
    textline = ""
    size = ""
    for line in infp:
        current = line.strip().replace(' ', ' ')
        if current.startswith("<size>"):
            if current != size and size != "":
                for sentence in nltk.sent_tokenize(''.join(stringlist)):
                    for token in MyTokenizer().tokenize(sentence):
                        token = token.replace("“", "")
                        token = token.replace("”", "")
                        outfp.write(token + " ")
                    outfp.write('\n')
                stringlist = []
                outfp.write('\n')
            stringlist.append(textline)
            size = current
        elif current == '':
            continue
        elif current[-1] == '-':
            textline = current[0:-1]
        else:
            textline = current + ' '
    for sentence in nltk.sent_tokenize(''.join(stringlist)):
        for token in MyTokenizer().tokenize(sentence):
            token = token.replace("“", "")
            token = token.replace("”", "")
            outfp.write(token + " ")
        outfp.write('\n')
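MyTokenizer is a project-specific class that is not included in the listing. A hypothetical minimal stand-in that simply delegates to nltk.word_tokenize could look like this:

import nltk

class MyTokenizer:
    # placeholder for illustration; the original MyTokenizer is not shown
    def tokenize(self, sentence):
        return nltk.word_tokenize(sentence)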
Example 5: __get_extra_wiki_description
def __get_extra_wiki_description(mesh_text, wiki_text, tfidf):
    mesh_sents = sent_tokenize(mesh_text)
    wiki_sents = sent_tokenize(wiki_text)
    mesh_tfidf_list = __sentences_to_tfidf_vecs(mesh_sents, tfidf)
    wiki_tfidf_list = __sentences_to_tfidf_vecs(wiki_sents, tfidf)
    extra_description = ''
    for i, wiki_tfidf_vec in enumerate(wiki_tfidf_list):
        have_similar = False
        for j, mesh_tfidf_vec in enumerate(mesh_tfidf_list):
            sim_val = tfidf.sim(wiki_tfidf_vec, mesh_tfidf_vec)
            if sim_val > 0.95:
                # print sim_val, 'SIMILAR:'
                # print mesh_sents[j]
                # print wiki_sents[i]
                have_similar = True
                break
        if not have_similar:
            extra_description += ' ' + wiki_sents[i]
    if len(extra_description) > 1:
        extra_description = extra_description[1:]
        if extra_description[-1].isalpha():
            extra_description += '.'
        elif extra_description[-1] == ':':
            extra_description = extra_description[:-1] + '.'
        return extra_description
    return ''
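Both __sentences_to_tfidf_vecs and the tfidf object with its sim method come from the surrounding project and are not shown here. As a rough sketch of the same idea (cosine similarity between tf-idf vectors of two sentences), one could use scikit-learn, assuming it is available:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def sentence_similarity(sent_a, sent_b):
    # illustrative stand-in only; the original code uses its own tfidf helpers
    vecs = TfidfVectorizer().fit_transform([sent_a, sent_b])
    return cosine_similarity(vecs[0], vecs[1])[0][0]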
Example 6: postroot
def postroot():
    if 'text' in request.forms:
        text = request.forms['text']
        sentences = sent_tokenize(text)
        result = " ".join(w + '/' + t for s in sent_tokenize(text)
                          for (w, t) in pos_tag(word_tokenize(s)))
    else:
        text = 'Type your text here'
        result = ''
    return template("""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>My Part of Speech Tagger</title>
</head>
<body>
<h1>My Part of Speech Tagger</h1>
<p>Type or paste your text below</p>
<form method="post">
<textarea name="text" rows="10" cols="50">
{{text}}
</textarea>
<input type="submit"/>
</form>
<hr>
<p>The tagged text is</p>
<p>{{tagged}}</p>
</body>
</html>
""", text=text, tagged=result)
Example 7: get_summaries_and_articles
def get_summaries_and_articles(coll):
    '''
    INPUT: mongo collection object
    OUTPUT: list of summaries, list of articles

    Runs through the MongoDB collection and extracts all of the newser.com
    summaries with their corresponding articles.
    '''
    summary_list = []
    article_list = []
    for doc in list(coll.find()):
        if doc['full_text'] != ' ':
            summary_list.append(doc['summary'])
            article_list.append(doc['full_text'])
    for i in xrange(len(article_list)):
        text = ''
        for article in article_list[i]:
            text += article
        article_list[i] = text
    summary_test = np.unique([summary_list[i] for i in xrange(len(summary_list))
                              if article_list[i] != '' and
                              article_list[i] != ' ' and
                              len(sent_tokenize(article_list[i])) > 10])
    article_test = np.unique([article for article in article_list
                              if article != '' and
                              article != ' ' and
                              len(sent_tokenize(article)) > 10])
    return summary_test, article_test
Example 8: readD
def readD(txtdoc):
    # find basename
    import os, nltk
    base = os.path.basename(txtdoc)
    # read file
    with open(txtdoc, "r") as myfile:
        text = myfile.readlines()
    # extract relevant text from dataset
    # write document
    f = open(base + ".ready", "w")
    # counts loops
    a = 0
    # for every line
    for line in text:
        if line.startswith("<bestanswer>"):
            # NOTE: the string literals below were broken across lines in the original
            # listing; the embedded line breaks are reconstructed here as "\n"
            cleansentence = (line[12:-13]
                             .replace("\n", " ").replace(";", ".")
                             .replace("<br />\n", "").replace("\n", " ")
                             .replace("...", ".").replace("<", " ")
                             .replace("<.br />.", ""))
            # split line into sentences
            sentences = nltk.sent_tokenize(cleansentence)
            s = len(sentences)
            # write into document
            x = 0
            while x < (s - 1):
                f.write(sentences[x] + "\n")
                a += 1
                x += 1
            f.write(sentences[s - 1])
            a += 1
            print(str(a), end='\r')
        if line.startswith("<answer_item>"):
            cleansentence = (line[13:-14]
                             .replace("\n", " ").replace(";", ".")
                             .replace("<br />\n", "").replace("\n", " ")
                             .replace("...", ".").replace("<", " ")
                             .replace("<.br />.", ""))
            # split line into sentences
            sentences = nltk.sent_tokenize(cleansentence)
            s = len(sentences)
            # write into document
            x = 0
            while x < (s - 1):
                f.write(sentences[x] + "\n")
                a += 1
                x += 1
            f.write(sentences[s - 1])
            a += 1
            print(str(a), end='\r')
    f.close()
Example 9: print_instance
def print_instance(relations, finlist, is_train):
    arg1 = reduce(lambda x, y: x + y, [nltk.word_tokenize(s) for s in nltk.sent_tokenize(finlist[0])])
    arg2 = reduce(lambda x, y: x + y, [nltk.word_tokenize(s) for s in nltk.sent_tokenize(finlist[1])])
    if len(relations) > 1:
        return
    #if is_train:
    for relation in relations:
        fw.write(json.dumps({'Arg1': arg1, 'Arg2': arg2, 'Sense': relation}) + '\n')
Example 10: create_summary
def create_summary(text):
    text = re.sub(r'\s\s+', ' ', text)
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < 10:
        num = 3
    else:
        num = 2
    summarizer = SimpleSummarizer()
    return nltk.sent_tokenize(summarizer.summarize(text, num))
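SimpleSummarizer is defined elsewhere in that project. A toy frequency-based summarizer with the same summarize(text, num) interface, shown purely for illustration, might look roughly like this:

import nltk
from collections import Counter

class SimpleSummarizer:
    # toy stand-in for illustration; not the original project's class
    def summarize(self, text, num):
        sentences = nltk.sent_tokenize(text)
        freqs = Counter(w.lower() for w in nltk.word_tokenize(text) if w.isalpha())
        ranked = sorted(sentences,
                        key=lambda s: sum(freqs[w.lower()] for w in nltk.word_tokenize(s)),
                        reverse=True)
        return ' '.join(ranked[:num])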
Example 11: percentage_long_sent
def percentage_long_sent(text):
    long_sentence = 0
    sentence_all = len(nltk.sent_tokenize(text))
    sentence_list = nltk.sent_tokenize(text)
    for sentence in sentence_list:
        wordlist = nltk.word_tokenize(sentence)
        word_count = len(wordlist)
        if word_count > 15:
            long_sentence += 1
    return long_sentence / sentence_all
Example 12: featurize
def featurize():
    n = 100  # number of articles per topic
    employer = request.form['user_input']
    ftopic = df[df['company'] == employer].head(n)

    text = list(ftopic['pros'].values)
    text = " ".join(text)
    text = re.sub(r'[^\w\s]+', ' ', text).replace('\n', ' ')
    # tokenize into words
    tokens = [word.lower() for sent in sent_tokenize(text)
              for word in word_tokenize(sent)]
    # remove stopwords, plus some extra stop words not present in stopwords
    stop = stopwords.words('english')
    stop += ['said', 'would', 's', 'also', 'U', 'mr', 're', 'may', 'one', 'two', 'buy', 'much',
             'take', 'might', 'say', 'new', 'year', 'many', 'etc', 'll', 've']
    stop += str(employer)  # note: this extends the list with the individual characters of the employer name
    tokens = [token for token in tokens if token not in stop]
    # keep only words of at least two letters
    tokens = [word for word in tokens if len(word) >= 2]
    string = " ".join(tokens)
    wordcloud = WordCloud(font_path='/Library/Fonts/Arial Rounded Bold.ttf').generate(string)
    plt.figure(figsize=(50, 30))
    plt.imshow(wordcloud)
    plt.axis("off")
    name = 'static/' + str(employer) + '-pros.png'
    pic = plt.savefig(name, bbox_inches='tight', transparent=True)

    text2 = list(ftopic['cons'].values)
    text2 = " ".join(text2)
    text2 = re.sub(r'[^\w\s]+', ' ', text2).replace('\n', ' ')
    # tokenize into words
    tokens2 = [word.lower() for sent in sent_tokenize(text2)
               for word in word_tokenize(sent)]
    # remove stopwords, plus some extra stop words not present in stopwords
    stop2 = stopwords.words('english')
    stop2 += ['said', 'would', 's', 'also', 'U', 'mr', 're', 'may', 'one', 'two', 'buy', 'much',
              'take', 'might', 'say', 'new', 'year', 'many', 'etc', 'll', 've']
    stop2 += str(employer)  # note: this extends the list with the individual characters of the employer name
    tokens2 = [token for token in tokens2 if token not in stop2]
    # keep only words of at least two letters
    tokens2 = [word for word in tokens2 if len(word) >= 2]
    string2 = " ".join(tokens2)
    wordcloud2 = WordCloud(font_path='/Library/Fonts/Arial Rounded Bold.ttf').generate(string2)
    plt.figure(figsize=(50, 30))
    plt.imshow(wordcloud2)
    plt.axis("off")
    name2 = 'static/' + str(employer) + '-cons.png'
    pic2 = plt.savefig(name2, bbox_inches='tight', transparent=True)

    return render_template('template_wordcloud.html', pic_pro=name, pic_con=name2, employer=employer)
Example 13: content
def content(self, title, text):
    """ Set the title and text of the content that needs to be parsed. """
    self._title = title
    self._text = text
    self._sepText = text.split('\n')
    self._tokens = nltk.word_tokenize(self._text)  # not using regex for tokenization
    self._textSents = nltk.sent_tokenize(self._text)
    self._textSents = list(map(lambda x: x.strip(), self._textSents))  # strip all sentences
    self._sepTextSents = []
    for pp in self._sepText:
        self._sepTextSents.append(nltk.sent_tokenize(pp))
Example 14: _shuffle_text
def _shuffle_text(self, text, times, label_func):
    from random import shuffle
    origin_sents = sent_tokenize(text)
    assert len(origin_sents) > 1
    sents = sent_tokenize(text)
    res = []
    for i in range(times):
        shuffle(sents)
        label = label_func(sents, origin_sents)
        res.append((' '.join(sents[:-1]), label))
    return res
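The label_func argument is supplied by the caller and is not part of this listing. A hypothetical label function that marks a shuffle as 1 whenever the sentence order actually changed could be used like this:

# hypothetical label function, not part of the original code
def order_changed(shuffled_sents, original_sents):
    return 1 if shuffled_sents != original_sents else 0

# pairs = some_instance._shuffle_text("First sentence. Second one. Third here.",
#                                     times=3, label_func=order_changed)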
Example 15: main
def main():
    tagged = getTagged(corpusdir)
    featureSet = [(getFeatures(feature), tag) for (feature, tag) in tagged]
    trainSet = featureSet[:]
    testSet = featureSet[:100]
    classifier = nltk.NaiveBayesClassifier.train(trainSet)
    fileList = os.listdir(corpusdir)
    sentences = []
    visited = []
    for (stem, tag) in [(f[:-4], f[-3:]) for f in fileList]:
        if stem in visited:
            continue
        else:
            visited.append(stem)
        print(stem)
        f_pos, f_neg = open(corpusdir + "/" + stem + "_pos"), open(corpusdir + "/" + stem + "_neg")
        raw_pos, raw_neg = f_pos.read(), f_neg.read()
        sent_pos, sent_neg = sent_tokenize(raw_pos), sent_tokenize(raw_neg)
        f_pos.close()
        f_neg.close()
        falseNeg = falsePos = trueNeg = truePos = 0
        for sent in sent_pos:
            guess = classifier.classify(getFeatures(sent))
            if guess == "POS":
                truePos += 1
            else:
                falseNeg += 1
        for sent in sent_neg:
            guess = classifier.classify(getFeatures(sent))
            if guess == "NEG":
                trueNeg += 1
            else:
                falsePos += 1
        posTags = len(sent_pos)
        negTags = len(sent_neg)
        totTags = posTags + negTags
        #print "Total sentences: %i" % (totTags)
        #print "Total negative: %.2f%%" % (float(negTags) / totTags * 100)
        #print "Total positive: %.2f%%" % (float(posTags) / totTags * 100)
        #print "True negatives: %.2f%%" % (float(trueNeg) / negTags * 100)
        #print "True positives: %.2f%%" % (float(truePos) / posTags * 100)
        print("False negatives: %.2f%%" % (float(falseNeg) / posTags * 100))
        print("False positives: %.2f%%" % (float(falsePos) / negTags * 100))
        print("")
    print("Accuracy: %f" % nltk.classify.accuracy(classifier, testSet))