This article collects typical usage examples of the Python function nltk.sent_tokenize. If you are wondering what sent_tokenize does, how to call it, or what real-world uses look like, the curated code samples below should help.
The sections that follow show 15 code examples of the sent_tokenize function, sorted by popularity by default.
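Before the examples, here is a minimal sketch of calling sent_tokenize directly; it assumes the punkt sentence model has already been downloaded with nltk.download('punkt'):

import nltk

# nltk.download('punkt')  # one-time download of the sentence model

text = "Good muffins cost $3.88 in New York. Please buy me two of them. Thanks."
print(nltk.sent_tokenize(text))
# ['Good muffins cost $3.88 in New York.', 'Please buy me two of them.', 'Thanks.']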
Example 1: cosineReadable
def cosineReadable(sentences):
    # FIRST CHECK - we need at least 3 sentences for this method to be worth it
    if len(nltk.sent_tokenize(sentences)) <= 2:
        return sentences
    else:  # we have enough sentences to do a readability overhaul
        wordDimensions = []  # this gives every word an assigned dimension in the vector
        for sent in nltk.sent_tokenize(sentences):
            for word in nltk.word_tokenize(sent):
                if word not in wordDimensions:  # no duplicates
                    wordDimensions.append(word)
        sentlist = nltk.sent_tokenize(sentences)
        firstsent = sentlist[0]
        sentenceVectors = []  # this will be a list of sentVectors for every sent in summary
        for i in range(0, len(sentlist)):  # turn every sentence into a vector
            vec = makeSentVector(sentlist[i], wordDimensions)
            sentenceVectors.append(vec)
        sentScores = {}  # dict keeps track of cosine distance scores for the sentences (in comparison to the first sentence)
        firstSentVec = sentenceVectors[0]
        for x in range(1, len(sentlist)):
            sent = sentlist[x]
            val = spatial.distance.cosine(firstSentVec, sentenceVectors[x])
            sentScores[sent] = val
        sentScores = sorted(sentScores, reverse=True, key=sentScores.get)
        summary = str(sentlist[0]) + "\n"
        for otherSent in sentScores:
            summary += str(otherSent).strip() + "\n"
        summary = summary.strip()
        return summary
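The helper makeSentVector is not included in the listing above. A plausible stand-in, assuming it builds a simple bag-of-words count vector over the wordDimensions list, might look like this:

import nltk

def makeSentVector(sentence, wordDimensions):
    # hypothetical helper for illustration only; the original implementation is not shown
    vec = [0] * len(wordDimensions)
    for word in nltk.word_tokenize(sentence):
        if word in wordDimensions:
            vec[wordDimensions.index(word)] += 1
    return vec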
Example 2: demo
def demo():
    # split a paragraph into sentences using the punkt tokenizer
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(paragraphs)
    # split a sentence into tokens (words + punctuation)
    s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    WordPunctTokenizer().tokenize(s)
    # ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    PunktWordTokenizer().tokenize(s)
    # ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    PunktWordTokenizer().span_tokenize(s)
    # [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
    # split the paragraph into sentences
    nltk.sent_tokenize(s)
    # split a sentence into words and punctuation
    nltk.word_tokenize(s)
    # pos tagging
    nltk.pos_tag(nltk.word_tokenize(s))
Example 3: print_summary
def print_summary(indexes, doc, extract_n, doc_index):
    if len(indexes) < extract_n:
        extract_n = len(indexes)
    reference = "reference/task" + str(doc_index) + "_englishReference" + str(doc_index) + ".txt"
    reference_output = io.open(reference, "w", encoding='utf8')
    tips = sent_tokenize(doc.tip)
    for tip in tips:
        reference_output.write(tip + "\n")
    reference_output.close()
    sentences = sent_tokenize(doc.review)
    #print ""
    ## print "sentences length: " + str(len(sentences))
    #print ""
    #print "indexes: " + str(indexes)
    #print ""
    system = "system/task" + str(doc_index) + "_englishSyssum" + str(doc_index) + ".txt"
    system_output = io.open(system, "w", encoding='utf8')
    for i in range(0, extract_n):
        #print "index: " + str(indexes[i])
        system_output.write(sentences[indexes[i]] + "\n")
    system_output.close()
Example 4: refineText
def refineText(infp, outfp):
    stringlist = []
    textline = ""
    size = ""
    for line in infp:
        current = line.strip().replace(' ', ' ')
        if current.startswith("<size>"):
            if current != size and size != "":
                for sentence in nltk.sent_tokenize(''.join(stringlist)):
                    for token in MyTokenizer().tokenize(sentence):
                        token = token.replace("“", "")
                        token = token.replace("”", "")
                        outfp.write(token + " ")
                    outfp.write('\n')
                stringlist = []
                outfp.write('\n')
            stringlist.append(textline)
            size = current
        elif current == '':
            continue
        elif current[-1] == '-':
            textline = current[0:-1]
        else:
            textline = current + ' '
    for sentence in nltk.sent_tokenize(''.join(stringlist)):
        for token in MyTokenizer().tokenize(sentence):
            token = token.replace("“", "")
            token = token.replace("”", "")
            outfp.write(token + " ")
        outfp.write('\n')
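MyTokenizer is a project-specific class that is not included in the listing. A hypothetical minimal stand-in that simply delegates to nltk.word_tokenize could look like this:

import nltk

class MyTokenizer:
    # placeholder for illustration; the original MyTokenizer is not shown
    def tokenize(self, sentence):
        return nltk.word_tokenize(sentence)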
Example 5: __get_extra_wiki_description
def __get_extra_wiki_description(mesh_text, wiki_text, tfidf):
    mesh_sents = sent_tokenize(mesh_text)
    wiki_sents = sent_tokenize(wiki_text)
    mesh_tfidf_list = __sentences_to_tfidf_vecs(mesh_sents, tfidf)
    wiki_tfidf_list = __sentences_to_tfidf_vecs(wiki_sents, tfidf)
    extra_description = ''
    for i, wiki_tfidf_vec in enumerate(wiki_tfidf_list):
        have_similar = False
        for j, mesh_tfidf_vec in enumerate(mesh_tfidf_list):
            sim_val = tfidf.sim(wiki_tfidf_vec, mesh_tfidf_vec)
            if sim_val > 0.95:
                # print sim_val, 'SIMILAR:'
                # print mesh_sents[j]
                # print wiki_sents[i]
                have_similar = True
                break
        if not have_similar:
            extra_description += ' ' + wiki_sents[i]
    if len(extra_description) > 1:
        extra_description = extra_description[1:]
        if extra_description[-1].isalpha():
            extra_description += '.'
        elif extra_description[-1] == ':':
            extra_description = extra_description[:-1] + '.'
        return extra_description
    return ''
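Both __sentences_to_tfidf_vecs and the tfidf object with its sim method come from the surrounding project and are not shown here. As a rough sketch of the same idea (cosine similarity between tf-idf vectors of two sentences), one could use scikit-learn, assuming it is available:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def sentence_similarity(sent_a, sent_b):
    # illustrative stand-in only; the original code uses its own tfidf helpers
    vecs = TfidfVectorizer().fit_transform([sent_a, sent_b])
    return cosine_similarity(vecs[0], vecs[1])[0][0]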
Example 6: postroot
def postroot():
    if 'text' in request.forms:
        text = request.forms['text']
        sentences = sent_tokenize(text)
        result = " ".join(w + '/' + t for s in sent_tokenize(text)
                          for (w, t) in pos_tag(word_tokenize(s)))
    else:
        text = 'Type your text here'
        result = ''
    return template("""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>My Part of Speech Tagger</title>
</head>
<body>
<h1>My Part of Speech Tagger</h1>
<p>Type or paste your text below</p>
<form method="post">
<textarea name="text" rows="10" cols="50">
{{text}}
</textarea>
<input type="submit"/>
</form>
<hr>
<p>The tagged text is</p>
<p>{{tagged}}</p>
</body>
</html>
""", text=text, tagged=result)
Example 7: get_summaries_and_articles
def get_summaries_and_articles(coll):
    '''
    INPUT: mongo collection object
    OUTPUT: list of summaries, list of articles

    Runs through the MongoDB collection and extracts all of the newser.com
    summaries with their corresponding articles.
    '''
    summary_list = []
    article_list = []
    for doc in list(coll.find()):
        if doc['full_text'] != ' ':
            summary_list.append(doc['summary'])
            article_list.append(doc['full_text'])
    for i in xrange(len(article_list)):
        text = ''
        for article in article_list[i]:
            text += article
        article_list[i] = text
    summary_test = np.unique([summary_list[i] for i in xrange(len(summary_list))
                              if article_list[i] != '' and
                              article_list[i] != ' ' and
                              len(sent_tokenize(article_list[i])) > 10])
    article_test = np.unique([article for article in article_list
                              if article != '' and
                              article != ' ' and
                              len(sent_tokenize(article)) > 10])
    return summary_test, article_test
Example 8: readD
def readD(txtdoc):
    # find basename
    import os, nltk
    base = os.path.basename(txtdoc)
    # read file
    with open(txtdoc, "r") as myfile:
        text = myfile.readlines()
    # extract relevant text from dataset
    # write document
    f = open(base + ".ready", "w")
    # counts loops
    a = 0
    # for every line
    for line in text:
        if line.startswith("<bestanswer>"):
            # NOTE: the string literals below were broken across lines in the original
            # listing; the embedded line breaks are reconstructed here as "\n"
            cleansentence = (line[12:-13]
                             .replace("\n", " ").replace(";", ".")
                             .replace("<br />\n", "").replace("\n", " ")
                             .replace("...", ".").replace("<", " ")
                             .replace("<.br />.", ""))
            # split line into sentences
            sentences = nltk.sent_tokenize(cleansentence)
            s = len(sentences)
            # write into document
            x = 0
            while x < (s - 1):
                f.write(sentences[x] + "\n")
                a += 1
                x += 1
            f.write(sentences[s - 1])
            a += 1
            print(str(a), end='\r')
        if line.startswith("<answer_item>"):
            cleansentence = (line[13:-14]
                             .replace("\n", " ").replace(";", ".")
                             .replace("<br />\n", "").replace("\n", " ")
                             .replace("...", ".").replace("<", " ")
                             .replace("<.br />.", ""))
            # split line into sentences
            sentences = nltk.sent_tokenize(cleansentence)
            s = len(sentences)
            # write into document
            x = 0
            while x < (s - 1):
                f.write(sentences[x] + "\n")
                a += 1
                x += 1
            f.write(sentences[s - 1])
            a += 1
            print(str(a), end='\r')
    f.close()
Example 9: print_instance
def print_instance(relations, finlist, is_train):
    arg1 = reduce(lambda x, y: x + y, [nltk.word_tokenize(s) for s in nltk.sent_tokenize(finlist[0])])
    arg2 = reduce(lambda x, y: x + y, [nltk.word_tokenize(s) for s in nltk.sent_tokenize(finlist[1])])
    if len(relations) > 1:
        return
    #if is_train:
    for relation in relations:
        fw.write(json.dumps({'Arg1': arg1, 'Arg2': arg2, 'Sense': relation}) + '\n')
Example 10: create_summary
def create_summary(text):
    text = re.sub(r'\s\s+', ' ', text)
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < 10:
        num = 3
    else:
        num = 2
    summarizer = SimpleSummarizer()
    return nltk.sent_tokenize(summarizer.summarize(text, num))
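SimpleSummarizer is defined elsewhere in that project. A toy frequency-based summarizer with the same summarize(text, num) interface, shown purely for illustration, might look roughly like this:

import nltk
from collections import Counter

class SimpleSummarizer:
    # toy stand-in for illustration; not the original project's class
    def summarize(self, text, num):
        sentences = nltk.sent_tokenize(text)
        freqs = Counter(w.lower() for w in nltk.word_tokenize(text) if w.isalpha())
        ranked = sorted(sentences,
                        key=lambda s: sum(freqs[w.lower()] for w in nltk.word_tokenize(s)),
                        reverse=True)
        return ' '.join(ranked[:num])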
Example 11: percentage_long_sent
def percentage_long_sent(text):
    long_sentence = 0
    sentence_all = len(nltk.sent_tokenize(text))
    sentence_list = nltk.sent_tokenize(text)
    for sentence in sentence_list:
        wordlist = nltk.word_tokenize(sentence)
        word_count = len(wordlist)
        if word_count > 15:
            long_sentence += 1
    return long_sentence / sentence_all
Example 12: featurize
def featurize():
    n = 100  # number of articles per topic
    employer = request.form['user_input']
    ftopic = df[df['company'] == employer].head(n)

    text = list(ftopic['pros'].values)
    text = " ".join(text)
    text = re.sub(r'[^\w\s]+', ' ', text).replace('\n', ' ')
    # tokenize into words
    tokens = [word.lower() for sent in sent_tokenize(text)
              for word in word_tokenize(sent)]
    # remove stopwords, plus some extra stop words not present in stopwords
    stop = stopwords.words('english')
    stop += ['said', 'would', 's', 'also', 'U', 'mr', 're', 'may', 'one', 'two', 'buy', 'much',
             'take', 'might', 'say', 'new', 'year', 'many', 'etc', 'll', 've']
    stop += str(employer)  # note: this extends the list with the individual characters of the employer name
    tokens = [token for token in tokens if token not in stop]
    # keep only words of at least two letters
    tokens = [word for word in tokens if len(word) >= 2]
    string = " ".join(tokens)
    wordcloud = WordCloud(font_path='/Library/Fonts/Arial Rounded Bold.ttf').generate(string)
    plt.figure(figsize=(50, 30))
    plt.imshow(wordcloud)
    plt.axis("off")
    name = 'static/' + str(employer) + '-pros.png'
    pic = plt.savefig(name, bbox_inches='tight', transparent=True)

    text2 = list(ftopic['cons'].values)
    text2 = " ".join(text2)
    text2 = re.sub(r'[^\w\s]+', ' ', text2).replace('\n', ' ')
    # tokenize into words
    tokens2 = [word.lower() for sent in sent_tokenize(text2)
               for word in word_tokenize(sent)]
    # remove stopwords, plus some extra stop words not present in stopwords
    stop2 = stopwords.words('english')
    stop2 += ['said', 'would', 's', 'also', 'U', 'mr', 're', 'may', 'one', 'two', 'buy', 'much',
              'take', 'might', 'say', 'new', 'year', 'many', 'etc', 'll', 've']
    stop2 += str(employer)  # note: this extends the list with the individual characters of the employer name
    tokens2 = [token for token in tokens2 if token not in stop2]
    # keep only words of at least two letters
    tokens2 = [word for word in tokens2 if len(word) >= 2]
    string2 = " ".join(tokens2)
    wordcloud2 = WordCloud(font_path='/Library/Fonts/Arial Rounded Bold.ttf').generate(string2)
    plt.figure(figsize=(50, 30))
    plt.imshow(wordcloud2)
    plt.axis("off")
    name2 = 'static/' + str(employer) + '-cons.png'
    pic2 = plt.savefig(name2, bbox_inches='tight', transparent=True)

    return render_template('template_wordcloud.html', pic_pro=name, pic_con=name2, employer=employer)
Example 13: content
def content(self, title, text):
    """ Set the title and text of the content that needs to be parsed. """
    self._title = title
    self._text = text
    self._sepText = text.split('\n')
    self._tokens = nltk.word_tokenize(self._text)  # not using regex for tokenization
    self._textSents = nltk.sent_tokenize(self._text)
    self._textSents = list(map(lambda x: x.strip(), self._textSents))  # strip all sentences
    self._sepTextSents = []
    for pp in self._sepText:
        self._sepTextSents.append(nltk.sent_tokenize(pp))
Example 14: _shuffle_text
def _shuffle_text(self, text, times, label_func):
    from random import shuffle
    origin_sents = sent_tokenize(text)
    assert len(origin_sents) > 1
    sents = sent_tokenize(text)
    res = []
    for i in range(times):
        shuffle(sents)
        label = label_func(sents, origin_sents)
        res.append((' '.join(sents[:-1]), label))
    return res
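The label_func argument is supplied by the caller and is not part of this listing. A hypothetical label function that marks a shuffle as 1 whenever the sentence order actually changed could be used like this:

# hypothetical label function, not part of the original code
def order_changed(shuffled_sents, original_sents):
    return 1 if shuffled_sents != original_sents else 0

# pairs = some_instance._shuffle_text("First sentence. Second one. Third here.",
#                                     times=3, label_func=order_changed)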
Example 15: main
def main():
    tagged = getTagged(corpusdir)
    featureSet = [(getFeatures(feature), tag) for (feature, tag) in tagged]
    trainSet = featureSet[:]
    testSet = featureSet[:100]
    classifier = nltk.NaiveBayesClassifier.train(trainSet)
    fileList = os.listdir(corpusdir)
    sentences = []
    visited = []
    for (stem, tag) in [(f[:-4], f[-3:]) for f in fileList]:
        if stem in visited:
            continue
        else:
            visited.append(stem)
        print(stem)
        f_pos, f_neg = open(corpusdir + "/" + stem + "_pos"), open(corpusdir + "/" + stem + "_neg")
        raw_pos, raw_neg = f_pos.read(), f_neg.read()
        sent_pos, sent_neg = sent_tokenize(raw_pos), sent_tokenize(raw_neg)
        f_pos.close()
        f_neg.close()
        falseNeg = falsePos = trueNeg = truePos = 0
        for sent in sent_pos:
            guess = classifier.classify(getFeatures(sent))
            if guess == "POS":
                truePos += 1
            else:
                falseNeg += 1
        for sent in sent_neg:
            guess = classifier.classify(getFeatures(sent))
            if guess == "NEG":
                trueNeg += 1
            else:
                falsePos += 1
        posTags = len(sent_pos)
        negTags = len(sent_neg)
        totTags = posTags + negTags
        #print "Total sentences: %i" % (totTags)
        #print "Total negative: %.2f%%" % (float(negTags) / totTags * 100)
        #print "Total positive: %.2f%%" % (float(posTags) / totTags * 100)
        #print "True negatives: %.2f%%" % (float(trueNeg) / negTags * 100)
        #print "True positives: %.2f%%" % (float(truePos) / posTags * 100)
        print("False negatives: %.2f%%" % (float(falseNeg) / posTags * 100))
        print("False positives: %.2f%%" % (float(falsePos) / negTags * 100))
        print("")
    print("Accuracy: %f" % nltk.classify.accuracy(classifier, testSet))