

Python tokenize.sent_tokenize Function Code Examples

This article collects typical usage examples of the nltk.tokenize.sent_tokenize function in Python. If you are wondering how sent_tokenize is used in practice, or what real calls to it look like, the curated code examples below should help.


The following shows 15 code examples of the sent_tokenize function, sorted by popularity by default.
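Before turning to the examples, here is a minimal sketch of the basic call pattern they all build on. It assumes NLTK is installed and that the pre-trained Punkt sentence model has been downloaded; the sample text and variable names are illustrative only and are not taken from any of the projects below.

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')  # sent_tokenize relies on the pre-trained Punkt model; download it once

text = "NLTK splits text into sentences. Each sentence can then be split into words."
sentences = sent_tokenize(text)                  # -> list of sentence strings
words = [word_tokenize(s) for s in sentences]    # optional second pass: word-tokenize each sentence
print(sentences)
print(words)

sent_tokenize also accepts a language argument, e.g. sent_tokenize(text, language='german'), when a Punkt model for that language is available.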

Example 1: post

 def post(self):
     args = parser.parse_args()
     text = {'text': args['text']}
     print text
     print sent_tokenize(text['text'])
     print word_tokenize(text['text'])
     return text['text']
Developer: lhofer, Project: Flask_text_processing_API, Lines: 7, Source file: api_old.py

Example 2: split_sentence_based_on_rules

def split_sentence_based_on_rules(sent):


    if re.search(r' \.+ ', sent):
        sentences = re.split(r' \.+ ', sent)
    elif re.search(r'@ ---- @', sent):
        sentences = re.split(r'@ ---- @', sent)
    elif re.search(r'\.\w+\:', sent):
        sent = re.sub(r'\.(\w+)\:', r'. \1:', sent)
        sentences = sent_tokenize(sent)
    elif re.search(r'\, as well as', sent):
        sent = sent.replace(', as well as', '. As well as')
        sentences = sent_tokenize(sent)
    elif re.search(r'[a-z\.]+[A-Z][a-z]+:', sent):
        k = re.findall(r' [a-z\.]+([A-Z][a-z]+:)', sent)
        p = chr(ord(max(sent)) + 1)
        sentences = sent.replace(k[0], p + k[0]).split(p)
    elif re.search(r'\; ', sent):
        sent = re.sub(r'\; ', r'. ', sent)
        sentences = sent_tokenize(sent)
    elif re.search(r', and, ', sent):
        sent = sent.replace(', and, ', '. And, ')
        sentences = sent_tokenize(sent)
    elif re.search(r'president\: Wechsler', sent):
        sent = sent.replace(': ', '. ')
        sentences = sent_tokenize(sent)
    elif re.search(r'\, ', sent):
        sentences = re.split(r'\, ', sent)
    else:
        sentences = [sent[:349],sent[350:]]
        print("Using greedy sentence tokenization")

    text_len = [len(sentence) for sentence in sentences]
    return sentences
Developer: bethard, Project: timenorm, Lines: 34, Source file: preprocess_functions.py

Example 3: load_file_sentences

def load_file_sentences(filepath):
    index = filepath.rfind('/')
    if index < 0:
        sents = sent_tokenize(PlaintextCorpusReader('.', filepath).raw())
    else:
        sents = sent_tokenize(PlaintextCorpusReader(filepath[:index], filepath[index+1:]).raw())
    return sents
Developer: tkoane, Project: stat471project, Lines: 7, Source file: stat_code.py

Example 4: realtime

def realtime():
    model_parsing()
    data_df=pd.read_csv('Test_Survey.csv')
    data_df.Verbatim=data_df.Verbatim.fillna(0)
    unique_id=data_df['Unique_Id']
    verbatims=data_df['Verbatim']
    data_dict = dict(zip(unique_id, verbatims))
    Results_df=pd.DataFrame(columns=('Unique_id','Sentence', 'category', 'Sentiment'))
    model_df = pd.read_csv('Model_modified_twitter_test.csv')
    for uid,line in data_dict.items(): 
        line=str(line).decode('utf-8',errors='ignore') # ensure the program doesn't run into Unicode errors; add error handling to cover other formats
        try:
            line_list=tokenize.sent_tokenize(str(line))
            for line in line_list:
                original_line=line
                for p in list(punctuation):
                    line=line.replace(p,'')
                line=line.lower()
                line_SC=tb.blob.BaseBlob(line)
                line=line_SC.correct()
                line=str(line)
                #print uid
                sentiment_score=sentiment_calc(line)
                
                temp_df=core_classify(line,uid,sentiment_score,model_df,original_line)
                #Results_df = Results_df.append(temp_df)
                
                yield temp_df
        except UnicodeEncodeError:
            temp_df = pd.DataFrame({'Unique_id':[uid],'Sentence':[original_line],'category':['Invalid text data'],'Sentiment':[sentiment_score]})
            yield temp_df
            #Results_df = Results_df.append(temp_df)
    Results_df.to_csv('test_analysis.csv',index=False, encoding = 'utf-8')
Developer: ght438, Project: Loki, Lines: 34, Source file: Text_analytics_gui.py

Example 5: process_statuses

def process_statuses(uid):
	statuses_list = {}
	in_path = 'Data/'+uid+'/statuses_list.pickle'
	if os.path.exists(in_path):
		f = open(in_path,'rb')
		j = 0
		while True:
			try:
				statuses = pickle.load(f)
				for status in statuses:
					j += 1
					tweet = status.text
					sents = sent_tokenize(tweet)
					text = ""
					for sent in sents:
						#print("Sent: ", sent)
						sent_text = re.sub(r'RT\s@\w+:\s|@\w+\s|#|http://.*$|http://.*\s|https://.*$|https://.*\s|\n|\\U\w+', "", sent)  # strip retweet prefixes, @mentions, '#' characters, URLs, newlines and \U escape sequences
						sent_text = highpoints.sub("", sent_text)
						#print(sent_text)
						tokens = word_tokenize(sent_text)
						words = [w.lower() for w in tokens if w.isalpha() or w.isalnum()]
						stop_words = set(stopwords.words('english'))
						filtered_words = [w for w in words if not w in stop_words]
						statuses_list[sent] = filtered_words	#structure: key:integrate sentence, value: filtered_words 
			except EOFError:
				print(j)
				break
	#print("statuses_list: ", statuses_list)
	return statuses_list 
Developer: QiaozhiWang, Project: Sensitive_tweets, Lines: 31, Source file: extract_sentweet-seperate.py

Example 6: sentences

def sentences(a, b):
    """Return sentences in both a and b"""
    asplit = sent_tokenize(a)
    bsplit = sent_tokenize(b)
    # use set again
    same = {x for x in asplit if x in bsplit}
    return list(same)
Developer: dillon, Project: cs50, Lines: 7, Source file: helpers.py

Example 7: embed

def embed(sentences):
    model = word2vec.load('~/word2vec_models/GoogleNews-vectors-negative300.bin')
    embedded_sentences = []
    tokenized_sentences = []

    max_len = 0
    for sentence in sentences:
        tokenized_sentence = sent_tokenize(sentence)
        tokenized_sentences.append(tokenized_sentence)
        if len(tokenized_sentence) > max_len:
            max_len = len(tokenized_sentence)


    for sentence in sentences:
        tokenized_sentence = sent_tokenize(sentence)
        embedded_words = []
        
        for word in tokenized_sentence:
            try:
                word = model[word]          # look up the embedding vector for this token
            except:
                word = np.zeros(300)        # unknown word: fall back to a zero vector
            embedded_words.append(word)

        #padding    
        for i in range(max_len - len(embedded_words)):
            embedded_words.append(np.zeros(300))

        embedded_sentences.append(embedded_words)

    embedded_sentences = np.array(embedded_sentences)

    return embedded_sentences
Developer: RemedyHealthcare, Project: cnn-text-classification-tf, Lines: 33, Source file: data_helpers.py

Example 8: split_reddit_reviews

    def split_reddit_reviews(self,reviews):
        columns = ['Text','Score', 'True']
        #Calculate total number of sentences to fill up the data frame
        count=0
        for index,each_review in reviews.iterrows():

            split_sentences=sent_tokenize(each_review['Text'])
            count+=len(split_sentences)
        print "total number of sentences {}".format(count)

        df = pd.DataFrame(index=range(0,count), columns=columns)
        Text,Score,True=[],[],[]
        for index,each_review in reviews.iterrows():
            split_sentences=sent_tokenize(each_review['Text'])
            actual_tag=each_review['True']
            score_tag=each_review['Score']
            for each_split_sentence in split_sentences:
                Text.append(each_split_sentence)
                Score.append(actual_tag)
                True.append(score_tag)
        print "Count ={} Text.length {}".format(count,len(Text))
        df['Text']=Text
        df['Score']=Score
        df['True']=True
        df.to_csv('../data/reddit_reviews.csv')
Developer: pratheeksh, Project: Sarcasm-detection-project, Lines: 25, Source file: sarcasmClassify.py

Example 9: inputfactx

def inputfactx(rev, include_vpr):
    this_business = find_business(rev.bizid)
    this_user = find_user(rev.uid)
    result = [ this_business.stars ]
    if include_vpr:
        result += [ this_user.get_vpr() ]
    result += [
        this_user.reviewCount,
        len(rev.text),
        rev.stars,
        rev.get_days() ]
    if len(rev.text) == 0:
        result += [ 0, 0, 0, 0, 0 ]
    else:
        excount = 0
        for sent in sent_tokenize(rev.text):
            ss = sent.strip()
            if ss.endswith('!'):
                excount += 1
        result += [ excount,
        np.mean([len(sent) for sent in sent_tokenize(rev.text)]),
        len(sent_tokenize(rev.text)),
        len(re.findall('\n\n', rev.text)) + 1,
        len(rev.text.splitlines()[0]) ]
    result += [ this_business.longitude, this_business.latitude ]
    return result
Developer: jingjingh, Project: kaggle_yelp, Lines: 26, Source file: LR.py

Example 10: tokenize_sentences

def tokenize_sentences(filename):
	file_dir = docs_dir + str(filename)
	f = open(file_dir, 'r')

	root = ET.parse(f).getroot()
	tags = root.getiterator('str')

	# read the relevant tags
	title_string = ''
	desc_string = ''
	for tag in tags:
		if tag.get('name')  == 'Title' :
			title_string = filter(lambda x: x in string.printable, tag.text.lower().strip())

		elif tag.get('name') == 'Abstract':
			desc_string = filter(lambda x: x in string.printable, tag.text.lower().strip().replace('relevant documents will describe', ''))

	f.close()

	sentences = sent_tokenize(title_string)
	title_words = []
	for s in sentences:
		title_words = title_words + word_tokenize(s)

	sentences = sent_tokenize(desc_string)
	desc_words = []
	for s in sentences:
		desc_words = desc_words + word_tokenize(s)

	
	return (title_words, desc_words)
Developer: Tiotao, Project: CS3245HW4, Lines: 31, Source file: index.py

Example 11: tokenize

def tokenize(text, grams=1):
  wordStems = lambda s: map(stem, word_tokenize(s))
  sentTokens = lambda tok, s: tok + wordStems(s)

  if grams == 1:
    return list(reduce(sentTokens, sent_tokenize(text), [ ]))
  else:
    return list(ngrams(reduce(sentTokens, sent_tokenize(text), [ ]), grams))
Developer: nithinkrishna, Project: tamil-text-summarization, Lines: 8, Source file: main.py

Example 12: main

def main(param = 0):
    ''' 
    0 for no stem
    1 for porter
    2 for lancaster
    '''
    both_pos_index = {}
    tit_pos_index = {}
    abs_pos_index = {}


    if param == 0:
        path = './NoStemmer/'
    elif param == 1:
        path = './Porter/'
    elif param == 2:
        path = './Lancaster/'

    for i in range(1,1001):
            
        '''open xml file and get abstract and title'''
        try: 
            filename = "./data/%d.xml" %i
            data = open(filename)
        except:
            print "can't open file %s" %filename
            return 0

        docid = filename.split('/')[-1].split('.')[-2]
        
        tree = etree.fromstring(data.read())
    
        title = tree.find('Title').text
        abstract =  tree.find('Abstract').text
    
    
        #####################################################
        # Step2 tokenize and make position index dictionary #
        #####################################################
        '''sentence tokenize'''
        if title != None:
            title = title.replace('[','',1).replace(']','',1)
            titles = [s.replace('&amp;', '') for s in sent_tokenize(title)]
            tit_pos_index = position_index(tit_pos_index, titles, docid, param) 
           
        if abstract != None:
            abstracts = [s.replace('&amp;', '&') for s in sent_tokenize(abstract)] 
            both = titles + abstracts
        else:
            both = titles
            
        both_pos_index = position_index(both_pos_index,both,docid, param)
    '''save position index to json'''
    
    with codecs.open( './' + path.split('/')[1] + '_both_index' + '.json', mode = 'w') as a:
        json.dump(both_pos_index, a)
Developer: tancc, Project: search-practice, Lines: 56, Source file: generate_index.py

Example 13: tag_words_by_sentence

def tag_words_by_sentence(input_filename, output_path=''):
#    text = get_file_text(input_filename)
    text = 'Every day I see blue. But the sky is red. Eagles are green'
    sentences = sent_tokenize(text)
#    sentences = sent_tokenize(text)
    word_tokens = [word_tokenize(s) for s in sent_tokenize(text)]
#    word_tokens = nltk.tag.batch_pos_tag(sent_tokenize(text))
    word_pos = nltk.tag.batch_pos_tag(word_tokens)

        
    return
Developer: bchoatejr, Project: religion, Lines: 11, Source file: nlp_word_tools.py

Example 14: sentences

def sentences(a, b):
    """Return sentences in both a and b"""
    a1 = set(sent_tokenize(a))
    b1 = set(sent_tokenize(b))
    ans = []

    for line in a1:
        if line in b1:
            ans.append(line)

    return ans
Developer: AadeshSalecha, Project: Intro-to-Computer-Science-CS50-, Lines: 11, Source file: helpers.py

Example 15: lexical_features

    def lexical_features(self):
        """ Lexical features
        """
        features = []
        # Add the first token from the top-1st span on stack
        if self.stackspan1 is not None:
            text = self.stackspan1.text
            texts1 = word_tokenize(text)
          #  print texts1
            sent_tokenize_list =sent_tokenize(text)
            wordb = word_tokenize(sent_tokenize_list[0] )
            worde = word_tokenize(sent_tokenize_list[-1] )
       #     print wordb[0]
            features.append(('StackSpan1','BEGIN-WORD-STACK1',wordb[0].lower()))
            features.append(('StackSpan1','BEGIN-END-STACK1',worde[-1].lower()))
            features.append(('StackSpan1','BEGIN-END-WORD-STACK1',wordb[0].lower(),worde[-1].lower()))


        if self.stackspan2 is not None:
            text = self.stackspan2.text
            texts2 = word_tokenize(text)
          #  print texts1
            sent_tokenize_list =sent_tokenize(text)
            wordb = word_tokenize(sent_tokenize_list[0] )
            worde = word_tokenize(sent_tokenize_list[-1] )
       #     print wordb[0]
            features.append(('StackSpan2','BEGIN-WORD-STACK2',wordb[0].lower()))
            features.append(('StackSpan2','BEGIN-END-STACK2',worde[-1].lower()))

        if self.queuespan1 is not None:
            text = self.queuespan1.text
            textq1 = word_tokenize(text)
          #  print texts1
            sent_tokenize_list =sent_tokenize(text)
            wordb = word_tokenize(sent_tokenize_list[0] )
            worde = word_tokenize(sent_tokenize_list[-1] )
       #     print wordb[0]
            features.append(('QueueSpan1','BEGIN-WORD-QUEUE1',wordb[0].lower()))
            features.append(('QueueSpan1','BEGIN-END-QUEUE',worde[-1].lower()))
            features.append(('QueueSpan1','BEGIN-END-WORD-QUEUE1',wordb[0].lower(),worde[-1].lower()))


        if self.stackspan2 is not None and self.stackspan1 is not None:
             features.append(('StackSpan1','LENGTH-STACK1-STACK2',len(texts1),len(texts2)))
        if self.queuespan1 is not None and self.stackspan1 is not None :

            features.append(('StackSpan1','LENGTH-STACK1-QUEUE1',len(texts1),len(textq1)))
       #     features.append(('StackSpan1','POS-START-STACK1-QUEUE1',begins1,beginq1))

        for feat in features:
            yield feat
Developer: parry2403, Project: CodeRepo, Lines: 51, Source file: feature.py


Note: The nltk.tokenize.sent_tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors; please consult each project's license before distributing or using the code. Do not reproduce this article without permission.