This article collects typical usage examples of the nltk.tokenize.sent_tokenize function in Python. If you are wondering how to call sent_tokenize, what it is used for, or what real-world code that uses it looks like, the curated examples below may help.
The rest of the article presents 15 code examples of the sent_tokenize function, gathered from open source projects and sorted by popularity by default.
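As a quick orientation before the collected examples, the following is a minimal, self-contained sketch of the typical call pattern (the sample text and the nltk.download('punkt') step are illustrative assumptions, not taken from any of the examples below):

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# sent_tokenize uses the pre-trained Punkt model, which must be downloaded once.
nltk.download('punkt')

text = "NLTK splits raw text into sentences. Each sentence can then be split into words."
sentences = sent_tokenize(text)                              # list of sentence strings
words_per_sentence = [word_tokenize(s) for s in sentences]   # list of token lists
print(sentences)
print(words_per_sentence)

sent_tokenize also accepts a language argument, e.g. sent_tokenize(text, language='english'), which selects one of the other Punkt models shipped with NLTK.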
Example 1: post
def post(self):
    args = parser.parse_args()
    text = {'text': args['text']}
    # echo the raw text along with its sentence and word tokenizations
    print(text)
    print(sent_tokenize(text['text']))
    print(word_tokenize(text['text']))
    return text['text']
Example 2: split_sentence_based_on_rules
def split_sentence_based_on_rules(sent):
    # Try a cascade of hand-written splitting rules before falling back to a greedy split.
    if re.search(r' \.+ ', sent):
        sentences = re.split(r' \.+ ', sent)
    elif re.search(r'@ ---- @', sent):
        sentences = re.split(r'@ ---- @', sent)
    elif re.search(r'\.\w+\:', sent):
        sent = re.sub(r'\.(\w+)\:', r'. \1:', sent)
        sentences = sent_tokenize(sent)
    elif re.search(r'\, as well as', sent):
        sent = sent.replace(', as well as', '. As well as')
        sentences = sent_tokenize(sent)
    elif re.search(r'[a-z\.]+[A-Z][a-z]+:', sent):
        k = re.findall(r' [a-z\.]+([A-Z][a-z]+:)', sent)
        p = chr(ord(max(sent)) + 1)  # a character guaranteed not to occur in sent, used as a split marker
        sentences = sent.replace(k[0], p + k[0]).split(p)
    elif re.search(r'\; ', sent):
        sent = re.sub(r'\; ', r'. ', sent)
        sentences = sent_tokenize(sent)
    elif re.search(r', and, ', sent):
        sent = sent.replace(', and, ', '. And, ')
        sentences = sent_tokenize(sent)
    elif re.search(r'president\: Wechsler', sent):
        sent = sent.replace(': ', '. ')
        sentences = sent_tokenize(sent)
    elif re.search(r'\, ', sent):
        sentences = re.split(r'\, ', sent)
    else:
        sentences = [sent[:350], sent[350:]]  # arbitrary split at 350 characters
        print("Using greedy sentence tokenization")
    text_len = [len(sentence) for sentence in sentences]
    return sentences
Example 3: load_file_sentences
def load_file_sentences(filepath):
    index = filepath.rfind('/')
    if index < 0:
        sents = sent_tokenize(PlaintextCorpusReader('.', filepath).raw())
    else:
        sents = sent_tokenize(PlaintextCorpusReader(filepath[:index], filepath[index+1:]).raw())
    return sents
Example 4: realtime
def realtime():
    model_parsing()
    data_df = pd.read_csv('Test_Survey.csv')
    data_df.Verbatim = data_df.Verbatim.fillna(0)
    unique_id = data_df['Unique_Id']
    verbatims = data_df['Verbatim']
    data_dict = dict(zip(unique_id, verbatims))
    Results_df = pd.DataFrame(columns=('Unique_id', 'Sentence', 'category', 'Sentiment'))
    model_df = pd.read_csv('Model_modified_twitter_test.csv')
    for uid, line in data_dict.items():
        # Python 2 style decoding so the program does not run into Unicode errors; add error handling for other formats.
        line = str(line).decode('utf-8', errors='ignore')
        try:
            line_list = tokenize.sent_tokenize(str(line))
            for line in line_list:
                original_line = line
                for p in list(punctuation):
                    line = line.replace(p, '')
                line = line.lower()
                line_SC = tb.blob.BaseBlob(line)
                line = line_SC.correct()
                line = str(line)
                #print uid
                sentiment_score = sentiment_calc(line)
                temp_df = core_classify(line, uid, sentiment_score, model_df, original_line)
                #Results_df = Results_df.append(temp_df)
                yield temp_df
        except UnicodeEncodeError:
            temp_df = pd.DataFrame({'Unique_id': [uid], 'Sentence': [original_line], 'category': ['Invalid text data'], 'Sentiment': [sentiment_score]})
            yield temp_df
            #Results_df = Results_df.append(temp_df)
    Results_df.to_csv('test_analysis.csv', index=False, encoding='utf-8')
Example 5: process_statuses
def process_statuses(uid):
    statuses_list = {}
    in_path = 'Data/' + uid + '/statuses_list.pickle'
    if os.path.exists(in_path):
        f = open(in_path, 'rb')
        j = 0
        while True:
            try:
                statuses = pickle.load(f)
                for status in statuses:
                    j += 1
                    tweet = status.text
                    if tweet:  # the original condition was cut off in this listing; a non-empty check is used as a placeholder
                        sents = sent_tokenize(tweet)
                        text = ""
                        for sent in sents:
                            # strip retweet markers, mentions, hashtags, URLs and escape sequences
                            # (the retweet pattern was garbled by e-mail obfuscation in the source page and is reconstructed here)
                            sent_text = re.sub(r'RT\s@\w+:\s|@\w+\s|#|http://.*$|http://.*\s|https://.*$|https://.*\s|\n|\\U\w+', "", sent)
                            sent_text = highpoints.sub("", sent_text)  # highpoints: emoji-stripping regex defined elsewhere
                            tokens = word_tokenize(sent_text)
                            words = [w.lower() for w in tokens if w.isalpha() or w.isalnum()]
                            stop_words = set(stopwords.words('english'))
                            filtered_words = [w for w in words if w not in stop_words]
                            statuses_list[sent] = filtered_words  # structure: key: original sentence, value: filtered_words
            except EOFError:
                print(j)
                break
    return statuses_list
Example 6: sentences
def sentences(a, b):
    """Return sentences in both a and b"""
    asplit = sent_tokenize(a)
    bsplit = sent_tokenize(b)
    # use set again
    same = {x for x in asplit if x in bsplit}
    return list(same)
Example 7: embed
def embed(sentences):
    model = word2vec.load('~/word2vec_models/GoogleNews-vectors-negative300.bin')
    embedded_sentences = []
    tokenized_sentences = []
    max_len = 0
    for sentence in sentences:
        tokenized_sentence = sent_tokenize(sentence)
        tokenized_sentences.append(tokenized_sentence)
        if len(tokenized_sentence) > max_len:
            max_len = len(tokenized_sentence)
    for sentence in sentences:
        tokenized_sentence = sent_tokenize(sentence)
        embedded_words = []
        for word in tokenized_sentence:
            try:
                word = model[word]  # look up the embedding for the token itself, not the literal string 'word'
            except:
                word = np.zeros(300)  # out-of-vocabulary tokens get a zero vector
            embedded_words.append(word)
        # padding
        for i in range(max_len - len(embedded_words)):
            embedded_words.append(np.zeros(300))
        embedded_sentences.append(embedded_words)
    embedded_sentences = np.array(embedded_sentences)
    return embedded_sentences
Example 8: split_reddit_reviews
def split_reddit_reviews(self, reviews):
    columns = ['Text', 'Score', 'True']
    # Count the total number of sentences to size the data frame
    count = 0
    for index, each_review in reviews.iterrows():
        split_sentences = sent_tokenize(each_review['Text'])
        count += len(split_sentences)
    print("total number of sentences {}".format(count))
    df = pd.DataFrame(index=range(0, count), columns=columns)
    texts, scores, true_labels = [], [], []  # 'True' cannot be used as a variable name in Python 3
    for index, each_review in reviews.iterrows():
        split_sentences = sent_tokenize(each_review['Text'])
        actual_tag = each_review['True']
        score_tag = each_review['Score']
        for each_split_sentence in split_sentences:
            texts.append(each_split_sentence)
            scores.append(score_tag)
            true_labels.append(actual_tag)
    print("Count ={} Text.length {}".format(count, len(texts)))
    df['Text'] = texts
    df['Score'] = scores
    df['True'] = true_labels
    df.to_csv('../data/reddit_reviews.csv')
Example 9: inputfactx
def inputfactx(rev, include_vpr):
    this_business = find_business(rev.bizid)
    this_user = find_user(rev.uid)
    result = [ this_business.stars ]
    if include_vpr:
        result += [ this_user.get_vpr() ]
    result += [
        this_user.reviewCount,
        len(rev.text),
        rev.stars,
        rev.get_days() ]
    if len(rev.text) == 0:
        result += [ 0, 0, 0, 0, 0 ]
    else:
        excount = 0
        for sent in sent_tokenize(rev.text):
            ss = sent.strip()
            if ss.endswith('!'):
                excount += 1
        result += [ excount,
                    np.mean([len(sent) for sent in sent_tokenize(rev.text)]),
                    len(sent_tokenize(rev.text)),
                    len(re.findall('\n\n', rev.text)) + 1,
                    len(rev.text.splitlines()[0]) ]
    result += [ this_business.longitude, this_business.latitude ]
    return result
Example 10: tokenize_sentences
def tokenize_sentences(filename):
    file_dir = docs_dir + str(filename)
    f = open(file_dir, 'r')
    root = ET.parse(f).getroot()
    tags = root.iter('str')  # getiterator() is deprecated; iter() is the modern equivalent
    # read the relevant tags
    title_string = ''
    desc_string = ''
    for tag in tags:
        if tag.get('name') == 'Title':
            title_string = ''.join(filter(lambda x: x in string.printable, tag.text.lower().strip()))
        elif tag.get('name') == 'Abstract':
            desc_string = ''.join(filter(lambda x: x in string.printable, tag.text.lower().strip().replace('relevant documents will describe', '')))
    f.close()
    sentences = sent_tokenize(title_string)
    title_words = []
    for s in sentences:
        title_words = title_words + word_tokenize(s)
    sentences = sent_tokenize(desc_string)
    desc_words = []
    for s in sentences:
        desc_words = desc_words + word_tokenize(s)
    return (title_words, desc_words)
Example 11: tokenize
def tokenize(text, grams=1):
    wordStems = lambda s: [stem(w) for w in word_tokenize(s)]  # a list, so it can be concatenated in the reduce below
    sentTokens = lambda tok, s: tok + wordStems(s)
    if grams == 1:
        return list(reduce(sentTokens, sent_tokenize(text), [ ]))
    else:
        return list(ngrams(reduce(sentTokens, sent_tokenize(text), [ ]), grams))
Example 12: main
def main(param=0):
    '''
    0 for no stem
    1 for porter
    2 for lancaster
    '''
    both_pos_index = {}
    tit_pos_index = {}
    abs_pos_index = {}
    if param == 0:
        path = './NoStemmer/'
    elif param == 1:
        path = './Porter/'
    elif param == 2:
        path = './Lancaster/'
    for i in range(1, 1001):
        '''open xml file and get abstract and title'''
        try:
            filename = "./data/%d.xml" % i
            data = open(filename)
        except:
            print("can't open file %s" % filename)
            return 0
        docid = filename.split('/')[-1].split('.')[-2]
        tree = etree.fromstring(data.read())
        title = tree.find('Title').text
        abstract = tree.find('Abstract').text
        #####################################################
        # Step2 tokenize and make position index dictionary #
        #####################################################
        '''sentence tokenize'''
        if title != None:
            title = title.replace('[', '', 1).replace(']', '', 1)
            titles = [s.replace('&', '') for s in sent_tokenize(title)]
            tit_pos_index = position_index(tit_pos_index, titles, docid, param)
        if abstract != None:
            # un-escape XML entities (reconstructed; the entity was rendered as a bare '&' in the source listing)
            abstracts = [s.replace('&amp;', '&') for s in sent_tokenize(abstract)]
            both = titles + abstracts
        else:
            both = titles
        both_pos_index = position_index(both_pos_index, both, docid, param)
    '''save position index to json'''
    with codecs.open('./' + path.split('/')[1] + '_both_index' + '.json', mode='w') as a:
        json.dump(both_pos_index, a)
Example 13: tag_words_by_sentence
def tag_words_by_sentence(input_filename, output_path=''):
    # text = get_file_text(input_filename)
    text = 'Every day I see blue. But the sky is red. Eagles are green'
    sentences = sent_tokenize(text)
    word_tokens = [word_tokenize(s) for s in sent_tokenize(text)]
    # nltk.tag.batch_pos_tag was removed in NLTK 3; pos_tag_sents is the current equivalent
    word_pos = nltk.pos_tag_sents(word_tokens)
    return
Example 14: sentences
def sentences(a, b):
    """Return sentences in both a and b"""
    a1 = set(sent_tokenize(a))
    b1 = set(sent_tokenize(b))
    ans = []
    for line in a1:
        if line in b1:
            ans.append(line)
    return ans
Example 15: lexical_features
def lexical_features(self):
    """ Lexical features
    """
    features = []
    # Add the first token from the top-1st span on stack
    if self.stackspan1 is not None:
        text = self.stackspan1.text
        texts1 = word_tokenize(text)
        # print texts1
        sent_tokenize_list = sent_tokenize(text)
        wordb = word_tokenize(sent_tokenize_list[0])
        worde = word_tokenize(sent_tokenize_list[-1])
        # print wordb[0]
        features.append(('StackSpan1', 'BEGIN-WORD-STACK1', wordb[0].lower()))
        features.append(('StackSpan1', 'BEGIN-END-STACK1', worde[-1].lower()))
        features.append(('StackSpan1', 'BEGIN-END-WORD-STACK1', wordb[0].lower(), worde[-1].lower()))
    if self.stackspan2 is not None:
        text = self.stackspan2.text
        texts2 = word_tokenize(text)
        # print texts1
        sent_tokenize_list = sent_tokenize(text)
        wordb = word_tokenize(sent_tokenize_list[0])
        worde = word_tokenize(sent_tokenize_list[-1])
        # print wordb[0]
        features.append(('StackSpan2', 'BEGIN-WORD-STACK2', wordb[0].lower()))
        features.append(('StackSpan2', 'BEGIN-END-STACK2', worde[-1].lower()))
    if self.queuespan1 is not None:
        text = self.queuespan1.text
        textq1 = word_tokenize(text)
        # print texts1
        sent_tokenize_list = sent_tokenize(text)
        wordb = word_tokenize(sent_tokenize_list[0])
        worde = word_tokenize(sent_tokenize_list[-1])
        # print wordb[0]
        features.append(('QueueSpan1', 'BEGIN-WORD-QUEUE1', wordb[0].lower()))
        features.append(('QueueSpan1', 'BEGIN-END-QUEUE', worde[-1].lower()))
        features.append(('QueueSpan1', 'BEGIN-END-WORD-QUEUE1', wordb[0].lower(), worde[-1].lower()))
    if self.stackspan2 is not None and self.stackspan1 is not None:
        features.append(('StackSpan1', 'LENGTH-STACK1-STACK2', len(texts1), len(texts2)))
    if self.queuespan1 is not None and self.stackspan1 is not None:
        features.append(('StackSpan1', 'LENGTH-STACK1-QUEUE1', len(texts1), len(textq1)))
        # features.append(('StackSpan1','POS-START-STACK1-QUEUE1',begins1,beginq1))
    for feat in features:
        yield feat