This article collects typical usage examples of Python's nltk.word_tokenize function. If you have been wondering what exactly word_tokenize does, how to call it, or where to see it used in real code, the hand-picked examples below should help.
The following shows 15 code examples of the word_tokenize function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
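Before working through the examples, here is a minimal sketch of word_tokenize on its own (assuming NLTK is installed and the 'punkt' tokenizer models have been downloaded):

import nltk
# nltk.download('punkt')  # one-time download of the tokenizer models, if needed

tokens = nltk.word_tokenize("NLTK makes tokenization easy, doesn't it?")
print(tokens)
# ['NLTK', 'makes', 'tokenization', 'easy', ',', 'does', "n't", 'it', '?']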
Example 1: load_file_without_frequency
def load_file_without_frequency(self, positif, negatif):
    tab = []
    maxs = self.nbFeatures
    phrases = []
    y = []
    with codecs.open(positif, "r", encoding='latin-1') as my_file:
        for line in my_file:
            line = line.strip().lower()  # strip the trailing newline
            phrases.append(line)
            y.append(1)
            for mot in word_tokenize(line):
                tab.append(mot)
    with codecs.open(negatif, "r", encoding='latin-1') as my_file:
        for line in my_file:
            line = line.strip().lower()  # strip the trailing newline
            phrases.append(line)
            y.append(0)
            for mot in word_tokenize(line):
                tab.append(mot)
    word_fd = FreqDist(tab)
    print(word_fd)
    # Replace each sentence by a fixed-length vector of corpus-wide word counts,
    # zero-padded or truncated to maxs features.
    for i in range(len(phrases)):
        mots = word_tokenize(phrases[i])
        tmp = []
        for element in mots:
            tmp.append(word_fd[element])
        if len(tmp) < maxs:
            for j in range(maxs - len(tmp)):
                tmp.append(0)
        elif len(tmp) > maxs:
            tmp = tmp[:maxs]
        phrases[i] = tmp
    return (np.array(phrases), np.array(list(set(tab))), np.array(y))
Example 2: __init__
def __init__(self, title, full_text, sentence):
    self.title = title
    self.sentence = sentence
    # map of word -> number of times it appears in the full article text
    self.full_text_word_frequencies = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(full_text))
    # map of word -> number of times it appears in the given sentence
    self.sentence_word_frequencies = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(sentence))
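The FreqDist pattern used in this constructor can be tried on its own; a minimal sketch (the sample text is made up for illustration):

import nltk

text = "The cat sat on the mat. The cat slept."
freqs = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(text))
print(freqs['the'])  # 3 -- 'The' and 'the' are merged by lower()
print(freqs['cat'])  # 2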
Example 3: vectorize
def vectorize(data, s):
    '''
    :param data: list of instances for a given lexelt with the following structure:
        [(instance_id, left_context, head, right_context, sense_id), ...]
    :param s: list of words (features) for a given lexelt: [w1, w2, w3, ...]
    :return: vectors: A dictionary with the following structure
                { instance_id: [w_1 count, w_2 count, ...],
                  ...
                }
             labels: A dictionary with the following structure
                { instance_id: sense_id }
    '''
    vectors = {}
    labels = {}
    for (instance_id, left_context, head, right_context, sense_id) in data:
        labels[instance_id] = sense_id
        left_tokens = nltk.word_tokenize(left_context)
        right_tokens = nltk.word_tokenize(right_context)
        words = k_nearest_words_vector_from_tokens(left_tokens, right_tokens, window_size)
        vectors[instance_id] = frequency_vector_from_near_words(s, words)
    return vectors, labels
Example 4: colocation
def colocation(windowSize, pos, context, dictionary):
    if windowSize <= 0:
        return dictionary
    # preceding context (words before the target position)
    forward = context[:pos]
    f = forward[(-windowSize//2):]
    # following context (words after the target position)
    backward = context[pos+1:]
    b = backward[:windowSize//2]
    for item in f:
        key = "pre" + str(len(f) - f.index(item)) + "-word"
        value = item
        dictionary[key] = value
        key = "pre" + str(len(f) - f.index(item)) + "-pos"
        text = nltk.word_tokenize(item)
        value = nltk.pos_tag(text)[0][1]
        dictionary[key] = value
    for item in b:
        key = "fol" + str(b.index(item) + 1) + "-word"
        value = item
        dictionary[key] = value
        key = "fol" + str(b.index(item) + 1) + "-pos"
        text = nltk.word_tokenize(item)
        value = nltk.pos_tag(text)[0][1]
        dictionary[key] = value
    return dictionary
Example 5: __tokenize
def __tokenize(self, utter, semantic_tagged=None):
    result = None
    if semantic_tagged is None:
        result = [(word, None) for word in nltk.word_tokenize(utter)]
    else:
        parser_raw = SemanticTagParser(False)
        parser_tagged = SemanticTagParser(False)
        segmented = ' '.join(nltk.word_tokenize(utter))
        tagged = ' '.join(semantic_tagged)
        parser_raw.feed(segmented)
        parser_tagged.feed(tagged)
        raw_chr_seq = parser_raw.get_chr_seq()
        raw_space_seq = parser_raw.get_chr_space_seq()
        tagged_chr_seq = parser_tagged.get_chr_seq()
        tagged_space_seq = parser_tagged.get_chr_space_seq()
        if raw_chr_seq == tagged_chr_seq:
            merged_space_seq = [
                x or y for x, y in zip(raw_space_seq, tagged_space_seq)]
            word_seq = parser_tagged.tokenize(merged_space_seq)
            tag_seq = parser_tagged.get_word_tag_seq()
            result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]
    return result
Example 6: reading_level
def reading_level(full_text):
    # Clean the full_text: make sure a space follows every period
    full_text_clean = ""
    for char in full_text:
        if char == ".":
            full_text_clean += ". "
        else:
            full_text_clean += char
    # Language features
    import nltk
    words = nltk.word_tokenize(full_text_clean)
    n_sents = len(nltk.sent_tokenize(full_text_clean))
    n_words = len(nltk.word_tokenize(full_text_clean))
    # Count the syllables
    n_syll = 0
    for word in words:
        n_syll += syllable_count(word)
    # Calculate the reading level
    # https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    grade_level = -15.59 + 0.39*(n_words/n_sents) + 11.8*(n_syll/n_words)
    return round(grade_level, 1)
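As a quick sanity check on the grade-level formula, here is a small worked example with made-up round counts (not output from the function above):

# Suppose a passage has 100 words, 5 sentences and 150 syllables:
# grade = -15.59 + 0.39 * (100 / 5) + 11.8 * (150 / 100)
#       = -15.59 + 7.8 + 17.7
#       = 9.91, which round(grade, 1) reports as 9.9 (roughly a 10th-grade level)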
Example 7: update
def update(self, other):
    """Adds counts for elements in other"""
    if isinstance(other, self.__class__):
        self.n_sents += other.n_sents
        for x, n in other.items():
            self[x] += n
    else:
        for sent in other:
            self.n_sents += 1
            if self.poscache is not None:
                if sent in self.poscache:
                    tags = self.poscache[sent]
                else:
                    self.poscache[sent] = tags = nltk.pos_tag(
                        nltk.word_tokenize(sent))
            else:
                tags = nltk.pos_tag(nltk.word_tokenize(sent))
            for x in tags:
                tok, tag = x
                self[tag] += 1
    if self.normalize:
        for x, n in self.items():
            self[x] /= float(self.n_sents)
def main(question, article):
ddict = {}
counts = get_counts()
for tok in nltk.word_tokenize(article):
ddict[tok] = ddict.get(tok, 0) + 1
vec = []
for tok in nltk.word_tokenize(question):
# count in article
tf = ddict.get(tok, 0)
# total articles is 108 / number that have current token
idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),counts)) + 1)
vec.append(tf*idf)
largest = max(vec)
normalized = map(lambda y: y/largest, vec)
finDic = {}
for word,i in enumerate(nltk.word_tokenize(question)):
finDic[word] = normalized[i]
print finDic
return finDic
Example 9: next_note
def next_note(tokenizer):
    print 'SemEval data'
    for semeval_file in semeval_files:
        print 'File', semeval_file
        with open(semeval_file, 'r') as f:
            st = []
            for line in f:
                st += [line.strip()]
            text = read_visit_sem(st)
            text = tokenizer.tokenize(text)
            for sent in text:
                yield nltk.word_tokenize(sent.lower())
    print 'MIMIC data'
    for notes_file in subset(notes_files, 15):  # 15 random MIMIC files
        print 'File', notes_file
        try:
            with open(notes_file, 'r') as f:
                ct = 0
                st = []
                for line in f:
                    ct += 1
                    if ct % 50000 == 0:
                        print ct
                    if line.strip() == '</VISIT>':
                        text = read_visit(st)
                        text = tokenizer.tokenize(text)
                        for sent in text:
                            yield nltk.word_tokenize(sent.lower())
                        st = []
                    elif line.strip() != '<VISIT>':
                        st += [line.strip()]
        except IOError:
            pass
Example 10: PushDataPair
def PushDataPair(data, database):
    last = len(database['Q'].keys())
    for pair in data:
        database['Q'][last] = nltk.word_tokenize(pair['question'])
        database['A'][last] = nltk.word_tokenize(pair['answer'])
        last += 1
    return database
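A small usage sketch (the question/answer pair and the empty 'Q'/'A' dictionaries below are made up for illustration, following the structure the function expects):

import nltk

database = {'Q': {}, 'A': {}}
data = [{'question': 'What is NLTK?', 'answer': 'A natural language toolkit.'}]
database = PushDataPair(data, database)
# database['Q'][0] -> ['What', 'is', 'NLTK', '?']
# database['A'][0] -> ['A', 'natural', 'language', 'toolkit', '.']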
Example 11: build_s
def build_s(data):
    '''
    Compute the context vector for each lexelt
    :param data: dict with the following structure:
        {
            lexelt: [(instance_id, left_context, head, right_context, sense_id), ...],
            ...
        }
    :return: dict s with the following structure:
        {
            lexelt: [w1, w2, w3, ...],
            ...
        }
    '''
    s = {}
    # implement your code here
    for key, value in data.items():
        for i in value:
            tokens_left = nltk.word_tokenize(i[1])
            tokens_right = nltk.word_tokenize(i[3])
            left = [w for w in tokens_left if w not in string.punctuation][-window_size:]
            right = [w for w in tokens_right if w not in string.punctuation][:window_size]
            context = left + right
            if key not in s:
                s[key] = []
            for word in context:
                if word not in s[key]:
                    s[key].append(word)
    return s
Example 12: paragraph_features
def paragraph_features(paragraph_sents):
    global count
    count += 1
    print '\r', count,
    if FEATURE == FEAT_CONTAINS:
        paragraph_words = set(
            sents_to_words(paragraph_sents)
        )
    elif FEATURE == FEAT_LINKED_TITLES:
        paragraph_words = ' '.join(paragraph_sents)
    elif FEATURE == FEAT_FIRST_SENT:
        paragraph_words = nltk.word_tokenize(
            paragraph_sents[0]
        )
    elif FEATURE == FEAT_BEGIN_SENT:
        paragraph_words = {
            nltk.word_tokenize(sent)[0]
            for sent in paragraph_sents
        }
    else:
        paragraph_words = None
        print 'FEATURE NOT SUPPORTED'
        exit()
    features = dict()
    for word in word_features:
        features[word_features[word]] = (
            word in paragraph_words
        )
    return features
Example 13: synsym
def synsym(s1, s2):
    ts0 = nltk.pos_tag(nltk.word_tokenize(s1))
    ts1 = nltk.pos_tag(nltk.word_tokenize(s2))
    # adjectives
    jj0 = [x for x, y in ts0 if y == 'JJ' or y == 'JJR' or y == 'JJS']
    jj1 = [x for x, y in ts1 if y == 'JJ' or y == 'JJR' or y == 'JJS']
    if len(jj0) == 0 or len(jj1) == 0:
        jjps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        jjps = np.inner(v1, v2)/(LA.norm(v1)*LA.norm(v2))
    # nouns
    jj0 = [x for x, y in ts0 if y == 'NN' or y == 'NNS' or y == 'NNP' or y == 'NNPS' or y == 'DT']
    jj1 = [x for x, y in ts1 if y == 'NN' or y == 'NNS' or y == 'NNP' or y == 'NNPS' or y == 'DT']
    if len(jj0) == 0 or len(jj1) == 0:
        nps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        nps = np.inner(v1, v2)/(LA.norm(v1)*LA.norm(v2))
    # verbs
    jj0 = [x for x, y in ts0 if y == 'VB' or y == 'VBD' or y == 'VBG' or y == 'VBN' or y == 'VBP' or y == 'VBZ']
    jj1 = [x for x, y in ts1 if y == 'VB' or y == 'VBD' or y == 'VBG' or y == 'VBN' or y == 'VBP' or y == 'VBZ']
    if len(jj0) == 0 or len(jj1) == 0:
        vps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        vps = np.inner(v1, v2)/(LA.norm(v1)*LA.norm(v2))
    return [jjps, nps, vps]
Example 14: build_s
def build_s(data):
    """
    Compute the context vector for each lexelt
    :param data: dict with the following structure:
        {
            lexelt: [(instance_id, left_context, head, right_context, sense_id), ...],
            ...
        }
    :return: dict s with the following structure:
        {
            lexelt: [w1, w2, w3, ...],
            ...
        }
    """
    s = {}
    # implement your code here
    for lexelt in data:
        words = set()
        for instance in data[lexelt]:
            left_context = word_tokenize(instance[1].strip())
            for token in left_context[-window_size:]:
                if token not in puncts:
                    words.add(token)
            right_context = word_tokenize(instance[3].strip())
            for token in right_context[:window_size]:
                if token not in puncts:
                    words.add(token)
        s[lexelt] = list(words)
    return s
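A minimal, hedged usage sketch for this variant (window_size and puncts are module-level globals in the original project; the values and the sample data below are assumptions for illustration):

import string
from nltk import word_tokenize

window_size = 2
puncts = set(string.punctuation)

data = {
    'begin.v': [
        ('id1', 'The talks will', 'begin', 'next week , officials said', 'begin_1'),
    ],
}
print(build_s(data))
# {'begin.v': ['talks', 'will', 'next', 'week']}  (list order may vary, since a set is used)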
Example 15: parseFile
def parseFile(file):
    """ Parse the header and source files for the class, and return the bindings dictionary,
    which contains tag data (and other pertinent information about the file)
    """
    bindings = []
    # Load header file
    tokens = []
    if file['header'] != '':
        with open(file['header'], 'r') as f:
            # Tokenize
            for line in f.readlines():
                tokens += nltk.word_tokenize(line)
        # Parse tokens
        bindings += parseTokens(tokens, file, 'header')
    # Load source file
    tokens = []
    if file['source'] != '':
        with open(file['source'], 'r') as f:
            # Tokenize
            for line in f.readlines():
                tokens += nltk.word_tokenize(line)
        # Parse tokens
        bindings += parseTokens(tokens, file, 'source')
    return bindings