This article collects typical usage examples of the nltk.tokenize.word_tokenize function in Python. If you have been wondering what exactly word_tokenize does, how to call it, or what real-world uses look like, the curated code examples below should help.
The following shows 15 code examples of the word_tokenize function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
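Before the examples, here is a minimal sketch of calling word_tokenize directly (it assumes NLTK is installed and the 'punkt' tokenizer models have been downloaded):

from nltk.tokenize import word_tokenize

# word_tokenize splits a string into word and punctuation tokens;
# download the models once with nltk.download('punkt') if needed.
tokens = word_tokenize("NLTK makes tokenization easy, doesn't it?")
print(tokens)  # ['NLTK', 'makes', 'tokenization', 'easy', ',', 'does', "n't", 'it', '?']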
Example 1: createTrainingVectors
# Requires: from collections import Counter, OrderedDict; from nltk.tokenize import word_tokenize
def createTrainingVectors(tokenized_texts_dict):
"""
    Given a dict mapping filenames to their text contents, this method creates the
    training vectors by first building the set of all unique words in the
    training set.
"""
print("Creating vectors for training data")
unique_words = []
    for filename, text in tokenized_texts_dict.items():  # .iteritems() is Python 2 only
# print("Reading {0} and adding to unique word list".format(filename))
unique_words.extend(word_tokenize(text))
unique_words = set(unique_words)
# Creating the initial vector with counts 0 for all training sets
zero_vector = OrderedDict(zip(unique_words, [0] * len(unique_words)))
print("Creating the zero vector")
# For each training file, create an OrderedDict containing its word counts (together with zero counts),
# and store it in a dict, indexed by its corresponding filename
vectors = {}
    for filename, text in tokenized_texts_dict.items():  # .iteritems() is Python 2 only
        current_vector = zero_vector.copy()
        current_vector.update(Counter(word_tokenize(text)))
vectors[filename] = current_vector
return vectors, zero_vector
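A minimal usage sketch for the function above (the two documents are made up for illustration):

docs = {
    'doc1.txt': "the cat sat on the mat",
    'doc2.txt': "the dog chased the cat",
}
vectors, zero_vector = createTrainingVectors(docs)
print(vectors['doc2.txt']['cat'])  # 1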
Example 2: max_similarity
def max_similarity(context_sentence, ambiguous_word, option="path",
lemma=True, context_is_lemmatized=False, pos=None, best=True):
"""
Perform WSD by maximizing the sum of maximum similarity between possible
synsets of all words in the context sentence and the possible synsets of the
ambiguous words (see http://goo.gl/XMq2BI):
{argmax}_{synset(a)}(\sum_{i}^{n}{{max}_{synset(i)}(sim(i,a))}
"""
ambiguous_word = lemmatize(ambiguous_word)
# If ambiguous word not in WordNet return None
if not wn.synsets(ambiguous_word):
return None
if context_is_lemmatized:
context_sentence = word_tokenize(context_sentence)
else:
context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
result = {}
for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):  # newer NLTK: Synset.pos() is a method
                continue
        except TypeError:  # older NLTK: Synset.pos is a plain attribute
            if pos and pos != str(i.pos):
                continue
result[i] = sum(max([sim(i,k,option) for k in wn.synsets(j)]+[0]) \
for j in context_sentence)
if option in ["res","resnik"]: # lower score = more similar
result = sorted([(v,k) for k,v in result.items()])
else: # higher score = more similar
result = sorted([(v,k) for k,v in result.items()],reverse=True)
    if best:
        return result[0][1]
return result
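A minimal usage sketch for the function above (it assumes the lemmatize and sim helpers it relies on are defined in the same module, as in pywsd):

sent = "I went to the bank to deposit my money"
best_synset = max_similarity(sent, "bank", option="path", pos="n")
print(best_synset)  # expected to be a WordNet Synset, e.g. Synset('depository_financial_institution.n.01')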
Example 3: main
def main():
# Load up txt files
speech_file = open('trump-speeches/speeches.txt').read()
tweets = json.load(open('trump_tweets.json'))
tweet_list = []
for tweet in tweets:
tweet_list.append(tweet['text'])
tweet_list = ' '.join(tweet_list)
# Tokenize
logging.info('Formatting training text')
speech_token = word_tokenize(speech_file)
tweet_token = word_tokenize(tweet_list)
# Train trigram models
logging.info('Setting up models')
speech_gram, speech_format = ngram(speech_token, 3)
tweet_gram, tweet_format = ngram(tweet_token, 3)
# Generate responses
cont = True
while cont:
response = input("Hello sir, what can I Trumpinate for you?: ")
num_words = input("And how many words should I write?: ")
# Print Phrases
gen_phrase(speech_gram, int(num_words), starter_word=[response])
print('')
gen_phrase(tweet_gram, int(num_words), starter_word=[response])
more = input("Would you like to generate more? (Yes, No): ")
if more != 'Yes':
cont = False
Example 4: getBigramBeginWithNotCount
def getBigramBeginWithNotCount(sent):
negative_keywords = ["bad", "sad", "don't", "could not", "crappy", "unfortunately", "remove", "why", "poor",
"bothersome", "terrible", "although", "complaints", "outrageous", "isn't", "poorly",
"drawback", "annoying", "against", "irritating", "wouldn't", "won't", "wasn't", "couldn't",
"awful", "didn't", "hasn't", "difficult", "hate", "incorrect", "junk", "trash", "removed",
"complain", "complained", "hated", "negative"]
    bigramPositiveCount = 0
'''
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
for bigram in nltk.bigrams(word_tokenize(sent)):
if bigram[0].lower() == "not" and bigram[1].lower() in negative_keywords:
print sent
print bigram
print unigram_tagger.tag(word_tokenize(sent))
bigramNotCount += 1
'''
    tokens = word_tokenize(sent)  # tokenize once instead of re-tokenizing on every access
    for i, word in enumerate(tokens):
        if word.lower() == "not":
            if i + 1 < len(tokens) and tokens[i + 1] in negative_keywords:  # e.g. NOT bad
                bigramPositiveCount += 1
            elif i + 2 < len(tokens) and tokens[i + 2] in negative_keywords:  # e.g. NOT too bad
                bigramPositiveCount += 1
            else:  # e.g. NOT good
                bigramPositiveCount -= 1
    return bigramPositiveCount
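A quick sanity check for the function above (hypothetical sentences matching the comment cases in the code):

print(getBigramBeginWithNotCount("The camera is not bad at all"))    # "not bad"  -> 1
print(getBigramBeginWithNotCount("The battery is not good enough"))  # "not good" -> -1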
Example 5: test
def test(testAccents, testNoAccents, dictnoAccents):
count = 0
correct = 0
notWord = []
result = []
incorrect = {}
wordCount = 0
nonWordCount = 0
for i in range(len(testAccents)):
sent = ""
sentenceAccents = testAccents[i]
sentenceNoAccents = testNoAccents[i]
tokensAccents = word_tokenize(sentenceAccents)
tokensNoAccents = word_tokenize(sentenceNoAccents)
if len(tokensAccents) == len(tokensNoAccents):
for j in range(len(tokensAccents)):
tA = tokensAccents[j]
tNA = tokensNoAccents[j]
if tNA not in punctuation and not tNA.isdigit():
wordCount +=1
if tNA in dictnoAccents.keys():
newToken = max(dictnoAccents[tNA], key=dictnoAccents[tNA].get)
#print(newToken)
#print("YES")
else:
newToken = tNA
if newToken == tA:
correct +=1
else:
incorrect[newToken] = tA
# print(newToken)
# print(tA)
count +=1
#print("HI")
if j != 0:
newToken = " " + newToken
else:
nonWordCount +=1
notWord.append(tNA)
newToken = tNA
sent = sent + newToken
result.append(sent)
print("Le nombre de mot dans le corpus: " + str(wordCount) )
print("Le nombre de ponctuation et de nombres dans le corpus: " + str(nonWordCount))
print("Nombre au total de changements/non changements possibles " + str(count ))
print("Nombre au total de decisions correctes " + str(correct))
print("Accuracy: " + str(correct/count) )
return([incorrect,correct/count, wordCount, nonWordCount])
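To make the expected data shapes concrete, here is a tiny hypothetical call; dictnoAccents maps each unaccented form to a dict of accented candidates and their counts (the function also assumes punctuation, e.g. string.punctuation, is in scope):

test_accents = ["le café est chaud"]
test_no_accents = ["le cafe est chaud"]
dict_no_accents = {"le": {"le": 10}, "cafe": {"café": 3, "cafe": 1},
                   "est": {"est": 8}, "chaud": {"chaud": 2}}
incorrect, accuracy, n_words, n_non_words = test(test_accents, test_no_accents, dict_no_accents)
print(accuracy)  # 1.0 on this toy input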
Example 6: load_data
def load_data(loc='./data/'):
"""
Load MSRP dataset
"""
trainloc = os.path.join(loc, 'msr_paraphrase_train.txt')
testloc = os.path.join(loc, 'msr_paraphrase_test.txt')
trainA, trainB, testA, testB = [],[],[],[]
trainS, devS, testS = [],[],[]
    f = open(trainloc, 'r', encoding='utf-8')  # text mode; 'rb' would yield bytes under Python 3
for line in f:
text = line.strip().split('\t')
trainA.append(' '.join(word_tokenize(text[3])))
trainB.append(' '.join(word_tokenize(text[4])))
trainS.append(text[0])
f.close()
    f = open(testloc, 'r', encoding='utf-8')  # text mode; 'rb' would yield bytes under Python 3
for line in f:
text = line.strip().split('\t')
testA.append(' '.join(word_tokenize(text[3])))
testB.append(' '.join(word_tokenize(text[4])))
testS.append(text[0])
f.close()
trainS = [int(s) for s in trainS[1:]]
testS = [int(s) for s in testS[1:]]
return [trainA[1:], trainB[1:]], [testA[1:], testB[1:]], [trainS, testS]
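A usage sketch showing the nested return structure (assuming the MSRP files live under ./data/):

train_pairs, test_pairs, (train_labels, test_labels) = load_data('./data/')
trainA, trainB = train_pairs  # parallel lists of whitespace-joined, tokenized sentences
print(len(trainA), train_labels[0])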
Example 7: tokenize
def tokenize(s, stem=True, digit=False, stop=True, use_re=False):
"""
:type s: str
:type stem: bool
:type use_re: bool
:rtype: set(str)
"""
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')
wordnet = WordNetLemmatizer()
    # In Python 3, str.translate takes a single table built with str.maketrans
    # (the two-argument Python 2 form with string.maketrans no longer exists).
    if use_re:
        s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)  # split CamelCase words
    if digit:
        table = str.maketrans('', '', string.punctuation + string.digits)
    else:
        table = str.maketrans('', '', string.punctuation)
    # unify_units is assumed to be a helper defined elsewhere in this module
    tokens = set(word_tokenize(unify_units(s).translate(table)))
if stop:
tokens = set(word for word in tokens if word not in stop_words)
if stem:
tokens = set(stemmer.stem(word) for word in tokens)
return tokens
Example 8: clean_raw_txt
def clean_raw_txt(body, headline, punct_dct=None, stopwrds_set=None):
"""Clean the body and headline to remove punctuation, stopwords, etc.
Args:
----
body: str
headline: str
punct_dct (optional): dict
Translation dict resulting from a `str.maketrans()` call
        stopwrds_set (optional): set
Return:
------
        (body_wrds, headline_wrds): tuple
"""
if punct_dct:
body = body.translate(punct_dct)
headline = headline.translate(punct_dct)
body_wrds = word_tokenize(body)
headline_wrds = word_tokenize(headline)
stopwrds_set = set() if stopwrds_set is None else stopwrds_set
body_wrds = [wrd.lower() for wrd in body_wrds if wrd.lower() not in stopwrds_set]
headline_wrds = [wrd.lower() for wrd in headline_wrds if wrd.lower() not in stopwrds_set]
return (body_wrds, headline_wrds)
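A minimal usage sketch (the punctuation table and stopword set are built here just for illustration):

import string
from nltk.corpus import stopwords

punct_dct = str.maketrans('', '', string.punctuation)
stop_set = set(stopwords.words('english'))
body_wrds, headline_wrds = clean_raw_txt("The markets rallied sharply today.",
                                         "Markets rally!",
                                         punct_dct=punct_dct,
                                         stopwrds_set=stop_set)
print(headline_wrds)  # ['markets', 'rally']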
Example 9: obtaindata
def obtaindata(pos_file,neg_file):
##read the input files
short_pos = open(pos_file, "r").read()
short_neg = open(neg_file, "r").read()
    documents = []  # a list of (review line, label) tuples, where label is "pos" or "neg"
for r in short_pos.split('\n'):
documents.append((r, "pos"))
for r in short_neg.split('\n'):
documents.append((r, "neg"))
    all_words = []  # every word from both corpora combined (duplicates included)
short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)
for w in short_pos_words:
all_words.append(w.lower())
for w in short_neg_words:
all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
    word_features = [w for w, _ in all_words.most_common(5000)]  # the 5000 most frequent words, used as features
featuresets = [(find_features(rev,word_features), category) for (rev, category) in documents]
random.shuffle(featuresets)
return featuresets
Example 10: load_samples
def load_samples(question, prop_labels):
samples = []
q = word_tokenize(question)
for label in prop_labels:
text = word_tokenize(label.lower())
samples.append({'qtext': ' '.join(q), 'label': 0, 'atext': ' '.join(text)})
return samples
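A small illustration of the expected input and output shapes (hypothetical question and property labels):

samples = load_samples("Who wrote Hamlet ?", ["author", "date of birth"])
print(samples[0])  # {'qtext': 'Who wrote Hamlet ?', 'label': 0, 'atext': 'author'}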
Example 11: _doc2vec_doc_stream
def _doc2vec_doc_stream(paths, n, sentences=True):
"""
    Generator that feeds sentences to the doc2vec model.
"""
phrases = Bigram()
i = 0
p = Progress()
for path in paths:
with open(path, 'r') as f:
for line in f:
i += 1
p.print_progress(i/n)
# We do minimal pre-processing here so the model can learn
# punctuation
line = line.lower()
if sentences:
for sent in sent_tokenize(line):
tokens = word_tokenize(sent)
yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i)])
else:
tokens = word_tokenize(line)
yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i)])
Example 12: load_sick2014
def load_sick2014(dsfile, mode='relatedness'):
""" load a dataset in the sick2014 tsv .txt format;
mode='relatedness': use the sts relatedness score as label
mode='entailment': use -1 (contr.), 0 (neutral), 1 (ent.) as label """
s0 = []
s1 = []
labels = []
with open(dsfile) as f:
first = True
for line in f:
if first:
# skip first line with header
first = False
continue
line = line.rstrip()
pair_ID, sentence_A, sentence_B, relatedness_score, entailment_judgement = line.split('\t')
if mode == 'relatedness':
label = float(relatedness_score)
elif mode == 'entailment':
if entailment_judgement == 'CONTRADICTION':
label = -1
elif entailment_judgement == 'NEUTRAL':
label = 0
elif entailment_judgement == 'ENTAILMENT':
label = +1
else:
raise ValueError('invalid label on line: %s' % (line,))
else:
raise ValueError('invalid mode: %s' % (mode,))
labels.append(label)
s0.append(word_tokenize(sentence_A))
s1.append(word_tokenize(sentence_B))
return (s0, s1, np.array(labels))
Example 13: load_anssel
def load_anssel(dsfile, subsample0=3):
""" load a dataset in the anssel csv format;
subsample0=N denotes that only every N-th 0-labelled sample
should be loaded; so e.g. N=3 reduces 80k negatives to 28k
negatives in the training set (vs. 4k positives); N=10k
gets you just 8k negatives, etc. """
s0 = []
s1 = []
labels = []
i = 0
with open(dsfile) as f:
c = csv.DictReader(f)
for l in c:
label = int(l['label'])
if label == 0 and (i % subsample0) != 0:
i += 1
continue
labels.append(label)
try:
qtext = l['qtext'].decode('utf8')
atext = l['atext'].decode('utf8')
except AttributeError: # python3 has no .decode()
qtext = l['qtext']
atext = l['atext']
s0.append(word_tokenize(qtext))
s1.append(word_tokenize(atext))
i += 1
return (s0, s1, np.array(labels))
Example 14: testing
def testing():
# - tokenize on sentence and word
ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
print(sent_tokenize(ex_txt))
print(word_tokenize(ex_txt, language='english'))
# - stop words (pre-defined by nltk)
stop_words = set(stopwords.words('english'))
print(stop_words)
words = word_tokenize(ex_txt)
print(words)
filtered_sent = []
for w in words:
if w not in stop_words:
filtered_sent.append(w)
print(filtered_sent)
filtered_sent = [w for w in words if not w in stop_words]
print(filtered_sent)
# - stemming
ps = PorterStemmer()
    example_words = ['python', 'pythoner', 'pythoning', 'pythoned', 'pythonly']
# for w in example_words:
# print(ps.stem(w))
new_text = "it is very important to be pothonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)
for w in words:
print(ps.stem(w))
Example 15: __init__
def __init__(self, txt_type: str, txt: str):
self.txt_type = txt_type
        if txt_type == "paragraph":  # 'is' compares object identity; use == for string comparison
self.sentences = [word_tokenize(w) for w in sent_tokenize(txt)]
else:
self.title = word_tokenize(txt)