This article collects typical usage examples of the Python method nltk.tokenize.TweetTokenizer.tokenize. If you are wondering what TweetTokenizer.tokenize does, how to call it, or where to find usage examples, the curated code samples below may help. You can also read further about its containing class, nltk.tokenize.TweetTokenizer.
The following 15 code examples of TweetTokenizer.tokenize are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
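Before the examples, here is a minimal, self-contained sketch of how TweetTokenizer.tokenize behaves with its most common constructor options. The sample tweet text is made up purely for illustration, and the expected output shown in the comment is approximate.

from nltk.tokenize import TweetTokenizer

# Hypothetical sample tweet, used only to illustrate the tokenizer's behaviour.
tweet = "@user Loooove this!!! :) Check it out: https://example.com #nlp"

# reduce_len collapses runs of 3+ repeated characters ("Loooove" -> "looove"),
# strip_handles removes @mentions, and preserve_case=False lowercases plain words
# while keeping emoticons, URLs, and hashtags as single tokens.
tknzr = TweetTokenizer(reduce_len=True, strip_handles=True, preserve_case=False)
print(tknzr.tokenize(tweet))
# Roughly: ['looove', 'this', '!', '!', '!', ':)', 'check', 'it', 'out', ':',
#           'https://example.com', '#nlp']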
Example 1: get_utterances
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def get_utterances(utterances, line, category, wgram, cgram):
    tknzr = TweetTokenizer()
    gram_list = []
    # WORD GRAMS
    if wgram == 1:  # unigram
        wgram_list = tknzr.tokenize(line)
    elif wgram == 2:  # uni + bigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        finder = BigramCollocationFinder.from_words(tokens)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, score in scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list
    elif wgram == 3:  # uni + bi + trigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        bi_finder = BigramCollocationFinder.from_words(tokens)
        bi_scored = bi_finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, biscore in bi_scored)
        # trigram list
        tri_finder = TrigramCollocationFinder.from_words(tokens)
        tri_scored = tri_finder.score_ngrams(trigram_measures.raw_freq)
        trigram_list = sorted(trigram for trigram, triscore in tri_scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list + trigram_list
    # CHAR GRAMS
    cgram_list = []
    if cgram == 1:  # uni-chargram
        cgram_list = [line[i:i+1] for i in range(len(line)-1)]
    elif cgram == 2:  # bi-chargram
        cgram_list = [line[i:i+2] for i in range(len(line)-1)]
    elif cgram == 3:  # tri-chargram
        cgram_list = [line[i:i+3] for i in range(len(line)-1)]
    # RESULT
    if category == 'QA':  # non-task
        utterances.append((wgram_list + cgram_list, 0))
    elif category == 'Shopping':  # task
        utterances.append((wgram_list + cgram_list, 1))
    elif category == 'Travel':  # task
        utterances.append((wgram_list + cgram_list, 2))
    elif category == 'Hotel':  # task
        utterances.append((wgram_list + cgram_list, 3))
    elif category == 'Food':  # task
        utterances.append((wgram_list + cgram_list, 4))
    elif category == 'Art':  # task
        utterances.append((wgram_list + cgram_list, 5))
    elif category == 'Weather':  # task
        utterances.append((wgram_list + cgram_list, 6))
    elif category == 'Friends':  # task
        utterances.append((wgram_list + cgram_list, 7))
    elif category == 'Chat':  # chat
        utterances.append((wgram_list + cgram_list, 8))
    else:
        print category, "ERROR"
Example 2: load_data_and_labels_semeval
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def load_data_and_labels_semeval():
    # load the entire semeval dataset
    old_dataset = list(open("./input/2013-dev"))
    old_dataset.extend(list(open("./input/2013-devtest")))
    old_dataset.extend(list(open("./input/2013-train")))
    old_dataset.extend(list(open("./input/2014-devtest")))
    new_dataset = list(open("./input/2016-train"))
    new_dataset.extend(list(open("./input/2016-dev")))
    new_dataset.extend(list(open("./input/2016-devtest")))
    # filter out invalid tweets from new dataset
    new_dataset = [entry for entry in new_dataset if entry.split('\t')[2] != 'Not Available\n']
    # generate x from old
    tk = TweetTokenizer(reduce_len=True)  # handles punctuations
    x_text = [entry.split('\t')[3] for entry in old_dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]
    # generate x from new
    x_text_new = [entry.split('\t')[2] for entry in new_dataset]
    x_text_new = [clean_str(tweet) for tweet in x_text_new]
    x_text_new = [tk.tokenize(tweet) for tweet in x_text_new]
    # concat x and x_new
    x_text.extend(x_text_new)
    # generate y from old
    y = [entry.split('\t')[2] for entry in old_dataset]
    for idx, label in enumerate(y):
        if label == 'positive':
            y[idx] = [1, 0, 0]
        elif label == 'neutral':
            y[idx] = [0, 1, 0]
        elif label == 'negative':
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in semeval: ' + label
    # generate y from new
    y_new = [entry.split('\t')[1] for entry in new_dataset]
    for idx, label in enumerate(y_new):
        if label == 'positive':
            y_new[idx] = [1, 0, 0]
        elif label == 'neutral':
            y_new[idx] = [0, 1, 0]
        elif label == 'negative':
            y_new[idx] = [0, 0, 1]
        else:
            print 'wrong label in semeval: ' + label
    # concat y and y_new
    y.extend(y_new)
    return [x_text, y]
Example 3: custom_tokenizer
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def custom_tokenizer(text, bigrams=None):
    chunks = text.split('-')
    tokenizer = TweetTokenizer(reduce_len=True, preserve_case=False)
    tokens = tokenizer.tokenize(text)
    tokens = [subchunk for chunk in chunks for subchunk in tokenizer.tokenize(chunk)]
    tokens = [token for token in tokens if token.isalpha()]
    if bigrams:
        tokens = mwe_tokenize(tokens, bigrams)
    stemmer = SnowballStemmer('english', ignore_stopwords=True)
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens
Example 4: getVocab
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def getVocab():
    freq = []
    vocab = []
    length = 0
    tknzr = TweetTokenizer()
    with open(path+'/data/training/training_stances.csv', 'r', encoding='UTF-8') as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        first = 1
        for row in csvReader:
            if first == 1:
                first = 0
            else:
                headline = row[0]
                tokens = tknzr.tokenize(headline)
                tokens = [token.lower() for token in tokens if (token.isalpha() and token not in stop_words)]
                #for word in r.split(headline):
                length = length + len(tokens)
                for word in tokens:
                    if word not in vocab:
                        vocab.append(word)
                        freq.append(1)
                    else:
                        ind = vocab.index(word)
                        freq[ind] = freq[ind] + 1
    with open(path+'/data/training/train_bodies.csv', 'r', encoding='UTF-8') as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        first = 1
        for row in csvReader:
            if first == 1:
                first = 0
            else:
                body = row[1]
                tokens = tknzr.tokenize(body)
                tokens = [token.lower() for token in tokens if (token.isalpha() and token not in stop_words)]
                length = length + len(tokens)
                #for word in r.split(headline):
                for word in tokens:
                    if word not in vocab:
                        vocab.append(word)
                        freq.append(1)
                    else:
                        ind = vocab.index(word)
                        freq[ind] = freq[ind] + 1
    return vocab, freq, length

#vocab list
#vocab, freq, length = getVocab()
Example 5: get_classifier
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def get_classifier(featx):
    tokenizer = TweetTokenizer()
    print "Training Classifier..."
    negstr = [obj["text"] for obj in handle.negative_tweets.find()]
    posstr = [obj["text"] for obj in handle.positive_tweets.find()]
    negfeats = [(featx(tokenizer.tokenize(Twitter.process_tweet(negstr[i]))), 'neg')
                for i in range(0, len(negstr)-1)]
    posfeats = [(featx(tokenizer.tokenize(Twitter.process_tweet(posstr[i]))), 'pos')
                for i in range(0, len(posstr)-1)]
    trainfeats = negfeats + posfeats
    classifier = NaiveBayesClassifier.train(trainfeats)
    return classifier
Example 6: get_features
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def get_features(utterances, ngram, classify_method):
    features = []
    tknzr = TweetTokenizer()
    for utt in utterances:
        utt_content = utt[0]  # text content of the utterance
        utt_category = utt[1]
        if ngram:  # use bow & ngram as feature
            # bow list
            bow_list = tknzr.tokenize(utt_content)
            # cgram list
            uni_cgram_list = [utt_content[i:i+1] for i in range(len(utt_content)-1)]
            bi_cgram_list = [utt_content[i:i+2] for i in range(len(utt_content)-1)]
            tri_cgram_list = [utt_content[i:i+3] for i in range(len(utt_content)-1)]
            feature_list = bow_list         # add bow tokens
            feature_list += uni_cgram_list  # add unigram character lists
            feature_list += bi_cgram_list   # add bigram character lists
            feature_list += tri_cgram_list  # add trigram character lists
        else:  # only use bow as feature
            feature_list = tknzr.tokenize(utt_content)
        if classify_method == 'binary':
            if utt_category == 'QA':  # non-task
                features.append((feature_list, 0))
            else:  # task
                features.append((feature_list, 1))
        elif classify_method == 'multi':
            if utt_category == 'QA':  # non-task
                features.append((feature_list, 0))
            elif utt_category == 'Shopping':  # task
                features.append((feature_list, 1))
            elif utt_category == 'Travel':  # task
                features.append((feature_list, 2))
            elif utt_category == 'Hotel':  # task
                features.append((feature_list, 3))
            elif utt_category == 'Food':  # task
                features.append((feature_list, 4))
            elif utt_category == 'Art':  # task
                features.append((feature_list, 5))
            elif utt_category == 'Weather':  # task
                features.append((feature_list, 6))
            elif utt_category == 'Friends':  # task
                features.append((feature_list, 7))
            elif utt_category == 'Chat':  # chat
                features.append((feature_list, 8))
            else:
                print utt_category, "ERROR"
    return features
Example 7: get_test
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def get_test(infile, NUM_TEST):
    with codecs.open(infile, 'rb') as csvfile:
        test = []
        pos_tweets = 0
        neg_tweets = 0
        reader = csv.reader(csvfile)
        tokenizer = TweetTokenizer(preserve_case=True)
        for line in reader:
            if line[0] == "0":
                sent = "Negative"
                neg_tweets += 1
                if neg_tweets < NUM_TEST:
                    text = tokenizer.tokenize(line[5].decode("utf-8"))
                    for i, token in enumerate(text):
                        text[i] = re.sub("@[\S]+", "USERNAME", text[i])
                        text[i] = re.sub("www.[\S]+|https://[\S]+", "URL", text[i])
                        newstr = ""
                        for ch in text[i]:
                            if ord(ch) > 128:
                                newstr += "EMOJI_{0}".format(ord(ch))
                                #print [ch], ord(ch)
                            else:
                                newstr += ch
                        text[i] = newstr
                    test.append((text, sent))
            if line[0] == "4":
                sent = "Positive"
                pos_tweets += 1
                if pos_tweets < NUM_TEST:
                    text = tokenizer.tokenize(line[5].decode("utf-8"))
                    for i, token in enumerate(text):
                        text[i] = re.sub("@[\S]+", "USERNAME", text[i])
                        text[i] = re.sub("www.[\S]+|https://[\S]+", "URL", text[i])
                        newstr = ""
                        for ch in text[i]:
                            if ord(ch) > 128:
                                newstr += "EMOJI_{0}".format(ord(ch))
                                #print [ch], ord(ch)
                            else:
                                newstr += ch
                        text[i] = newstr
                    test.append((text, sent))
    return test
Example 8: _get_nouns
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def _get_nouns(tweet_text):
    """
    Args:
        tweet_text: raw text of a tweet.

    Returns:
        A list of tokens tagged as nouns (NN or NNP) in the tweet.
    """
    tokenizer = TweetTokenizer()
    tokenizer.tokenize(tweet_text)
    nouns = []
    tag = pos_tag(tokenizer.tokenize(tweet_text))
    nouns.extend([t[0] for t in tag if t[1] == 'NN' or t[1] == 'NNP'])
    return nouns
Example 9: get_diff
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def get_diff(query, event_name):
    tknzr = TweetTokenizer()
    query_strip = tknzr.tokenize(query)
    name_strip = tknzr.tokenize(event_name)
    ratio = 0
    for word in query_strip:
        for word2 in name_strip:
            r = difflib.SequenceMatcher(None, word, word2).ratio()
            rrr = r*r*r
            ratio += rrr
    if ratio >= len(query_strip):
        # doesn't work for some reason
        print ratio, len(name_strip)
        ratio = 100
    return ratio
Example 10: format_text
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def format_text(entries, LSTM_shape=True):
    THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__)))
    sentences = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    decoded = base64.b64decode(entries)
    decoded = str(decoded)
    decoded = decoded[2:]
    decoded = decoded[:-1]
    decoded = decoded.split(".")
    #print(decoded, "is decoded")
    for entry in decoded:
        token_sentences = tokenizer.tokenize(entry)
        for sentence in token_sentences:
            sentences.append(sentence)
    tokenized_sentences = []
    #remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\'']
    #remove_tokens = string.punctuation
    remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    stop_words = set(stopwords.words('english'))
    tweet_tknzr = TweetTokenizer()
    for sentence in sentences:
        tokens = tweet_tknzr.tokenize(sentence)
        tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens))
        tokenized_sentences.append(tokens)
    all_ngrams1 = np.load(THIS_FOLDER+'/ngrams1.npy').item()
    all_ngrams2 = np.load(THIS_FOLDER+'/ngrams2.npy').item()
    all_ngrams3 = np.load(THIS_FOLDER+'/ngrams3.npy').item()
    #once the model gets updated with good data, ngrams.py needs to get changed/updated too!
    X = np.zeros((len(sentences), len(all_ngrams1)+len(all_ngrams2)+len(all_ngrams3)))
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 1)
        for gram in my_ngrams:
            if gram in all_ngrams1:
                index = all_ngrams1[gram]
                X[i][index] = 1
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 2)
        for gram in my_ngrams:
            if gram in all_ngrams2:
                index = len(all_ngrams1) + all_ngrams2[gram]
                X[i][index] = 1
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 3)
        for gram in my_ngrams:
            if gram in all_ngrams3:
                index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram]
                X[i][index] = 1
    if LSTM_shape:
        X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
    else:
        X = np.reshape(X, (X.shape[0], X.shape[1]))
    return X
Example 11: main
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def main():
    text = sys.stdin.read().decode("utf-8")
    tknzr = TweetTokenizer()
    tok = tknzr.tokenize(text)
    saved_object = construct_dict(tok)
    print json.dumps(saved_object)
Example 12: load_data_and_labels_sam
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def load_data_and_labels_sam():
    # load
    with open("./input/2780_freshmen_tweets.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header
    # filter out tweets with unknown sentiment
    dataset = [entry for entry in dataset if entry[4] != '0']
    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[3] for entry in dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]
    # generate y
    y = [entry[4] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':  # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in sam: ' + label
    return [x_text, y]
Example 13: load_data_and_labels_gameforum
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def load_data_and_labels_gameforum():
    # load
    with open("./input/gameforum-1000.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header
    dataset = [entry for entry in dataset if (entry[1] == '1' or entry[1] == '2' or entry[1] == '3')]
    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[0] for entry in dataset]
    x_text = [clean_str(post) for post in x_text]
    x_text = [tk.tokenize(post) for post in x_text]
    # generate y
    y = [entry[1] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':  # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in gameforum: ' + label
    return [x_text, y]
Example 14: load_tweetkeywords
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def load_tweetkeywords():
    """
    Check and see which keywords are used in each tweet, and load the association
    table linking tweets and keywords
    """
    # TweetKeyword.query.delete()
    tweets = Tweet.query.all()
    keyword_query = Keyword.query.all()
    keywords = []
    [keywords.append(word.keyword) for word in keyword_query]
    tknzr = TweetTokenizer()
    for tweet in tweets:
        tokenized_tweets = tknzr.tokenize(tweet.text)
        for token in tokenized_tweets:
            if token in keywords:
                tweet_id = Tweet.query.filter(Tweet.tweet_id == tweet.tweet_id).one()
                keyword_id = Keyword.query.filter(Keyword.keyword == token).one()
                tweet_keyword = TweetKeyword(keyword_id=keyword_id.keyword_id, tweet_id=tweet_id.tweet_id)
                print "Added to TweetKeyword table: {}".format(tweet_keyword.keyword_id)
                db.session.add(tweet_keyword)
    db.session.commit()
Example 15: createDataset
# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def createDataset(filename, MAX_VOCAB_SIZE):
    yaks = []
    tokenizer = TweetTokenizer()
    ids = set()
    numyaks = 0
    for line in open(filename).readlines():
        stuff = line.split(":::")
        id = stuff[0]
        if len(stuff) > 3 and id not in ids:
            numyaks += 1
            sentence = stuff[3]
            ids.add(id)
            tokens = [START_TOKEN]
            tokens.extend(tokenizer.tokenize(sentence.lower()))
            tokens.append(END_TOKEN)
            yaks.append(tokens)
    token_frequency = nltk.FreqDist(itertools.chain(*yaks))
    vocab = token_frequency.most_common(MAX_VOCAB_SIZE-1)
    i2t = [token[0] for token in vocab]
    i2t.append(UNKNOWN_TOKEN)
    t2i = dict()
    for i, t in enumerate(i2t):
        t2i[t] = i
    yaks = [[t if t in t2i else UNKNOWN_TOKEN for t in yak] for yak in yaks]
    Xtrain = np.asarray([[t2i[token] for token in yak[:-1]] for yak in yaks])
    Ytrain = np.asarray([[t2i[token] for token in yak[1:]] for yak in yaks])
    print "Num unique Yaks: " + str(numyaks)
    return (Xtrain, Ytrain, i2t, t2i)