This article collects typical usage examples of the Python class nltk.tokenize.TweetTokenizer. If you are wondering what the TweetTokenizer class is for or how to use it, the curated class code examples below may help.
The following shows 15 code examples of the TweetTokenizer class, sorted by popularity by default.
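Before the examples, here is a minimal usage sketch (not taken from any of the snippets below) showing the three constructor flags the examples rely on; the sample tweet text is made up for illustration.

from nltk.tokenize import TweetTokenizer

# preserve_case=False lowercases tokens (emoticons are left untouched),
# reduce_len=True shortens runs of 3+ repeated characters to exactly 3,
# strip_handles=True removes Twitter @handles from the output.
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
print(tknzr.tokenize("@someuser This is waaaaayyyy too much!!!!!! #nltk"))
# hashtags such as '#nltk' are kept as single tokens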
Example 1: load_data_and_labels_gameforum
def load_data_and_labels_gameforum():
    # load
    with open("./input/gameforum-1000.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header
    dataset = [entry for entry in dataset if (entry[1] == '1' or entry[1] == '2' or entry[1] == '3')]
    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[0] for entry in dataset]
    x_text = [clean_str(post) for post in x_text]
    x_text = [tk.tokenize(post) for post in x_text]
    # generate y
    y = [entry[1] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':  # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in gameforum: ' + label
    return [x_text, y]
Example 2: process_tweets
def process_tweets(file_name):
    '''
    Person Responsible: Devin Munger
    + file_name: filename of tweets as returned from API based on query
    Extract text from file; return dataframe with tweet text, id
    '''
    ## Create empty dataframe
    tweet_df = pd.DataFrame(columns=["text", "id"])
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    ## Read each JSON from file
    with open(file_name) as data_file:
        for entry in data_file.readlines():
            tweet = json.loads(entry)
            tweet_id = str(tweet.get("id", ""))
            text = tweet.get("text", "")
            ## Remove links from text
            text = re.sub(r"http\S+", "", text)
            ## Remove twitter keywords (the original called text.replace without assigning the result)
            text = text.replace("RT ", "")
            ## Remove handle, punctuation from tweet text
            text_words = filter(lambda x: x not in string.punctuation, tokenizer.tokenize(text))
            ## Add tweet to dataframe
            tweet_df.loc[len(tweet_df)] = [" ".join(text_words), tweet_id]
    return tweet_df
Example 3: parse
def parse(self, text):
    # Tokenize message
    tokenizer = TweetTokenizer()
    words = tokenizer.tokenize(text)
    retweet_term = 'RT'
    urls = []
    users = []
    hash_tags = []
    for word in words:
        if (word[0] == '@'):
            # user in Twitter
            users.append(word)
        elif (word[0] == '#'):
            # hash tags
            hash_tags.append(word)
        elif (word.find('http:') == 0 or word.find('https:') == 0):
            # url
            urls.append(word)
    for f in urls + users + hash_tags + [retweet_term]:
        if f in words:
            words.remove(f)
    self.words = words
    self.urls = urls
    self.users = users
    self.hash_tags = hash_tags
Example 4: nltk_tokenize
def nltk_tokenize(text):
    tokens = []
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(text)
    return tokens
Example 5: load_csv
def load_csv():
    with open('Tweets.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        count = 1
        reviews = []
        stars = []
        tknzr = TweetTokenizer()
        for row in reader:
            try:
                words = tknzr.tokenize(row['text'])
                label = 'SENT_%s' % count
                #print label
                # TaggedDocument(utils.to_unicode(row['text']).split(), [label])
                # print "label:", label
                #labels = [label]
                #lab_sent = LabeledSentence(words, label)
                #print lab_sent
                #reviews.append(TaggedDocument(utils.to_unicode(row['text']).split(), [label]))
                reviews.append(TaggedDocument(words, [label]))
                stars.append(row['airline_sentiment'])
                count += 1
            except:
                continue
    print "final count:", count
    return reviews, stars
Example 6: getTweetTokens
def getTweetTokens(classification, toRead, info, tags):
    i = 0
    tknzr = TweetTokenizer()
    with open(toRead) as f:
        content = f.readlines()
    c = 0
    for item in content:
        # adapt the list into python dictionary format
        content[c] = item.replace("null", "None")
        content[c] = content[c].replace("false", "False")
        content[c] = content[c].replace("true", "True")
        c += 1
    for i in range(len(content)):
        tweet = eval(content[i])["text"]
        tokenTweet = tknzr.tokenize(tweet)
        j = 0
        k = 0
        while j < (len(tokenTweet) - k):
            #print j
            if tokenTweet[j][0] == "#":
                tokenTweet[j] = tokenTweet[j][1:]
            elif tokenTweet[j][0] == "@":
                del tokenTweet[j]
                j -= 1
                k += 1
            j += 1
        info.append((word_feats(tokenTweet), classification))
Example 7: _tag_text
def _tag_text(self, tweet_text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(tweet_text)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    neList = traverse(entities)
    return neList
Example 8: createDataset
def createDataset(filename, MAX_VOCAB_SIZE):
    yaks = []
    tokenizer = TweetTokenizer()
    ids = set()
    numyaks = 0
    for line in open(filename).readlines():
        stuff = line.split(":::")
        id = stuff[0]
        if len(stuff) > 3 and id not in ids:
            numyaks += 1
            sentence = stuff[3]
            ids.add(id)
            tokens = [START_TOKEN]
            tokens.extend(tokenizer.tokenize(sentence.lower()))
            tokens.append(END_TOKEN)
            yaks.append(tokens)
    token_frequency = nltk.FreqDist(itertools.chain(*yaks))
    vocab = token_frequency.most_common(MAX_VOCAB_SIZE - 1)
    i2t = [token[0] for token in vocab]
    i2t.append(UNKNOWN_TOKEN)
    t2i = dict()
    for i, t in enumerate(i2t):
        t2i[t] = i
    yaks = [[t if t in t2i else UNKNOWN_TOKEN for t in yak] for yak in yaks]
    Xtrain = np.asarray([[t2i[token] for token in yak[:-1]] for yak in yaks])
    Ytrain = np.asarray([[t2i[token] for token in yak[1:]] for yak in yaks])
    print "Num unique Yaks: " + str(numyaks)
    return (Xtrain, Ytrain, i2t, t2i)
Example 9: check
def check():
    check_id = request.args.get("id")
    if check_id is not None:
        check_sentence = Sentence.query.get(check_id)
        if check_sentence is not None:
            Word.query.filter_by(sentence_id=check_id).delete()
            tweet_tokenizer = TweetTokenizer()
            tokens = tweet_tokenizer.tokenize(check_sentence.text)
            for token in tokens:
                url = "http://kateglo.com/api.php?format=json&phrase=" + token
                resp = requests.get(url)
                exist = False
                if (resp.ok):
                    try:
                        resp_json = json.loads(resp.content)
                        exist = True
                    except ValueError:
                        exist = False
                word = Word(check_sentence.id, token, exist)
                db.session.add(word)
                db.session.commit()
    sentences = Sentence.query.all()
    c = ((sentence.id,
          sentence.source,
          sentence.text,
          ((w.word, w.exist,) for w in sentence.words.all()),
          ) for sentence in sentences)
    return render_template('check.html', rows=c)
Example 10: preprocess_db
def preprocess_db():
    tkn = TweetTokenizer()
    photos = pd.read_pickle(r'./data/restaurant_photos_with_labels.pkl')
    img_path = r'./data/restaurant_photos/'
    sentid = 1
    img_list = []
    # Split data in such a way that labels are evenly distributed between 6 folds
    skf = StratifiedKFold(photos['label'], n_folds=6)
    folds = []
    # Initialize all images to train dataset initially
    photos['split'] = ['train' for i in range(len(photos))]
    # Obtain the indices for the test and validation splits and change value appropriately
    for _, test_ix in skf:
        folds.append(test_ix)
    photos.split[folds[0]] = 'test'
    photos.split[folds[1]] = 'val'
    # Obtain the information for each picture and copy it to the appropriate dir. The images are renamed.
    for i, photo_id in enumerate(photos.photo_id):
        img_dict = dict()
        img_dict['sentids'] = [sentid]
        # The original read photo_id.business_id[i], which fails on a string photo id;
        # the business id presumably comes from the photos dataframe.
        img_dict['business_id'] = photos.business_id[i]
        if photos.split[i] in ['train']:
            img_dict['filepath'] = u'train'
            img_dict['imgid'] = 0
            img_dict['split'] = u'train'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/train/' + str(sentid).zfill(6) + '.jpg')
        elif photos.split[i] in ['test']:
            img_dict['filepath'] = u'test'
            img_dict['imgid'] = 0
            img_dict['split'] = u'test'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/test/' + str(sentid).zfill(6) + '.jpg')
        else:
            img_dict['filepath'] = u'val'
            img_dict['imgid'] = 0
            img_dict['split'] = u'val'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/val/' + str(sentid).zfill(6) + '.jpg')
        img_dict['label'] = photos.label[i]
        caption_dict = dict()
        if photos.caption[i]:
            # Tokenize the captions
            caption_dict['tokens'] = tkn.tokenize(photos.caption[i])
            caption_dict['raw'] = photos.caption[i]
        else:
            caption_dict['tokens'] = 'None'
            caption_dict['raw'] = 'None'
        caption_dict['imgid'] = 0
        caption_dict['sentid'] = sentid
        img_dict['sentences'] = [caption_dict]
        img_dict['photoid'] = sentid
        img_dict['yelpid'] = photo_id
        img_list.append(img_dict)
        sentid += 1
    # Store the new dataset as a JSON file
    with open("./data/image_caption_dataset.json", "w") as outfile:
        json.dump(img_list, outfile)
Example 11: main
def main():
    text = sys.stdin.read().decode("utf-8")
    tknzr = TweetTokenizer()
    tok = tknzr.tokenize(text)
    saved_object = construct_dict(tok)
    print json.dumps(saved_object)
Example 12: get_tweet_tags
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
Example 13: load_data_and_labels_sam
def load_data_and_labels_sam():
    # load
    with open("./input/2780_freshmen_tweets.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header
    # filter out tweets with unknown sentiment
    dataset = [entry for entry in dataset if entry[4] != '0']
    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[3] for entry in dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]
    # generate y
    y = [entry[4] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':  # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in sam: ' + label
    return [x_text, y]
Example 14: preprocess_tweets
def preprocess_tweets(event_date, dt=datetime.timedelta(seconds=30),
                      match=None, tweet_processor=None, match_type='home'):
    import collections
    tknzr = TweetTokenizer()
    dbname = match['dbname']
    collname_home = match['collname_home']
    collname_away = match['collname_away']
    home_team = match['home_team']
    away_team = match['away_team']
    if match_type == 'home':
        coll = client[dbname][collname_home]
    else:
        coll = client[dbname][collname_away]
    # add some padding to the start and end times
    date_start = event_date - dt
    date_end = event_date + dt
    query = {"created_at": {"$gt": date_start, "$lt": date_end}}
    results = coll.find(query)
    clean_tweets = []
    for result in results:
        tweet_id = result['id_str']
        tweet_split = tweet_processor.preprocess(result['text'].encode('ascii', 'ignore'))
        parts = tknzr.tokenize(tweet_split)
        clean = [i for i in parts if i not in stop]
        clean_text = " ".join(clean)
        clean_tweets.append((clean_text, tweet_id))
    return clean_tweets
Example 15: format_text
def format_text(entries, LSTM_shape=True):
    THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__)))
    sentences = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    decoded = base64.b64decode(entries)
    decoded = str(decoded)
    decoded = decoded[2:]
    decoded = decoded[:-1]
    decoded = decoded.split(".")
    #print(decoded, "is decoded")
    for entry in decoded:
        token_sentences = tokenizer.tokenize(entry)
        for sentence in token_sentences:
            sentences.append(sentence)
    tokenized_sentences = []
    #remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\'']
    #remove_tokens = string.punctuation
    remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    stop_words = set(stopwords.words('english'))
    tweet_tknzr = TweetTokenizer()
    for sentence in sentences:
        tokens = tweet_tknzr.tokenize(sentence)
        tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens))
        tokenized_sentences.append(tokens)
    all_ngrams1 = np.load(THIS_FOLDER + '/ngrams1.npy').item()
    all_ngrams2 = np.load(THIS_FOLDER + '/ngrams2.npy').item()
    all_ngrams3 = np.load(THIS_FOLDER + '/ngrams3.npy').item()
    # once the model gets updated with good data, ngrams.py needs to get changed/updated too!
    X = np.zeros((len(sentences), len(all_ngrams1) + len(all_ngrams2) + len(all_ngrams3)))
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 1)
        for gram in my_ngrams:
            if gram in all_ngrams1:
                index = all_ngrams1[gram]
                X[i][index] = 1
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 2)
        for gram in my_ngrams:
            if gram in all_ngrams2:
                index = len(all_ngrams1) + all_ngrams2[gram]
                X[i][index] = 1
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 3)
        for gram in my_ngrams:
            if gram in all_ngrams3:
                index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram]
                X[i][index] = 1
    if LSTM_shape:
        X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
    else:
        X = np.reshape(X, (X.shape[0], X.shape[1]))
    return X