This article collects typical usage examples of the Python class nltk.tokenize.TweetTokenizer. If you are wondering what the TweetTokenizer class is for or how to use it, the curated class code examples below may help.
The following shows 15 code examples of the TweetTokenizer class, sorted by popularity by default.
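Before the examples, here is a minimal usage sketch (not taken from any of the snippets below) showing the three constructor flags the examples rely on; the sample tweet text is made up for illustration.

from nltk.tokenize import TweetTokenizer

# preserve_case=False lowercases tokens (emoticons are left untouched),
# reduce_len=True shortens runs of 3+ repeated characters to exactly 3,
# strip_handles=True removes Twitter @handles from the output.
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
print(tknzr.tokenize("@someuser This is waaaaayyyy too much!!!!!! #nltk"))
# hashtags such as '#nltk' are kept as single tokens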
Example 1: load_data_and_labels_gameforum
def load_data_and_labels_gameforum():
    # load
    with open("./input/gameforum-1000.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header
    dataset = [entry for entry in dataset if (entry[1] == '1' or entry[1] == '2' or entry[1] == '3')]
    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[0] for entry in dataset]
    x_text = [clean_str(post) for post in x_text]
    x_text = [tk.tokenize(post) for post in x_text]
    # generate y
    y = [entry[1] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':  # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in gameforum: ' + label
    return [x_text, y]
Example 2: process_tweets
def process_tweets(file_name):
    '''
    Person Responsible: Devin Munger
    + file_name: filename of tweets as returned from API based on query
    Extract text from file; return dataframe with tweet text, id
    '''
    ## Create empty dataframe
    tweet_df = pd.DataFrame(columns=["text", "id"])
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    ## Read each JSON from file
    with open(file_name) as data_file:
        for entry in data_file.readlines():
            tweet = json.loads(entry)
            tweet_id = str(tweet.get("id", ""))
            text = tweet.get("text", "")
            ## Remove links from text
            text = re.sub(r"http\S+", "", text)
            ## Remove twitter keywords (the original called text.replace without assigning the result)
            text = text.replace("RT ", "")
            ## Remove handle, punctuation from tweet text
            text_words = filter(lambda x: x not in string.punctuation, tokenizer.tokenize(text))
            ## Add tweet to dataframe
            tweet_df.loc[len(tweet_df)] = [" ".join(text_words), tweet_id]
    return tweet_df
Example 3: parse
def parse(self, text):
    # Tokenize message
    tokenizer = TweetTokenizer()
    words = tokenizer.tokenize(text)
    retweet_term = 'RT'
    urls = []
    users = []
    hash_tags = []
    for word in words:
        if (word[0] == '@'):
            # user in Twitter
            users.append(word)
        elif (word[0] == '#'):
            # hash tags
            hash_tags.append(word)
        elif (word.find('http:') == 0 or word.find('https:') == 0):
            # url
            urls.append(word)
    for f in urls + users + hash_tags + [retweet_term]:
        if f in words:
            words.remove(f)
    self.words = words
    self.urls = urls
    self.users = users
    self.hash_tags = hash_tags
Example 4: nltk_tokenize
def nltk_tokenize(text):
    tokens = []
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(text)
    return tokens
Example 5: load_csv
def load_csv():
    with open('Tweets.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        count = 1
        reviews = []
        stars = []
        tknzr = TweetTokenizer()
        for row in reader:
            try:
                words = tknzr.tokenize(row['text'])
                label = 'SENT_%s' % count
                #print label
                # TaggedDocument(utils.to_unicode(row['text']).split(), [label])
                # print "label:", label
                #labels = [label]
                #lab_sent = LabeledSentence(words, label)
                #print lab_sent
                #reviews.append(TaggedDocument(utils.to_unicode(row['text']).split(), [label]))
                reviews.append(TaggedDocument(words, [label]))
                stars.append(row['airline_sentiment'])
                count += 1
            except:
                continue
    print "final count:", count
    return reviews, stars
Example 6: getTweetTokens
def getTweetTokens(classification, toRead, info, tags):
    i = 0
    tknzr = TweetTokenizer()
    with open(toRead) as f:
        content = f.readlines()
    c = 0
    for item in content:
        # adapt the list into python dictionary format
        content[c] = item.replace("null", "None")
        content[c] = content[c].replace("false", "False")
        content[c] = content[c].replace("true", "True")
        c += 1
    for i in range(len(content)):
        tweet = eval(content[i])["text"]
        tokenTweet = tknzr.tokenize(tweet)
        j = 0
        k = 0
        while j < (len(tokenTweet) - k):
            #print j
            if tokenTweet[j][0] == "#":
                tokenTweet[j] = tokenTweet[j][1:]
            elif tokenTweet[j][0] == "@":
                del tokenTweet[j]
                j -= 1
                k += 1
            j += 1
        info.append((word_feats(tokenTweet), classification))
Example 7: _tag_text
def _tag_text(self, tweet_text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(tweet_text)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    neList = traverse(entities)
    return neList
Example 8: createDataset
def createDataset(filename, MAX_VOCAB_SIZE):
    yaks = []
    tokenizer = TweetTokenizer()
    ids = set()
    numyaks = 0
    for line in open(filename).readlines():
        stuff = line.split(":::")
        id = stuff[0]
        if len(stuff) > 3 and id not in ids:
            numyaks += 1
            sentence = stuff[3]
            ids.add(id)
            tokens = [START_TOKEN]
            tokens.extend(tokenizer.tokenize(sentence.lower()))
            tokens.append(END_TOKEN)
            yaks.append(tokens)
    token_frequency = nltk.FreqDist(itertools.chain(*yaks))
    vocab = token_frequency.most_common(MAX_VOCAB_SIZE - 1)
    i2t = [token[0] for token in vocab]
    i2t.append(UNKNOWN_TOKEN)
    t2i = dict()
    for i, t in enumerate(i2t):
        t2i[t] = i
    yaks = [[t if t in t2i else UNKNOWN_TOKEN for t in yak] for yak in yaks]
    Xtrain = np.asarray([[t2i[token] for token in yak[:-1]] for yak in yaks])
    Ytrain = np.asarray([[t2i[token] for token in yak[1:]] for yak in yaks])
    print "Num unique Yaks: " + str(numyaks)
    return (Xtrain, Ytrain, i2t, t2i)
Example 9: check
def check():
    check_id = request.args.get("id")
    if check_id is not None:
        check_sentence = Sentence.query.get(check_id)
        if check_sentence is not None:
            Word.query.filter_by(sentence_id=check_id).delete()
            tweet_tokenizer = TweetTokenizer()
            tokens = tweet_tokenizer.tokenize(check_sentence.text)
            for token in tokens:
                url = "http://kateglo.com/api.php?format=json&phrase=" + token
                resp = requests.get(url)
                exist = False
                if (resp.ok):
                    try:
                        resp_json = json.loads(resp.content)
                        exist = True
                    except ValueError:
                        exist = False
                word = Word(check_sentence.id, token, exist)
                db.session.add(word)
                db.session.commit()
    sentences = Sentence.query.all()
    c = ((sentence.id,
          sentence.source,
          sentence.text,
          ((w.word, w.exist,) for w in sentence.words.all()),
          ) for sentence in sentences)
    return render_template('check.html', rows=c)
Example 10: preprocess_db
def preprocess_db():
    tkn = TweetTokenizer()
    photos = pd.read_pickle(r'./data/restaurant_photos_with_labels.pkl')
    img_path = r'./data/restaurant_photos/'
    sentid = 1
    img_list = []
    # Split data in such a way that labels are evenly distributed between 6 folds
    skf = StratifiedKFold(photos['label'], n_folds=6)
    folds = []
    # Initialize all images to train dataset initially
    photos['split'] = ['train' for i in range(len(photos))]
    # Obtain the indices for the test and validation splits and change value appropriately
    for _, test_ix in skf:
        folds.append(test_ix)
    photos.split[folds[0]] = 'test'
    photos.split[folds[1]] = 'val'
    # Obtain the information for each picture and copy it to the appropriate dir. The images are renamed.
    for i, photo_id in enumerate(photos.photo_id):
        img_dict = dict()
        img_dict['sentids'] = [sentid]
        # The original read photo_id.business_id[i], which fails on a string photo id;
        # the business id presumably comes from the photos dataframe.
        img_dict['business_id'] = photos.business_id[i]
        if photos.split[i] in ['train']:
            img_dict['filepath'] = u'train'
            img_dict['imgid'] = 0
            img_dict['split'] = u'train'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/train/' + str(sentid).zfill(6) + '.jpg')
        elif photos.split[i] in ['test']:
            img_dict['filepath'] = u'test'
            img_dict['imgid'] = 0
            img_dict['split'] = u'test'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/test/' + str(sentid).zfill(6) + '.jpg')
        else:
            img_dict['filepath'] = u'val'
            img_dict['imgid'] = 0
            img_dict['split'] = u'val'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/val/' + str(sentid).zfill(6) + '.jpg')
        img_dict['label'] = photos.label[i]
        caption_dict = dict()
        if photos.caption[i]:
            # Tokenize the captions
            caption_dict['tokens'] = tkn.tokenize(photos.caption[i])
            caption_dict['raw'] = photos.caption[i]
        else:
            caption_dict['tokens'] = 'None'
            caption_dict['raw'] = 'None'
        caption_dict['imgid'] = 0
        caption_dict['sentid'] = sentid
        img_dict['sentences'] = [caption_dict]
        img_dict['photoid'] = sentid
        img_dict['yelpid'] = photo_id
        img_list.append(img_dict)
        sentid += 1
    # Store the new dataset as a JSON file
    with open("./data/image_caption_dataset.json", "w") as outfile:
        json.dump(img_list, outfile)
Example 11: main
def main():
    text = sys.stdin.read().decode("utf-8")
    tknzr = TweetTokenizer()
    tok = tknzr.tokenize(text)
    saved_object = construct_dict(tok)
    print json.dumps(saved_object)
Example 12: get_tweet_tags
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
Example 13: load_data_and_labels_sam
def load_data_and_labels_sam():
    # load
    with open("./input/2780_freshmen_tweets.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header
    # filter out tweets with unknown sentiment
    dataset = [entry for entry in dataset if entry[4] != '0']
    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[3] for entry in dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]
    # generate y
    y = [entry[4] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':  # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in sam: ' + label
    return [x_text, y]
Example 14: preprocess_tweets
def preprocess_tweets(event_date, dt=datetime.timedelta(seconds=30),
                      match=None, tweet_processor=None, match_type='home'):
    import collections
    tknzr = TweetTokenizer()
    dbname = match['dbname']
    collname_home = match['collname_home']
    collname_away = match['collname_away']
    home_team = match['home_team']
    away_team = match['away_team']
    if match_type == 'home':
        coll = client[dbname][collname_home]
    else:
        coll = client[dbname][collname_away]
    # add some padding to the start and end times
    date_start = event_date - dt
    date_end = event_date + dt
    query = {"created_at": {"$gt": date_start, "$lt": date_end}}
    results = coll.find(query)
    clean_tweets = []
    for result in results:
        tweet_id = result['id_str']
        tweet_split = tweet_processor.preprocess(result['text'].encode('ascii', 'ignore'))
        parts = tknzr.tokenize(tweet_split)
        clean = [i for i in parts if i not in stop]
        clean_text = " ".join(clean)
        clean_tweets.append((clean_text, tweet_id))
    return clean_tweets
Example 15: format_text
def format_text(entries, LSTM_shape=True):
    THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__)))
    sentences = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    decoded = base64.b64decode(entries)
    decoded = str(decoded)
    decoded = decoded[2:]
    decoded = decoded[:-1]
    decoded = decoded.split(".")
    #print(decoded, "is decoded")
    for entry in decoded:
        token_sentences = tokenizer.tokenize(entry)
        for sentence in token_sentences:
            sentences.append(sentence)
    tokenized_sentences = []
    #remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\'']
    #remove_tokens = string.punctuation
    remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    stop_words = set(stopwords.words('english'))
    tweet_tknzr = TweetTokenizer()
    for sentence in sentences:
        tokens = tweet_tknzr.tokenize(sentence)
        tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens))
        tokenized_sentences.append(tokens)
    all_ngrams1 = np.load(THIS_FOLDER + '/ngrams1.npy').item()
    all_ngrams2 = np.load(THIS_FOLDER + '/ngrams2.npy').item()
    all_ngrams3 = np.load(THIS_FOLDER + '/ngrams3.npy').item()
    # once the model gets updated with good data, ngrams.py needs to get changed/updated too!
    X = np.zeros((len(sentences), len(all_ngrams1) + len(all_ngrams2) + len(all_ngrams3)))
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 1)
        for gram in my_ngrams:
            if gram in all_ngrams1:
                index = all_ngrams1[gram]
                X[i][index] = 1
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 2)
        for gram in my_ngrams:
            if gram in all_ngrams2:
                index = len(all_ngrams1) + all_ngrams2[gram]
                X[i][index] = 1
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 3)
        for gram in my_ngrams:
            if gram in all_ngrams3:
                index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram]
                X[i][index] = 1
    if LSTM_shape:
        X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
    else:
        X = np.reshape(X, (X.shape[0], X.shape[1]))
    return X