

Python TweetTokenizer.tokenize Method Code Examples

This article collects typical usage examples of the Python method nltk.tokenize.TweetTokenizer.tokenize: what the method does, how it is called, and what real-world code that uses it looks like. The curated examples below should help; you can also explore further usage examples of its containing class, nltk.tokenize.TweetTokenizer.


The following presents 15 code examples of the TweetTokenizer.tokenize method, sorted by popularity by default.
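
Before the collected examples, here is a minimal, self-contained sketch of the basic call pattern. The sample tweet and the constructor flags shown (preserve_case, reduce_len, strip_handles) are illustrative choices for this sketch, not taken from any of the projects below:

from nltk.tokenize import TweetTokenizer

# Illustrative options: lowercase tokens, collapse elongated words, strip @handles.
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tweet = "@nltk_org TweetTokenizer is soooo handy!!! :-) #NLP http://nltk.org"
print(tknzr.tokenize(tweet))
# Roughly: ['tweettokenizer', 'is', 'sooo', 'handy', '!', '!', '!', ':-)', '#nlp', 'http://nltk.org']

Every example that follows is built around this same tokenize() call; they differ mainly in the constructor flags and in how the resulting tokens are turned into features.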

Example 1: get_utterances

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def get_utterances(utterances, line, category, wgram, cgram):
    tknzr = TweetTokenizer()
    wgram_list = []  # default when wgram is not 1, 2 or 3
    # WORD GRAMS
    if wgram == 1:  # unigram
        wgram_list = tknzr.tokenize(line)
    elif wgram == 2:  # uni + bigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        finder = BigramCollocationFinder.from_words(tokens)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, score in scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list
    elif wgram == 3: # uni + bi + trigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        bi_finder = BigramCollocationFinder.from_words(tokens)
        bi_scored = bi_finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, biscore in bi_scored)  
        # trigram list
        tri_finder = TrigramCollocationFinder.from_words(tokens)
        tri_scored = tri_finder.score_ngrams(trigram_measures.raw_freq)
        trigram_list = sorted(trigram for trigram, triscore in tri_scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list + trigram_list
    
    # CHAR GRAMS
    cgram_list = []
    if cgram == 1:   # uni-chargram
        cgram_list = [line[i:i+1] for i in range(len(line)-1)]
    elif cgram == 2: # bi-chargram
        cgram_list = [line[i:i+2] for i in range(len(line)-1)]
    elif cgram == 3: # tri-chargram
        cgram_list = [line[i:i+3] for i in range(len(line)-1)]
        
    # RESULT
    if category == 'QA':            # non-task
        utterances.append((wgram_list + cgram_list, 0))
    elif category == 'Shopping':    # task
        utterances.append((wgram_list + cgram_list, 1))
    elif category == 'Travel':      # task
        utterances.append((wgram_list + cgram_list, 2))
    elif category == 'Hotel':       # task
        utterances.append((wgram_list + cgram_list, 3))
    elif category == 'Food':        # task
        utterances.append((wgram_list + cgram_list, 4))
    elif category == 'Art':         # task
        utterances.append((wgram_list + cgram_list, 5))
    elif category == 'Weather':     # task
        utterances.append((wgram_list + cgram_list, 6))
    elif category == 'Friends':     # task
        utterances.append((wgram_list + cgram_list, 7))
    elif category == 'Chat':        # chat
        utterances.append((wgram_list + cgram_list, 8))
    else:
        print category, "ERROR"
Author: SharleneL, Project: SpellErrorDetection, Lines of code: 61, Source file: sklearn_lr_detect.py

Example 2: load_data_and_labels_semeval

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def load_data_and_labels_semeval():
    # load the entire semeval dataset
    old_dataset = list(open("./input/2013-dev"))
    old_dataset.extend(list(open("./input/2013-devtest")))
    old_dataset.extend(list(open("./input/2013-train")))
    old_dataset.extend(list(open("./input/2014-devtest")))

    new_dataset = list(open("./input/2016-train"))
    new_dataset.extend(list(open("./input/2016-dev")))
    new_dataset.extend(list(open("./input/2016-devtest")))

    # filter out invalid tweets from new dataset
    new_dataset = [entry for entry in new_dataset if entry.split('\t')[2] != 'Not Available\n']

    # generate x from old
    tk = TweetTokenizer(reduce_len=True)  # splits off punctuation; reduce_len collapses elongated words
    x_text = [entry.split('\t')[3] for entry in old_dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]

    # generate x from new
    x_text_new = [entry.split('\t')[2] for entry in new_dataset]
    x_text_new = [clean_str(tweet) for tweet in x_text_new]
    x_text_new = [tk.tokenize(tweet) for tweet in x_text_new]

    # concat x and x_new
    x_text.extend(x_text_new)

    # generate y from old
    y = [entry.split('\t')[2] for entry in old_dataset]
    for idx, label in enumerate(y):
        if label == 'positive':
            y[idx] = [1, 0, 0]
        elif label == 'neutral':
            y[idx] = [0, 1, 0]
        elif label == 'negative':
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in semeval: ' + label

    # generate y from new
    y_new = [entry.split('\t')[1] for entry in new_dataset]
    for idx, label in enumerate(y_new):
        if label == 'positive':
            y_new[idx] = [1, 0, 0]
        elif label == 'neutral':
            y_new[idx] = [0, 1, 0]
        elif label == 'negative':
            y_new[idx] = [0, 0, 1]
        else:
            print 'wrong label in semeval: ' + label

    # concat y and y_new
    y.extend(y_new)

    return [x_text, y]
Author: ydj0604, Project: DeepLearning-On-Tweets, Lines of code: 58, Source file: data_helpers.py

Example 3: custom_tokenizer

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def custom_tokenizer(text, bigrams = None):
    chunks = text.split('-')
    tokenizer = TweetTokenizer(reduce_len=True, preserve_case=False)
    tokens = [ subchunk for chunk in chunks for subchunk in tokenizer.tokenize(chunk) ]
    tokens = [ token for token in tokens if token.isalpha() ]
    if bigrams:
        tokens = mwe_tokenize(tokens, bigrams)
    stemmer = SnowballStemmer('english', ignore_stopwords=True)
    tokens = [ stemmer.stem(token) for token in tokens ]
    return tokens
Author: annamarie-g, Project: capstone_project, Lines of code: 13, Source file: text_preprocessing.py

Example 4: getVocab

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def getVocab():
	freq = []
	vocab = []
	length = 0
	tknzr = TweetTokenizer()
	with open(path+'/data/training/training_stances.csv', 'r', encoding='UTF-8') as csvDataFile: 
		csvReader = csv.reader(csvDataFile)
		first = 1
		for row in csvReader:
			if first == 1:
				first = 0
			else:
				headline = row[0]
				tokens = tknzr.tokenize(headline)
				tokens=[token.lower() for token in tokens if (token.isalpha() and token not in stop_words)]
				#for word in r.split(headline):
				length = length + len(tokens)
				for word in tokens:
					if word not in vocab:
						vocab.append(word)
						freq.append(1)
					else:
						ind = vocab.index(word)
						freq[ind] = freq[ind] + 1
				
	with open(path+'/data/training/train_bodies.csv', 'r', encoding='UTF-8') as csvDataFile: 
		csvReader = csv.reader(csvDataFile)
		first = 1
		for row in csvReader:
			if first == 1:
				first = 0
			else:
				body = row[1]
				tokens = tknzr.tokenize(body)
				tokens=[token.lower() for token in tokens if (token.isalpha() and token not in stop_words)]
				length = length + len(tokens)
				#for word in r.split(headline):
				for word in tokens:
					if word not in vocab:
						vocab.append(word)
						freq.append(1)
					else:
						ind = vocab.index(word)
						freq[ind] = freq[ind] + 1
	return vocab, freq, length


				
#vocab list
#vocab, freq, length = getVocab()
Author: ajia95, Project: fakenewsdetection, Lines of code: 52, Source file: collection.py

Example 5: get_classifier

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def get_classifier(featx):
    tokenizer = TweetTokenizer()
    print "Training Classifier..."
    negstr = [obj["text"] for obj in handle.negative_tweets.find()]
    posstr = [obj["text"] for obj in handle.positive_tweets.find()]
    negfeats = [(featx(tokenizer.tokenize(Twitter.process_tweet(negstr[i]))), 'neg')
                for i in range(0, len(negstr)-1)]
    posfeats = [(featx(tokenizer.tokenize(Twitter.process_tweet(posstr[i]))), 'pos')
                for i in range(0, len(posstr)-1)]
    trainfeats = negfeats + posfeats

    classifier = NaiveBayesClassifier.train(trainfeats)

    return classifier
Author: ruaronicola, Project: TelepathyBot, Lines of code: 16, Source file: sentiment.py

Example 6: get_features

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def get_features(utterances, ngram, classify_method):
    features = []
    tknzr = TweetTokenizer()
    for utt in utterances:
        utt_content = utt[0]  # text content of the utterance
        utt_category = utt[1]

        if ngram:  # use bow & ngram as feature
            # bow list
            bow_list = tknzr.tokenize(utt_content)
            # cgram list
            uni_cgram_list = [utt_content[i:i+1] for i in range(len(utt_content)-1)]
            bi_cgram_list = [utt_content[i:i+2] for i in range(len(utt_content)-1)]
            tri_cgram_list = [utt_content[i:i+3] for i in range(len(utt_content)-1)]
            feature_list = bow_list         # add bow tokens
            feature_list += uni_cgram_list  # add unigram character lists
            feature_list += bi_cgram_list   # add bigram character lists
            feature_list += tri_cgram_list  # add trigram character lists
        else:  # only use bow as feature
            feature_list = tknzr.tokenize(utt_content)

        if classify_method == 'binary':
            if utt_category == 'QA':  # non-task
                features.append((feature_list, 0))
            else:  # task
                features.append((feature_list, 1))
        elif classify_method == 'multi':
            if utt_category == 'QA':            # non-task
                features.append((feature_list, 0))
            elif utt_category == 'Shopping':    # task
                features.append((feature_list, 1))
            elif utt_category == 'Travel':      # task
                features.append((feature_list, 2))
            elif utt_category == 'Hotel':       # task
                features.append((feature_list, 3))
            elif utt_category == 'Food':        # task
                features.append((feature_list, 4))
            elif utt_category == 'Art':         # task
                features.append((feature_list, 5))
            elif utt_category == 'Weather':     # task
                features.append((feature_list, 6))
            elif utt_category == 'Friends':     # task
                features.append((feature_list, 7))
            elif utt_category == 'Chat':        # chat
                features.append((feature_list, 8))
            else:
                print utt_category,"ERROR"

    return features
Author: SharleneL, Project: UtteranceClassifier, Lines of code: 51, Source file: data_process.py

Example 7: get_test

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def get_test(infile, NUM_TEST):
	with codecs.open(infile, 'rb') as csvfile:
		test = []
		pos_tweets = 0
		neg_tweets = 0
		reader = csv.reader(csvfile)
		tokenizer = TweetTokenizer(preserve_case=True)
		for line in reader:
			if line[0] == "0":
				sent="Negative"
				neg_tweets+=1

				if neg_tweets < NUM_TEST:
					text = tokenizer.tokenize(line[5].decode("utf-8"))
					for i,token in enumerate(text):
						text[i] = re.sub("@[\S]+", "USERNAME", text[i])
						text[i] = re.sub("www.[\S]+|https://[\S]+", "URL", text[i])
						newstr = ""
						for ch in text[i]:
							if ord(ch)>128:
								newstr+= "EMOJI_{0}".format(ord(ch))
								#print [ch], ord(ch)
							else:
								newstr+=(ch)
						text[i] = newstr
					test.append((text, sent))

		
			if line[0] == "4":
				sent = "Positive"
				pos_tweets+=1
				
				if pos_tweets < NUM_TEST:			
					text = tokenizer.tokenize(line[5].decode("utf-8"))
					for i,token in enumerate(text):
						text[i] = re.sub("@[\S]+", "USERNAME", text[i])
						text[i] = re.sub("www.[\S]+|https://[\S]+", "URL", text[i])
						newstr = ""
						for ch in text[i]:
							if ord(ch)>128:
								newstr+= "EMOJI_{0}".format(ord(ch))
								#print [ch], ord(ch)
							else:
								newstr+=(ch)
						text[i] = newstr
					test.append((text, sent))
			

		return test
Author: AlasdairNorton, Project: ClusterCloudAssg2, Lines of code: 51, Source file: trainer.py

Example 8: _get_nouns

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
    def _get_nouns(tweet_text):
        """

        Args:
            tweet_text:

        Returns:

        """
        tokenizer = TweetTokenizer()
        tokenizer.tokenize(tweet_text)
        nouns = []
        tag = pos_tag(tokenizer.tokenize(tweet_text))
        nouns.extend([t[0] for t in tag if t[1] == 'NN' or t[1] == 'NNP'])
        return nouns
Author: bdeloeste, Project: lima, Lines of code: 17, Source file: streamhandler.py

Example 9: get_diff

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def get_diff(query, event_name):
    tknzr = TweetTokenizer()
    query_strip = tknzr.tokenize(query)
    name_strip = tknzr.tokenize(event_name)
    ratio = 0
    for word in query_strip:
        for word2 in name_strip:
            r = difflib.SequenceMatcher(None, word, word2).ratio()
            rrr = r*r*r
            ratio += rrr
    if ratio >= len(query_strip):
        # for some reason this didn't work
        print ratio, len(name_strip)
        ratio = 100
    return ratio
Author: smoquet, Project: anticipator, Lines of code: 17, Source file: helper.py

Example 10: format_text

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def format_text(entries, LSTM_shape=True):
	THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__)))
	sentences = []
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	decoded = base64.b64decode(entries)
	decoded = str(decoded)
	decoded = decoded[2:]
	decoded = decoded[:-1]
	decoded = decoded.split(".")
	#print(decoded, "is decoded")
	for entry in decoded:
		token_sentences = tokenizer.tokenize(entry)
		for sentence in token_sentences:
			sentences.append(sentence)

	tokenized_sentences = []
	#remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\'']
	#remove_tokens = string.punctuation
	remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
	stop_words = set(stopwords.words('english'))
	tweet_tknzr = TweetTokenizer()
	for sentence in sentences:
		tokens = tweet_tknzr.tokenize(sentence)
		tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens))
		tokenized_sentences.append(tokens)

	all_ngrams1 = np.load(THIS_FOLDER+'/ngrams1.npy').item()
	all_ngrams2 = np.load(THIS_FOLDER+'/ngrams2.npy').item()
	all_ngrams3 = np.load(THIS_FOLDER+'/ngrams3.npy').item()
	#once the model gets updated with good data, ngrams.py needs to get changed/updated too!

	X = np.zeros((len(sentences), len(all_ngrams1)+len(all_ngrams2)+len(all_ngrams3)))
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 1)
		for gram in my_ngrams:
			if gram in all_ngrams1:
				index = all_ngrams1[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 2)
		for gram in my_ngrams:
			if gram in all_ngrams2:
				index = len(all_ngrams1) + all_ngrams2[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 3)
		for gram in my_ngrams:
			if gram in all_ngrams3:
				index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram]
				X[i][index] = 1


	if LSTM_shape:
		X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
	else:
		X = np.reshape(X, (X.shape[0], X.shape[1]))
	return X
Author: mit-teaching-systems-lab, Project: threeflows, Lines of code: 62, Source file: calculate_emotion.py

Example 11: main

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def main():
    text = sys.stdin.read().decode("utf-8")

    tknzr = TweetTokenizer()
    tok = tknzr.tokenize(text)
    saved_object = construct_dict(tok)
    print json.dumps(saved_object)
Author: redserg, Project: shad-python-hw-3, Lines of code: 9, Source file: counter.py

Example 12: load_data_and_labels_sam

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def load_data_and_labels_sam():
    # load
    with open("./input/2780_freshmen_tweets.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header

    # filter out tweets with unknown sentiment
    dataset = [entry for entry in dataset if entry[4] != '0']

    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[3] for entry in dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]

    # generate y
    y = [entry[4] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1': # positive
            y[idx] = [1, 0, 0]
        elif label == '2': # neutral
            y[idx] = [0, 1, 0]
        elif label == '3': # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in sam: ' + label

    return [x_text, y]
Author: ydj0604, Project: DeepLearning-On-Tweets, Lines of code: 30, Source file: data_helpers.py

Example 13: load_data_and_labels_gameforum

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def load_data_and_labels_gameforum():
    # load
    with open("./input/gameforum-1000.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header

    dataset = [entry for entry in dataset if (entry[1] == '1' or entry[1] == '2' or entry[1] == '3')]

    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[0] for entry in dataset]
    x_text = [clean_str(post) for post in x_text]
    x_text = [tk.tokenize(post) for post in x_text]

    # generate y
    y = [entry[1] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':  # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in gameforum: ' + label

    return [x_text, y]
Author: ydj0604, Project: DeepLearning-On-Tweets, Lines of code: 29, Source file: data_helpers.py

Example 14: load_tweetkeywords

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def load_tweetkeywords():
    """
    Check and see which keywords are used in each tweet, and load the association
    table linking tweets and keywords
    """

    # TweetKeyword.query.delete()

    tweets = Tweet.query.all()
    keyword_query = Keyword.query.all()
    keywords = [word.keyword for word in keyword_query]

    tknzr = TweetTokenizer()

    for tweet in tweets:
        tokenized_tweets = tknzr.tokenize(tweet.text)

        for token in tokenized_tweets:
            if token in keywords:
                tweet_id = Tweet.query.filter(Tweet.tweet_id == tweet.tweet_id).one()
                keyword_id = Keyword.query.filter(Keyword.keyword == token).one()
                tweet_keyword = TweetKeyword(keyword_id=keyword_id.keyword_id, tweet_id=tweet_id.tweet_id)
                print "Added to TweetKeyword table: {}".format(tweet_keyword.keyword_id)
                db.session.add(tweet_keyword)

    db.session.commit()
Author: lgorham, Project: TwitterElectionTracking, Lines of code: 29, Source file: seed.py

Example 15: createDataset

# Required import: from nltk.tokenize import TweetTokenizer [as alias]
# Or: from nltk.tokenize.TweetTokenizer import tokenize [as alias]
def createDataset(filename, MAX_VOCAB_SIZE):
    yaks = []
    tokenizer = TweetTokenizer()
    ids = set()
    numyaks = 0
    for line in open(filename).readlines():
        stuff = line.split(":::")
        id = stuff[0]
        if len(stuff) > 3 and id not in ids:
            numyaks+=1
            sentence = stuff[3]
            ids.add(id)
            tokens = [START_TOKEN]
            tokens.extend(tokenizer.tokenize(sentence.lower()))
            tokens.append(END_TOKEN)
            yaks.append(tokens)
    token_frequency = nltk.FreqDist(itertools.chain(*yaks))
    vocab = token_frequency.most_common(MAX_VOCAB_SIZE-1)
    i2t = [token[0] for token in vocab]
    i2t.append(UNKNOWN_TOKEN)
    t2i = dict()
    for i,t in enumerate(i2t):
        t2i[t] = i
    
    yaks = [[t if t in t2i else UNKNOWN_TOKEN for t in yak] for yak in yaks]
    
    Xtrain = np.asarray([[t2i[token] for token in yak[:-1]] for yak in yaks])
    Ytrain = np.asarray([[t2i[token] for token in yak[1:]] for yak in yaks])
    print "Num unique Yaks: "+str(numyaks)
    return (Xtrain, Ytrain, i2t, t2i)
Author: jdbrandon, Project: 15780proj, Lines of code: 32, Source file: train_rnn.py


Note: The nltk.tokenize.TweetTokenizer.tokenize method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The snippets are selected from open-source projects contributed by various developers, and copyright of the source code belongs to the original authors; please consult the corresponding project's license before distributing or using it. Do not reproduce without permission.