

Python tokenize.TweetTokenizer Code Examples

This article collects typical usage examples of nltk.tokenize.TweetTokenizer in Python. If you are wondering how tokenize.TweetTokenizer is used in practice, or are looking for concrete examples of it in real projects, the curated code samples below should help. You can also explore further usage examples from the nltk.tokenize module.


The following presents 15 code examples of tokenize.TweetTokenizer, drawn from open-source projects and sorted by popularity by default.
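
Before the project excerpts, here is a minimal, self-contained sketch of TweetTokenizer itself (assuming NLTK is installed; the sample tweet and constructor options are illustrative only):

from nltk.tokenize import TweetTokenizer

# Handles, casing and elongated words are tweet-specific concerns that
# TweetTokenizer can normalize via its constructor options.
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

tweet = "@nltk_org This is soooo cool!!! :-) #NLP http://nltk.org"
print(tknzr.tokenize(tweet))
# roughly: ['this', 'is', 'sooo', 'cool', '!', '!', '!', ':-)', '#nlp', 'http://nltk.org']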

Example 1: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self, root, fileids=None,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf8'):
        """

        :param root: The root directory for this corpus.

        :param fileids: A list or regexp specifying the fileids in this corpus.

        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.

        """
        CorpusReader.__init__(self, root, fileids, encoding)

        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                pass
            elif os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))
        """Check that all user-created corpus files are non-empty."""

        self._word_tokenizer = word_tokenizer 
Developer: rafasashi, Project: razzy-spinner, Lines: 25, Source: twitter.py

Example 2: get_ngram_features_from_map

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def get_ngram_features_from_map(tweets, ngram_map, n):
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    features = []
    for tweet in tweets:
        feature_list = np.zeros(len(ngram_map))
        tweet = tweet.lower()
        ngram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        if n > 1:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 2)
        if n > 2:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 3)
        for gram in ngram_list:
            if gram in ngram_map:
                feature_list[ngram_map[gram]] += 1.0
        features.append(feature_list)
    return features 
Developer: MirunaPislar, Project: Sarcasm-Detection, Lines: 19, Source: extract_baseline_features.py
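
The helper above relies on the project's own get_ngram_list and a pre-built ngram_map. As a rough, self-contained illustration of the same bag-of-n-grams idea (the vocabulary map, the helper name and the sample tweet below are made up for this sketch):

from nltk.tokenize import TweetTokenizer

# Hypothetical n-gram -> column-index map, normally built from the training data.
ngram_map = {"good": 0, "morning": 1, "good morning": 2}

tknzr = TweetTokenizer()

def tweet_to_counts(tweet):
    """Count how often each known unigram/bigram occurs in one tweet."""
    tokens = tknzr.tokenize(tweet.lower())
    grams = tokens + [' '.join(p) for p in zip(tokens, tokens[1:])]
    counts = [0.0] * len(ngram_map)
    for gram in grams:
        if gram in ngram_map:
            counts[ngram_map[gram]] += 1.0
    return counts

print(tweet_to_counts("Good morning, good people!"))  # -> [2.0, 1.0, 1.0]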

Example 3: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self, dictionary_file):
        with open(dictionary_file, 'r') as f:
            self.word2i = json.load(f)['word2i']
        self.wpt = TweetTokenizer(preserve_case=False)

        if "<stop_dialogue>" not in self.word2i:
            self.word2i["<stop_dialogue>"] = len(self.word2i)

        self.i2word = {}
        for (k, v) in self.word2i.items():
            self.i2word[v] = k

        # Retrieve key values
        self.no_words = len(self.word2i)
        self.start_token = self.word2i["<start>"]
        self.stop_token = self.word2i["?"]
        self.stop_dialogue = self.word2i["<stop_dialogue>"]
        self.padding_token = self.word2i["<padding>"]
        self.yes_token = self.word2i["<yes>"]
        self.no_token = self.word2i["<no>"]
        self.non_applicable_token = self.word2i["<n/a>"]

        self.answers = [self.yes_token, self.no_token, self.non_applicable_token] 
Developer: GuessWhatGame, Project: guesswhat, Lines: 25, Source: guesswhat_tokenizer.py
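
To see how such a word2i vocabulary is typically combined with TweetTokenizer, here is a small hedged sketch; the toy dictionary and the encode helper are illustrative assumptions, not part of the GuessWhat code:

from nltk.tokenize import TweetTokenizer

# Toy vocabulary with the same shape as the 'word2i' entry of dictionary_file.
word2i = {"<unk>": 0, "<start>": 1, "?": 2, "is": 3, "it": 4, "red": 5}

tknzr = TweetTokenizer(preserve_case=False)

def encode(question):
    """Map a question to token ids, falling back to <unk> for unknown words."""
    return [word2i.get(tok, word2i["<unk>"]) for tok in tknzr.tokenize(question)]

print(encode("Is it RED?"))  # -> [3, 4, 5, 2]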

Example 4: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(
        self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding='utf8'
    ):
        """

        :param root: The root directory for this corpus.

        :param fileids: A list or regexp specifying the fileids in this corpus.

        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.

        """
        CorpusReader.__init__(self, root, fileids, encoding)

        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                pass
            elif os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))
        """Check that all user-created corpus files are non-empty."""

        self._word_tokenizer = word_tokenizer 
Developer: V1EngineeringInc, Project: V1EngineeringInc-Docs, Lines: 25, Source: twitter.py

Example 5: gpt_norm_sentence

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def gpt_norm_sentence(txt):
	# url and tag
	words = []
	for word in txt.split():
		if word[0] == '#': # don't allow tag
			continue
		i = word.lower().find('http')
		if i >= 0:
			word = word[:i] + ' ' + '__url__'
		words.append(word.strip())
	txt = ' '.join(words)

	# remove illegal char
	txt = txt.replace(chr(92),'') # chr(92) = '\'. as twitter has 'b\/c' rather than 'b/c'
	txt = txt.replace("b/c","because").replace('j/k','just kidding').replace('w/o','without').replace('w/','with')
	txt = re.sub('__mention__','MENTION',txt)
	txt = re.sub('__url__','URL',txt)
	txt = re.sub(r"[^A-Za-z0-9()\[\]:,.!?'“” ]", " ", txt)
	txt = re.sub('MENTION','__mention__',txt)
	txt = re.sub('URL','__url__',txt)

	tokenizer = TweetTokenizer(preserve_case=True)
	txt = ' ' + ' '.join(tokenizer.tokenize(txt)) + ' '

	# remove un-necessary space
	return ' '.join(txt.split()) 
Developer: microsoft, Project: DialoGPT, Lines: 28, Source: reddit.py

Example 6: clean_str

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def clean_str(txt):
	#print("in=[%s]" % txt)
	txt = txt.lower()
	txt = re.sub('^',' ', txt)
	txt = re.sub('$',' ', txt)

	# url and tag
	words = []
	for word in txt.split():
		i = word.find('http') 
		if i >= 0:
			word = word[:i] + ' ' + '__url__'
		words.append(word.strip())
	txt = ' '.join(words)

	# remove markdown URL
	txt = re.sub(r'\[([^\]]*)\] \( *__url__ *\)', r'\1', txt)

	# remove illegal char
	txt = re.sub('__url__','URL',txt)
	txt = re.sub(r"[^A-Za-z0-9():,.!?\"\']", " ", txt)
	txt = re.sub('URL','__url__',txt)	

	# contraction
	add_space = ["'s", "'m", "'re", "n't", "'ll","'ve","'d","'em"]
	tokenizer = TweetTokenizer(preserve_case=False)
	txt = ' ' + ' '.join(tokenizer.tokenize(txt)) + ' '
	txt = txt.replace(" won't ", " will n't ")
	txt = txt.replace(" can't ", " can n't ")
	for a in add_space:
		txt = txt.replace(a+' ', ' '+a+' ')

	txt = re.sub(r'^\s+', '', txt)
	txt = re.sub(r'\s+$', '', txt)
	txt = re.sub(r'\s+', ' ', txt) # remove extra spaces
	
	#print("out=[%s]" % txt)
	return txt 
Developer: microsoft, Project: DialoGPT, Lines: 40, Source: tokenizers.py

Example 7: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self, dictionary_file):

        self.tokenizer = TweetTokenizer(preserve_case=False)
        with open(dictionary_file, 'r') as f:
            data = json.load(f)
            self.word2i = data['word2i']
            self.answer2i = data['answer2i']
            self.preprocess_answers = data['preprocess_answers']

        self.dictionary_file = dictionary_file

        self.i2word = {}
        for (k, v) in self.word2i.items():
            self.i2word[v] = k

        self.i2answer = {}
        for (k, v) in self.answer2i.items():
            self.i2answer[v] = k

        # Retrieve key values
        self.no_words = len(self.word2i)
        self.no_answers = len(self.answer2i)

        self.unknown_question_token = self.word2i["<unk>"]
        self.padding_token = self.word2i["<unk>"]

        self.unknown_answer = self.answer2i["<unk>"] 
Developer: ap229997, Project: Conditional-Batch-Norm, Lines: 29, Source: vqa_tokenizer.py

Example 8: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self):
        self.tokenizers = {
            'en': TweetTokenizer(),
            'de': WordPunctTokenizer(),
            'it': WordPunctTokenizer(),
            'fr': WordPunctTokenizer(),
            'default': WordPunctTokenizer()
        }

        self.tokenizer = TweetTokenizer() 
Developer: spinningbytes, Project: deep-mlsa, Lines: 12, Source: parse_utils.py
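
A hedged sketch of how such a per-language tokenizer map might be used at lookup time; the dispatch helper below is an illustrative assumption, not taken from the deep-mlsa codebase:

from nltk.tokenize import TweetTokenizer, WordPunctTokenizer

tokenizers = {
    'en': TweetTokenizer(),
    'de': WordPunctTokenizer(),
    'default': WordPunctTokenizer(),
}

def tokenize(text, lang):
    # Fall back to the default tokenizer for languages without a dedicated entry.
    return tokenizers.get(lang, tokenizers['default']).tokenize(text)

print(tokenize("Don't miss this :-)", 'en'))    # TweetTokenizer keeps the emoticon intact
print(tokenize("Ne manquez pas ceci !", 'fr'))  # unknown language falls back to the default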

Example 9: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self, max_fact_len=12, max_facts_count=500, min_fact_len=8):
		self.tokenizer = TweetTokenizer(preserve_case=False)
		self.extractor = pke.unsupervised.TopicRank()
		self.max_fact_len = max_fact_len
		self.max_facts_count = max_facts_count
		self.min_fact_len = min_fact_len 
Developer: qkaren, Project: converse_reading_cmr, Lines: 8, Source: fetch_realtime_grounding.py

Example 10: get_features2

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def get_features2(tweets, subj_dict):
    print("Getting features type 2...")
    features = []
    tknzr = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
    lemmatizer = WordNetLemmatizer()
    for tweet in tweets:
        feature_list = [0.0] * 5
        tokens = tknzr.tokenize(tweet)
        # Take the number of positive and negative words as features
        for word in tokens:
            stemmed = lemmatizer.lemmatize(word, 'v')
            stemmed = lemmatizer.lemmatize(stemmed)
            if stemmed in subj_dict:
                dictlist = []
                for word in subj_dict[stemmed]:
                    dictlist.extend(subj_dict[stemmed][word])
                if 'strongsubj' in dictlist:
                    value = 1.0
                else:
                    value = 0.5
                if 'positive' in dictlist:
                    feature_list[0] += value
                elif 'negative' in dictlist:
                    feature_list[1] += value
        # Take the ratio of positive to negative counts as a feature
        if feature_list[0] != 0.0 and feature_list[1] != 0.0:
            feature_list[2] = feature_list[0] / feature_list[1]
        # Derive features from punctuation
        feature_list[2] += count_apparitions(tokens, helper.punctuation)
        # Take strong negations as a feature
        feature_list[3] += count_apparitions(tokens, helper.strong_negations)
        # Take strong affirmatives as a feature
        feature_list[4] += count_apparitions(tokens, helper.strong_affirmatives)
        features.append(feature_list)
    print("Done.")
    return features 
Developer: MirunaPislar, Project: Sarcasm-Detection, Lines: 38, Source: extract_baseline_features.py
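
The subj_dict argument comes from an MPQA-style subjectivity lexicon loaded elsewhere in the project. Below is a stripped-down, hedged sketch of the same scoring idea with a toy lexicon; it assumes the WordNet data has been fetched via nltk.download('wordnet'), and the lexicon contents and helper name are made up:

from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

# Toy lexicon in the nested shape the code above expects:
# lemma -> {surface form -> [subjectivity strength, polarity]}
subj_dict = {
    "love": {"love": ["strongsubj", "positive"]},
    "hate": {"hate": ["strongsubj", "negative"]},
    "tolerate": {"tolerate": ["weaksubj", "negative"]},
}

tknzr = TweetTokenizer(preserve_case=True)
lemmatizer = WordNetLemmatizer()

def polarity_counts(tweet):
    """Return (positive score, negative score) for one tweet using the toy lexicon."""
    pos = neg = 0.0
    for tok in tknzr.tokenize(tweet):
        lemma = lemmatizer.lemmatize(lemmatizer.lemmatize(tok, 'v'))
        entry = subj_dict.get(lemma)
        if entry is None:
            continue
        tags = [t for tag_list in entry.values() for t in tag_list]
        value = 1.0 if 'strongsubj' in tags else 0.5
        if 'positive' in tags:
            pos += value
        elif 'negative' in tags:
            neg += value
    return pos, neg

print(polarity_counts("I loved it but they hated it"))  # expected: (1.0, 1.0)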

Example 11: get_ngrams

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def get_ngrams(tweets, n):
    unigrams = Counter()
    bigrams = Counter()
    trigrams = Counter()
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    for tweet in tweets:
        tweet = tweet.lower()
        # Get the unigram list for this tweet and update the unigram counter
        unigram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        unigrams.update(unigram_list)
        # Get the bigram list for this tweet and update the bigram counter
        if n > 1:
            bigram_list = get_ngram_list(regexp_tknzr, tweet, 2)
            bigrams.update(bigram_list)
            # Get the trigram list for this tweet and update the trigram counter
            if n > 2:
                trigram_list = get_ngram_list(regexp_tknzr, tweet, 3)
                trigrams.update(trigram_list)
    # Update the counters such that each n-gram appears at least min_occurence times
    min_occurence = 2
    unigram_tokens = [k for k, c in unigrams.items() if c >= min_occurence]
    # In case using just unigrams, make the bigrams and trigrams empty
    bigram_tokens = trigram_tokens = []
    if n > 1:
        bigram_tokens = [k for k, c in bigrams.items() if c >= min_occurence]
    if n > 2:
        trigram_tokens = [k for k, c in trigrams.items() if c >= min_occurence]
    return unigram_tokens, bigram_tokens, trigram_tokens 
Developer: MirunaPislar, Project: Sarcasm-Detection, Lines: 31, Source: extract_baseline_features.py
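
The same counting scheme can be reproduced compactly with nltk.util.ngrams; a minimal sketch (the sample tweets and the minimum-occurrence threshold are illustrative):

from collections import Counter
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from nltk.util import ngrams

tweets = ["good morning good people", "good morning everyone", "so good"]

tweet_tknzr = TweetTokenizer()
regexp_tknzr = RegexpTokenizer(r'\w+')

unigrams, bigrams = Counter(), Counter()
for tweet in tweets:
    unigrams.update(tweet_tknzr.tokenize(tweet.lower()))
    # Bigrams are built from the regexp tokens, mirroring the code above.
    bigrams.update(' '.join(g) for g in ngrams(regexp_tknzr.tokenize(tweet.lower()), 2))

min_occurrence = 2
print([g for g, c in unigrams.items() if c >= min_occurrence])  # -> ['good', 'morning']
print([g for g, c in bigrams.items() if c >= min_occurrence])   # -> ['good morning']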

Example 12: preprocess_tweets

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def preprocess_tweets( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True):
	"""
	Preprocess a list containing text documents stored as strings, where the documents have already been tokenized and are separated by whitespace
	"""
	from nltk.tokenize import TweetTokenizer
	tweet_tokenizer = TweetTokenizer(preserve_case = False, strip_handles=True, reduce_len=True)

	def custom_tokenizer( s ):
		# need to manually replace quotes
		s = s.replace("'"," ").replace('"',' ')
		tokens = []
		for x in tweet_tokenizer.tokenize(s):
			if len(x) >= min_term_length:
				if x[0] == "#" or x[0].isalpha():
					tokens.append( x )
		return tokens

	# Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
	if apply_norm:
		norm_function = "l2"
	else:
		norm_function = None
	tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range) 
	X = tfidf.fit_transform(docs)
	terms = []
	# store the vocabulary map
	v = tfidf.vocabulary_
	for i in range(len(v)):
		terms.append("")
	for term in v.keys():
		terms[ v[term] ] = term
	return (X,terms)

# -------------------------------------------------------------- 
Developer: derekgreene, Project: topic-ensemble, Lines: 36, Source: util.py
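
A stripped-down, hedged variant of the same pipeline, plugging a TweetTokenizer-based callable into scikit-learn's TfidfVectorizer (the two sample documents are made up):

from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

def custom_tokenizer(s):
    # Keep hashtags and alphabetic tokens of at least two characters.
    return [t for t in tweet_tokenizer.tokenize(s)
            if len(t) >= 2 and (t[0] == '#' or t[0].isalpha())]

docs = ["@bob loving this #nlp talk!!!", "great #nlp talk today"]
tfidf = TfidfVectorizer(tokenizer=custom_tokenizer, lowercase=True, norm="l2")
X = tfidf.fit_transform(docs)
print(sorted(tfidf.vocabulary_))  # vocabulary terms
print(X.shape)                    # (2, number of terms)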

Example 13: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self, dist_file_path=None):
        """ Initialize module with default data/english.dist file """
        if dist_file_path is None:
            dist_file_path = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "data/english.dist")

        with open(dist_file_path, "rb") as distributions_file:
            pickle_dict = pickle.load(distributions_file)
            self.uni_dist = pickle_dict["uni_dist"]
            self.backward_bi_dist = pickle_dict["backward_bi_dist"]
            self.forward_bi_dist = pickle_dict["forward_bi_dist"]
            self.trigram_dist = pickle_dict["trigram_dist"]
            self.word_casing_lookup = pickle_dict["word_casing_lookup"]
        self.tknzr = TweetTokenizer() 
Developer: daltonfury42, Project: truecase, Lines: 17, Source: TrueCaser.py
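
The pickled distributions loaded above ship with the truecase package as a data file. As a rough illustration of what the word_casing_lookup portion contains, a comparable mapping could be built from any tokenized corpus; the snippet below is an illustrative assumption, not the package's actual build script:

from collections import defaultdict
from nltk.tokenize import TweetTokenizer

sentences = [
    "Apple released a new phone",
    "I ate an apple for lunch",
]

tknzr = TweetTokenizer()
word_casing_lookup = defaultdict(set)
for sentence in sentences:
    for token in tknzr.tokenize(sentence):
        # Record every surface casing observed for the lowercased key.
        word_casing_lookup[token.lower()].add(token)

print(word_casing_lookup["apple"])  # -> {'Apple', 'apple'}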

Example 14: clean_body

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def clean_body(self, tknzr=None):
		if tknzr is None:
			tknzr = TweetTokenizer()

		# unescape html symbols.
		new_body = html.unescape(self.body)

		# remove extraneous whitespace.
		new_body = new_body.replace('\n', ' ')
		new_body = new_body.replace('\t', ' ')
		new_body = re.sub(r'\s+', ' ', new_body).strip()

		# remove non-ascii symbols.
		new_body = new_body.encode('ascii', errors='ignore').decode()

		# replace URLS with a special token.
		new_body = re.sub(URL_REGEX, URL_TOKEN, new_body)

		# replace reddit user with a token
		new_body = re.sub(USER_REGEX, USER_TOKEN, new_body)

		# replace subreddit names with a token
		new_body = re.sub(SUBREDDIT_REGEX, SUBREDDIT_TOKEN, new_body)

		# lowercase the text
		new_body = new_body.casefold()

		# Could be done in addition:
		# get rid of comments with quotes

		# tokenize the text
		new_body = tknzr.tokenize(new_body)

		self.body = ' '.join(new_body) 
Developer: TalwalkarLab, Project: leaf, Lines: 36, Source: reddit_utils.py
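
URL_REGEX, USER_REGEX, SUBREDDIT_REGEX and the matching token strings are module-level constants of the LEAF project. A condensed, hedged sketch with simplified stand-in patterns (the regexes and token strings below are assumptions for illustration only):

import html
import re

from nltk.tokenize import TweetTokenizer

# Simplified stand-ins for the project's URL_REGEX / USER_REGEX constants.
URL_REGEX = r'https?://\S+'
USER_REGEX = r'\bu/\w+'
URL_TOKEN, USER_TOKEN = '__url__', '__user__'

def clean_comment(body, tknzr=None):
    tknzr = tknzr or TweetTokenizer()
    body = html.unescape(body)                   # e.g. &amp; -> &
    body = re.sub(r'\s+', ' ', body).strip()     # collapse whitespace
    body = re.sub(URL_REGEX, URL_TOKEN, body)    # mask links
    body = re.sub(USER_REGEX, USER_TOKEN, body)  # mask reddit usernames
    return ' '.join(tknzr.tokenize(body.casefold()))

print(clean_comment("Thanks u/someone!  See https://example.com &amp; enjoy\n"))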

Example 15: main

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def main():
	tknzr = TweetTokenizer()

	if not os.path.exists(FINAL_DIR):
		os.makedirs(FINAL_DIR)

	files = [f for f in os.listdir(DIR) if f.endswith('.pck')]
	files.sort()

	num_files = len(files)
	for i, f in enumerate(files):
		clean_file(f, tknzr)
		print('Done with {} of {}'.format(i, num_files)) 
Developer: TalwalkarLab, Project: leaf, Lines: 15, Source: clean_raw.py


Note: The nltk.tokenize.TweetTokenizer examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from projects contributed by open-source developers, and copyright of the source code remains with the original authors; please consult each project's license before using or redistributing the code. Do not reproduce this article without permission.