This article collects typical usage examples of Python's nltk.tokenize.TweetTokenizer. If you have been wondering what tokenize.TweetTokenizer does, or how to use it in your own code, the curated examples below should help. You can also browse further usage examples for other members of the nltk.tokenize module, where this class is defined.
The sections below present 15 code examples of tokenize.TweetTokenizer, sorted by popularity by default.
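Before the project-level examples, here is a minimal, self-contained sketch of the tokenizer on its own. The sample tweet string is made up for illustration, and the printed token list is roughly what the standard NLTK constructor options shown here produce.
from nltk.tokenize import TweetTokenizer

# Standard NLTK options: lowercase the text, drop @handles, and shorten
# runs of repeated characters ("sooooo" -> "sooo").
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

sample = "@user This is sooooo cool!!! #nlp http://example.com"  # made-up tweet
print(tokenizer.tokenize(sample))
# e.g. ['this', 'is', 'sooo', 'cool', '!', '!', '!', '#nlp', 'http://example.com']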
Example 1: __init__
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self, root, fileids=None,
             word_tokenizer=TweetTokenizer(),
             encoding='utf8'):
    """
    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    # Check that all user-created corpus files are non-empty.
    for path in self.abspaths(self._fileids):
        if isinstance(path, ZipFilePathPointer):
            pass
        elif os.path.getsize(path) == 0:
            raise ValueError("File {} is empty".format(path))
    self._word_tokenizer = word_tokenizer
Example 2: get_ngram_features_from_map
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def get_ngram_features_from_map(tweets, ngram_map, n):
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    features = []
    for tweet in tweets:
        # One count slot per n-gram in the vocabulary map
        feature_list = [0.0] * len(ngram_map)
        tweet = tweet.lower()
        ngram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        if n > 1:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 2)
        if n > 2:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 3)
        for gram in ngram_list:
            if gram in ngram_map:
                feature_list[ngram_map[gram]] += 1.0
        features.append(feature_list)
    return features
Example 3: __init__
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self, dictionary_file):
    with open(dictionary_file, 'r') as f:
        self.word2i = json.load(f)['word2i']
    self.wpt = TweetTokenizer(preserve_case=False)
    if "<stop_dialogue>" not in self.word2i:
        self.word2i["<stop_dialogue>"] = len(self.word2i)
    self.i2word = {}
    for (k, v) in self.word2i.items():
        self.i2word[v] = k
    # Retrieve key values
    self.no_words = len(self.word2i)
    self.start_token = self.word2i["<start>"]
    self.stop_token = self.word2i["?"]
    self.stop_dialogue = self.word2i["<stop_dialogue>"]
    self.padding_token = self.word2i["<padding>"]
    self.yes_token = self.word2i["<yes>"]
    self.no_token = self.word2i["<no>"]
    self.non_applicable_token = self.word2i["<n/a>"]
    self.answers = [self.yes_token, self.no_token, self.non_applicable_token]
Example 4: __init__
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(
    self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding='utf8'
):
    """
    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    # Check that all user-created corpus files are non-empty.
    for path in self.abspaths(self._fileids):
        if isinstance(path, ZipFilePathPointer):
            pass
        elif os.path.getsize(path) == 0:
            raise ValueError("File {} is empty".format(path))
    self._word_tokenizer = word_tokenizer
Example 5: gpt_norm_sentence
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def gpt_norm_sentence(txt):
    # url and tag
    words = []
    for word in txt.split():
        if word[0] == '#':  # don't allow tag
            continue
        i = word.lower().find('http')
        if i >= 0:
            word = word[:i] + ' ' + '__url__'
        words.append(word.strip())
    txt = ' '.join(words)
    # remove illegal char
    txt = txt.replace(chr(92), '')  # chr(92) = '\'; Twitter has 'b\/c' rather than 'b/c'
    txt = txt.replace("b/c", "because").replace('j/k', 'just kidding').replace('w/o', 'without').replace('w/', 'with')
    txt = re.sub('__mention__', 'MENTION', txt)
    txt = re.sub('__url__', 'URL', txt)
    txt = re.sub(r"[^A-Za-z0-9()\[\]:,.!?'“” ]", " ", txt)
    txt = re.sub('MENTION', '__mention__', txt)
    txt = re.sub('URL', '__url__', txt)
    tokenizer = TweetTokenizer(preserve_case=True)
    txt = ' ' + ' '.join(tokenizer.tokenize(txt)) + ' '
    # remove unnecessary spaces
    return ' '.join(txt.split())
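As a quick illustration of what gpt_norm_sentence does, the call below runs it on a made-up tweet; the printed result is roughly what to expect (hashtag dropped, URL replaced by __url__, Twitter shorthand expanded), though the exact tokenization may vary with the NLTK version.
raw = r"Check this out http://t.co/abc #cool b\/c it rocks"  # made-up tweet
print(gpt_norm_sentence(raw))
# e.g. 'Check this out __url__ because it rocks'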
Example 6: clean_str
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def clean_str(txt):
    #print("in=[%s]" % txt)
    txt = txt.lower()
    txt = re.sub('^', ' ', txt)
    txt = re.sub('$', ' ', txt)
    # url and tag
    words = []
    for word in txt.split():
        i = word.find('http')
        if i >= 0:
            word = word[:i] + ' ' + '__url__'
        words.append(word.strip())
    txt = ' '.join(words)
    # remove markdown URL
    txt = re.sub(r'\[([^\]]*)\] \( *__url__ *\)', r'\1', txt)
    # remove illegal char
    txt = re.sub('__url__', 'URL', txt)
    txt = re.sub(r"[^A-Za-z0-9():,.!?\"\']", " ", txt)
    txt = re.sub('URL', '__url__', txt)
    # contraction
    add_space = ["'s", "'m", "'re", "n't", "'ll", "'ve", "'d", "'em"]
    tokenizer = TweetTokenizer(preserve_case=False)
    txt = ' ' + ' '.join(tokenizer.tokenize(txt)) + ' '
    txt = txt.replace(" won't ", " will n't ")
    txt = txt.replace(" can't ", " can n't ")
    for a in add_space:
        txt = txt.replace(a + ' ', ' ' + a + ' ')
    txt = re.sub(r'^\s+', '', txt)
    txt = re.sub(r'\s+$', '', txt)
    txt = re.sub(r'\s+', ' ', txt)  # remove extra spaces
    #print("out=[%s]" % txt)
    return txt
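For comparison with the previous example, a hypothetical call to clean_str on a made-up tweet might look as follows; the output shown is approximate (lowercased, URL replaced, contractions split).
raw = "I can't believe it's already 2024!! http://t.co/xyz"  # made-up tweet
print(clean_str(raw))
# e.g. "i can n't believe it 's already 2024 ! ! __url__"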
Example 7: __init__
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self, dictionary_file):
    self.tokenizer = TweetTokenizer(preserve_case=False)
    with open(dictionary_file, 'r') as f:
        data = json.load(f)
        self.word2i = data['word2i']
        self.answer2i = data['answer2i']
        self.preprocess_answers = data['preprocess_answers']
    self.dictionary_file = dictionary_file
    self.i2word = {}
    for (k, v) in self.word2i.items():
        self.i2word[v] = k
    self.i2answer = {}
    for (k, v) in self.answer2i.items():
        self.i2answer[v] = k
    # Retrieve key values
    self.no_words = len(self.word2i)
    self.no_answers = len(self.answer2i)
    self.unknown_question_token = self.word2i["<unk>"]
    self.padding_token = self.word2i["<unk>"]
    self.unknown_answer = self.answer2i["<unk>"]
Example 8: __init__
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self):
    self.tokenizers = {
        'en': TweetTokenizer(),
        'de': WordPunctTokenizer(),
        'it': WordPunctTokenizer(),
        'fr': WordPunctTokenizer(),
        'default': WordPunctTokenizer()
    }
    self.tokenizer = TweetTokenizer()
Example 9: __init__
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self, max_fact_len=12, max_facts_count=500, min_fact_len=8):
    self.tokenizer = TweetTokenizer(preserve_case=False)
    self.extractor = pke.unsupervised.TopicRank()
    self.max_fact_len = max_fact_len
    self.max_facts_count = max_facts_count
    self.min_fact_len = min_fact_len
Example 10: get_features2
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def get_features2(tweets, subj_dict):
    print("Getting features type 2...")
    features = []
    tknzr = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
    lemmatizer = WordNetLemmatizer()
    for tweet in tweets:
        feature_list = [0.0] * 5
        tokens = tknzr.tokenize(tweet)
        # Take the number of positive and negative words as features
        for word in tokens:
            stemmed = lemmatizer.lemmatize(word, 'v')
            stemmed = lemmatizer.lemmatize(stemmed)
            if stemmed in subj_dict:
                dictlist = []
                for word in subj_dict[stemmed]:
                    dictlist.extend(subj_dict[stemmed][word])
                if 'strongsubj' in dictlist:
                    value = 1.0
                else:
                    value = 0.5
                if 'positive' in dictlist:
                    feature_list[0] += value
                elif 'negative' in dictlist:
                    feature_list[1] += value
        # Take the ratio of positives to negatives as a feature
        if feature_list[0] != 0.0 and feature_list[1] != 0.0:
            feature_list[2] = feature_list[0] / feature_list[1]
        # Derive features from punctuation
        feature_list[2] += count_apparitions(tokens, helper.punctuation)
        # Take strong negations as a feature
        feature_list[3] += count_apparitions(tokens, helper.strong_negations)
        # Take strong affirmatives as a feature
        feature_list[4] += count_apparitions(tokens, helper.strong_affirmatives)
        features.append(feature_list)
    print("Done.")
    return features
Example 11: get_ngrams
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def get_ngrams(tweets, n):
    unigrams = Counter()
    bigrams = Counter()
    trigrams = Counter()
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    for tweet in tweets:
        tweet = tweet.lower()
        # Get the unigram list for this tweet and update the unigram counter
        unigram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        unigrams.update(unigram_list)
        # Get the bigram list for this tweet and update the bigram counter
        if n > 1:
            bigram_list = get_ngram_list(regexp_tknzr, tweet, 2)
            bigrams.update(bigram_list)
        # Get the trigram list for this tweet and update the trigram counter
        if n > 2:
            trigram_list = get_ngram_list(regexp_tknzr, tweet, 3)
            trigrams.update(trigram_list)
    # Keep only the n-grams that appear at least min_occurence times
    min_occurence = 2
    unigram_tokens = [k for k, c in unigrams.items() if c >= min_occurence]
    # In case only unigrams are used, leave the bigrams and trigrams empty
    bigram_tokens = trigram_tokens = []
    if n > 1:
        bigram_tokens = [k for k, c in bigrams.items() if c >= min_occurence]
    if n > 2:
        trigram_tokens = [k for k, c in trigrams.items() if c >= min_occurence]
    return unigram_tokens, bigram_tokens, trigram_tokens
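To connect this with Example 2 above, one plausible way to turn the returned token lists into the ngram_map expected by get_ngram_features_from_map is to enumerate them; this mapping step is an assumption added for illustration, not code from the original project.
tweets = ["some example tweet", "another example tweet"]  # made-up input
unigrams, bigrams, trigrams = get_ngrams(tweets, n=2)
# Assumed layout: one feature index per retained n-gram.
ngram_map = {gram: idx for idx, gram in enumerate(unigrams + bigrams + trigrams)}
features = get_ngram_features_from_map(tweets, ngram_map, n=2)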
Example 12: preprocess_tweets
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def preprocess_tweets(docs, stopwords, min_df=3, min_term_length=2, ngram_range=(1, 1), apply_tfidf=True, apply_norm=True):
    """
    Preprocess a list containing text documents stored as strings, where the documents
    have already been tokenized and are separated by whitespace.
    """
    from nltk.tokenize import TweetTokenizer
    tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    def custom_tokenizer(s):
        # need to manually replace quotes
        s = s.replace("'", " ").replace('"', ' ')
        tokens = []
        for x in tweet_tokenizer.tokenize(s):
            if len(x) >= min_term_length:
                if x[0] == "#" or x[0].isalpha():
                    tokens.append(x)
        return tokens

    # Build the Vector Space Model, apply TF-IDF and normalize rows to unit length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode",
                            tokenizer=custom_tokenizer, use_idf=apply_tfidf,
                            norm=norm_function, min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)
    # store the vocabulary map
    terms = []
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[v[term]] = term
    return (X, terms)
# --------------------------------------------------------------
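A hypothetical call might look like the following; the two documents and the stop-word list are made up, min_df is lowered so that terms survive on such a tiny corpus, and X is the resulting sparse TF-IDF matrix with one row per tweet and one column per retained term.
docs = ["@user loving this #nlp workshop !!!", "tfidf weighting for short nlp tweets"]  # made-up tweets
X, terms = preprocess_tweets(docs, stopwords=["the", "and", "for"], min_df=1)
print(X.shape, len(terms))  # (2, number_of_terms)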
Example 13: __init__
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def __init__(self, dist_file_path=None):
    """ Initialize module with the default data/english.dist file """
    if dist_file_path is None:
        dist_file_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "data/english.dist")
    with open(dist_file_path, "rb") as distributions_file:
        pickle_dict = pickle.load(distributions_file)
        self.uni_dist = pickle_dict["uni_dist"]
        self.backward_bi_dist = pickle_dict["backward_bi_dist"]
        self.forward_bi_dist = pickle_dict["forward_bi_dist"]
        self.trigram_dist = pickle_dict["trigram_dist"]
        self.word_casing_lookup = pickle_dict["word_casing_lookup"]
    self.tknzr = TweetTokenizer()
Example 14: clean_body
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def clean_body(self, tknzr=None):
    if tknzr is None:
        tknzr = TweetTokenizer()
    # unescape html symbols.
    new_body = html.unescape(self.body)
    # remove extraneous whitespace.
    new_body = new_body.replace('\n', ' ')
    new_body = new_body.replace('\t', ' ')
    new_body = re.sub(r'\s+', ' ', new_body).strip()
    # remove non-ascii symbols.
    new_body = new_body.encode('ascii', errors='ignore').decode()
    # replace URLs with a special token.
    new_body = re.sub(URL_REGEX, URL_TOKEN, new_body)
    # replace reddit user names with a token.
    new_body = re.sub(USER_REGEX, USER_TOKEN, new_body)
    # replace subreddit names with a token.
    new_body = re.sub(SUBREDDIT_REGEX, SUBREDDIT_TOKEN, new_body)
    # lowercase the text.
    new_body = new_body.casefold()
    # Could be done in addition: get rid of comments with quotes.
    # tokenize the text.
    new_body = tknzr.tokenize(new_body)
    self.body = ' '.join(new_body)
Example 15: main
# Required imports: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import TweetTokenizer [as alias]
def main():
    tknzr = TweetTokenizer()
    if not os.path.exists(FINAL_DIR):
        os.makedirs(FINAL_DIR)
    files = [f for f in os.listdir(DIR) if f.endswith('.pck')]
    files.sort()
    num_files = len(files)
    for i, f in enumerate(files):
        clean_file(f, tknzr)
        print('Done with {} of {}'.format(i + 1, num_files))