This article collects typical usage examples of the tokenize function from the Python twokenize module. If you have been wondering what tokenize does, how to call it, or where to find working examples, the curated code samples below should help.
A total of 15 code examples of the tokenize function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
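Before the examples, here is a minimal sketch of calling the function directly. It assumes a twokenize module in the style of the ARK Twitter tokenizer ports, which exposes tokenize(text) returning a list of string tokens; the sample tweet and the token list shown in the comment are illustrative only, not output taken from any of the examples below.

import twokenize

# A tweet containing the constructs the tokenizer is designed to keep intact:
# mentions, URLs, hashtags, emoticons and repeated punctuation.
tweet = "@user OMG check this out!!! http://example.com #nlp :-)"
tokens = twokenize.tokenize(tweet)
print(tokens)
# Roughly: ['@user', 'OMG', 'check', 'this', 'out', '!!!',
#           'http://example.com', '#nlp', ':-)']

The examples that follow wrap this same call in larger preprocessing, feature-extraction, and tagging pipelines.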
Example 1: Extract
def Extract(self, text):
    features = []
    words = twokenize.tokenize(text)
    # hand-crafted features
    iCapitalized = True
    nCapitalized = 0.1
    nAllCaps = 0.1
    nCapLowerViolated = 0.1
    nCapUpperViolated = 0.1
    nWords = 0.1
    for i in range(len(words)):
        capitalized = re.search(r'^([A-Z]|[a-z][A-Z])', words[i])
        if capitalized and not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
            nCapitalized += 1.0
        if not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
            if capitalized and self.capDict.get(words[i].lower(), '1') != '1':
                nCapUpperViolated += 1.0
                features.append(self.fVocab.GetID('upperViolated=%s' % words[i].lower()))
            elif not capitalized and re.match(r'[a-z]+', words[i]) and self.capDict.get(words[i].lower(), '1') != '0':
                nCapLowerViolated += 1.0
                #features.append(self.fVocab.GetID('lowerViolated=%s' % words[i].lower()))
        if re.match(r'\w+', words[i][0:1]):
            nWords += 1
        if re.match(r"i|i'm|im|u", words[i]):
            iCapitalized = False
        if re.match(r"[A-Z]{2,}", words[i]):
            nAllCaps += 1
    features.append(self.fVocab.GetID('iCapitalized=%s' % iCapitalized))
    return (' '.join(["%s:1" % x for x in features])
            + " %s:%s" % (self.fVocab.GetID('nAllCaps'), nAllCaps / nWords)
            + " %s:%s" % (self.fVocab.GetID('nCapitalized'), nCapitalized / nWords)
            + " %s:%s" % (self.fVocab.GetID('nCapLowerViolated'), nCapLowerViolated / nWords)
            + " %s:%s" % (self.fVocab.GetID('nCapUpperViolated'), nCapUpperViolated / nWords))
Example 2: kshinglize
def kshinglize(s, k=KSHINGLES, stopwords=STOPWORDS):
    """Tokenizes string s, removes stopwords, and returns a set of k-shingles."""
    s = s.strip().lower()
    tokens_raw = twokenize.tokenize(s)
    tokens = filterstopwords(tokens_raw, stopwords)
    return tokens_to_kshingles(tokens, k)
Example 3: main
def main(argv):
    tagger = PerceptronTagger()
    tagset = None
    # 'line' is expected to come from the input file; see the full script in Example 6
    tokens = tokenize(line)
    tags = nltk.tag._pos_tag(tokens, tagset, tagger)
    format_tagged(tags)
Example 4: learn_terms
def learn_terms(self, tweets_file_object, learn_lemmas=True, cache_size=1000000):
    reader = csv.reader(tweets_file_object, delimiter=",", quotechar="\"")
    term_freq = Counter()
    term_id_map = dict()
    tweet_vectors = []
    for row in reader:
        tweet_id = int(row[0])
        tweet_text = row[-1]
        terms = [t.lower().encode("utf-8") for t in twokenize.tokenize(tweet_text)]
        if learn_lemmas:
            terms = [self.lmtz.lemmatize(term) for term in terms]
        tweet_sp_vector = []
        counted_ids = []
        for term in terms:
            if term not in term_id_map:
                term_id = len(term_id_map)
                term_id_map[term] = term_id
            else:
                term_id = term_id_map[term]
            if term_id not in counted_ids:
                term_freq[term_id] += 1
                counted_ids.append(term_id)
            tweet_sp_vector.append(term_id)
        tweet_vectors.append((tweet_id, tweet_sp_vector))
        if len(tweet_vectors) >= cache_size:
            self.write_tweet_vectors(tweet_vectors)
            tweet_vectors = []
    self.write_tweet_vectors(tweet_vectors)
    self.write_terms(term_id_map, term_freq)
Example 5: preprocess
def preprocess(m, sep_emoji=False):
    m = m.lower()
    m = max_reps(m)
    # replace user mentions with token '@user'
    # NOTE: the mention-matching pattern below is a reconstruction; the original
    # regex literal was garbled in the source listing
    user_regex = r".?@.+?( |$)|<@mention>"
    m = re.sub(user_regex, " @user ", m, flags=re.I)
    # replace urls with token 'url'
    m = re.sub(twokenize.url, " url ", m, flags=re.I)
    tokenized_msg = ' '.join(twokenize.tokenize(m)).strip()
    if sep_emoji:
        # tokenize emoji; this tokenizer, however, splits repeated punctuation,
        # e.g. "blah blah!!!" -> ['blah', 'blah', '!', '!', '!'] instead of the desired
        # ['blah', 'blah', '!!!'], so identical consecutive punctuation tokens are re-merged below
        m_toks = tokenized_msg.split()
        n_toks = twk.tokenize(tokenized_msg)
        if len(n_toks) != len(m_toks):
            # check if there is any punctuation in this string
            has_punct = map(lambda x: x in twk.punctuation, n_toks)
            if any(has_punct):
                new_m = n_toks[0]
                for i in xrange(1, len(n_toks)):
                    # while the same punctuation token shows up, concatenate
                    if has_punct[i] and has_punct[i-1] and (n_toks[i] == n_toks[i-1]):
                        new_m += n_toks[i]
                    else:
                        # otherwise add a space
                        new_m += " " + n_toks[i]
                tokenized_msg = new_m
    return tokenized_msg.lstrip()
Example 6: main
def main(argv):
    if len(sys.argv) != 3:
        print("Usage:> python getTaggedFile.py infile.txt outfile.txt")
        exit()
    infile_name = str(sys.argv[1])
    outfile_name = str(sys.argv[2])
    infile = open(infile_name, 'r')
    outfile = open(outfile_name, 'w')
    tagger = PerceptronTagger()
    print("Reading file...")
    line = infile.readline()
    while line != '':
        # use the Twitter-aware tokenizer (twokenize) before tagging
        tagset = None
        tokens = tokenize(line)
        tags = nltk.tag._pos_tag(tokens, tagset, tagger)
        outfile.write(format_tagged(tags))
        line = infile.readline()
    # close the input and output files
    infile.close()
    outfile.close()
    print("Finished tagging... Closing files.")
Example 7: __init__
def __init__(self, testData):
    self.labeledTweets = []
    for line in open(testData):
        line = line.rstrip('\n')
        fields = line.split('\t')
        fields[6] = ' '.join(twokenize.tokenize(fields[6]))
        self.labeledTweets.append(fields)
Example 8: process
def process(self, text):
    tTweet = ""
    for word in text.split():
        if "#" in word:
            word = word.replace("#", " ")
        f = 0
        for tt in self.remove:
            if tt in word:
                f = 1
        if f == 1:
            continue
        tTweet = " ".join([tTweet, word])
    tTweet = tTweet.strip()
    tempTweet = ""
    for word in twokenize.tokenize(tTweet):
        if word != " " and word not in self.stop and not word.isdigit():
            word = word.strip().lower()
            if len(word) > 26:
                word = word[:27]
            #### Normalize Emoticons
            try:
                word = self.emoticons[word]
            except:
                # Normalize Acronyms
                try:
                    try:
                        if self.wordDict[word] == 1:
                            word = word
                    except:
                        word = self.acronyms[word]
                except:
                    # Normalize Contractions
                    try:
                        word = self.contractions[word]
                    except:
                        # Normalize words (Spell)
                        try:
                            if self.wordDict[word] == 1:
                                word = word
                        except:
                            CW = self.correct(word)
                            if "@" in word or "#" in word:
                                word = word
                            else:
                                if CW != "a":
                                    word = CW
            if "@" in word:
                word = "@user"
            tempTweet = " ".join([tempTweet, word.strip()])
    tempTweet = tempTweet.lower().strip()
    tempTweet = " ".join(stemmer.stem(w) for w in tempTweet.split(" ") if w not in self.stop)
    #print(tempTweet.encode("utf-8"))
    return tempTweet

## Usage
# pre = Preprocess()
# pre.process("lol god pls help with my hw :) :(:D")
Example 9: process_line
def process_line(s, clean_string=True):
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    #return [process_token(None, token).lower() for token in tokens]
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c, token).lower().encode('UTF-8') for c, token in map(None, chunks, tokens)]
Example 10: all_tokens
def all_tokens(tweetreader):
    i = 0
    for r in tweetreader:
        i += 1
        tokens = tokenize(r[-1])
        for t in tokens:
            yield t
        if i >= 50000:
            return
Example 11: process_line
def process_line(s, clean_string=True):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c, token).lower().encode('UTF-8') for c, token in map(None, chunks, tokens)]
Example 12: get_idx_from_sent
def get_idx_from_sent(sent, word_idx_map, k):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    words = tokenize(sent)
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
        else:
            x.append(word_idx_map[UNK_TOKEN])
    return x
Example 13: process_statuses
def process_statuses(self, statuses):
    statuses = [twokenize.tokenize(s.text.lower()) for s in statuses]
    for s in xrange(len(statuses)):
        w = 1
        while True:
            if w >= len(statuses[s]):
                break
            if statuses[s][w][0] == "'":
                statuses[s] = statuses[s][:w-1] + [statuses[s][w-1] + statuses[s][w]] + statuses[s][w+1:]
                w = 0
            w += 1
    return statuses
Example 14: tokenize_and_clean
def tokenize_and_clean(msg, alignments):
    if alignments:
        toks = twokenize.tokenize(msg)
    else:
        toks = twokenize.simple_tokenize(msg)
    for i in range(len(toks)):
        toks[i] = toks[i].lower()
    inds = range(len(toks))
    #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks))) - set(inds))))
    if alignments:
        return toks.subset(inds)
    else:
        return [toks[i] for i in inds]
Example 15: normalize_tweet
def normalize_tweet(text, lowercase=False, rm_digits=False, return_tokens=False):
    if lowercase:
        text = text.lower()
    text = re.sub(URL_PATTERN, 'URL', text)
    tokens = twokenize.tokenize(text)
    if return_tokens:
        if rm_digits:
            tokens = [re.sub(NUM_PATTERN, 'NUM', tk) for tk in tokens]
        return tokens
    clean = ' '.join(tokens)
    if rm_digits:
        clean = re.sub(NUM_PATTERN, 'NUM', clean)
    return clean