

Python twokenize.tokenize Function Code Examples

This article collects typical usage examples of the Python twokenize.tokenize function. If you are wondering what exactly tokenize does, how to call it, or what real-world usage looks like, the hand-picked code examples below should help.


A total of 15 code examples of the tokenize function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
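Before the examples, here is a minimal usage sketch of the function itself (assuming a twokenize module from the twitter_nlp / TweetMotif family is importable; the sample tweet and the token list in the comment are illustrative assumptions, not output taken from the examples below):

import twokenize

# twokenize.tokenize takes one raw tweet string and returns a list of token strings,
# keeping Twitter-specific units such as @mentions, URLs, hashtags and emoticons intact.
tweet = "@user check this out!! http://example.com #nlp :)"
tokens = twokenize.tokenize(tweet)
print(tokens)
# A plausible result: ['@user', 'check', 'this', 'out', '!!', 'http://example.com', '#nlp', ':)']

The examples that follow embed the same call in larger pipelines: capitalization feature extraction, k-shingling, POS tagging, and tweet normalization.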

Example 1: Extract

    def Extract(self, text):
        features = []
        words = twokenize.tokenize(text)

        #hand-crafted features
        iCapitalized = True
        nCapitalized = 0.1
        nAllCaps = 0.1
        nCapLowerViolated = 0.1
        nCapUpperViolated = 0.1
        nWords = 0.1
        for i in range(len(words)):
            capitalized = re.search(r'^([A-Z]|[a-z][A-Z])', words[i])

            if capitalized and not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
                nCapitalized += 1.0

            if not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
                if capitalized and self.capDict.get(words[i].lower(), '1') != '1':
                    nCapUpperViolated += 1.0
                    features.append(self.fVocab.GetID('upperViolated=%s' % words[i].lower()))
                elif not capitalized and re.match(r'[a-z]+', words[i]) and self.capDict.get(words[i].lower(), '1') != '0':
                    nCapLowerViolated += 1.0
                    #features.append(self.fVocab.GetID('lowerViolated=%s' % words[i].lower()))
                if re.match(r'\w+', words[i][0:1]):
                    nWords += 1
            if re.match(r"i|i'm|im|u", words[i]):
                iCapitalized = False
            if re.match(r"[A-Z]{2,}", words[i]):
                nAllCaps += 1
                
        features.append(self.fVocab.GetID('iCapitalized=%s' % iCapitalized))

        return ' '.join(["%s:1" % x for x in features]) + " %s:%s" % (self.fVocab.GetID('nAllCaps'), nAllCaps/nWords) + " %s:%s" % (self.fVocab.GetID('nCapitalized'), nCapitalized/nWords) + " %s:%s" % (self.fVocab.GetID('nCapLowerViolated'), nCapLowerViolated/nWords) + " %s:%s" % (self.fVocab.GetID('nCapUpperViolated'), nCapUpperViolated/nWords)
Developer: 52nlp, Project: twitter_nlp, Lines: 34, Source: cap_classifier.py

Example 2: kshinglize

def kshinglize(s, k=KSHINGLES, stopwords=STOPWORDS):
    """ Tokenizes string s, removes stopwords, and returns a set of k-shingles
    """
    s = s.strip().lower()
    tokens_raw = twokenize.tokenize(s)
    tokens = filterstopwords(tokens_raw, stopwords)
    return tokens_to_kshingles(tokens, k)
Developer: driscoll, Project: cluster, Lines: 7, Source: cluster.py

Example 3: main

def main(argv):

    tagger = PerceptronTagger()
    tagset = None
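    # `line` is assumed to hold one line of raw tweet text; Example 6 shows the full file-reading loop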
    tokens = tokenize(line)
    tags = nltk.tag._pos_tag(tokens, tagset, tagger)
    format_tagged(tags)
Developer: h4x0rsz, Project: senior-design, Lines: 7, Source: tagAndLabel.py

Example 4: learn_terms

 def learn_terms(self, tweets_file_object, learn_lemmas=True, cache_size=1000000):
     reader = csv.reader(tweets_file_object, delimiter=",", quotechar="\"")
     term_freq = Counter()
     term_id_map = dict()
     tweet_vectors = []
     for row in reader:
         tweet_id = int(row[0])
         tweet_text = row[-1]
         terms = [t.lower().encode("utf-8") for t in twokenize.tokenize(tweet_text)]
         if learn_lemmas:
             terms = [self.lmtz.lemmatize(term) for term in terms]
         tweet_sp_vector = []
         counted_ids = []
         for term in terms:
             if term not in term_id_map:
                 term_id = len(term_id_map)
                 term_id_map[term] = term_id
             else:
                 term_id = term_id_map[term]
             if term_id not in counted_ids:
                 term_freq[term_id] += 1
                 counted_ids.append(term_id)
             tweet_sp_vector.append(term_id)
         tweet_vectors.append((tweet_id, tweet_sp_vector))
         if len(tweet_vectors) >= cache_size:
             self.write_tweet_vectors(tweet_vectors)
             tweet_vectors = []
     self.write_tweet_vectors(tweet_vectors)
     self.write_terms(term_id_map, term_freq)
Developer: zaycev, Project: n7, Lines: 29, Source: search.py

Example 5: preprocess

def preprocess(m, sep_emoji=False):
    m = m.lower()    
    m = max_reps(m)
    #replace user mentions with token '@user'
    user_regex = r"[email protected]+?( |$)|<@mention>"    
    m = re.sub(user_regex," @user ", m, flags=re.I)
    #replace urls with token 'url'
    m = re.sub(twokenize.url," url ", m, flags=re.I)        
    tokenized_msg = ' '.join(twokenize.tokenize(m)).strip()
    if sep_emoji:
        #tokenize emoji; this tokenizer, however, separates repeated punctuation, e.g. "blah blah!!!" -> ['blah','blah','!','!','!'] instead of ['blah','blah','!!!'], so identical adjacent punctuation tokens are re-joined below
        m_toks = tokenized_msg.split()
        n_toks = twk.tokenize(tokenized_msg)         
        if len(n_toks)!=len(m_toks):
            #check if there is any punctuation in this string
            has_punct = map(lambda x:x in twk.punctuation, n_toks)
            if any(has_punct):  
                new_m = n_toks[0]
                for i in xrange(1,len(n_toks)):
                    #while the same punctuation token shows up, concatenate
                    if has_punct[i] and has_punct[i-1] and (n_toks[i] == n_toks[i-1]):
                        new_m += n_toks[i]
                    else:
                        #otherwise add space
                        new_m += " "+n_toks[i]                   
                tokenized_msg = new_m                
    return tokenized_msg.lstrip()
Developer: samiroid, Project: utils, Lines: 27, Source: __init__.py

Example 6: main

def main(argv):

    if len(sys.argv) != 3:
        print("Usage:> python getTaggedFile.py infile.txt outfile.txt")
        exit()

    infile_name = str(sys.argv[1])
    outfile_name = str(sys.argv[2])

    infile = open(infile_name, 'r')
    outfile = open(outfile_name, 'w')

    tagger = PerceptronTagger()

    print("Reading file...")
    line = infile.readline()

    while line != '':
        # Use the Twitter-aware tokenizer (twokenize) on each line
        tagset = None
        tokens = tokenize(line)
        tags = nltk.tag._pos_tag(tokens, tagset, tagger)
        outfile.write(format_tagged(tags))
        line = infile.readline()

    # close file and connection
    infile.close()
    outfile.close()
    print("Finished tagging... Closing files.")
Developer: h4x0rsz, Project: senior-design, Lines: 29, Source: getTaggedFile.py

Example 7: __init__

 def __init__(self, testData):
     self.labeledTweets = []
     for line in open(testData):
         line = line.rstrip('\n')
         fields = line.split('\t')
         fields[6] = ' '.join(twokenize.tokenize(fields[6]))
         self.labeledTweets.append(fields)
Developer: 52nlp, Project: twitter_nlp, Lines: 7, Source: cap_eval.py

Example 8: process

	def process(self,text):
		
		tTweet = ""
		for word in text.split():
			if "#" in word:
				word = word.replace("#"," ")
				f=0
				for tt in self.remove:
					if tt in word:
						f=1
				if f==1:
					continue
			tTweet = " ".join([tTweet,word])
			tTweet = tTweet.strip()

		tempTweet = ""
		for word in twokenize.tokenize(tTweet):
			if word != " " and word not in self.stop and not word.isdigit():
				word = word.strip().lower()
				if len(word) > 26:
					word=word[:27]
				#### Normalize Emoticons
				try:
					word = self.emoticons[word]
				except:
					#Normalize Acronyms
					try:
						try:
							if  self.wordDict[word] ==1:
								word = word
						except:
							word = self.acronyms[word]
					except:
					#Normalize Contractions
						try:
							word = self.contractions[word]
						except:
							#Normalize words (Spell)
							try:
								if self.wordDict[word] == 1:
									word =	word
							except:
								CW = self.correct(word)
								if "@" in word or "#" in word:
									word = word
								else:
									if CW != "a":
										word = CW
				if "@" in word:
					word="@user"
				tempTweet = " ".join([tempTweet,word.strip()])
				tempTweet = tempTweet.lower().strip()
		tempTweet = " ".join(stemmer.stem(w) for w in tempTweet.split(" ") if w not in self.stop)
		#print(tempTweet.encode("utf-8"))
		return(tempTweet)

##Usage
# pre = Preprocess()
# pre.process("lol god pls help with my hw :) :(:D")
Developer: suddu16, Project: Youtube-Comedy-Comparison, Lines: 59, Source: PreprocessClass.py

Example 9: process_line

def process_line(s, clean_string=True):
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    #return [process_token(None,token).lower() for token in tokens]
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
Developer: npow, Project: Ubuntu-Dialogue-Generationv2, Lines: 8, Source: createDictionaries.py

Example 10: all_tokens

def all_tokens(tweetreader):
    i = 0
    for r in tweetreader:
        i += 1
        tokens = tokenize(r[-1])
        for t in tokens:
            yield t
        if i >= 50000:
            return
Developer: zaycev, Project: n7, Lines: 9, Source: pmi.py

Example 11: process_line

def process_line(s, clean_string=True):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c,token).lower().encode('UTF-8') for c,token in map(None, chunks, tokens)]
Developer: npow, Project: Ubuntu-Dialogue-Generationv2, Lines: 10, Source: find_testfiles.py

Example 12: get_idx_from_sent

def get_idx_from_sent(sent, word_idx_map, k):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    words = tokenize(sent)
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
        else:
            x.append(word_idx_map[UNK_TOKEN])
    return x
Developer: BinbinBian, Project: ubottu, Lines: 12, Source: merge_data.py

Example 13: process_statuses

 def process_statuses(self, statuses):
     statuses = [twokenize.tokenize(s.text.lower()) for s in statuses]
     for s in xrange(len(statuses)):
         w = 1
         while True:
             if w >= len(statuses[s]):
                 break
             if statuses[s][w][0] == "'":
                 statuses[s] = statuses[s][:w-1] + [statuses[s][w-1] + statuses[s][w]] + statuses[s][w+1:]
                 w = 0
             w += 1
     return statuses
Developer: goddardc, Project: nlp-twitter, Lines: 12, Source: main.py

Example 14: tokenize_and_clean

def tokenize_and_clean(msg, alignments):
  if alignments: 
    toks = twokenize.tokenize(msg)
  else:          
    toks = twokenize.simple_tokenize(msg)
  for i in range(len(toks)):
    toks[i] = toks[i].lower()
  inds = range(len(toks))
  #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks)))-set(inds))))
  if alignments: 
    return toks.subset(inds)
  else:
    return [toks[i] for i in inds]
Developer: AnnuSachan, Project: tweetmotif, Lines: 13, Source: bigrams.py

Example 15: normalize_tweet

def normalize_tweet(text, lowercase=False, rm_digits=False, return_tokens=False):
    if lowercase:
        text = text.lower()
    text = re.sub(URL_PATTERN, 'URL', text)
    tokens = twokenize.tokenize(text)
    if return_tokens:
        if rm_digits:
            tokens = map(lambda tk: re.sub(NUM_PATTERN, 'NUM', tk), tokens)
        return tokens
    clean = ' '.join(tokens)
    if rm_digits:
        clean = re.sub(NUM_PATTERN, 'NUM', clean)
    return clean
Developer: imgemp, Project: semeval16, Lines: 13, Source: __init__.py


Note: The twokenize.tokenize function examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright in the source code remains with the original authors. Please follow the corresponding project's license when redistributing or using the code, and do not reproduce this article without permission.