

Python tokenize.wordpunct_tokenize Function Code Examples

This article collects typical usage examples of the nltk.tokenize.wordpunct_tokenize function in Python. If you have been asking yourself how wordpunct_tokenize is used, how to call it, or what real-world examples look like, the curated code examples here may help.


The following shows 15 code examples of the wordpunct_tokenize function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
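
As a quick reference before the examples: wordpunct_tokenize splits text with the regular expression \w+|[^\w\s]+, so runs of alphanumeric characters and runs of punctuation come back as separate tokens. Below is a minimal sketch of a call (the sample sentence is made up for illustration):

from nltk.tokenize import wordpunct_tokenize

text = "Good muffins cost $3.88, don't they?"
tokens = wordpunct_tokenize(text)
# Punctuation becomes standalone tokens and contractions are split apart:
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', ',', 'don', "'", 't', 'they', '?']
print(tokens)

This splitting behavior (for example, don't becoming don / ' / t) is why several of the examples below filter out punctuation tokens with a follow-up regular expression.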

Example 1: formatting_features

def formatting_features(obj):
	question = obj['question_text'].strip()
	topics   = [ t['name'] for t in obj['topics'] ]
	tokens   = [ w for w in wordpunct_tokenize(question) if not re.match(r'[\'\"\.\?\!\,\/\\\(\)\`]',w) ]
	punct    = [ p for p in wordpunct_tokenize(question) if re.match(r'[\'\"\.\?\!\,\/\\\(\)\`]',p) ]
	top_toks = set([ w.lower() for t in obj['topics'] 
						for w in wordpunct_tokenize(t['name']) ])
	qn_toks  = set(tokens)
	#qn_topic_words = len(top_toks & qn_toks)

	qn_mark   = 1 if "?" in question else -1 
	start_cap = 1 if re.match(r'^[A-Z]',question) else -1
	if tokens:
		qn_type = [ sum(1.0 for w in tokens if w in qws)
						for qws in qn_type_words ]
		nm_pres = sum(1.0 for w in tokens if w.lower() in names
							and re.match(r'^[A-Z]',w))
		pl_pres = sum(1.0 for w in tokens if w.lower() in places
							and re.match(r'^[A-Z]',w))
	else:
		qn_type = [0.0]*len(qn_type_words)
		nm_pres = -1.0
		pl_pres = -1.0

#	qn_somewhere =  1 if sum(qn_type) and (re.match(r'\?$',question)
#						or re.match(r'\?\s*[A-Z]',question)) else -1

	total_words = len(tokens)
	dict_words  = sum(1 for w in tokens if w.lower() in eng_words)
	correct_form_count = sum(1.0 for w in tokens
			if (w.lower() in eng_words and not re.match(r'^[A-Z]+$',w))
			or re.match(r'^[A-Z]',w)
		)
	question_form = 1 if '?' in punct and sum(1 for w in tokens if w in qn_words) else -1
	correct_form_ratio = correct_form_count/float(total_words+1)
	#topic_word_ratio   = qn_topic_words/float(total_words+1)
	name_ratio         = (nm_pres + pl_pres)/float(total_words+1)
	punctuation_ratio  = len(punct)/float(total_words+1)
	result = [
			#	1 if nm_pres else 0,
				nm_pres,
			#	1 if pl_pres else 0,
				pl_pres,
				qn_mark,
				start_cap,
			#	qn_somewhere,
				correct_form_ratio,
				#len(punct),
				punctuation_ratio,
		   		math.log(len(topics)+1),
		   		#len(topics),
				name_ratio,
			#	topic_word_ratio,
				dict_words,
			#	qn_topic_words,
			#	correct_form_count,
			#	math.log(total_words+1),
				total_words,
			] + qn_type
	return result
Developer: shawntan, Project: quora-codesprint-2013, Lines of code: 60, Source file: qn1.py

Example 2: getResult

def getResult(textFile, ind1, ind2, outFile, outFile2):
	fout = open(outFile,"w")
	fout2 = open(outFile2, "w")
	#probs = []
	for line in open(textFile):
		hyp1 = wordpunct_tokenize(line.strip().split("|||")[ind1].strip().decode("utf-8"))
		hyp2 = wordpunct_tokenize(line.strip().split("|||")[ind2].strip().decode("utf-8"))

		f = open("temp.txt","w")
		f.write("%s\n"%" ".join([x.encode("utf-8") for x in hyp1]))
		f.close()
		os.system("~/Course/AMMML/project/FeatureAugmentedRNNToolkit/rnnlm -rnnlm ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/model -test temp.txt -features-matrix ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/feature.txt -independent > temp_out.txt")
		
		prob1 = getProb("temp_out.txt")
	
		f = open("temp.txt","w")
		f.write("%s\n"%" ".join([x.encode("utf-8") for x in hyp2]))
		f.close()
		os.system("~/Course/AMMML/project/FeatureAugmentedRNNToolkit/rnnlm -rnnlm ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/model -test temp.txt -features-matrix ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/feature.txt -independent > temp_out.txt")
			
		prob2 = getProb("temp_out.txt")

		#probs.append([prob1,prob2])
		fout.write("%f\t%f\n"%(prob1,prob2))
		fout2.write("%f\t%f\n"%(prob1/float(len(hyp1)),prob2/float(len(hyp2))))
	fout.close()
	fout2.close()
Developer: sshiang, Project: sp2016.11-731, Lines of code: 27, Source file: rnnlm.py

Example 3: text_to_sentences

 def text_to_sentences(self, text, tokenizer, remove_stopwords=False ):
     print "text_to_sentence"
     #from nltk.tokenize import wordpunct_tokenize
     # Function to split a review into parsed sentences. Returns a 
     # list of sentences, where each sentence is a list of words
     #
     text=text.decode("utf8")
     from nltk.tokenize import sent_tokenize,wordpunct_tokenize
     # 1. Use the NLTK tokenizer to split the paragraph into sentences
     #raw_sentences = tokenizer.tokenize(text.strip())
     raw_sentences = sent_tokenize(text.strip())
     print "finish tokenize sentence",len(raw_sentences)
     #
     # 2. Loop over each sentence
     sentences = []
     for raw_sentence in raw_sentences:
         
         #print "sentence:",raw_sentence
         # If a sentence is empty, skip it
         if len(raw_sentence) > 0:
             # Otherwise, call review_to_wordlist to get a list of words
             #sentences.append( text_to_wordlist( raw_sentence, \
 #               remove_stopwords ))
             #print removePunctuation(raw_sentence).lower().split()
             print raw_sentence
             sentences.append(wordpunct_tokenize(raw_sentence))#raw_sentence.split())
             print wordpunct_tokenize(raw_sentence)
             #print  text_to_wordlist( raw_sentence, remove_stopwords )
     #    
     # Return the list of sentences (each sentence is a list of words,
     # so this returns a list of lists
     return sentences
Developer: billy322, Project: BioNLP-2016, Lines of code: 32, Source file: utilities.py

Example 4: getFormattingFeatures

def getFormattingFeatures(obj):
  question = obj["question_text"].strip()
  topics = [t["name"] for t in obj["topics"]]
  tokens = [w for w in wordpunct_tokenize(question) if not re.match(r"[\'\"\.\?\!\,\/\\\(\)\`]", w)]
  punct = [p for p in wordpunct_tokenize(question) if re.match(r"[\'\"\.\?\!\,\/\\\(\)\`]", p)]
  top_toks = set([w.lower() for t in obj["topics"] for w in wordpunct_tokenize(t["name"])])
  qn_toks  = set(tokens)
  qn_topic_words = len(top_toks & qn_toks)
  start_cap = 1 if re.match(r"^[A-Z]", question) else 0
  if len(tokens) > 0:
    qn_type = [1 if sum(1.0 for w in tokens if w in qws) else 0 for qws in qn_type_words]
  else:
    # penalize having no token words
    qn_type = [-1.0] * len(qn_type_words)
  total_words = len(tokens)
  correct_form_count = sum(1.0 for w in tokens if (not re.match(r"^[A-Z]+$", w)) or re.match(r"^[A-Z]", w))
  topic_word_ratio1  = max(0, qn_topic_words - 2) / float(total_words + 1)
  topic_word_ratio2  = max(0, 2 - qn_topic_words) / float(total_words + 1)
  topic_word_ratio   = qn_topic_words / float(total_words + 1)
  punctuation_ratio  = len(punct) / float(total_words + 1)
  word_overshoot = max(0, total_words - 10.1)
  word_undershoot = max(0, 10.1 - total_words)
  result = [
    start_cap,
    punctuation_ratio,
    math.log(len(topics) + 1),
    topic_word_ratio1,
    topic_word_ratio2,
    topic_word_ratio,
    word_overshoot,
    word_undershoot,
   ] + qn_type
  return result
Developer: ChandanBP, Project: quora, Lines of code: 33, Source file: interest_solution.py

Example 5: check_len_stats

def check_len_stats(std_dev):
    fraction = 0
    for i in range(1,5):
        fraction+=0.25
        count1 = 0
        count2 = 0
        mcount = 0
        ncount = 0
        threshold = fraction*std_dev
        print threshold
        with open(infile, 'r') as f:
            for line in f:
                mem_len = 0
                nonmem_len= 0
                if(line.strip().split('\t')[1]=='M'):
                    mem_len+=len(wordpunct_tokenize(line.strip().split('\t')[0]))
                    mcount +=1
                    if (float(mem_len) < threshold):
                        count1+=1
                else:
                    nonmem_len+=len(wordpunct_tokenize(line.strip().split('\t')[0]))
                    ncount+=1
                    if (float(nonmem_len) < threshold):
                        count2+=1
        f.close()
        print "iteration-" , i
        print "memorable quotes below threshold-", count1
        print "total memorable quotes-",mcount
        print "non-memorable quotes below threshold-",count2
        print "non memorable quotes-",ncount
Developer: anushabala, Project: memorability, Lines of code: 30, Source file: get_average_length.py

Example 6: jaccard_sentence

def jaccard_sentence(sentence1, sentence2):
    """
    Determines jaccard value of two sentences

    :param sentence1:
    :param sentence2:
    :return: jaccard value
    """
    return jaccard(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2))
Developer: bphenriques, Project: NLPMiniProjects, Lines of code: 9, Source file: SimilarityUtil.py

Example 7: dice_sentence

def dice_sentence(sentence1, sentence2):
    """
    Determines the Dice value of two sentences

    :param sentence1:
    :param sentence2:
    :return: dice value
    """
    return dice(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2))
Developer: bphenriques, Project: NLPMiniProjects, Lines of code: 9, Source file: SimilarityUtil.py

Example 8: common_words

def common_words(sent1,sent2):
    # remove stop words, lemmatise and return count of common words
    porter = PorterStemmer()
    #stop = stopwords.words('english')
    s1_words =  [porter.stem(i.lower()) for i in wordpunct_tokenize(sent1)  ]
    s2_words =  [porter.stem(i.lower()) for i in wordpunct_tokenize(sent2)  ]
    s1 = set(s1_words)
    s2 = set(s2_words)
    return len(s1.intersection(s2)) / ((len(s1)+0.1+len(s2))/2.0) # normalised 
Developer: lavanyats, Project: iMATCH, Lines of code: 9, Source file: wordnet_utils.py

Example 9: load_memes

    def load_memes (self, filenames):

        for filename in filenames:
            f = open(filename, 'r')
            contents = f.readlines()
            for entry in contents:
                fields = [s.strip() for s in entry.split("|")]
                meme_type = fields[0]
                top_text = wordpunct_tokenize(fields[1].lower())
                bottom_text = wordpunct_tokenize(fields[2].lower())
                self.memes[meme_type].append ((top_text, bottom_text))
Developer: AlexeyMK, Project: DATASS, Lines of code: 11, Source file: SentimentAnalysis.py

Example 10: generate_vocabulary

    def generate_vocabulary(self, review_summary_file):
        self.rev_sum_pair = pd.read_csv(review_summary_file,header=0).values

        for review,summary in self.rev_sum_pair:
            rev_lst = wordpunct_tokenize(review)
            sum_lst = wordpunct_tokenize(summary)
            self.__add_list_to_dict(rev_lst)
            self.__add_list_to_dict(sum_lst)

        # Now store the "" empty string as the last word of the vocabulary
        self.map[""] = len(self.map)
        self.revmap[len(self.map)] = ""
Developer: githubgzc, Project: deep-summarization, Lines of code: 12, Source file: data2tensor.py

Example 11: features_from_dump

def features_from_dump(infile,variant,embeddings,bowfilter):
    frame = read_dump(infile)
    refstatements = [wordpunct_tokenize(st) for st in list(frame.Ref)]
    targetstatements = [wordpunct_tokenize(st) for st in list(frame.Target)]
    featuredicts = []

    for i in range(len(refstatements)):
        sp = StatementPair(i, refstatements[i], targetstatements[i], 0)
        commonwords, onlyref, onlytarget = sp._word_venn_diagram()
        trainingbow.update(onlyref)
        featuredicts.append(sp.featurize(variant, embeddings,bowfilter))

    return featuredicts
Developer: hectormartinez, Project: verdisandbox, Lines of code: 13, Source file: classify_dga_dump.py

Example 12: med_sentence

def med_sentence(sentence1, sentence2, c1=1, c2=1, c3=1):
    """
    Determines minimum edit distance of two sentences.

    :param sentence1: first sentence
    :param sentence2: second sentence
    :param c1: optional weight
    :param c2: optional weight
    :param c3: optional weight
    :return: integer, minimum edit distance
    """

    return med(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2), c1, c2, c3)
Developer: bphenriques, Project: NLPMiniProjects, Lines of code: 13, Source file: SimilarityUtil.py

Example 13: main

def main():

    # related_words = {
    #     'art':['art', 'arts', , 'op art', 'pop art', 'art deco', 'art form', 'art house', 'art-house', 'clip art', 'fine art', 'art gallery', 'art nouveau', 'art therapy',  'kinetic art', 'martial art', 'art director', 'conceptual art', "objet d'art", 'performance art', 'work of art', 'state-of-the-art', 'the black art', 'thou art', 'noble art', 'craft', 'craftsmanship', 'ingenuity', 'mastery', 'artistry', 'imagination', 'Biedermeier', 'Parian', 'Queen Anne', 'annulate', 'anomphalous', 'banded', 'chryselephantine', 'aperture', 'collared', 'artificial', 'condensed', 'camera', 'copied'],

    #     'sport':['athletcis', 'recreation', 'candidacy', 'championship', 'clash', 'contention', 'event', 'fight', 'game', 'match', 'race', 'rivalry', 'run', 'sport', 'sports', 'struggle', 'tournament', 'trial', 'basketball', 'football', 'soccer', 'badminton', 'archery', 'tennis', 'swim']
    # }
    result = dict()
    clubs = list(Club.objects.all())
    print len(clubs)

    for club in clubs:
        score = 0
        # try:
        if club.introduction:
            intro = club.introduction
        else:
            intro = ""
        name = club.name
        max_score = 0
        max_cat = None
        for category in CATEGORIES:
            all_words = wordpunct_tokenize(intro.lower())
            all_name_words = wordpunct_tokenize(name.lower())
            score = 0
            for word in determinstic_words[category]:
                score += all_words.count(word) * 2
                score += all_name_words.count(word) * 10
            if score > max_score:
                max_cat = category
                max_score = score

        if max_cat and max_score > 2:
            category = Category.objects.get(name=max_cat)
            club.categories.add(category)
            club.save()

            try:
                # print name, max_cat, max_score
                result[max_cat].append(name)
            except KeyError:
                result[max_cat] = [name]

    for category in CATEGORIES:
        print category
        try:
            for club in result[category]:
                print club
        except:
            pass
        print "\n"
Developer: hpec, Project: rateyourclub, Lines of code: 51, Source file: categorize.py

Example 14: hypernym_count

def hypernym_count(sent1,sent2):

    s1_words =  [i.lower() for i in wordpunct_tokenize(sent1) ]
    s2_words =  [i.lower() for i in wordpunct_tokenize(sent2) ]
    s1_all = []
    s2_all = []

    for w in s1_words:
        s1_all.extend(get_hypernyms(w))
    for w in s2_words:
        s2_all.extend(get_hypernyms(w))
    w1_hypernym = len(set(s1_words).intersection(set(s2_all)))
    w2_hypernym = len(set(s2_words).intersection(set(s1_all)))
    return w1_hypernym-w2_hypernym
Developer: lavanyats, Project: iMATCH, Lines of code: 14, Source file: wordnet_utils.py

Example 15: best_dressed

def best_dressed(year):
    if year not in yearMap.keys():
        prep_year(year)

    strings = yearMap[year]['strings']
    dressPattern = re.compile(r'(dress)|(red carpet)|(redcarpet)', re.IGNORECASE)
    posPattern = re.compile(r'(best)|(beautiful)|(stun)|(love)', re.IGNORECASE)
    negPattern = re.compile(r'(worst)|(bad)|(ugly)|(hate)', re.IGNORECASE)
    namePattern = re.compile(r'[A-Z]\w* [A-Z]\w*') 
    stoplist = ['new','red','carpet','redcarpet','globes','golden','best','worst','movie','motion','picture','film','drama','comedy','musical','cecil','demille','award','tv','performance', 'actress','actor','television','feature','foreign','language','supporting','role','director','original','series']

    dress_mentions = Counter()
    dress_mentions_neg = Counter()
    dress_mentions_pos = Counter()
    for tweet in strings:
        if re.search(dressPattern, tweet):
            matches = re.findall(namePattern, tweet)
            matches = (w.lower() for w in matches)
            for match in matches:
                match_words = wordpunct_tokenize(match)
   
                if match_words[0] not in stoplist and match_words[1] not in stoplist:
                    dress_mentions[match] += 1
                    if re.search(posPattern, tweet):
                        dress_mentions_pos[match] += 1
                    if re.search(negPattern, tweet):
                        dress_mentions_neg[match] += 1


    discussed_dress = dress_mentions.most_common(1)
    best_dress = dress_mentions_pos.most_common(1)
    worst_dress = dress_mentions_neg.most_common(1)

    return best_dress[0][0], worst_dress[0][0], discussed_dress[0][0]
Developer: irabkina, Project: gg-project-master-2016, Lines of code: 34, Source file: gg_api.py


Note: The nltk.tokenize.wordpunct_tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by their respective developers, and copyright of the source code belongs to the original authors. Please consult each project's License before distributing or using the code; do not reproduce without permission.