当前位置: 首页>>代码示例>>Python>>正文


Python wordsegment.segment函数代码示例

本文整理汇总了Python中wordsegment.segment函数的典型用法代码示例。如果您正苦于以下问题:Python segment函数的具体用法?Python segment怎么用?Python segment使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了segment函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: main

def main(arg="iamtoocoolforthis"):

    s = clean(arg)
    print "CLEANED STRING:", s
    print "======================RUNNING OPTIMIZED==================="
    print segment_method1(s)
    print "======================RUNNING VANILLA==================="
    print segment(s)
开发者ID:nitin7,项目名称:WordBreak,代码行数:8,代码来源:example.py

示例2: precisioncalc

def precisioncalc(query):
	print query,
	k = searchgoogle(query)
	seg = segment(query)
	m = []
	for n in seg:
		m.append(stemming.porter2.stem(n))
	seg = " ".join(m)
	if socialListProxy:
		proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
		opener = ulib.build_opener(proxy)
		ulib.install_opener(opener)
	counter = 0
	total = 0
	for i in xrange(len(k)):
		req = ulib.Request(k[i], headers={'User-Agent': "Mozilla/5.0"})
		k[i] = segment(k[i])
		l = []
		for j in k[i]:
			l.append(stemming.porter2.stem(j))
		k[i] = " ".join(k[i])
		# print k[i]
		try:
			content = ulib.urlopen(req)
			x = re.findall("<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", content.read())
			t = []
			for s in x:
				t.append(stemming.porter2.stem(s))
			t = " ".join(t)
			# print t
			if ((seg in k[i]) or (seg in t)):
				counter = counter + 1
			total = total + 1
		except:
			pass

		if (total == 10):
			print str(counter)+"/"+str(total),
		if (total == 20):
			print str(counter)+"/"+str(total),


	if total < 10:
		print str(counter)+"/"+str(10), str(counter)+"/"+str(20)
	elif total < 20:
		print str(counter)+"/"+str(20)
	else:
		print ""
#precisioncalc("madhusai") # uncomment this to check the precision for a given word
开发者ID:SummerProject16,项目名称:SocialList,代码行数:49,代码来源:precision.py

示例3: info_extract

def info_extract(u):
    """Build the ``####``-delimited information string for tweet ``u``.

    The tweet is split on the ``url`` pattern, re-joined with spaces and
    tokenised with ``sep``. Tokens matching ``hasht`` or ``atp`` lose their
    first character, are lower-cased and are expanded into words with
    ``segment``. The resulting text is passed to ``categorize``, ``babelnet``
    and ``twe_cat``; the first match of ``url`` in ``u`` (if any) is passed to
    ``url_categ``; ``u`` itself is passed to ``senti``.
    """
    # Every fragment left after splitting on the URL pattern gets a trailing space.
    newtweet = "".join(fragment + " " for fragment in url.split(u))

    pieces = []
    for token in sep.split(newtweet):
        if hasht.match(token) or atp.match(token):
            # Drop the leading marker character and split the rest into words,
            # each followed by a space.
            token = "".join(word + " " for word in segment(token[1:].lower()))
        pieces.append(token + " ")
    tex = "".join(pieces)

    final_string = ""
    final_string += categorize(tex) + "####"
    final_string += babelnet(tex) + "####"

    twee = url.search(u)
    try:
        # ``twee`` is None when the tweet has no URL; that case (and any
        # failure inside url_categ) leaves the URL section out.
        urls = str(twee.group(0))
        final_string += url_categ(urls) + "<br>"
    except:
        pass

    final_string += twe_cat(tex) + "####"
    final_string += senti(u) + "####"
    return final_string
开发者ID:InfoExtr2015,项目名称:Retrieval_Extraction_Information,代码行数:32,代码来源:__init__.py

示例4: create_dict

def create_dict():
	relation_name=[x[2] for x in os.walk("nell/relations")][0]
	sub_table={}
	obj_table={}
	for r in relation_name:
		lst=[]
		r_name=' '.join(segment(r.split(':')[1]))
		print r_name
		with open("nell/relations/"+r) as fp:
			for line in fp:
				line=line.rstrip('\n')
				sub,obj=line.split('\t')
				sub=' '.join((sub.split(":")[2]).split('_'))
				obj=' '.join((obj.split(":")[2]).split('_'))
				if sub in sub_table:
					tmp=sub_table[sub]
					tmp=tmp.union([r_name])
					sub_table[sub]=tmp
					#print("y")
				else:
					sub_table[sub]=set([r_name])
				if obj in obj_table:
					tmp=obj_table[obj]
					tmp=tmp.union([r_name])
					obj_table[obj]=tmp
					#print("yy")
				else:
					obj_table[obj]=set([r_name])
				#print len(sub_table[sub]),len(obj_table[obj])
	return sub_table,obj_table
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:30,代码来源:relations_dict.py

示例5: test_segment_12

def test_segment_12():
    """Segmenting the concatenated words must return the original word list."""
    expected = [
        'far', 'out', 'in', 'the', 'uncharted',
        'backwaters', 'of', 'the', 'unfashionable', 'end',
        'of', 'the', 'western', 'spiral', 'arm',
        'of', 'the', 'galaxy', 'lies', 'a',
        'small', 'un', 'regarded', 'yellow', 'sun',
    ]
    joined = ''.join(expected)
    assert segment(joined) == expected
开发者ID:grantjenks,项目名称:wordsegment,代码行数:7,代码来源:test_coverage.py

示例6: test_segment_10

def test_segment_10():
    """Segmenting the concatenated words must return the original word list."""
    expected = [
        'as', 'gregor', 'samsa', 'awoke', 'one',
        'morning', 'from', 'uneasy', 'dreams', 'he',
        'found', 'himself', 'transformed', 'in', 'his',
        'bed', 'into', 'a', 'gigantic', 'insect',
    ]
    joined = ''.join(expected)
    assert segment(joined) == expected
开发者ID:grantjenks,项目名称:wordsegment,代码行数:7,代码来源:test_coverage.py

示例7: test_segment_9

def test_segment_9():
    """Segmenting the concatenated words must return the original word list."""
    expected = [
        'it', 'was', 'the', 'best', 'of', 'times',
        'it', 'was', 'the', 'worst', 'of', 'times',
        'it', 'was', 'the', 'age', 'of', 'wisdom',
        'it', 'was', 'the', 'age', 'of', 'foolishness',
    ]
    joined = ''.join(expected)
    assert segment(joined) == expected
开发者ID:grantjenks,项目名称:wordsegment,代码行数:7,代码来源:test_coverage.py

示例8: k_list_repeat

def k_list_repeat(query):
    """Return the stemmed ``<title>`` text of each search result for ``query``.

    Every URL returned by ``searchgoogle`` is fetched; the contents of its
    title tags are stemmed and joined with spaces, and that string is appended
    to the result list. URLs that cannot be fetched or parsed are skipped, so
    the returned list may be shorter than the list of search results.
    """
    k = searchgoogle(query)
    m = []

    if socialListProxy:
        proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
        opener = ulib.build_opener(proxy)
        ulib.install_opener(opener)

    for i in xrange(len(k)):
        # Build the request before k[i] is overwritten with its word form.
        req = ulib.Request(k[i], headers={'User-Agent': "Mozilla/5.0"})
        # Store the stemmed tokens: the stems were previously computed and then
        # discarded. k[i] is not read again in this function.
        k[i] = " ".join(stemming.porter2.stem(token) for token in segment(k[i]))
        try:
            content = ulib.urlopen(req)
            # Read the title of the page behind the URL.
            titles = re.findall(r"<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", content.read())
            m.append(" ".join(stemming.porter2.stem(title) for title in titles))
        except Exception:
            # Best effort: skip results that fail, but no longer swallow
            # KeyboardInterrupt/SystemExit.
            continue
    return m
开发者ID:SummerProject16,项目名称:SocialList,代码行数:30,代码来源:k_list_repeat.py

示例9: segment_hashtag

def segment_hashtag(h):
    """Split the text of hashtag ``h`` into words.

    ``h`` is either a string or an object with a ``group()`` method (such as
    a regex match). Its first character is dropped and the remainder is passed
    to ``wordsegment.segment``. The words are returned space-separated,
    prefixed with " hashtag " and followed by " , ".
    """
    raw = h.group() if hasattr(h, "group") else h
    words = wordsegment.segment(raw[1:])
    return " hashtag " + " ".join(words) + " , "
开发者ID:lambros-mavrides,项目名称:ml_practice,代码行数:8,代码来源:morph.py

示例10: get_word_vector

 def get_word_vector(self, word):
     """Return the vector for ``word``, caching it in ``self.word_vectors_map``.

     ``word`` is stripped of surrounding whitespace and of leading/trailing
     ``[``, ``]``, ``(`` and ``)``; the cache key is its lower-cased form.
     On a cache miss the vector is looked up in ``self.word2vec_model``
     under the word as given, then lower-cased, then upper-cased. If none
     of those is in the vocabulary the word is broken into sub-words
     (trying whitespace, ``,``, ``/``, ``:``, ``-``, ``_`` and finally
     ``ws.segment``), and the sub-word vectors that could be resolved are
     combined with ``ss.fftconvolve(..., mode='same')``.

     Returns None when ``word`` is None, when no sub-word split is found,
     when no sub-word has a vector, or when a UnicodeDecodeError occurs.
     """
     if word is None:
         return None
     word = word.strip().strip('[').strip(']').strip('(').strip(')')
     word_lower = word.lower()
     word_upper = word.upper()
     try:
         # Only compute on a cache miss; a cached None is also treated as a hit.
         if word_lower not in self.word_vectors_map:
             if config.debug:
                 print 'getting word vector for ', word
             if word in self.word2vec_model.vocab:
                 self.word_vectors_map[word_lower] = self.word2vec_model[word]
             # TODO: if the vocab is ensured to be lower case, this condition is not required
             elif word_lower in self.word2vec_model.vocab:
                 self.word_vectors_map[word_lower] = self.word2vec_model[word_lower]
             elif word_upper in self.word2vec_model.vocab:
                 self.word_vectors_map[word_lower] = self.word2vec_model[word_upper]
             else:
                 # If concept_regexp removes the whole word, retry with the
                 # alpha_regex matches removed instead.
                 # NOTE(review): presumably both are compiled regexes; if
                 # alpha_regex leaves the word unchanged (e.g. an empty word)
                 # this looks like it recurses without end -- confirm.
                 if not constants.concept_regexp.sub('', word):
                     return self.get_word_vector(constants.alpha_regex.sub('', word))
                 # Try each delimiter in turn, moving on only while the word
                 # is still a single piece.
                 subwords = word.split()
                 if len(subwords) == 1:
                     subwords = word.split(',')
                     if len(subwords) == 1:
                         subwords = word.split('/')
                         if len(subwords) == 1:
                             subwords = word.split(':')
                             if len(subwords) == 1:
                                 subwords = word.split('-')
                                 if len(subwords) == 1:
                                     subwords = word.split('_')
                                     if len(subwords) == 1:
                                         # Last resort: statistical word segmentation.
                                         # print 'performing word segmentation on ', word
                                         subwords = ws.segment(word.encode('utf8'))
                                         if len(subwords) == 1:
                                             print 'could not get wordvector for ', word
                                             self.word_vectors_map[word_lower] = None
                 if len(subwords) > 1:
                     # Fold the sub-word vectors together, skipping sub-words
                     # that have no vector of their own.
                     curr_wordvec = None
                     for curr_subword in subwords:
                         curr_subword_vec = self.get_word_vector(curr_subword)
                         if curr_subword_vec is not None:
                             if curr_wordvec is None:
                                 curr_wordvec = curr_subword_vec
                             else:
                                 start_time = time.time()
                                 curr_wordvec = ss.fftconvolve(curr_wordvec, curr_subword_vec, mode='same')
                                 if config.debug:
                                     print 'performed fast fourier transform convolution on word vectors in {} seconds.'.format(time.time()-start_time)
                     # Stays None when no sub-word could be resolved.
                     self.word_vectors_map[word_lower] = curr_wordvec
         return self.word_vectors_map[word_lower]
     except UnicodeDecodeError as ude:
         # Cache the failure so the same word is not retried.
         print 'error getting word vector for ', word
         print ude.message
         self.word_vectors_map[word_lower] = None
         return self.word_vectors_map[word_lower]
开发者ID:sgarg87,项目名称:big_mech_isi_gg,代码行数:56,代码来源:word_vectors.py

示例11: read_nell_relations

def read_nell_relations():
    """Read the relation names of the NELL graph.

    Each file directly inside ``nell/relations`` names one relation: the part
    of the file name after the first ``:`` is segmented into words and joined
    with spaces.

    Returns the list of relation names, or an empty list when
    ``nell/relations`` does not exist.
    """
    # os.walk yields the top-level directory first. Take that entry only, so
    # files in nested directories are not picked up by accident, and so a
    # missing directory gives an empty result instead of an unbound variable.
    top_level = next(os.walk("nell/relations"), None)
    if top_level is None:
        return []
    return [' '.join(segment(name.split(':')[1])) for name in top_level[2]]
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:13,代码来源:read_data.py

示例12: test12

def test12(tagtocheck):
    """Return the ratio of misspelled to correctly spelled words in a tag.

    ``tagtocheck`` is segmented into words and each word is checked against
    the "en-US" dictionary. The result is ``incorrect / correct`` formatted
    as a string with four decimals, or the integer 0 when no word is spelled
    correctly.
    """
    checker = en.Dict("en-US")
    # True marks a word the dictionary explicitly rejected.
    rejected = [checker.check(word) == False for word in ws.segment(tagtocheck)]
    incorrect = rejected.count(True)
    correct = len(rejected) - incorrect
    if correct == 0:
        return 0
    return "%.4f" % (float(incorrect) / correct)
开发者ID:SummerProject16,项目名称:SocialList,代码行数:14,代码来源:testFile12.py

示例13: create_dict_adva

def create_dict_adva():
	relation_name=[x[2] for x in os.walk("nell/relations")][0]
	sub_table={}
	obj_table={}
	for r in relation_name:
		lst=[]
		r_name=' '.join(segment(r.split(':')[1]))
		print r_name
		with open("nell/relations/"+r) as fp:
			for line in fp:
				line=line.rstrip('\n')
				sub,obj=line.split('\t')
				sub=sub.split(":")[1:]
				obj=obj.split(":")[1:]
				for tmp in sub:
					tmpsb=''.join(tmp.split('_'))
					tmpsb=segment(tmpsb)
					for sb in tmpsb:
						if sb in sub_table:
							tmp=sub_table[sb]
							tmp=tmp.union([r_name])
							sub_table[sb]=tmp
							#print("y")
						else:
							sub_table[sb]=set([r_name])
				for tmp in obj:
					tmpob=''.join(tmp.split('_'))
					tmpob=segment(tmpob)
					for ob in tmpob:
						if ob in obj_table:
							tmp=obj_table[ob]
							tmp=tmp.union([r_name])
							obj_table[ob]=tmp
							#print("yy")
						else:
							obj_table[ob]=set([r_name])
	return sub_table,obj_table
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:37,代码来源:relations_dict.py

示例14: read_relation_name

def read_relation_name(folder_name):
    """Collect relation names from the folders found under ``folder_name``.

    Every directory path that ``os.walk`` reports below ``folder_name`` is
    expected to look like "concept:relation"; the part after the first ``:``
    is segmented into words and joined with spaces.

    NOTE(review): the root folder is skipped and then the first collected
    relation is dropped as well, so the first sub-folder walked never appears
    in the result -- confirm whether that second skip is intended.

    Returns the list of relation names.
    """
    walked_dirs = [entry[0] for entry in os.walk(folder_name)]
    relations = [
        ' '.join(segment(path.split(':')[1]))
        for path in walked_dirs[1:]
    ]
    return relations[1:]
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:15,代码来源:read_data.py

示例15: checkTweetNums

def checkTweetNums(tweets,minTweets):
    """Report whether enough tweets carry one of the number-related tag marks.

    Each tweet is segmented into words and the batch is tagged with
    ``cmu.runtagger_parse``. A tweet counts when its concatenated tagger
    output contains "$N", "$^", "$M" or "$Z" (per the original comments, a
    number next to a noun).

    Returns 1 when at least ``minTweets`` tweets count, otherwise 0.
    """
    processedtweets = [" ".join(wordsegment.segment(line)) for line in tweets]
    markers = ("$N", "$^", "$M", "$Z")
    count = 0
    for postag in cmu.runtagger_parse(processedtweets):
        tags = "".join(postag)
        if any(marker in tags for marker in markers):
            count += 1
    return 1 if count >= minTweets else 0
开发者ID:SummerProject16,项目名称:opinion-or-fact,代码行数:16,代码来源:TweetCheck.py


注:本文中的wordsegment.segment函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。