本文整理汇总了Python中wordsegment.segment函数的典型用法代码示例。如果您正苦于以下问题:Python segment函数的具体用法?Python segment怎么用?Python segment使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了segment函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
def main(arg="iamtoocoolforthis"):
    """Clean ``arg`` and print its segmentation from both implementations.

    The cleaned string is segmented with ``segment_method1`` (printed under
    the "OPTIMIZED" banner) and then with ``segment`` (printed under the
    "VANILLA" banner) so the two results can be compared by eye.

    Args:
        arg: Raw text to clean and segment.
    """
    cleaned = clean(arg)
    # Single-argument print(...) calls give the same output on Python 2 and
    # Python 3; the former print statements only parsed on Python 2.
    print("CLEANED STRING: %s" % (cleaned,))
    print("======================RUNNING OPTIMIZED===================")
    print(segment_method1(cleaned))
    print("======================RUNNING VANILLA===================")
    print(segment(cleaned))
示例2: precisioncalc
def precisioncalc(query):
    """Print how many search results for ``query`` mention the query itself.

    The query is segmented, every word is stemmed and the stems are joined
    with spaces.  Each link returned by ``searchgoogle`` is segmented and
    stemmed the same way, and its page is downloaded so the text captured
    between its ``title`` tags can be stemmed too.  A link is a hit when the
    stemmed query is a substring of the stemmed link or of the stemmed
    titles.  Links whose page cannot be fetched or processed are skipped and
    do not count towards the total.

    One line is written to stdout: the query, then ``hits/10`` as soon as
    ten pages have been counted and ``hits/20`` as soon as twenty have been
    counted.  When fewer pages were counted, the final hit count is reported
    against 10 and/or 20 instead.

    Args:
        query: Search string to evaluate.
    """
    import sys

    out = sys.stdout
    # Written without a newline so the scores land on the same line; flushed
    # so the query is visible while the pages are being downloaded.
    out.write("%s" % (query,))
    out.flush()

    k = searchgoogle(query)
    stemmed_query = " ".join(stemming.porter2.stem(w) for w in segment(query))

    if socialListProxy:
        proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
        opener = ulib.build_opener(proxy)
        ulib.install_opener(opener)

    counter = 0
    total = 0
    for i, link in enumerate(k):
        req = ulib.Request(link, headers={'User-Agent': "Mozilla/5.0"})
        # Join the *stemmed* words: the query is stemmed, so comparing it
        # with the unstemmed link (as the code used to) could never match a
        # word whose stem differs from its surface form.
        k[i] = " ".join(stemming.porter2.stem(w) for w in segment(link))
        try:
            content = ulib.urlopen(req)
            try:
                page = content.read()
            finally:
                content.close()
            titles = re.findall(r"<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", page)
            title_text = " ".join(stemming.porter2.stem(t) for t in titles)
            hit = stemmed_query in k[i] or stemmed_query in title_text
        except Exception:
            # Best effort: a page that cannot be fetched or processed is
            # skipped.  KeyboardInterrupt/SystemExit still propagate.
            continue
        if hit:
            counter += 1
        total += 1
        # Checked only right after ``total`` grows, so a later failed fetch
        # cannot print the same score a second time.
        if total in (10, 20):
            out.write(" %d/%d" % (counter, total))

    if total < 10:
        out.write(" %d/10 %d/20\n" % (counter, counter))
    elif total < 20:
        out.write(" %d/20\n" % (counter,))
    else:
        out.write("\n")
# precisioncalc("madhusai")  # uncomment this to check the precision of some word
示例3: info_extract
def info_extract(u):
    """Build the "####"-separated analysis string for the text ``u``.

    The text around matches of the ``url`` pattern is re-joined with spaces
    and split with ``sep``.  Tokens matched by ``hasht`` or ``atp`` lose
    their first character, are lower-cased and replaced by their segmented
    words.  The rebuilt text is passed to ``categorize``, ``babelnet`` and
    ``twe_cat``; the first ``url`` match in ``u`` (if any) is passed to
    ``url_categ``; ``u`` itself is passed to ``senti``.

    Args:
        u: Text to analyse.

    Returns:
        The concatenation of ``categorize``, ``babelnet``, the optional
        ``url_categ`` result followed by "<br>", ``twe_cat`` and ``senti``
        results, each of the others followed by "####".
    """
    # Every piece (and later every word/token) keeps its trailing space so
    # the rebuilt text is character-for-character what the helpers received
    # before.
    newtweet = "".join(piece + " " for piece in url.split(u))

    rebuilt = []
    for token in sep.split(newtweet):
        if hasht.match(token) or atp.match(token):
            words = segment(token[1:].lower())
            token = "".join(word + " " for word in words)
        rebuilt.append(token + " ")
    tex = "".join(rebuilt)

    parts = [categorize(tex) + "####", babelnet(tex) + "####"]

    found = url.search(u)
    if found is not None:
        try:
            parts.append(url_categ(str(found.group(0))) + "<br>")
        except Exception:
            # Best effort, as before: a failing URL lookup only omits its
            # section instead of aborting the whole extraction.
            pass

    parts.append(twe_cat(tex) + "####")
    parts.append(senti(u) + "####")
    return "".join(parts)
示例4: create_dict
def create_dict():
    """Map subjects and objects found under ``nell/relations`` to relations.

    For every file directly inside ``nell/relations`` the second
    ":"-separated part of the file name is segmented into a space-joined
    relation name, which is also printed.  Each line of the file must hold
    two tab-separated fields; the third ":"-separated part of each field,
    with underscores turned into spaces, is used as the subject (first
    field) or object (second field).

    Returns:
        tuple: ``(sub_table, obj_table)`` — dicts mapping each subject /
        object string to the set of relation names it appeared with.  Both
        are empty when ``nell/relations`` does not exist.
    """
    root = "nell/relations"
    # Only the top-level file list is needed, so take the first os.walk
    # entry instead of walking the whole tree; the default covers a missing
    # directory.
    relation_files = next(os.walk(root), (root, [], []))[2]

    sub_table = {}
    obj_table = {}
    for file_name in relation_files:
        r_name = ' '.join(segment(file_name.split(':')[1]))
        print(r_name)
        with open(root + "/" + file_name) as fp:
            for line in fp:
                sub, obj = line.rstrip('\n').split('\t')
                sub = sub.split(":")[2].replace('_', ' ')
                obj = obj.split(":")[2].replace('_', ' ')
                sub_table.setdefault(sub, set()).add(r_name)
                obj_table.setdefault(obj, set()).add(r_name)
    return sub_table, obj_table
示例5: test_segment_12
def test_segment_12():
    """Segmenting the concatenated words must give back the word list."""
    expected = [
        'far', 'out', 'in', 'the', 'uncharted', 'backwaters',
        'of', 'the', 'unfashionable', 'end', 'of', 'the',
        'western', 'spiral', 'arm', 'of', 'the', 'galaxy',
        'lies', 'a', 'small', 'un', 'regarded', 'yellow', 'sun',
    ]
    unspaced = ''.join(expected)
    assert segment(unspaced) == expected
示例6: test_segment_10
def test_segment_10():
    """Segmenting the concatenated words must give back the word list."""
    expected = [
        'as', 'gregor', 'samsa', 'awoke', 'one', 'morning',
        'from', 'uneasy', 'dreams', 'he', 'found', 'himself',
        'transformed', 'in', 'his', 'bed', 'into', 'a',
        'gigantic', 'insect',
    ]
    unspaced = ''.join(expected)
    assert segment(unspaced) == expected
示例7: test_segment_9
def test_segment_9():
    """Segmenting the concatenated words must give back the word list."""
    expected = [
        'it', 'was', 'the', 'best', 'of', 'times',
        'it', 'was', 'the', 'worst', 'of', 'times',
        'it', 'was', 'the', 'age', 'of', 'wisdom',
        'it', 'was', 'the', 'age', 'of', 'foolishness',
    ]
    unspaced = ''.join(expected)
    assert segment(unspaced) == expected
示例8: k_list_repeat
def k_list_repeat(query):
    """Return the stemmed title text of every search result for ``query``.

    Each link returned by ``searchgoogle`` is downloaded; everything the
    title regex captures in the page is stemmed and joined with spaces.
    Pages that cannot be fetched or processed are skipped.

    Args:
        query: Search string passed to ``searchgoogle``.

    Returns:
        list: One space-joined string of stemmed title matches per page that
        could be processed, in result order.
    """
    k = searchgoogle(query)
    m = []
    if socialListProxy:
        proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
        opener = ulib.build_opener(proxy)
        ulib.install_opener(opener)
    for i, link in enumerate(k):
        req = ulib.Request(link, headers={'User-Agent': "Mozilla/5.0"})
        # The result list is still updated in place, but now with the
        # stemmed words: the stems used to be computed and then thrown away
        # in favour of the unstemmed segmentation.
        k[i] = " ".join(stemming.porter2.stem(w) for w in segment(link))
        try:
            content = ulib.urlopen(req)
            try:
                page = content.read()
            finally:
                content.close()
            # Read the title(s) of the page.
            titles = re.findall(r"<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", page)
            m.append(" ".join(stemming.porter2.stem(t) for t in titles))
        except Exception:
            # Best effort: skip pages that fail, but let
            # KeyboardInterrupt/SystemExit propagate.
            pass
    return m
示例9: segment_hashtag
def segment_hashtag(h):
    """Expand a hashtag into its words, wrapped as " hashtag <words> , ".

    ``h`` is either the hashtag text or an object with a ``group()`` method
    (for example a regex match); in both cases the first character is
    dropped before segmentation.
    """
    raw = h.group() if hasattr(h, "group") else h
    words = wordsegment.segment(raw[1:])
    return "".join([" hashtag ", " ".join(words), " , "])
示例10: get_word_vector
def get_word_vector(self, word):
    """Return the vector for ``word``, caching it in ``self.word_vectors_map``.

    The cache key is the stripped, lower-cased word.  For a word that is not
    cached yet the lookup proceeds as follows:

    1. the word as given, then its lower-case form, then its upper-case
       form is looked up in ``self.word2vec_model``;
    2. if ``constants.concept_regexp`` removes the whole word, the word with
       ``constants.alpha_regex`` matches removed is looked up instead (that
       result is returned without being cached under this key);
    3. otherwise the word is split on whitespace, ",", "/", ":", "-" or "_"
       (the first that yields more than one piece) or, failing that, with
       ``ws.segment``; the vectors of the pieces that resolve are combined
       left to right with ``ss.fftconvolve(..., mode='same')``.

    Args:
        word: Text to look up, or None.

    Returns:
        The vector, or None when ``word`` is None or empty after stripping,
        or when nothing could be resolved for it.
    """
    if word is None:
        return None
    word = word.strip().strip('[').strip(']').strip('(').strip(')')
    if not word:
        # Nothing left to look up.  Without this guard an empty piece (for
        # example from splitting "foo," on ",") kept recursing on itself.
        return None
    word_lower = word.lower()
    word_upper = word.upper()
    cache = self.word_vectors_map
    try:
        if word_lower not in cache:
            if config.debug:
                print('getting word vector for  %s' % (word,))
            vocab = self.word2vec_model.vocab
            if word in vocab:
                cache[word_lower] = self.word2vec_model[word]
            # TODO: if vocab is ensured to be lower case, this condition is not required
            elif word_lower in vocab:
                cache[word_lower] = self.word2vec_model[word_lower]
            elif word_upper in vocab:
                cache[word_lower] = self.word2vec_model[word_upper]
            else:
                if not constants.concept_regexp.sub('', word):
                    return self.get_word_vector(constants.alpha_regex.sub('', word))
                # Try progressively less obvious separators until one
                # actually splits the word.
                subwords = word.split()
                for separator in (',', '/', ':', '-', '_'):
                    if len(subwords) != 1:
                        break
                    subwords = word.split(separator)
                if len(subwords) == 1:
                    subwords = ws.segment(word.encode('utf8'))
                if len(subwords) > 1:
                    combined = None
                    for piece in subwords:
                        piece_vec = self.get_word_vector(piece)
                        if piece_vec is None:
                            continue
                        if combined is None:
                            combined = piece_vec
                        else:
                            start_time = time.time()
                            combined = ss.fftconvolve(combined, piece_vec, mode='same')
                            if config.debug:
                                print('performed fast fourier transform convolution on word vectors in {} seconds.'.format(time.time()-start_time))
                    cache[word_lower] = combined
                else:
                    # Covers one piece *and* zero pieces; the zero-piece
                    # case used to leave the cache unset and raise KeyError
                    # on the return below.
                    print('could not get wordvector for  %s' % (word,))
                    cache[word_lower] = None
        return cache[word_lower]
    except UnicodeDecodeError as ude:
        print('error getting word vector for  %s' % (word,))
        # str(ude) carries the codec details; ``ude.message`` is not
        # available on Python 3.
        print(str(ude))
        cache[word_lower] = None
    return cache[word_lower]
示例11: read_nell_relations
def read_nell_relations():
    """Collect relation names from the files under ``nell/relations``.

    For each file name the second ":"-separated part is segmented and the
    resulting words are joined with single spaces.

    Returns:
        list: The space-joined relation names, in ``os.walk`` order.
    """
    # NOTE(review): the source had lost its indentation; this assumes the
    # per-file loop ran inside the directory loop — confirm against the
    # original file.
    return [
        ' '.join(segment(file_name.split(':')[1]))
        for _dirpath, _dirnames, file_names in os.walk("nell/relations")
        for file_name in file_names
    ]
示例12: test12
def test12(tagtocheck):
    """Return the ratio of unrecognised to recognised words in a tag.

    ``tagtocheck`` is segmented with ``ws.segment`` and every word is
    checked against the "en-US" dictionary.

    Returns:
        The ratio ``incorrect / correct`` formatted with four decimals (a
        string), or the integer 0 when no word was recognised.
    """
    dictionary = en.Dict("en-US")
    recognised = 0
    unrecognised = 0
    for token in ws.segment(tagtocheck):
        # Explicit comparison with False is kept on purpose so that only a
        # literal False-equal result counts as a miss, exactly as before.
        if dictionary.check(token) == False:
            unrecognised += 1
        else:
            recognised += 1
    if recognised == 0:
        return 0
    return "%.4f" % (float(unrecognised) / recognised)
示例13: create_dict_adva
def create_dict_adva():
    """Map individual subject/object words under ``nell/relations`` to relations.

    For every file directly inside ``nell/relations`` the second
    ":"-separated part of the file name is segmented into a space-joined
    relation name, which is also printed.  Each line of the file must hold
    two tab-separated fields.  Every ":"-separated part of a field except
    the first has its underscores removed and is segmented; each resulting
    word is recorded for the subject (first field) or object (second field).

    Returns:
        tuple: ``(sub_table, obj_table)`` — dicts mapping each word to the
        set of relation names it appeared with.  Both are empty when
        ``nell/relations`` does not exist.
    """
    root = "nell/relations"
    # Only the top-level file list is needed, so take the first os.walk
    # entry instead of walking the whole tree; the default covers a missing
    # directory.
    relation_files = next(os.walk(root), (root, [], []))[2]

    sub_table = {}
    obj_table = {}
    for file_name in relation_files:
        r_name = ' '.join(segment(file_name.split(':')[1]))
        print(r_name)
        with open(root + "/" + file_name) as fp:
            for line in fp:
                sub, obj = line.rstrip('\n').split('\t')
                # Subject parts are handled before object parts, as before.
                for field, table in ((sub, sub_table), (obj, obj_table)):
                    for part in field.split(":")[1:]:
                        for word in segment(part.replace('_', '')):
                            table.setdefault(word, set()).add(r_name)
    return sub_table, obj_table
示例14: read_relation_name
def read_relation_name(folder_name):
    """List relation names derived from the directories under ``folder_name``.

    Every directory path yielded by ``os.walk`` after the root is expected
    to contain a ":"; the part after the first ":" is segmented and the
    words are joined with single spaces.

    Args:
        folder_name: Directory to walk.

    Returns:
        list: The relation names, without the first one.
    """
    directories = [entry[0] for entry in os.walk(folder_name)]
    relations = [
        ' '.join(segment(path.split(':')[1]))
        for path in directories[1:]  # the first entry is folder_name itself
    ]
    # NOTE(review): the first relation is dropped here in addition to the
    # root skipped above — looks like a possible double skip; confirm it is
    # intended.
    return relations[1:]
示例15: checkTweetNums
def checkTweetNums(tweets,minTweets):
    """Report whether enough tweets carry one of the "$"-prefixed tag pairs.

    Each tweet is segmented and re-joined with spaces, the batch is tagged
    with ``cmu.runtagger_parse`` and a tweet counts when its concatenated
    tag output contains "$N", "$^", "$M" or "$Z".

    Args:
        tweets: Iterable of tweet strings.
        minTweets: Minimum number of matching tweets required.

    Returns:
        1 if at least ``minTweets`` tweets matched, otherwise 0.
    """
    # Original comment: number as adjective check / consecutive numbers and
    # nouns.
    markers = ("$N", "$^", "$M", "$Z")
    segmented = [" ".join(wordsegment.segment(tweet)) for tweet in tweets]
    matching = 0
    for tags in cmu.runtagger_parse(segmented):
        flattened = "".join(tags)
        if any(marker in flattened for marker in markers):
            matching += 1
    return 1 if matching >= minTweets else 0