本文整理汇总了Python中thulac.thulac方法的典型用法代码示例。如果您正苦于以下问题:Python thulac.thulac方法的具体用法?Python thulac.thulac怎么用?Python thulac.thulac使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类thulac的用法示例。
在下文中一共展示了thulac.thulac方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def main():
    """Build a vocabulary file from a JSON corpus segmented with THULAC.

    Command-line options:
        --raw_data_path: path of the JSON corpus to read.
        --vocab_file: path of the vocabulary file to write.
        --vocab_size: value passed to ``Tokenizer(num_words=...)`` and used
            to cap the number of lines written.

    Every entry of the loaded JSON list is replaced by its segmented text,
    the tokenizer is fitted on the result, and the special tokens followed by
    the entries of ``tokenizer.index_word`` are written one per line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_data_path', default='../data/train.json', type=str, required=False, help='原始训练语料')
    parser.add_argument('--vocab_file', default='vocab_processed.txt', type=str, required=False, help='生成vocab链接')
    parser.add_argument('--vocab_size', default=50000, type=int, required=False, help='词表大小')
    args = parser.parse_args()

    lac = thulac.thulac(seg_only=True)
    tokenizer = Tokenizer(num_words=args.vocab_size)
    print('args:\n' + repr(args))
    print('This script is extremely slow especially for large corpus. Take a break.')

    # Read inside a context manager so the handle is always released, and
    # decode explicitly as UTF-8 (the JSON interchange encoding) instead of
    # relying on the platform default.
    with open(args.raw_data_path, 'r', encoding='utf-8') as f:
        lines = json.load(f)

    # Segment in place to avoid holding a second copy of a large corpus.
    for i, line in enumerate(tqdm(lines)):
        lines[i] = lac.cut(line, text=True)

    tokenizer.fit_on_texts(lines)
    vocab = list(tokenizer.index_word.values())
    pre = ['[SEP]', '[CLS]', '[MASK]', '[PAD]', '[UNK]']
    vocab = pre + vocab

    # "+ 5" leaves room for the five special tokens prepended above.
    with open(args.vocab_file, 'w') as f:
        for word in vocab[:args.vocab_size + 5]:
            f.write(word + '\n')
示例2: return_cut_word_list_list
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def return_cut_word_list_list(filepath):
    """Segment every summary and short text loaded from *filepath*.

    Returns a pair ``(cut_summaries, cut_short_texts)``; each element is a
    list holding, per input text, whatever ``thu_cut.cut`` returned for it.
    """
    summaries, short_texts = get_clean_data(filepath)
    segmenter = thulac.thulac("-seg_only")

    # Short texts are segmented before summaries, as in the original flow.
    cut_short_texts = [segmenter.cut(short_text) for short_text in short_texts]
    cut_summaries = [segmenter.cut(summary) for summary in summaries]
    return cut_summaries, cut_short_texts
示例3: testSegOnly
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def testSegOnly():
    """Check the space-joined output of segmentation-only mode."""
    segmenter = thulac.thulac(seg_only=True)
    segmented = segmenter.cut("我爱北京天安门", text=True)
    assert segmented == "我 爱 北京 天安门"
#由于Tag模型初始化耗时较大,在这里将两个Tag模型的测试放在一起
示例4: preprocess
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def preprocess(dataset: str):
    """Load the segmenter and dictionary, then write segmented dataset files.

    Sets the module-level ``segment_tool`` and ``dictionary`` globals, then
    for the ``train`` and ``test`` splits of *dataset* writes every item
    yielded by ``handle_data`` as one JSON line to
    ``dataset/<dataset>/<split>_seg.txt``.

    Args:
        dataset: name of the dataset directory under ``dataset/``.
    """
    global segment_tool, dictionary
    print('Loading Segment Model...')
    segment_tool = thulac(rm_space=True)
    print('Loading dictionary')
    # Use a context manager so the dictionary file handle is not leaked.
    with open('dataset/dictionary.txt', encoding='utf-8') as dict_file:
        dictionary = {entry.rstrip('\n') for entry in dict_file}

    for dataset_type, dataset_name in product(['train', 'test'], [dataset]):
        out_path = 'dataset/%s/%s_seg.txt' % (dataset_name, dataset_type)
        in_path = 'dataset/%s/%s.txt' % (dataset_name, dataset_type)
        with open(out_path, 'w', encoding='utf-8') as f:
            for line in handle_data(in_path):
                f.write(json.dumps(line, ensure_ascii=False) + '\n')
示例5: testTagAndDeli
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def testTagAndDeli():
    """Check tagged output when '#' is used as the word/tag delimiter."""
    tagger = thulac.thulac(deli='#')
    tagged = tagger.cut("我爱北京天安门", text=True)
    assert tagged == "我#r 爱#v 北京#ns 天安门#ns"
示例6: testUserDict
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def testUserDict():
    """Check that the user dictionary keeps the sample sentence as one token."""
    dict_path = prefix + "/userDict.txt"
    segmenter = thulac.thulac(seg_only=True, user_dict=dict_path)
    segmented = segmenter.cut("我爱北京天安门", text=True)
    assert segmented == "我爱北京天安门"
示例7: testT2S
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def testT2S():
    """Check that traditional input is segmented into simplified output."""
    converter = thulac.thulac(seg_only=True, T2S=True)
    segmented = converter.cut("我愛北京天安門", text=True)
    print(segmented)
    assert segmented == "我 爱 北京 天安门"
示例8: testFilt
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def testFilt():
    """Check that the filter option drops "可以" from the sample sentence."""
    filtering_segmenter = thulac.thulac(seg_only=True, filt=True)
    segmented = filtering_segmenter.cut("我可以爱北京天安门", text=True)
    print(segmented)
    assert segmented == "我 爱 北京 天安门"
示例9: init_thulac
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def init_thulac(config):
    """Create the module-level ``cutter`` segmenter.

    The model path is read from the ``thulac`` option of the ``data``
    section of *config*.
    """
    global cutter
    model_dir = config.get("data", "thulac")
    cutter = thulac.thulac(model_path=model_dir, seg_only=True, filt=False)
示例10: write_cut_word_to_file
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def _write_cut_lines(cutter, texts, out_path, fallback):
    """Segment each text and write one space-joined line per text to out_path."""
    # The context manager closes the file even if segmentation raises.
    with open(out_path, "w+") as out_file:
        for text in texts:
            words = cutter.cut(text)
            try:
                content = " ".join(words)
            except Exception:
                # Best effort: keep one output line per input even when the
                # segmenter result cannot be joined. Unlike a bare ``except``
                # this no longer swallows KeyboardInterrupt/SystemExit.
                content = fallback
            out_file.write(content + "\n")


def write_cut_word_to_file():
    """Segment the LCSTS PART_I summaries and short texts into two files.

    Short texts go to ``PART_I_cut_short_text.txt`` and summaries to
    ``PART_I_cut_summary.txt``, one space-joined segmentation per line.
    A text whose segmentation cannot be joined is replaced by a fixed
    placeholder line.
    """
    filepath = "./LCSTS/DATA/PART_I.txt"
    list_summary, list_short_text = get_clean_data(filepath)
    thu_cut = thulac.thulac("-seg_only")

    # Formatted so the same text is printed under both Python 2 and Python 3.
    print("%s %s %s %s" % (len(list_summary), type(list_summary),
                           len(list_short_text), type(list_short_text)))

    _write_cut_lines(thu_cut, list_short_text,
                     "./LCSTS/DATA/PART_I_cut_short_text.txt",
                     "wrong short text")
    _write_cut_lines(thu_cut, list_summary,
                     "./LCSTS/DATA/PART_I_cut_summary.txt",
                     "wrong summary")
示例11: createTable
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def createTable(num):
    """Write the word table for one 2000-record slice of the corpus.

    Records of ``agri_economic.json`` are numbered from 1; only those with
    ``count // 2000 == num`` are processed. Each processed record contributes
    its title and the words returned by ``createWordSet`` for its segmented
    ``detail`` text. The collected set is written to ``table<num>.txt`` with
    a single space after every entry.

    Args:
        num: index of the slice to process.
    """
    start = time.time()
    thu = thulac.thulac()
    # Context manager so the input handle is released once parsing is done.
    with open('agri_economic.json', encoding='utf-8') as source:
        print("begin!")
        records = json.load(source)

    table = set()
    for count, p in enumerate(records, start=1):
        if count // 2000 != num:
            continue
        if count % 10 == 0:
            cur = time.time()
            print("now id : " + str(count) + " table size :" + str(len(table)))
            print("Running Time : " + str(int(cur-start)) + " s......")
        detail = p['detail']
        title = p['title']
        table.add(title)
        # Word segmentation
        text = thu.cut(detail)
        table |= createWordSet(text)

    # Build the output in one pass instead of repeated string concatenation;
    # every entry keeps its trailing space, matching the previous format.
    file_text = ''.join(t + ' ' for t in table)
    with open('table'+str(num)+".txt", 'w') as file_object:
        file_object.write(file_text)
#createTable(0)
#createTable(1)
#createTable(2)
#createTable(3)
#createTable(4)
#createTable(5)
示例12: createTable
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def createTable(num):
    """Write the segmented articles for one 100-record slice of the corpus.

    Records of ``agri_economic.json`` are numbered from 1; only those with
    ``count // 100 == num`` are processed. Each processed record becomes one
    line in ``article<num>.txt``: the title followed by the words returned by
    ``createWordList`` for its segmented ``detail`` text, space separated.

    Args:
        num: index of the slice to process.
    """
    start = time.time()
    thu = thulac.thulac()
    # Context manager so the input handle is released once parsing is done.
    with open('agri_economic.json', encoding='utf-8') as source:
        print("begin!")
        records = json.load(source)

    article_lines = []
    for count, p in enumerate(records, start=1):
        if count // 100 != num:
            continue
        if count % 10 == 0:
            cur = time.time()
            print("now id : " + str(count) + " table size :" )
            print("Running Time : " + str(int(cur-start)) + " s......")
        detail = p['detail']
        title = p['title']
        # Word segmentation
        text = thu.cut(detail)
        wordList = createWordList(text)
        article_lines.append(title + ''.join(' ' + word for word in wordList) + '\n')

    # Join once at the end instead of growing one string inside the loop.
    with open('article'+str(num)+".txt", 'w') as file_object:
        file_object.write(''.join(article_lines))
示例13: cut_text
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def cut_text(alltext):
    """Segment every text in *alltext* and return the segmented strings.

    Prints the running count after every 2000 texts as a progress indicator.
    """
    segmenter = thulac.thulac(seg_only=True)
    segmented = []
    for done, sentence in enumerate(alltext, start=1):
        if done % 2000 == 0:
            print(done)
        segmented.append(segmenter.cut(sentence, text=True))
    return segmented
示例14: __init__
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def __init__(self):
    """Load the persisted models and create the segmenter."""
    # (attribute name, model file) pairs, loaded in this order.
    model_files = (
        ('tfidf', 'predictor/model/tfidf.model'),
        ('law', 'predictor/model/law.model'),
        ('accu', 'predictor/model/accu.model'),
        ('time', 'predictor/model/time.model'),
    )
    for attr_name, model_path in model_files:
        setattr(self, attr_name, joblib.load(model_path))
    self.batch_size = 1
    self.cut = thulac.thulac(seg_only=True)
示例15: get_NE
# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def get_NE(text):
    """Label the segmented words of *text* with entity categories.

    The text is segmented with the preloaded THULAC instance. Each result
    item is indexed as ``[word, tag]``. Walking left to right, every
    position produces one ``[word, label]`` entry in the returned list:

    * two adjacent words whose concatenation is in ``predict_labels`` (and
      whose tags pass ``preok``/``nowok``) are merged and given that label;
    * otherwise a single word found in ``predict_labels`` whose tag passes
      ``nowok`` is given its label;
    * otherwise a word whose tag passes ``temporaryok`` keeps its own tag;
    * anything else is labelled ``0``.
    """
    segmenter = pre_load_thu
    tagged = segmenter.cut(text, text=False)
    # Sentinel entry so the look-ahead below never runs off the end.
    tagged.append(['===', None])

    labels = predict_labels
    results = []
    pos = 0
    last = len(tagged) - 1  # excludes the sentinel appended above
    while pos < last:
        word = tagged[pos][0]
        tag = tagged[pos][1]
        next_tag = tagged[pos + 1][1]
        merged = word + tagged[pos + 1][0]

        if merged in labels and preok(tag) and nowok(next_tag):
            # The two-word combination is a known entity: consume both words.
            results.append([merged, labels[merged]])
            pos += 2
        elif word in labels and nowok(tag):
            # The current word alone is a known entity.
            results.append([word, labels[word]])
            pos += 1
        elif temporaryok(tag):
            results.append([word, tag])
            pos += 1
        else:
            results.append([word, 0])
            pos += 1
    return results
#分句标识符号