当前位置: 首页>>代码示例>>Python>>正文


Python thulac.thulac方法代码示例

本文整理汇总了Python中thulac.thulac方法的典型用法代码示例。如果您正苦于以下问题:Python thulac.thulac方法的具体用法?Python thulac.thulac怎么用?Python thulac.thulac使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在thulac的用法示例。


在下文中一共展示了thulac.thulac方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: main

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def main():
    """Build a vocabulary file from a JSON corpus segmented with THULAC.

    Command-line options:
        --raw_data_path: JSON file; its top-level value is indexed by position,
            so it is expected to be an array of raw text entries.
        --vocab_file: output path, one token per line.
        --vocab_size: number of words passed to the tokenizer and used to cap
            the written vocabulary (plus the five special tokens).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_data_path', default='../data/train.json', type=str, required=False, help='原始训练语料')
    parser.add_argument('--vocab_file', default='vocab_processed.txt', type=str, required=False, help='生成vocab链接')
    parser.add_argument('--vocab_size', default=50000, type=int, required=False, help='词表大小')
    args = parser.parse_args()

    lac = thulac.thulac(seg_only=True)
    tokenizer = Tokenizer(num_words=args.vocab_size)
    print('args:\n' + repr(args))
    print('This script is extremely slow especially for large corpus. Take a break.')

    # Read with an explicit encoding and close the handle deterministically;
    # JSON text is UTF-8 and must not depend on the platform default locale.
    with open(args.raw_data_path, 'r', encoding='utf-8') as f:
        lines = json.load(f)

    # Replace every raw entry in place with its space-separated segmentation.
    for i, line in enumerate(tqdm(lines)):
        lines[i] = lac.cut(line, text=True)

    tokenizer.fit_on_texts(lines)
    vocab = list(tokenizer.index_word.values())
    pre = ['[SEP]', '[CLS]', '[MASK]', '[PAD]', '[UNK]']
    vocab = pre + vocab

    # The five special tokens are written in addition to vocab_size words.
    with open(args.vocab_file, 'w', encoding='utf-8') as f:
        for word in vocab[:args.vocab_size + len(pre)]:
            f.write(word + '\n')
开发者ID:Morizeyao,项目名称:GPT2-Chinese,代码行数:26,代码来源:make_vocab.py

示例2: return_cut_word_list_list

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def return_cut_word_list_list(filepath):
    """Segment every summary and short text read from ``filepath``.

    Returns a pair ``(summaries, short_texts)`` where each element is a list
    holding, per input entry, whatever ``thu_cut.cut`` returned for it.
    """
    summaries, short_texts = get_clean_data(filepath)
    thu_cut = thulac.thulac("-seg_only")

    # Short texts are segmented first, then summaries, as in the original flow.
    cut_short_texts = [thu_cut.cut(entry) for entry in short_texts]
    cut_summaries = [thu_cut.cut(entry) for entry in summaries]

    return cut_summaries, cut_short_texts
开发者ID:yangzhiye,项目名称:Short-Text-Summarization,代码行数:22,代码来源:get_data.py

示例3: testSegOnly

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def testSegOnly():
    """Check that segmentation-only mode yields space-separated words."""
    segmenter = thulac.thulac(seg_only = True)
    segmented = segmenter.cut("我爱北京天安门", text = True)
    assert segmented == "我 爱 北京 天安门"

#由于Tag模型初始化耗时较大,在这里将两个Tag模型的测试放在一起 
开发者ID:thunlp,项目名称:THULAC-Python,代码行数:9,代码来源:testInitVariables.py

示例4: preprocess

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def preprocess(dataset: str):
    """Load the segmenter and dictionary, then write segmented train/test files.

    Sets the module-level globals ``segment_tool`` and ``dictionary``. For each
    of the ``train`` and ``test`` splits, every item yielded by ``handle_data``
    for ``dataset/<dataset>/<split>.txt`` is written as one JSON line to
    ``dataset/<dataset>/<split>_seg.txt``.

    Args:
        dataset: name of the sub-directory under ``dataset/`` to process.
    """
    global segment_tool, dictionary
    print('Loading Segment Model...')
    segment_tool = thulac(rm_space=True)
    print('Loading dictionary')
    # Use a context manager so the dictionary file handle is not leaked.
    with open('dataset/dictionary.txt', encoding='utf-8') as dict_file:
        dictionary = {entry.rstrip('\n') for entry in dict_file}

    for dataset_type in ('train', 'test'):
        source_path = 'dataset/%s/%s.txt' % (dataset, dataset_type)
        target_path = 'dataset/%s/%s_seg.txt' % (dataset, dataset_type)
        with open(target_path, 'w', encoding='utf-8') as f:
            for line in handle_data(source_path):
                f.write(json.dumps(line, ensure_ascii=False) + '\n')
开发者ID:kdsec,项目名称:chinese-opinion-target-extraction,代码行数:14,代码来源:preprocess.py

示例5: testTagAndDeli

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def testTagAndDeli():
    """Check tagging output when '#' is used as the word/tag delimiter."""
    tagger = thulac.thulac(deli = '#')
    tagged = tagger.cut("我爱北京天安门", text = True)
    assert tagged == "我#r 爱#v 北京#ns 天安门#ns"
开发者ID:thunlp,项目名称:THULAC-Python,代码行数:7,代码来源:testInitVariables.py

示例6: testUserDict

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def testUserDict():
    """Check that the user dictionary keeps the whole sentence as one token."""
    dict_path = prefix + "/userDict.txt"
    segmenter = thulac.thulac(seg_only = True, user_dict = dict_path)
    segmented = segmenter.cut("我爱北京天安门", text = True)
    assert segmented == "我爱北京天安门"
开发者ID:thunlp,项目名称:THULAC-Python,代码行数:7,代码来源:testInitVariables.py

示例7: testT2S

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def testT2S():
    """Check that T2S mode converts traditional input to simplified output."""
    segmenter = thulac.thulac(seg_only = True, T2S = True)
    segmented = segmenter.cut("我愛北京天安門", text = True)
    print(segmented)
    assert segmented == "我 爱 北京 天安门"
开发者ID:thunlp,项目名称:THULAC-Python,代码行数:8,代码来源:testInitVariables.py

示例8: testFilt

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def testFilt():
    """Check that filter mode drops "可以" from the segmented output."""
    segmenter = thulac.thulac(seg_only = True, filt = True)
    segmented = segmenter.cut("我可以爱北京天安门", text = True)
    print(segmented)
    assert segmented == "我 爱 北京 天安门"
开发者ID:thunlp,项目名称:THULAC-Python,代码行数:8,代码来源:testInitVariables.py

示例9: init_thulac

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def init_thulac(config):
    """Create the module-level ``cutter`` segmenter.

    The model path is read from option ``thulac`` in section ``data`` of
    ``config``.
    """
    global cutter
    model_dir = config.get("data","thulac")
    cutter = thulac.thulac(
        model_path=model_dir,
        seg_only=True,
        filt=False,
    )
开发者ID:thunlp,项目名称:TopJudge,代码行数:5,代码来源:utils.py

示例10: write_cut_word_to_file

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def _write_cut_lines(cutter, texts, out_path, fallback):
    """Segment each text with ``cutter`` and write one space-joined line per text."""
    # The context manager closes the file even if segmentation raises.
    with open(out_path, "w+") as out:
        for text in texts:
            tokens = cutter.cut(text)
            try:
                content = " ".join(tokens)
            except (TypeError, UnicodeError):
                # Joining fails when the tokens are not plain strings; keep the
                # original best-effort placeholder instead of aborting the run.
                content = fallback
            out.write(content + "\n")


def write_cut_word_to_file():
    """Write segmented short texts and summaries of LCSTS PART_I to files.

    Reads ``./LCSTS/DATA/PART_I.txt`` through ``get_clean_data`` and writes one
    line per entry to ``PART_I_cut_short_text.txt`` and
    ``PART_I_cut_summary.txt`` in the same directory. Entries whose tokens
    cannot be joined are replaced by a placeholder line.
    """
    filepath = "./LCSTS/DATA/PART_I.txt"
    list_summary, list_short_text = get_clean_data(filepath)
    thu_cut = thulac.thulac("-seg_only")

    # A single formatted string prints the same text under Python 2 and 3.
    print("%s %s %s %s" % (len(list_summary), type(list_summary),
                           len(list_short_text), type(list_short_text)))

    _write_cut_lines(thu_cut, list_short_text,
                     "./LCSTS/DATA/PART_I_cut_short_text.txt",
                     "wrong short text")
    _write_cut_lines(thu_cut, list_summary,
                     "./LCSTS/DATA/PART_I_cut_summary.txt",
                     "wrong summary")
开发者ID:yangzhiye,项目名称:Short-Text-Summarization,代码行数:35,代码来源:get_data.py

示例11: createTable

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def createTable(num):
    """Collect the titles and words of one shard of records into a word table.

    Records are loaded from ``agri_economic.json``. With records numbered from
    1, only those whose ``number // 2000`` equals ``num`` are processed. Each
    selected record contributes its ``title`` and the set returned by
    ``createWordSet`` for its segmented ``detail``. The table is written to
    ``table<num>.txt`` with a space after every entry; entries come out in set
    iteration order, which is arbitrary.

    Args:
        num: index of the shard to process.
    """
    start = time.time()
    thu = thulac.thulac()
    # Close the input handle as soon as the JSON is parsed.
    with open('agri_economic.json', encoding='utf-8') as json_file:
        print("begin!")
        records = json.load(json_file)

    table = set()
    for count, p in enumerate(records, start=1):
        if count // 2000 != num:
            continue
        if count % 10 == 0:
            cur = time.time()
            print("now id : " + str(count) + "  table size :" + str(len(table)))
            print("Running Time : " + str(int(cur-start)) + " s......")
        detail = p['detail']
        title = p['title']
        table.add(title)
        # Segment the text, then merge in place rather than rebuilding the set.
        text = thu.cut(detail)
        table |= createWordSet(text)

    # NOTE(review): the output uses the platform default encoding, as before.
    with open('table'+str(num)+".txt",'w') as file_object:
        file_object.write(''.join(t + ' ' for t in table))

#createTable(0)
#createTable(1)
#createTable(2)
#createTable(3)
#createTable(4)
#createTable(5) 
开发者ID:qq547276542,项目名称:Agriculture_KnowledgeGraph,代码行数:40,代码来源:create_word_table.py

示例12: createTable

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def createTable(num):
    """Write one shard of records as "title word word ..." lines.

    Records are loaded from ``agri_economic.json``. With records numbered from
    1, only those whose ``number // 100`` equals ``num`` are processed. Each
    selected record becomes one line: its ``title`` followed by the words that
    ``createWordList`` returns for its segmented ``detail``, separated by
    spaces. The lines are written to ``article<num>.txt``.

    Args:
        num: index of the shard to process.
    """
    start = time.time()
    thu = thulac.thulac()
    # Close the input handle as soon as the JSON is parsed.
    with open('agri_economic.json', encoding='utf-8') as json_file:
        print("begin!")
        records = json.load(json_file)

    # Collect lines and join once instead of growing one string repeatedly.
    article_lines = []
    for count, p in enumerate(records, start=1):
        if count // 100 != num:
            continue
        if count % 10 == 0:
            cur = time.time()
            print("now id : " + str(count) + "  table size :" )
            print("Running Time : " + str(int(cur-start)) + " s......")
        detail = p['detail']
        title = p['title']
        # Segment the text.
        text = thu.cut(detail)
        wordList = createWordList(text)
        article_lines.append(' '.join([title, *wordList]) + '\n')

    # NOTE(review): the output uses the platform default encoding, as before.
    with open('article'+str(num)+".txt",'w') as file_object:
        file_object.write(''.join(article_lines))
开发者ID:qq547276542,项目名称:Agriculture_KnowledgeGraph,代码行数:33,代码来源:create_word2vec_input.py

示例13: cut_text

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def cut_text(alltext):
    """Segment every text in ``alltext`` and return the results as a list.

    Prints the running count after every 2000 processed texts.
    """
    segmenter = thulac.thulac(seg_only = True)
    segmented = []
    for seen, raw in enumerate(alltext, start=1):
        if seen % 2000 == 0:
            print(seen)
        segmented.append(segmenter.cut(raw, text = True))
    return segmented
开发者ID:thunlp,项目名称:CAIL2018,代码行数:13,代码来源:svm.py

示例14: __init__

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def __init__(self):
    """Load the four persisted models and create the THULAC segmenter."""
    # (attribute name, model file) pairs, loaded in this order.
    model_files = (
        ('tfidf', 'predictor/model/tfidf.model'),
        ('law', 'predictor/model/law.model'),
        ('accu', 'predictor/model/accu.model'),
        ('time', 'predictor/model/time.model'),
    )
    for attr_name, model_path in model_files:
        setattr(self, attr_name, joblib.load(model_path))

    self.batch_size = 1
    self.cut = thulac.thulac(seg_only = True)
开发者ID:thunlp,项目名称:CAIL2018,代码行数:10,代码来源:predictor.py

示例15: get_NE

# 需要导入模块: import thulac [as 别名]
# 或者: from thulac import thulac [as 别名]
def get_NE(text):
    """Tag ``text`` and return a list of ``[word, label]`` pairs.

    The text is segmented with the preloaded THULAC instance. Scanning left to
    right, each position is resolved by the first rule that matches:

    1. the current and next word joined together are in ``predict_labels``
       and ``preok``/``nowok`` accept their tags -> emit the joined word with
       its label and skip both words;
    2. the current word is in ``predict_labels`` and ``nowok`` accepts its
       tag -> emit the word with its label;
    3. ``temporaryok`` accepts the tag -> emit the word with its tag;
    4. otherwise emit the word with ``0``.
    """
    tagger = pre_load_thu
    tagged = tagger.cut(text, text=False)
    # Sentinel entry so that looking one word ahead is always valid.
    tagged.append(['===',None])

    # Word -> entity label lookup.
    labels = predict_labels

    result = []
    last = len(tagged) - 1  # excludes the sentinel
    pos = 0
    while pos < last:
        word = tagged[pos][0]
        tag = tagged[pos][1]
        next_word = tagged[pos+1][0]
        next_tag = tagged[pos+1][1]
        pair = word + next_word

        if pair in labels and preok(tag) and nowok(next_tag):
            # Two adjacent words together form a known entity.
            result.append([pair,labels[pair]])
            pos += 2
        elif word in labels and nowok(tag):
            # The current word alone is a known entity.
            result.append([word,labels[word]])
            pos += 1
        elif temporaryok(tag):
            result.append([word,tag])
            pos += 1
        else:
            result.append([word,0])
            pos += 1

    return result

#分句标识符号 
开发者ID:qq547276542,项目名称:Agriculture_KnowledgeGraph,代码行数:44,代码来源:parallel_extract_training.py


注:本文中的thulac.thulac方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。