This article collects typical usage examples of Python's jieba.add_word method. If you are wondering what jieba.add_word does, how to call it, or how it looks in practice, the curated code samples below may help. You can also browse further usage examples from the jieba module it belongs to.
Below are 13 code examples of the jieba.add_word method, sorted by popularity by default.
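Before the examples, here is a minimal, self-contained sketch of what jieba.add_word does: it inserts a word into jieba's in-memory dictionary, optionally with a frequency and a part-of-speech tag, so that the segmenter keeps it as a single token. The words and sentence below are illustrative only, not taken from the examples.
# Minimal sketch of jieba.add_word (illustrative words and sentence)
import jieba

jieba.add_word("雲計算")                             # default frequency, no POS tag
jieba.add_word("自然語言處理", freq=20000, tag="n")  # frequency and tag are optional
print(jieba.lcut("雲計算與自然語言處理"))            # both custom words stay intact
jieba.del_word("雲計算")                             # undo the addition if it was only temporary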
Example 1: segment
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def segment(self, sentence, entity_postag=dict()):
    """Tokenize a sentence with jieba (the original NLPIR calls are kept as comments).
    Args:
        sentence: string, the sentence to segment
        entity_postag: dict, entity/POS dictionary; empty by default, built while analysing the structured text of each case
    Returns:
        lemmas: list, the segmentation result
    """
    # Add the entity dictionary
    if entity_postag:
        for entity in entity_postag:
            # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode()))
            jieba.add_word(entity)
    # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))  # example of adding a single user word
    # pynlpir.nlpir.AddUserWord(c_char_p('習近平'.encode()))  # example of adding a single user word
    # Segment without POS tagging
    # lemmas = pynlpir.segment(sentence, pos_tagging=False)
    lemmas = jieba.lcut(sentence)
    # pynlpir.close()  # release resources
    return lemmas
Example 2: jiebaCustomSetting
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def jiebaCustomSetting(self, dict_path, usr_dict_path):
    # Switch to a custom main dictionary, then add every word from the user dictionary
    jieba.set_dictionary(dict_path)
    with open(usr_dict_path, 'r', encoding='utf-8') as dic:
        for word in dic:
            jieba.add_word(word.strip('\n'))
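If the user dictionary already follows jieba's "word [freq] [tag]" one-entry-per-line format, jieba.load_userdict(usr_dict_path) achieves the same effect as this loop in a single call.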
Example 3: __init__
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def __init__(self, user_dict_dir=default_user_dict_dir, model_dir=default_model_dir):
    self.default_user_dict_dir = user_dict_dir
    self.default_model_dir = model_dir
    # Initialize the tokenizer
    # pynlpir.open()  # initialize the tokenizer
    # Add the user dictionaries (legal-document dictionary and the Tsinghua legal lexicon);
    # adding them into memory this way is faster
    files = os.listdir(user_dict_dir)
    for file in files:
        file_path = os.path.join(user_dict_dir, file)
        # Skip directories
        if os.path.isdir(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            line = f.readline()
            while line:
                word = line.strip('\n').strip()
                jieba.add_word(word)
                # print(c_char_p(word.encode()))
                # pynlpir.nlpir.AddUserWord(c_char_p(word.encode()))
                line = f.readline()
    # Load the LTP models
    # POS tagging model
    self.postagger = Postagger()
    postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model'))
    # Named entity recognition model
    self.recognizer = NamedEntityRecognizer()
    ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model'))
    # Dependency parsing model
    self.parser = Parser()
    parse_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model'))
    if postag_flag or ner_flag or parse_flag:
        print('load model failed!')
Example 4: load_dict
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def load_dict():
    dics = csv.reader(open("DICT_NOW.csv", 'r', encoding='utf8'))
    flag = 0
    fuhao = [';', '。', '?', '?', '!', '!', ';']  # sentence-ending punctuation
    for row in dics:
        # Skip the header row
        if flag == 0:
            flag = 1
            continue
        if len(row) == 2:
            # Column 0 is the word, column 1 its POS tag
            jieba.add_word(row[0].strip(), tag=row[1].strip())
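A note on the tag argument used above: it only becomes visible when segmenting with the jieba.posseg submodule; plain jieba.cut/lcut ignores part-of-speech tags. A minimal sketch with an illustrative word and tag:
# The POS tag stored by add_word is reported by jieba.posseg (illustrative example)
import jieba
import jieba.posseg as pseg

jieba.add_word("區塊鏈", tag="n")
print([(p.word, p.flag) for p in pseg.cut("區塊鏈技術")])  # the custom word carries the stored tag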
Example 5: prepare
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def prepare(self):
    self.prepared = True
    for type0 in self.entity_types:
        tag0 = "n"
        if "人名" in type0:        # person names
            tag0 = "nr"
        elif "地名" in type0:      # place names
            tag0 = "ns"
        elif "機構" in type0:      # organizations
            tag0 = "nt"
        elif "其他專名" in type0:  # other proper nouns
            tag0 = "nz"
        jieba.add_word(type0, freq=10000, tag=tag0)
Example 6: cut
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    jieba.enable_parallel(32)
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words
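Note that jieba.enable_parallel(n) splits the input across n worker processes via Python's multiprocessing module; per the jieba documentation it is not supported on Windows, and jieba.disable_parallel() switches back to single-process segmentation.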
Example 7: load_word_dict
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def load_word_dict(self) -> None:
    if self.external_word_dict:
        for word in self.external_word_dict:
            jieba.add_word(word, freq=1000000)
Example 8: load_word_dict
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def load_word_dict(self):
    if self.external_word_dict:
        for word in self.external_word_dict:
            jieba.add_word(word, freq=1000000)
Example 9: load_word_dict
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def load_word_dict(self):
    """Load the external word dictionary into jieba"""
    if self.external_word_dict:
        for word in self.external_word_dict:
            jieba.add_word(word, freq=1000000)
Example 10: load_word2jieba
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def load_word2jieba(self):
    vocab_list = load_pkl(self.vocab_list)
    if vocab_list != []:
        print("加載詞的總量: ", len(vocab_list))  # total number of words loaded
        for word in vocab_list:
            jieba.add_word(word)
Example 11: load_data
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def load_data(self, word_index, vocab_list, test_size=0.25):
    STOPWORDS = ["-", "\t", "\n", ".", "。", ",", ",", ";", "!", "!", "?", "?", "%"]
    if vocab_list != []:
        for word in vocab_list:
            jieba.add_word(word)

    def func(line):
        # Split the texts ['1, 2, 3', '1, 2, .., n'] into index lists: [[1, 2, 3], [1, 2, .., n]]
        words = [word for word in jieba.cut(str(line), cut_all=False) if word not in STOPWORDS]
        indexs = [word_index.get(word, 0) for word in words]
        return indexs

    # Note: error_bad_lines and encoding are read_csv-style options; newer pandas versions of read_excel may not accept them
    df = pd.read_excel(self.data_file, header=0, error_bad_lines=False, encoding="utf_8_sig")
    x = df["comment"].apply(lambda line: func(line)).tolist()
    x = pad_sequences(x, maxlen=self.MAX_SEQUENCE_LENGTH)
    y = df["label"].tolist()
    # Turn the natural-number class labels (0, 1, 2, ...) into one-hot vectors
    """
    In [7]: to_categorical(np.asarray([1,1,0,1,3]))
    Out[7]:
    array([[0., 1., 0., 0.],
           [0., 1., 0., 0.],
           [1., 0., 0., 0.],
           [0., 1., 0., 0.],
           [0., 0., 0., 1.]], dtype=float32)
    """
    y = to_categorical(np.asarray(y))
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=10000)
    return (x_train, y_train), (x_test, y_test)
Example 12: TaibaCustomSetting
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def TaibaCustomSetting(self, usr_dict):
    # Same pattern as Example 2, but feeding the user dictionary to Taiba instead of jieba
    with open(usr_dict, 'r', encoding='utf-8') as dic:
        for word in dic:
            Taiba.add_word(word.strip('\n'))
Example 13: sentence_segmentation
# Required import: import jieba [as alias]
# Or: from jieba import add_word [as alias]
def sentence_segmentation(self, sentence, entity1, entity2):
    # Temporarily register both entities with a very high frequency so they survive as single tokens
    jieba.add_word(entity1, freq=999999)
    jieba.add_word(entity2, freq=999999)
    seglist = list(jieba.cut(sentence, cut_all=False, HMM=False))
    # Remove them again so later calls are not affected
    jieba.del_word(entity1)
    jieba.del_word(entity2)
    return seglist
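This last example registers the two entity mentions with a throwaway high frequency so they are guaranteed to come back as single tokens, then calls jieba.del_word so the global dictionary is left unchanged for later calls. When the adjustment should persist instead, jieba.suggest_freq(word, tune=True) is the usual alternative for tuning a word's frequency in place.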