This article collects typical usage examples of the Python method hanziconv.HanziConv.toSimplified. If you have been wondering how to use HanziConv.toSimplified, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of its containing class, hanziconv.HanziConv.
Below are 15 code examples of HanziConv.toSimplified, sorted by popularity by default.
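For reference, the conversion itself needs nothing beyond the class import. The following is a minimal sketch of calling toSimplified directly, together with its counterpart toTraditional:

from hanziconv import HanziConv

print(HanziConv.toSimplified('繁簡轉換器'))   # -> '繁简转换器'
print(HanziConv.toTraditional('繁简转换器'))  # -> '繁簡轉換器'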
Example 1: preprocess
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def preprocess(string):
    # Symbols, C1 controls, private-use code points and assorted punctuation.
    invalid_unicode = u'[\u25A0-\u25FF\u0080-\u00A0\uE000-\uFBFF\u2000-\u201B\u201E-\u2027\u2030-\u206F]+'
    # Kana, Hangul jamo, Thai, Arabic and Cyrillic characters.
    lang_char = u'[\u3040-\u309f\u30A0-\u30FF\u1100-\u11FF\u0E00-\u0E7F\u0600-\u06ff\u0750-\u077f\u0400-\u04ff]+'
    invalid_char = u'[\xa0\x7f\x9f]+'
    string = re.sub(EMOJI_UNICODE, '', string)
    string = re.sub(HTML, '', string)
    string = re.sub(r'\r|\t|<\w+>|&\w+;?|br\s*|li>', '', string)
    string = re.sub(invalid_char, '', string)
    string = re.sub(r'<U\+2028>|<U\+F09F>|<U\+F06C>|<U\+F0A7>', '', string)
    # Replace whitespace runs and the character classes above with placeholder tokens.
    string = re.sub(r'[ \u3000]+', 's_', string)
    string = re.sub(invalid_unicode, 'ss_', string)
    string = re.sub(lang_char, 'lan_', string)
    # string = re.sub(r'(工作描述|工作职责|岗位职责|任职要求)(:|:)', '', string)  # (commented out) strip job-posting section headers
    string = HanziConv.toSimplified(strQ2B(string))
    string = re.sub(
        r'[^\u4e00-\u9fa5\u0020-\u007f,。!?;、():\n\u2029\u2028a-zA-Z0-9]+', '', string)
    return string
# Sentence-splitting strategy (about 3x slower than cutting directly)
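Example 1 also relies on a strQ2B helper (full-width to half-width conversion) that is not shown here, and the full2half calls in Examples 4 through 9 presumably do the same job. A minimal sketch of such a helper, assuming it only needs to cover the full-width ASCII range and the ideographic space:

def strQ2B(ustring):
    """Convert full-width (quanjiao) characters to half-width (banjiao)."""
    chars = []
    for ch in ustring:
        code = ord(ch)
        if code == 0x3000:                 # ideographic (full-width) space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:     # full-width '!' through '~'
            code -= 0xFEE0
        chars.append(chr(code))
    return ''.join(chars)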
Example 2: process_poetry
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def process_poetry(self, data_dir='/media/pony/DLdigest/data/languageModel/chinese-poetry/json'):
    """
    Process the Tang and Song poetry dataset
    """
    save_dir = os.path.join(self.save_dir, 'poem')
    check_path_exists(save_dir)
    count = 0
    for entry in os.scandir(data_dir):
        if entry.name.startswith('poet'):
            with open(entry.path, 'r') as json_file:
                poems = json.load(json_file)
            for p in poems:
                # Join the stanzas, drop newlines and convert to simplified characters.
                paras = HanziConv.toSimplified(''.join(p['paragraphs']).replace('\n', ''))
                paras = filter_punctuation(paras)
                for para in paras.split(' '):
                    if len(para.strip()) > 1:
                        pys = ' '.join(np.array(pinyin(para)).flatten())
                        # Shard the output into files of 400,000 lines each.
                        with open(os.path.join(save_dir, str(count//400000+1)+'.txt'), 'a') as f:
                            f.write(para + ',' + pys + '\n')
                        count += 1
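The pinyin step above assumes `pinyin` comes from pypinyin and `np` is numpy (neither import is shown in the snippet). A minimal sketch of just that step:

import numpy as np
from pypinyin import pinyin

para = '床前明月光'
pys = ' '.join(np.array(pinyin(para)).flatten())
print(pys)   # roughly: 'chuáng qián míng yuè guāng'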
Example 3: process_audioLabels
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def process_audioLabels(self, data_dir='/media/pony/DLdigest/data/ASR_zh/'):
    """
    Process label files in the collected Chinese audio dataset
    """
    save_dir = os.path.join(self.save_dir, 'audioLabels')
    check_path_exists(save_dir)
    count = 0
    for subdir, dirs, files in os.walk(data_dir):
        print(subdir)
        for f in files:
            if f.endswith("label"):
                fullFilename = os.path.join(subdir, f)
                with open(fullFilename, 'r') as f:  # note: rebinds f, shadowing the filename from the loop above
                    line = f.read()
                    con = HanziConv.toSimplified(line)
                    con = filter_punctuation(con)
                    for c in con.split(' '):
                        if len(c.strip()) > 1:
                            pys = ' '.join(np.array(pinyin(c)).flatten())
                            count += 1
                            # Shard the output into files of 400,000 lines each.
                            with open(os.path.join(save_dir, str(count//400000+1)+'.txt'), 'a') as f:
                                f.write(c + ',' + pys + '\n')
Example 4: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    # Strip #topic# tags, |...| spans and [...] bracketed tags.
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub(r"\s*", "", text)
    return text
Example 5: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    # text = text.lower()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    # text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub(r"\s*", "", text)
    return text
Example 6: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    # text = text.lower()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub(r"\s*", "", text)
    return text
Example 7: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    text = re.sub(r"\s*", "", text)
    return text
Example 8: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    # text = HanziConv.toSimplified(text)
    # text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub(r"\s*", "", text)
    return text
Example 9: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    text = re.sub(r"\s*", "", text)
    text = re.sub(CH_PUNCTUATION, "", text)
    return text
Example 10: chinese_tokenizer
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def chinese_tokenizer(documents):
    """
    Convert Chinese text into a sequence of tokens:
    traditional characters become simplified, English is lowercased.
    """
    for document in documents:
        text = HanziConv.toSimplified(document)
        text = text.lower()
        yield list(cut(text))
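The `cut` call is not imported anywhere in this snippet; assuming it refers to jieba's tokenizer (an assumption, not confirmed by the source), exercising the generator might look like this:

from jieba import cut            # assumption: `cut` is jieba's tokenizer
from hanziconv import HanziConv

def chinese_tokenizer(documents):
    # Same generator as Example 10: simplify, lowercase, then segment.
    for document in documents:
        text = HanziConv.toSimplified(document).lower()
        yield list(cut(text))

for tokens in chinese_tokenizer(['這是一個繁體中文的測試句子']):
    print(tokens)   # e.g. ['这是', '一个', '繁体', '中文', '的', '测试', '句子']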
Example 11: simplify_or_none
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def simplify_or_none(text):
    if text is None:
        return None
    else:
        return HanziConv.toSimplified(text)
Example 12: simplified_eq
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def simplified_eq(a, b):
    # Equal length, and the first elements are identical after simplification.
    return len(a) == len(b) and \
        HanziConv.toSimplified(a[0]) == \
        HanziConv.toSimplified(b[0])
Example 13: main
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def main():
    parent_dir = TRAIN_DIR / 'cmn'
    for target_file in ['tatoeba.txt', 'w2c.txt']:
        hant_file = open(TRAIN_DIR / 'cmn-hant' / target_file, mode='w')
        hans_file = open(TRAIN_DIR / 'cmn-hans' / target_file, mode='w')
        with open(parent_dir / target_file) as f:
            for line in f:
                text = line.rstrip()
                if not text:
                    continue
                try:
                    lang = langdetect.detect(text)
                except langdetect.lang_detect_exception.LangDetectException:
                    continue
                if lang in {'zh-tw', 'ko'}:
                    text = HanziConv.toTraditional(text)
                    hant_file.write(text)
                    hant_file.write('\n')
                elif lang == 'zh-cn':
                    text = HanziConv.toSimplified(text)
                    hans_file.write(text)
                    hans_file.write('\n')
        hant_file.close()
        hans_file.close()
Example 14: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(self, line):
    # Drop short bracketed emoticon tags such as [哈哈] or [aloha].
    line = re.sub(r'\[[\u4e00-\u9fa5a-z]{1,4}\]|\[aloha\]', '', line)
    line = re.sub(EMOJI_UNICODE, '', line)
    line = re.sub(self.html_texts, '', line)
    if re.search(r'[\u4300-\u9fa5]+', line):
        line = HanziConv.toSimplified(line)
        return re.sub(' {2,}|\t', ' ', line).lower()
    else:
        return None
Example 15: process
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def process(line):
    line = line.strip()
    line = RE_LATIN_SPACE.sub('‧', line)
    line = RE_SPACES.sub('', line)
    line = unicodedata.normalize('NFKC', line)
    line = HanziConv.toSimplified(line)
    return line
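RE_LATIN_SPACE and RE_SPACES are precompiled patterns defined elsewhere and are not shown in the snippet. The NFKC normalization step on its own can be illustrated like this:

import unicodedata

print(unicodedata.normalize('NFKC', 'ＡＢＣ　１２３'))   # -> 'ABC 123'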