当前位置: 首页>>代码示例>>Python>>正文


Python HanziConv.toSimplified方法代码示例

本文整理汇总了Python中hanziconv.HanziConv.toSimplified方法的典型用法代码示例。如果您正苦于以下问题:Python HanziConv.toSimplified方法的具体用法?Python HanziConv.toSimplified怎么用?Python HanziConv.toSimplified使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在hanziconv.HanziConv的用法示例。


在下文中一共展示了HanziConv.toSimplified方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: preprocess

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def preprocess(string):
    """Normalize a raw text string for downstream NLP processing.

    Removes emoji, HTML fragments and control characters, tags runs of
    whitespace / invalid unicode / foreign-script characters with marker
    tokens ('s_', 'ss_', 'lan_'), converts to simplified Chinese, and
    finally drops everything outside the allowed character set.
    """
    bad_unicode = u'[\u25A0-\u25FF\u0080-\u00A0\uE000-\uFBFF\u2000-\u201B\u201E-\u2027\u2030-\u206F]+'
    foreign_script = u'[\u3040-\u309f\u30A0-\u30FF\u1100-\u11FF\u0E00-\u0E7F\u0600-\u06ff\u0750-\u077f\u0400-\u04ff]+'
    control_chars = u'[\xa0\x7f\x9f]+'
    # Ordered substitution pipeline; order matters (e.g. spaces are tagged
    # before invalid-unicode runs are tagged).
    substitutions = (
        (EMOJI_UNICODE, ''),
        (HTML, ''),
        (r'\r|\t|<\w+>|&\w+;?|br\s*|li>', ''),
        (control_chars, ''),
        (r'<U\+2028>|<U\+F09F>|<U\+F06C>|<U\+F0A7>', ''),
        (r'[ \u3000]+', 's_'),
        (bad_unicode, 'ss_'),
        (foreign_script, 'lan_'),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    # Full-width -> half-width, then traditional -> simplified Chinese.
    string = HanziConv.toSimplified(strQ2B(string))
    return re.sub(
        r'[^\u4e00-\u9fa5\u0020-\u007f,。!?;、():\n\u2029\u2028a-zA-Z0-9]+', '', string)


# 分句策略(比直接切开慢3倍) 
开发者ID:stevewyl,项目名称:nlp_toolkit,代码行数:22,代码来源:utils.py

示例2: process_poetry

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def process_poetry(self, data_dir='/media/pony/DLdigest/data/languageModel/chinese-poetry/json'):
        """Convert the chinese-poetry JSON corpus into (text, pinyin) pairs.

        Scans *data_dir* for files whose names start with ``poet``,
        converts each poem to simplified Chinese, splits on punctuation,
        and appends one "sentence,pinyin" line per fragment to numbered
        shard files under ``<save_dir>/poem`` (a new shard every 400k lines).
        """
        out_dir = os.path.join(self.save_dir, 'poem')
        check_path_exists(out_dir)
        written = 0
        for entry in os.scandir(data_dir):
            if not entry.name.startswith('poet'):
                continue
            with open(entry.path, 'r') as fh:
                poems = json.load(fh)
            for poem in poems:
                text = ''.join(poem['paragraphs']).replace('\n', '')
                text = filter_punctuation(HanziConv.toSimplified(text))
                for fragment in text.split(' '):
                    # Skip empty / single-character fragments.
                    if len(fragment.strip()) <= 1:
                        continue
                    romanized = ' '.join(np.array(pinyin(fragment)).flatten())
                    shard = os.path.join(out_dir, str(written // 400000 + 1) + '.txt')
                    with open(shard, 'a') as out:
                        out.write(fragment + ',' + romanized + '\n')
                    written += 1
开发者ID:zzw922cn,项目名称:Automatic_Speech_Recognition,代码行数:22,代码来源:gardener.py

示例3: process_audioLabels

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def process_audioLabels(self, data_dir='/media/pony/DLdigest/data/ASR_zh/'):
        """Convert label files from a Chinese ASR corpus into (text, pinyin) pairs.

        Walks *data_dir* recursively, reads every ``*label`` file, converts
        its contents to simplified Chinese, splits on punctuation, and
        appends one "sentence,pinyin" line per fragment to numbered shard
        files under ``<save_dir>/audioLabels`` (a new shard every 400k lines).
        """
        out_dir = os.path.join(self.save_dir, 'audioLabels')
        check_path_exists(out_dir)
        written = 0
        for subdir, _dirs, filenames in os.walk(data_dir):
            print(subdir)
            for name in filenames:
                if not name.endswith("label"):
                    continue
                label_path = os.path.join(subdir, name)
                with open(label_path, 'r') as fh:
                    raw = fh.read()
                text = filter_punctuation(HanziConv.toSimplified(raw))
                for fragment in text.split(' '):
                    # Skip empty / single-character fragments.
                    if len(fragment.strip()) <= 1:
                        continue
                    romanized = ' '.join(np.array(pinyin(fragment)).flatten())
                    written += 1
                    shard = os.path.join(out_dir, str(written // 400000 + 1) + '.txt')
                    with open(shard, 'a') as out:
                        out.write(fragment + ',' + romanized + '\n')
开发者ID:zzw922cn,项目名称:Automatic_Speech_Recognition,代码行数:24,代码来源:gardener.py

示例4: clean

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def clean(text):
	"""Normalize a raw text snippet.

	Strips surrounding whitespace, converts traditional characters to
	simplified, folds full-width characters to half-width, and removes
	hashtag (#...#), pipe (|...|) and bracket ([...]) delimited spans.
	"""
	stripped = text.strip()
	simplified = HanziConv.toSimplified(stripped)
	halfwidth = full2half(simplified)
	return re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", halfwidth)
开发者ID:yyht,项目名称:BERT,代码行数:9,代码来源:pretrain_classifier_processor.py

示例5: clean

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def clean(text):
	"""Lightly normalize text: trim whitespace, map traditional characters
	to simplified, and fold full-width characters to half-width."""
	result = text.strip()
	result = HanziConv.toSimplified(result)
	return full2half(result)
开发者ID:yyht,项目名称:BERT,代码行数:10,代码来源:classifier_processor.py

示例6: clean

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def clean(text):
	"""Trim, simplify and half-width-fold *text*, then drop #...#, |...|
	and [...] delimited segments."""
	for step in (str.strip, HanziConv.toSimplified, full2half):
		text = step(text)
	return re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
开发者ID:yyht,项目名称:BERT,代码行数:10,代码来源:sequence_processor.py

示例7: clean

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def clean(text):
	"""Normalize text: trim, simplify, half-width-fold, remove #...#,
	|...| and [...] delimited spans, then strip all whitespace."""
	out = full2half(HanziConv.toSimplified(text.strip()))
	out = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", out)
	return re.sub("\s*", "", out)
开发者ID:yyht,项目名称:BERT,代码行数:9,代码来源:classifier_processor.py

示例8: clean

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def clean(text):
	"""Trim surrounding whitespace and remove #...#, |...| and [...]
	delimited spans (script conversion intentionally disabled here)."""
	trimmed = text.strip()
	return re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", trimmed)
开发者ID:yyht,项目名称:BERT,代码行数:9,代码来源:classifier_processor.py

示例9: clean

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def clean(text):
	"""Aggressively normalize text: trim, simplify, half-width-fold, then
	remove delimited spans, all whitespace, and Chinese punctuation."""
	text = full2half(HanziConv.toSimplified(text.strip()))
	for pattern in ("\\#.*?#|\\|.*?\\||\\[.*?]", "\s*", CH_PUNCTUATION):
		text = re.sub(pattern, "", text)
	return text
开发者ID:yyht,项目名称:BERT,代码行数:10,代码来源:classifier_processor.py

示例10: chinese_tokenizer

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def chinese_tokenizer(documents):
    """Yield a token list for each document in *documents*.

    Each document is converted from traditional to simplified Chinese and
    lower-cased before word segmentation.
    """
    for doc in documents:
        normalized = HanziConv.toSimplified(doc).lower()
        yield list(cut(normalized))
开发者ID:xfgryujk,项目名称:TaobaoAnalysis,代码行数:12,代码来源:sentiment.py

示例11: simplify_or_none

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def simplify_or_none(text):
    """Convert *text* to simplified Chinese, passing None through unchanged."""
    return None if text is None else HanziConv.toSimplified(text)
开发者ID:wikimedia,项目名称:revscoring,代码行数:7,代码来源:chinese.py

示例12: simplified_eq

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def simplified_eq(a, b):
    """Return True when *a* and *b* have equal length and their first
    elements match after simplification.

    NOTE(review): only a[0] and b[0] are compared — presumably sufficient
    for the calling tests, but confirm if full-sequence equality was
    intended. Raises IndexError for equal-length empty sequences, as the
    original did.
    """
    if len(a) != len(b):
        return False
    return HanziConv.toSimplified(a[0]) == HanziConv.toSimplified(b[0])
开发者ID:wikimedia,项目名称:revscoring,代码行数:6,代码来源:test_chinese.py

示例13: main

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def main():
    """Split the mixed 'cmn' training files into traditional (cmn-hant)
    and simplified (cmn-hans) variants.

    Each non-empty line is language-detected with langdetect. Lines
    detected as zh-tw (or ko — a frequent misdetection of traditional
    Chinese) are normalized to traditional script; zh-cn lines to
    simplified script; everything else is dropped.
    """
    parent_dir = TRAIN_DIR / 'cmn'

    for target_file in ['tatoeba.txt', 'w2c.txt']:
        # Fix: the original opened the two output files without context
        # managers, leaking them if detection/conversion raised mid-file.
        with open(TRAIN_DIR / 'cmn-hant' / target_file, mode='w') as hant_file, \
             open(TRAIN_DIR / 'cmn-hans' / target_file, mode='w') as hans_file, \
             open(parent_dir / target_file) as f:
            for line in f:
                text = line.rstrip()
                if not text:
                    continue

                try:
                    lang = langdetect.detect(text)
                except langdetect.lang_detect_exception.LangDetectException:
                    # Undetectable lines (too short / mixed script) are skipped.
                    continue

                if lang in {'zh-tw', 'ko'}:
                    hant_file.write(HanziConv.toTraditional(text))
                    hant_file.write('\n')
                elif lang == 'zh-cn':
                    hans_file.write(HanziConv.toSimplified(text))
                    hans_file.write('\n')
开发者ID:mhagiwara,项目名称:nanigonet,代码行数:32,代码来源:clean_chinese.py

示例14: clean

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def clean(self, line):
        """Clean one raw line: strip short bracketed emoticon tags, emoji
        and HTML fragments.

        Returns the simplified, whitespace-collapsed, lower-cased line when
        it contains at least one CJK character; otherwise None.
        """
        line = re.sub(r'\[[\u4e00-\u9fa5a-z]{1,4}\]|\[aloha\]', '', line)
        line = re.sub(EMOJI_UNICODE, '', line)
        line = re.sub(self.html_texts, '', line)
        # Fix: the original range \u4300-\u9fa5 appears to be a typo for
        # the standard CJK Unified Ideographs block \u4e00-\u9fa5 used
        # elsewhere in this file (e.g. in preprocess and the tag regex above).
        if re.search(r'[\u4e00-\u9fa5]+', line):
            line = HanziConv.toSimplified(line)
            return re.sub(' {2,}|\t', ' ', line).lower()
        else:
            return None
开发者ID:stevewyl,项目名称:nlp_toolkit,代码行数:11,代码来源:data.py

示例15: process

# 需要导入模块: from hanziconv import HanziConv [as 别名]
# 或者: from hanziconv.HanziConv import toSimplified [as 别名]
def process(line):
    """Normalize one line of Chinese text.

    Replaces spaces between Latin tokens with a middle dot, removes the
    remaining whitespace, applies NFKC unicode normalization, and converts
    the result to simplified Chinese.
    """
    text = line.strip()
    text = RE_LATIN_SPACE.sub('‧', text)
    text = RE_SPACES.sub('', text)
    text = unicodedata.normalize('NFKC', text)
    return HanziConv.toSimplified(text)
开发者ID:robertostling,项目名称:hnmt,代码行数:9,代码来源:normalize_zh.py


注:本文中的hanziconv.HanziConv.toSimplified方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。