This article collects typical usage examples of the Python method hanziconv.HanziConv.toSimplified. If you have been wondering how to use HanziConv.toSimplified, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of its containing class, hanziconv.HanziConv.
Below are 15 code examples of HanziConv.toSimplified, sorted by popularity by default.
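For reference, the conversion itself needs nothing beyond the class import. The following is a minimal sketch of calling toSimplified directly, together with its counterpart toTraditional:

from hanziconv import HanziConv

print(HanziConv.toSimplified('繁簡轉換器'))   # -> '繁简转换器'
print(HanziConv.toTraditional('繁简转换器'))  # -> '繁簡轉換器'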
Example 1: preprocess
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def preprocess(string):
    # Symbols, C1 controls, private-use code points and assorted punctuation.
    invalid_unicode = u'[\u25A0-\u25FF\u0080-\u00A0\uE000-\uFBFF\u2000-\u201B\u201E-\u2027\u2030-\u206F]+'
    # Kana, Hangul jamo, Thai, Arabic and Cyrillic characters.
    lang_char = u'[\u3040-\u309f\u30A0-\u30FF\u1100-\u11FF\u0E00-\u0E7F\u0600-\u06ff\u0750-\u077f\u0400-\u04ff]+'
    invalid_char = u'[\xa0\x7f\x9f]+'
    string = re.sub(EMOJI_UNICODE, '', string)
    string = re.sub(HTML, '', string)
    string = re.sub(r'\r|\t|<\w+>|&\w+;?|br\s*|li>', '', string)
    string = re.sub(invalid_char, '', string)
    string = re.sub(r'<U\+2028>|<U\+F09F>|<U\+F06C>|<U\+F0A7>', '', string)
    # Replace whitespace runs and the character classes above with placeholder tokens.
    string = re.sub(r'[ \u3000]+', 's_', string)
    string = re.sub(invalid_unicode, 'ss_', string)
    string = re.sub(lang_char, 'lan_', string)
    # string = re.sub(r'(工作描述|工作职责|岗位职责|任职要求)(:|:)', '', string)  # (commented out) strip job-posting section headers
    string = HanziConv.toSimplified(strQ2B(string))
    string = re.sub(
        r'[^\u4e00-\u9fa5\u0020-\u007f,。!?;、():\n\u2029\u2028a-zA-Z0-9]+', '', string)
    return string
# Sentence-splitting strategy (about 3x slower than cutting directly)
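Example 1 also relies on a strQ2B helper (full-width to half-width conversion) that is not shown here, and the full2half calls in Examples 4 through 9 presumably do the same job. A minimal sketch of such a helper, assuming it only needs to cover the full-width ASCII range and the ideographic space:

def strQ2B(ustring):
    """Convert full-width (quanjiao) characters to half-width (banjiao)."""
    chars = []
    for ch in ustring:
        code = ord(ch)
        if code == 0x3000:                 # ideographic (full-width) space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:     # full-width '!' through '~'
            code -= 0xFEE0
        chars.append(chr(code))
    return ''.join(chars)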
Example 2: process_poetry
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def process_poetry(self, data_dir='/media/pony/DLdigest/data/languageModel/chinese-poetry/json'):
    """
    Process the Tang and Song poetry dataset
    """
    save_dir = os.path.join(self.save_dir, 'poem')
    check_path_exists(save_dir)
    count = 0
    for entry in os.scandir(data_dir):
        if entry.name.startswith('poet'):
            with open(entry.path, 'r') as json_file:
                poems = json.load(json_file)
            for p in poems:
                # Join the stanzas, drop newlines and convert to simplified characters.
                paras = HanziConv.toSimplified(''.join(p['paragraphs']).replace('\n', ''))
                paras = filter_punctuation(paras)
                for para in paras.split(' '):
                    if len(para.strip()) > 1:
                        pys = ' '.join(np.array(pinyin(para)).flatten())
                        # Shard the output into files of 400,000 lines each.
                        with open(os.path.join(save_dir, str(count//400000+1)+'.txt'), 'a') as f:
                            f.write(para + ',' + pys + '\n')
                        count += 1
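The pinyin step above assumes `pinyin` comes from pypinyin and `np` is numpy (neither import is shown in the snippet). A minimal sketch of just that step:

import numpy as np
from pypinyin import pinyin

para = '床前明月光'
pys = ' '.join(np.array(pinyin(para)).flatten())
print(pys)   # roughly: 'chuáng qián míng yuè guāng'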
Example 3: process_audioLabels
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def process_audioLabels(self, data_dir='/media/pony/DLdigest/data/ASR_zh/'):
    """
    Process label files in the collected Chinese audio dataset
    """
    save_dir = os.path.join(self.save_dir, 'audioLabels')
    check_path_exists(save_dir)
    count = 0
    for subdir, dirs, files in os.walk(data_dir):
        print(subdir)
        for f in files:
            if f.endswith("label"):
                fullFilename = os.path.join(subdir, f)
                with open(fullFilename, 'r') as f:  # note: rebinds f, shadowing the filename from the loop above
                    line = f.read()
                    con = HanziConv.toSimplified(line)
                    con = filter_punctuation(con)
                    for c in con.split(' '):
                        if len(c.strip()) > 1:
                            pys = ' '.join(np.array(pinyin(c)).flatten())
                            count += 1
                            # Shard the output into files of 400,000 lines each.
                            with open(os.path.join(save_dir, str(count//400000+1)+'.txt'), 'a') as f:
                                f.write(c + ',' + pys + '\n')
Example 4: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    # Strip #topic# tags, |...| spans and [...] bracketed tags.
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub(r"\s*", "", text)
    return text
Example 5: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    # text = text.lower()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    # text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub(r"\s*", "", text)
    return text
Example 6: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    # text = text.lower()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub(r"\s*", "", text)
    return text
Example 7: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    text = re.sub(r"\s*", "", text)
    return text
Example 8: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    # text = HanziConv.toSimplified(text)
    # text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub(r"\s*", "", text)
    return text
Example 9: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(text):
    text = text.strip()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    text = re.sub(r"\s*", "", text)
    text = re.sub(CH_PUNCTUATION, "", text)
    return text
Example 10: chinese_tokenizer
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def chinese_tokenizer(documents):
    """
    Convert Chinese text into a sequence of tokens:
    traditional characters become simplified, English is lowercased.
    """
    for document in documents:
        text = HanziConv.toSimplified(document)
        text = text.lower()
        yield list(cut(text))
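The `cut` call is not imported anywhere in this snippet; assuming it refers to jieba's tokenizer (an assumption, not confirmed by the source), exercising the generator might look like this:

from jieba import cut            # assumption: `cut` is jieba's tokenizer
from hanziconv import HanziConv

def chinese_tokenizer(documents):
    # Same generator as Example 10: simplify, lowercase, then segment.
    for document in documents:
        text = HanziConv.toSimplified(document).lower()
        yield list(cut(text))

for tokens in chinese_tokenizer(['這是一個繁體中文的測試句子']):
    print(tokens)   # e.g. ['这是', '一个', '繁体', '中文', '的', '测试', '句子']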
Example 11: simplify_or_none
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def simplify_or_none(text):
    if text is None:
        return None
    else:
        return HanziConv.toSimplified(text)
Example 12: simplified_eq
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def simplified_eq(a, b):
    # Equal length, and the first elements are identical after simplification.
    return len(a) == len(b) and \
        HanziConv.toSimplified(a[0]) == \
        HanziConv.toSimplified(b[0])
Example 13: main
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def main():
    parent_dir = TRAIN_DIR / 'cmn'
    for target_file in ['tatoeba.txt', 'w2c.txt']:
        hant_file = open(TRAIN_DIR / 'cmn-hant' / target_file, mode='w')
        hans_file = open(TRAIN_DIR / 'cmn-hans' / target_file, mode='w')
        with open(parent_dir / target_file) as f:
            for line in f:
                text = line.rstrip()
                if not text:
                    continue
                try:
                    lang = langdetect.detect(text)
                except langdetect.lang_detect_exception.LangDetectException:
                    continue
                if lang in {'zh-tw', 'ko'}:
                    text = HanziConv.toTraditional(text)
                    hant_file.write(text)
                    hant_file.write('\n')
                elif lang == 'zh-cn':
                    text = HanziConv.toSimplified(text)
                    hans_file.write(text)
                    hans_file.write('\n')
        hant_file.close()
        hans_file.close()
Example 14: clean
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def clean(self, line):
    # Drop short bracketed emoticon tags such as [哈哈] or [aloha].
    line = re.sub(r'\[[\u4e00-\u9fa5a-z]{1,4}\]|\[aloha\]', '', line)
    line = re.sub(EMOJI_UNICODE, '', line)
    line = re.sub(self.html_texts, '', line)
    if re.search(r'[\u4300-\u9fa5]+', line):
        line = HanziConv.toSimplified(line)
        return re.sub(' {2,}|\t', ' ', line).lower()
    else:
        return None
Example 15: process
# Required import: from hanziconv import HanziConv [as alias]
# Or: from hanziconv.HanziConv import toSimplified [as alias]
def process(line):
    line = line.strip()
    line = RE_LATIN_SPACE.sub('‧', line)
    line = RE_SPACES.sub('', line)
    line = unicodedata.normalize('NFKC', line)
    line = HanziConv.toSimplified(line)
    return line
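RE_LATIN_SPACE and RE_SPACES are precompiled patterns defined elsewhere and are not shown in the snippet. The NFKC normalization step on its own can be illustrated like this:

import unicodedata

print(unicodedata.normalize('NFKC', 'ＡＢＣ　１２３'))   # -> 'ABC 123'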