本文整理匯總了Python中regex.sub方法的典型用法代碼示例。如果您正苦於以下問題:Python regex.sub方法的具體用法?Python regex.sub怎麽用?Python regex.sub使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類regex
的用法示例。
在下文中一共展示了regex.sub方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: pred_to_dict
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def pred_to_dict(text, pred, prob):
    """Collapse per-character class predictions into a receipt-field dict.

    Contiguous runs of the same predicted class become candidate values
    for "company", "date", "address" or "total"; for each field the run
    with the highest probability wins.  Tabs/newlines in the winning text
    are flattened to spaces.
    """
    best = {"company": ("", 0), "date": ("", 0), "address": ("", 0), "total": ("", 0)}
    field_names = list(best.keys())

    # Segment boundaries: every position where the predicted class changes.
    boundaries = [0] + (numpy.nonzero(numpy.diff(pred))[0] + 1).tolist() + [len(pred)]

    for start, stop in zip(boundaries, boundaries[1:]):
        label = pred[start] - 1
        if label == -1:
            continue  # class 0 is background, not a field
        name = field_names[label]
        confidence = prob[start:stop].max()
        if confidence > best[name][1]:
            best[name] = (text[start:stop], confidence)

    return {k: regex.sub(r"[\t\n]", " ", v[0].strip()) for k, v in best.items()}
示例2: normalize_answer
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    lowered = s.lower()
    # Strip all ASCII punctuation in a single translate pass.
    no_punct = lowered.translate(str.maketrans("", "", string.punctuation))
    # Drop English articles, leaving a space so neighbors do not merge.
    no_articles = re.sub(r"\b(a|an|the)\b", " ", no_punct)
    # Collapse runs of whitespace and trim the ends.
    return " ".join(no_articles.split())
示例3: make_vocab
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def make_vocab(fpath, fname):
    '''Constructs vocabulary.

    Reads the input file, keeps only Latin letters, apostrophes and
    whitespace, then writes "word<TAB>count" lines in descending
    frequency order to `preprocessed/fname`.  Four special tokens
    (<PAD>, <UNK>, <S>, </S>) are written first with a huge dummy count
    so they survive any frequency-based vocabulary truncation.

    Args:
        fpath: A string. Input file path.
        fname: A string. Output file name.

    Writes vocabulary line by line to `preprocessed/fname`.
    '''
    # Use a context manager: the original leaked the input file handle.
    with codecs.open(fpath, 'r', 'utf-8') as fin:
        text = fin.read()
    # Raw string fixes the invalid escape sequences (\s, \p) of the
    # original literal.  \p{Latin} requires the third-party `regex`
    # module; stdlib `re` does not support Unicode property classes.
    text = regex.sub(r"[^\s\p{Latin}']", "", text)
    word2cnt = Counter(text.split())
    if not os.path.exists('preprocessed'):
        os.mkdir('preprocessed')
    with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout:
        fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("<PAD>", "<UNK>", "<S>", "</S>"))
        for word, cnt in word2cnt.most_common(len(word2cnt)):
            fout.write(u"{}\t{}\n".format(word, cnt))
示例4: normalize
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def normalize(self, text):
    """
    Returns a string with normalized punctuation.

    Runs the configured (pattern, replacement) substitution rules in
    order, with optional unicode-punctuation replacement before and
    control-character removal after.
    """
    # Optional pre-pass: map unicode punctuation first.
    if self.pre_replace_unicode_punct:
        text = self.replace_unicode_punct(text)

    # Apply every substitution rule in declaration order.
    for pattern, replacement in self.substitutions:
        text = re.sub(pattern, replacement, text_type(text))

    # Optional post-pass: strip control characters.
    if self.post_remove_control_chars:
        text = self.remove_control_chars(text)
    return text
示例5: _load_search_pattern
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def _load_search_pattern(self):
    """Compile the grok-style `self.pattern` into a Python regex.

    Repeatedly expands %{NAME}, %{NAME:custom} and %{NAME:custom:type}
    references using `self.predefined_patterns` (patterns may reference
    other patterns, hence the loop).  Declared types are collected into
    `self.type_mapper` as {custom_name: type}; the final compiled regex
    is stored in `self.regex_obj`.
    """
    self.type_mapper = {}
    py_regex_pattern = self.pattern
    while True:
        # Record declared types from %{name:custom:type} references.
        for _, custom_name, type_name in re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern):
            self.type_mapper[custom_name] = type_name
        # Replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type})
        # with the expanded regex inside a named capture group.
        py_regex_pattern = re.sub(
            r'%{(\w+):(\w+)(?::\w+)?}',
            lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
            py_regex_pattern)
        # Replace plain %{pattern_name} with an anonymous group.
        py_regex_pattern = re.sub(
            r'%{(\w+)}',
            lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
            py_regex_pattern)
        # Raw string: the original non-raw literal '%{\w+(:\w+)?}' relied
        # on invalid escape sequences (a warning in modern Python).
        if re.search(r'%{\w+(:\w+)?}', py_regex_pattern) is None:
            break
    self.regex_obj = re.compile(py_regex_pattern)
示例6: normalize_phone
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def normalize_phone(cls, number):
    """
    Normalizes the passed in phone number, returning E164 format when
    the number parses as a possible phone number, otherwise the cleaned
    input.
    """
    # Remove any invalid characters (keep digits, lowercase letters, '+').
    # BUG FIX: the original passed regex.V0 positionally, where regex.sub
    # interprets the fourth argument as `count`, not `flags` — the flag
    # value was silently consumed as a replacement limit.
    number = regex.sub(r"[^0-9a-z\+]", "", number.lower(), flags=regex.V0)

    # Add on a plus if it looks like it could be a fully qualified number.
    if len(number) >= 11 and number[0] not in ["+", "0"]:
        number = "+" + number

    try:
        normalized = phonenumbers.parse(number)
        if phonenumbers.is_possible_number(normalized):
            return phonenumbers.format_number(normalized, phonenumbers.PhoneNumberFormat.E164)
    except Exception:
        # Best-effort: fall through and return the cleaned input.
        pass
    return number
示例7: sql
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def sql(self, query: str):
    """
    Convert a SQL query to an Ibis table expression.

    Parameters
    ----------
    query : string

    Returns
    -------
    table : TableExpr
    """
    # Detach a trailing `; -- comment` onto its own comment line, then
    # drop any remaining trailing semicolon.
    cleaned = re.sub(r'\s*;\s*--', '\n--', query.strip())
    cleaned = re.sub(r'\s*;\s*$', '', cleaned.strip())
    schema = self._get_schema_using_validator(cleaned)
    return ops.SQLQueryResult(cleaned, schema, self).to_expr()
示例8: _handle_splits
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def _handle_splits(_str):
    """Check if incoming date has a '-' or '/'; if so, split it into a
    start/stop pair and resolve each side to raw text plus an epoch."""
    normalized = _str.replace('/', '-')
    if '-' in normalized:
        start, stop = normalized.split('-')
        # NOTE(review): regex.sub(pattern, repl, string) — here the numeric
        # half is used as the *replacement* applied to the other half.
        # Confirm this argument order is intentional upstream.
        if _check_number(start):
            start = regex.sub(r'[0-9]+\?*', start, stop)
        elif _check_number(stop):
            stop = regex.sub(r'[0-9]+\?*', stop, start)
    else:
        # No range: both ends of the interval are the same date.
        start = stop = normalized
    return {
        'start_raw': start,
        'stop_raw': stop,
        'start_epoch': _get_epoch(start),
        'stop_epoch': _get_epoch(stop),
    }
示例9: simplify_accent_notation
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def simplify_accent_notation(cls, text):
    """Rewrite accented characters into a single, uniform combining-mark
    notation so downstream comparisons see one canonical form."""
    # References: https://en.wikipedia.org/wiki/Combining_Diacritical_Marks
    # Each replacement maps a precomposed accented character to its
    # decomposed equivalent (base letter + combining accent); the two
    # arguments render identically but differ in code points.
    text = text.replace("á", "á")
    text = text.replace("í", "í")
    text = text.replace("ú", "ú")
    text = text.replace("ŕ", "ŕ")
    text = text.replace("é", "é")
    text = text.replace("ó", "ó")
    text = text.replace("à", "à")
    text = text.replace("ì", "ì")
    text = text.replace("ù", "ù")
    text = text.replace("è", "è")
    text = text.replace("ò", "ò")
    # Swap ordering so grave/acute combining accents come after the other
    # combining marks in the second class.
    # NOTE(review): the character classes contain combining marks that are
    # hard to inspect visually — verify the exact mark sets against the
    # reference above before editing this line.
    text = regex.sub("([̀́])([̥̇¯̄]+)", "\\2\\1", text)
    return text
示例10: build_detection_class
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def build_detection_class(folder, dataset_filename,
                          label, sender_known=True):
    """Builds signature detection class.

    Signature detection dataset includes patterns for two classes:

    * class for positive patterns (goes with label 1)
    * class for negative patterns (goes with label -1)

    Emails in `folder` are parsed, annotation markers are stripped, and
    each message becomes one comma-separated feature row (ending with
    `label`) appended to `dataset_filename`.

    >>> build_detection_class('emails/P', 'train.data', 1)
    """
    # The annotation-stripping pattern is loop-invariant; build it once.
    annotation_re = '|'.join(ANNOTATIONS)
    with open(dataset_filename, 'a') as dataset:
        for entry in os.listdir(folder):
            path = os.path.join(folder, entry)
            sender, msg = parse_msg_sender(path, sender_known)
            if sender is None or msg is None:
                continue
            msg = re.sub(annotation_re, '', msg)
            features_row = build_pattern(msg, features(sender))
            features_row.append(label)
            dataset.write(','.join(str(e) for e in features_row) + '\n')
示例11: _replace_link_brackets
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def _replace_link_brackets(msg_body):
    """
    Normalize links i.e. replace '<', '>' wrapping the link with '@@'
    symbols so that '>' closing the link couldn't be mistakenly taken
    for a quotation marker.

    Converts msg_body into a unicode.
    """
    if isinstance(msg_body, bytes):
        msg_body = msg_body.decode('utf8')

    def _wrap(match):
        # Leave links untouched on lines that are already quoted ('>').
        line_start = msg_body[:match.start()].rfind("\n")
        if msg_body[line_start + 1] == ">":
            return match.group()
        return "@@%s@@" % match.group(1)

    return re.sub(RE_LINK, _wrap, msg_body)
示例12: _wrap_splitter_with_newline
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
    """
    Splits line in two if splitter pattern preceded by some text on the
    same line (done only for 'On <date> <person> wrote:' pattern and
    only for text/plain bodies).
    """
    def _prefix(match):
        """Prepend the delimiter unless the splitter starts a line."""
        at_line_start = match.start() == 0 or msg_body[match.start() - 1] == '\n'
        if at_line_start:
            return match.group()
        return '%s%s' % (delimiter, match.group())

    if content_type == 'text/plain':
        msg_body = re.sub(RE_ON_DATE_SMB_WROTE, _prefix, msg_body)
    return msg_body
示例13: sentence_split
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def sentence_split(sentence):
    """Split a sentence into words using the embedded-detokenizer '▁'
    markers; fall back to plain whitespace splitting when disabled."""
    # Without an embedded detokenizer, whitespace is the only delimiter.
    if not preprocessing['embedded_detokenizer']:
        return sentence.split()

    global re_split
    # Double the '▁' markers so every word boundary becomes ' ▁'.
    line = ' ▁▁' + sentence[1:].replace('▁', '▁▁')
    line = re_split.sub(r' ▁\1\2 ▁', line)
    # Split on ' ▁', strip each piece, drop empties and bare markers.
    stripped = (piece.strip() for piece in line.split(' ▁'))
    return [tok for tok in stripped if tok and tok != '▁']
# Load json file with BPE join pairs
示例14: process_text
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def process_text(text):
    """Clean raw (possibly HTML) text into a list of lemmatized,
    lowercase tokens with punctuation and stopwords removed."""
    # Strip markup twice: BeautifulSoup extracts the visible text, then a
    # regex removes any tag-like remnants it left behind.
    visible = BeautifulSoup(text, "lxml").get_text()
    stripped = re.sub('<[^>]*>', '', visible)
    # Tokenize, lowercase, and expand contractions ("don't" -> "do not").
    tokens = [tok.lower() for tok in casual_tokenizer(stripped)]
    expanded = [expandContractions(tok, c_re=c_re) for tok in tokens]
    # POS-tag so the WordNet lemmatizer can pick the right lemma.
    lemmas = lemma_wordnet(nltk.pos_tag(expanded))
    # Drop punctuation tokens and stopwords.
    return [w for w in lemmas if w not in punc and w not in stop_words]
################################################################################################################################################################
#### THE ABOVE Process_Text secion Re-used with Permission from:
#### R O B S A L G A D O robert.salgado@gmail.com Thank YOU!
################################################################################
示例15: to_snake_case
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def to_snake_case(self):
    """Convert string to snake case

    Converts the input string to snake case. Snake case is all lower case
    with underscores as word boundaries. e.g. this_is_snake_case.

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("helloWorld").to_snake_case().o
        "hello_world"
    """
    text = self._convert_to_str()
    # First pass: split "XxxYyy" (including acronym+Word) boundaries.
    partially_split = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", text)
    # Second pass: split lower/digit -> upper boundaries, then lowercase.
    self.state = re.sub("([a-z0-9])([A-Z])", r"\1_\2", partially_split).lower()
    return self