本文整理匯總了Python中regex.sub方法的典型用法代碼示例。如果您正苦於以下問題:Python regex.sub方法的具體用法?Python regex.sub怎麽用?Python regex.sub使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類regex
的用法示例。
在下文中一共展示了regex.sub方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: pred_to_dict
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def pred_to_dict(text, pred, prob):
    """Collapse per-character class predictions into a receipt-field dict.

    Contiguous runs of the same predicted class become candidate values
    for "company", "date", "address" or "total"; for each field the run
    with the highest probability wins.  Tabs/newlines in the winning text
    are flattened to spaces.
    """
    best = {"company": ("", 0), "date": ("", 0), "address": ("", 0), "total": ("", 0)}
    field_names = list(best.keys())

    # Segment boundaries: every position where the predicted class changes.
    boundaries = [0] + (numpy.nonzero(numpy.diff(pred))[0] + 1).tolist() + [len(pred)]

    for start, stop in zip(boundaries, boundaries[1:]):
        label = pred[start] - 1
        if label == -1:
            continue  # class 0 is background, not a field
        name = field_names[label]
        confidence = prob[start:stop].max()
        if confidence > best[name][1]:
            best[name] = (text[start:stop], confidence)

    return {k: regex.sub(r"[\t\n]", " ", v[0].strip()) for k, v in best.items()}
示例2: normalize_answer
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    lowered = s.lower()
    # Strip all ASCII punctuation in a single translate pass.
    no_punct = lowered.translate(str.maketrans("", "", string.punctuation))
    # Drop English articles, leaving a space so neighbors do not merge.
    no_articles = re.sub(r"\b(a|an|the)\b", " ", no_punct)
    # Collapse runs of whitespace and trim the ends.
    return " ".join(no_articles.split())
示例3: make_vocab
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def make_vocab(fpath, fname):
    '''Constructs vocabulary.

    Reads the input file, keeps only Latin letters, apostrophes and
    whitespace, then writes "word<TAB>count" lines in descending
    frequency order to `preprocessed/fname`.  Four special tokens
    (<PAD>, <UNK>, <S>, </S>) are written first with a huge dummy count
    so they survive any frequency-based vocabulary truncation.

    Args:
        fpath: A string. Input file path.
        fname: A string. Output file name.

    Writes vocabulary line by line to `preprocessed/fname`.
    '''
    # Use a context manager: the original leaked the input file handle.
    with codecs.open(fpath, 'r', 'utf-8') as fin:
        text = fin.read()
    # Raw string fixes the invalid escape sequences (\s, \p) of the
    # original literal.  \p{Latin} requires the third-party `regex`
    # module; stdlib `re` does not support Unicode property classes.
    text = regex.sub(r"[^\s\p{Latin}']", "", text)
    word2cnt = Counter(text.split())
    if not os.path.exists('preprocessed'):
        os.mkdir('preprocessed')
    with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout:
        fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("<PAD>", "<UNK>", "<S>", "</S>"))
        for word, cnt in word2cnt.most_common(len(word2cnt)):
            fout.write(u"{}\t{}\n".format(word, cnt))
示例4: normalize
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def normalize(self, text):
    """
    Returns a string with normalized punctuation.

    Runs the configured (pattern, replacement) substitution rules in
    order, with optional unicode-punctuation replacement before and
    control-character removal after.
    """
    # Optional pre-pass: map unicode punctuation first.
    if self.pre_replace_unicode_punct:
        text = self.replace_unicode_punct(text)

    # Apply every substitution rule in declaration order.
    for pattern, replacement in self.substitutions:
        text = re.sub(pattern, replacement, text_type(text))

    # Optional post-pass: strip control characters.
    if self.post_remove_control_chars:
        text = self.remove_control_chars(text)
    return text
示例5: _load_search_pattern
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def _load_search_pattern(self):
    """Compile the grok-style `self.pattern` into a Python regex.

    Repeatedly expands %{NAME}, %{NAME:custom} and %{NAME:custom:type}
    references using `self.predefined_patterns` (patterns may reference
    other patterns, hence the loop).  Declared types are collected into
    `self.type_mapper` as {custom_name: type}; the final compiled regex
    is stored in `self.regex_obj`.
    """
    self.type_mapper = {}
    py_regex_pattern = self.pattern
    while True:
        # Record declared types from %{name:custom:type} references.
        for _, custom_name, type_name in re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern):
            self.type_mapper[custom_name] = type_name
        # Replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type})
        # with the expanded regex inside a named capture group.
        py_regex_pattern = re.sub(
            r'%{(\w+):(\w+)(?::\w+)?}',
            lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
            py_regex_pattern)
        # Replace plain %{pattern_name} with an anonymous group.
        py_regex_pattern = re.sub(
            r'%{(\w+)}',
            lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
            py_regex_pattern)
        # Raw string: the original non-raw literal '%{\w+(:\w+)?}' relied
        # on invalid escape sequences (a warning in modern Python).
        if re.search(r'%{\w+(:\w+)?}', py_regex_pattern) is None:
            break
    self.regex_obj = re.compile(py_regex_pattern)
示例6: normalize_phone
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def normalize_phone(cls, number):
    """
    Normalizes the passed in phone number, returning E164 format when
    the number parses as a possible phone number, otherwise the cleaned
    input.
    """
    # Remove any invalid characters (keep digits, lowercase letters, '+').
    # BUG FIX: the original passed regex.V0 positionally, where regex.sub
    # interprets the fourth argument as `count`, not `flags` — the flag
    # value was silently consumed as a replacement limit.
    number = regex.sub(r"[^0-9a-z\+]", "", number.lower(), flags=regex.V0)

    # Add on a plus if it looks like it could be a fully qualified number.
    if len(number) >= 11 and number[0] not in ["+", "0"]:
        number = "+" + number

    try:
        normalized = phonenumbers.parse(number)
        if phonenumbers.is_possible_number(normalized):
            return phonenumbers.format_number(normalized, phonenumbers.PhoneNumberFormat.E164)
    except Exception:
        # Best-effort: fall through and return the cleaned input.
        pass
    return number
示例7: sql
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def sql(self, query: str):
    """
    Convert a SQL query to an Ibis table expression.

    Parameters
    ----------
    query : string

    Returns
    -------
    table : TableExpr
    """
    # Detach a trailing `; -- comment` onto its own comment line, then
    # drop any remaining trailing semicolon.
    cleaned = re.sub(r'\s*;\s*--', '\n--', query.strip())
    cleaned = re.sub(r'\s*;\s*$', '', cleaned.strip())
    schema = self._get_schema_using_validator(cleaned)
    return ops.SQLQueryResult(cleaned, schema, self).to_expr()
示例8: _handle_splits
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def _handle_splits(_str):
    """Check if incoming date has a '-' or '/'; if so, split it into a
    start/stop pair and resolve each side to raw text plus an epoch."""
    normalized = _str.replace('/', '-')
    if '-' in normalized:
        start, stop = normalized.split('-')
        # NOTE(review): regex.sub(pattern, repl, string) — here the numeric
        # half is used as the *replacement* applied to the other half.
        # Confirm this argument order is intentional upstream.
        if _check_number(start):
            start = regex.sub(r'[0-9]+\?*', start, stop)
        elif _check_number(stop):
            stop = regex.sub(r'[0-9]+\?*', stop, start)
    else:
        # No range: both ends of the interval are the same date.
        start = stop = normalized
    return {
        'start_raw': start,
        'stop_raw': stop,
        'start_epoch': _get_epoch(start),
        'stop_epoch': _get_epoch(stop),
    }
示例9: simplify_accent_notation
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def simplify_accent_notation(cls, text):
    """Rewrite accented characters into a single, uniform combining-mark
    notation so downstream comparisons see one canonical form."""
    # References: https://en.wikipedia.org/wiki/Combining_Diacritical_Marks
    # Each replacement maps a precomposed accented character to its
    # decomposed equivalent (base letter + combining accent); the two
    # arguments render identically but differ in code points.
    text = text.replace("á", "á")
    text = text.replace("í", "í")
    text = text.replace("ú", "ú")
    text = text.replace("ŕ", "ŕ")
    text = text.replace("é", "é")
    text = text.replace("ó", "ó")
    text = text.replace("à", "à")
    text = text.replace("ì", "ì")
    text = text.replace("ù", "ù")
    text = text.replace("è", "è")
    text = text.replace("ò", "ò")
    # Swap ordering so grave/acute combining accents come after the other
    # combining marks in the second class.
    # NOTE(review): the character classes contain combining marks that are
    # hard to inspect visually — verify the exact mark sets against the
    # reference above before editing this line.
    text = regex.sub("([̀́])([̥̇¯̄]+)", "\\2\\1", text)
    return text
示例10: build_detection_class
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def build_detection_class(folder, dataset_filename,
                          label, sender_known=True):
    """Builds signature detection class.

    Signature detection dataset includes patterns for two classes:

    * class for positive patterns (goes with label 1)
    * class for negative patterns (goes with label -1)

    Emails in `folder` are parsed, annotation markers are stripped, and
    each message becomes one comma-separated feature row (ending with
    `label`) appended to `dataset_filename`.

    >>> build_detection_class('emails/P', 'train.data', 1)
    """
    # The annotation-stripping pattern is loop-invariant; build it once.
    annotation_re = '|'.join(ANNOTATIONS)
    with open(dataset_filename, 'a') as dataset:
        for entry in os.listdir(folder):
            path = os.path.join(folder, entry)
            sender, msg = parse_msg_sender(path, sender_known)
            if sender is None or msg is None:
                continue
            msg = re.sub(annotation_re, '', msg)
            features_row = build_pattern(msg, features(sender))
            features_row.append(label)
            dataset.write(','.join(str(e) for e in features_row) + '\n')
示例11: _replace_link_brackets
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def _replace_link_brackets(msg_body):
    """
    Normalize links i.e. replace '<', '>' wrapping the link with '@@'
    symbols so that '>' closing the link couldn't be mistakenly taken
    for a quotation marker.

    Converts msg_body into a unicode.
    """
    if isinstance(msg_body, bytes):
        msg_body = msg_body.decode('utf8')

    def _wrap(match):
        # Leave links untouched on lines that are already quoted ('>').
        line_start = msg_body[:match.start()].rfind("\n")
        if msg_body[line_start + 1] == ">":
            return match.group()
        return "@@%s@@" % match.group(1)

    return re.sub(RE_LINK, _wrap, msg_body)
示例12: _wrap_splitter_with_newline
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
    """
    Splits line in two if splitter pattern preceded by some text on the
    same line (done only for 'On <date> <person> wrote:' pattern and
    only for text/plain bodies).
    """
    def _prefix(match):
        """Prepend the delimiter unless the splitter starts a line."""
        at_line_start = match.start() == 0 or msg_body[match.start() - 1] == '\n'
        if at_line_start:
            return match.group()
        return '%s%s' % (delimiter, match.group())

    if content_type == 'text/plain':
        msg_body = re.sub(RE_ON_DATE_SMB_WROTE, _prefix, msg_body)
    return msg_body
示例13: sentence_split
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def sentence_split(sentence):
    """Split a sentence into words using the embedded-detokenizer '▁'
    markers; fall back to plain whitespace splitting when disabled."""
    # Without an embedded detokenizer, whitespace is the only delimiter.
    if not preprocessing['embedded_detokenizer']:
        return sentence.split()

    global re_split
    # Double the '▁' markers so every word boundary becomes ' ▁'.
    line = ' ▁▁' + sentence[1:].replace('▁', '▁▁')
    line = re_split.sub(r' ▁\1\2 ▁', line)
    # Split on ' ▁', strip each piece, drop empties and bare markers.
    stripped = (piece.strip() for piece in line.split(' ▁'))
    return [tok for tok in stripped if tok and tok != '▁']
# Load json file with BPE join pairs
示例14: process_text
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def process_text(text):
    """Clean raw (possibly HTML) text into a list of lemmatized,
    lowercase tokens with punctuation and stopwords removed."""
    # Strip markup twice: BeautifulSoup extracts the visible text, then a
    # regex removes any tag-like remnants it left behind.
    visible = BeautifulSoup(text, "lxml").get_text()
    stripped = re.sub('<[^>]*>', '', visible)
    # Tokenize, lowercase, and expand contractions ("don't" -> "do not").
    tokens = [tok.lower() for tok in casual_tokenizer(stripped)]
    expanded = [expandContractions(tok, c_re=c_re) for tok in tokens]
    # POS-tag so the WordNet lemmatizer can pick the right lemma.
    lemmas = lemma_wordnet(nltk.pos_tag(expanded))
    # Drop punctuation tokens and stopwords.
    return [w for w in lemmas if w not in punc and w not in stop_words]
################################################################################################################################################################
#### THE ABOVE Process_Text secion Re-used with Permission from:
#### R O B S A L G A D O robert.salgado@gmail.com Thank YOU!
################################################################################
示例15: to_snake_case
# 需要導入模塊: import regex [as 別名]
# 或者: from regex import sub [as 別名]
def to_snake_case(self):
    """Convert string to snake case

    Converts the input string to snake case. Snake case is all lower case
    with underscores as word boundaries. e.g. this_is_snake_case.

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("helloWorld").to_snake_case().o
        "hello_world"
    """
    text = self._convert_to_str()
    # First pass: split "XxxYyy" (including acronym+Word) boundaries.
    partially_split = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", text)
    # Second pass: split lower/digit -> upper boundaries, then lowercase.
    self.state = re.sub("([a-z0-9])([A-Z])", r"\1_\2", partially_split).lower()
    return self