本文整理汇总了Python中regex.UNICODE属性的典型用法代码示例。如果您正苦于以下问题:Python regex.UNICODE属性的具体用法?Python regex.UNICODE怎么用?Python regex.UNICODE使用的例子?那么, 这里精选的属性代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在类regex
的用法示例。
在下文中一共展示了regex.UNICODE属性的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def __init__(self, **kwargs):
    """
    Build the master tokenization pattern from the class-level sub-patterns.

    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        # CONSISTENCY FIX: the "ellipses" group originally used the
        # regex-module-only spelling (?<name>...). It is equivalent at
        # runtime, but inconsistent with the other 15 groups and breaks
        # if this pattern is ever ported to the stdlib `re` module, so it
        # now uses the standard (?P<name>...) form.
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
        (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
         self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
         self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
         self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
         self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    # Annotators are unsupported here: warn and drop them rather than fail.
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
示例2: _phrase_to_regex
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def _phrase_to_regex(phrase):
    """Build a regex pattern (VERSION1 syntax) that locates ``phrase`` in text."""
    flags = regex.VERSION1 | regex.UNICODE
    # Whitespace between words stands for any run of non-alphanumeric
    # characters ([\w--_] is \w minus underscore, via V1 set subtraction).
    escaped_words = [regex.escape(word) for word in phrase.split()]
    pattern = r"[^\w--_]+".join(escaped_words)
    if phrase == " ":
        # A lone space matches exactly one whitespace character; handy for
        # selecting things like non-breaking spaces in browsers.
        pattern = r"\s"
    else:
        # Leading/trailing spaces in the phrase match any whitespace char.
        # TODO Support newlines. Note that these are frequently implemented as
        # separate text nodes in the accessibility tree, so the obvious
        # implementation would not work well.
        if phrase.startswith(" "):
            pattern = r"\s" + pattern
        if phrase.endswith(" "):
            pattern += r"\s"
    # Anchor at alphanumeric-sequence boundaries only when the phrase itself
    # begins/ends with an alphanumeric character.
    if regex.search(r"^[\w--_]", phrase, flags):
        pattern = r"(?<![\w--_])" + pattern
    if regex.search(r"[\w--_]$", phrase, flags):
        pattern += r"(?![\w--_])"
    return pattern
示例3: __init__
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def __init__(self, opt: Opt, shared: TShared = None):
    """
    Initialize the BPE module.

    :param opt:
        options
    :param shared:
        shared dictionary
    """
    super().__init__(opt, shared)
    # Fail fast when the optional subword-nmt dependency is missing.
    if not SUBWORD_BPE_INSTALLED:
        raise RuntimeError(
            "Please run \"pip install 'git+https://github.com/rsennrich"
            "/subword-nmt.git#egg=subword-nmt'\""
        )
    dict_file = opt.get('dict_file')
    if not dict_file:
        raise RuntimeError('--dict-file is mandatory.')
    # Pre-compiled splitter: runs of word chars, or single non-space symbols.
    self.splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)
    self.codecs = f"{dict_file}.codecs"
    if os.path.exists(self.codecs):
        self._load_from_codecs()
示例4: sdm_sim
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def sdm_sim(headlines, bodies):
    """
    Compute a cortical.io semantic-similarity feature for each
    (headline, body) pair.

    Args:
        headlines: iterable of headline strings.
        bodies: iterable of body strings (parallel to ``headlines``).

    Returns:
        list of single-element feature lists ``[cosineSimilarity]``.
    """
    # Hoisted out of the per-pair closure: one API client and one compiled
    # pattern instead of re-creating both for every pair.
    # SECURITY NOTE(review): the API key is hard-coded here; it should be
    # moved to configuration/environment rather than source control.
    fullClient = retinasdk.FullClient(
        "e8bf8de0-fe52-11e6-b22d-93a4ae922ff1",
        apiServer="http://api.cortical.io/rest",
        retinaName="en_associative")
    # CJK ideograph ranges to strip from the body before the API call.
    cjk_re = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)

    def similarity(headline, body):
        clean_headline = clean(headline)
        clean_body = cjk_re.sub(u'', clean(body))
        # Round-trip through UTF-8, dropping anything un-encodable.
        clean_body = clean_body.encode('utf8', 'ignore').decode('utf8', 'ignore')
        # BUG FIX: str.replace returns a new string; the original call
        # discarded the result, making it a no-op.
        clean_body = clean_body.replace("0x6e", " ")
        comp_with_stop_words = fullClient.compare(
            '[{"text": "' + clean_headline + '"}, {"text": "' + clean_body + '"}]')
        sim = comp_with_stop_words.cosineSimilarity
        return [sim]

    x = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        x.append(similarity(headline, body))
    return x
示例5: remove_punctuation
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def remove_punctuation(text: str, marks=None) -> str:
    """
    Remove punctuation from ``text`` by removing all instances of ``marks``.

    Args:
        text (str): Urdu text
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.

    Returns:
        str: returns a ``str`` object containing normalized text.

    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former's performance is about 5-10x faster.

    Examples:
        >>> from urduhack.preprocessing import remove_punctuation
        >>> output = remove_punctuation("کر ؟ سکتی ہے۔")
        کر سکتی ہے
    """
    if marks:
        # Collapse any run of the requested marks into nothing.
        return re.sub(f"[{re.escape(marks)}]+", '', text, flags=re.UNICODE)
    # Fast path: a single C-level translate pass over the whole text.
    return text.translate(PUNCTUATION_TRANSLATE_UNICODE)
示例6: _set_splitters
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def _set_splitters(self, settings=None):
    """Partition skip/keep tokens into the splitter categories used later."""
    # Tokens that survive the split (are not filtered out of the output).
    capturing = set(ALWAYS_KEEP_TOKENS)
    # Tokens that split the string only when not surrounded by letters
    # on both sides.
    conditional = set()
    wordchars = self._get_wordchars(settings)
    for token in set(self.info.get('skip', [])) | capturing:
        # Only purely non-word tokens are candidates, and only those that
        # also appear in the locale's word characters split conditionally.
        if re.match(r'^\W+$', token, re.UNICODE) and token in wordchars:
            conditional.add(token)
    self._splitters = {'wordchars': conditional, 'capturing': capturing}
示例7: regex_match_score
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def regex_match_score(prediction, pattern):
    """Check if the prediction matches the given regular expression.

    Returns False (after logging) instead of raising when ``pattern``
    fails to compile.
    """
    try:
        compiled = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE
        )
    except Exception:
        # Narrowed from `except BaseException`, which also swallowed
        # KeyboardInterrupt and SystemExit; `logger.warn` was a deprecated
        # alias of `warning`, and %-args are now passed lazily.
        logger.warning('Regular expression failed to compile: %s', pattern)
        return False
    # Anchored at the start of the prediction (match, not search).
    return compiled.match(prediction) is not None
示例8: has_answer
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def has_answer(args, answer, t):
    """Return (found, spans): whether any answer occurs in token list ``t``.

    Args:
        args: namespace with a ``dataset`` attribute; "CuratedTrec" answers
            are treated as regex patterns, all other datasets as token lists.
        answer: list of answers (regex strings for CuratedTrec; otherwise
            each entry is a sequence of tokens).
        t: list of tokens to search within.

    Returns:
        (bool, list): match flag and a list of inclusive (start, end) token
        index spans for every occurrence found.
    """
    global PROCESS_TOK
    # Lowercase the candidate tokens once up front.
    text = []
    for i in range(len(t)):
        text.append(t[i].lower())
    res_list = []
    if (args.dataset == "CuratedTrec"):
        # CuratedTrec answers are regex patterns; a pattern that fails to
        # compile simply counts as "no match".
        try:
            ans_regex = re.compile("(%s)"%answer[0], flags=re.IGNORECASE + re.UNICODE)
        except:
            return False, res_list
        paragraph = " ".join(text)
        answer_new = ans_regex.findall(paragraph)
        for a in answer_new:
            # NOTE(review): when the compiled pattern has exactly one group,
            # findall returns plain strings and ``a[0]`` is only the FIRST
            # CHARACTER of the match; it yields the full match only when the
            # answer pattern itself contains groups (findall then returns
            # tuples). Confirm this is the intended behavior.
            single_answer = normalize(a[0])
            single_answer = PROCESS_TOK.tokenize(single_answer)
            single_answer = single_answer.words(uncased=True)
            # Slide a window over `text` looking for an exact token-sequence match.
            for i in range(0, len(text) - len(single_answer) + 1):
                if single_answer == text[i: i + len(single_answer)]:
                    res_list.append((i, i+len(single_answer)-1))
    else:
        for a in answer:
            # Gold answers are token sequences; normalize and re-tokenize
            # them the same way as the paragraph before comparing.
            single_answer = " ".join(a).lower()
            single_answer = normalize(single_answer)
            single_answer = PROCESS_TOK.tokenize(single_answer)
            single_answer = single_answer.words(uncased=True)
            for i in range(0, len(text) - len(single_answer) + 1):
                if single_answer == text[i: i + len(single_answer)]:
                    res_list.append((i, i+len(single_answer)-1))
    if (len(res_list)>0):
        return True, res_list
    else:
        return False, res_list
示例9: __init__
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
    """
    # One alternation: an alphanumeric run, or any single non-space char.
    self._regexp = regex.compile(
        '({})|({})'.format(self.ALPHA_NUM, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    annotators = kwargs.get('annotators', {})
    if len(annotators) > 0:
        # Annotators are unsupported here: warn and drop them.
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
示例10: regex_match
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def regex_match(text, pattern):
    """Test if a regex pattern is contained within a text.

    Returns False when ``pattern`` does not compile.
    """
    try:
        compiled = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE,
        )
    except Exception:
        # Narrowed from `except BaseException`, which also swallowed
        # KeyboardInterrupt and SystemExit.
        return False
    # Unanchored containment check (search, not match).
    return compiled.search(text) is not None
示例11: clean_name
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def clean_name(self):
    """Validate the label name field and return the stripped value.

    Raises forms.ValidationError for reserved names, over-long names, and
    names not starting with a word character.
    """
    name = self.cleaned_data["name"].strip()
    if name.lower() == "flagged":
        # "Flagged" is reserved for the system label.
        raise forms.ValidationError(_("Reserved label name"))
    elif len(name) > Label.MAX_NAME_LEN:
        raise forms.ValidationError(_("Label name must be %d characters or less") % Label.MAX_NAME_LEN)
    # First character must be a word char. BUG FIX: matching against `name`
    # (regex.match is anchored at the start) instead of `name[0]` avoids an
    # IndexError on an empty/whitespace-only name and correctly rejects it
    # with a validation error instead.
    elif not regex.match(r"\w", name, flags=regex.UNICODE):
        raise forms.ValidationError(_("Label name must start with a letter or digit"))
    return name
示例12: matches
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def matches(self, message):
    """Evaluate this rule's keyword quantifier against the message text."""
    text = normalize(message.text)

    def keyword_check(word):
        # Factory binds `word` per keyword (avoids late-binding closures) and
        # returns a lazy predicate for whole-word presence in `text`.
        # NOTE(review): keywords are interpolated into the pattern unescaped,
        # so they appear to be assumed regex-safe — confirm upstream.
        return lambda: bool(
            regex.search(r"\b" + word + r"\b", text, flags=regex.UNICODE | regex.V0)
        )

    return self.quantifier.evaluate([keyword_check(k) for k in self.keywords])
示例13: replace_tippi
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def replace_tippi(cls, text):
    """Rewrite Gurmukhi tippi gemination (ੱX) into explicit conjunct form.

    For stop consonants the geminate is written with the class's base
    consonant plus virama (ੱਖ -> ਕ੍ਖ); for the remaining consonants the
    consonant doubles itself (ੱਨ -> ਨ੍ਨ).

    FIX: the original chained ten near-identical regex.sub calls using
    non-raw replacement strings ("…\\g<1>"), which rely on Python passing
    unknown escapes through and emit SyntaxWarning/DeprecationWarning on
    modern Python; raw strings and a data-driven loop are used instead.
    """
    import regex
    # (consonant class following the tippi, base consonant to prepend)
    pairs = [
        ("ਕਖ", "ਕ"), ("ਗਘ", "ਗ"), ("ਚਛ", "ਚ"), ("ਜਝ", "ਜ"),
        ("ਟਠ", "ਟ"), ("ਡਢ", "ਡ"), ("ਤਥ", "ਤ"), ("ਦਧ", "ਦ"),
        ("ਪਫ", "ਪ"), ("ਬਭ", "ਬ"),
    ]
    for chars, base in pairs:
        text = regex.sub("ੱ([" + chars + "])", base + r"੍\g<1>", text,
                         flags=regex.UNICODE)
    # All other consonants geminate as themselves: ੱX -> X੍X.
    text = regex.sub("ੱ([ਯਰਲਵਸ਼ਸਹਙਞਣਨਮਜ਼ੜਫ਼])", r"\g<1>੍\g<1>", text,
                     flags=regex.UNICODE)
    return text
示例14: __init__
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
    """
    # Two capture alternatives: an alphanumeric run, or any non-space char.
    pattern = '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS)
    flags = regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    self._regexp = regex.compile(pattern, flags=flags)
    if len(kwargs.get('annotators', {})) > 0:
        # Annotators are unsupported; warn and discard rather than fail.
        logger.warning(
            '%s only tokenizes! Skipping annotators: %s'
            % (type(self).__name__, kwargs.get('annotators'))
        )
    self.annotators = set()
示例15: __init__
# 需要导入模块: import regex [as 别名]
# 或者: from regex import UNICODE [as 别名]
def __init__(self, **kwargs):
    """
    Build the master tokenization pattern from the class-level sub-patterns.

    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        # CONSISTENCY FIX: the "ellipses" group originally used the
        # regex-module-only spelling (?<name>...); equivalent at runtime but
        # inconsistent with the other 15 groups and non-portable to the
        # stdlib `re` module, so it now uses the standard (?P<name>...) form.
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)'
        % (
            self.DIGIT,
            self.TITLE,
            self.ABBRV,
            self.NEGATION,
            self.HYPHEN,
            self.CONTRACTION1,
            self.ALPHA_NUM,
            self.CONTRACTION2,
            self.START_DQUOTE,
            self.END_DQUOTE,
            self.START_SQUOTE,
            self.END_SQUOTE,
            self.DASH,
            self.ELLIPSES,
            self.PUNCT,
            self.NON_WS,
        ),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE,
    )
    # Annotators are unsupported here: warn and drop them rather than fail.
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning(
            '%s only tokenizes! Skipping annotators: %s'
            % (type(self).__name__, kwargs.get('annotators'))
        )
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)