This article collects typical usage examples of the regex.split method in Python. If you are wondering how exactly Python's regex.split works, how to call it, or what real-world uses look like, the curated code samples below may help. You can also explore further usage examples of the regex module that this method belongs to.
The following presents 15 code examples of the regex.split method, sorted by popularity by default.
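Before diving into the examples, here is a minimal, self-contained sketch of regex.split itself (the sample strings are made up for illustration). Two behaviours matter for the snippets below: runs of delimiters can be collapsed with `+`, and a capturing group in the pattern causes the captured separators (or None) to appear in the result list.

import regex

# Split a document on runs of newlines
regex.split(r'\n+', 'para one\n\npara two\npara three')
# -> ['para one', 'para two', 'para three']

# A capturing group keeps the separators in the output
regex.split(r'([.?!]) ', 'First. Second? Third')
# -> ['First', '.', 'Second', '?', 'Third']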
Example 1: _split_doc
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def _split_doc(doc):
    """Given a doc, split it into chunks (by paragraph)."""
    curr = []
    curr_len = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue
        # Maybe group paragraphs together until we hit a length limit
        if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH:
            yield ' '.join(curr)  # emit the current chunk, then keep accumulating
            curr = []
            curr_len = 0
        curr.append(split)
        curr_len += len(split)
    if len(curr) > 0:
        yield ' '.join(curr)
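A hedged usage sketch for the generator above (GROUP_LENGTH and the sample document are assumptions for illustration; the original project defines its own limit):

GROUP_LENGTH = 20  # hypothetical limit, only for this demo

doc = "First paragraph.\n\nSecond one.\nThird, somewhat longer paragraph."
print(list(_split_doc(doc)))
# -> ['First paragraph.', 'Second one.', 'Third, somewhat longer paragraph.']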
Example 2: sentence_segment
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def sentence_segment(text):
    '''
    Args:
      text: A string. An unsegmented paragraph.
    Returns:
      A list of sentences.
    '''
    global lcode
    if lcode in ['ja', 'zh']:
        sents = regex.split(u"([。！？])?[\n]+|[。！？]", text)
    elif lcode in ['th']:
        # regex.split, not str.split, so the pattern is treated as a regular expression
        sents = regex.split("[\n]+", text)
    elif lcode in ['hi', 'bn']:  # hindi, bengali
        sents = regex.split(u"([.।?!])?[\n]+|[.।?!] ", text)
    elif lcode in ['de']:  # german
        sents = regex.split("([.?!])?[\n]+|[.?!] ", text)
        sents = [sent[0].lower() + sent[1:] for sent in sents if sent is not None and len(sent) > 1]
    else:
        sents = regex.split("([.?!])?[\n]+|[.?!] ", text)
    return sents
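Because these patterns contain a capturing group, regex.split interleaves the captured punctuation (or None where the optional group did not participate) with the sentences, which is why the German branch filters on `sent is not None`. A tiny illustration (the sample text is made up):

import regex

regex.split("([.?!])?[\n]+|[.?!] ", "Erster Satz. Zweiter Satz\nDritter")
# -> ['Erster Satz', None, 'Zweiter Satz', None, 'Dritter']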
Example 3: word_segment
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.
    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:
        words = mecab.parse(sent.encode('utf8')).split()
    elif lcode in ['th']:
        words = pythai.split(sent)
    elif lcode in ['vi']:
        words = ViTokenizer.tokenize(sent).split()
    elif lcode in ['zh']:
        words = list(jieba.cut(sent, cut_all=False))
    # elif lcode in ['ar']:
    #     words = segmenter.segment(sent).split()
    else:  # Mostly european languages
        words = sent.split()
    return words
Example 4: _split_doc
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def _split_doc(self, doc):
    """Given a doc, split it into chunks (by paragraph)."""
    curr = []
    curr_len = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue
        # Maybe group paragraphs together until we hit a length limit
        if len(curr) > 0 and curr_len + len(split) > self.GROUP_LENGTH:
            yield ' '.join(curr)
            curr = []
            curr_len = 0
        curr.append(split)
        curr_len += len(split)
    if len(curr) > 0:
        yield ' '.join(curr)
Example 5: search_docs
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def search_docs(inputs, max_ex=5, opts=None):
    """Given a set of document ids (returned by ranking for a question), search
    for top N best matching (by heuristic) paragraphs that contain the answer.
    """
    if not opts:
        raise RuntimeError('Options dict must be supplied.')
    doc_ids, q_tokens, answer = inputs
    examples = []
    for i, doc_id in enumerate(doc_ids):
        for j, paragraph in enumerate(re.split(r'\n+', fetch_text(doc_id))):
            found = find_answer(paragraph, q_tokens, answer, opts)
            if found:
                # Reverse ranking, giving priority to early docs + paragraphs
                score = (found[0], -i, -j, random.random())
                if len(examples) < max_ex:
                    heapq.heappush(examples, (score, found[1]))
                else:
                    heapq.heappushpop(examples, (score, found[1]))
    return [e[1] for e in examples]
Example 6: extract_names
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def extract_names(sender):
    """Tries to extract sender's names from `From:` header.

    It could extract not only the actual names but e.g.
    the name of the company, parts of email, etc.

    >>> extract_names('Sergey N. Obukhov <serobnic@mail.ru>')
    ['Sergey', 'Obukhov', 'serobnic']
    >>> extract_names('')
    []
    """
    sender = to_unicode(sender, precise=True)
    # Remove non-alphabetical characters
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
    # words like `ru`, `gmail`, `com`, `org`, etc.
    sender = [word for word in sender.split() if len(word) > 1 and
              not word in BAD_SENDER_NAMES]
    # Remove duplicates
    names = list(set(sender))
    return names
Example 7: capitalized_words_percent
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def capitalized_words_percent(s):
    '''Returns capitalized words percent.'''
    s = to_unicode(s, precise=True)
    words = re.split(r'\s', s)
    words = [w for w in words if w.strip()]
    words = [w for w in words if len(w) > 2]
    capitalized_words_counter = 0
    valid_words_counter = 0
    for word in words:
        if not INVALID_WORD_START.match(word):
            valid_words_counter += 1
            if word[0].isupper() and not word[1].isupper():
                capitalized_words_counter += 1
    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter
    return 0
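Splitting on a single `\s` (rather than `\s+`) leaves empty strings wherever whitespace characters are adjacent, which is exactly what the `w.strip()` filter above cleans up. A quick illustration (the sample string is made up):

import regex as re

re.split(r'\s', 'Hello  World')   # -> ['Hello', '', 'World']
re.split(r'\s+', 'Hello  World')  # -> ['Hello', 'World']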
Example 8: add_cmdline_args
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def add_cmdline_args(argparser):
    """
    Add command-line arguments specifically for this agent.
    """
    agent = argparser.add_argument_group('RetrieverReader Arguments')
    agent.add_argument('--retriever-model-file', type=str, default=None)
    agent.add_argument('--reader-model-file', type=str, default=None)
    agent.add_argument(
        '--num-retrieved', type=int, default=5, help='how many passages to retrieve'
    )
    agent.add_argument(
        '--split-paragraphs',
        type='bool',
        default=True,
        help='Whether to split the retrieved passages into paragraphs',
    )
    return agent
Example 9: _split_doc
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def _split_doc(self, doc):
    """
    Given a doc, split it into chunks (by paragraph).
    """
    GROUP_LENGTH = 0
    docs = []
    curr = []
    curr_len = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue
        # Maybe group paragraphs together until we hit a length limit
        if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH:
            # yield ' '.join(curr)
            docs.append(' '.join(curr))
            curr = []
            curr_len = 0
        curr.append(split)
        curr_len += len(split)
    if len(curr) > 0:
        # yield ' '.join(curr)
        docs.append(' '.join(curr))
    return docs
Example 10: ascii_emoticons
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def ascii_emoticons(index, question, answer):
    global valid_emoticon
    valid_emoticon = False

    # Disabled
    if score_settings['ascii_emoticon_modifier_value'] is None:
        return 0

    # Split by words (tokens)
    tokens = answer.split()

    # Calculate emoticon score: mark tokens whose non-alphanumeric ratio exceeds the threshold,
    # then count only the first token of each consecutive run of marked tokens
    score = [1 if len(token) > 1 and len(re.findall('[^a-zA-Z0-9]', token)) / len(token) >
             score_settings['ascii_emoticon_non_char_to_all_chars_ratio'] else 0 for token in tokens]
    score = sum([1 if (index > 0 and score[index - 1] == 0 and value == 1) or (index == 0 and value == 1) else 0
                 for index, value in enumerate(score)]) * score_settings['ascii_emoticon_modifier_value']

    if score:
        valid_emoticon = True

    return score
# Check if sentence includes 'unk' token
Example 11: split_by
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def split_by(self, pattern: str = "\n", trim=True):
    """Split a string by the given pattern

    Args:
        pattern (str, optional): Pattern to split by. Defaults to '\\n'.
        trim (bool, optional): Trim whitespace after split. Defaults to True

    Returns:
        Chepy: The Chepy object.
    """
    if trim:
        self.state = list(
            map(pydash.trim, re.split(pattern, self._convert_to_str()))
        )
    else:
        self.state = re.split(pattern, self._convert_to_str())
    return self
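A sketch of how this might be called in Chepy's fluent style (the sample input and pattern are made up; check Chepy's own docs for the canonical usage):

from chepy import Chepy

# Split a comma/semicolon separated string and trim each piece
Chepy("a, b ;c").split_by(r"[,;]").state
# -> ['a', 'b', 'c']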
Example 12: _split_doc
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def _split_doc(self, doc):
    """Given a doc, split it into chunks (by paragraph)."""
    GROUP_LENGTH = 0
    docs = []
    curr = []
    curr_len = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue
        # Maybe group paragraphs together until we hit a length limit
        if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH:
            # yield ' '.join(curr)
            docs.append(' '.join(curr))
            curr = []
            curr_len = 0
        curr.append(split)
        curr_len += len(split)
    if len(curr) > 0:
        # yield ' '.join(curr)
        docs.append(' '.join(curr))
    return docs
Example 13: _process_text_line
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def _process_text_line(self, text):
    split_text = [token for token in new_regex.split(self.tokenisation_pattern, text)
                  if token != '']
    if self.replace_whitespace:
        new_text = []
        for token in split_text:
            if token.isspace():
                new_text.append(self.replace_whitespace)
            else:
                new_text.append(token)
        split_text = new_text
    split_text = [token.strip(u' ') for token in split_text]  ## prevent multiple spaces
    split_text = [token for token in split_text if token != u'']  ## prevent multiple spaces
    split_text = [token.lower() for token in split_text]  ## lowercase
    text = ' '.join(split_text)
    return text
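For whitespace tokens to show up in split_text and then be replaced, `self.tokenisation_pattern` presumably contains a capturing group, since regex.split only keeps separators in its output when the pattern captures them. A small illustration with a made-up pattern (not the project's actual one):

import regex as new_regex

new_regex.split(r'(\s+)', 'two  words')
# -> ['two', '  ', 'words']  (the captured whitespace is kept as its own token)

new_regex.split(r'\s+', 'two  words')
# -> ['two', 'words']        (without a group, the separators are dropped)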
Example 14: setup
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def setup(*args, **kwargs):
    try:
        global binarization
        global pageseg
        global rpred
        global models
        global mod_db
        from kraken import binarization
        from kraken import pageseg
        from kraken import rpred
        from kraken.lib import models
        # pronn/clstm models get prioritized over pyrnn ones
        mod_db = {k: storage.get_abs_path(*v) for k, v in nidaba_cfg['ocropus_models'].iteritems()}
        if kwargs.get('modeldata'):
            md = kwargs.get('modeldata')
            if isinstance(md, list):
                md = storage.get_abs_path(md)
            for model in glob.glob(md + '/*/*/DESCRIPTION'):
                with open(model) as fp:
                    meta = json.load(fp)
                mod_db[model.split('/')[-2]] = os.path.join(os.path.dirname(model), meta['name'])
            ocr_kraken.arg_values['model'] = mod_db.keys()
    except ImportError as e:
        raise NidabaPluginException(e.message)
Example 15: extract_unencoded_urls
# Required imports: import regex [as an alias]
# Or: from regex import split [as an alias]
def extract_unencoded_urls(data, refang=False, strip=False):
    """Extract only unencoded URLs.

    :param data: Input text
    :param bool refang: Refang output?
    :param bool strip: Strip possible garbage from the end of URLs
    :rtype: Iterator[:class:`str`]
    """
    unencoded_urls = itertools.chain(
        GENERIC_URL_RE.finditer(data),
        BRACKET_URL_RE.finditer(data),
        BACKSLASH_URL_RE.finditer(data),
    )
    for url in unencoded_urls:
        if refang:
            url = refang_url(url.group(1))
        else:
            url = url.group(1)

        if strip:
            url = re.split(URL_SPLIT_STR, url)[0]

        yield url
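The `strip` branch shows a common idiom: split on a "garbage" pattern and keep only the first element, which truncates the URL at the first delimiter. A minimal sketch with an invented pattern (not iocextract's actual URL_SPLIT_STR):

import regex as re

URL_SPLIT_STR_DEMO = r'["\s<>]'  # illustrative only
re.split(URL_SPLIT_STR_DEMO, 'http://example.com/path">trailing markup')[0]
# -> 'http://example.com/path'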