This article collects typical usage examples of the regex.split method in Python. If you are wondering how exactly to use Python's regex.split, or would like to see it in action, the curated code samples below may help. You can also explore further usage examples of the regex module to which the method belongs.
A total of 15 code examples of the regex.split method are shown below, sorted by popularity by default.
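Before working through the examples, here is a minimal sketch of the basic call. regex.split behaves like the standard library's re.split: it returns the substrings between matches of the pattern, and if the pattern contains capturing groups, the captured delimiters are included in the result as well.

import regex

# Split on runs of whitespace
print(regex.split(r'\s+', 'split  this\tstring'))
# ['split', 'this', 'string']

# A capturing group keeps the matched delimiters in the output list
print(regex.split(r'([.?!]) ', 'First. Second? Third!'))
# ['First', '.', 'Second', '?', 'Third!']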
Example 1: _split_doc
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def _split_doc(doc):
    """Given a doc, split it into chunks (by paragraph)."""
    curr = []
    curr_len = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue
        # Maybe group paragraphs together until we hit a length limit
        if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH:
            return ' '.join(curr)
            curr = []
            curr_len = 0
        curr.append(split)
        curr_len += len(split)
    if len(curr) > 0:
        return ' '.join(curr)
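A quick usage sketch for this variant: GROUP_LENGTH is a module-level constant defined elsewhere in the source, so the value below is only an assumption for illustration (and it must live in the same module as the function). Note that, as written, this variant returns a single joined chunk rather than yielding every chunk; compare Example 4 below.

GROUP_LENGTH = 500  # assumed value; the real constant is defined in the source module

doc = "First paragraph.\n\nSecond paragraph.\n\n\nThird paragraph."
print(_split_doc(doc))
# 'First paragraph. Second paragraph. Third paragraph.'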
Example 2: sentence_segment
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def sentence_segment(text):
    '''
    Args:
      text: A string. An unsegmented paragraph.

    Returns:
      A list of sentences.
    '''
    global lcode
    if lcode in ['ja', 'zh']:
        sents = regex.split(u"([。!?])?[\n]+|[。!?]", text)
    elif lcode in ['th']:
        sents = text.split("[\n]+")
    elif lcode in ['hi', 'bn']:  # hindi, bengali
        sents = regex.split(u"([.।?!])?[\n]+|[.।?!] ", text)
    elif lcode in ['de']:  # german
        sents = regex.split("([.?!])?[\n]+|[.?!] ", text)
        sents = [sent[0].lower() + sent[1:] for sent in sents if sent is not None and len(sent) > 1]
    else:
        sents = regex.split("([.?!])?[\n]+|[.?!] ", text)
    return sents
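A hypothetical call for the default branch: lcode is a module-level global holding the language code, so it is set here only for illustration. The output also shows why the German branch filters out None entries: when the alternative without the capturing group matches, re-style splitting inserts None for the unmatched group.

lcode = 'en'  # assumed; in the source this global is set elsewhere

print(sentence_segment("This is one sentence. Here is another!\nAnd a third?"))
# ['This is one sentence', None, 'Here is another', '!', 'And a third?']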
Example 3: word_segment
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.

    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:
        words = mecab.parse(sent.encode('utf8')).split()
    elif lcode in ['th']:
        words = pythai.split(sent)
    elif lcode in ['vi']:
        words = ViTokenizer.tokenize(sent).split()
    elif lcode in ['zh']:
        words = list(jieba.cut(sent, cut_all=False))
    # elif lcode in ['ar']:
    #     words = segmenter.segment(sent).split()
    else:  # Mostly european languages
        words = sent.split()
    return words
Example 4: _split_doc
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def _split_doc(self, doc):
    """Given a doc, split it into chunks (by paragraph)."""
    curr = []
    curr_len = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue
        # Maybe group paragraphs together until we hit a length limit
        if len(curr) > 0 and curr_len + len(split) > self.GROUP_LENGTH:
            yield ' '.join(curr)
            curr = []
            curr_len = 0
        curr.append(split)
        curr_len += len(split)
    if len(curr) > 0:
        yield ' '.join(curr)
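Unlike Example 1, this variant is a generator method that yields every chunk. A minimal sketch of how it might be consumed (with regex imported as noted above), using a stand-in class; the real class is a retriever agent whose GROUP_LENGTH is configured elsewhere.

class _Chunker:
    GROUP_LENGTH = 40  # assumed value for illustration

_Chunker._split_doc = _split_doc  # attach the method defined above

doc = "Short one.\n\nAnother short paragraph.\n\nA third paragraph here."
print(list(_Chunker()._split_doc(doc)))
# ['Short one. Another short paragraph.', 'A third paragraph here.']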
Example 5: search_docs
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def search_docs(inputs, max_ex=5, opts=None):
    """Given a set of document ids (returned by ranking for a question), search
    for top N best matching (by heuristic) paragraphs that contain the answer.
    """
    if not opts:
        raise RuntimeError('Options dict must be supplied.')
    doc_ids, q_tokens, answer = inputs
    examples = []
    for i, doc_id in enumerate(doc_ids):
        for j, paragraph in enumerate(re.split(r'\n+', fetch_text(doc_id))):
            found = find_answer(paragraph, q_tokens, answer, opts)
            if found:
                # Reverse ranking, giving priority to early docs + paragraphs
                score = (found[0], -i, -j, random.random())
                if len(examples) < max_ex:
                    heapq.heappush(examples, (score, found[1]))
                else:
                    heapq.heappushpop(examples, (score, found[1]))
    return [e[1] for e in examples]
Example 6: extract_names
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def extract_names(sender):
    """Tries to extract sender's names from `From:` header.

    It could extract not only the actual names but e.g.
    the name of the company, parts of email, etc.

    >>> extract_names('Sergey N. Obukhov <serobnic@mail.ru>')
    ['Sergey', 'Obukhov', 'serobnic']
    >>> extract_names('')
    []
    """
    sender = to_unicode(sender, precise=True)
    # Remove non-alphabetical characters
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
    # words like `ru`, `gmail`, `com`, `org`, etc.
    sender = [word for word in sender.split() if len(word) > 1 and
              not word in BAD_SENDER_NAMES]
    # Remove duplicates
    names = list(set(sender))
    return names
Example 7: capitalized_words_percent
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def capitalized_words_percent(s):
    '''Returns capitalized words percent.'''
    s = to_unicode(s, precise=True)
    words = re.split('\s', s)
    words = [w for w in words if w.strip()]
    words = [w for w in words if len(w) > 2]
    capitalized_words_counter = 0
    valid_words_counter = 0
    for word in words:
        if not INVALID_WORD_START.match(word):
            valid_words_counter += 1
            if word[0].isupper() and not word[1].isupper():
                capitalized_words_counter += 1
    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter
    return 0
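A note on the first split: the pattern matches a single whitespace character, so consecutive spaces produce empty strings in the result, which is exactly what the w.strip() filter removes. A quick standalone illustration:

import regex as re

words = re.split(r'\s', 'Hello  World again')
print(words)                            # ['Hello', '', 'World', 'again']
print([w for w in words if w.strip()])  # ['Hello', 'World', 'again']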
Example 8: add_cmdline_args
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def add_cmdline_args(argparser):
    """
    Add command-line arguments specifically for this agent.
    """
    agent = argparser.add_argument_group('RetrieverReader Arguments')
    agent.add_argument('--retriever-model-file', type=str, default=None)
    agent.add_argument('--reader-model-file', type=str, default=None)
    agent.add_argument(
        '--num-retrieved', type=int, default=5, help='how many passages to retrieve'
    )
    agent.add_argument(
        '--split-paragraphs',
        type='bool',
        default=True,
        help='Whether to split the retrieved passages into paragraphs',
    )
    return agent
Example 9: _split_doc
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def _split_doc(self, doc):
    """
    Given a doc, split it into chunks (by paragraph).
    """
    GROUP_LENGTH = 0
    docs = []
    curr = []
    curr_len = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue
        # Maybe group paragraphs together until we hit a length limit
        if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH:
            # yield ' '.join(curr)
            docs.append(' '.join(curr))
            curr = []
            curr_len = 0
        curr.append(split)
        curr_len += len(split)
    if len(curr) > 0:
        # yield ' '.join(curr)
        docs.append(' '.join(curr))
    return docs
Example 10: ascii_emoticons
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def ascii_emoticons(index, question, answer):
    global valid_emoticon
    valid_emoticon = False

    # Disabled
    if score_settings['ascii_emoticon_modifier_value'] is None:
        return 0

    # Split by words (tokens)
    tokens = answer.split()

    # Calculate emoticon score
    score = [1 if len(token) > 1 and len(re.findall('[^a-zA-Z0-9]', token)) / len(token) > score_settings['ascii_emoticon_non_char_to_all_chars_ratio'] else 0 for token in tokens]
    score = sum([1 if (index > 0 and score[index - 1] == 0 and value == 1) or (index == 0 and value == 1) else 0 for index, value in enumerate(score)]) * score_settings['ascii_emoticon_modifier_value']

    if score:
        valid_emoticon = True

    return score

# Check if sentence includes 'unk' token
Example 11: split_by
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def split_by(self, pattern: str = "\n", trim=True):
    """Split a string by the given pattern

    Args:
        pattern (str, optional): Pattern to split by. Defaults to '\\n'.
        trim (bool, optional): Trim whitespace after split. Defaults to True

    Returns:
        Chepy: The Chepy object.
    """
    if trim:
        self.state = list(
            map(pydash.trim, re.split(pattern, self._convert_to_str()))
        )
    else:
        self.state = re.split(pattern, self._convert_to_str())
    return self
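Outside of Chepy, the body of this method boils down to regex.split plus per-piece whitespace trimming. A standalone sketch of the same idea (pydash is assumed to be installed, as it is in Chepy itself):

import pydash
import regex as re

data = "line one \n  line two\nline three  "

print(re.split("\n", data))
# ['line one ', '  line two', 'line three  ']
print(list(map(pydash.trim, re.split("\n", data))))
# ['line one', 'line two', 'line three']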
Example 12: _split_doc
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def _split_doc(self, doc):
    """Given a doc, split it into chunks (by paragraph)."""
    GROUP_LENGTH = 0
    docs = []
    curr = []
    curr_len = 0
    for split in regex.split(r'\n+', doc):
        split = split.strip()
        if len(split) == 0:
            continue
        # Maybe group paragraphs together until we hit a length limit
        if len(curr) > 0 and curr_len + len(split) > GROUP_LENGTH:
            # yield ' '.join(curr)
            docs.append(' '.join(curr))
            curr = []
            curr_len = 0
        curr.append(split)
        curr_len += len(split)
    if len(curr) > 0:
        # yield ' '.join(curr)
        docs.append(' '.join(curr))
    return docs
Example 13: _process_text_line
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def _process_text_line(self, text):
    split_text = [token for token in new_regex.split(self.tokenisation_pattern, text)
                  if token != '']
    if self.replace_whitespace:
        new_text = []
        for token in split_text:
            if token.isspace():
                new_text.append(self.replace_whitespace)
            else:
                new_text.append(token)
        split_text = new_text
    split_text = [token.strip(u' ') for token in split_text]  ## prevent multiple spaces
    split_text = [token for token in split_text if token != u'']  ## prevent multiple spaces
    split_text = [token.lower() for token in split_text]  ## lowercase
    text = ' '.join(split_text)
    return text
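The behaviour here depends on self.tokenisation_pattern, which is configured elsewhere in the source; the pattern below is only a guess at the kind of pattern used. The point it illustrates is that wrapping the separators in a capturing group makes regex.split keep them as tokens, which is why the method then filters out empty strings and maps whitespace tokens to a replacement symbol.

import regex as new_regex

# Hypothetical tokenisation pattern: capture whitespace runs and punctuation
# so that the separators themselves survive the split.
tokenisation_pattern = r'(\s+|[.,!?])'

tokens = [t for t in new_regex.split(tokenisation_pattern, "Hello, world!  Bye.") if t != '']
print(tokens)
# ['Hello', ',', ' ', 'world', '!', '  ', 'Bye', '.']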
Example 14: setup
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def setup(*args, **kwargs):
    try:
        global binarization
        global pageseg
        global rpred
        global models
        global mod_db
        from kraken import binarization
        from kraken import pageseg
        from kraken import rpred
        from kraken.lib import models
        # pronn/clstm models get prioritized over pyrnn ones
        mod_db = {k: storage.get_abs_path(*v) for k, v in nidaba_cfg['ocropus_models'].iteritems()}
        if kwargs.get('modeldata'):
            md = kwargs.get('modeldata')
            if isinstance(md, list):
                md = storage.get_abs_path(md)
            for model in glob.glob(md + '/*/*/DESCRIPTION'):
                with open(model) as fp:
                    meta = json.load(fp)
                mod_db[model.split('/')[-2]] = os.path.join(os.path.dirname(model), meta['name'])
        ocr_kraken.arg_values['model'] = mod_db.keys()
    except ImportError as e:
        raise NidabaPluginException(e.message)
Example 15: extract_unencoded_urls
# Required import: import regex [as alias]
# Or: from regex import split [as alias]
def extract_unencoded_urls(data, refang=False, strip=False):
    """Extract only unencoded URLs.

    :param data: Input text
    :param bool refang: Refang output?
    :param bool strip: Strip possible garbage from the end of URLs
    :rtype: Iterator[:class:`str`]
    """
    unencoded_urls = itertools.chain(
        GENERIC_URL_RE.finditer(data),
        BRACKET_URL_RE.finditer(data),
        BACKSLASH_URL_RE.finditer(data),
    )
    for url in unencoded_urls:
        if refang:
            url = refang_url(url.group(1))
        else:
            url = url.group(1)

        if strip:
            url = re.split(URL_SPLIT_STR, url)[0]

        yield url
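URL_SPLIT_STR is a module-level pattern defined elsewhere in the source; the pattern below is only a stand-in to show how the re.split(...)[0] idiom chops trailing garbage off a matched URL.

import regex as re

URL_SPLIT_STR = r'[>"),]'  # stand-in pattern; the real one lives in the source module

url = 'http://example.com/path">click here'
print(re.split(URL_SPLIT_STR, url)[0])
# http://example.com/path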