当前位置: 首页>>代码示例>>Python>>正文


Python DocumentFeature.recompile_pattern方法代码示例

本文整理汇总了Python中discoutils.tokens.DocumentFeature.recompile_pattern方法的典型用法代码示例。如果您正苦于以下问题:Python DocumentFeature.recompile_pattern方法的具体用法?Python DocumentFeature.recompile_pattern怎么用?Python DocumentFeature.recompile_pattern使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在discoutils.tokens.DocumentFeature的用法示例。


在下文中一共展示了DocumentFeature.recompile_pattern方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_with_different_separators

# 需要导入模块: from discoutils.tokens import DocumentFeature [as 别名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 别名]
def test_with_different_separators():
    """Check that ``DocumentFeature.from_string`` honours custom separators.

    ``recompile_pattern`` is invoked on the class, so the separators it sets stay in effect for
    every later ``from_string`` call. The defaults are therefore restored in a ``finally`` block,
    so a failing assertion here cannot change the outcome of tests that run afterwards.
    """
    try:
        # PoS separator '_' and n-gram separator '!'
        DocumentFeature.recompile_pattern(pos_separator='_', ngram_separator='!')
        assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
               DocumentFeature.from_string('very_RB!big_J')

        # PoS separator '-' and a single space between the tokens of an n-gram
        DocumentFeature.recompile_pattern(pos_separator='-', ngram_separator=' ')
        assert DocumentFeature('1-GRAM', (Token('very', 'RB'),)) == DocumentFeature.from_string('very-RB')
        assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
               DocumentFeature.from_string('very-RB big-J')
    finally:
        # go back to the default separators, whatever happened above
        DocumentFeature.recompile_pattern()
开发者ID:mbatchkarov,项目名称:DiscoUtils,代码行数:11,代码来源:test_token.py

示例2: test_document_feature_slicing

# 需要导入模块: from discoutils.tokens import DocumentFeature [as 别名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 别名]
def test_document_feature_slicing():
    """Indexing or slicing a feature must compare equal to the feature parsed from the matching text."""
    DocumentFeature.recompile_pattern()
    parse = DocumentFeature.from_string

    # a two-token feature: each position and the full slice
    bigram = parse('big/J_cat/N')
    assert bigram[0] == parse('big/J')
    assert bigram[1] == parse('cat/N')
    assert bigram[1] == DocumentFeature('1-GRAM', (Token('cat', 'N', 1), ))
    assert bigram[0:] == parse('big/J_cat/N')

    # a single-token feature: x[0], x[0:] and x[:] must all give the same feature back
    unigram = parse('cat/N')
    for key in (0, slice(0, None), slice(None)):
        assert unigram[key] == parse('cat/N')
开发者ID:mbatchkarov,项目名称:DiscoUtils,代码行数:14,代码来源:test_token.py

示例3: test_smart_lower

# 需要导入模块: from discoutils.tokens import DocumentFeature [as 别名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 别名]
def test_smart_lower():
    """Check that ``DocumentFeature.smart_lower`` lowercases words but leaves PoS tags alone.

    The first group of assertions uses the default separators (``/`` and ``_``), so the pattern is
    reset up front instead of relying on whatever a previously run test left behind. The defaults
    are restored in a ``finally`` block so the custom separators used below do not leak into
    later tests.
    """
    DocumentFeature.recompile_pattern()
    try:
        # test that the PoS of an n-gram entry is not lowercased
        assert DocumentFeature.smart_lower('Cat/N') == 'cat/N'
        assert DocumentFeature.smart_lower('Cat/n') == 'cat/n'
        assert DocumentFeature.smart_lower('Red/J_CaT/N') == 'red/J_cat/N'
        # test that features are not touched
        assert DocumentFeature.smart_lower('amod-DEP:former', lowercasing=False) == 'amod-DEP:former'

        # custom n-gram separator
        DocumentFeature.recompile_pattern(ngram_separator=' ')
        assert DocumentFeature.smart_lower('Red/J CaT/N') == 'red/J cat/N'

        # custom PoS separator
        DocumentFeature.recompile_pattern(pos_separator='-')
        assert DocumentFeature.smart_lower('Red-J') == 'red-J'
    finally:
        # go back to the default separators, whatever happened above
        DocumentFeature.recompile_pattern()
开发者ID:mbatchkarov,项目名称:DiscoUtils,代码行数:15,代码来源:test_token.py

示例4: test_document_feature_from_string

# 需要导入模块: from discoutils.tokens import DocumentFeature [as 别名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 别名]
def test_document_feature_from_string():
    """Check the feature type and tokens that ``DocumentFeature.from_string`` produces.

    Covers well-formed inputs (AN, VO, NN, SVO, 1/2/3-GRAM, PoS-less bigram) and a list of
    malformed inputs, each of which must parse to the ``EMPTY`` feature.
    """
    DocumentFeature.recompile_pattern()
    x = DocumentFeature.from_string('big/J_cat/N')
    y = DocumentFeature('AN', (Token('big', 'J'), Token('cat', 'N')))
    assert y == x

    assert DocumentFeature('1-GRAM', (Token('cat', 'N'), )) == DocumentFeature.from_string('cat/N')

    assert DocumentFeature('VO', (Token('chase', 'V'), Token('cat', 'N'))) == \
           DocumentFeature.from_string('chase/V_cat/N')

    assert DocumentFeature('NN', (Token('dog', 'N'), Token('cat', 'N'))) == \
           DocumentFeature.from_string('dog/N_cat/N')

    # lower-case PoS tags in the input must give the same feature
    assert DocumentFeature('NN', (Token('dog', 'N'), Token('cat', 'N'))) == \
           DocumentFeature.from_string('dog/n_cat/n')

    assert DocumentFeature('3-GRAM', (Token('dog', 'V'), Token('chase', 'V'), Token('cat', 'V'))) == \
           DocumentFeature.from_string('dog/V_chase/V_cat/V')

    assert DocumentFeature('2-GRAM', (Token('chase', 'V'), Token('cat', 'V'))) == \
           DocumentFeature.from_string('chase/V_cat/V')

    assert DocumentFeature('SVO', (Token('dog', 'N'), Token('chase', 'V'), Token('cat', 'N'))) == \
           DocumentFeature.from_string('dog/N_chase/V_cat/N')

    assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
           DocumentFeature.from_string('very/RB_big/J')

    # tokens without any PoS tag
    assert DocumentFeature('2-GRAM', (Token('very', None), Token('big', None))) == \
           DocumentFeature.from_string('very_big')

    # Raw literals are used wherever the text contains a backslash: `\/`, `\_` and `\o` are not
    # valid escape sequences, so the non-raw spelling triggers an invalid-escape warning on modern
    # Python while producing exactly the same characters.
    # NOTE(review): '[email protected]/N' looks like an e-mail address that was obfuscated when
    # this snippet was scraped -- confirm against the upstream test file.
    invalid_strings = [r'a\/s/N', r'l\/h/N_clinton\/south/N', r'l\/h//N_clinton\/south/N',
                       'l//fasdlj/fasd/dfs/sdf', r'l//fasdlj/fasd/dfs\_/sdf', r'dfs\_/sdf',
                       r'dfs\_/fadslk_/sdf', r'/_dfs\_/sdf', '_/_/', '_///f_/',
                       'drop/V_bomb', '/V_/N', 'word1_word2//', 'mk8/N_6hp/N',
                       'a./N_gordon/N', 'great/J_c.d./N', '[email protected]/N', 'w1/N',
                       '-lrb-306-rrb- 569-1995/N', 'mumaharps.com/N', 'c+l+a+v+i+e+r+/N',
                       r'b/N_o\o/N', '%/N', '|/V', '-lrb-852-rrb- 2829 6281/N']
    empty_feature = DocumentFeature('EMPTY', tuple())
    for invalid_string in invalid_strings:
        # the message names the offending input, replacing the former debugging print
        assert empty_feature == DocumentFeature.from_string(invalid_string), \
            'expected an EMPTY feature for %r' % invalid_string
开发者ID:mbatchkarov,项目名称:DiscoUtils,代码行数:43,代码来源:test_token.py

示例5: from_tsv

# 需要导入模块: from discoutils.tokens import DocumentFeature [as 别名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 别名]
    def from_tsv(cls, tsv_file, sim_threshold=0, include_self=False,
                 lowercasing=False, ngram_separator='_', pos_separator='/', allow_lexical_overlap=True,
                 row_filter=lambda x, y: True, column_filter=lambda x: True, max_len=50,
                 max_neighbours=1e8, merge_duplicates=False, immutable=True,
                 enforce_word_entry_pos_format=True, **kwargs):
        """
        Create a Thesaurus by parsing a Byblo-compatible TSV file (events or sims).
        If duplicate values are encountered during parsing, only the latest will be kept.

        Each line is expected to hold an entry followed by (neighbour, similarity) pairs, all
        tab-separated. Lines with an even number of fields are skipped with a warning.

        :param tsv_file: path to input TSV file. May be gzipped.
        :type tsv_file:  str
        :param sim_threshold: min similarity between an entry and its neighbour for the neighbour to be included.
            The comparison is strict: a neighbour is kept only if its similarity is greater than this value.
        :type sim_threshold: float
        :param include_self: whether to include self as nearest neighbour (inserted first, with similarity 1.0).
        :type include_self: bool
        :param lowercasing: if true, most of what is read will be lowercased (excluding PoS tags), so
            Cat/N -> cat/N. This is desirable when reading thesauri with this class. If False, no lowercasing
            will take place. This might be desirable when reading feature lists or already lowercased neighbour
            lists. FET + Byblo thesauri are already lowercased.
        :type lowercasing: bool
        :param ngram_separator: When n-gram entries are read in, what are the individual tokens separated by
        :param pos_separator: what separates a word from its PoS tag, e.g. the `/` in `cat/N`. Passed, together
        with `ngram_separator`, to `DocumentFeature.recompile_pattern`
        :param column_filter: A function that takes a string (column in the file) and returns whether or not
        the string should be kept
        :param row_filter: takes a string and its corresponding DocumentFeature and determines if it should be loaded.
        If `enforce_word_entry_pos_format` is `False`, the second parameter to this function will be `None`
        :param allow_lexical_overlap: whether neighbours/features are allowed to overlap lexically with the entry
        they are neighbours/features of. NOTE: THE BEHAVIOUR OF THIS PARAMETER IS SLIGHTLY DIFFERENT FROM THE EQUIVALENT
        IN VECTORS. SEE COMMENT THERE.
        :param max_len: maximum length (in characters) of permissible **entries**. Longer entries are ignored.
        :param max_neighbours: maximum neighbours per entry. This is applied AFTER the filtering defined by
        column_filter and allow_lexical_overlap is finished.
        :param merge_duplicates: whether to raise an error if multiple entries exist, or concatenate/add them together.
        The former is appropriate for `Thesaurus`, and the latter for `Vectors`
        :param immutable: not used in the part of the method shown here -- TODO document once confirmed
        :param enforce_word_entry_pos_format: if true, entries that are not in a `word/POS` format are skipped. This
        must be true for `allow_lexical_overlap` to work.
        :raises ValueError: if `tsv_file` is empty/None, or if `allow_lexical_overlap` is False while
        `enforce_word_entry_pos_format` is also False
        """

        if not tsv_file:
            raise ValueError("No thesaurus specified")

        # this is a call on the DocumentFeature class itself, so the separators stay in effect
        # for later DocumentFeature parsing as well, not just for this load
        DocumentFeature.recompile_pattern(pos_separator=pos_separator, ngram_separator=ngram_separator)
        to_return = dict()
        logging.info('Loading thesaurus %s from disk', tsv_file)

        if not allow_lexical_overlap:
            logging.warning('DISALLOWING LEXICAL OVERLAP')

        if not allow_lexical_overlap and not enforce_word_entry_pos_format:
            raise ValueError('allow_lexical_overlap requires entries to be converted to a DocumentFeature. '
                             'Please enable enforce_word_entry_pos_format')
        # marker string; entries and neighbours equal to it are not loaded
        FILTERED = '___FILTERED___'.lower()

        gzipped = is_gzipped(tsv_file)
        if gzipped:
            logging.info('Attempting to read a gzipped file')
            # gzip.open defaults to binary mode, hence the explicit decoding of each line below
            fhandle = gzip.open(tsv_file)
        else:
            fhandle = open(tsv_file)

        with fhandle as infile:
            # NOTE(review): readlines() loads the whole file into memory before iterating
            for line in infile.readlines():
                if gzipped:
                    # this is a byte stream, needs to be decoded
                    tokens = line.decode('UTF8').strip().split('\t')
                else:
                    tokens = line.strip().split('\t')

                if len(tokens) % 2 == 0:
                    # must have an odd number of things, one for the entry
                    # and pairs for (neighbour, similarity)
                    logging.warning('Skipping dodgy line in thesaurus file: %s\n %s', tsv_file, line)
                    continue

                # NOTE(review): the entry is compared to FILTERED as-is, whereas neighbours are
                # lowercased before the same comparison further down -- confirm this is intended
                if tokens[0] != FILTERED:
                    key = DocumentFeature.smart_lower(tokens[0], lowercasing)
                    # only parse the entry into a DocumentFeature when the word/POS format is enforced
                    dfkey = DocumentFeature.from_string(key) if enforce_word_entry_pos_format else None

                    if enforce_word_entry_pos_format and dfkey.type == 'EMPTY':
                        # do not load things in the wrong format, they'll get in the way later
                        # logging.warning('%s is not in the word/POS format, skipping', tokens[0])
                        continue

                    if (not row_filter(key, dfkey)) or len(key) > max_len:
                        logging.debug('Skipping entry for %s', key)
                        continue

                    # presumably walk_nonoverlapping_pairs(tokens, 1) yields consecutive
                    # (neighbour, similarity) pairs starting after the entry -- verify against the helper
                    to_insert = [(DocumentFeature.smart_lower(word, lowercasing), float(sim))
                                 for (word, sim) in walk_nonoverlapping_pairs(tokens, 1)
                                 if word.lower() != FILTERED and column_filter(word) and float(sim) > sim_threshold]

                    if not allow_lexical_overlap:
                        to_insert = cls.remove_overlapping_neighbours(dfkey, to_insert)

                    # NOTE(review): the default max_neighbours is the float 1e8; slicing a list with a
                    # float raises TypeError, so this only works with an int (the default is simply
                    # never reached unless an entry has more than 1e8 neighbours)
                    if len(to_insert) > max_neighbours:
                        to_insert = to_insert[:max_neighbours]

                    if include_self:
                        to_insert.insert(0, (key, 1.0))

                    # the steps above may filter out all neighbours of an entry. if this happens,
#.........这里部分代码省略.........
开发者ID:tttthomasssss,项目名称:DiscoUtils,代码行数:103,代码来源:thesaurus_loader.py

示例6: test_token_to_string

# 需要导入模块: from discoutils.tokens import DocumentFeature [as 别名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 别名]
def test_token_to_string():
    """Check that ``str()`` of a token/feature uses the currently configured PoS separator.

    The pattern is reset first so the ``dog/J`` assertion does not depend on which separators an
    earlier test left active. The defaults are restored in a ``finally`` block so the cleanup
    also happens when an assertion fails.
    """
    DocumentFeature.recompile_pattern()
    try:
        assert 'dog/J' == str(DocumentFeature.from_string('dog/J').tokens[0])
        DocumentFeature.recompile_pattern(pos_separator='-')
        my_feature = DocumentFeature.from_string('dog-J')
        assert 'dog-J' == str(my_feature)
    finally:
        # go back to the default separators, whatever happened above
        DocumentFeature.recompile_pattern()
开发者ID:mbatchkarov,项目名称:DiscoUtils,代码行数:8,代码来源:test_token.py


注:本文中的discoutils.tokens.DocumentFeature.recompile_pattern方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。