本文整理匯總了Python中discoutils.tokens.DocumentFeature.recompile_pattern方法的典型用法代碼示例。如果您正苦於以下問題:Python DocumentFeature.recompile_pattern方法的具體用法?Python DocumentFeature.recompile_pattern怎麼用?Python DocumentFeature.recompile_pattern使用的例子?那麼, 這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類discoutils.tokens.DocumentFeature的用法示例。
在下文中一共展示了DocumentFeature.recompile_pattern方法的6個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: test_with_different_separators
# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 別名]
def test_with_different_separators():
    """Check that ``from_string`` honours custom PoS and n-gram separators.

    ``recompile_pattern`` is invoked on the class and affects every later
    ``from_string`` call, so the separators are reset in a ``finally`` clause
    (by calling ``recompile_pattern()`` with no arguments) to keep a failing
    assertion here from leaking custom separators into other tests.
    """
    try:
        # '_' separates word from PoS tag, '!' separates the tokens of an n-gram
        DocumentFeature.recompile_pattern(pos_separator='_', ngram_separator='!')
        assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
            DocumentFeature.from_string('very_RB!big_J')

        # '-' separates word from PoS tag, ' ' separates the tokens of an n-gram
        DocumentFeature.recompile_pattern(pos_separator='-', ngram_separator=' ')
        assert DocumentFeature('1-GRAM', (Token('very', 'RB'),)) == \
            DocumentFeature.from_string('very-RB')
        assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
            DocumentFeature.from_string('very-RB big-J')
    finally:
        # always go back to the default separators, even if an assertion failed
        DocumentFeature.recompile_pattern()
示例2: test_document_feature_slicing
# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 別名]
def test_document_feature_slicing():
    """Indexing and slicing a ``DocumentFeature`` must yield ``DocumentFeature`` objects."""
    DocumentFeature.recompile_pattern()

    # a two-token feature: each index gives the matching single-token feature
    bigram = DocumentFeature.from_string('big/J_cat/N')
    first, second = bigram[0], bigram[1]
    assert first == DocumentFeature.from_string('big/J')
    assert second == DocumentFeature.from_string('cat/N')
    assert second == DocumentFeature('1-GRAM', (Token('cat', 'N', 1), ))
    # an open-ended slice from 0 reproduces the whole feature
    assert bigram[0:] == DocumentFeature.from_string('big/J_cat/N')

    # a single-token feature: index 0 and full slices all equal the feature itself
    unigram = DocumentFeature.from_string('cat/N')
    assert unigram[0] == DocumentFeature.from_string('cat/N')
    assert unigram[0:] == DocumentFeature.from_string('cat/N')
    assert unigram[:] == DocumentFeature.from_string('cat/N')
示例3: test_smart_lower
# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 別名]
def test_smart_lower():
    """Check that ``smart_lower`` lowercases words but leaves PoS tags alone.

    The first assertions rely on the default separators, so the pattern is
    reset up front; the custom separators installed later in the test are
    undone in a ``finally`` clause so they cannot leak into other tests.
    """
    # start from the default separators regardless of what ran before
    DocumentFeature.recompile_pattern()
    try:
        # test that the PoS of an n-gram entry is not lowercased
        assert DocumentFeature.smart_lower('Cat/N') == 'cat/N'
        assert DocumentFeature.smart_lower('Cat/n') == 'cat/n'
        assert DocumentFeature.smart_lower('Red/J_CaT/N') == 'red/J_cat/N'

        # test that features are not touched
        assert DocumentFeature.smart_lower('amod-DEP:former', lowercasing=False) == 'amod-DEP:former'

        # custom n-gram separator
        DocumentFeature.recompile_pattern(ngram_separator=' ')
        assert DocumentFeature.smart_lower('Red/J CaT/N') == 'red/J cat/N'

        # custom PoS separator
        DocumentFeature.recompile_pattern(pos_separator='-')
        assert DocumentFeature.smart_lower('Red-J') == 'red-J'
    finally:
        # restore the default separators even if an assertion failed
        DocumentFeature.recompile_pattern()
示例4: test_document_feature_from_string
# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 別名]
def test_document_feature_from_string():
    """Check the feature type and tokens that ``from_string`` produces.

    Covers the typed features (AN, VO, NN, SVO), plain n-grams, tokens
    without a PoS tag, and a list of malformed strings that must all parse
    to the ``EMPTY`` feature.
    """
    DocumentFeature.recompile_pattern()

    x = DocumentFeature.from_string('big/J_cat/N')
    y = DocumentFeature('AN', (Token('big', 'J'), Token('cat', 'N')))
    assert y == x

    assert DocumentFeature('1-GRAM', (Token('cat', 'N'), )) == DocumentFeature.from_string('cat/N')
    assert DocumentFeature('VO', (Token('chase', 'V'), Token('cat', 'N'))) == \
        DocumentFeature.from_string('chase/V_cat/N')
    assert DocumentFeature('NN', (Token('dog', 'N'), Token('cat', 'N'))) == \
        DocumentFeature.from_string('dog/N_cat/N')
    # same expectation with lower-case PoS tags in the input string
    assert DocumentFeature('NN', (Token('dog', 'N'), Token('cat', 'N'))) == \
        DocumentFeature.from_string('dog/n_cat/n')
    assert DocumentFeature('3-GRAM', (Token('dog', 'V'), Token('chase', 'V'), Token('cat', 'V'))) == \
        DocumentFeature.from_string('dog/V_chase/V_cat/V')
    assert DocumentFeature('2-GRAM', (Token('chase', 'V'), Token('cat', 'V'))) == \
        DocumentFeature.from_string('chase/V_cat/V')
    assert DocumentFeature('SVO', (Token('dog', 'N'), Token('chase', 'V'), Token('cat', 'N'))) == \
        DocumentFeature.from_string('dog/N_chase/V_cat/N')
    assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
        DocumentFeature.from_string('very/RB_big/J')
    assert DocumentFeature('2-GRAM', (Token('very', None), Token('big', None))) == \
        DocumentFeature.from_string('very_big')

    # Raw strings: the backslashes are literal characters of the test input.
    # Writing them as '\/', '\_' or '\o' in a normal literal is an invalid
    # escape sequence (deprecated, SyntaxWarning on recent Python versions);
    # the raw form has exactly the same value.
    # NOTE(review): '[email protected]/N' looks like an e-mail obfuscation
    # placeholder rather than the original input - confirm against upstream.
    invalid_strings = [
        r'a\/s/N', r'l\/h/N_clinton\/south/N', r'l\/h//N_clinton\/south/N',
        'l//fasdlj/fasd/dfs/sdf', r'l//fasdlj/fasd/dfs\_/sdf', r'dfs\_/sdf',
        r'dfs\_/fadslk_/sdf', r'/_dfs\_/sdf', '_/_/', '_///f_/',
        'drop/V_bomb', '/V_/N', 'word1_word2//', 'mk8/N_6hp/N',
        'a./N_gordon/N', 'great/J_c.d./N', '[email protected]/N', 'w1/N',
        '-lrb-306-rrb- 569-1995/N', 'mumaharps.com/N', 'c+l+a+v+i+e+r+/N',
        r'b/N_o\o/N', '%/N', '|/V', '-lrb-852-rrb- 2829 6281/N',
    ]
    empty = DocumentFeature('EMPTY', tuple())
    for invalid_string in invalid_strings:
        # the message names the offending input, replacing the debug print
        assert empty == DocumentFeature.from_string(invalid_string), invalid_string
示例5: from_tsv
# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 別名]
def from_tsv(cls, tsv_file, sim_threshold=0, include_self=False,
lowercasing=False, ngram_separator='_', pos_separator='/', allow_lexical_overlap=True,
row_filter=lambda x, y: True, column_filter=lambda x: True, max_len=50,
max_neighbours=1e8, merge_duplicates=False, immutable=True,
enforce_word_entry_pos_format=True, **kwargs):
"""
Create a Thesaurus by parsing a Byblo-compatible TSV files (events or sims).
If duplicate values are encoutered during parsing, only the latest will be kept.
:param tsv_file: path to input TSV file. May be gzipped.
:type tsv_file: str
:param sim_threshold: min similarity between an entry and its neighbour for the neighbour to be included
:type sim_threshold: float
:param include_self: whether to include self as nearest neighbour.
:type include_self: bool
:param lowercasing: if true, most of what is read will be lowercased (excluding PoS tags), so
Cat/N -> cat/N. This is desirable when reading thesauri with this class. If False, no lowercasing
will take place. This might be desirable when readings feature lists or already lowercased neighbour
lists. FET + Byblo thesauri are already lowercased.
:type lowercasing: bool
:param ngram_separator: When n_gram entries are read in, what are the indidivual tokens separated by
:param column_filter: A function that takes a string (column in the file) and returns whether or not
the string should be kept
:param row_filter: takes a string and its corresponding DocumentFeature and determines if it should be loaded.
If `enforce_word_entry_pos_format` is `False`, the second parameter to this function will be `None`
:param allow_lexical_overlap: whether neighbours/features are allowed to overlap lexically with the entry
they are neighbours/features of. OTE: THE BEHAVIOUR OF THIS PARAMETER IS SLIGHTLY DIFFERENT FROM THE EQUIVALENT
IN VECTORS. SEE COMMENT THERE.
:param max_len: maximum length (in characters) of permissible **entries**. Longer entries are ignored.
:param max_neighbours: maximum neighbours per entry. This is applied AFTER the filtering defined by
column_filter and allow_lexical_overlap is finished.
:param merge_duplicates: whether to raise en error if multiple entries exist, or concatenate/add them together.
The former is appropriate for `Thesaurus`, and the latter for `Vectors`
:param enforce_word_entry_pos_format: if true, entries that are not in a `word/POS` format are skipped. This
must be true for `allow_lexical_overlap` to work.
"""
if not tsv_file:
raise ValueError("No thesaurus specified")
DocumentFeature.recompile_pattern(pos_separator=pos_separator, ngram_separator=ngram_separator)
to_return = dict()
logging.info('Loading thesaurus %s from disk', tsv_file)
if not allow_lexical_overlap:
logging.warning('DISALLOWING LEXICAL OVERLAP')
if not allow_lexical_overlap and not enforce_word_entry_pos_format:
raise ValueError('allow_lexical_overlap requires entries to be converted to a DocumentFeature. '
'Please enable enforce_word_entry_pos_format')
FILTERED = '___FILTERED___'.lower()
gzipped = is_gzipped(tsv_file)
if gzipped:
logging.info('Attempting to read a gzipped file')
fhandle = gzip.open(tsv_file)
else:
fhandle = open(tsv_file)
with fhandle as infile:
for line in infile.readlines():
if gzipped:
# this is a byte steam, needs to be decoded
tokens = line.decode('UTF8').strip().split('\t')
else:
tokens = line.strip().split('\t')
if len(tokens) % 2 == 0:
# must have an odd number of things, one for the entry
# and pairs for (neighbour, similarity)
logging.warning('Skipping dodgy line in thesaurus file: %s\n %s', tsv_file, line)
continue
if tokens[0] != FILTERED:
key = DocumentFeature.smart_lower(tokens[0], lowercasing)
dfkey = DocumentFeature.from_string(key) if enforce_word_entry_pos_format else None
if enforce_word_entry_pos_format and dfkey.type == 'EMPTY':
# do not load things in the wrong format, they'll get in the way later
# logging.warning('%s is not in the word/POS format, skipping', tokens[0])
continue
if (not row_filter(key, dfkey)) or len(key) > max_len:
logging.debug('Skipping entry for %s', key)
continue
to_insert = [(DocumentFeature.smart_lower(word, lowercasing), float(sim))
for (word, sim) in walk_nonoverlapping_pairs(tokens, 1)
if word.lower() != FILTERED and column_filter(word) and float(sim) > sim_threshold]
if not allow_lexical_overlap:
to_insert = cls.remove_overlapping_neighbours(dfkey, to_insert)
if len(to_insert) > max_neighbours:
to_insert = to_insert[:max_neighbours]
if include_self:
to_insert.insert(0, (key, 1.0))
# the steps above may filter out all neighbours of an entry. if this happens,
#.........這裏部分代碼省略.........
示例6: test_token_to_string
# 需要導入模塊: from discoutils.tokens import DocumentFeature [as 別名]
# 或者: from discoutils.tokens.DocumentFeature import recompile_pattern [as 別名]
def test_token_to_string():
    """Check that tokens and features render back to their string form.

    The first assertion relies on the default separators, so the pattern is
    reset up front; the custom PoS separator used afterwards is undone in a
    ``finally`` clause so the reset still happens when an assertion fails.
    """
    # start from the default separators regardless of what ran before
    DocumentFeature.recompile_pattern()
    try:
        assert 'dog/J' == str(DocumentFeature.from_string('dog/J').tokens[0])

        # with '-' as PoS separator the feature is rendered with '-'
        DocumentFeature.recompile_pattern(pos_separator='-')
        my_feature = DocumentFeature.from_string('dog-J')
        assert 'dog-J' == str(my_feature)
    finally:
        # restore the default separators even if an assertion failed
        DocumentFeature.recompile_pattern()