This article collects typical usage examples of the Python method LTTL.Segmenter.tokenize. If you are unsure what Segmenter.tokenize does or how to call it in practice, the curated code examples below should help; you can also explore further usage examples of the containing module, LTTL.Segmenter.
The following presents 15 code examples of the Segmenter.tokenize method, sorted by popularity by default.
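Pieced together from the examples below, a typical call passes a segmentation plus a list of (compiled regex, mode[, annotations]) tuples, with optional keyword arguments. The following is a minimal sketch inferred from these tests rather than the authoritative LTTL API; the Input import path and the sample text are assumptions made for illustration.

import re
from LTTL.Input import Input      # assumed import path for LTTL's Input class
from LTTL import Segmenter

# Illustrative sketch, inferred from the tests below (not official documentation).
input_seg = Input("un texte")                 # hypothetical input text
word_seg = Segmenter.tokenize(
    input_seg,
    [(re.compile(r'\w+'), 'tokenize')],       # 'tokenize' keeps the regex matches as segments
    import_annotations=False,                 # optional keyword argument, seen in Example 15
)
print([s.get_content() for s in word_seg])    # should print something like ['un', 'texte']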
Example 1: test_tokenize_exception_mode
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_exception_mode(self):
    """Does tokenize raise exception for unknown mode?"""
    with self.assertRaises(
        ValueError,
        msg="tokenize doesn't raise exception for unknown mode!"
    ):
        Segmenter.tokenize(
            self.entire_text_seg,
            [(re.compile(r'\W+'), 'unknown_mode')],
        )
Example 2: test_tokenize_progress
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_progress(self):
    """Does tokenize track progress?"""
    def progress_callback():
        """Mock progress callback"""
        self.count += 1
    Segmenter.tokenize(
        self.word_seg,
        [(re.compile(r'\w'), 'tokenize')],
        progress_callback=progress_callback,
    )
    self.assertEqual(
        self.count,
        len(self.word_seg),
        msg="tokenize doesn't track progress!"
    )
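Judging from the assertion, progress_callback appears to be invoked once per input segment (the counter ends up equal to len(self.word_seg), not to the number of regex matches); this reading is inferred from the test rather than from LTTL's documentation.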
Example 3: test_tokenize_create_static_annotations_split
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_create_static_annotations_split(self):
    """Does tokenize create static annotations (mode split)?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [(re.compile(r'\W'), 'split', {'c': '3'})],
    )
    self.assertEqual(
        [s.annotations['c'] for s in segmentation],
        ['3', '3'],
        msg="tokenize doesn't create static annotations (mode split)!"
    )
Example 4: test_tokenize_import_annotations_false_split
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_import_annotations_false_split(self):
    """Does tokenize skip importing annotations (mode split)?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [(re.compile(r'a'), 'split')],
        import_annotations=False
    )
    self.assertFalse(
        'a' in segmentation[0].annotations,
        msg="tokenize doesn't skip importing annotations (mode split)!"
    )
Example 5: test_tokenize_import_annotations_split
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_import_annotations_split(self):
    """Does tokenize import annotations (mode split)?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [(re.compile(r'a'), 'split')],
    )
    self.assertEqual(
        segmentation[0].annotations['a'],
        '1',
        msg="tokenize doesn't import annotations (mode split)!"
    )
Example 6: test_tokenize_import_annotations_tokenize
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_import_annotations_tokenize(self):
    """Does tokenize import annotations (mode tokenize)?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [(re.compile(r'\w{2}'), 'tokenize')],
        import_annotations=True
    )
    self.assertEqual(
        segmentation[0].annotations['a'],
        '1',
        msg="tokenize doesn't import annotations (mode tokenize)!"
    )
Example 7: test_tokenize_create_dynamic_annotations_tokenize
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_create_dynamic_annotations_tokenize(self):
    """Does tokenize create dynamic annotations (mode tokenize)?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [
            (re.compile(r'\w(\w)(\w)'), 'tokenize', {'&1': '&2'}),
        ],
    )
    self.assertEqual(
        segmentation[0].annotations['d'],
        'e',
        msg="tokenize doesn't create dynamic annotations (mode tokenize)!"
    )
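The {'&1': '&2'} syntax appears to build the annotation from the regex capture groups: for a three-letter match such as 'cde', group 1 supplies the key ('d') and group 2 the value ('e'), which is exactly what the assertion checks. This reading is inferred from the test itself rather than from LTTL's documentation.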
Example 8: test_tokenize_sort
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_sort(self):
    """Does tokenize sort output segments?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [
            (re.compile(r'\w'), 'tokenize'),
            (re.compile(r'[ae]'), 'tokenize'),
        ],
    )
    self.assertEqual(
        [s.get_content() for s in segmentation],
        ['a', 'a', 'b', 'c', 'd', 'e', 'e'],
        msg="tokenize doesn't sort output segments!"
    )
Example 9: test_tokenize_segment_tokenize
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_segment_tokenize(self):
    """Does tokenize tokenize input?"""
    segmentation = Segmenter.tokenize(
        self.entire_text_seg,
        [
            (re.compile(r'\w+'), 'tokenize'),
            (re.compile(r'\w{3,}'), 'tokenize'),
        ],
    )
    self.assertEqual(
        [s.get_content() for s in segmentation],
        ['ab', 'cde', 'cde'],
        msg="tokenize doesn't tokenize input!"
    )
Example 10: test_tokenize_segment_split
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_segment_split(self):
    """Does tokenize split input?"""
    segmentation = Segmenter.tokenize(
        self.entire_text_seg,
        [
            (re.compile(r'\W+'), 'split'),
            (re.compile(r'd'), 'split'),
        ],
    )
    self.assertEqual(
        [s.get_content() for s in segmentation],
        ['ab', 'ab c', 'cde', 'e'],
        msg="tokenize doesn't split input!"
    )
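Taken together, Examples 9 and 10 suggest the difference between the two modes: 'tokenize' keeps the regex matches themselves as segments, while 'split' uses the matches as delimiters and keeps the material between them. Hence, on the input 'ab cde', splitting on \W+ yields 'ab' and 'cde', while splitting on 'd' yields 'ab c' and 'e'; the combined output is then sorted by position.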
Example 11: test_tokenize_solve_conflicts_merge_duplicates
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_solve_conflicts_merge_duplicates(self):
    """Does tokenize solve conflicts when merging duplicates?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [
            (re.compile(r'\w+'), 'tokenize', {'a': '10'}),
            (re.compile(r'\W+'), 'split', {'a': '20'}),
        ],
        merge_duplicates=True,
    )
    self.assertEqual(
        segmentation[1].annotations['a'],
        '20',
        msg="tokenize doesn't solve conflicts when merging duplicates!"
    )
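The assertion suggests that when merge_duplicates is set and the same segment is produced by several tuples with conflicting annotation values, the value from the later tuple in the list ('a': '20') wins; again, this is inferred from the test rather than stated by the library.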
Example 12: test_tokenize_merge_duplicates
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_merge_duplicates(self):
    """Does tokenize merge duplicates?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [
            (re.compile(r'\w+'), 'tokenize'),
            (re.compile(r'\W+'), 'split'),
        ],
        merge_duplicates=True,
    )
    self.assertEqual(
        [s.get_content() for s in segmentation],
        ['ab', 'cde'],
        msg="tokenize doesn't merge duplicates!"
    )
Example 13: test_tokenize_autonumber
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def test_tokenize_autonumber(self):
    """Does tokenize autonumber input segments?"""
    segmentation = Segmenter.tokenize(
        self.word_seg,
        [
            (re.compile(r'\w+'), 'tokenize'),
            (re.compile(r'\W+'), 'split'),
        ],
        auto_number_as='num'
    )
    self.assertEqual(
        [s.annotations['num'] for s in segmentation],
        [1, 2, 3, 4],
        msg="tokenize doesn't autonumber input segments!"
    )
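The auto_number_as='num' argument appears to attach a sequential integer annotation (starting at 1) under the given key to each output segment, as the expected list [1, 2, 3, 4] indicates.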
Example 14: main
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def main():
    input_seg = Input("un texte")
    verbatim_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'.+'), 'tokenize')],
    )
    # verbatim in input = ok
    print("verbatim in input:", end=" ")
    contained_segments = input_seg[0].get_contained_segments(verbatim_seg)
    try:
        print("ok" if contained_segments[0].get_content() == 'un texte' else "fail")
    except Exception:
        print("fail")
    # verbatim in verbatim = ok
    print("verbatim in verbatim:", end=" ")
    contained_segments = verbatim_seg[0].get_contained_segments(verbatim_seg)
    try:
        print("ok" if contained_segments[0].get_content() == 'un texte' else "fail")
    except Exception:
        print("fail")
    # input in verbatim = fail
    print("input in verbatim:", end=" ")
    contained_segments = verbatim_seg[0].get_contained_segments(input_seg)
    try:
        print("ok" if contained_segments[0].get_content() == 'un texte' else "fail")
    except Exception:
        print("fail")
    # input in input = fail
    print("input in input:", end=" ")
    contained_segments = input_seg[0].get_contained_segments(input_seg)
    try:
        print("ok" if contained_segments[0].get_content() == 'un texte' else "fail")
    except Exception:
        print("fail")
Example 15: setUp
# Required import: from LTTL import Segmenter
# Or: from LTTL.Segmenter import tokenize
def setUp(self):
    input_seg = Input("un texte")
    word_seg = Segmenter.tokenize(
        input_seg,
        [(re.compile(r'\w+'), 'tokenize')],
        import_annotations=False,
    )
    letter_seg = Segmenter.tokenize(
        input_seg,
        [
            (re.compile(r'\w'), 'tokenize', {'type': 'C'}),
            (re.compile(r'[aeiouy]'), 'tokenize', {'type': 'V'}),
        ],
        import_annotations=False,
        merge_duplicates=True,
    )
    vowel_seg, consonant_seg = Segmenter.select(
        letter_seg,
        re.compile(r'V'),
        annotation_key='type',
    )
    # Create the cooccurrence matrix for cooccurrence in window
    # with window_size=3 and without annotation (woa):
    self.window_woa_row_ids = ['u', 'n', 't', 'e', 'x']
    self.window_woa_col_ids = ['u', 'n', 't', 'e', 'x']
    self.window_woa_values = {
        ('u', 'u'): 1,
        ('u', 'n'): 1,
        ('u', 't'): 1,
        ('u', 'e'): 0,
        ('u', 'x'): 0,
        ('n', 'u'): 1,
        ('n', 'n'): 2,
        ('n', 't'): 2,
        ('n', 'e'): 1,
        ('n', 'x'): 0,
        ('t', 'u'): 1,
        ('t', 'n'): 2,
        ('t', 't'): 5,
        ('t', 'e'): 4,
        ('t', 'x'): 3,
        ('e', 'u'): 0,
        ('e', 'n'): 1,
        ('e', 't'): 4,
        ('e', 'e'): 4,
        ('e', 'x'): 3,
        ('x', 'u'): 0,
        ('x', 'n'): 0,
        ('x', 't'): 3,
        ('x', 'e'): 3,
        ('x', 'x'): 3,
    }
    self.window_woa_header_row_id = '__unit__'
    self.window_woa_header_row_type = 'string'
    self.window_woa_header_col_id = '__unit2__'
    self.window_woa_header_col_type = 'string'
    self.window_woa_col_type = {
        col_id: 'continuous' for col_id in self.window_woa_col_ids
    }
    self.window_woa_ref = IntPivotCrosstab(
        self.window_woa_row_ids,
        self.window_woa_col_ids,
        self.window_woa_values,
        self.window_woa_header_row_id,
        self.window_woa_header_row_type,
        self.window_woa_header_col_id,
        self.window_woa_header_col_type,
        self.window_woa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in window
    # with window_size=3 and with annotation (wa):
    self.window_wa_row_ids = ['C', 'V']
    self.window_wa_col_ids = ['C', 'V']
    self.window_wa_values = {
        ('C', 'C'): 5,
        ('C', 'V'): 5,
        ('V', 'C'): 5,
        ('V', 'V'): 5,
    }
    self.window_wa_header_row_id = '__unit__'
    self.window_wa_header_row_type = 'string'
    self.window_wa_header_col_id = '__unit2__'
    self.window_wa_header_col_type = 'string'
    self.window_wa_col_type = {
        col_id: 'continuous' for col_id in self.window_wa_col_ids
    }
    self.window_wa_ref = IntPivotCrosstab(
        self.window_wa_row_ids,
        self.window_wa_col_ids,
        self.window_wa_values,
        self.window_wa_header_row_id,
        self.window_wa_header_row_type,
        self.window_wa_header_col_id,
        self.window_wa_header_col_type,
        self.window_wa_col_type,
    )
    # Create the cooccurrence matrix for cooccurrence in context
    # without the secondary unit (wos) and without annotation (woa):
    self.context_wos_woa_row_ids = ['u', 'n', 't', 'e', 'x']
    # ......... remainder of this method omitted .........