This article collects typical usage examples of the Python class scrapely.extraction.InstanceBasedLearningExtractor. If you are unsure what InstanceBasedLearningExtractor does or how to use it, the curated class examples below should help.
The following shows 15 code examples of InstanceBasedLearningExtractor, sorted by popularity by default.
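All of the examples share one flow: build annotated template pages as HtmlPage objects, pair each template with an item descriptor (or None), hand the pairs to InstanceBasedLearningExtractor, and call extract() on a target HtmlPage. Here is a minimal sketch of that flow, assuming scrapely is installed; the HTML strings and the 'title' field are illustrative inventions, not taken from the examples below:

from scrapely.htmlpage import HtmlPage
from scrapely.extraction import InstanceBasedLearningExtractor

# A template page: the data-scrapy-annotate attribute (HTML-escaped JSON)
# is scrapely's convention for marking which element maps to which field.
template_html = (
    u'<html><body>'
    u'<p data-scrapy-annotate="{&quot;annotations&quot;:'
    u' {&quot;content&quot;: &quot;title&quot;}}">Sample Title</p>'
    u'</body></html>')
target_html = u'<html><body><p>Another Title</p></body></html>'

template = HtmlPage(None, {}, template_html)  # url, headers, body
target = HtmlPage(None, {}, target_html)

# Each entry is a (template_page, descriptor) pair; the descriptor may be None.
ex = InstanceBasedLearningExtractor([(template, None)])

# extract() returns (items, matched_template); items is a list of dicts,
# or None if no template matched -- hence the pervasive extract(page)[0]
# indexing in the examples below.
items, _ = ex.extract(target)
print(items)  # expected to resemble [{u'title': [u'Another Title']}]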
Example 1: test_extraction
def test_extraction(self, name, templates, page, descriptor, expected_output):
    template_pages = [HtmlPage(None, {}, t) for t in templates]
    extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
    self.assertEqual(expected_output, actual_output and actual_output[0])
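Note the guard in the assertion: extract() returns (items, matched_template), and items is None when no template matched, so the expression actual_output and actual_output[0] compares expected_output against None instead of raising a TypeError on indexing.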
Example 2: test_extractor_w_empty_string_extraction
def test_extractor_w_empty_string_extraction(self):
    schema = {
        "id": "test",
        "properties": [
            ('gender', {
                'description': '',
                'optional': True,
                'type': 'text',
                'vary': False,
            }),
            ('name', {
                'description': '',
                'optional': False,
                'type': 'text',
                'vary': False,
            }),
        ],
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "_id": 2,
            "field_name": "gender",
            "regular_expression": "([0-9]+)"
        }
    }
    apply_extractors(descriptor, [1], extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
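The point of this test: the ([0-9]+) extractor matches no digits in the gender text, so that extraction yields an empty string and the optional field is dropped from the item entirely; only the required name field appears in the asserted output.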
Example 3: test_annotate_multiple
def test_annotate_multiple(self):
    tm = TemplateMaker(self.PAGE)
    tm.annotate('field1', best_match('text to annotate'), best_match=False)
    tpl = tm.get_template()
    ex = InstanceBasedLearningExtractor([(tpl, None)])
    self.assertEqual(ex.extract(self.PAGE)[0],
        [{u'field1': [u'Some text to annotate here', u'Another text to annotate there']}])
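Passing best_match=False to TemplateMaker.annotate() annotates every fragment matched by the criteria function rather than only the best-scoring one, which is why a single field here yields two extracted values. (The best_match() call that builds the criteria function is unrelated to the keyword argument of the same name.)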
Example 4: test_annotate_ignore_unpaired
def test_annotate_ignore_unpaired(self):
    tm = TemplateMaker(self.PAGE)
    tm.annotate('field1', best_match("and that's"), best_match=False)
    tpl = tm.get_template()
    ex = InstanceBasedLearningExtractor([(tpl, None)])
    self.assertEqual(ex.extract(self.PAGE)[0],
        [{u'field1': [u"More text with unpaired tag <img />and that's it"]}])
Example 5: _run_extraction
def _run_extraction(self, name, templates, page, descriptor, expected_output):
    self.trace = None
    template_pages = [HtmlPage(None, {}, t) for t in templates]
    # extracts with trace enabled in order to generate traceback
    extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True)
    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
    if actual_output is not None:
        actual_output = actual_output[0]
        self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
    # extracts again with trace disabled in order to get the pure output
    extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
    if actual_output is None:
        if expected_output is None:
            return
        assert False, "failed to extract data for test '%s'" % name
    else:
        actual_output = actual_output[0]
    expected_names = set(expected_output.keys())
    actual_names = set(actual_output.keys())
    missing_in_output = filter(None, expected_names - actual_names)
    error = "attributes '%s' were expected but were not present in test '%s'" % \
        ("', '".join(missing_in_output), name)
    assert len(missing_in_output) == 0, error
    unexpected = actual_names - expected_names
    error = "unexpected attributes %s in test '%s'" % \
        (', '.join(unexpected), name)
    assert len(unexpected) == 0, error
    for k, v in expected_output.items():
        extracted = actual_output[k]
        assert v == extracted, "in test '%s' for attribute '%s', " \
            "expected value '%s' but got '%s'" % (name, k, v, extracted)
Example 6: scrape
def scrape(self, url=None, html=None, encoding='utf-8'):
    ## not version from https://github.com/scrapy/scrapely/blob/master/scrapely/extraction/pageparsing.py
    ## may need to replace with version from inspect.getsourcelines(Scraper.scrape), as this version is
    page = self._get_page(url, encoding, html)
    ex = InstanceBasedLearningExtractor(self.templates)
    return ex.extract(page)[0]
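Two constructor conventions appear across these examples: some pass the templates directly, as here and in Example 11, while others pass (template, descriptor) pairs, as in Examples 1 through 5, 12, and 13. The pair form matches later scrapely releases; which one you need depends on the scrapely version in use.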
Example 7: _run_extraction
def _run_extraction(self, name, templates, page, extractors, expected_output):
    self.trace = None
    template_pages = [HtmlPage(None, {}, t) for t in templates]
    extractor = InstanceBasedLearningExtractor(template_pages, extractors, True)
    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
    if not actual_output:
        if expected_output is None:
            return
        assert False, "failed to extract data for test '%s'" % name
    actual_output = actual_output[0]
    self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace', [])
    expected_names = set(expected_output.keys())
    actual_names = set(actual_output.keys())
    missing_in_output = filter(None, expected_names - actual_names)
    error = "attributes '%s' were expected but were not present in test '%s'" % \
        ("', '".join(missing_in_output), name)
    assert len(missing_in_output) == 0, error
    unexpected = actual_names - expected_names
    error = "unexpected attributes %s in test '%s'" % \
        (', '.join(unexpected), name)
    assert len(unexpected) == 0, error
    for k, v in expected_output.items():
        extracted = actual_output[k]
        assert v == extracted, "in test '%s' for attribute '%s', " \
            "expected value '%s' but got '%s'" % (name, k, v, extracted)
Example 8: test_extractor_w_empty_string_extraction
def test_extractor_w_empty_string_extraction(self):
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'text',
                'vary': False,
            },
            'name': {
                'required': True,
                'type': 'text',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "regular_expression": "([0-9]+)"
        }
    }
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
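Compared with Example 2, the schema here uses the newer 'fields' layout and apply_extractors() receives a field-to-extractor-ids mapping ({"gender": [1]}) instead of a flat id list; it looks like the same test ported across two slybot API revisions.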
Example 9: test_type_extractor
def test_type_extractor(self):
    schema = {
        "id": "test",
        "properties": [('gender', {
            'description': '',
            'optional': True,
            'type': 'number',
            'vary': False,
        })],
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "_id": 1,
            "field_name": "gender",
            "type_extractor": "text"
        },
        2: {
            "_id": 2,
            "field_name": "gender",
            "regular_expression": "Gender\\s+(Male|Female)"
        }
    }
    apply_extractors(descriptor, [1, 2], extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
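The two extractors chain: the type_extractor first coerces the field to text, overriding the schema's declared number type (which would otherwise reject 'Male'), and the regular expression then captures its group from the matched region.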
Example 10: test_annotate_multiple
def test_annotate_multiple(self):
    tm = TemplateMaker(self.PAGE)
    tm.annotate("field1", best_match("text to annotate"), best_match=False)
    tpl = tm.get_template()
    ex = InstanceBasedLearningExtractor([tpl])
    self.assertEqual(
        ex.extract(self.PAGE)[0],
        [{u"field1": [u"Some text to annotate here", u"Another text to annotate there"]}]
    )
Example 11: do_s
def do_s(self, url):
    """s <url> - scrape url (uses encoding from templates)"""
    templates = self._load_templates()
    if assert_or_print(templates, "no templates available"):
        return
    page = get_page(url, templates[0].encoding)
    ex = InstanceBasedLearningExtractor(templates)
    pprint.pprint(ex.extract(page)[0])
Example 12: do_s
def do_s(self, url):
    """s <url> - scrape url"""
    templates = self._load_templates()
    if assert_or_print(templates, "no templates available"):
        return
    # fall back to the template encoding if none is specified
    page = url_to_page(url, default_encoding=templates[0].encoding)
    ex = InstanceBasedLearningExtractor((t, None) for t in templates)
    pprint.pprint(ex.extract(page)[0])
Example 13: do_s
def do_s(self, line):
    """s <url> [--encoding ENCODING --useragent 'User-Agent'] - scrape url"""
    templates = self._load_templates()
    if assert_or_print(templates, "no templates available"):
        return
    opts, (url,) = parse_at_s(line)
    headers = {'User-Agent': opts.useragent or self.user_agent}
    url = urllib2.Request(url, headers=headers)
    # fall back to the template encoding if none is specified
    page = url_to_page(url, opts.encoding, templates[0].encoding)
    ex = InstanceBasedLearningExtractor((t, None) for t in templates)
    pprint.pprint(ex.extract(page)[0])
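The urllib2 import pins this snippet to Python 2; under Python 3 the equivalent Request object lives in urllib.request.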
Example 14: test_default_type_extractor
def test_default_type_extractor(self):
    schema = {
        'fields': {}
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "regular_expression": "Gender\\s+(Male|Female)"
        }
    }
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Example 15: setup_bot
def setup_bot(self, settings, spec, items, extractors):
    """
    Perform any initialization needed for crawling using this plugin
    """
    _item_template_pages = sorted((
        [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
         t.get('extractors', []), t.get('version', '0.12.0')]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda x: x[0])
    self.item_classes = {}
    self.template_scrapes = {template.get('page_id'): template['scrapes']
                             for template in spec.get('templates')}
    self.html_link_extractor = HtmlLinkExtractor()
    for schema_name, schema in items.items():
        if schema_name not in self.item_classes:
            if not schema.get('name'):
                schema['name'] = schema_name
            item_cls = SlybotItem.create_iblitem_class(schema)
            self.item_classes[schema_name] = item_cls
    # Create descriptors and apply additional extractors to fields
    page_descriptor_pairs = []
    self.schema_descriptors = {}
    for default, template, template_extractors, v in _item_template_pages:
        descriptors = OrderedDict()
        for schema_name, schema in items.items():
            item_descriptor = create_slybot_item_descriptor(schema,
                                                            schema_name)
            apply_extractors(item_descriptor, template_extractors,
                             extractors)
            descriptors[schema_name] = item_descriptor
        descriptor = descriptors.values() or [{}]
        descriptors['#default'] = descriptors.get(default, descriptor[0])
        self.schema_descriptors[template.page_id] = descriptors['#default']
        page_descriptor_pairs.append((template, descriptors, v))
        add_extractors_to_descriptors(descriptors, extractors)
    grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                       key=operator.itemgetter(2)),
                                lambda x: x[2] < '0.13.0')
    self.extractors = []
    for version, group in grouped:
        if version:
            self.extractors.append(
                InstanceBasedLearningExtractor(
                    [(page, scrapes['#default'])
                     for page, scrapes, version in group]))
        else:
            self.extractors.append(SlybotIBLExtractor(list(group)))
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self.build_url_filter(spec)
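The groupby key x[2] < '0.13.0' splits templates by annotation version: pre-0.13.0 templates are handled by a plain InstanceBasedLearningExtractor built from each template's '#default' descriptor, while newer ones go to SlybotIBLExtractor. Note that this is a lexicographic string comparison, so it only orders correctly while the version components keep the same number of digits (for example, '0.9.0' would sort above '0.13.0').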