本文整理汇总了Python中scrapely.extraction.InstanceBasedLearningExtractor.extract方法的典型用法代码示例。如果您正苦于以下问题:Python InstanceBasedLearningExtractor.extract方法的具体用法?Python InstanceBasedLearningExtractor.extract怎么用?Python InstanceBasedLearningExtractor.extract使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类scrapely.extraction.InstanceBasedLearningExtractor的用法示例。
在下文中一共展示了InstanceBasedLearningExtractor.extract方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _run_extraction
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def _run_extraction(self, name, templates, page, descriptor, expected_output):
    """Extract *page* with *templates* and assert the result equals *expected_output*.

    Runs the extraction twice: once with tracing enabled (the trace is stored
    on ``self.trace`` for diagnostics) and once without, so the compared
    output is free of trace entries.  Raises AssertionError on any mismatch,
    labelling the failure with the test *name*.
    """
    self.trace = None
    template_pages = [HtmlPage(None, {}, t) for t in templates]
    # First pass with trace enabled in order to generate a traceback.
    extractor = InstanceBasedLearningExtractor(
        [(t, descriptor) for t in template_pages], True)
    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
    if actual_output is not None:
        actual_output = actual_output[0]
        self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
    # Second pass with trace disabled in order to get the pure output.
    extractor = InstanceBasedLearningExtractor(
        [(t, descriptor) for t in template_pages])
    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
    if actual_output is None:
        if expected_output is None:
            return
        assert False, "failed to extract data for test '%s'" % name
    else:
        actual_output = actual_output[0]
    expected_names = set(expected_output.keys())
    actual_names = set(actual_output.keys())
    # BUG FIX: filter(None, ...) returns a lazy iterator on Python 3, so the
    # join() below consumed it and len() raised TypeError.  Materialize the
    # non-empty names as a list instead (filter(None, ...) drops falsy items).
    missing_in_output = [n for n in expected_names - actual_names if n]
    error = "attributes '%s' were expected but were not present in test '%s'" % \
        ("', '".join(missing_in_output), name)
    assert len(missing_in_output) == 0, error
    unexpected = actual_names - expected_names
    error = "unexpected attributes %s in test '%s'" % \
        (', '.join(unexpected), name)
    assert len(unexpected) == 0, error
    for k, v in expected_output.items():
        extracted = actual_output[k]
        assert v == extracted, "in test '%s' for attribute '%s', " \
            "expected value '%s' but got '%s'" % (name, k, v, extracted)
示例2: test_extractor_w_empty_string_extraction
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def test_extractor_w_empty_string_extraction(self):
    """A regex extractor that matches nothing must not suppress other fields."""
    schema = {
        "id": "test",
        "properties": [
            ('gender', {
                'description': '',
                'optional': True,
                'type': 'text',
                'vary': False,
            }),
            ('name', {
                'description': '',
                'optional': False,
                'type': 'text',
                'vary': False,
            }),
        ],
    }
    item_descriptor = create_slybot_item_descriptor(schema)
    # The gender field gets a numeric regex that will not match any text.
    field_extractors = {
        1: {
            "_id": 2,
            "field_name": "gender",
            "regular_expression": "([0-9]+)"
        }
    }
    apply_extractors(item_descriptor, [1], field_extractors)
    learner = InstanceBasedLearningExtractor([(self.template2, item_descriptor)])
    extracted = learner.extract(self.target2)[0][0]
    self.assertEqual(extracted, {u'name': [u'Name Olivia']})
示例3: test_annotate_multiple
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def test_annotate_multiple(self):
    """Annotating with best_match=False should extract every occurrence."""
    maker = TemplateMaker(self.PAGE)
    maker.annotate('field1', best_match('text to annotate'), best_match=False)
    template = maker.get_template()
    learner = InstanceBasedLearningExtractor([(template, None)])
    result = learner.extract(self.PAGE)[0]
    expected = [{u'field1': [u'Some text to annotate here',
                             u'Another text to annotate there']}]
    self.assertEqual(result, expected)
示例4: scrape
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def scrape(self, url=None, html=None, encoding='utf-8'):
    """Fetch or parse the page and return the extracted items for it.

    NOTE(review): this looks like a local variant of Scraper.scrape from
    scrapely; presumably it should be kept in sync with upstream — verify
    against inspect.getsourcelines(Scraper.scrape).
    """
    target = self._get_page(url, encoding, html)
    learner = InstanceBasedLearningExtractor(self.templates)
    return learner.extract(target)[0]
示例5: _run_extraction
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def _run_extraction(self, name, templates, page, extractors, expected_output):
    """Extract *page* with *templates*/*extractors* and compare to *expected_output*.

    Stores the extractor trace on ``self.trace`` for diagnostics and raises
    AssertionError on any mismatch, labelling failures with the test *name*.
    """
    self.trace = None
    template_pages = [HtmlPage(None, {}, t) for t in templates]
    extractor = InstanceBasedLearningExtractor(template_pages, extractors, True)
    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
    if not actual_output:
        if expected_output is None:
            return
        assert False, "failed to extract data for test '%s'" % name
    actual_output = actual_output[0]
    self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace', [])
    expected_names = set(expected_output.keys())
    actual_names = set(actual_output.keys())
    # BUG FIX: filter(None, ...) returns a lazy iterator on Python 3, so the
    # join() below consumed it and len() raised TypeError.  Materialize the
    # non-empty names as a list instead (filter(None, ...) drops falsy items).
    missing_in_output = [n for n in expected_names - actual_names if n]
    error = "attributes '%s' were expected but were not present in test '%s'" % \
        ("', '".join(missing_in_output), name)
    assert len(missing_in_output) == 0, error
    unexpected = actual_names - expected_names
    error = "unexpected attributes %s in test '%s'" % \
        (', '.join(unexpected), name)
    assert len(unexpected) == 0, error
    for k, v in expected_output.items():
        extracted = actual_output[k]
        assert v == extracted, "in test '%s' for attribute '%s', " \
            "expected value '%s' but got '%s'" % (name, k, v, extracted)
示例6: test_extractor_w_empty_string_extraction
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def test_extractor_w_empty_string_extraction(self):
    """A regex extractor that matches nothing must not suppress other fields."""
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'text',
                'vary': False,
            },
            'name': {
                'required': True,
                'type': 'text',
                'vary': False,
            }
        }
    }
    item_descriptor = create_slybot_item_descriptor(schema)
    # Numeric regex on the gender field — it will not match any text value.
    field_extractors = {1: {"regular_expression": "([0-9]+)"}}
    apply_extractors(item_descriptor, {"gender": [1]}, field_extractors)
    learner = InstanceBasedLearningExtractor([(self.template2, item_descriptor)])
    extracted = learner.extract(self.target2)[0][0]
    self.assertEqual(extracted, {u'name': [u'Name Olivia']})
示例7: test_extraction
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def test_extraction(self, name, templates, page, descriptor, expected_output):
    """Extract *page* using *templates* and check it equals *expected_output*."""
    annotated_pages = [HtmlPage(None, {}, markup) for markup in templates]
    learner = InstanceBasedLearningExtractor(
        [(tpl, descriptor) for tpl in annotated_pages])
    result, _ = learner.extract(HtmlPage(None, {}, page))
    # A None result is compared directly against an expected None.
    self.assertEqual(expected_output, result and result[0])
示例8: test_type_extractor
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def test_type_extractor(self):
    """A type_extractor override plus a regex should chain on the same field."""
    schema = {
        "id": "test",
        "properties": [
            ('gender', {
                'description': '',
                'optional': True,
                'type': 'number',
                'vary': False,
            }),
        ],
    }
    item_descriptor = create_slybot_item_descriptor(schema)
    # Extractor 1 overrides the declared 'number' type with 'text';
    # extractor 2 then narrows the value with a regular expression.
    field_extractors = {
        1: {
            "_id": 1,
            "field_name": "gender",
            "type_extractor": "text"
        },
        2: {
            "_id": 2,
            "field_name": "gender",
            "regular_expression": "Gender\\s+(Male|Female)"
        }
    }
    apply_extractors(item_descriptor, [1, 2], field_extractors)
    learner = InstanceBasedLearningExtractor([(self.template, item_descriptor)])
    extracted = learner.extract(self.target)[0][0]
    self.assertEqual(extracted, {u'gender': [u'Male']})
示例9: test_annotate_ignore_unpaired
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def test_annotate_ignore_unpaired(self):
    """Extraction should cope with an unpaired tag inside the annotated text."""
    maker = TemplateMaker(self.PAGE)
    maker.annotate('field1', best_match("and that's"), best_match=False)
    template = maker.get_template()
    learner = InstanceBasedLearningExtractor([(template, None)])
    result = learner.extract(self.PAGE)[0]
    expected = [{u'field1': [u"More text with unpaired tag <img />and that's it"]}]
    self.assertEqual(result, expected)
示例10: test_annotate_multiple
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def test_annotate_multiple(self):
    """Annotating with best_match=False should extract every occurrence."""
    maker = TemplateMaker(self.PAGE)
    maker.annotate("field1", best_match("text to annotate"), best_match=False)
    template = maker.get_template()
    learner = InstanceBasedLearningExtractor([template])
    result = learner.extract(self.PAGE)[0]
    expected = [{u"field1": [u"Some text to annotate here",
                             u"Another text to annotate there"]}]
    self.assertEqual(result, expected)
示例11: do_s
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def do_s(self, url):
    """s <url> - scrape url (uses encoding from templates)"""
    # Docstring above doubles as the interactive help text — keep it stable.
    templates = self._load_templates()
    if assert_or_print(templates, "no templates available"):
        return
    # Reuse the first template's encoding when fetching the page.
    page = get_page(url, templates[0].encoding)
    learner = InstanceBasedLearningExtractor(templates)
    pprint.pprint(learner.extract(page)[0])
示例12: do_s
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def do_s(self, url):
    """s <url> - scrape url"""
    # Docstring above doubles as the interactive help text — keep it stable.
    templates = self._load_templates()
    if assert_or_print(templates, "no templates available"):
        return
    # Fall back to the first template's encoding if none is specified.
    page = url_to_page(url, default_encoding=templates[0].encoding)
    learner = InstanceBasedLearningExtractor((tpl, None) for tpl in templates)
    pprint.pprint(learner.extract(page)[0])
示例13: do_s
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def do_s(self, line):
    """s <url> [--encoding ENCODING --useragent 'User-Agent'] - scrape url"""
    # Docstring above doubles as the interactive help text — keep it stable.
    templates = self._load_templates()
    if assert_or_print(templates, "no templates available"):
        return
    opts, (url,) = parse_at_s(line)
    # Honour a per-command User-Agent, defaulting to the shell-wide one.
    request_headers = {'User-Agent': opts.useragent or self.user_agent}
    request = urllib2.Request(url, headers=request_headers)
    # Fall back to the template encoding if none is specified.
    page = url_to_page(request, opts.encoding, templates[0].encoding)
    learner = InstanceBasedLearningExtractor((tpl, None) for tpl in templates)
    pprint.pprint(learner.extract(page)[0])
示例14: test_default_type_extractor
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
def test_default_type_extractor(self):
    """A regex extractor should work on a field absent from the schema."""
    schema = {'fields': {}}
    item_descriptor = create_slybot_item_descriptor(schema)
    field_extractors = {
        1: {"regular_expression": "Gender\\s+(Male|Female)"}
    }
    apply_extractors(item_descriptor, {"gender": [1]}, field_extractors)
    learner = InstanceBasedLearningExtractor([(self.template, item_descriptor)])
    extracted = learner.extract(self.target)[0][0]
    self.assertEqual(extracted, {u'gender': [u'Male']})
示例15: Scraper
# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
class Scraper(object):
    """Train-and-extract convenience wrapper around scrapely's IBL extractor."""

    def __init__(self, templates=None):
        """Initialize an empty scraper."""
        self._templates = templates or []
        # Lazily built extractor; reset whenever the template set changes.
        self._ex = None

    @classmethod
    def fromfile(cls, file):
        """Initialize a scraper from a file previously stored by tofile()
        method.
        """
        stored = json.load(file)
        return cls([HtmlPage(**kwargs) for kwargs in stored['templates']])

    def tofile(self, file):
        """Store the scraper into the given file-like object"""
        serialized = [page_to_dict(tpl) for tpl in self._templates]
        json.dump({'templates': serialized}, file)

    def add_template(self, template):
        # New templates invalidate any previously built extractor.
        self._templates.append(template)
        self._ex = None

    def train_from_htmlpage(self, htmlpage, data):
        """Annotate *htmlpage* with *data* and add the result as a template."""
        assert data, "Cannot train with empty data"
        maker = TemplateMaker(htmlpage)
        for field, values in data.items():
            # Wrap scalars (strings included) so iteration is uniform.
            if (isinstance(values, (bytes, str)) or
                    not hasattr(values, '__iter__')):
                values = [values]
            for raw_value in values:
                text = str_to_unicode(raw_value, htmlpage.encoding)
                maker.annotate(field, best_match(text))
        self.add_template(maker.get_template())

    def train(self, url, data, encoding=None):
        """Download *url* and train from it with *data*."""
        self.train_from_htmlpage(url_to_page(url, encoding), data)

    def scrape(self, url, encoding=None):
        """Download *url* and return the extracted items."""
        return self.scrape_page(url_to_page(url, encoding))

    def scrape_page(self, page):
        """Extract items from an already fetched HtmlPage."""
        if self._ex is None:
            self._ex = InstanceBasedLearningExtractor(
                (tpl, None) for tpl in self._templates)
        return self._ex.extract(page)[0]