当前位置: 首页>>代码示例>>Python>>正文


Python InstanceBasedLearningExtractor.extract方法代码示例

本文整理汇总了Python中scrapely.extraction.InstanceBasedLearningExtractor.extract方法的典型用法代码示例。如果您正苦于以下问题:Python InstanceBasedLearningExtractor.extract方法的具体用法?Python InstanceBasedLearningExtractor.extract怎么用?Python InstanceBasedLearningExtractor.extract使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在scrapely.extraction.InstanceBasedLearningExtractor的用法示例。


在下文中一共展示了InstanceBasedLearningExtractor.extract方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _run_extraction

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
    def _run_extraction(self, name, templates, page, descriptor, expected_output):
        """Run an extraction test case and assert the result matches.

        Builds an InstanceBasedLearningExtractor from *templates* and
        *descriptor*, extracts from *page*, and compares the first result
        against *expected_output* (``None`` means "expect no extraction").
        A trace of the first (trace-enabled) run is kept on ``self.trace``
        for debugging failed assertions.
        """
        self.trace = None
        template_pages = [HtmlPage(None, {}, t) for t in templates]
        # First pass with trace enabled, only to capture a traceback for
        # diagnostics; the traced output is not used for assertions.
        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True)
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if actual_output is not None:
            actual_output = actual_output[0]
            self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
        # Second pass with trace disabled to get the pure output.
        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if actual_output is None:
            if expected_output is None:
                return
            assert False, "failed to extract data for test '%s'" % name
        else:
            actual_output = actual_output[0]
        expected_names = set(expected_output.keys())
        actual_names = set(actual_output.keys())

        # NOTE: a list comprehension (not filter()) so len() works on
        # Python 3, where filter() returns an iterator without a length.
        missing_in_output = [attr for attr in expected_names - actual_names if attr]
        error = "attributes '%s' were expected but were not present in test '%s'" % \
                ("', '".join(missing_in_output), name)
        assert len(missing_in_output) == 0, error

        unexpected = actual_names - expected_names
        error = "unexpected attributes %s in test '%s'" % \
                (', '.join(unexpected), name)
        assert len(unexpected) == 0, error

        for k, v in expected_output.items():
            extracted = actual_output[k]
            assert v == extracted, "in test '%s' for attribute '%s', " \
                "expected value '%s' but got '%s'" % (name, k, v, extracted)

示例2: test_extractor_w_empty_string_extraction

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
 def test_extractor_w_empty_string_extraction(self):
     """A field whose regex extractor matches nothing is dropped, while
     the remaining (required) field is still extracted normally."""
     # Item schema: an optional 'gender' and a required 'name' field.
     schema = {
         "id": "test",
         "properties": [
             ('gender', {'description': '',
                         'optional': True,
                         'type': 'text',
                         'vary': False}),
             ('name', {'description': '',
                       'optional': False,
                       'type': 'text',
                       'vary': False}),
         ],
     }
     item_descriptor = create_slybot_item_descriptor(schema)
     # Digits-only pattern: it cannot match the gender text, so the
     # extractor yields an empty string for that field.
     field_extractors = {
         1: {"_id": 2,
             "field_name": "gender",
             "regular_expression": "([0-9]+)"},
     }
     apply_extractors(item_descriptor, [1], field_extractors)
     ex = InstanceBasedLearningExtractor([(self.template2, item_descriptor)])
     extracted = ex.extract(self.target2)[0][0]
     self.assertEqual(extracted, {u'name': [u'Name Olivia']})
开发者ID:alepharchives,项目名称:slybot,代码行数:32,代码来源:test_extractors.py

示例3: test_annotate_multiple

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
 def test_annotate_multiple(self):
     """With best_match=False every occurrence of the text is annotated,
     so extraction returns both matching fragments for the field."""
     maker = TemplateMaker(self.PAGE)
     maker.annotate('field1', best_match('text to annotate'), best_match=False)
     template = maker.get_template()
     extractor = InstanceBasedLearningExtractor([(template, None)])
     result = extractor.extract(self.PAGE)[0]
     expected = [{u'field1': [u'Some text to annotate here',
                              u'Another text to annotate there']}]
     self.assertEqual(result, expected)
开发者ID:scrapy,项目名称:scrapely,代码行数:9,代码来源:test_template.py

示例4: scrape

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
    def scrape(self, url=None, html=None, encoding='utf-8'):
        """Scrape *url* (or raw *html*) with the stored templates and
        return the list of extracted items.

        NOTE(review): this is not the upstream ``Scraper.scrape`` from
        https://github.com/scrapy/scrapely — it may need to be replaced
        with the version obtained via inspect.getsourcelines(Scraper.scrape).
        """
        page = self._get_page(url, encoding, html)
        extractor = InstanceBasedLearningExtractor(self.templates)
        items, _ = extractor.extract(page)[0], None
        return items
开发者ID:carriercomm,项目名称:scraperwiki-scraper-vault,代码行数:9,代码来源:scrapely-hack.py

示例5: _run_extraction

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
    def _run_extraction(self, name, templates, page, extractors, expected_output):
        """Run an extraction test case and assert the result matches.

        Builds a trace-enabled InstanceBasedLearningExtractor from
        *templates* and *extractors*, extracts from *page*, and compares
        the first result against *expected_output* (``None`` means
        "expect no extraction"). The extraction trace is stored on
        ``self.trace`` for debugging failed assertions.
        """
        self.trace = None
        template_pages = [HtmlPage(None, {}, t) for t in templates]
        extractor = InstanceBasedLearningExtractor(template_pages, extractors, True)
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if not actual_output:
            if expected_output is None:
                return
            assert False, "failed to extract data for test '%s'" % name
        actual_output = actual_output[0]
        self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace', [])
        expected_names = set(expected_output.keys())
        actual_names = set(actual_output.keys())

        # NOTE: a list comprehension (not filter()) so len() works on
        # Python 3, where filter() returns an iterator without a length.
        missing_in_output = [attr for attr in expected_names - actual_names if attr]
        error = "attributes '%s' were expected but were not present in test '%s'" % \
                ("', '".join(missing_in_output), name)
        assert len(missing_in_output) == 0, error

        unexpected = actual_names - expected_names
        error = "unexpected attributes %s in test '%s'" % \
                (', '.join(unexpected), name)
        assert len(unexpected) == 0, error

        for k, v in expected_output.items():
            extracted = actual_output[k]
            assert v == extracted, "in test '%s' for attribute '%s', " \
                "expected value '%s' but got '%s'" % (name, k, v, extracted)
开发者ID:esimionato,项目名称:scrapely,代码行数:30,代码来源:test_extraction.py

示例6: test_extractor_w_empty_string_extraction

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
 def test_extractor_w_empty_string_extraction(self):
     """A field whose regex extractor matches nothing is dropped, while
     the remaining (required) field is still extracted normally."""
     # Item schema: an optional 'gender' and a required 'name' field.
     schema = {
         'fields': {
             'gender': {'required': False,
                        'type': 'text',
                        'vary': False},
             'name': {'required': True,
                      'type': 'text',
                      'vary': False},
         }
     }
     item_descriptor = create_slybot_item_descriptor(schema)
     # Digits-only pattern: it cannot match the gender text, so the
     # extractor yields an empty string for that field.
     field_extractors = {1: {"regular_expression": "([0-9]+)"}}
     apply_extractors(item_descriptor, {"gender": [1]}, field_extractors)
     ex = InstanceBasedLearningExtractor([(self.template2, item_descriptor)])
     extracted = ex.extract(self.target2)[0][0]
     self.assertEqual(extracted, {u'name': [u'Name Olivia']})
开发者ID:1060460048,项目名称:portia,代码行数:27,代码来源:test_extractors.py

示例7: test_extraction

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
    def test_extraction(self, name, templates, page, descriptor, expected_output):
        """Extract *page* with templates built from *templates* and
        *descriptor*, and assert the first result equals *expected_output*
        (or that nothing was extracted when the expectation is falsy)."""
        template_pages = [HtmlPage(None, {}, t) for t in templates]
        pairs = [(tpl, descriptor) for tpl in template_pages]
        extractor = InstanceBasedLearningExtractor(pairs)
        result, _ = extractor.extract(HtmlPage(None, {}, page))
        first = result[0] if result else result
        self.assertEqual(expected_output, first)
开发者ID:1060460048,项目名称:scrapely,代码行数:9,代码来源:test_extraction.py

示例8: test_type_extractor

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
 def test_type_extractor(self):
     """A type_extractor can override the schema's declared type, and a
     chained regular expression then narrows the extracted value."""
     # Schema declares 'gender' as a number...
     schema = {
         "id": "test",
         "properties": [('gender', {'description': '',
                                    'optional': True,
                                    'type': 'number',
                                    'vary': False})],
     }
     item_descriptor = create_slybot_item_descriptor(schema)
     # ...but extractor 1 re-types it as text, and extractor 2 pulls
     # out just the Male/Female token.
     field_extractors = {
         1: {"_id": 1,
             "field_name": "gender",
             "type_extractor": "text"},
         2: {"_id": 2,
             "field_name": "gender",
             "regular_expression": "Gender\\s+(Male|Female)"},
     }
     apply_extractors(item_descriptor, [1, 2], field_extractors)
     ex = InstanceBasedLearningExtractor([(self.template, item_descriptor)])
     extracted = ex.extract(self.target)[0][0]
     self.assertEqual(extracted, {u'gender': [u'Male']})
开发者ID:alepharchives,项目名称:slybot,代码行数:29,代码来源:test_extractors.py

示例9: test_annotate_ignore_unpaired

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
 def test_annotate_ignore_unpaired(self):
     """An annotation spanning an unpaired tag (<img />) still extracts
     the full surrounding text fragment."""
     maker = TemplateMaker(self.PAGE)
     maker.annotate('field1', best_match("and that's"), best_match=False)
     template = maker.get_template()
     extractor = InstanceBasedLearningExtractor([(template, None)])
     result = extractor.extract(self.PAGE)[0]
     expected = [{u'field1': [u"More text with unpaired tag <img />and that's it"]}]
     self.assertEqual(result, expected)
开发者ID:scrapy,项目名称:scrapely,代码行数:9,代码来源:test_template.py

示例10: test_annotate_multiple

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
 def test_annotate_multiple(self):
     """With best_match=False every occurrence of the text is annotated,
     so extraction returns both matching fragments for the field."""
     maker = TemplateMaker(self.PAGE)
     maker.annotate("field1", best_match("text to annotate"), best_match=False)
     template = maker.get_template()
     extractor = InstanceBasedLearningExtractor([template])
     result = extractor.extract(self.PAGE)[0]
     expected = [{u"field1": [u"Some text to annotate here",
                              u"Another text to annotate there"]}]
     self.assertEqual(result, expected)
开发者ID:netconstructor,项目名称:scrapely,代码行数:10,代码来源:test_template.py

示例11: do_s

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
 def do_s(self, url):
     """s <url> - scrape url (uses encoding from templates)"""
     templates = self._load_templates()
     if assert_or_print(templates, "no templates available"):
         return
     # Fetch the page using the first template's encoding.
     page = get_page(url, templates[0].encoding)
     extractor = InstanceBasedLearningExtractor(templates)
     items = extractor.extract(page)[0]
     pprint.pprint(items)
开发者ID:esimionato,项目名称:scrapely,代码行数:10,代码来源:tool.py

示例12: do_s

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
 def do_s(self, url):
     """s <url> - scrape url"""
     templates = self._load_templates()
     if assert_or_print(templates, "no templates available"):
         return
     # No encoding option here, so fall back to the first template's.
     page = url_to_page(url, default_encoding=templates[0].encoding)
     extractor = InstanceBasedLearningExtractor((tpl, None) for tpl in templates)
     items = extractor.extract(page)[0]
     pprint.pprint(items)
开发者ID:1060460048,项目名称:scrapely,代码行数:11,代码来源:tool.py

示例13: do_s

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
 def do_s(self, line):
     """s <url> [--encoding ENCODING --useragent 'User-Agent'] - scrape url"""
     templates = self._load_templates()
     if assert_or_print(templates, "no templates available"):
         return
     opts, (url,) = parse_at_s(line)
     # Honour an explicit --useragent, otherwise use the shell default.
     headers = { 'User-Agent' : opts.useragent or self.user_agent }
     request = urllib2.Request(url, headers=headers)
     # Fall back to the first template's encoding if none was given.
     page = url_to_page(request, opts.encoding, templates[0].encoding)
     extractor = InstanceBasedLearningExtractor((tpl, None) for tpl in templates)
     pprint.pprint(extractor.extract(page)[0])
开发者ID:boite,项目名称:scrapely,代码行数:14,代码来源:tool.py

示例14: test_default_type_extractor

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
 def test_default_type_extractor(self):
     """With an empty fields schema, a regex extractor alone is enough
     to produce the field using the default type handling."""
     schema = {'fields': {}}
     item_descriptor = create_slybot_item_descriptor(schema)
     # Narrow the extracted value to the Male/Female token.
     field_extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
     apply_extractors(item_descriptor, {"gender": [1]}, field_extractors)
     ex = InstanceBasedLearningExtractor([(self.template, item_descriptor)])
     extracted = ex.extract(self.target)[0][0]
     self.assertEqual(extracted, {u'gender': [u'Male']})
开发者ID:1060460048,项目名称:portia,代码行数:16,代码来源:test_extractors.py

示例15: Scraper

# 需要导入模块: from scrapely.extraction import InstanceBasedLearningExtractor [as 别名]
# 或者: from scrapely.extraction.InstanceBasedLearningExtractor import extract [as 别名]
class Scraper(object):
    """Trainable scraper: annotate example pages with known values, then
    extract the same fields from similar pages."""

    def __init__(self, templates=None):
        """Initialize an empty scraper."""
        self._templates = templates or []
        # Lazily-built extractor; invalidated whenever a template is added.
        self._ex = None

    @classmethod
    def fromfile(cls, file):
        """Initialize a scraper from a file previously stored by tofile()
        method.
        """
        data = json.load(file)
        return cls([HtmlPage(**tpl) for tpl in data['templates']])

    def tofile(self, file):
        """Store the scraper into the given file-like object"""
        serialized = [page_to_dict(tpl) for tpl in self._templates]
        json.dump({'templates': serialized}, file)

    def add_template(self, template):
        """Register *template* and drop the cached extractor."""
        self._templates.append(template)
        self._ex = None

    def train_from_htmlpage(self, htmlpage, data):
        """Annotate *htmlpage* with the field -> value(s) mapping in
        *data* and add the resulting template."""
        assert data, "Cannot train with empty data"
        maker = TemplateMaker(htmlpage)
        for field, values in data.items():
            # Wrap scalars (strings or non-iterables) in a list so the
            # annotation loop below handles both forms uniformly.
            scalar = isinstance(values, (bytes, str)) or not hasattr(values, '__iter__')
            for value in ([values] if scalar else values):
                maker.annotate(field, best_match(str_to_unicode(value, htmlpage.encoding)))
        self.add_template(maker.get_template())

    def train(self, url, data, encoding=None):
        """Fetch *url* and train from it with the given *data*."""
        self.train_from_htmlpage(url_to_page(url, encoding), data)

    def scrape(self, url, encoding=None):
        """Fetch *url* and extract items from it."""
        return self.scrape_page(url_to_page(url, encoding))

    def scrape_page(self, page):
        """Extract items from an HtmlPage, building the extractor on
        first use and reusing it afterwards."""
        if self._ex is None:
            pairs = ((tpl, None) for tpl in self._templates)
            self._ex = InstanceBasedLearningExtractor(pairs)
        return self._ex.extract(page)[0]
开发者ID:CodeOps,项目名称:scrapely,代码行数:51,代码来源:__init__.py


注:本文中的scrapely.extraction.InstanceBasedLearningExtractor.extract方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。