

Python extraction.InstanceBasedLearningExtractor Class Code Examples

This article collects and summarizes typical usage examples of the Python class scrapely.extraction.InstanceBasedLearningExtractor. If you are wondering what InstanceBasedLearningExtractor is for, how to use it, or where to find working examples, the curated class examples below should help.


The sections below present 15 code examples of the InstanceBasedLearningExtractor class, sorted by popularity by default.
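
Before diving into the examples, here is a minimal sketch of the workflow they all share: build (or load) annotated template pages, construct an InstanceBasedLearningExtractor from (template, descriptor) pairs, and call extract() on a target HtmlPage. The sample HTML and the 'name' field below are hypothetical; the API calls are the ones the examples themselves use.

    # Minimal usage sketch; the page body and field name are made up.
    import pprint

    from scrapely.htmlpage import HtmlPage
    from scrapely.template import TemplateMaker, best_match
    from scrapely.extraction import InstanceBasedLearningExtractor

    # Build an annotated template from a sample page (hypothetical HTML).
    train_page = HtmlPage(None, {}, u'<html><body><p>Name: Olivia</p></body></html>')
    tm = TemplateMaker(train_page)
    tm.annotate('name', best_match('Olivia'))
    template = tm.get_template()

    # Train on (template, descriptor) pairs; a descriptor of None means no
    # schema-based validation or per-field extractors are applied.
    ex = InstanceBasedLearningExtractor([(template, None)])

    # extract() returns an (items, template) pair; items is None when the
    # page matches no template, otherwise a list of attribute dicts.
    items, _ = ex.extract(train_page)
    pprint.pprint(items)  # e.g. [{u'name': [u'Name: Olivia']}]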

Example 1: test_extraction

    def test_extraction(self, name, templates, page, descriptor, expected_output):
        template_pages = [HtmlPage(None, {}, t) for t in templates]

        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))

        self.assertEqual(expected_output, actual_output and actual_output[0])
Author: 1060460048, Project: scrapely, Lines of code: 7, Source: test_extraction.py

Example 2: test_extractor_w_empty_string_extraction

 def test_extractor_w_empty_string_extraction(self):
     schema = {
         "id": "test",
         "properties": [
             ('gender', {
                 'description': '',
                 'optional': True,
                 'type': 'text',
                 'vary': False,
             }),
             ('name', {
                 'description': '',
                 'optional': False,
                 'type': 'text',
                 'vary': False,
             }),
         ],
     }
     descriptor = create_slybot_item_descriptor(schema)
     extractors = {
         1: {
             "_id": 2,
             "field_name": "gender",
             "regular_expression": "([0-9]+)"
         }
     }
     apply_extractors(descriptor, [1], extractors)
     
     ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)])
     self.assertEqual(ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
Author: alepharchives, Project: slybot, Lines of code: 30, Source: test_extractors.py

Example 3: test_annotate_multiple

 def test_annotate_multiple(self):
     tm = TemplateMaker(self.PAGE)
     tm.annotate('field1', best_match('text to annotate'), best_match=False)
     tpl = tm.get_template()
     ex = InstanceBasedLearningExtractor([(tpl, None)])
     self.assertEqual(ex.extract(self.PAGE)[0],
         [{u'field1': [u'Some text to annotate here', u'Another text to annotate there']}])
Author: scrapy, Project: scrapely, Lines of code: 7, Source: test_template.py

Example 4: test_annotate_ignore_unpaired

 def test_annotate_ignore_unpaired(self):
     tm = TemplateMaker(self.PAGE)
     tm.annotate('field1', best_match("and that's"), best_match=False)
     tpl = tm.get_template()
     ex = InstanceBasedLearningExtractor([(tpl, None)])
     self.assertEqual(ex.extract(self.PAGE)[0],
         [{u'field1': [u"More text with unpaired tag <img />and that's it"]}])
Author: scrapy, Project: scrapely, Lines of code: 7, Source: test_template.py

Example 5: _run_extraction

    def _run_extraction(self, name, templates, page, descriptor, expected_output):
        self.trace = None
        template_pages = [HtmlPage(None, {}, t) for t in templates]
        # extract with trace enabled in order to capture the extraction trace
        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True)
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if actual_output is not None:
            actual_output = actual_output[0]
            self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
        # extract again with trace disabled in order to get the clean output
        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if actual_output is None:
            if expected_output is None:
                return
            assert False, "failed to extract data for test '%s'" % name
        else:
            actual_output = actual_output[0]
        expected_names = set(expected_output.keys())
        actual_names = set(actual_output.keys())
        
        missing_in_output = filter(None, expected_names - actual_names)
        error = "attributes '%s' were expected but were not present in test '%s'" % \
                ("', '".join(missing_in_output), name)
        assert len(missing_in_output) == 0, error

        unexpected = actual_names - expected_names
        error = "unexpected attributes %s in test '%s'" % \
                (', '.join(unexpected), name)
        assert len(unexpected) == 0, error

        for k, v in expected_output.items():
            extracted = actual_output[k]
            assert v == extracted, "in test '%s' for attribute '%s', " \
                "expected value '%s' but got '%s'" % (name, k, v, extracted)
Author: alepharchives, Project: scrapely, Lines of code: 35, Source: test_extraction.py

Example 6: scrape

    def scrape(self, url=None, html=None, encoding='utf-8'): 
        ## not version from https://github.com/scrapy/scrapely/blob/master/scrapely/extraction/pageparsing.py
        ## may need to replace with version from inspect.getsourcelines(Scraper.scrape), as this version is

        page = self._get_page(url, encoding, html)
        ex = InstanceBasedLearningExtractor(self.templates)
        return ex.extract(page)[0]
Author: carriercomm, Project: scraperwiki-scraper-vault, Lines of code: 7, Source: scrapely-hack.py

Example 7: _run_extraction

    def _run_extraction(self, name, templates, page, extractors, expected_output):
        self.trace = None
        template_pages = [HtmlPage(None, {}, t) for t in templates]
        extractor = InstanceBasedLearningExtractor(template_pages, extractors, True)
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if not actual_output:
            if expected_output is None:
                return
            assert False, "failed to extract data for test '%s'" % name
        actual_output = actual_output[0]
        self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace', [])
        expected_names = set(expected_output.keys())
        actual_names = set(actual_output.keys())
        
        missing_in_output = filter(None, expected_names - actual_names)
        error = "attributes '%s' were expected but were not present in test '%s'" % \
                ("', '".join(missing_in_output), name)
        assert len(missing_in_output) == 0, error

        unexpected = actual_names - expected_names
        error = "unexpected attributes %s in test '%s'" % \
                (', '.join(unexpected), name)
        assert len(unexpected) == 0, error

        for k, v in expected_output.items():
            extracted = actual_output[k]
            assert v == extracted, "in test '%s' for attribute '%s', " \
                "expected value '%s' but got '%s'" % (name, k, v, extracted)
Author: esimionato, Project: scrapely, Lines of code: 28, Source: test_extraction.py

Example 8: test_extractor_w_empty_string_extraction

 def test_extractor_w_empty_string_extraction(self):
     schema = {
         'fields': {
             'gender': {
                 'required': False,
                 'type': 'text',
                 'vary': False,
             },
             'name': {
                 'required': True,
                 'type': 'text',
                 'vary': False,
             }
         }
     }
     descriptor = create_slybot_item_descriptor(schema)
     extractors = {
         1: {
             "regular_expression": "([0-9]+)"
         }
     }
     apply_extractors(descriptor, {"gender": [1]}, extractors)
     
     ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)])
     self.assertEqual(ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
Author: 1060460048, Project: portia, Lines of code: 25, Source: test_extractors.py

Example 9: test_type_extractor

 def test_type_extractor(self):
     schema = {
         "id": "test",
         "properties": [('gender', {
                 'description': '',
                 'optional': True,
                 'type': 'number',
                 'vary': False,
         })],
     }
     descriptor = create_slybot_item_descriptor(schema)
     extractors = {
         1: {
             "_id": 1,
             "field_name": "gender",
             "type_extractor": "text"
         },
         2: {
             "_id": 2,
             "field_name": "gender",
             "regular_expression": "Gender\\s+(Male|Female)"
         }
     }
     apply_extractors(descriptor, [1, 2], extractors)
     
     ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
     self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Author: alepharchives, Project: slybot, Lines of code: 27, Source: test_extractors.py

Example 10: test_annotate_multiple

 def test_annotate_multiple(self):
     tm = TemplateMaker(self.PAGE)
     tm.annotate("field1", best_match("text to annotate"), best_match=False)
     tpl = tm.get_template()
     ex = InstanceBasedLearningExtractor([tpl])
     self.assertEqual(
         ex.extract(self.PAGE)[0], [{u"field1": [u"Some text to annotate here", u"Another text to annotate there"]}]
     )
Author: netconstructor, Project: scrapely, Lines of code: 8, Source: test_template.py

Example 11: do_s

 def do_s(self, url):
     """s <url> - scrape url (uses encoding from templates)"""
     templates = self._load_templates()
     if assert_or_print(templates, "no templates available"):
         return
     page = get_page(url, templates[0].encoding)
     ex = InstanceBasedLearningExtractor(templates)
     pprint.pprint(ex.extract(page)[0])
Author: esimionato, Project: scrapely, Lines of code: 8, Source: tool.py

Example 12: do_s

 def do_s(self, url):
     """s <url> - scrape url"""
     templates = self._load_templates()
     if assert_or_print(templates, "no templates available"):
         return
     # fall back to the template encoding if none is specified
     page = url_to_page(url, default_encoding=templates[0].encoding)
     ex = InstanceBasedLearningExtractor((t, None) for t in templates)
     pprint.pprint(ex.extract(page)[0])
Author: 1060460048, Project: scrapely, Lines of code: 9, Source: tool.py

Example 13: do_s

 def do_s(self, line):
     """s <url> [--encoding ENCODING --useragent 'User-Agent'] - scrape url"""
     templates = self._load_templates()
     if assert_or_print(templates, "no templates available"):
         return
     opts, (url,) = parse_at_s(line)
     headers = {'User-Agent': opts.useragent or self.user_agent}
     url = urllib2.Request(url, headers=headers)
     # fall back to the template encoding if none is specified
     page = url_to_page(url, opts.encoding, templates[0].encoding)
     ex = InstanceBasedLearningExtractor((t, None) for t in templates)
     pprint.pprint(ex.extract(page)[0])
Author: boite, Project: scrapely, Lines of code: 12, Source: tool.py

Example 14: test_default_type_extractor

 def test_default_type_extractor(self):
     schema = {
         'fields': {}
     }
     descriptor = create_slybot_item_descriptor(schema)
     extractors = {
         1: {
             "regular_expression": "Gender\\s+(Male|Female)"
         }
     }
     apply_extractors(descriptor, {"gender": [1]}, extractors)
     
     ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
     self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Author: 1060460048, Project: portia, Lines of code: 14, Source: test_extractors.py

Example 15: setup_bot

    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', []), t.get('version', '0.12.0')]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {template.get('page_id'): template['scrapes']
                                 for template in spec.get('templates')}
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                           key=operator.itemgetter(2)),
                                    lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor(
                        [(page, scrapes['#default'])
                         for page, scrapes, version in group]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
Author: FrankieChan885, Project: portia, Lines of code: 60, Source: annotations.py


Note: The scrapely.extraction.InstanceBasedLearningExtractor class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by their respective developers; copyright of the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.