本文整理汇总了Python中slybot.linkextractor.create_linkextractor_from_specs函数的典型用法代码示例。如果您正苦于以下问题:Python create_linkextractor_from_specs函数的具体用法?Python create_linkextractor_from_specs怎么用?Python create_linkextractor_from_specs使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了create_linkextractor_from_specs函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_simple
def test_simple(self):
    """An ``html`` extractor built from specs yields one link with URL and text."""
    extractor = create_linkextractor_from_specs({"type": "html", "value": None})
    page = UTF8HtmlResponse(url='http://www.example.com/', body=html)
    found = list(extractor.links_to_follow(page))

    # Exactly one link is expected; check both of its attributes.
    self.assertEqual(len(found), 1)
    only_link = found[0]
    self.assertEqual(only_link.url, 'http://www.example.com/path')
    self.assertEqual(only_link.text, 'Click here')
示例2: test_custom_withargs
def test_custom_withargs(self):
    """A ``regex`` extractor restricted by ``allowed_schemes`` keeps only http links.

    The text contains three ``url: ...`` occurrences; only the first one is
    both matched by the pattern and uses the ``http`` scheme, so a single
    link is expected.
    """
    specs = {
        "type": "regex",
        # Raw string: "\w" inside a normal literal is an invalid escape
        # sequence (warned about by recent Python versions). The pattern
        # value itself is unchanged.
        "value": r'url: ((?:http|https)://www.example.com/[\w/]+)',
        'allowed_schemes': ['http'],
    }
    lextractor = create_linkextractor_from_specs(specs)
    text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
    response = UTF8TextResponse(url='http://www.example.com/', body=text)
    links = list(lextractor.links_to_follow(response))
    self.assertEqual(len(links), 1)
    self.assertEqual(links[0].url, 'http://www.example.com/path')
示例3: test_extra_params
def test_extra_params(self):
    """A ``column`` extractor accepts an extra ``delimiter`` key in its specs."""
    extractor = create_linkextractor_from_specs(
        {"type": "column", "value": 1, "delimiter": "|"}
    )
    feed = TextResponse(url="http://www.example.com/", body=csvfeed2)
    found = list(extractor.links_to_follow(feed))

    self.assertEqual(len(found), 2)
    first, second = found[0], found[1]
    self.assertEqual(first.url, "http://www.example.com/path")
    self.assertEqual(second.url, "http://www.example.com/path2")
示例4: test_header
def test_header(self):
    """A ``column`` extractor over the ``csvfeed3`` fixture yields two links."""
    extractor = create_linkextractor_from_specs({"type": "column", "value": 1})
    feed = UTF8TextResponse(url='http://www.example.com/', body=csvfeed3)
    found = list(extractor.links_to_follow(feed))

    self.assertEqual(len(found), 2)
    first, second = found[0], found[1]
    self.assertEqual(first.url, 'http://www.example.com/path')
    self.assertEqual(second.url, 'http://www.example.com/path2')
示例5: test_default
def test_default(self):
    """A ``regex`` extractor with an empty pattern finds both URLs in plain text.

    The second expected URL omits the ``#tre?`` suffix that follows it in
    the input text.
    """
    extractor = create_linkextractor_from_specs({"type": "regex", "value": ''})
    body_text = "Hello http://www.example.com/path, more text https://aws.amazon.com/product?id=23#tre?"
    page = UTF8TextResponse(url='http://www.example.com/', body=body_text)
    found = list(extractor.links_to_follow(page))

    self.assertEqual(len(found), 2)
    first, second = found[0], found[1]
    self.assertEqual(first.url, 'http://www.example.com/path')
    self.assertEqual(second.url, 'https://aws.amazon.com/product?id=23')
示例6: test_simple
def test_simple(self):
    """A ``pagination`` extractor yields the single link of an HTML page."""
    extractor = create_linkextractor_from_specs(
        {"type": "pagination", "value": None}
    )
    raw_response = HtmlResponse(url="http://www.example.com/", body=html)
    page = htmlpage_from_response(raw_response)
    # The test sets this header on the page before extraction.
    page.headers["n_items"] = 1
    found = list(extractor.links_to_follow(page))

    self.assertEqual(len(found), 1)
    only_link = found[0]
    self.assertEqual(only_link.url, "http://www.example.com/path")
    self.assertEqual(only_link.text, "Click here")
示例7: test_custom
def test_custom(self):
    """A ``regex`` extractor with a custom pattern matches both example.com URLs.

    The text contains three ``url: ...`` occurrences; the pattern only
    matches the two on ``www.example.com``, so two links are expected.
    """
    specs = {
        "type": "regex",
        # Raw string: "\w" inside a normal literal is an invalid escape
        # sequence (warned about by recent Python versions). The pattern
        # value itself is unchanged.
        "value": r"url: ((?:http|https)://www.example.com/[\w/]+)",
    }
    lextractor = create_linkextractor_from_specs(specs)
    text = "url: http://www.example.com/path, more text url: https://www.example.com/path2. And more text url: https://aws.amazon.com/product?id=23#tre"
    response = TextResponse(url="http://www.example.com/", body=text)
    links = list(lextractor.links_to_follow(response))
    self.assertEqual(len(links), 2)
    self.assertEqual(links[0].url, "http://www.example.com/path")
    self.assertEqual(links[1].url, "https://www.example.com/path2")
示例8: test_sitemap
def test_sitemap(self):
    """A ``sitemap`` extractor handles both the sitemap and sitemap-index fixtures."""
    extractor = create_linkextractor_from_specs({"type": "sitemap", "value": ""})

    # Fixture stored on the test case as ``self.sitemap``.
    sitemap_links = list(extractor.links_to_follow(self.sitemap))
    self.assertEqual(len(sitemap_links), 3)
    self.assertEqual(
        sitemap_links[0].url, 'http://www.accommodationforstudents.com/'
    )

    # Fixture stored on the test case as ``self.sitemapindex``.
    index_links = list(extractor.links_to_follow(self.sitemapindex))
    self.assertEqual(len(index_links), 1)
    self.assertEqual(index_links[0].url, 'http://www.example.com/sitemap1.xml.gz')
示例9: _create_start_request_from_specs
def _create_start_request_from_specs(self, info):
    """Build the start ``Request`` described by ``info``.

    ``info`` must contain a ``"url"`` key. When it also carries a truthy
    ``"link_extractor"`` entry, the returned request uses a callback that
    yields one follow-up ``Request`` per extracted link; otherwise the
    request goes straight to ``self.parse``.
    """
    start_url = info["url"]
    extractor_specs = info.get("link_extractor")
    if not extractor_specs:
        return Request(url=start_url, callback=self.parse)

    extractor = create_linkextractor_from_specs(extractor_specs)

    # NOTE(review): this callback takes (spider, response) but is handed to
    # Request unbound -- confirm that whatever invokes it supplies the
    # spider argument.
    def _callback(spider, response):
        for link in extractor.links_to_follow(response):
            yield Request(url=link.url, callback=spider.parse)

    return Request(url=start_url, callback=_callback)
示例10: handle_xml
def handle_xml(self, response, seen):
    """Yield the requests for links extracted from an XML ``response``.

    The extractor type is the content-type subtype up to the first ``+``.
    If ``create_linkextractor_from_specs`` raises ``ValueError`` for that
    type, an ``XmlLinkExtractor`` is used instead. Each link goes through
    ``self._filter_link(link, seen)`` and only truthy results are yielded.
    """
    subtype = content_type(response).subtype
    xml_kind = subtype.split('+')[0]
    extractor_specs = {'type': xml_kind, 'value': ''}
    try:
        extractor = create_linkextractor_from_specs(extractor_specs)
    except ValueError:
        extractor = XmlLinkExtractor()

    for link in extractor.links_to_follow(response):
        filtered = self._filter_link(link, seen)
        if not filtered:
            continue
        yield filtered
示例11: handle_xml
def handle_xml(self, response, seen):
    """Yield the requests for links extracted from an XML ``response``.

    The extractor type is the ``type`` group that ``XML_APPLICATION_TYPE``
    returns for the ``Content-Type`` header, or ``'xml'`` when that result
    is falsy. If ``create_linkextractor_from_specs`` raises ``ValueError``
    for that type, a ``SitemapLinkExtractor`` is used instead. Each link
    goes through ``self._filter_link(link, seen)`` and only truthy results
    are yielded.
    """
    header_value = response.headers.get('Content-Type', '')
    type_match = XML_APPLICATION_TYPE(header_value)
    if type_match:
        xml_kind = type_match.groupdict()['type']
    else:
        xml_kind = 'xml'

    extractor_specs = {'type': xml_kind, 'value': ''}
    try:
        extractor = create_linkextractor_from_specs(extractor_specs)
    except ValueError:
        extractor = SitemapLinkExtractor()

    for link in extractor.links_to_follow(response):
        filtered = self._filter_link(link, seen)
        if not filtered:
            continue
        yield filtered
示例12: test_start_urls
def test_start_urls(self):
    """A ``pagination`` extractor given ``start_urls`` yields the three page links."""
    extractor = create_linkextractor_from_specs({
        "type": "pagination",
        "value": None,
        "start_urls": ['http://www.spam.com/?p=1',
                       'http://www.eggs.com/?page=0'],
    })
    markup = (
        '\n'
        '<a href="http://www.spam.com/?p=100">Click here 1</a>\n'
        '<a href="http://www.spam.com/?p=200">Click here 2</a>\n'
        '<a href="http://www.spam.com/?p=300">Click here 3</a>\n'
    )
    raw_response = HtmlResponse(url='http://www.example.com/', body=markup)
    page = htmlpage_from_response(raw_response)

    found = list(extractor.links_to_follow(page))
    # Sort by URL so the assertions do not depend on extraction order.
    found.sort(key=lambda link: link.url)

    self.assertEqual(len(found), 3)
    expected_urls = (
        "http://www.spam.com/?p=100",
        "http://www.spam.com/?p=200",
        "http://www.spam.com/?p=300",
    )
    for position, expected_url in enumerate(expected_urls):
        self.assertEqual(found[position].url, expected_url)
    expected_texts = ('Click here 1', 'Click here 2', 'Click here 3')
    for position, expected_text in enumerate(expected_texts):
        self.assertEqual(found[position].text, expected_text)
示例13: test_xml_remove_namespaces
def test_xml_remove_namespaces(self):
    """An ``xpath`` extractor with ``remove_namespaces`` finds the atom fixture's links."""
    extractor = create_linkextractor_from_specs(
        {"type": "xpath", "value": "//link/@href", "remove_namespaces": True}
    )
    found = list(extractor.links_to_follow(self.atom))

    self.assertEqual(len(found), 3)
    first_link = found[0]
    self.assertEqual(first_link.url, 'http://example.org/feed/')
示例14: test_atom
def test_atom(self):
    """An ``atom`` extractor yields three links from the atom fixture."""
    extractor = create_linkextractor_from_specs({"type": "atom", "value": ""})
    found = list(extractor.links_to_follow(self.atom))

    self.assertEqual(len(found), 3)
    first_link = found[0]
    self.assertEqual(first_link.url, 'http://example.org/feed/')
示例15: test_xml
def test_xml(self):
    """An ``xpath`` extractor selecting ``//item/link/text()`` yields one link."""
    extractor = create_linkextractor_from_specs(
        {"type": "xpath", "value": "//item/link/text()"}
    )
    found = list(extractor.links_to_follow(self.response))

    self.assertEqual(len(found), 1)
    only_link = found[0]
    self.assertEqual(only_link.url, 'http://www.wikipedia.org/')