本文整理汇总了Python中scrapy.contrib.linkextractors.sgml.BaseSgmlLinkExtractor类的典型用法代码示例。如果您正苦于以下问题：Python BaseSgmlLinkExtractor类的具体用法？Python BaseSgmlLinkExtractor怎么用？Python BaseSgmlLinkExtractor使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了BaseSgmlLinkExtractor类的9个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_matches
def test_matches(self):
    """Check ``matches()`` on an extractor built with no arguments.

    Both sample URLs are expected to be reported as matching (``True``).
    """
    extractor = BaseSgmlLinkExtractor()
    sample_urls = (
        'http://lotsofstuff.com/stuff1/index',
        'http://evenmorestuff.com/uglystuff/index',
    )
    for sample in sample_urls:
        self.assertEqual(extractor.matches(sample), True)
示例2: test_extraction_encoding
def test_extraction_encoding(self):
    """Check extracted URLs and link text for three differently-declared responses.

    The ``linkextractor_noenc.html`` fixture is wrapped twice: once with a
    ``charset=utf-8`` Content-Type header and once with no header at all.
    The ``linkextractor_latin1.html`` fixture is wrapped without a header.
    Every response must produce the expected percent-encoded URLs and the
    expected decoded link text.
    """
    noenc_body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    with_header = HtmlResponse(
        url='http://example.com/utf8',
        body=noenc_body,
        headers={'Content-Type': ['text/html; charset=utf-8']},
    )
    without_header = HtmlResponse(url='http://example.com/noenc', body=noenc_body)

    latin1_body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    latin1 = HtmlResponse(url='http://example.com/latin1', body=latin1_body)

    extractor = BaseSgmlLinkExtractor()

    def check(response, pairs):
        # Build the expected Link objects from (url, text) pairs and compare.
        wanted = [Link(url=target, text=label) for target, label in pairs]
        self.assertEqual(extractor.extract_links(response), wanted)

    # The explicit-UTF-8 and the undeclared response share one expectation.
    for response in (with_header, without_header):
        check(response, [
            ('http://example.com/sample_%C3%B1.html', ''),
            ('http://example.com/sample_%E2%82%AC.html',
             'sample \xe2\x82\xac text'.decode('utf-8')),
        ])

    check(latin1, [
        ('http://example.com/sample_%F1.html', ''),
        ('http://example.com/sample_%E1.html',
         'sample \xe1 text'.decode('latin1')),
    ])
示例3: test_link_text_wrong_encoding
def test_link_text_wrong_encoding(self):
    """Check link text when the body does not fit the declared encoding.

    The anchor text contains a 0xED character while the response is
    declared as UTF-8; the extracted text is expected to hold U+FFFD in
    its place.
    """
    markup = """<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>"""
    resp = HtmlResponse("http://www.example.com", body=markup, encoding='utf-8')
    expected = [
        Link(url='http://www.example.com/item/12.html', text=u'Wrong: \ufffd'),
    ]
    self.assertEqual(BaseSgmlLinkExtractor().extract_links(resp), expected)
示例4: test_base_url
def test_base_url(self):
    """Check that relative links are resolved against the ``<base href>`` value.

    One extractor is run over three documents whose ``<base>`` element is,
    in turn, a full URL on another domain, an absolute path, and a
    scheme-less ``//host/path`` reference. Each document holds a single
    ``item/12.html`` anchor whose resolved URL is asserted.
    """
    lx = BaseSgmlLinkExtractor()  # default: tag=a, attr=href

    # (response URL, document body, expected resolved link URL)
    scenarios = (
        (
            "http://example.org/somepage/index.html",
            '<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />\n'
            '<body><p><a href="item/12.html">Item 12</a></p>\n'
            '</body></html>',
            'http://otherdomain.com/base/item/12.html',
        ),
        # base url is an absolute path and relative to host
        (
            "https://example.org/somepage/index.html",
            '<html><head><title>Page title<title><base href="/" />\n'
            '<body><p><a href="item/12.html">Item 12</a></p></body></html>',
            'https://example.org/item/12.html',
        ),
        # base url has no scheme
        (
            "https://example.org/somepage/index.html",
            '<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" />\n'
            '<body><p><a href="item/12.html">Item 12</a></p></body></html>',
            'https://noschemedomain.com/path/to/item/12.html',
        ),
    )

    for page_url, markup, resolved in scenarios:
        response = HtmlResponse(page_url, body=markup)
        self.assertEqual(lx.extract_links(response),
                         [Link(url=resolved, text='Item 12')])
示例5: test_extraction_encoding
def test_extraction_encoding(self):
    """Check link extraction against UTF-8, undeclared and Latin-1 fixtures.

    Two responses share the ``linkextractor_noenc.html`` body (one with a
    ``charset=utf-8`` header, one without); a third uses
    ``linkextractor_latin1.html``. The extracted links of each response
    are compared with the expected URL/text pairs.
    """
    raw_noenc = get_testdata("link_extractor", "linkextractor_noenc.html")
    declared_utf8 = HtmlResponse(
        url="http://example.com/utf8",
        body=raw_noenc,
        headers={"Content-Type": ["text/html; charset=utf-8"]},
    )
    undeclared = HtmlResponse(url="http://example.com/noenc", body=raw_noenc)
    raw_latin1 = get_testdata("link_extractor", "linkextractor_latin1.html")
    declared_nothing_latin1 = HtmlResponse(url="http://example.com/latin1", body=raw_latin1)

    extractor = BaseSgmlLinkExtractor()

    utf8_links = [
        Link(url="http://example.com/sample_%C3%B1.html", text=""),
        Link(url="http://example.com/sample_%E2%82%AC.html", text="sample \xe2\x82\xac text".decode("utf-8")),
    ]
    latin1_links = [
        Link(url="http://example.com/sample_%F1.html", text=""),
        Link(url="http://example.com/sample_%E1.html", text="sample \xe1 text".decode("latin1")),
    ]

    # Checked in the same order as the responses were created.
    checks = (
        (declared_utf8, utf8_links),
        (undeclared, utf8_links),
        (declared_nothing_latin1, latin1_links),
    )
    for response, expected in checks:
        self.assertEqual(extractor.extract_links(response), expected)
示例6: test_base_url
def test_base_url(self):
    """Check that a relative link is resolved against ``<base href>``.

    The page is served from example.org but declares a base URL on
    otherdomain.com; the single extracted link must use the declared base.
    """
    markup = (
        '<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />\n'
        '<body><p><a href="item/12.html">Item 12</a></p>\n'
        '</body></html>'
    )
    page = HtmlResponse("http://example.org/somepage/index.html", body=markup)
    extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
    found = extractor.extract_links(page)
    wanted = [Link(url="http://otherdomain.com/base/item/12.html", text="Item 12")]
    self.assertEqual(found, wanted)
示例7: _process_links
def _process_links(self, links):
    """Filter extracted links and rewrite the survivors to product-review URLs.

    Steps, in order:

    1. drop links whose URL fails ``_is_valid_url``;
    2. apply the allow/deny regex lists and the allow/deny domain lists,
       each only when it is non-empty;
    3. read the sixth ``/``-separated piece of every URL as the ASIN, skip
       identifiers rejected by ``self._ignore_identifier`` and point the
       remaining links at the product-reviews URL built from that ASIN;
    4. canonicalize the URLs when ``self.canonicalize`` is truthy;
    5. delegate to ``BaseSgmlLinkExtractor._process_links``.

    Links whose URL has fewer than six ``/``-separated pieces have nothing
    at the ASIN position and are skipped instead of raising ``IndexError``
    and aborting the whole batch.

    :param links: iterable of link objects with a writable ``url`` attribute.
    :return: the value returned by ``BaseSgmlLinkExtractor._process_links``
        for the rewritten links.
    """
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]

    # NOTE(review): the source this was recovered from had lost its
    # indentation; the URL rewrite and the append are assumed to belong
    # under the "identifier not ignored" branch -- confirm against the
    # original file.
    rewritten = []
    for link in links:
        pieces = link.url.split('/')
        if len(pieces) < 6:
            # No piece at index 5, so there is no ASIN to build a review URL from.
            continue
        asin = pieces[5]
        if self._ignore_identifier(asin):
            continue
        log.msg("Found ASIN: " + asin, level=log.DEBUG)
        link.url = "http://www.amazon.com/product-reviews/" + asin + "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0"
        rewritten.append(link)
    links = rewritten

    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)

    return BaseSgmlLinkExtractor._process_links(self, links)
示例8: test_basic
def test_basic(self):
    """Check default extraction on a page with four anchors and one image.

    The anchors use a relative path, two host-absolute paths and a
    parent-relative path; the expected result lists all four resolved
    against the response URL, in document order, with no entry for the
    ``<img>`` element.
    """
    page_markup = (
        '<html><head><title>Page title<title>\n'
        '<body><p><a href="item/12.html">Item 12</a></p>\n'
        '<p><a href="/about.html">About us</a></p>\n'
        '<img src="/logo.png" alt="Company logo (not a link)" />\n'
        '<p><a href="../othercat.html">Other category</a></p>\n'
        '<p><a href="/" /></p>\n'
        '</body></html>'
    )
    page = HtmlResponse("http://example.org/somepage/index.html", body=page_markup)
    extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
    wanted = [
        Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
        Link(url='http://example.org/about.html', text='About us'),
        Link(url='http://example.org/othercat.html', text='Other category'),
        Link(url='http://example.org/', text=''),
    ]
    self.assertEqual(extractor.extract_links(page), wanted)
示例9: _process_links
def _process_links(self, links):
    """Run ``links`` through the extractor's filters, then the base class.

    A link is dropped when ``self.check_url`` is truthy and its URL fails
    ``_is_valid_url``. The allow/deny regex lists and allow/deny domain
    lists are then applied, each only when non-empty. When
    ``self.canonicalize`` is truthy every remaining URL is replaced by
    ``canonicalize_url(url)``.

    :param links: iterable of link objects with a writable ``url`` attribute.
    :return: the value returned by ``BaseSgmlLinkExtractor._process_links``
        for the filtered links.
    """
    # Validation is skipped entirely for a link while check_url is falsy.
    selected = [
        candidate for candidate in links
        if not (self.check_url and not _is_valid_url(candidate.url))
    ]

    if self.allow_res:
        selected = [c for c in selected if _matches(c.url, self.allow_res)]
    if self.deny_res:
        selected = [c for c in selected if not _matches(c.url, self.deny_res)]

    if self.allow_domains:
        selected = [c for c in selected
                    if url_is_from_any_domain(c.url, self.allow_domains)]
    if self.deny_domains:
        selected = [c for c in selected
                    if not url_is_from_any_domain(c.url, self.deny_domains)]

    if self.canonicalize:
        for kept in selected:
            kept.url = canonicalize_url(kept.url)

    return BaseSgmlLinkExtractor._process_links(self, selected)