本文整理汇总了Python中wpull.scraper.HTMLScraper类的典型用法代码示例。如果您正苦于以下问题：Python HTMLScraper类的具体用法？Python HTMLScraper怎么用？Python HTMLScraper使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了HTMLScraper类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_javascript_heavy_inline_monstrosity
def test_javascript_heavy_inline_monstrosity(self):
    '''Scrape the ``twitchplayspokemonfirered.html`` sample page.

    The sample is copied into the response body, the page is scraped,
    and one known image URL must be among the inline URLs while one
    known redirect URL must be among the linked URLs.
    '''
    html_scraper = HTMLScraper()
    req = Request.new('http://example.com/')
    resp = Response('HTTP/1.0', 200, 'OK')
    sample_path = os.path.join(
        os.path.dirname(__file__),
        'testing', 'samples', 'twitchplayspokemonfirered.html'
    )

    # Copy the sample page into the response body file.
    with wpull.util.reset_file_offset(resp.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body.content_file)

    # Scrape only after the offset-reset context has exited.
    result = html_scraper.scrape(req, resp)
    found_inline = result['inline_urls']
    found_linked = result['linked_urls']

    expected_image = (
        'http://cdn.bulbagarden.net/upload/archive/a/a4/'
        '20090718115357%21195Quagsire.png'
    )
    expected_link = (
        'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
        'user%2FGoldenSandslash15&sa=D&sntz=1&'
        'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A'
    )

    self.assertIn(expected_image, found_inline)
    self.assertIn(expected_link, found_linked)
示例2: test_html_scraper_links_base_href
def test_html_scraper_links_base_href(self):
    '''Scrape the ``basehref.html`` sample and check the resolved URLs.

    Asserts the reported encoding is ``utf-8`` and that the inline and
    linked URL sets match the expected sets exactly.
    '''
    html_scraper = HTMLScraper()
    req = Request.new('http://example.com/')
    resp = Response('HTTP/1.0', 200, 'OK')
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'basehref.html'
    )

    # Copy the sample page into the response body file.
    with wpull.util.reset_file_offset(resp.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body.content_file)

    # Scrape only after the offset-reset context has exited.
    result = html_scraper.scrape(req, resp)
    found_inline = result['inline_urls']
    found_linked = result['linked_urls']

    expected_inline = {
        'http://cdn.example.com/stylesheet1.css',
        'http://www.example.com/stylesheet2.css',
        'http://example.com/a/stylesheet3.css',
        'http://example.com/a/dir/image1.png',
        'http://example.com/dir/image2.png',
        'http://example.net/image3.png',
        'http://example.com/dir/image4.png',
    }
    expected_linked = {'http://example.com/a/'}

    self.assertEqual('utf-8', result['encoding'])
    self.assertEqual(expected_inline, found_inline)
    self.assertEqual(expected_linked, found_linked)
示例3: test_rss_as_html
def test_rss_as_html(self):
    '''Scrape the ``rss.xml`` sample served as ``application/rss+xml``.

    Asserts the scrape result is truthy, that no inline URLs are
    reported, and that the linked URLs match the expected set.
    '''
    html_scraper = HTMLScraper()
    req = Request.new('http://example.com/')
    resp = Response('HTTP/1.0', 200, '')
    resp.fields['content-type'] = 'application/rss+xml'
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'rss.xml'
    )

    # Copy the sample feed into the response body file.
    with wpull.util.reset_file_offset(resp.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body.content_file)

    # Scrape only after the offset-reset context has exited.
    result = html_scraper.scrape(req, resp)

    self.assertTrue(result)

    found_inline = result['inline_urls']
    found_linked = result['linked_urls']
    expected_linked = {
        'http://www.someexamplerssdomain.com/main.html',
        'http://www.wikipedia.org/'
    }

    self.assertFalse(found_inline)
    self.assertEqual(expected_linked, found_linked)
示例4: test_xhtml_invalid
def test_xhtml_invalid(self):
    '''Scrape the ``xhtml_invalid.html`` sample and check its URLs.

    Asserts that the inline and linked URL sets match the expected
    sets exactly.
    '''
    html_scraper = HTMLScraper()
    req = Request.new('http://example.com/')
    resp = Response('HTTP/1.0', 200, '')
    sample_path = os.path.join(
        os.path.dirname(__file__),
        'testing', 'samples', 'xhtml_invalid.html'
    )

    # Copy the sample page into the response body file.
    with wpull.util.reset_file_offset(resp.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body.content_file)

    # Scrape only after the offset-reset context has exited.
    result = html_scraper.scrape(req, resp)
    found_inline = result['inline_urls']
    found_linked = result['linked_urls']

    expected_inline = {
        'http://example.com/image.png',
        'http://example.com/script.js',
    }
    expected_linked = {
        'http://example.com/link'
    }

    self.assertEqual(expected_inline, found_inline)
    self.assertEqual(expected_linked, found_linked)
示例5: test_html_soup
def test_html_soup(self):
    '''Scrape the ``soup.html`` sample with a ``Refresh: yes`` header.

    Asserts that the inline and linked URL sets match the expected
    sets exactly.
    '''
    html_scraper = HTMLScraper()
    req = Request.new('http://example.com/')
    resp = Response('HTTP/1.0', 200, '')
    resp.fields['Refresh'] = 'yes'
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'soup.html'
    )

    # Copy the sample page into the response body file.
    with wpull.util.reset_file_offset(resp.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body.content_file)

    # Scrape only after the offset-reset context has exited.
    result = html_scraper.scrape(req, resp)
    found_inline = result['inline_urls']
    found_linked = result['linked_urls']

    expected_inline = {'http://example.com/ABOUTM~1.JPG'}
    expected_linked = {
        'http://example.com/BLOG',
        'http://example.com/web ring/Join.htm',
    }

    self.assertEqual(expected_inline, found_inline)
    self.assertEqual(expected_linked, found_linked)
示例6: test_html_krokozyabry
def test_html_krokozyabry(self):
    '''Scrape the ``krokozyabry.html`` sample declared as KOI8-R.

    Asserts the reported encoding is ``koi8-r``, that the inline URL
    set is empty, and that the single linked URL contains the
    expected Cyrillic path.
    '''
    html_scraper = HTMLScraper()
    req = Request.new('http://example.com/')
    resp = Response('HTTP/1.0', 200, '')
    resp.fields['content-type'] = 'text/html; charset=KOI8-R'
    sample_path = os.path.join(
        os.path.dirname(__file__),
        'testing', 'samples', 'krokozyabry.html'
    )

    # Copy the sample page into the response body file.
    with wpull.util.reset_file_offset(resp.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body.content_file)

    # Scrape only after the offset-reset context has exited.
    result = html_scraper.scrape(req, resp)
    found_inline = result['inline_urls']
    found_linked = result['linked_urls']

    expected_linked = {'http://example.com/Кракозябры'}

    self.assertEqual('koi8-r', result['encoding'])
    self.assertEqual(set(), found_inline)
    self.assertEqual(expected_linked, found_linked)
示例7: test_html_serious_bad_encoding
def test_html_serious_bad_encoding(self):
    '''Scrape ``xkcd_1_evil.html`` with the encoding forced to ``utf8``.

    The scraper is built with ``encoding_override='utf8'`` and the
    response declares ``charset=utf8``; the test only asserts that the
    scrape result is truthy.
    '''
    html_scraper = HTMLScraper(encoding_override='utf8')
    req = Request.new('http://example.com/')
    resp = Response('HTTP/1.0', 200, '')
    resp.fields['content-type'] = 'text/html; charset=utf8'
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'xkcd_1_evil.html'
    )

    # Copy the sample page into the response body file.
    with wpull.util.reset_file_offset(resp.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body.content_file)

    # Scrape only after the offset-reset context has exited.
    result = html_scraper.scrape(req, resp)

    self.assertTrue(result)
示例8: test_html_encoding_lxml_name_mismatch
def test_html_encoding_lxml_name_mismatch(self):
    '''It should accept encoding names with underscore.

    The response declares ``charset=EUC_KR`` and its body holds one
    EUC-KR encoded character; the scrape result must be truthy and
    report the encoding as ``euc_kr``.
    '''
    html_scraper = HTMLScraper()
    req = Request.new('http://example.com/')
    resp = Response('HTTP/1.0', 200, '')
    resp.fields['content-type'] = 'text/html; charset=EUC_KR'
    payload = '힖'.encode('euc_kr')

    # Write the encoded character into the response body file.
    with wpull.util.reset_file_offset(resp.body.content_file):
        resp.body.content_file.write(payload)

    # Scrape only after the offset-reset context has exited.
    result = html_scraper.scrape(req, resp)

    self.assertTrue(result)
    self.assertEqual('euc_kr', result['encoding'])
示例9: test_html_garbage
def test_html_garbage(self):
    '''Scrape a body of arbitrary binary bytes labelled ``text/html``.

    The test only asserts that the scrape result is truthy.
    '''
    html_scraper = HTMLScraper()
    req = Request.new('http://example.com/')
    resp = Response('HTTP/1.0', 200, '')
    resp.fields['content-type'] = 'text/html'
    garbage = (
        b'\x01\x00\x01\x00l~Z\xff\x0f`y\x80\x00p<\x7f'
        b'\xffndo\xff\xff-\x83{d\xec</\xfe\x80\x00\xb4Bo'
        b'\x7f\xff\xff\xffV\xc1\xff\x7f\xff7'
    )

    # Write the binary payload into the response body file.
    with wpull.util.reset_file_offset(resp.body.content_file):
        resp.body.content_file.write(garbage)

    # Scrape only after the offset-reset context has exited.
    result = html_scraper.scrape(req, resp)

    self.assertTrue(result)
示例10: _read_input_file_as_html
def _read_input_file_as_html(self):
    '''Read input file as HTML and return the links.

    The file named by ``self._args.input_file`` is scraped using
    ``self._args.local_encoding``, falling back to ``'utf-8'`` when that
    value is falsy.

    Returns:
        An ``itertools.chain`` iterator yielding the inline URLs
        followed by the linked URLs.
    '''
    input_file = self._args.input_file
    encoding = self._args.local_encoding or 'utf-8'
    result = HTMLScraper.scrape_file(input_file, encoding=encoding)

    return itertools.chain(result['inline_urls'], result['linked_urls'])
示例11: test_html_wrong_charset
def test_html_wrong_charset(self):
    '''Scrape the ``kcna.html`` sample and check encoding and URLs.

    No content-type header is set on the response. Asserts that the
    reported encoding is ``utf-16-le`` and that the inline and linked
    URL sets match the expected sets exactly.
    '''
    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')

    # Copy the sample page into the response body file.
    with wpull.util.reset_file_offset(response.body.content_file):
        html_file_path = os.path.join(os.path.dirname(__file__),
                                      'testing', 'samples', 'kcna.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body.content_file)

    # Scrape only after the offset-reset context has exited.
    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual('utf-16-le', scrape_info['encoding'])

    # Each expected URL is listed once; a repeated element in a set
    # literal is silently collapsed and only hides copy/paste mistakes.
    self.assertEqual(
        {
            'http://example.com/utm/__utm.js',
            'http://example.com/Knewskage.gif',
            'http://example.com/Lline.gif',
            'http://example.com/Sline.gif',
            'http://example.com/korean01.gif',
            'http://example.com/korean02.gif',
            'http://example.com/english01.gif',
            'http://example.com/english02.gif',
            'http://example.com/Tongsinkage.gif',
        },
        inline_urls
    )
    self.assertEqual(
        {
            'http://example.com/index-k.htm',
            'http://example.com/index-e.htm',
        },
        linked_urls
    )
示例12: test_html_not_quite_charset
def test_html_not_quite_charset(self):
    '''Scrape the ``videogame_top.htm`` sample and check two known URLs.

    One image URL must be among the inline URLs and one external URL
    must be among the linked URLs.
    '''
    html_scraper = HTMLScraper()
    req = Request.new('http://example.com/')
    resp = Response('HTTP/1.0', 200, '')
    sample_path = os.path.join(
        os.path.dirname(__file__),
        'testing', 'samples', 'videogame_top.htm'
    )

    # Copy the sample page into the response body file.
    with wpull.util.reset_file_offset(resp.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body.content_file)

    # Scrape only after the offset-reset context has exited.
    result = html_scraper.scrape(req, resp)
    found_inline = result['inline_urls']
    found_linked = result['linked_urls']

    expected_image = 'http://example.com/copyright_2001_2006_rtype.gif'
    expected_link = 'http://www.geocities.jp/gamehouse_grindcrusher/'

    self.assertIn(expected_image, found_inline)
    self.assertIn(expected_link, found_linked)
示例13: convert_by_record
def convert_by_record(self, url_record):
    '''Convert using given URL Record.

    The file named by ``url_record.filename`` is converted into a
    temporary sibling file (``<filename>-new``) which then replaces the
    original. Nothing happens when the file does not exist or when the
    record carries a link type other than ``'css'`` or ``'html'``. When
    the record has no link type, the type is detected with
    ``HTMLScraper.is_supported`` and then ``CSSScraper.is_supported``.

    Args:
        url_record: Record providing ``filename``, ``link_type``,
            ``url`` and ``url_info``.

    Raises:
        Exception: If the record has no link type and neither scraper
            reports the file as supported.
    '''
    filename = url_record.filename

    if not os.path.exists(filename):
        return

    if url_record.link_type:
        if url_record.link_type not in ('css', 'html'):
            return
        else:
            link_type = url_record.link_type
    else:
        with open(filename, 'rb') as in_file:
            if HTMLScraper.is_supported(
                    file=in_file, url_info=url_record.url_info):
                link_type = 'html'
            elif CSSScraper.is_supported(
                    file=in_file, url_info=url_record.url_info):
                link_type = 'css'
            else:
                link_type = None

    _logger.info(__(
        _('Converting links in file ‘{filename}’ (type={type}).'),
        filename=filename, type=link_type
    ))

    if self._backup_enabled:
        shutil.copy2(filename, filename + '.orig')

    temp_filename = filename + '-new'

    if link_type == 'css':
        self._css_converter.convert(
            filename, temp_filename, base_url=url_record.url)
    elif link_type == 'html':
        self._html_converter.convert(
            filename, temp_filename, base_url=url_record.url)
    else:
        raise Exception('Unknown link type.')

    # Swap the converted file in with a single call instead of
    # remove-then-rename, so there is no window in which ``filename``
    # is missing if the process is interrupted between the two steps.
    os.replace(temp_filename, filename)
示例14: convert_by_record
def convert_by_record(self, url_record):
    '''Convert using given URL Record.

    The local path is obtained from ``self._path_namer`` for the parsed
    record URL. Nothing happens when that file does not exist or when
    the record carries a link type other than ``'css'`` or ``'html'``.
    When the record has no link type, the type is detected with
    ``HTMLScraper.is_supported`` and then ``CSSScraper.is_supported``;
    if neither matches, the file is logged (and backed up when backups
    are enabled) but not converted.
    '''
    target_path = self._path_namer.get_filename(
        URLInfo.parse(url_record.url)
    )

    if not os.path.exists(target_path):
        return

    recorded_type = url_record.link_type

    if recorded_type:
        if recorded_type not in ('css', 'html'):
            return
        link_type = recorded_type
    else:
        with open(target_path, 'rb') as probe_file:
            if HTMLScraper.is_supported(
                    probe_file, url_info=url_record.url_info):
                link_type = 'html'
            elif CSSScraper.is_supported(
                    probe_file, url_info=url_record.url_info):
                link_type = 'css'
            else:
                link_type = None

    message = _('Converting links in file ‘{filename}’ (type={type}).')\
        .format(filename=target_path, type=link_type)
    _logger.info(message)

    if self._backup_enabled:
        shutil.copy2(target_path, target_path + '.orig')

    if link_type == 'css':
        converter = self._css_converter
    elif link_type == 'html':
        converter = self._html_converter
    else:
        # Undetected type: nothing to convert.
        return

    # NOTE(review): the same path is passed as both input and output;
    # confirm the converters support in-place conversion.
    converter.convert(target_path, target_path, base_url=url_record.url)
示例15: test_html_scraper_links
def test_html_scraper_links(self):
    '''Scrape the ``many_urls.html`` sample and check every URL found.

    The response carries a ``Refresh`` header pointing at
    ``header_refresh.html``. Asserts the reported encoding is
    ``ascii``, that the inline and linked URL sets match the expected
    sets exactly, and that every returned URL is a ``str``.
    '''
    html_scraper = HTMLScraper()
    req = Request.new('http://example.com/')
    resp = Response('HTTP/1.0', 200, 'OK')
    resp.fields['Refresh'] = '3; url=header_refresh.html'
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'many_urls.html'
    )

    # Copy the sample page into the response body file.
    with wpull.util.reset_file_offset(resp.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body.content_file)

    # Scrape only after the offset-reset context has exited.
    result = html_scraper.scrape(req, resp)
    found_inline = result['inline_urls']
    found_linked = result['linked_urls']

    expected_inline = {
        'http://example.com/style_import_url.css',
        'http://example.com/style_import_quote_url.css',
        'http://example.com/style_single_quote_import.css',
        'http://example.com/style_double_quote_import.css',
        'http://example.com/link_href.css',
        'http://example.com/script.js',
        'http://example.com/body_background.png',
        'http://example.com/images/table_background.png',
        'http://example.com/images/td_background.png',
        'http://example.com/images/th_background.png',
        'http://example.com/style_url1.png',
        'http://example.com/style_url2.png',
        'http://example.com/applet/',  # returned by lxml
        'http://example.com/applet/applet_code.class',
        'http://example.com/applet/applet_src.class',
        'http://example.com/bgsound.mid',
        'http://example.com/audio_src.wav',
        'http://example.net/source_src.wav',
        'http://example.com/embed_src.mov',
        'http://example.com/fig_src.png',
        'http://example.com/frame_src.html',
        'http://example.com/iframe_src.html',
        'http://example.com/img_href.png',
        'http://example.com/img_lowsrc.png',
        'http://example.com/img_src.png',
        'http://example.com/img_data.png',
        'http://example.com/input_src.png',
        'http://example.com/layer_src.png',
        'http://example.com/object/',  # returned by lxml
        'http://example.com/object/object_data.swf',
        'http://example.com/object/object_archive.dat',
        'http://example.com/param_ref_value.php',
        'http://example.com/overlay_src.html',
        'http://example.com/script_variable.png',
    }
    expected_linked = {
        'http://example.net/soup.html',
        'http://example.com/a_href.html',
        'http://example.com/area_href.html',
        'http://example.com/frame_src.html',
        'http://example.com/embed_href.html',
        'http://example.com/embed_src.mov',
        'http://example.com/form_action.html',
        'http://example.com/iframe_src.html',
        'http://example.com/layer_src.png',
        'http://example.com/overlay_src.html',
        'ftp://ftp.protocol.invalid/',
        # NOTE(review): '[email protected]' looks like e-mail
        # obfuscation residue rather than a real address; confirm the
        # expected value against many_urls.html.
        'mailto:[email protected]',
        'http://a-double-slash.example',
        'http://example.com/header_refresh.html',
        'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
        'http://example.com/document_write.html',
        'http://example.com/http_document_write.html',
        'http://example.com/http_document_write2.html',
        'http://example.com/http document write.html',
        'http://example.com/script_variable.html',
        'http://example.com/http_script_variable.html',
        'https://example.com/https_script_variable.html',
        'ftp://example.com/ftp_script_variable.html',
        'http://example.com/end_dir_script_variable/',
        'http://example.com/start_dir_script_variable',
        'http://example.com/../relative_dir_script_variable',
        'http://example.com/script_json.html',
        'http://example.com/http_script_json.html?a=b',
        'http://example.com/a_javascript_link.html',
        'http://example.com/a_onclick_link.html',
    }

    self.assertEqual('ascii', result['encoding'])
    self.assertEqual(expected_inline, found_inline)
    self.assertEqual(expected_linked, found_linked)

    # Every scraped URL, inline or linked, must be a text string.
    for found_url in found_inline | found_linked:
        self.assertIsInstance(found_url, str)