This article collects typical usage examples of the link_finder.LinkFinder.page_links method in Python. If you have been wondering what LinkFinder.page_links does, how to call it, or where to find it used in practice, the curated examples below should help. You can also read further about the enclosing class, link_finder.LinkFinder.
The following 15 code examples of the LinkFinder.page_links method are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
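All 15 examples follow the same pattern: construct a LinkFinder with the crawl's base URL and the current page URL, feed() it the page's HTML, then collect the discovered links with page_links(). The source of LinkFinder itself is not shown on this page; the minimal sketch below is a plausible reconstruction based only on how the examples call it (the HTMLParser base class, the urljoin resolution of relative hrefs, and the set-valued return are assumptions, not the original code).

from html.parser import HTMLParser
from urllib.parse import urljoin


class LinkFinder(HTMLParser):
    """Hypothetical reconstruction of link_finder.LinkFinder."""

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        # Collect the href of every <a> tag, resolving relative URLs
        # against the base URL.
        if tag == 'a':
            for attribute, value in attrs:
                if attribute == 'href' and value:
                    self.links.add(urljoin(self.base_url, value))

    def page_links(self):
        return self.links

For example, feeding '<a href="/about">About</a>' to a LinkFinder built with base URL 'https://example.com' would make page_links() return {'https://example.com/about'}.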
Example 1: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page_url):
    html_string = ''
    try:
        print("urlopen(" + page_url + Spider.suffix + ")")
        response = urlopen(page_url + Spider.suffix)
        # if response.getheader('Content-Type') == 'text/html':
        html_bytes = response.read()
        html_string = html_bytes.decode("utf-8")
        print('page_url = ' + page_url)
        urlElems = page_url.split('/')
        fileName = Spider.project_name + '/' + urlElems[-1] + '.html'
        print("save to " + fileName)
        with open(fileName, 'w') as f:
            f.write(html_string)
        # else:
        #     print('Failed to get Content-Type')
        finder = LinkFinder(Spider.base_url, page_url, Spider.ahref_class)
        finder.feed(html_string)
        converter = HTMLToTXTConverter()
        converter.feed(html_string)
        fileName = Spider.project_name + '/' + urlElems[-1] + '.txt'
        print("save to " + fileName)
        with open(fileName, 'w') as f:
            f.write(converter.getText())
    except:
        e = sys.exc_info()[0]
        print(e)
        print('Error: can not crawl page')
        return set()
    return finder.page_links()
Example 2: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page_url):
    try:
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(CustomConnection.URL(page_url))
    except:
        return set()
    return finder.page_links()
Example 3: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page_url):
    html_string = ""
    try:
        response = urlopen(page_url)
        if "text/html" in response.getheader("Content-Type"):
            zipped_html_bytes = response.read()
            if Spider.html_gzipped:
                try:
                    html_bytes = gzip.decompress(zipped_html_bytes)
                except IOError:
                    Spider.html_gzipped = False
                    html_bytes = zipped_html_bytes
            else:
                html_bytes = zipped_html_bytes
            try:
                html_string = html_bytes.decode("utf-8")
            except UnicodeDecodeError:
                try:
                    html_string = html_bytes.decode("gbk")
                except Exception as e:
                    print(e)
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(e)
        print("Error: can not crawl page.")
        return set()
    response.close()
    return finder.page_links()
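Example 3 detects gzip compression by attempting decompression and falling back when it fails. An alternative, sketched below, is to inspect the Content-Encoding response header instead; this is a suggested variation under the assumption that the server sets the header correctly, not code from the example above.

import gzip
from urllib.request import urlopen

page_url = 'https://example.com/'  # placeholder URL
response = urlopen(page_url)
raw = response.read()
# Only decompress when the server declares gzip encoding.
if response.getheader('Content-Encoding') == 'gzip':
    raw = gzip.decompress(raw)
html_string = raw.decode('utf-8', errors='replace')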
Example 4: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_string = response.read().decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print('Error: can not crawl page | ', e)
        return set()
    return finder.page_links()
Example 5: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page_url):
    html_string = ''
    try:
        response = requests.get(page_url)
        if 'text/html' in response.headers['Content-Type']:
            html_string = str(response.content)
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(e)
        print('Error: can not crawl page')
        return set()
    return finder.page_links()
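One caveat on Example 5: str(response.content) produces the printable repr of a bytes object (a string beginning with b' in which newlines appear as literal \n), which HTMLParser can usually still scan for href attributes but which is not properly decoded HTML. Since the example already uses requests, the decoded text is available directly; the variant below is a suggested correction, not part of the original example.

import requests

response = requests.get('https://example.com/')  # placeholder URL
html_string = response.text  # decoded str, using the charset requests detects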
Example 6: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.getheader('Content-type') == 'text/html; charset=utf-8':
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: can not crawl page')
        return set()
    return finder.page_links()
Example 7: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
Example 8: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page__url):
    html_string = ""
    try:
        response = urlopen(page__url)
        if response.getheader("Content-Type") == "text/html":
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page__url)
        finder.feed(html_string)
    except:
        print("Error: cannot crawl page")
        return set()
    return finder.page_links()
Example 9: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print("Error: can't crawl page")
        return set()
    return finder.page_links()
Example 10: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page_url):
    html_str = ''
    try:
        request = Request(page_url, headers=Spider.headers)
        response = urlopen(request)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_str = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_str)
    except:
        print('Cannot access ' + page_url)
        return set()
    return finder.page_links()
Example 11: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page_url):
    html_string = ''
    try:
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        # Pass the dict via the headers= keyword argument; the second
        # positional argument of requests.get() is params, not headers.
        response = requests.get(page_url, headers=header)
        content_type = response.headers['Content-Type']
        if content_type == 'text/html; charset=utf-8':
            html_string = response.text
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: can not crawl page')
        return set()
    return finder.page_links()
Example 12: gather_links
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        # getheader() matches header names case-insensitively; both branches
        # decode the body as UTF-8.
        if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode(encoding='utf-8')
        elif response.getheader('Content-Type') == 'text/html;charset=utf-8':
            html_bytes = response.read()
            html_string = html_bytes.decode(encoding='utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print('\nException : ' + str(e) + '\n')
        return set()
    return finder.page_links()
Example 13: gather_link
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_link(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        # Convert the bytes returned by urlopen into human-readable text,
        # but only for HTML responses.
        if response.info()['Content-type'] == 'text/html' or \
                response.info()['Content-type'] == 'text/html; charset=utf-8' or \
                response.info()['Content-type'] == 'text/html; charset=UTF-8':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(PySpider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: can not crawl page')
        return set()
    return finder.page_links()
Example 14: gather_link
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_link(page_url):
    # Go to the website, get the byte data, convert it to a string,
    # pass it through to LinkFinder, and find all the links.
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
Example 15: gather_link
# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
def gather_link(page_url):
    html_string = ''
    # urlopen returns byte data, which we have to turn into a readable string.
    try:
        response = urlopen(page_url)
        # Make sure it is HTML data (in case we crawl a PDF file).
        if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error: Cannot crawl page')
        # Return an empty set if we cannot crawl the link.
        return set()
    return finder.page_links()
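Taken together, the examples suggest a few practices worth combining: test the Content-Type header with a substring check rather than an exact match (Examples 3, 5, 7, 9 and 10), fall back to another charset when UTF-8 decoding fails (Example 3), send explicit request headers (Examples 10 and 11), and catch Exception rather than using a bare except so errors stay visible (Examples 4, 5 and 7). The consolidated version below is a sketch of those practices, not code from any one example; unlike the examples above, it takes base_url as a parameter instead of reading it from a Spider class attribute.

from urllib.request import urlopen, Request
from link_finder import LinkFinder


def gather_links(base_url, page_url):
    html_string = ''
    try:
        # Send a User-Agent, since some servers reject the default one.
        request = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
        response = urlopen(request)
        content_type = response.getheader('Content-Type') or ''
        # Substring test: matches 'text/html' with or without a charset suffix.
        if 'text/html' in content_type:
            html_bytes = response.read()
            try:
                html_string = html_bytes.decode('utf-8')
            except UnicodeDecodeError:
                # Fall back to a common alternative charset, as in Example 3.
                html_string = html_bytes.decode('gbk', errors='replace')
        finder = LinkFinder(base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print('Error: cannot crawl page', page_url, '|', e)
        return set()
    return finder.page_links()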