

Python LinkFinder.page_links Method Code Examples

This article collects typical usage examples of the Python method link_finder.LinkFinder.page_links. If you are wondering how to use LinkFinder.page_links, or what it looks like in practice, the curated code examples below may help. You can also explore further usage examples of the containing class, link_finder.LinkFinder.


15 code examples of LinkFinder.page_links are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
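The LinkFinder class itself is not shown on this page. As orientation for the examples below, here is a minimal sketch of what it typically looks like in these crawler projects: a subclass of html.parser.HTMLParser that collects the href attribute of every <a> tag, resolves it against the page URL, and returns the collected set from page_links(). The two-argument constructor matches most examples below; the attribute names are assumptions, not code from any of the listed projects.

from html.parser import HTMLParser
from urllib.parse import urljoin

class LinkFinder(HTMLParser):
    # Hypothetical reconstruction of the class the examples import
    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attribute, value in attrs:
                if attribute == 'href' and value:
                    # Resolve relative links against the current page URL
                    self.links.add(urljoin(self.page_url, value))

    def page_links(self):
        return self.links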

Example 1: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
    def gather_links(page_url):
        html_string = ''
        try:
            print("urlopen("+page_url+Spider.suffix+")")
            response = urlopen(page_url+Spider.suffix)
            #if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            print('page_url = '+page_url)
            urlElems = page_url.split('/')
            fileName = Spider.project_name +'/'+urlElems[-1]+'.html'
            print("save to "+fileName)
            with open(fileName, 'w') as f:
                f.write(html_string)
            #else:
            #    print('Failed to get Content-Type')
            finder = LinkFinder(Spider.base_url, page_url, Spider.ahref_class)
            finder.feed(html_string)

            converter = HTMLToTXTConverter()
            converter.feed(html_string)
            fileName = Spider.project_name +'/'+urlElems[-1]+'.txt'
            print("save to "+fileName)
            with open(fileName, 'w') as f:
                f.write(converter.getText())

        except Exception as e:
            print(e)
            print('Error: cannot crawl page')
            return set()
        return finder.page_links()
Developer ID: hbdhj, Project: python, Lines of code: 34, Source file: spider.py
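For context, these gather_links variants are static methods on a Spider class and are usually called from a crawl step that merges the returned links into a work queue. The following calling pattern is a hypothetical sketch (the crawl_page name and the set-based queue and crawled collections are assumptions), showing why returning an empty set on failure keeps the caller simple:

    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(thread_name + ' now crawling ' + page_url)
            # An empty set on failure means no special error handling here
            links = Spider.gather_links(page_url)
            Spider.queue.update(links - Spider.crawled)
            Spider.queue.discard(page_url)
            Spider.crawled.add(page_url)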

Example 2: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
 def gather_links(page_url):
     try:
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(CustomConnection.URL(page_url))
     except Exception:
         return set()
     return finder.page_links()
Developer ID: tutu86, Project: Spider, Lines of code: 9, Source file: spider.py

Example 3: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
    def gather_links(page_url):
        html_string = ""
        try:
            response = urlopen(page_url)

            if "text/html" in response.getheader("content-Type"):
                zipped_html_bytes = response.read()
                if Spider.html_gzipped:
                    try:
                        html_bytes = gzip.decompress(zipped_html_bytes)
                    except IOError:
                        Spider.html_gzipped = False
                        html_bytes = zipped_html_bytes
                else:
                    html_bytes = zipped_html_bytes
                try:
                    html_string = html_bytes.decode("utf-8")
                except UnicodeDecodeError:
                    try:
                        html_string = html_bytes.decode("gbk")
                    except Exception as e:
                        print(e)
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(e)
            print("Error: can not craw page.")
            return set()
        response.close()
        return finder.page_links()
Developer ID: safetychinese, Project: link_crawler, Lines of code: 32, Source file: spider.py
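Example 3 decides whether to gunzip via a module-level flag and a failed-decompression fallback. A more direct alternative, sketched below as an assumption about server behaviour rather than code from the project, is to check the Content-Encoding response header:

import gzip
from urllib.request import urlopen

def read_html(page_url):
    response = urlopen(page_url)
    body = response.read()
    # Decompress only when the server actually declares gzip encoding
    if response.getheader('Content-Encoding') == 'gzip':
        body = gzip.decompress(body)
    # Fall back to GBK for pages that are not valid UTF-8, as Example 3 does
    try:
        return body.decode('utf-8')
    except UnicodeDecodeError:
        return body.decode('gbk')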

Example 4: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if 'text/html' in response.getheader('Content-Type'):
             html_string = response.read().decode('utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print('Error: cannot crawl page |', e)
         return set()
     return finder.page_links()
Developer ID: suqingdong, Project: Sources, Lines of code: 14, Source file: spider.py

Example 5: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
 def gather_links(page_url):
     html_string = ''
     try:
         response = requests.get(page_url)
         if 'text/html' in response.headers['Content-Type']:
             # response.text decodes the body using the server-declared charset
             html_string = response.text
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print(e)
         print('Error: can not crawl page')
         return set()
     return finder.page_links()
Developer ID: andreisid, Project: python, Lines of code: 15, Source file: spider.py
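When fetching with requests as in Example 5, it is also worth passing a timeout so one slow page cannot stall the crawler, and letting raise_for_status() surface HTTP errors. A small sketch (the timeout value is illustrative):

import requests

def fetch_html(page_url):
    try:
        response = requests.get(page_url, timeout=10)
        response.raise_for_status()
        if 'text/html' in response.headers.get('Content-Type', ''):
            # .text decodes the body using the charset declared by the server
            return response.text
    except requests.RequestException as e:
        print(e)
    return ''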

Example 6: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if response.getheader('Content-Type') == 'text/html; charset=utf-8':
             html_bytes = response.read()
             html_string = html_bytes.decode('utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception:
         print('Error: cannot crawl page')
         return set()
     return finder.page_links()
Developer ID: parkchul72, Project: Crawler, Lines of code: 15, Source file: spider.py

Example 7: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set()
     return finder.page_links()
Developer ID: deviantdear, Project: Python_Webscraper, Lines of code: 15, Source file: spider.py

Example 8: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
 def gather_links(page_url):
     html_string = ""
     try:
         response = urlopen(page_url)
         if response.getheader("Content-Type") == "text/html":
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception:
         print("Error: cannot crawl page")
         return set()
     return finder.page_links()
Developer ID: keegaz, Project: Python, Lines of code: 15, Source file: spider.py

Example 9: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_string = html_bytes.decode('utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception:
         print("Error: can't crawl page")
         return set()
     return finder.page_links()
Developer ID: Agham, Project: Spidey, Lines of code: 15, Source file: spider.py

Example 10: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
 def gather_links(page_url):
     html_str = ''
     try:
         request = Request(page_url, headers=Spider.headers)
         response = urlopen(request)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_str = html_bytes.decode('utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_str)
     except Exception:
         print('Cannot access ' + page_url)
         return set()
     return finder.page_links()
Developer ID: macctown, Project: Crawler, Lines of code: 16, Source file: spider.py

Example 11: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
 def gather_links(page_url):
     html_string = ''
     try:
         header = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0'
         }
         response = requests.get(page_url, headers=header)
         content_type = response.headers['Content-Type']
         if content_type == 'text/html; charset=utf-8':
             html_string = response.text
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception:
         print('Error: cannot crawl page')
         return set()
     return finder.page_links()
Developer ID: lq08025107, Project: pyspider, Lines of code: 18, Source file: spider.py

Example 12: gather_links

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         # getheader() matches header names case-insensitively; one
         # substring check covers the charset variants
         content_type = response.getheader('Content-Type') or ''
         if 'text/html' in content_type:
             html_bytes = response.read()
             html_string = html_bytes.decode(encoding='utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print('\nException : ' + str(e) + '\n')
         return set()
     return finder.page_links()
Developer ID: v4iv, Project: Spider, Lines of code: 18, Source file: spider.py

Example 13: gather_link

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
 def gather_link(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         # The body is bytes; decode it to a readable string, but only for
         # HTML content types. Header lookup is case-insensitive, and a
         # substring check covers the charset variants.
         content_type = response.info()['Content-Type'] or ''
         if 'text/html' in content_type:
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(PySpider.base_url, page_url)
         finder.feed(html_string)
     except Exception:
         print('Error: cannot crawl page')
         return set()
     return finder.page_links()
Developer ID: yjhao, Project: PySpider, Lines of code: 19, Source file: PySpider.py
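Examples 12 and 13 both wrestle with the many spellings of the HTML content type. One way to avoid string comparisons entirely, shown here as a sketch rather than code from either project, is to let the standard library parse the header: the message object returned by response.info() exposes the MIME type and charset separately.

from urllib.request import urlopen

response = urlopen('http://example.com')
info = response.info()                       # an email.message.Message
mime_type = info.get_content_type()          # e.g. 'text/html', lowercased
charset = info.get_content_charset('utf-8')  # declared charset, with default

if mime_type == 'text/html':
    html_string = response.read().decode(charset)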

Example 14: gather_link

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
    def gather_link(page_url):
        # go to the website, get the byte data, convert it to a string,
        # pass it through LinkFinder, and collect all the links
        html_string = ''
        try:
            response = urlopen(page_url)
            if 'text/html' in response.getheader('Content-Type'):
                html_bytes = response.read()
                html_string = html_bytes.decode("utf-8")
            # create the finder unconditionally so page_links() below is safe
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)

        except Exception as e:
            print(str(e))
            return set()

        return finder.page_links()
Developer ID: iFun, Project: WebCrawler, Lines of code: 21, Source file: spider.py

Example 15: gather_link

# Required import: from link_finder import LinkFinder [as alias]
# Or: from link_finder.LinkFinder import page_links [as alias]
	def gather_link(page_url):

		html_string = ''

		# urlopen returns byte data which we have to turn into a readable string 
		try:
			response = urlopen(page_url)
			
			# make sure it is html data (in case we crawl a pdf file)
			if response.getheader('Content-Type') == 'text/html':
				html_bytes = response.read()
				html_string = html_bytes.decode("utf-a")

			finder = LinkFinder(Spider.base_url, page_url)
			finder.feed(html_string)
		except Exception:
			print('Error: Cannot crawl page')
			# Return empty set if we cannot crawl the link
			return set()

		return finder.page_links()
Developer ID: Souloist, Project: Projects, Lines of code: 23, Source file: spider.py


Note: The link_finder.LinkFinder.page_links examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. Consult each project's license before using or redistributing the code, and do not reproduce this article without permission.