This page collects typical usage examples of the Python method scrapy.xlib.pydispatch.dispatcher.connect. If you are unsure what dispatcher.connect does, how to call it, or want to see it used in context, the curated code samples below should help. You can also explore the containing module, scrapy.xlib.pydispatch.dispatcher, for related usage.
Below are 15 code examples of the dispatcher.connect method, sorted by popularity by default.
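Before the examples, a quick orientation: dispatcher.connect(receiver, signal) registers a callback that Scrapy invokes whenever the given signal fires. Note that scrapy.xlib.pydispatch was deprecated in Scrapy 1.x and removed in later releases, so these examples reflect older Scrapy versions. A minimal, self-contained sketch (the spider name and handler below are illustrative, not taken from the examples on this page):

import scrapy
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher  # deprecated; see the note at the end of this page

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://example.com']

    def __init__(self, *a, **kw):
        super(DemoSpider, self).__init__(*a, **kw)
        # invoke self.spider_closed once this spider finishes
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        spider.logger.info("spider closed: %s" % spider.name)

    def parse(self, response):
        yield {'url': response.url}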
Example 1: __init__
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def __init__(self, rule):
dispatcher.connect(self.spider_opened, signals.spider_opened)
dispatcher.connect(self.spider_closed, signals.spider_closed)
self.rule = rule
self.name = rule.name
self.allowed_domains = rule.allowed_domains.split(',')
self.start_urls = rule.start_urls.split(',')
rule_list = []
    # if a `next page` XPath is configured, add a follow-only rule for pagination links
    if rule.next_page:
rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))
rule_list.append(Rule(LinkExtractor(
allow=rule.allow_url.split(','),
unique=True),
follow=True,
callback='parse_item'))
self.rules = tuple(rule_list)
super(ProxySpiderSpider, self).__init__()
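The rule argument in Example 1 comes from outside the snippet (likely a database-backed configuration record); its class is not shown on this page. A purely hypothetical stand-in with the attributes the spider reads:

class CrawlRuleConfig(object):
    """Hypothetical container for the fields Example 1 expects."""
    def __init__(self, name, allowed_domains, start_urls, next_page, allow_url):
        self.name = name                        # spider name
        self.allowed_domains = allowed_domains  # comma-separated domain list
        self.start_urls = start_urls            # comma-separated seed URLs
        self.next_page = next_page              # XPath of the pagination link, or ''
        self.allow_url = allow_url              # comma-separated allow() patterns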
Example 2: start_requests
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def start_requests(self):
"""
NOTE: This method is ONLY CALLED ONCE by Scrapy (to kick things off).
Get the first url to crawl and return a Request object
    This will be passed to self.parse, which will continue
    the process of parsing all the other generated URLs
"""
if not self.args:
# connect to mysql database
self.url.connect()
# grab the first URL to begin crawling
        start_url = next(self.url.next_url())
else:
start_url = self.start_urls[0]
request = Request(start_url, dont_filter=True)
# important to yield, not return
yield request
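Example 2 relies on a self.url helper that feeds URLs from MySQL; it is not defined on this page. A minimal stand-in (entirely hypothetical, in-memory instead of MySQL) that satisfies the connect() and next_url() calls used above:

class UrlFeed(object):
    """Hypothetical URL source matching the calls in Example 2."""
    def __init__(self, urls):
        self._urls = list(urls)

    def connect(self):
        # a real implementation would open the MySQL connection here
        pass

    def next_url(self):
        # a generator, so next(self.url.next_url()) yields the first pending URL
        for url in self._urls:
            yield url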
Example 3: __init__
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def __init__(self):
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    dispatcher.connect(self.engine_stopped, signals.engine_stopped)
    dispatcher.connect(self.engine_started, signals.engine_started)
    # current working directory; expected to be the scrapy_site project root
    self.curpath = os.getcwd()
    # directory where per-spider message files are written
    self.spidername_filepath = self.curpath + "/scrapy_site/msg/"
    # load the keyword list from keyword.conf
    self.keywordsDict = dict()
    self.getKeywords()
    # mapping of website names
    self.webnamesDict = dict()
    self.getWebnames()
    # collected messages
    self.msgDict = dict()
    SavePipeline.initCount += 1
Example 4: main
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def main():
    """Main routine for running the spider"""
    # set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)
    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    # register the spider with the crawler
    crawler.crawl(EuropythonSpyder())
    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the defined spider
    print("ENGINE STOPPED")
Example 5: main
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def main():
    """Main routine for running the spider"""
    # set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)
    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    # register the spider with the crawler
    crawler.crawl(BloggerSpider())
    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the defined spider
    print("ENGINE STOPPED")
Example 6: main
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def main():
    """Main routine for running the spider"""
    from scrapy.xlib.pydispatch import dispatcher
    # set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)
    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    # define the spider for the crawler
    crawler.crawl(PydataSpiderDetails())
    print("STARTING ENGINE")
    crawler.start()  # start the crawler
    print("ENGINE STOPPED")
Example 7: __init__
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def __init__(self, *a, **kw):
    super(StackSpider, self).__init__(*a, **kw)
self.time = datetime.datetime.now()
self.congress = Congress()
self.members = self.congress.searchAll("diputados")
self.groups = self.congress.searchAll("grupos")
dispatcher.connect(self.whenFinish, signals.spider_closed)
Example 8: __init__
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def __init__(self, *a, **kw):
"""Attach a callback to the spider_closed signal"""
super(Kijiji, self).__init__(*a, **kw)
dispatcher.connect(self.spider_closed, signals.spider_closed)
    if USE_DB:
        self.open_database()
    if DRAW_ALL_DB and not DRAW_NEW_AD_ONLY:
        # add markers for already-known ads
for x in self.m_list:
self.add_marker(x, False)
Example 9: __init__
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def __init__(self, *a, **kw):
super(TianqiSpider, self).__init__(*a, **kw)
dispatcher.connect(self.spider_closed, signals.spider_closed)
self.sql = SqlHelper()
self.weather_table_name = config.weather_table
self.citys = []
self.init()
Example 10: __init__
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def __init__(self):
dispatcher.connect(self.spider_opened, signals.spider_opened)
dispatcher.connect(self.spider_closed, signals.spider_closed)
Example 11: __init__
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def __init__(self):
self.filename += settings.MARKET_NAME
self.filename += ".db"
self.filename = path.join(settings.DATABASE_DIR, self.filename)
    print(self.filename)
self.conn = None
dispatcher.connect(self.initialize, signals.engine_started)
dispatcher.connect(self.finalize, signals.engine_stopped)
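The finalize handler wired to engine_stopped is not shown in this listing; presumably it just flushes and closes the connection. A plausible sketch (an assumption, not the project's actual code):

def finalize(self):
    # close the SQLite connection once the crawl engine stops (assumed behavior)
    if self.conn is not None:
        self.conn.commit()
        self.conn.close()
        self.conn = None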
Example 12: initialize
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def initialize(self):
if path.exists(self.filename):
self.conn = sqlite3.connect(self.filename)
else:
self.create_table()
self.conn.execute("PRAGMA journal_mode=WAL;")
self.conn.commit()
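Setting journal_mode=WAL lets readers query the database while the crawl is still writing to it, and the pragma is persistent in the database file, so issuing it once per file is enough.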
Example 13: create_table
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def create_table(self):
self.conn = sqlite3.connect(self.filename)
self.conn.execute("create table apps( \
id integer primary key autoincrement, \
url varchar(100) not null unique, \
downloaded int default 0)"
)
self.conn.commit()
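Because url carries a unique constraint, the database itself can handle de-duplication on insert. An illustrative helper (not part of the original project):

def add_url(self, url):
    # duplicate URLs are silently skipped thanks to the UNIQUE constraint
    self.conn.execute("insert or ignore into apps(url) values (?)", (url,))
    self.conn.commit()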
Example 14: __init__
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def __init__(self, *args, **kwargs):
super(FullDomainSpider, self).__init__(*args, **kwargs)
self.allowed_domains = kwargs.get('allowed_domains').split(',')
self.org = kwargs.get('org')
self.start_urls = kwargs.get('start_urls').split(',')
dispatcher.connect(self.spider_opened, signals.spider_opened)
dispatcher.connect(self.spider_closed, signals.spider_closed)
Example 15: spider_opened
# Required import: from scrapy.xlib.pydispatch import dispatcher [as alias]
# Alternatively: from scrapy.xlib.pydispatch.dispatcher import connect [as alias]
def spider_opened(self, spider):
self.conn = MySQLdb.connect(host=settings.MYSQL_HOST, db=settings.MYSQL_DB, user=settings.MYSQL_USER, passwd=settings.MYSQL_PASSWORD, charset='utf8', use_unicode=True)
    cursor = self.conn.cursor()
sql_str = "SELECT pattern from whitelist"
cursor.execute(sql_str)
self.custom_whitelist = cursor.fetchall()
try:
alexa_whitelist_file = pkgutil.get_data("malspider", "resources/alexa-1k-whitelist.csv").decode('ascii')
self.alexa_whitelist = alexa_whitelist_file.splitlines()
    except Exception:
log.msg("Error loading alexa whitelist...", level=log.ERROR)
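Finally, note that every example above imports the long-deprecated scrapy.xlib.pydispatch module, which was removed from Scrapy entirely in later releases. The supported replacement is to connect handlers through the crawler's signal manager in from_crawler; a sketch of how Example 10 would be written today (spider name and log messages are illustrative):

import scrapy
from scrapy import signals

class MySpider(scrapy.Spider):
    name = 'my_spider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
        # the crawler's signal manager replaces the global dispatcher
        crawler.signals.connect(spider.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        spider.logger.info("spider opened")

    def spider_closed(self, spider):
        spider.logger.info("spider closed")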