本文整理汇总了Python中pyspider.database.connect_database函数的典型用法代码示例。如果您正苦于以下问题:Python connect_database函数的具体用法?Python connect_database怎么用?Python connect_database使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了connect_database函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: resultdb_migrating
def resultdb_migrating(project, from_connection, to_connection):
    """Copy every stored result of *project* from one resultdb to another.

    The destination table for the project is dropped first, so after the
    copy the target holds exactly the rows present in the source.
    """
    logging.info("resultdb: %s", project)
    source = connect_database(from_connection)
    target = connect_database(to_connection)
    # start from a clean slate on the destination side
    target.drop(project)
    for row in source.select(project):
        target.save(project, row['taskid'], row['url'], row['result'])
示例2: taskdb_migrating
def taskdb_migrating(project, from_connection, to_connection):
    """Copy every task of *project* from one taskdb to another.

    The destination table for the project is dropped first; tasks are then
    copied bucket by bucket for each status code 1 through 4.
    """
    logging.info("taskdb: %s", project)
    source = connect_database(from_connection)
    target = connect_database(to_connection)
    target.drop(project)
    # load_tasks filters by status, so walk each status bucket in turn
    for status_code in range(1, 5):
        for task in source.load_tasks(status_code, project=project):
            target.insert(project, task['taskid'], task)
示例3: cli
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.

    Click group entry point.  Any database/queue option left unset on the
    command line is filled in here, preferring linked-container environment
    variables (MySQL / MongoDB / RabbitMQ) and falling back to local sqlite
    databases and multiprocessing queues.
    """
    logging.config.fileConfig(os.path.join(os.path.dirname(__file__), "logging.conf"))
    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            # explicit CLI option wins over environment auto-detection
            continue
        if os.environ.get('MYSQL_NAME'):
            # db=db binds the loop variable eagerly (avoids the late-binding
            # closure pitfall); Get defers the connect until first use
            kwargs[db] = Get(lambda db=db: connect_database('mysql+%s://%s:%s/%s' % (
                db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = Get(lambda db=db: connect_database('mongodb+%s://%s:%s/%s' % (
                db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        else:
            # default: one sqlite file per db under data_path
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))
    # queue
    if kwargs.get('amqp_url'):
        from pyspider.libs.rabbitmq import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Get(lambda name=name: Queue(name, amqp_url=kwargs['amqp_url'],
                                                       maxsize=kwargs['queue_maxsize']))
    elif os.environ.get('RABBITMQ_NAME'):
        from pyspider.libs.rabbitmq import Queue
        # BUGFIX: the credentials had been mangled by a web-scraper's email
        # filter ("guest:[email protected]%(..."); restored RabbitMQ's default
        # guest:guest login.  %%2F is the url-encoded default vhost "/".
        amqp_url = ("amqp://guest:guest@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                    ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Get(lambda name=name: Queue(name, amqp_url=amqp_url,
                                                       maxsize=kwargs['queue_maxsize']))
    else:
        # no broker configured: plain in-process multiprocessing queues
        from multiprocessing import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Queue(kwargs['queue_maxsize'])
    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        # "tcp://host:port" -> "host:port"
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT'][len('tcp://'):]
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)
    # bare `pyspider` with no subcommand: run everything via `all`
    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
示例4: migrate
def migrate(pool, from_connection, to_connection):
    """
    Migrate tool for pyspider

    Connects to *from_connection*, detects which database flavour it is
    (ProjectDB / TaskDB / ResultDB) and copies its contents to
    *to_connection* — projects serially, tasks and results via a pool of
    *pool* worker processes (one project per worker).
    """
    f = connect_database(from_connection)
    t = connect_database(to_connection)
    if isinstance(f, ProjectDB):
        # projectdb is small: copy serially, replacing each project in full
        for each in f.get_all():
            each = unicode_obj(each)
            logging.info("projectdb: %s", each['name'])
            t.drop(each['name'])
            t.insert(each['name'], each)
    elif isinstance(f, TaskDB):
        # the lambdas capture the connection STRINGS (not the live handles)
        # via default args, so each worker opens its own connections
        pool = Pool(pool)
        pool.map(
            lambda x, f=from_connection, t=to_connection: taskdb_migrating(x, f, t),
            f.projects)
    elif isinstance(f, ResultDB):
        pool = Pool(pool)
        pool.map(
            lambda x, f=from_connection, t=to_connection: resultdb_migrating(x, f, t),
            f.projects)
示例5: setUpClass
def setUpClass(self):
    # Shared fixture: open the MySQL-backed resultdb (SQLAlchemy adapter
    # with the mysqlconnector driver) used by every test in the class.
    # NOTE(review): unittest requires setUpClass to be a @classmethod — the
    # decorator is presumably just above this excerpt; confirm upstream.
    self.resultdb = database.connect_database(
        'sqlalchemy+mysql+mysqlconnector+resultdb://[email protected]/pyspider_test_resultdb'
    )
示例6: cli
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.

    Click group entry point.  Any database/queue option left unset on the
    command line is filled in here, preferring linked-container environment
    variables (MySQL / MongoDB / RabbitMQ), then the throwaway setup used by
    the `bench` subcommand, then local sqlite + multiprocessing defaults.
    """
    logging.config.fileConfig(os.path.join(os.path.dirname(__file__), "logging.conf"))
    # get db from env
    for db in ("taskdb", "projectdb", "resultdb"):
        if kwargs[db] is not None:
            # explicit CLI option wins over environment auto-detection
            continue
        if os.environ.get("MYSQL_NAME"):
            # db=db binds the loop variable eagerly (late-binding closure
            # pitfall); utils.Get defers the connect until first use
            kwargs[db] = utils.Get(
                lambda db=db: connect_database(
                    "mysql+%s://%s:%s/%s"
                    % (db, os.environ["MYSQL_PORT_3306_TCP_ADDR"], os.environ["MYSQL_PORT_3306_TCP_PORT"], db)
                )
            )
        elif os.environ.get("MONGODB_NAME"):
            kwargs[db] = utils.Get(
                lambda db=db: connect_database(
                    "mongodb+%s://%s:%s/%s"
                    % (db, os.environ["MONGODB_PORT_27017_TCP_ADDR"], os.environ["MONGODB_PORT_27017_TCP_PORT"], db)
                )
            )
        elif ctx.invoked_subcommand == "bench":
            # benchmarking: recreate data_path fresh under ./data/bench and
            # use path-less sqlite URLs for taskdb/resultdb (throwaway state)
            if kwargs["data_path"] == "./data":
                kwargs["data_path"] += "/bench"
                shutil.rmtree(kwargs["data_path"], ignore_errors=True)
                os.mkdir(kwargs["data_path"])
            if db in ("taskdb", "resultdb"):
                kwargs[db] = utils.Get(lambda db=db: connect_database("sqlite+%s://" % (db)))
            else:
                kwargs[db] = utils.Get(
                    lambda db=db: connect_database("sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2]))
                )
        else:
            # default: one sqlite file per db under data_path
            if not os.path.exists(kwargs["data_path"]):
                os.mkdir(kwargs["data_path"])
            kwargs[db] = utils.Get(
                lambda db=db: connect_database("sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2]))
            )
    # queue
    if kwargs.get("amqp_url"):
        from pyspider.libs.rabbitmq import Queue
        for name in ("newtask_queue", "status_queue", "scheduler2fetcher", "fetcher2processor", "processor2result"):
            kwargs[name] = utils.Get(
                lambda name=name: Queue(name, amqp_url=kwargs["amqp_url"], maxsize=kwargs["queue_maxsize"])
            )
    elif os.environ.get("RABBITMQ_NAME"):
        from pyspider.libs.rabbitmq import Queue
        # BUGFIX: the credentials had been mangled by a web-scraper's email
        # filter ("guest:[email protected]%(..."); restored RabbitMQ's default
        # guest:guest login.  %%2F is the url-encoded default vhost "/".
        amqp_url = (
            "amqp://guest:guest@%(RABBITMQ_PORT_5672_TCP_ADDR)s" ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ
        )
        for name in ("newtask_queue", "status_queue", "scheduler2fetcher", "fetcher2processor", "processor2result"):
            kwargs[name] = utils.Get(lambda name=name: Queue(name, amqp_url=amqp_url, maxsize=kwargs["queue_maxsize"]))
    else:
        # no broker configured: plain in-process multiprocessing queues
        from multiprocessing import Queue
        for name in ("newtask_queue", "status_queue", "scheduler2fetcher", "fetcher2processor", "processor2result"):
            kwargs[name] = Queue(kwargs["queue_maxsize"])
    # phantomjs-proxy
    if kwargs.get("phantomjs_proxy"):
        pass
    elif os.environ.get("PHANTOMJS_NAME"):
        # "tcp://host:port" -> "host:port"
        kwargs["phantomjs_proxy"] = os.environ["PHANTOMJS_PORT"][len("tcp://") :]
    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj["instances"] = []  # processes/threads started by subcommands
    ctx.obj.update(kwargs)
    # bare `pyspider` with no subcommand: run everything via `all`
    if ctx.invoked_subcommand is None and not ctx.obj.get("testing_mode"):
        ctx.invoke(all)
    return ctx
示例7: one
def one(ctx, interactive, enable_phantomjs, scripts):
    """
    One mode not only means all-in-one, it runs every thing in one process over
    tornado.ioloop, for debug purpose

    Builds result_worker, processor, fetcher and scheduler via ctx.invoke,
    then wires them all onto the fetcher's ioloop through the scheduler's
    init_one() and blocks in scheduler_obj.run() until it exits.
    """
    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True
    if scripts:
        # script files on disk replace the configured projectdb; default
        # task storage is swapped for a path-less sqlite taskdb
        from pyspider.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            # None makes the OneResultWorker branch below kick in
            g['resultdb'] = None
    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            # only set a proxy if none was configured explicitly
            g.setdefault('phantomjs_proxy', 'localhost:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None
    result_worker_config = g.config.get('result_worker', {})
    if g.resultdb is None:
        # no resultdb: use OneResultWorker (presumably emits results
        # instead of persisting them — confirm in pyspider.result)
        result_worker_config.setdefault('result_cls',
                                        'pyspider.result.OneResultWorker')
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)
    processor_config = g.config.get('processor', {})
    processor_config.setdefault('enable_stdout_capture', False)
    processor_obj = ctx.invoke(processor, **processor_config)
    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)
    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls',
                                'pyspider.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)
    # run every component on the single (fetcher's) ioloop
    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        # fire each loaded project's on_start callback immediately
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)
    try:
        scheduler_obj.run()
    finally:
        # always shut the scheduler (and phantomjs, if started) down cleanly
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
示例8: cli
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.

    Click group entry point.  Fills in unset database options from
    linked-container environment variables (MySQL / MongoDB), the `bench`
    special case, or local sqlite defaults; then resolves a single
    message_queue URL (explicit > amqp_url > RabbitMQ env > beanstalk) and
    builds the five named queues from it.
    """
    if kwargs['add_sys_path']:
        # allow importing project scripts from the current directory
        sys.path.append(os.getcwd())
    logging.config.fileConfig(kwargs['logging_config'])
    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            # explicit CLI option wins over environment auto-detection
            continue
        if os.environ.get('MYSQL_NAME'):
            # db=db binds the loop variable eagerly (late-binding closure
            # pitfall); utils.Get defers the connect until first use
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlalchemy+mysql+{0!s}://{1!s}:{2!s}/{3!s}'.format(
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                    os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'mongodb+{0!s}://{1!s}:{2!s}/{3!s}'.format(
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                    os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            # benchmarking: recreate data_path fresh under ./data/bench and
            # use path-less sqlite URLs for taskdb/resultdb (throwaway state)
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+{0!s}://'.format((db))))
            else:
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+{0!s}:///{1!s}/{2!s}.db'.format(
                    db, kwargs['data_path'], db[:-2])))
        else:
            # default: one sqlite file per db under data_path; remember that
            # it was a default so `one` mode can substitute it later
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+{0!s}:///{1!s}/{2!s}.db'.format(
                db, kwargs['data_path'], db[:-2])))
            kwargs['is_{0!s}_default'.format(db)] = True
    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])
    # message queue, compatible with old version
    if kwargs.get('message_queue'):
        pass
    elif kwargs.get('amqp_url'):
        kwargs['message_queue'] = kwargs['amqp_url']
    elif os.environ.get('RABBITMQ_NAME'):
        # BUGFIX: the credentials had been mangled by a web-scraper's email
        # filter ("guest:[email protected]%(..."); restored RabbitMQ's default
        # guest:guest login.  %%2F is the url-encoded default vhost "/".
        kwargs['message_queue'] = ("amqp://guest:guest@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                                   ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
    elif kwargs.get('beanstalk'):
        kwargs['message_queue'] = "beanstalk://{0!s}/".format(kwargs['beanstalk'])
    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                 'fetcher2processor', 'processor2result'):
        if kwargs.get('message_queue'):
            # broker-backed queues are created lazily on first access
            kwargs[name] = utils.Get(lambda name=name: connect_message_queue(
                name, kwargs.get('message_queue'), kwargs['queue_maxsize']))
        else:
            kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'),
                                                 kwargs['queue_maxsize'])
    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        # "tcp://host:port" -> "host:port"
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]
    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []  # processes/threads started by subcommands
    ctx.obj.update(kwargs)
    # bare `pyspider` with no subcommand: run everything via `all`
    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
示例9: connect_db
def connect_db(ctx, param, value):
    """Click option callback: lazily connect to the database named by *value*.

    Returns None when no value was supplied; otherwise a Get wrapper that
    defers the actual connect_database() call until first use.
    """
    return None if value is None else Get(lambda: connect_database(value))
示例10: one
def one(ctx, interactive, enable_phantomjs, scripts):
    """
    One mode not only means all-in-one, it runs every thing in one process over
    tornado.ioloop, for debug purpose

    Builds result_worker, processor, fetcher and scheduler via ctx.invoke,
    then wires them all onto the fetcher's ioloop through the scheduler's
    init_one() and blocks in scheduler_obj.run() until it exits.
    """
    ctx.obj["debug"] = False
    g = ctx.obj
    g["testing_mode"] = True
    if scripts:
        # script files on disk replace the configured projectdb; default
        # task storage is swapped for a path-less sqlite taskdb
        from pyspider.database.local.projectdb import ProjectDB
        g["projectdb"] = ProjectDB(scripts)
        if g.get("is_taskdb_default"):
            g["taskdb"] = connect_database("sqlite+taskdb://")
        if g.get("is_resultdb_default"):
            # None makes the OneResultWorker branch below kick in
            g["resultdb"] = None
    if enable_phantomjs:
        phantomjs_config = g.config.get("phantomjs", {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            # only set a proxy if none was configured explicitly
            g.setdefault("phantomjs_proxy", "127.0.0.1:%s" % phantomjs_obj.port)
    else:
        phantomjs_obj = None
    result_worker_config = g.config.get("result_worker", {})
    if g.resultdb is None:
        # no resultdb: use OneResultWorker (presumably emits results
        # instead of persisting them — confirm in pyspider.result)
        result_worker_config.setdefault("result_cls", "pyspider.result.OneResultWorker")
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)
    processor_config = g.config.get("processor", {})
    processor_config.setdefault("enable_stdout_capture", False)
    processor_obj = ctx.invoke(processor, **processor_config)
    fetcher_config = g.config.get("fetcher", {})
    fetcher_config.setdefault("xmlrpc", False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)
    scheduler_config = g.config.get("scheduler", {})
    scheduler_config.setdefault("xmlrpc", False)
    scheduler_config.setdefault("scheduler_cls", "pyspider.scheduler.OneScheduler")
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)
    # run every component on the single (fetcher's) ioloop
    scheduler_obj.init_one(
        ioloop=fetcher_obj.ioloop,
        fetcher=fetcher_obj,
        processor=processor_obj,
        result_worker=result_worker_obj,
        interactive=interactive,
    )
    if scripts:
        # fire each loaded project's on_start callback immediately
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)
    try:
        scheduler_obj.run()
    finally:
        # always shut the scheduler (and phantomjs, if started) down cleanly
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
示例11: cli
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.

    Click group entry point.  Fills in unset database options from
    linked-container environment variables (MySQL / MongoDB), the `bench`
    special case, or local sqlite defaults; then resolves a single
    message_queue URL (explicit > amqp_url > RabbitMQ env > beanstalk) and
    builds the five named queues from it.
    """
    if kwargs["add_sys_path"]:
        # allow importing project scripts from the current directory
        sys.path.append(os.getcwd())
    logging.config.fileConfig(kwargs["logging_config"])
    # get db from env
    for db in ("taskdb", "projectdb", "resultdb"):
        if kwargs[db] is not None:
            # explicit CLI option wins over environment auto-detection
            continue
        if os.environ.get("MYSQL_NAME"):
            # db=db binds the loop variable eagerly (late-binding closure
            # pitfall); utils.Get defers the connect until first use
            kwargs[db] = utils.Get(
                lambda db=db: connect_database(
                    "sqlalchemy+mysql+%s://%s:%s/%s"
                    % (db, os.environ["MYSQL_PORT_3306_TCP_ADDR"], os.environ["MYSQL_PORT_3306_TCP_PORT"], db)
                )
            )
        elif os.environ.get("MONGODB_NAME"):
            kwargs[db] = utils.Get(
                lambda db=db: connect_database(
                    "mongodb+%s://%s:%s/%s"
                    % (db, os.environ["MONGODB_PORT_27017_TCP_ADDR"], os.environ["MONGODB_PORT_27017_TCP_PORT"], db)
                )
            )
        elif ctx.invoked_subcommand == "bench":
            # benchmarking: recreate data_path fresh under ./data/bench and
            # use path-less sqlite URLs for taskdb/resultdb (throwaway state)
            if kwargs["data_path"] == "./data":
                kwargs["data_path"] += "/bench"
                shutil.rmtree(kwargs["data_path"], ignore_errors=True)
                os.mkdir(kwargs["data_path"])
            if db in ("taskdb", "resultdb"):
                kwargs[db] = utils.Get(lambda db=db: connect_database("sqlite+%s://" % (db)))
            else:
                kwargs[db] = utils.Get(
                    lambda db=db: connect_database("sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2]))
                )
        else:
            # default: one sqlite file per db under data_path; remember that
            # it was a default so `one` mode can substitute it later
            if not os.path.exists(kwargs["data_path"]):
                os.mkdir(kwargs["data_path"])
            kwargs[db] = utils.Get(
                lambda db=db: connect_database("sqlite+%s:///%s/%s.db" % (db, kwargs["data_path"], db[:-2]))
            )
            kwargs["is_%s_default" % db] = True
    # create folder for counter.dump
    if not os.path.exists(kwargs["data_path"]):
        os.mkdir(kwargs["data_path"])
    # message queue, compatible with old version
    if kwargs.get("message_queue"):
        pass
    elif kwargs.get("amqp_url"):
        kwargs["message_queue"] = kwargs["amqp_url"]
    elif os.environ.get("RABBITMQ_NAME"):
        # BUGFIX: the credentials had been mangled by a web-scraper's email
        # filter ("guest:[email protected]%(..."); restored RabbitMQ's default
        # guest:guest login.  %%2F is the url-encoded default vhost "/".
        kwargs["message_queue"] = (
            "amqp://guest:guest@%(RABBITMQ_PORT_5672_TCP_ADDR)s" ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ
        )
    elif kwargs.get("beanstalk"):
        kwargs["message_queue"] = "beanstalk://%s/" % kwargs["beanstalk"]
    for name in ("newtask_queue", "status_queue", "scheduler2fetcher", "fetcher2processor", "processor2result"):
        if kwargs.get("message_queue"):
            # broker-backed queues are created lazily on first access
            kwargs[name] = utils.Get(
                lambda name=name: connect_message_queue(name, kwargs.get("message_queue"), kwargs["queue_maxsize"])
            )
        else:
            kwargs[name] = connect_message_queue(name, kwargs.get("message_queue"), kwargs["queue_maxsize"])
    # phantomjs-proxy
    if kwargs.get("phantomjs_proxy"):
        pass
    elif os.environ.get("PHANTOMJS_NAME"):
        # "tcp://host:port" -> "host:port"
        kwargs["phantomjs_proxy"] = os.environ["PHANTOMJS_PORT_25555_TCP"][len("tcp://") :]
    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj["instances"] = []  # processes/threads started by subcommands
    ctx.obj.update(kwargs)
    # bare `pyspider` with no subcommand: run everything via `all`
    if ctx.invoked_subcommand is None and not ctx.obj.get("testing_mode"):
        ctx.invoke(all)
    return ctx
示例12: connect_database
# Ad-hoc script setup: opens the PostgreSQL "resultdb" twice — once with the
# raw psycopg2 driver and once through pyspider's SQLAlchemy resultdb adapter.
conn=psycopg2.connect(database="resultdb", user="postgres",password="", host="", port="")
cur = conn.cursor()
from pyspider.database import connect_database
resultdb = connect_database("sqlalchemy+postgresql+resultdb://postgres:@10.1.36.183:5432/resultdb")
# NOTE(review): the commented-out probe below is Python 2 style
# (.next(), print statement) — it would need porting to run on Python 3.
#result=resultdb.select('test6').next()
#row_result = result['result']
#url=row_result['wages_and_employment_content']
#print type(url),url
## fetch the data
## scrape the related data and store it into the database
## follow further links and return a list of dicts (with the related content)
def get_more_touchs(list_content,types):
for each in list_content:
url=each[0]
try:
r=requests.get(url,headers=header)
soup=BeautifulSoup(r.text)
websites={
"detailed_work_activities" :"/search/dwa/compare/.*?g=Continue",
"work_context":"^/find/descriptor/result/.*?",
"work_values_content":"^/explore/workvalues/.*?",
"work_styles_content":"^/find/descriptor/result/.*?",
"work_activities":"^/find/descriptor/result/.*?",
"skills_content":"^/find/descriptor/result/.*?",
"knowledge_content":"^/find/descriptor/result/.*?",
"interests":"^/explore/interests/.*?",
"abilities":"^/explore/interests/.*?"
示例13: one
def one(ctx, interactive, enable_phantomjs, scripts):
    """
    One mode not only means all-in-one, it runs every thing in one process over
    tornado.ioloop, for debug purpose
    * webui is not running in one mode.
    * SCRIPTS is the script file path of project
    - when set, taskdb and resultdb will use an in-memory sqlite db by default
    - when set, on_start callback will be triggered on start
    * the status of project is always RUNNING.
    * rate and burst can be set in script with comments like:
    # rate: 1.0
    # burst: 3
    """
    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True
    if scripts:
        # script files on disk replace the configured projectdb; default
        # task/result storage is swapped for path-less sqlite databases
        from pyspider.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            g['resultdb'] = connect_database('sqlite+resultdb://')
    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            # only set a proxy if none was configured explicitly
            g.setdefault('phantomjs_proxy', 'localhost:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None
    result_worker_config = g.config.get('result_worker', {})
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)
    processor_config = g.config.get('processor', {})
    processor_obj = ctx.invoke(processor, **processor_config)
    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)
    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls',
                                'pyspider.scheduler.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)
    # run every component on the single (fetcher's) ioloop
    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        # fire each loaded project's on_start callback immediately
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)
    try:
        scheduler_obj.run()
    except KeyboardInterrupt:
        # Ctrl-C: shut components down, then re-raise so the process still
        # exits with the usual KeyboardInterrupt status
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
        raise
示例14: setUpClass
def setUpClass(self):
    # Shared fixture: connect the taskdb through the SQLAlchemy adapter to a
    # local PostgreSQL server, then call tearDownClass up front to wipe any
    # leftover state from a previous run.
    # NOTE(review): unittest requires setUpClass to be a @classmethod — the
    # decorator is presumably just above this excerpt; confirm upstream.
    self.taskdb = database.connect_database(
        'sqlalchemy+postgresql+taskdb://[email protected]:5432/pyspider_test_taskdb'
    )
    self.tearDownClass()
示例15: connect_db
def connect_db(ctx, param, value):
    """Click option callback: lazily connect to the database named by *value*.

    Returns None for a missing/empty value; otherwise a utils.Get wrapper
    that defers the actual connect_database() call until first use.
    """
    # note: deliberately a truthiness test — an empty string also yields None
    return utils.Get(lambda: connect_database(value)) if value else None