This article collects typical usage examples of the Python method db.DB.init_task. If you are unsure how to use DB.init_task, the curated code samples below may help. You can also explore further usage of the containing class, db.DB.
Three code examples of the DB.init_task method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
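All three examples share the same task lifecycle around init_task: obtain a task id (resume one that was passed in, or ask the DB for a new one), call init_task(id_task) at the start of run(), then call finish_task(id_task) on success or finish_task(id_task, True) on failure. A minimal sketch of that pattern, with the DB calls taken from the examples below and the MinimalTask class name invented for illustration:

from db import DB

class MinimalTask(object):
    """Hypothetical runner showing the init_task/finish_task lifecycle."""
    def __init__(self, logger, config_file, id_task=None):
        self.db = DB(logger, config_file)
        # Resume an existing task id, or start a new one, as the examples do.
        self.id_task = int(id_task) if id_task else self.db.start_new_task()

    def run(self):
        try:
            self.db.init_task(self.id_task)          # mark the task as started
            # ... the actual crawling / updating work goes here ...
            self.db.finish_task(self.id_task)        # mark success
        except Exception:
            self.db.finish_task(self.id_task, True)  # True flags failure
            raise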
Example 1: Updater
# Required import: from db import DB [as alias]
# Or: from db.DB import init_task [as alias]
class Updater(object):
    def __init__(self, verbose=False, id_task=None, supplier=None):
self.verbose = verbose
self.supplier = supplier
        self.config = {}
        config_file = os.path.join(os.path.dirname(__file__), "updater.conf")
        execfile(config_file, self.config)  # Python 2: executes the conf file to populate self.config
        # set up a per-task rotating log file
self.logger = logging.getLogger('UPDATER')
hdlr = logging.handlers.TimedRotatingFileHandler(os.path.join(os.path.dirname(__file__), \
self.config['log_file'].replace(".log", "%s.log" % id_task)),"d",2)
hdlr.suffix = "-%s" % id_task if id_task else "%Y-%m-%d-%H-%M"
formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
hdlr.setFormatter(formatter)
self.logger.addHandler(hdlr)
self.logger.setLevel(logging.INFO)
self.logger.info("[__init__]")
        # initialize the DB connection
self.db = DB(self.logger, config_file)
if not id_task:
self.id_task = self.db.start_new_task()
else:
self.id_task = int(id_task)
self.name_supplier = self.db.get_name_supplier(self.supplier)
        # initialize the csv output files
self.filename_csv = os.path.join(os.path.dirname(__file__), "csv/%s" % self.config['csv_filename'] % (self.supplier, self.id_task))
self.filename_stock_master = os.path.join(os.path.dirname(__file__), "csv/%s" % "STOCK_MASTER_%d.csv" % self.id_task)
self.print_line(self.config["csv_header"], True)
def get_metas_orderer(self, data):
"""select metas required"""
return [data[meta] for meta in self.config['csv_header'] if meta in data and data[meta]]
    def print_line(self, line, header=False):
        """write a line to the csv (truncate on header, append otherwise)"""
        with open(self.filename_csv, 'wb' if header else 'ab') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t', quotechar='"',
                                   quoting=csv.QUOTE_NONNUMERIC)
            csvwriter.writerow(line)
    def download_stock_master(self):
        """download the csv used to compare stock"""
        connected = False
        tries = 0
        self.logger.info("[download_stock_master] Descargando...")
        while not connected:
            try:
                ftps = mFTP_TLS()
                ftps.connect(self.config['ftp_host'], port=990, timeout=60)
                ftps.login(self.config['ftp_user'], self.config['ftp_pass'])
                ftps.prot_p()  # switch the data channel to TLS
                connected = True
            except Exception:
                tries += 1
                if tries > 5:
                    raise
                time.sleep(tries)  # linear back-off before retrying
        with open(self.filename_stock_master, 'wb') as f:
            ftps.retrbinary("RETR " + self.config['ftp_filename'], f.write)
        ftps.quit()
    def load_data_stock(self):
        """load the downloaded stock master csv into a dict keyed by product id"""
        self.logger.info("[load_data_stock] leyendo...")
        self.data_stock = {}
        with open(self.filename_stock_master, 'rb') as f:
            reader = csv.reader(f)
            header = True
            for row in reader:
                if not header:
                    # rows arrive as iso-8859-1; re-encode them as utf-8
                    data_line = dict(zip(self.config["csv_header"],
                                         [r.decode('iso-8859-1').encode('utf8') for r in row]))
                    self.data_stock[data_line['id']] = data_line
                header = False
def run(self):
try:
self.db.init_task(self.id_task)
self.download_stock_master()
self.load_data_stock()
last_task = self.db.get_last_task_supplier(self.supplier)
#......... some code omitted .........
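A portability note: the Updater above is Python 2 code (execfile, basestring, and csv files opened in binary mode). Under Python 3 the csv module expects text-mode files opened with newline=''; a minimal sketch of how print_line could be adapted (print_line_py3 is an invented name, not part of the original project):

import csv

def print_line_py3(filename_csv, line, header=False):
    """Python 3 take on Updater.print_line: text mode with newline=''."""
    mode = 'w' if header else 'a'  # truncate on header, append otherwise
    with open(filename_csv, mode, newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t', quotechar='"',
                               quoting=csv.QUOTE_NONNUMERIC)
        csvwriter.writerow(line)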
Example 2: CrawlerComics_3
# Required import: from db import DB [as alias]
# Or: from db.DB import init_task [as alias]
#......... some code omitted .........
                except UnicodeDecodeError:
                    pass
        self.db.save_data(url, self.metas, self.id_task)
        self.upload_images()
        self.urls_seen.append(url)
        return 1
def get_external(self, name):
html = self.download_url_login(self.config['url_search'] % name)
tree = etree.fromstring(html, self.parser)
find = etree.XPath('//*[@id="aspnetForm"]/div[2]/div/div[3]/div/div/div[3]/ul//li')
products = find(tree)
for product in products:
t = clean_spaces(product.xpath(".//div/div/div[1]/span//text()")[0])
if t == name:
#todo: ?
in_novedades = "novedad" in "".join(product.xpath(".//text()"))
isbn = "".join(product.xpath(".//div/div/div[4]/text()")).split()[-1]
return isbn, in_novedades
return None, None
    def extract_category(self, url, second_level=False):
        """crawl a category page"""
        if url in self.urls_seen:
            return False
        html = self.download_url(url)
        tree = etree.fromstring(html, self.parser)
        # dump the fetched page to disk for debugging
        with open("a.html", "w") as f:
            f.write(html)
if second_level:
find = etree.XPath('//a/@href')
else:
find = etree.XPath('//*[contains(@class, "categorias")]//a/@href')
links = find(tree)
self.logger.info("[extract_category] recorriendo %s" % url)
self.urls_seen.append(url)
for link in links:
if "/comics/" in link and (not second_level or "?p=" in link):
self.extract_category(link, True)
if "/comic/" in link and second_level:
self.extract_product(link)
def run(self):
"""start complete crawler"""
self.logger.info("[run] iniciando(Completo=%s)" % self.mode_complete)
try:
self.db.init_task(self.id_task)
html = self.download_url(self.config['discover_url'])
tree = etree.fromstring(html, self.parser)
find = etree.XPath('//a/@href')
links = find(tree)
for link in links:
if "/comics/" in link :
self.logger.info("[run] recorriendo %s" % link)
self.extract_category(link)
self.generate_csv()
self.db.finish_task(self.id_task)
except Exception as e:
self.db.finish_task(self.id_task, True)
            exc_type, exc_obj, exc_tb = sys.exc_info()
            self.logger.error("%s\n %d: %s" % (traceback.format_exc(), exc_tb.tb_lineno, str(e)))
raise
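A small design note on the crawler above: urls_seen is a plain list, so every membership test (url in self.urls_seen) in extract_category scans the whole list. For large crawls a set gives constant-time membership with the same call sites; a minimal sketch (the SeenUrls name is invented for illustration):

class SeenUrls(object):
    """Hypothetical drop-in for the urls_seen list with O(1) lookups."""
    def __init__(self):
        self._seen = set()

    def __contains__(self, url):
        # supports the example's "url in self.urls_seen" checks
        return url in self._seen

    def append(self, url):
        # keeps the list-like append() the examples already call
        self._seen.add(url)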
Example 3: CrawlerComics_2
# Required import: from db import DB [as alias]
# Or: from db.DB import init_task [as alias]
#......... some code omitted .........
#keywords & metatags
keys_keywords = ["category", "subcategory", "manufacturer", "title", "extra_field_10", "extra_field_3"]
self.metas['keywords'] = ", ".join(self.metas[i].strip() \
for i in keys_keywords if i in self.metas and isinstance(self.metas[i], basestring) \
and len(self.metas[i])>1)
self.metas['extra_field_7'] = "<div>%s</div>" % self.metas['extra_field_7']
        def cut_last_comma(s):
            """strip a trailing ", " or "," from a joined string"""
            if s.endswith(", "):
                s = s[:-2]
            elif s.endswith(","):
                s = s[:-1]
            return s
self.metas['keywords'] = cut_last_comma(self.metas['keywords'])
if 'extra_field_10' in self.metas:
self.metas['extra_field_10'] = cut_last_comma(self.metas['extra_field_10'])
self.metas['metatags'] = '<META NAME="KEYWORDS" CONTENT="%s">' % self.metas['keywords']
        if previous_metas:
            # the product has been seen before
            if previous_metas['stock'] == self.metas['stock'] and previous_metas['price'] == self.metas['price']:
                # it changed, but neither price nor stock did, so skip the update
                return 0
#extra_field_11
if 'extra_field_11' in self.metas and self.metas['extra_field_11']:
self.metas['extra_field_11'] = "<div>%s</div>" % self.metas['extra_field_11']
self.metas['price2'] = self.metas['price2'].replace(",", ".")
self.metas['content'] = normalize_content(self.metas['content'])
        for meta in self.metas:
            if isinstance(self.metas[meta], float):
                self.metas[meta] = str(round(self.metas[meta], 2))
            elif isinstance(self.metas[meta], basestring):
                try:
                    self.metas[meta] = self.metas[meta].encode("utf-8")
                except UnicodeDecodeError:
                    pass
self.db.save_data(id_product, self.metas, self.id_task)
self.upload_images()
return 1
def run(self):
"""start complete crawler"""
self.logger.info("[run] iniciando(Completo=%s)" % self.mode_complete)
try:
self.db.init_task(self.id_task)
for url_discover in self.config['discover_urls']:
page = 1
n_products = 1
self.last_first_id = None
                while n_products > 0:
                    if page > 1 and "%d" not in url_discover:
                        break  # the URL template has no page placeholder
try:
url = url_discover % page
except TypeError:
url = url_discover
self.logger.info("[run] recorriendo %s" % url)
if "campana" in url:
n_products = self.extract_product_campana(url)
else:
n_products = self.extract_product(url)
                    page += 1
                    self.logger.info("[run] extraidos %d productos de %s" % (n_products, url))
self.generate_csv()
self.db.finish_task(self.id_task)
except Exception as e:
self.db.finish_task(self.id_task, True)
            exc_type, exc_obj, exc_tb = sys.exc_info()
            self.logger.error("%s\n %d: %s" % (traceback.format_exc(), exc_tb.tb_lineno, str(e)))
raise
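The run() method above encodes a common pagination pattern: substitute the page number into the URL template when it contains %d, fetch the template only once otherwise, and stop as soon as a page yields zero products. The same logic isolated as a generator (a sketch; iter_discover_urls and extract_page are invented names standing in for the example's extract_product calls):

def iter_discover_urls(url_discover, extract_page):
    """Yield (url, product_count) per page until a page yields none."""
    page = 1
    while True:
        if page > 1 and "%d" not in url_discover:
            break  # template is not paginated; fetch it only once
        try:
            url = url_discover % page
        except TypeError:
            url = url_discover  # no placeholder at all
        n_products = extract_page(url)
        yield url, n_products
        if n_products <= 0:
            break  # an empty page means we ran past the last one
        page += 1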