当前位置: 首页>>代码示例>>Python>>正文


Python dbutils.get_db_engine函数代码示例

本文整理汇总了Python中pygaga.helpers.dbutils.get_db_engine函数的典型用法代码示例。如果您正苦于以下问题:Python get_db_engine函数的具体用法?Python get_db_engine怎么用?Python get_db_engine使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了get_db_engine函数的15个代码示例(均为Python 2代码),这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: crawl_all

def crawl_all():
    """Log in to meilishuo.com, then crawl one item or a range of item ids.

    If ``FLAGS.itemid`` is set, only that item is crawled.  Otherwise the id
    range is ``[group * 1000000, (group + 1) * 1000000)`` when ``FLAGS.group``
    is truthy, else ``[FLAGS.start, FLAGS.end)``.  Ids that already have a
    row in ``crawl_html`` are skipped.
    """
    # NOTE(review): credentials are hard-coded here; consider moving them to
    # configuration.
    login_params = {
        'emailaddress': '[email protected]',
        'password': '12345678',
        'type': 'undefined',
        'wbid': '0',
        'savestat': 'true',
    }
    req = urllib2.Request('http://www.meilishuo.com/users/ajax_logon?frm=undefined', urllib.urlencode(login_params), headers)
    handle = urllib2.urlopen(req)
    try:
        logger.info("logged result %s", handle.read())
    finally:
        # Release the login connection instead of leaving it to the GC.
        handle.close()

    if FLAGS.itemid:
        crawl_item(FLAGS.itemid)
        return

    # NOTE(review): a group value of 0 is falsy and therefore selects the
    # start/end range -- presumably 0 means "not set"; confirm.
    if FLAGS.group:
        start = FLAGS.group * 1000000
        end = (FLAGS.group + 1) * 1000000
    else:
        start = FLAGS.start
        end = FLAGS.end

    db = get_db_engine()
    for item_id in xrange(start, end, 1):
        try:
            results = db.execute("select item_id from crawl_html where item_id=%s", item_id)
            if results.rowcount > 0:
                continue
        except Exception as e:
            # The lookup failed: get a fresh engine and crawl the item anyway,
            # as before, but leave a trace of the failure and let
            # KeyboardInterrupt/SystemExit propagate.
            logger.warning("lookup of item %s in crawl_html failed, reconnecting: %s", item_id, e)
            db = get_db_engine()
        crawl_item(item_id)
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:30,代码来源:crawl_meilishuo.py

示例2: convert_main

def convert_main():
    """Copy voted clothing shops from the local DB into the production ``shop`` table.

    Rows of ``shop_shop`` with ``is_voted=1``, ``is_cloth=1`` and
    ``is_delete=0`` are inserted into the production ``shop`` table unless a
    shop with the same nick already exists there.  A shop whose URL contains
    "tmall.com" gets type 2, any other shop type 1.
    """
    db = get_db_engine()
    db_production = get_db_engine(connstr=FLAGS.production_connstr)
    known_nicks = set(row[0] for row in db_production.execute("select nick from shop"))
    result = db.execute("select url, name from shop_shop where is_voted=1 and is_cloth=1 and is_delete=0;")
    for row in result:
        url, name = row[0], row[1]
        if name in known_nicks:
            print("%s  exists" % name.encode('utf8'))
            continue
        # Substring test rather than ``find(...) > 0`` so a match at index 0
        # is not missed.
        shop_type = 2 if "tmall.com" in url else 1
        db_production.execute("insert into shop(nick, url, type, status) values(%s, %s ,%s, 2)", name, url, shop_type)
        # Remember the nick so a name repeated in shop_shop is not inserted twice.
        known_nicks.add(name)
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:15,代码来源:import_shop_to_production.py

示例3: crawl_main

def crawl_main():
    """Report shop hosts listed in ``FLAGS.path`` that are missing from the ``shop`` table.

    Each non-blank line of the input file is whitespace-separated: the first
    field is the shop URL, the rest make up the shop name.  For URLs that do
    not contain "tmall.com" the last field is left out of the name.
    Prints a summary line followed by one ``http://<host>/ <name>`` line per
    host that is not in the database.
    """
    db = get_db_engine()
    hosts_in_db = set()
    for row in db.execute("select url from shop"):
        hosts_in_db.add(str(urlparse.urlparse(row[0]).netloc))

    hosts = set()
    hosts_attr = {}
    with open(FLAGS.path) as shop_file:
        for line in shop_file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse (previously an IndexError).
                continue
            url = fields[0]
            host = str(urlparse.urlparse(url).netloc)
            hosts.add(host)
            if 'tmall.com' in url:
                name_fields = fields[1:]
            else:
                name_fields = fields[1:-1]
            hosts_attr[host] = " ".join(name_fields)

    hosts_not_in_db = hosts - hosts_in_db
    print("hosts %s indb %s notindb %s" % (len(hosts), len(hosts_in_db), len(hosts_not_in_db)))
    for host in hosts_not_in_db:
        print("http://%s/ %s" % (host, hosts_attr[host]))
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:26,代码来源:check_taobao_shop.py

示例4: crawl_hotest

def crawl_hotest():
    """Crawl the items currently listed in the BI database's ``item_hotest`` table.

    Returns whatever ``crawl_items`` returns for the selected items.  With
    ``FLAGS.force`` every matching item is selected; otherwise only items with
    ``status=1``, newest id first.
    """
    # Fetch every item_id from the item_hotest table on the BI host; that
    # table is supposed to be refreshed hourly.
    bi_db = get_db_engine(dbhost=FLAGS.bihost)
    itemid_list = list(bi_db.execute("select item_id from item_hotest"))

    # Refill the temp table temp_item_hotest, dropping the old data first.
    db = get_db_engine()
    db.execute("TRUNCATE table temp_item_hotest")
    logger.debug("TRUNCATE table temp_item_hotest")
    if itemid_list:
        db.execute("insert into temp_item_hotest values (%s)", itemid_list)
        logger.debug("insert temp_item_hotest")
    else:
        # With no rows there is nothing to bind to the "%s" placeholder, so
        # the insert is skipped instead of being sent without parameters.
        logger.debug("item_hotest is empty, skip insert temp_item_hotest")

    # Join item with temp_item_hotest to select what to crawl (original note:
    # comments are crawled, at most 20 pages).
    if FLAGS.force:
        return crawl_items("select item.id,item.detail_url,item.num_id from item,temp_item_hotest where item.id=temp_item_hotest.item_id")
    return crawl_items("select item.id,item.detail_url,item.num_id from item,temp_item_hotest where item.status=1 and item.id=temp_item_hotest.item_id order by item.id desc")
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:16,代码来源:crawl_item_impl.py

示例5: update_item

def update_item(sql):
    """Sync title and picture of the items selected by ``sql`` with Taobao.

    ``sql`` must select, in this order: item id, num_id, shop id, title,
    pic_url and the local picture file name.  For every item that Taobao
    returned data for, the title and/or picture are updated in the ``item``
    table when they differ from the stored values, or unconditionally when
    ``FLAGS.forcibly`` is set.  A failure on one item is logged and does not
    stop the remaining items.
    """
    started = time.time()
    db = get_db_engine()
    rows = db.execute(sql)

    results = get_taobao_items(get_top(), rows,
                               fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
                               calllimit=60)

    for batch_item in results:
        for _iid, entry in batch_item.items.iteritems():
            try:
                item_id = entry['req'][0]
                item_iid = entry['req'][1]
                shop_id = entry['req'][2]
                item_title = entry['req'][3]
                item_picurl = entry['req'][4]
                # The file name stored in the database is reused as-is (not
                # regenerated), e.g. "18142957186_28924096.jpg".
                local_pic_url = entry['req'][5]

                resp = entry['resp']
                if not resp:
                    continue

                taobao_title = resp['title']
                taobao_picurl = resp['pic_url']

                # FLAGS.forcibly forces both updates; otherwise update only
                # what differs from the stored value.
                title_changed = FLAGS.forcibly or cmp(item_title, taobao_title) != 0
                picurl_changed = FLAGS.forcibly or cmp(item_picurl, taobao_picurl) != 0

                if picurl_changed:
                    # The picture has to be fetched again before the row is
                    # updated with its new dimensions.
                    width, height = download_image({
                        'item_id': item_id,
                        'num_id': item_iid,
                        'shop_id': shop_id,
                        'pic_url': taobao_picurl,
                        'image_name': local_pic_url,
                        'crawl_path': FLAGS.crawl_path,
                    })

                if title_changed and picurl_changed:
                    db.execute("update item set modified=now(), title=%s, pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_title, taobao_picurl, width, height, item_id)
                    logger.info("item %s num_id %s update title from %s to %s, pic_url from %s to %s", item_id, item_iid, item_title, taobao_title, item_picurl, taobao_picurl)
                elif title_changed:
                    db.execute("update item set modified=now(), title=%s where id=%s", taobao_title, item_id)
                    logger.info("item %s update title from %s to %s", item_id, item_title, taobao_title)
                elif picurl_changed:
                    db.execute("update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_picurl, width, height, item_id)
                    logger.info("item %s num_id %s update pic_url from %s to %s", item_id, item_iid, item_picurl, taobao_picurl)
            except Exception:
                # Best effort per item, but let KeyboardInterrupt/SystemExit
                # stop the run.
                logger.error("update failed %s", traceback.format_exc())
    spent = time.time() - started
    logger.info("update_item_title_image use time : %s", spent*1000)
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:60,代码来源:update_item_title_image.py

示例6: check_image

def check_image():
    """Re-download the big image of every active item whose file is missing on disk.

    Items with ``status=1`` are read in batches because the table is large.
    For each item the file ``<images root>/<shop_id>/big/<local_pic_url>`` is
    checked; when it does not exist the image is downloaded again.  The
    number of missing images is logged at the end.
    """
    # MySQL's "LIMIT a,b" means offset a, row count b.  The previous code
    # passed (start, end) pairs, which produced overlapping, growing batches,
    # skipped the first row and relied on dict ordering.  Page with a fixed
    # batch size and a stable order until the table is exhausted.
    batch_size = 100000
    offset = 0
    missing = 0
    while True:
        sql = "select id, num_id, shop_id, pic_url, local_pic_url from item where status=1 order by id limit %s,%s" % (offset, batch_size)
        db = get_db_engine()
        items = list(db.execute(sql))
        for item in items:
            item_id = item[0]
            item_iid = str(item[1])
            shop_id = item[2]
            pic_url = str(item[3])
            local_pic_url = str(item[4])
            validate_path = "/space/wwwroot/image.guang.j.cn/ROOT/images/" + str(shop_id) + "/big/" + local_pic_url
            if os.path.exists(validate_path):
                continue
            missing += 1
            logger.error("item %s not pic %s", item_id, validate_path)
            try:
                download_image({'item_id': item_id, 'num_id': item_iid, 'shop_id': shop_id, 'pic_url': pic_url,
                                'image_name': local_pic_url, 'crawl_path': FLAGS.crawl_path})
            except Exception:
                # One failed download must not stop the scan.
                logger.error("download %s:%s failed reason: %s", item_id, pic_url, traceback.format_exc())
        if len(items) < batch_size:
            break
        offset += batch_size
    logger.info("stat item not image number=%s", missing)
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:31,代码来源:update_item_title_image.py

示例7: mig_main

def mig_main():
    """Set ``wb_qq_account.qqid`` for every account name found in ``wb_account``.

    The qqid of each name is looked up in ``QQIDS``; a name missing from
    that mapping raises ``KeyError``.  Each update is printed before it is
    executed.
    """
    db = get_db_engine()
    result = db.execute("select id,name,status from wb_account;")
    for row in result:
        name = row[1]
        qqid = QQIDS[name]
        # Printed for the operator only; the statement that is executed binds
        # the values, so names containing quotes cannot break the SQL.
        print("update wb_qq_account set qqid=%s where name='%s'" % (qqid, name))
        db.execute("update wb_qq_account set qqid=%s where name=%s", qqid, name)
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:7,代码来源:generate_table.py

示例8: GET

    def GET(self, id):
        """Crawl the stored images of item ``id`` and render the result page.

        ``id`` is bound as a query parameter rather than interpolated into
        the SQL text, since it is supplied by the caller of this handler.
        """
        db = get_db_engine()
        results = db.execute("select crawl_item_images.url, crawl_item_images.pos, crawl_item_images.type from crawl_html, crawl_item_images where crawl_item_images.item_id=crawl_html.item_id and crawl_html.item_id=%s;", id)
        item_crawler = ItemCrawler(id, FLAGS.crawl_path)
        # NOTE(review): (94,94) and (350,350) look like thumbnail sizes and the
        # False flag is opaque from here -- confirm against ItemCrawler.crawl.
        item_crawler.crawl(results, ((94,94), (350,350)), False)

        return render.crawlitem(id, item_crawler.results)
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:7,代码来源:view.py

示例9: process_item

def process_item(item, total, cur):
    """Make sure the big image and its three thumbnails exist for one item.

    ``item`` is a 10-column row: id, shop_id, local_pic_url, pic_url,
    manual_set, manual_updated_columns, status, num_id, pic_height,
    pic_width.  ``cur`` and ``total`` are only used in log messages.

    If the big image exists but the stored width is 0, the stored size is
    refreshed from the file.  Items with status 2 or 3 are then skipped
    unless ``FLAGS.force`` is set.  Otherwise the big image is downloaded if
    missing and the mid2 (300x300), mid (210x210) and sma (60x60) thumbnails
    are generated when missing.  Any error is logged, never raised.
    """
    try:
        (item_id, shop_id, local_pic_url, pic_url, manual_set,
         manual_updated_columns, status, num_id, pic_height, pic_width) = item

        def variant_path(variant):
            # <FLAGS.path><shop_id>/<variant>/<local_pic_url>
            return "%s%s/%s/%s" % (FLAGS.path, shop_id, variant, local_pic_url)

        big_path = variant_path("big")
        mid2_path = variant_path("mid2")
        mid_path = variant_path("mid")
        sma_path = variant_path("sma")

        if os.path.exists(big_path) and pic_width == 0:
            size = get_image_size(big_path)
            logger.debug("update %s size %s", item_id, size)
            db = get_db_engine()
            db.execute("update item set pic_width=%s,pic_height=%s where id=%s", size[0], size[1], item_id)

        if status in (2,3) and not FLAGS.force:
            return
        if not os.path.exists(big_path):
            # NOTE(review): the Referer is built from the row id, not num_id --
            # confirm that this is intended.
            headers = {'Referer' : "http://item.taobao.com/item.htm?id=%s" % item_id, 'User-Agent' : DEFAULT_UA}
            data = crawl_page(num_id, pic_url, headers)
            logger.debug("crawling %s %s %s %s", cur, total, big_path, item)
            save_image(big_path, data)
        if not os.path.exists(mid2_path):
            logger.debug("thumbing %s %s %s %s", cur, total, mid2_path, item)
            imagemagick_resize(300, 300, big_path, mid2_path)
        if not os.path.exists(mid_path):
            logger.debug("thumbing %s %s", mid_path, item)
            imagemagick_resize(210, 210, big_path, mid_path)
        if not os.path.exists(sma_path):
            logger.debug("thumbing %s %s", sma_path, item)
            imagemagick_resize(60, 60, big_path, sma_path)
    except Exception:
        # Best effort per item, but let KeyboardInterrupt/SystemExit propagate.
        logger.error("unknown error %s, %s", item, traceback.format_exc())
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:32,代码来源:process_item_image.py

示例10: crawl_shops

def crawl_shops(sql):
    """Crawl every shop returned by ``sql``, one after another.

    Consecutive crawls are spaced so that at least ``FLAGS.interval``
    milliseconds pass between the start of one crawl and the start of the
    next.  Returns early, after logging, when the query yields no shops.
    """
    engine = get_db_engine()
    shop_rows = list(engine.execute(sql))
    if len(shop_rows) == 0:
        logger.info("not shop crawler.")
        return

    # Optional breakpoint for debugging the parser.
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    # Built once and shared by all shops.
    categories = TaobaoCategory(engine)
    terms = TermFactory(engine)
    logger.info("init category %s and term factory %s.", len(categories.categories_dict), len(terms.sub_terms))

    previous_ms = 0
    for shop_row in shop_rows:
        now_ms = time.time() * 1000
        elapsed_ms = now_ms - previous_ms
        if elapsed_ms < FLAGS.interval:
            # Wait out the rest of the interval (milliseconds -> seconds).
            time.sleep((FLAGS.interval - elapsed_ms) / 1000.0)
        previous_ms = time.time() * 1000
        crawl_one_shop(shop_row, categories, terms, engine)
开发者ID:ljb-2000,项目名称:tb-crawler,代码行数:26,代码来源:crawl_taobao.py

示例11: get_data

def get_data():
    """Refresh the CDN cache for every item modified after '2013-12-09 09'.

    Items are processed by descending shop id, one per second.
    """
    engine = get_db_engine()
    rows = list(engine.execute(
        "select shop_id,local_pic_url from item where modified>'2013-12-09 09' order by shop_id desc"))
    for row in rows:
        shop_id, local_pic_url = row[0], row[1]
        refreshCdnCache(shop_id, local_pic_url)
        # Throttle: one refresh per second.
        time.sleep(1)
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:7,代码来源:clean_cdn_cache.py

示例12: clicklog_main

def clicklog_main():
    """Extract match records from the filtered click logs of a date range.

    Log files for every day from ``FLAGS.start`` to ``FLAGS.end`` are read
    line by line.  With ``FLAGS.commit`` each record is written to the
    database through ``insert_match``; otherwise records are collected.  The
    collected list (empty when committing) is dumped as JSON to
    ``FLAGS.out_file`` and returned.
    """
    click_file_list = []
    for d in eachday(FLAGS.start, FLAGS.end):
        click_file_list.extend(glob("/space/log/filtered/click*/click-" + datestr(d) + "_00???"))
    # TODO: load from conversion db?
    ret = []
    db = get_db_engine() if FLAGS.commit else None
    for fn in click_file_list:
        logger.debug("processing %s", fn)
        with open(fn, "r") as click_file:
            for line in click_file:
                click = get_click(line)
                if not click:
                    continue
                rec = get_record(click)
                if not rec:
                    continue
                if FLAGS.commit:
                    insert_match(db, rec)
                else:
                    ret.append(rec)
    # Close (and thereby flush) the output file deterministically.
    with open(FLAGS.out_file, "w") as out_file:
        simplejson.dump(ret, out_file)
    return ret
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:25,代码来源:clicklog.py

示例13: load_click_items

def load_click_items(numid2volume):
    """Load click records from ``FLAGS.click_input`` and match them against the DB.

    Every JSON row becomes a ``ClickItemType`` namedtuple and is returned in
    the result list, whether or not its creative matched.  For each click
    whose creative id maps to an item, ``numid2volume[num_id]`` is set to the
    click's ``item_volume`` (``numid2volume`` is mutated in place), and the
    click is checked against ``taobao_report`` by outer code for the
    statistics logged at the end.
    """
    logger.info("Loading click items")
    db = get_db_engine()
    with open(FLAGS.click_input) as json_file:
        click_json = simplejson.load(json_file)
    click_item_type = namedtuple("ClickItemType", 'click_hash source media_id holder_id site admember_id campaign_id adgroup_id creative_id click_time click_ip area_code lpid price pubcat_list user_attr_list score item_price item_volume')
    click_items = []
    creative_matched = 0
    outercode_matched = 0
    # Caches creative_id -> matching (num_id, shop nick) rows; only hits are cached.
    creative2item_cache = {}
    logger.info("Processing click items")
    for progress, line in enumerate(click_json, 1):
        click_item = click_item_type(*line)
        click_items.append(click_item)
        creative_id = click_item.creative_id
        if creative_id in creative2item_cache:
            rr = creative2item_cache[creative_id]
        else:
            # Values come from the input file, so bind them instead of
            # formatting them into the SQL text.
            r = db.execute("select num_id, shop.nick from item,shop where item.shop_id=shop.id and item.uctrac_creative_id=%s", creative_id)
            if not r.rowcount:
                logger.warning("creative not matched %s %s/%s", creative_id, progress, len(click_json))
                continue
            rr = creative2item_cache[creative_id] = list(r)
        creative_matched += 1
        num_id = rr[0][0]
        numid2volume[long(num_id)] = click_item.item_volume
        click_hash = 'jn%s' % click_item.click_hash
        r2 = db.execute('select 1 from taobao_report where outer_code=%s', click_hash)
        if r2.rowcount:
            outercode_matched += 1
    logger.info("Total click %s creative matched %s outercode matched %s", len(click_items), creative_matched, outercode_matched)
    return click_items
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:35,代码来源:estimate_click2pay.py

示例14: get_xks_tagmatch

def get_xks_tagmatch(xks):
    """Return the converted ``tag_match`` of subscriber ``xks``.

    Returns ``''`` when ``xks`` is falsy or no ``recommend_subscriber`` row
    has that id.  ``xks`` is bound as a query parameter rather than
    interpolated into the SQL text.
    """
    if not xks:
        return ''
    db = get_db_engine()
    rows = db.execute("SELECT tag_match FROM recommend_subscriber WHERE id = %s", xks)
    if rows.rowcount > 0:
        return convert_tagmatch(list(rows)[0][0])
    return ''
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:8,代码来源:solrweb.py

示例15: crawler

def _parse_discount_date(text):
    """Parse a "YYYY年MM月DD日" date string (surrounding whitespace ignored)."""
    normalized = text.strip().replace('年','-').replace("月","-").replace("日","")
    return datetime.datetime.strptime(normalized, '%Y-%m-%d')


def crawler(sql):
    """Crawl the current shop-wide discount for every row returned by ``sql``.

    ``sql`` must select, in this order: shop id, shop type, item id and item
    url.  Shop type 1 is handled as a Taobao shop: the discount page linked
    from the item page is parsed.  Shop type 2 is handled as a Tmall shop:
    the JSON behind the page's ``initApi`` value is read.  A discount that is
    found is stored with ``replace into shop_discount``.  A failure on one
    row is logged and does not stop the remaining rows.
    """
    db = get_db_engine()
    items = list(db.execute(sql))

    # Optional breakpoint for debugging the parser.
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    replace_sql = ("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) "
                   "values (%s,%s,%s,%s,%s,now(),now())")

    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]

        try:
            htm = get_item_htm(item_id, url, db)
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if not discount_url:
                    logger.warning("taobao shop %s:%s not discount.", shop_id, url)
                    continue
                item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                disc_content = download(discount_url[0], item_headers)
                if not disc_content.strip():
                    logger.warning("taobao shop %s:%s not discount.", shop_id, url)
                    continue
                disc_obj = parse_html(disc_content, encoding='gb18030')
                content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
                # The range separator is either "--" or an em dash; normalise
                # to the em dash before splitting into start and end.
                st = dates.encode('utf-8').replace("--","—").split("—")
                start_time = _parse_discount_date(st[0])
                end_time = _parse_discount_date(st[1])

                db.execute(replace_sql, shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                logger.info("taobao shop %s get discount success", shop_id)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if not d_url:
                    continue
                item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                disc_content = download(d_url, item_headers)
                cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                if not shop_prom:
                    logger.warning("taobao shop %s:%s not discount.", shop_id, url)
                    continue
                # startTime/endTime are divided by 1000 before being passed to
                # time.localtime (presumably epoch milliseconds -- confirm).
                st = int(shop_prom['startTime']) // 1000
                et = int(shop_prom['endTime']) // 1000
                start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                content = shop_prom['promPlan'][0]['msg']
                db.execute(replace_sql, shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                logger.info("tmall shop %s get discount success", shop_id)
        except Exception:
            # Best effort per shop, but let KeyboardInterrupt/SystemExit propagate.
            logger.error("shop %s:%s xpath failed:%s", shop_id, url, traceback.format_exc())
开发者ID:iloveyo123u1,项目名称:tb-crawler,代码行数:58,代码来源:crawl_shop_discount.py


注:本文中的pygaga.helpers.dbutils.get_db_engine函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。