

Python SQLContext.registerDataFrameAsTable Method Code Examples

This article collects typical usage examples of the Python method pyspark.SQLContext.registerDataFrameAsTable. If you are wondering how SQLContext.registerDataFrameAsTable is used in practice, the selected code examples below should help. You can also browse further usage examples for the containing class, pyspark.SQLContext.


Six code examples of the SQLContext.registerDataFrameAsTable method are shown below, sorted by popularity by default.
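Before the examples, here is a minimal sketch of the basic call pattern, assuming a Spark version where SQLContext.registerDataFrameAsTable is available; the table name "people" and the sample rows are illustrative assumptions, not taken from the examples below:

from pyspark import SparkContext, SQLContext

sc = SparkContext(appName="registerDataFrameAsTableDemo")
sqlContext = SQLContext(sc)

# Build a small DataFrame and register it as a temporary SQL table named "people"
df = sqlContext.createDataFrame([(1, "alice"), (2, "bob")], ["id", "name"])
sqlContext.registerDataFrameAsTable(df, "people")

# The registered name can now be referenced from SQL
sqlContext.sql("SELECT name FROM people WHERE id = 1").show()

sc.stop()

Every example below follows the same pattern: create or load a DataFrame, register it under a table name, then query it through sqlContext.sql.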

Example 1: ALS_fit

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
def ALS_fit():
    usern = request.args.get('usern')
    users_df = pd.read_sql_query('''SELECT DISTINCT mt3ratings.user, user_id FROM mt3ratings WHERE appdata = 1''', engine)
    if usern not in users_df['user'].values:
        return_str = "can't find user"
        return jsonify(result=return_str)
    user_id = users_df.user_id[users_df.user == usern].values[0]
    try:
        key = request.args.get('key')
    except NameError:
        key = 'e'
    if key == 'abcd':
        # start Spark; a ValueError means a SparkContext already exists
        try:
            conf = SparkConf().setAppName("BeerSleuthALS").set("spark.executor.memory", "4g")
            sc = SparkContext(conf=conf)
        except ValueError:
            pass
        sqlContext = SQLContext(sc)
        ratings_sqldf = modeling.get_item_user_rev_from_pg(engine, sqlContext)
        sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")
        print('fitting model')
        model = modeling.fit_final_model(ratings_sqldf)
        # score every beer (and the filtered top-20 candidate set) for this user
        beer_ids = beer_dict.values()
        to_predict = zip([user_id] * len(beer_ids), beer_ids)
        to_predict_top20 = zip([user_id] * len(beer_id_filt), beer_id_filt)
        user_preds = model.predictAll(sc.parallelize(to_predict)).collect()
        user_preds_top20 = model.predictAll(sc.parallelize(to_predict_top20)).collect()
        print('got preds')
        preds = Counter({x[1]: x[2] for x in user_preds})
        preds_top20 = Counter({x[1]: x[2] for x in user_preds_top20})
        with open('%s%s_preds.pkl' % (pred_path, user_id), 'wb') as f:
            pickle.dump(preds, f)
        with open('%s%s_preds_top20.pkl' % (pred_path, user_id), 'wb') as f:
            pickle.dump(preds_top20, f)
        print('done')
        sc.stop()
        return jsonify(result="Model training complete, you may now get predictions")
Developer: JohnRenshaw, Project: BeerSleuth, Lines of code: 39, Source file: beersleuth_web.py

Example 2: SparkContext

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
    #input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s*" % ym
    input = '/input/loginfowlog/*'

    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home
    conf = (SparkConf()
            .setMaster(master)
            .setAppName(appName)
            .set("spark.sql.parquet.binaryAsString","true")
            )
    sc = SparkContext(conf = conf)
    sql_context = SQLContext(sc)
    sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())

    parquet_df = sql_context.read.parquet(input)
    sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")
    #_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
    _sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
    rs_df = sql_context.sql(_sql)
    rs = rs_df.collect()
    logger.info("---->" + str(len(rs)))

    lists = []
    for r in rs:
        usermac = r[0]
        days = r[1]
        t = (usermac,days)
        lists.append(t)
        #logger.debug(t)

    dao = MysqlDao()
Developer: wangcunxin, Project: spark_py, Lines of code: 33, Source file: login_days_main.py

Example 3: SparkContext

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
    day = pro_time.strftime("%Y%m%d")

    master = "spark://hadoop:7077"
    appName = "spark_pageflow_outflow"
    input = "/impala/parquet/site/site-pageflowv1/dat=%s" % day

    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home

    sc = SparkContext(master, appName)
    sql_context = SQLContext(sc)
    sql_context.registerFunction("to_day", lambda x: mill_date_str(x), StringType())
    sql_context.registerFunction("to_str", lambda x: bytearray_str(x), StringType())

    parquet_df = sql_context.read.parquet(input)
    sql_context.registerDataFrameAsTable(parquet_df, "site_pageflowv1")

    _sql = "select to_str(url),to_day(createtime) day,count(1) pv,count(distinct to_str(guuid)) uv " \
           "from site_pageflowv1 where dat= %s and to_str(name)='outflow' " \
           "group by to_str(url),to_day(createtime)" % day

    rs_df = sql_context.sql(_sql)
    rs = rs_df.collect()
    logger.info("---->" + str(len(rs)))

    list = []
    for r in rs:
        url = r[0]
        day = r[1]
        pv = r[2]
        uv = r[3]
Developer: wangcunxin, Project: spark_py, Lines of code: 33, Source file: main.py

Example 4: filterData

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
  return [ record[i].replace('"','') for i in indexes]

def filterData(record):
  flag = True
  if (int(record[-4])<1) or (record[-2] not in (['1','4'])) or (record[-1] != ''): flag = False
  return flag

if __name__ == '__main__':
  sc = SparkContext(appName = 'CF_prod_in_transaction')
  sqlContext = SQLContext(sc)
  in_file = sc.textFile(sys.argv[1])
  data = in_file.map(oritentData).filter(filterData).map(lambda x: [int(i) for i in x[:-3]])
  Record = Row('customer_id','product_id','invoice_id','units')
  data = data.map(lambda x: Record(*x))
  data = sqlContext.createDataFrame(data)
  sqlContext.registerDataFrameAsTable(data,'table1')
  df = sqlContext.sql('select customer_id, product_id, sum(units) as prod_in_transactions from table1 group by customer_id, product_id')
  df.map(lambda x: ','.join([str(r) for r in x])).saveAsTextFile(sys.argv[2])
  sc.stop()



# Leftover scratch code from the end of the source file (truncated in the original snippet):
# data_path,header,train_sample,number,support,confidence,lift,k,testing,testing_split,seed,output_path
import csv

write = open('test.csv', 'w')
wrtr = csv.writer(write)

read = open('arqiva.csv')
for line in read:
    wrtr.writerow(line)

from e
Developer: pandey957, Project: officework, Lines of code: 33, Source file: collaborative_filtering.py

Example 5: SparkConf

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
     sim_df.drop_duplicates()
     return sim_df

if __name__ == '__main__':
    # set up environment
    conf = SparkConf() \
      .setAppName("BeerSleuthALS") \
      .set("spark.driver.memory", "8g")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    #load data
    engine = create_engine('postgresql://postgres:[email protected]:5432/beersleuth')
    ratings_sqldf = get_item_user_rev_from_pg(engine, sqlContext)
    beer_sqldf = get_beer_data(engine)
    sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")
#    train, test = sqlContext.table('ratings').randomSplit([.8, .2])
#    train = train.cache()
#    test = test.cache()
##    add_rating_to_db(user='johnjohn', beer=u'101 North Heroine IPA' , taste=8, engine=engine)
##    add_rating_to_db(user='johnjohn', beer=u'Boulder Creek Golden Promise' , taste=6, engine=engine)
##    model_param_sweep(train, test)
#    import timeit
#    start_time = timeit.default_timer()
#    model = fit_final_model(ratings_sqldf)
#    elapsed = timeit.default_timer() - start_time




Developer: JohnRenshaw, Project: BeerSleuth, Lines of code: 28, Source file: beer_spark.py

Example 6: SparkContext

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
    conf.set("spark.driver.maxResultSize", "10g")

    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)

    # path to hillary/enron avro
    enr = sqlContext.read.format(
        "com.databricks.spark.avro").load(
            "s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
    hil = sqlContext.read.format(
        "com.databricks.spark.avro").load(
            "s3n://datasets-396316040607/hillary/*.avro").repartition(16)

    # register tables
    sqlContext.registerDataFrameAsTable(hil, "hillary")
    sqlContext.registerDataFrameAsTable(enr, "enron")

    # register udf
    sqlContext.registerFunction(
        "getCos", lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
    )

    # do the cosine similarity on the text, get the top 1000 matches
    out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
                         "e.contents e_mail, h.contents h_mail, "
                         "getCos(e.contents, h.contents) as cos_sim "
                         "from hillary as h join enron as e order by cos_sim "
                         "desc limit 1000")

    # write back out to s3
Developer: JasonSanchez, Project: email-like-enron, Lines of code: 33, Source file: cross_product.py


Note: The pyspark.SQLContext.registerDataFrameAsTable examples in this article were compiled by 纯净天空 from open-source code hosted on GitHub, MSDocs, and similar platforms. The snippets were selected from open-source projects, and copyright of the source code remains with the original authors; refer to each project's license before distributing or reusing the code. Do not reproduce this article without permission.