This article collects typical usage examples of the Python method pyspark.SQLContext.registerDataFrameAsTable. If you are wondering how SQLContext.registerDataFrameAsTable is used in Python, or what it is for, the curated code examples here may help. You can also explore further usage examples of the class this method belongs to, pyspark.SQLContext.
Six code examples of SQLContext.registerDataFrameAsTable are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
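Before the examples, here is a minimal, self-contained sketch of the typical pattern: create a DataFrame, register it as a temporary table with registerDataFrameAsTable, and query it with SQL. The DataFrame contents, app name, and table name below are illustrative placeholders, not taken from any of the examples that follow.

from pyspark import SparkContext, SQLContext

sc = SparkContext(appName="registerDataFrameAsTableDemo")
sqlContext = SQLContext(sc)

# build a small DataFrame (illustrative data)
df = sqlContext.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])

# register it under a table name so it can be referenced from SQL
sqlContext.registerDataFrameAsTable(df, "demo")

# query the registered table
sqlContext.sql("SELECT id, label FROM demo WHERE id > 1").show()

sc.stop()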
Example 1: ALS_fit
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
def ALS_fit():
    usern = request.args.get('usern')
    users_df = pd.read_sql_query('''SELECT DISTINCT mt3ratings.user, user_id FROM mt3ratings WHERE appdata = 1''', engine)
    if usern not in users_df['user'].values:
        return_str = "can't find user"
        return jsonify(result=return_str)
    user_id = users_df.user_id[users_df.user == usern].values[0]
    try:
        key = request.args.get('key')
    except NameError:
        key = 'e'
    if key == 'abcd':
        # start spark
        try:
            conf = SparkConf().setAppName("BeerSleuthALS").set("spark.executor.memory", "4g")
            sc = SparkContext(conf=conf)
        except ValueError:
            pass
        sqlContext = SQLContext(sc)
        ratings_sqldf = modeling.get_item_user_rev_from_pg(engine, sqlContext)
        sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")
        print('fitting model')
        model = modeling.fit_final_model(ratings_sqldf)
        beer_ids = beer_dict.values()
        to_predict = zip([user_id] * len(beer_ids), beer_ids)
        to_predict_top20 = zip([user_id] * len(beer_id_filt), beer_id_filt)
        user_preds = model.predictAll(sc.parallelize(to_predict)).collect()
        user_preds_top20 = model.predictAll(sc.parallelize(to_predict_top20)).collect()
        print('got preds')
        preds = Counter({x[1]: x[2] for x in user_preds})
        preds_top20 = Counter({x[1]: x[2] for x in user_preds_top20})
        with open('%s%s_preds.pkl' % (pred_path, user_id), 'wb') as f:
            pickle.dump(preds, f)
        with open('%s%s_preds_top20.pkl' % (pred_path, user_id), 'wb') as f:
            pickle.dump(preds_top20, f)
        print('done')
        sc.stop()
        return jsonify(result="Model training complete, you may now get predictions")
Example 2: SparkContext
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
#input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s*" % ym
input = '/input/loginfowlog/*'
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home
conf = (SparkConf()
        .setMaster(master)
        .setAppName(appName)
        .set("spark.sql.parquet.binaryAsString", "true")
        )
sc = SparkContext(conf = conf)
sql_context = SQLContext(sc)
sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())
parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")
#_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
_sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
rs_df = sql_context.sql(_sql)
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))
lists = []
for r in rs:
    usermac = r[0]
    days = r[1]
    t = (usermac, days)
    lists.append(t)
    # logger.debug(t)
dao = MysqlDao()
Example 3: SparkContext
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
day = pro_time.strftime("%Y%m%d")
master = "spark://hadoop:7077"
appName = "spark_pageflow_outflow"
input = "/impala/parquet/site/site-pageflowv1/dat=%s" % day
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home
sc = SparkContext(master, appName)
sql_context = SQLContext(sc)
sql_context.registerFunction("to_day", lambda x: mill_date_str(x), StringType())
sql_context.registerFunction("to_str", lambda x: bytearray_str(x), StringType())
parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "site_pageflowv1")
_sql = "select to_str(url),to_day(createtime) day,count(1) pv,count(distinct to_str(guuid)) uv " \
"from site_pageflowv1 where dat= %s and to_str(name)='outflow' " \
"group by to_str(url),to_day(createtime)" % day
rs_df = sql_context.sql(_sql)
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))
list = []
for r in rs:
    url = r[0]
    day = r[1]
    pv = r[2]
    uv = r[3]
Example 4: filterData
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
    return [record[i].replace('"', '') for i in indexes]  # tail of the oritentData mapper used below (rest of the helper not shown in this excerpt)


def filterData(record):
    flag = True
    if (int(record[-4]) < 1) or (record[-2] not in ['1', '4']) or (record[-1] != ''):
        flag = False
    return flag


if __name__ == '__main__':
    sc = SparkContext(appName='CF_prod_in_transaction')
    sqlContext = SQLContext(sc)
    in_file = sc.textFile(sys.argv[1])
    data = in_file.map(oritentData).filter(filterData).map(lambda x: [int(i) for i in x[:-3]])
    Record = Row('customer_id', 'product_id', 'invoice_id', 'units')
    data = data.map(lambda x: Record(*x))
    data = sqlContext.createDataFrame(data)
    sqlContext.registerDataFrameAsTable(data, 'table1')
    df = sqlContext.sql('select customer_id, product_id, sum(units) as prod_in_transactions from table1 group by customer_id, product_id')
    df.map(lambda x: ','.join([str(r) for r in x])).saveAsTextFile(sys.argv[2])
    sc.stop()
Example 5: SparkConf
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
    sim_df.drop_duplicates()  # tail of a helper that builds sim_df (body not shown in this excerpt)
    return sim_df


if __name__ == '__main__':
    # set up environment
    conf = SparkConf() \
        .setAppName("BeerSleuthALS") \
        .set("spark.driver.memory", "8g")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # load data
    engine = create_engine('postgresql://postgres:[email protected]:5432/beersleuth')
    ratings_sqldf = get_item_user_rev_from_pg(engine, sqlContext)
    beer_sqldf = get_beer_data(engine)
    sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")
    # train, test = sqlContext.table('ratings').randomSplit([.8, .2])
    # train = train.cache()
    # test = test.cache()
    ## add_rating_to_db(user='johnjohn', beer=u'101 North Heroine IPA', taste=8, engine=engine)
    ## add_rating_to_db(user='johnjohn', beer=u'Boulder Creek Golden Promise', taste=6, engine=engine)
    ## model_param_sweep(train, test)
    # import timeit
    # start_time = timeit.default_timer()
    # model = fit_final_model(ratings_sqldf)
    # elapsed = timeit.default_timer() - start_time
Example 6: SparkContext
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerDataFrameAsTable [as alias]
conf.set("spark.driver.maxResultSize", "10g")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
# path to hillary/enron avro
enr = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
hil = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/hillary/*.avro").repartition(16)
# register tables
sqlContext.registerDataFrameAsTable(hil, "hillary")
sqlContext.registerDataFrameAsTable(enr, "enron")
# register udf
sqlContext.registerFunction(
    "getCos", lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
)
# do the cosine similarity on the text, get the top 1000 matches
out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
"e.contents e_mail, h.contents h_mail, "
"getCos(e.contents, h.contents) as cos_sim "
"from hillary as h join enron as e order by cos_sim "
"desc limit 1000")
# write back out to s3