This article collects typical usage examples of the registerDataFrameAsTable method of pyspark.sql.HiveContext in Python. If you are unsure what HiveContext.registerDataFrameAsTable does or how to call it, the curated examples below should help. You can also look into the containing class, pyspark.sql.HiveContext, for related usage.
Two code examples of HiveContext.registerDataFrameAsTable are shown below, sorted by popularity by default.
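Before the full examples, here is a minimal, self-contained sketch of the method itself: registerDataFrameAsTable registers a DataFrame as a temporary table so it can be queried through SQL on the same context. The toy data and the table name "people" below are illustrative assumptions, not taken from the examples.
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext()
hive_ctx = HiveContext(sc)

# Toy DataFrame (hypothetical data, for illustration only)
df = hive_ctx.createDataFrame([(1, "alice"), (2, "bob")], ["id", "name"])

# Register the DataFrame under the temporary table name "people"
hive_ctx.registerDataFrameAsTable(df, "people")

# The registered table is now visible to SQL queries on this context
hive_ctx.sql("select name from people where id = 1").show()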
Example 1:
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerDataFrameAsTable [as alias]
from pyspark.sql import HiveContext
from pyspark.sql import functions as F

# df_employees and df_salaries are DataFrames, and hive_ctx is a
# HiveContext, all assumed to be created earlier in the original script.
# Perform an INNER JOIN of the two DataFrames on the EMP_NO column.
# As of Spark 1.4, joining on a column name does not duplicate the join
# column in the result.
df_emp_sal_join = df_employees.join(df_salaries, "emp_no").select(
    "emp_no", "birth_date", "first_name", "last_name", "gender",
    "hire_date", "salary", "from_date", "to_date")
# Adding a column 'year' to the data frame for partitioning the hive table
df_add_year = df_emp_sal_join.withColumn('year', F.year(df_emp_sal_join.to_date))
# Adding a load date column to the data frame
df_final = df_add_year.withColumn('Load_date', F.current_date())
# repartition returns a new DataFrame, so the result must be reassigned
df_final = df_final.repartition(10)
# Registering data frame as a temp table for SparkSQL
hive_ctx.registerDataFrameAsTable(df_final, "EMP_TEMP")
# Target Type: APACHE HIVE
# Database   : EMPLOYEES
# Table Name : EMPLOYEE_DIM
# + ------------------------------------ +
# | COLUMN NAME | TYPE   | PARTITION     |
# + ------------------------------------ +
# | EMP_NO      | INT    |               |
# | BIRTH_DATE  | DATE   |               |
# | FIRST_NAME  | STRING |               |
# | LAST_NAME   | STRING |               |
# | GENDER      | STRING |               |
# | HIRE_DATE   | DATE   |               |
# | SALARY      | INT    |               |
# | FROM_DATE   | DATE   |               |
# | TO_DATE     | DATE   |               |
# | YEAR        | INT    | YES           |
# | LOAD_DATE   | DATE   |               |
# + ------------------------------------ +
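The excerpt ends after registering EMP_TEMP; the actual load into the partitioned target table is not shown. A minimal sketch of how that final step might look, assuming the EMPLOYEES.EMPLOYEE_DIM table described above already exists and dynamic partitioning is acceptable (the SQL below is an assumption, not part of the original example):
# Hypothetical load step, assuming the target table exists and is
# partitioned by year (see the comment block above).
hive_ctx.sql("set hive.exec.dynamic.partition=true")
hive_ctx.sql("set hive.exec.dynamic.partition.mode=nonstrict")
hive_ctx.sql("""
    insert overwrite table EMPLOYEES.EMPLOYEE_DIM partition (year)
    select emp_no, birth_date, first_name, last_name, gender, hire_date,
           salary, from_date, to_date, Load_date, year
    from EMP_TEMP
""")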
Example 2: main
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerDataFrameAsTable [as alias]
import os
import sys
import time

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, SQLContext

import utils  # project-local module supplying returnSparkDF (not shown)
def main(args):
    """Main code for relevance computation."""
    start_time = time.time()
    # The code that sets the four connection properties below has been
    # removed from the original snippet; None is only a placeholder.
    driver = None
    url = None
    username = None
    password = None
    inputs = [driver, url, username, password]
    filename = str(args[0])
    if not os.path.exists(filename):
        sys.exit("Input file %s not found" % filename)
    with open(filename, 'r') as infile:
        for line in infile:
            key, val = line.split(",")
            key = str(key).strip()
            if key == "dbalias":
                dbalias = str(val).strip()
            elif key == "numpartitions":
                numpartitions = int(val)
            elif key == "datadir":
                datadir = str(val).strip()
            else:
                print("Unrecognized key, not set: %s" % key)
    # Make sure the datadir variable was set by the input file.
    try:
        print("datadir = '%s'" % datadir)
    except NameError:
        sys.exit("'datadir' variable not set. Check input file '%s'" % filename)
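    # For reference, the input file parsed above is expected to hold
    # comma-separated key/value pairs, one per line. A hypothetical
    # example (the values are illustrative assumptions only):
    #
    #     dbalias,proddb
    #     numpartitions,200
    #     datadir,/data/relevance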
    # Spark and Hive contexts
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    df = utils.returnSparkDF(SQLContext(sc), inputs, "traffic")
    if df is None:
        sys.exit("'traffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniquedata")
    df = None

    df = utils.returnSparkDF(SQLContext(sc), inputs, "fbtraffic")
    if df is None:
        sys.exit("'fbtraffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniqueFBdata")
    df = None
statement = "Select ud.loginid, ud.adid, ud.Type, ufd.Type as FBType "\
"from uniquedata ud left outer join uniqueFBdata ufd "\
"on ud.loginid = ufd.loginid and ud.adid = ufd.adid"
adswithFBjoined = sqlContext.sql(statement)
adswithFBjoined_cleaned = adswithFBjoined[adswithFBjoined['FBType'].isNull()]
adswithFBjoined_cleaned = adswithFBjoined_cleaned.drop('FBType')
sqlContext.registerDataFrameAsTable(adswithFBjoined_cleaned, "data")
statement = "Select loginid, count(loginid) as viewcount from data group by loginid"
temp = sqlContext.sql(statement)
sqlContext.registerDataFrameAsTable(temp, "viewdata")
statement = "Select d.* from data d, viewdata vd where d.loginid = vd.loginid and vd.viewcount > 1"
temp2 = sqlContext.sql(statement)
sqlContext.sql("drop table data")
sqlContext.registerDataFrameAsTable(temp2, "data")
temp, temp2 = (None, None)
    df = utils.returnSparkDF(SQLContext(sc), inputs, "agent")
    if df is None:
        sys.exit("'agent' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "agentdata")

    # Deduplicate (loginid, adid, Type) triples; the counter column is a
    # by-product of the GROUP BY and is dropped immediately.
    statement = "select loginid, adid, Type, count(adid) as counter from agentdata group by loginid, adid, Type"
    unique_adid_per_loginid = sqlContext.sql(statement)
    unique_adid_per_loginid = unique_adid_per_loginid.drop('counter')
    sqlContext.registerDataFrameAsTable(unique_adid_per_loginid, "agentdata")

    df = utils.returnSparkDF(SQLContext(sc), inputs, "favorite")
    if df is None:
        sys.exit("'favorite' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "favdata")
    df = None
statement = "select * from data union all select * from agentdata union all select * from favdata"
df2 = sqlContext.sql(statement)
sqlContext.registerDataFrameAsTable(df2, "uniondata")
df2 = None
statement = "select loginid, max(Type) as UserMaxConversion from uniondata group by loginid"
maxtype = sqlContext.sql(statement)
sqlContext.registerDataFrameAsTable(maxtype, "maxconversiondata")
statement = "select uniondata.loginid, uniondata.adid, uniondata.Type "\
"from uniondata, maxconversiondata where uniondata.loginid = maxconversiondata.loginid "\
"and uniondata.Type = maxconversiondata.UserMaxConversion"
data = sqlContext.sql(statement)
#.........这里部分代码省略.........
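Example 2 relies on a project-local helper, utils.returnSparkDF, whose source is not shown. A plausible minimal sketch, assuming it runs a named query over JDBC and returns a DataFrame, or None on failure; the query registry, table names, and error handling here are illustrative assumptions only:
# Hypothetical reconstruction of utils.returnSparkDF. Assumes `inputs` is
# [driver, url, username, password] and `name` selects a predefined query.
QUERIES = {
    "traffic":   "(select loginid, adid, Type from traffic_events) q",
    "fbtraffic": "(select loginid, adid, Type from fb_traffic_events) q",
    "agent":     "(select loginid, adid, Type from agent_events) q",
    "favorite":  "(select loginid, adid, Type from favorite_events) q",
}

def returnSparkDF(sql_context, inputs, name):
    driver, url, username, password = inputs
    try:
        # Read the named query through the JDBC data source
        return sql_context.read.format("jdbc").options(
            driver=driver,
            url=url,
            user=username,
            password=password,
            dbtable=QUERIES[name],
        ).load()
    except Exception:
        return None  # callers check for None and exit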