

Python HiveContext.registerDataFrameAsTable Method Code Examples

This article collects typical usage examples of the Python method pyspark.sql.HiveContext.registerDataFrameAsTable from open-source projects. If you are unsure what the method does or how to call it, the selected examples below should help; you can also look further into other usage examples of pyspark.sql.HiveContext.


Two code examples of HiveContext.registerDataFrameAsTable are shown below, ordered by popularity by default.
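Before the project examples, here is a minimal, self-contained sketch of the method, assuming the Spark 1.x API where registerDataFrameAsTable lives on SQLContext/HiveContext (the DataFrame contents and the table name people_tmp are illustrative, not taken from the examples below):

from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName="register_example")
hive_ctx = HiveContext(sc)

# Register a DataFrame under a name so it can be queried with plain SQL.
df = hive_ctx.createDataFrame([(1, "alice"), (2, "bob")], ["id", "name"])
hive_ctx.registerDataFrameAsTable(df, "people_tmp")
hive_ctx.sql("select name from people_tmp where id > 1").show()

In Spark 2.x the same registration is done with df.createOrReplaceTempView("people_tmp"); registerDataFrameAsTable is the Spark 1.x spelling.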

Example 1:

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerDataFrameAsTable [as alias]
# This snippet also assumes: from pyspark.sql import functions as F
# and a HiveContext instance named hive_ctx.
# Perform an INNER JOIN of the two data frames on the EMP_NO column.
# As of Spark 1.4 you no longer get a duplicate join column in the result.
df_emp_sal_join = df_employees.join(df_salaries, "emp_no").select("emp_no", "birth_date", "first_name",
                                                                  "last_name", "gender", "hire_date",
                                                                  "salary", "from_date", "to_date")

# Adding a column 'year' to the data frame for partitioning the hive table
df_add_year = df_emp_sal_join.withColumn('year', F.year(df_emp_sal_join.to_date))

# Adding a load date column to the data frame
df_final = df_add_year.withColumn('Load_date', F.current_date())

# repartition() returns a new DataFrame, so the result must be reassigned
df_final = df_final.repartition(10)

# Registering data frame as a temp table for SparkSQL
hive_ctx.registerDataFrameAsTable(df_final, "EMP_TEMP")

# Target Type: APACHE HIVE
# Database   : EMPLOYEES
# Table Name : EMPLOYEE_DIM
# +-------------+--------+-----------+
# | COLUMN NAME | TYPE   | PARTITION |
# +-------------+--------+-----------+
# | EMP_NO      | INT    |           |
# | BIRTH_DATE  | DATE   |           |
# | FIRST_NAME  | STRING |           |
# | LAST_NAME   | STRING |           |
# | GENDER      | STRING |           |
# | HIRE_DATE   | DATE   |           |
# | SALARY      | INT    |           |
# | FROM_DATE   | DATE   |           |
# | TO_DATE     | DATE   |           |
# | LOAD_DATE   | DATE   |           |
# | YEAR        | INT    | YES       |
# +-------------+--------+-----------+
Developer ID: uday07, Project: Spark-ETL, Lines of code: 33, Source file: mysql_to_hive_etl.py
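Example 1 ends before EMP_TEMP is actually written out to the target table. A plausible continuation, assuming the dynamic-partition insert pattern that the table layout above suggests (this is a sketch, not code from mysql_to_hive_etl.py), would be:

# Hypothetical continuation: load the registered temp table into the
# partitioned Hive table, partitioning dynamically on 'year'.
hive_ctx.sql("SET hive.exec.dynamic.partition=true")
hive_ctx.sql("SET hive.exec.dynamic.partition.mode=nonstrict")
hive_ctx.sql("INSERT OVERWRITE TABLE EMPLOYEES.EMPLOYEE_DIM PARTITION (year) "
             "SELECT emp_no, birth_date, first_name, last_name, gender, "
             "hire_date, salary, from_date, to_date, Load_date, year "
             "FROM EMP_TEMP")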

Example 2: main

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerDataFrameAsTable [as alias]
# This snippet also assumes the full script's imports, along the lines of:
# import os, sys, time
# from pyspark import SparkConf, SparkContext
# from pyspark.sql import SQLContext
# import utils  # project-local helper module
def main(args):
    """ Main code for relevance computation """
    
    start_time = time.time()
    
    # iq (code snippets that set the properties below have been removed;
    # None placeholders keep the snippet syntactically valid)
    driver   = None
    url      = None
    username = None
    password = None
    inputs = [driver, url, username, password]

    
    filename = str(args[0])
    if not os.path.exists(filename):
        sys.exit("Input file %s not found" % filename)
    with open(filename, 'r') as infile:
        for line in infile:
            key, val = line.split(",")
            key = str(key).strip()
            if key == "dbalias":
                dbalias = str(val).strip()
            elif key == "numpartitions":
                numpartitions = int(val)
            elif key == "datadir":
                datadir = str(val).strip()
            else:
                print("Unrecognized key: %s" % key)
    # Need to make sure that the datadir variable is set.
    try:
        print("datadir = '%s'" % datadir)
    except NameError:
        sys.exit("'datadir' variable not set. Check input file '%s'"
                 % filename)
            
    # Spark and Hive contexts
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    
    # Pull each source data set via utils.returnSparkDF and register it
    # as a SparkSQL temp table; abort if any extraction fails.
    df = utils.returnSparkDF(SQLContext(sc), inputs, "traffic")
    if df is None: sys.exit("'traffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniquedata")
    df = None
    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "fbtraffic")
    if df is None: sys.exit("'fbtraffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniqueFBdata")
    df = None    

    statement = "Select ud.loginid, ud.adid, ud.Type, ufd.Type as FBType "\
                "from uniquedata ud left outer join uniqueFBdata ufd "\
                "on ud.loginid = ufd.loginid and ud.adid = ufd.adid"
    adswithFBjoined = sqlContext.sql(statement)
    adswithFBjoined_cleaned = adswithFBjoined[adswithFBjoined['FBType'].isNull()]
    adswithFBjoined_cleaned = adswithFBjoined_cleaned.drop('FBType')

    sqlContext.registerDataFrameAsTable(adswithFBjoined_cleaned, "data")

    statement = "Select loginid, count(loginid) as viewcount from data group by loginid"
    temp = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(temp, "viewdata")
    
    statement = "Select d.* from data d, viewdata vd where d.loginid = vd.loginid and vd.viewcount > 1"
    temp2 = sqlContext.sql(statement)
    
    sqlContext.sql("drop table data")
    sqlContext.registerDataFrameAsTable(temp2, "data")
        
    temp, temp2  = (None, None)

    df = utils.returnSparkDF(SQLContext(sc), inputs, "agent")
    if df is None: sys.exit("'agent' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "agentdata")

    statement = "select loginid, adid, Type, count(adid) as counter from agentdata group by loginid, adid, Type"
    unique_adid_per_loginid = sqlContext.sql(statement)
    unique_adid_per_loginid = unique_adid_per_loginid.drop('counter')
    sqlContext.registerDataFrameAsTable(unique_adid_per_loginid, "agentdata")
    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "favorite")
    if df is None: sys.exit("'favorite' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "favdata")
    df = None
    
    statement = "select * from data union all select * from agentdata union all select * from favdata"
    df2 = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(df2, "uniondata")
    df2 = None
    
    statement = "select loginid, max(Type) as UserMaxConversion from uniondata group by loginid"
    maxtype = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(maxtype, "maxconversiondata")

    statement = "select uniondata.loginid, uniondata.adid, uniondata.Type "\
                "from uniondata, maxconversiondata where uniondata.loginid = maxconversiondata.loginid "\
                "and uniondata.Type = maxconversiondata.UserMaxConversion"
    data = sqlContext.sql(statement)
#.........some of the code is omitted here.........
Developer ID: magnusax, Project: production, Lines of code: 103, Source file: relevance_stage_v13_sparketl.py
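A side note on the sqlContext.sql("drop table data") call above: names registered with registerDataFrameAsTable are temporary tables, and from Spark 1.6 onward SQLContext provides a dedicated dropTempTable method for them. A minimal sketch of the equivalent cleanup under that assumption (re-registering under the same name also simply replaces the previous registration):

# Assuming Spark >= 1.6: drop the temp table through the dedicated API
# instead of issuing DDL through the Hive metastore.
sqlContext.dropTempTable("data")
sqlContext.registerDataFrameAsTable(temp2, "data")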


Note: the pyspark.sql.HiveContext.registerDataFrameAsTable examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other open-source code and documentation platforms. The code snippets are selected from open-source projects contributed by various developers; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.