

Python HiveContext.createDataFrame Method Code Examples

This article collects typical usage examples of the Python method pyspark.sql.HiveContext.createDataFrame. If you are wondering what exactly HiveContext.createDataFrame does, how to call it, or what real-world usage looks like, the curated code examples below should help. You can also explore further usage examples of pyspark.sql.HiveContext, the class this method belongs to.


The following shows 15 code examples of HiveContext.createDataFrame, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
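
Before diving into the examples, here is a minimal sketch of the basic call pattern, assuming a Spark 1.x environment where HiveContext and registerTempTable are still available. It is only an illustration and not taken from any of the projects below; the app name, the Row fields, and the temp-table name people_tmp are made up for the demo.

from pyspark import SparkContext
from pyspark.sql import HiveContext, Row

sc = SparkContext(appName="createDataFrame_demo")   # illustrative app name
hc = HiveContext(sc)

# createDataFrame accepts a list or RDD of Row objects (or tuples plus an explicit schema)
people = [Row(name="alice", age=30), Row(name="bob", age=25)]
df = hc.createDataFrame(people)

df.printSchema()
df.registerTempTable("people_tmp")   # expose the DataFrame to SQL queries
hc.sql("select name from people_tmp where age > 26").show()

sc.stop()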

Example 1: gen_report_table

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
 def gen_report_table(hc,curUnixDay):
     rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6])))
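     # note: the unbound method is called with hc passed explicitly as the context, i.e. equivalent to hc.createDataFrame(rows_indoor)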
     HiveContext.createDataFrame(hc,rows_indoor).registerTempTable("df_indoor")
     #ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY 
     sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_indoor order by entityid,clientmac,utoday" 
     df_id_stat=hc.sql(sql)
     df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_id_mm is the min/max df, used to calculate first arrival and last arrival
     df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     # the distinct df is needed for the lag function to work
     df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY]
     df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon)
     df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth")
     
     #newly added part for indoors7 and indoors30 based on current date
     df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0))
     df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0))
     df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
     hc.sql("drop table if exists df_indoor_fin")
     df_indoor_fin.write.saveAsTable("df_indoor_fin")
     
     rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5])))
     HiveContext.createDataFrame(hc,rows_flow).registerTempTable("df_flow")
     
     # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
     sql="select entityid,clientmac,utoday,UFIRSTDAY,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_flow order by entityid,clientmac,utoday" 
     df_fl_stat=hc.sql(sql)
     df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_fl_mm is the min/max df, used to calculate first arrival and last arrival
     df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     # the distinct df is needed for the lag function to work
     df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY]
     df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon)
     df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth")
     hc.sql("drop table if exists df_flow_fin")
     df_flow_fin.write.saveAsTable("df_flow_fin") 
Author: dalinqin, Project: src, Lines of code: 54, Source: main_report.py

Example 2: SparkEngine

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
class SparkEngine(object):
    def __init__(self, sc, debug=False):
        self.export_path = os.environ['COOPERHEWITT_ROOT'] + "/export/"
        self.sc = sc
        # hive requires writable permissions: ~/ephemeral-hdfs/bin/hadoop fs -chmod 777 /tmp/hive
        self.hive_cxt = HiveContext(sc)
        self.sql_cxt  = SQLContext(sc)
        if debug:
            print "{0}\n{1}\n{2}\n".format(sc.master, self.hive_cxt, self.sql_cxt)
            print sc._conf.getAll()
        # TBD: add a destructor to unpersist cached memory

### functionality to query and create tables
    def _create_df_table(self, schema, frame, name):
        if schema: df = self.hive_cxt.createDataFrame(frame, schema=schema)
        else: df = self.hive_cxt.createDataFrame(frame)
        df.printSchema()
        df.registerTempTable(name)
        self.hive_cxt.cacheTable(name)
        return df

    def _query_temporal_data(self):
        # step 1. create main temporal table
        # n_obs => the first join produces, for each pen entry, one row per existing location entry (dependent on the time period)
        samples_temporal_tb = self.hive_cxt.sql("""
            SELECT  s.refers_to_object_id, created, visit_raw,
                    room_floor, room_id, room_name,
                    spot_id, spot_name, spot_description,
                    room_count_objects, room_count_spots, spot_count_objects,
                    abs(datediff(
                        from_utc_timestamp(from_unixtime(created,   "yyyy-MM-dd"), 'US/Eastern'),
                        from_utc_timestamp(from_unixtime(visit_raw, "yyyy-MM-dd"), 'US/Eastern')
                    )) as delta
            FROM samples s
            JOIN temporal t
            ON s.refers_to_object_id = t.refers_to_object_id
            ORDER by s.refers_to_object_id, created, delta
        """)
        samples_temporal_tb.registerTempTable('samplestemporal')
        self.hive_cxt.cacheTable('samplestemporal')
        return samples_temporal_tb

    def _minimize_query(self):
        # From the temporal table, we need to reduce the multiple location rows down to the one matching the sample timestamp
        tb_samples = self.hive_cxt.sql("""
            SELECT *
            FROM (
                SELECT *,
                MIN(delta)   OVER ( PARTITION BY refers_to_object_id, created) AS min_delta,
                row_number() OVER ( PARTITION BY refers_to_object_id, created) AS ranks
                FROM samplestemporal st
                ORDER BY refers_to_object_id
            ) query
            where query.ranks = 1
        """)
        tb_samples = tb_samples.withColumn("meta_store", lit(1))
        tb_samples.registerTempTable('minimizedsamples')
        self.hive_cxt.cacheTable('minimizedsamples')
        return tb_samples

    def execute_query(self, (samples_schema,  samples_frame, samples_name),
                                        (temporal_schema, temporal_frame, temporal_name),
                                        cols):
        self.df_samples       = self._create_df_table(samples_schema,  samples_frame,  samples_name)
        self.df_temporal      = self._create_df_table(temporal_schema, temporal_frame, temporal_name)
        self.tb_meta          = self._query_temporal_data()
        self.tb_meta_min      = self._minimize_query()
        # combine to the original pen data (meta_store indicates if we had object data to integrate)
        self.df_samplesmeta   = self.df_samples.join(self.tb_meta_min, ['refers_to_object_id', 'created'], "left_outer")
        self.df_samplesmeta   = self.df_samplesmeta.fillna({'meta_store': 0})
        self.df_samplesmeta.printSchema()
        # pickle the pandas output; alternatively we could store it as JSON or the Parquet columnar format
        dropped_cols = ['delta', 'min_delta', 'ranks'] + cols
        samplesmeta_pd  = self.df_samplesmeta.toPandas()
        samplesmeta_pd  = samplesmeta_pd.drop(dropped_cols, axis=1)
        samplesmeta_pd.to_pickle(self.export_path + "penmeta_spark.pkl")
Author: akamlani, Project: cooperhewitt, Lines of code: 78, Source: ch_spark.py

Example 3: float

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
        return None
    else:
        info = measureInfo[measureid]
        minScore = info[0]
        scoreRange = info[1]
        return float((score - minScore)/scoreRange)

total_quality_normal = total_quality.map(lambda r: (r[0], r[1], r[2], normalizeScore(r[3], r[2])))

schema = StructType([
    StructField("providerid", StringType(), True),
    StructField("state", StringType(), True),
    StructField("measureid", StringType(), True),
    StructField("normalizedscore", FloatType(), True)])

df_total_quality = sqlContext.createDataFrame(total_quality_normal, schema)
saveAsHiveTable(df_total_quality, "total_quality")


# Some hospitals have too few non-NA measures.  To have a fair ranking, we want to set a minimum bar
# on the number of non-NA measures a hospital must have to participate in our evaluation.

# For each hospital, find out the number of non-NA measures it has
nonNAMeasureCount = dict(df_total_quality.map(lambda r: (r.providerid, r.normalizedscore)).
                         combineByKey( # Use combineByKey to count the # of non-NA Measure
                            lambda value: 0 if value is None else 1,
                            lambda x, value: x if value is None else x + 1,
                            lambda x, y: x + y).collect())

# Find the 25th percentile of non-NA measure, and this will be the min-bar of # of non-NA measure.
minMeasureCount = np.percentile(nonNAMeasureCount.values(), 25.)
Author: patng323, Project: w205-ex1, Lines of code: 33, Source: transform.py

Example 4: if

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
        if progTag==cnt:
            folderLists=x
        cnt=cnt+1
        
    for x in folderLists:
        entityid=int(get_str_entityID_byFolder(x))
        df_id=df_mysql.filter(df_mysql.ID==entityid)
        inThresh  =10  if (df_id.head(1)[0].IndoorSecondsThrehold==0) else df_id.head(1)[0].IndoorSecondsThrehold
        outThresh =300 if (df_id.head(1)[0].LeaveMinutesThrehold ==0) else df_id.head(1)[0].LeaveMinutesThrehold*60
        incFiles=fs.listStatus(sc._jvm.Path(x))
        for incFile in incFiles:
            if incFile.isFile():
                curTime=str(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time())))
                currrentFile=str(incFile.getPath())
                processData(sc,hc,fs,con,currrentFile,inThresh,outThresh,progTag)
                pd_rowLog = pandas.Series([operType,curTime,currrentFile],index=pd_column)
                pd_dataFrameLog=pd_dataFrameLog.append(pd_rowLog, ignore_index=True)
    
    
    curDay=str(time.strftime("%Y-%m-%d",time.localtime(time.time())))

    
    df_log=hc.createDataFrame(pd_dataFrameLog)
    df_log.sort(df_log.operType,df_log.processDate).repartition(1).write.mode('overwrite').json("/data/log/incData"+str(progTag)+"/"+curDay) 
 
        
    sc.stop()
    
Author: dalinqin, Project: src, Lines of code: 31, Source: main_inc.py

Example 5: SparkContext

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
from pyspark import SparkContext
sc = SparkContext()

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

tbl_Readmissions = sqlContext.sql("Select Provider_ID, Measure_ID, Compared_to_National, Denominator, Score, Lower_Estimate, Higher_Estimate From tbl_Readmissions_RAW where Score <> 'Not Available' and Measure_ID = 'READM_30_HOSP_WIDE'").rdd

tbl_Readmissions.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_readmissions/")
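# convert the filtered RDD of Rows back into a DataFrame and persist it in Parquet format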
sqlContext.createDataFrame(tbl_Readmissions).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_readmissions/")

exit()
Author: mmarks13, Project: w205_Exercise_1, Lines of code: 14, Source: transform_tbl_Readmission.py

Example 6: SparkContext

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
from pyspark import SparkContext
sc = SparkContext()

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

tbl_Effective_Care = sqlContext.sql("Select Provider_ID, Measure_ID, Score From tbl_Effective_Care_RAW where Score <> 'Not Available'").rdd

tbl_Effective_Care.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_effective_care/")
sqlContext.createDataFrame(tbl_Effective_Care).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_effective_care/")
exit()
Author: mmarks13, Project: w205_Exercise_1, Lines of code: 13, Source: transform_tbl_Effective_Care.py

Example 7: main

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
def main():
    args = parser_arguments()
    start_date = args.start_date[0]
    end_date = args.end_date[0]

    sc = SparkContext()
    hc = HiveContext(sc)


    select = """
        SELECT 
            * 
        FROM 
            cluster_metrics_prod_2.container_fact
        where 
            date between '{0}' and '{1}'
        """.format(start_date, end_date)

    df = hc.sql(select)

    header = {
            "jobid"         : "string",
            "containerid"   : "string", 
            "start"         : "bigint", 
            "stop"          : "bigint",
            "duration"      : "bigint",
            "event"         : "string", 
            "size"          : "double", 
            "priority"      : "int", 
            "hostname"      : "string", 
            "system"        : "string", 
            "date"          : "string"
            }
            

    all_rows = df.flatMap(split_data)
    schema_split_containers = hc.createDataFrame(all_rows)
    schema_split_containers.registerTempTable("split_containers")


    create_string = """
        create table if not exists cluster_metrics_prod_2.container_fact_event_flattened
            (
            jobid       string,
            containerid string,
            start       bigint,
            stop        bigint,
            duration    bigint,
            event       string,
            size        double,
            priority    int,
            hostname    string
            )
        partitioned by
            (
            system      string,
            date        string
            )
        stored as orc
    """

    set_dyn = "set hive.exec.dynamic.partition=true"
    set_nstat = "set hive.exec.dynamic.partition.mode=nonstrict"

    load_string = """
        insert overwrite table 
            cluster_metrics_prod_2.container_fact_event_flattened 
        partition
            (system, date) 
        select 
            jobid, 
            containerid, 
            start, 
            stop, 
            duration, 
            event, 
            size, 
            priority, 
            hostname, 
            system, 
            date 
        from 
            split_containers
    """

    print("Setting dynamic partition...")
    hc.sql(set_dyn)
    hc.sql(set_nstat)

    print("Creating Table...")
    hc.sql(create_string)
    print("Loading data into table...")
    hc.sql(load_string)
    print("DONE")
Author: tnystrand-altiscale, Project: datapipeline_dump, Lines of code: 96, Source: split_containerfact_spark.py

Example 8: alternateNames

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
            predictPlayers.append(pp)
        else:
            #print "p not found in mlb?", p
            newname = alternateNames(p.upper(), pids)
            if newname is not None:
                #print "got it: ", newname
                pp['lookup_name'] = newname
                pp['player_id'] = encodedPlayerIds[str(pids[newname])]
                predictPlayers.append(pp)
            else:
                print "REALLY NOT FOUND.", p.upper()
        

    print "predictHitters=", predictHitters
    phRDD = sc.parallelize(predictHitters)
    phDF = sqlContext.createDataFrame(phRDD, samplingRatio=0.5)
    phDF.registerTempTable("fd_hitters")
    print "phDF=", phDF.take(2)
    
    print "predictPitchers=", predictPitchers
    ppRDD = sc.parallelize(predictPitchers)
    ppDF = sqlContext.createDataFrame(ppRDD, samplingRatio=0.5)
    ppDF.registerTempTable("fd_pitchers")
    print "ppDF=", ppDF.take(22)

    encodedHitterFeatures = sqlContext.parquetFile(rddDir + "/batting_features.enc.parquet")
    encodedHitterFeatures.registerTempTable("bfe")

    hfDF = sqlContext.sql("""select bfe.* from fd_hitters, bfe where
                            fd_hitters.player_id = bfe.player_id
                            and fd_hitters.game_date = bfe.game_date""")
Author: dondrake, Project: mlb_stats_spark, Lines of code: 33, Source: FanDuelGame.py

Example 9: cleans

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
datalines = datalines.map(lambda x: cleans(x))

#1 imports

from pyspark.sql.types import *

#2 create metadata

fields = [StructField(field_name,StringType(),True) for field_name in firstline]

schema = StructType(fields)

#3 create a dataframe

schemaLoans = sqlContext.createDataFrame(datalines, schema)

#4 register it as a table called loans

schemaLoans.registerTempTable("loans")

#1 drop table, summarize and store in hive

sqlContext.sql("drop table if exists LoansByTitle")

sql = '''create table LoansByTitle stored as parquet as select title, count(1) as number from loans group by title order by number desc'''

sqlContext.sql(sql)
sqlContext.sql('drop table if exists raw')

sql = '''create table raw stored as parquet as select title, emp_title,grade,home_ownership,int_rate,recoveries,collection_recovery_fee,loan_amnt,term from loans'''
Author: DavyCielen, Project: python_book_chapter5, Lines of code: 32, Source: spark.py

Example 10: HiveContext

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
		logSet = sc.textFile("hdfs://hdfs.hadoop.yourdomain.com:9000/hdfs/aatlogstore/YYYY/MM/*").map(loadData)

		aatlog = logSet.map(getPairKey)

		#make vidsession_log & vidsession data
		aatlogSet = aatlog.reduceByKey(lambda x, y: x)
		vidlogSet = aatlogSet.sortByKey().zipWithIndex().map(lambda x: ((x[0][0][0], x[1]), x[0][1]))
		vidlogSet.persist()
		vidlogLag = vidlogSet.map(lambda x:((x[0][0], x[0][1]+1), x[1]))
		vidLogRST = vidlogSet.leftOuterJoin(vidlogLag).map(mkVidLog)

		vidSet = vidlogSet.map(lambda x: (x[0][0], (x[0][1], x[1])))
		vidProcSet = vidSet.combineByKey(vidCreate, vidMerge, vidMerge2)

		print "filter processing done "
		#save RDD to HIVE Tables
		hx = HiveContext(sc)

		vidTable = hx.createDataFrame(vidProcSet.map(lambda x:x[1]))
		vidLogTable = hx.createDataFrame(vidLogRST, samplingRatio=0.5)

		print "Dataframe Done"
		vidLogTable.saveAsTable('vidsession_log', mode='append')
		vidTable.saveAsTable('vidsession', mode='append')

		print "AATLOG Process Done"

	except Exception, e:
		print "MAIN Error %s" % e

Author: ddinsight, Project: dd-analytics, Lines of code: 31, Source: smkvidss.py

Example 11: map

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
cluster_center_overall_deltas = map(get_overall_delta_percentage_closure, centers)

# Convert the list to a list of tuples mapping cluster number to the performance percentage
converted_center_delta_list = DunaganListUtility.convert_list_to_value_and_index_tuple_list(cluster_center_overall_deltas)

# Sort the list of tuples by performance (in ascending order)
converted_center_delta_list.sort(lambda tuple_1, tuple_2: cmp(tuple_1[1], tuple_2[1]))

# Convert the list mapping cluster_number to performance percentage into a list of Row() object for insertion into a database table
# (ClusterId, Delta-Percentage) Row list construction
converted_center_delta_list_rows = map(lambda delta_tuple: Row(cluster_id=int(delta_tuple[0]), delta_percentage=float(delta_tuple[1])), converted_center_delta_list)

print "\n\n\n\nAbout to sqlContext.createDataFrame(converted_center_delta_list_rows)\n\n\n\n"

# Create a data frame from the list of Rows
schemaCenterDeltas = sqlContext.createDataFrame(converted_center_delta_list_rows)

print "\n\n\n\nAbout to schemaCenterDeltas.write.jdbc(url=mysql_url, table=cluster_total_delta_percentages)\n\n\n\n"

# Write the data frame to the database in table cluster_total_delta_percentages
schemaCenterDeltas.write.jdbc(url=mysql_url, table='cluster_total_delta_percentages', mode="overwrite")

# Produce a list which maps cluster numbers to symbols to produce an xref database table
# (ClusterId,  Symbol) XRef Row List construction
cluster_id_symbol_xref_rows_list = []
for cluster_id, list_of_symbols in clusterGroupsDictionaryRdd.items():
    for symbol in list_of_symbols:
        print "cluster_id: " + str(cluster_id) + "\t\tsymbol: " + symbol
        xrefRow = Row(cluster_id=int(cluster_id), symbol=str(symbol))
        cluster_id_symbol_xref_rows_list.append(xrefRow)
Author: dunagan5887, Project: machine_learning, Lines of code: 32, Source: cluster_stocks_by_performance.py

Example 12: CheckValidScore

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
# removing all rows that do not contain a number in the score variable

# function to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating a RDD by filtering out invalid scores
df_survey_clean_rdd = df_survey_raw.rdd.filter(lambda row: CheckValidScore(row.hcahps_base_score))
df_care_clean_rdd = df_care_raw.rdd.filter(lambda row: CheckValidScore(row.score))

# creating dataframe from the RDD
df_survey_clean = sqlCtx.createDataFrame(df_survey_clean_rdd)
df_care_clean = sqlCtx.createDataFrame(df_care_clean_rdd)
print 'Number of rows in survey table after cleaning {0}'.format(df_survey_clean.count())
print 'Number of rows in effective_care table after cleaning {0}'.format(df_care_clean.count())


# converting the data types for score column
df_survey = df_survey_clean.selectExpr('provider_id','(cast(hcahps_base_score as int) + cast(hcahps_consistency_score as int)) as survey_score')

# converting the data types for score column
df_care = df_care_clean.selectExpr('provider_id','measure_id','cast(score as int) as score')

# get the maximum score per measure_id and name the column as 'max_score'
df_care_max_scores = df_care.groupBy('measure_id').max().collect()
df_care_max_scores = sc.broadcast(df_care_max_scores)
Author: meabhishekkumar, Project: w205-lab-exercises, Lines of code: 32, Source: hospitals_and_patients.py

Example 13: print

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
i = 0
for index, row in pd_df_reviews.iterrows():
    tot = 0
    totwords = 0
    print(row)
    if pd.notnull(row['_2']):
        cleantext = cleanText(row['_2'])
        totwords = len(cleantext)
        #return the 5 n-grams with the highest PMI
        unigrams = top5_words(cleantext)
        bigrams = top5_bigram_collocations(cleantext)
        trigrams = top5_trigram_collocations(cleantext)

    df.ix[i, 'BusinessID']= row['_1']
    df.ix[i, 'CleanText']= " ".join(cleantext).encode('utf-8')
    df.ix[i, 'TotWords']= totwords
    df.ix[i, 'unigrams']= unigrams
    df.ix[i, 'bigrams']= bigrams
    df.ix[i, 'trigrams']= trigrams
    i += 1

spark_df = sqlContext.createDataFrame(df, columns)

# Save it as a table
spark_df.registerTempTable("dfBusiness500000")

sqlContext.sql("drop table if exists result2_bybusiness")
sqlContext.sql("CREATE TABLE result2_bybusiness AS SELECT * FROM dfBusiness500000")

Author: SueYang, Project: Project, Lines of code: 30, Source: YelpParsing_by_business.py

Example 14: CheckValidScore

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
print 'Number of rows in the table {0}'.format(df_raw.count())

# removing all rows that do not contain a number in the score variable

# function to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating a RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))
# creating a dataframe from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print 'Number of rows in table after cleaning {0}'.format(df_clean.count())

# converting the data types for score column
df_clean = df_clean.selectExpr('provider_id','measure_id','cast(score as int) as score')
# creating dataframe for hospitals table
df_hospital = sqlCtx.table("hospital")

# joining hospitals table to hospital_state
df_clean = df_clean.join(df_hospital, df_clean.provider_id == df_hospital.provider_id,'left_outer').select(df_clean.provider_id, df_clean.measure_id, df_hospital.state, df_clean.score)




# get the maximum score per measure_id and name the column as 'max_score'
df_max_scores = df_clean.groupBy('measure_id').max().collect()
Author: meabhishekkumar, Project: w205-lab-exercises, Lines of code: 33, Source: best_states.py

Example 15: StringType

# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
encounters = parts.map(lambda p: (p[0], datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"), p[2], p[3].strip().replace(',','_')))

fields = [StructField("PATIENT_NUM", StringType(), True),
	StructField("START_DATE", DateType(), True),
	StructField("ENCOUNTER_NUM", StringType(), True),
	StructField("ICD9S", StringType(), True)]
schema_encounters = StructType(fields)

# fields = [StructField("PATIENT_NUM", StringType(), True),
# 	StructField("ENCOUNTER_NUM", StringType(), True),
# 	StructField("START_DATE", StringType(), True),
# 	Seq(StructField("ICD9S", ArrayType(StringType(), True), True) ]
# schema_encounters = StructType(fields)

# Apply the schema to the RDD.
schemaEncounters = sqlContext.createDataFrame(encounters, schema_encounters)
schemaEncounters.printSchema()
schemaEncounters.registerTempTable("encounters")

# order data by patient, start date, _then_ encounter

encounteres_ordered = sqlContext.sql("select PATIENT_NUM, START_DATE, ENCOUNTER_NUM, ICD9S from encounters order by PATIENT_NUM, START_DATE, ENCOUNTER_NUM")
encounteres_ordered.registerTempTable("encounteres_ordered")

#sqlContext.sql("select collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").show(5)

#sqlContext.sql("select PATIENT_NUM, collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").show(20, truncate=False)

rdd = sqlContext.sql("select collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").rdd

def splitter(p):
Author: jayurbain, Project: machine-learning, Lines of code: 33, Source: subsequence_mining.py


Note: The pyspark.sql.HiveContext.createDataFrame examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For distribution and use, please follow the license of the corresponding project; do not reproduce without permission.