This article collects typical usage examples of the Python method pyspark.sql.HiveContext.createDataFrame. If you are asking yourself how HiveContext.createDataFrame is used in practice, or what it is good for, the curated code examples below may help. You can also explore further usage examples of the class this method belongs to, pyspark.sql.HiveContext.
The following presents 15 code examples of HiveContext.createDataFrame, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
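Before the collected examples, here is a minimal sketch of the basic call pattern. The SparkContext name sc and the sample Row data are assumptions used purely for illustration and do not come from any of the examples below.
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row
sc = SparkContext()  # assumes a working local or cluster Spark setup with Hive support
hc = HiveContext(sc)
# build a DataFrame from an RDD of Row objects; the schema is inferred from the Rows
rows = sc.parallelize([Row(name="a", value=1), Row(name="b", value=2)])
df = hc.createDataFrame(rows)
df.printSchema()
df.registerTempTable("sample")  # expose the DataFrame to Hive SQL queries
hc.sql("select name, value from sample").show()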
Example 1: gen_report_table
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
def gen_report_table(hc,curUnixDay):
rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6])))
HiveContext.createDataFrame(hc,rows_indoor).registerTempTable("df_indoor")
#ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY
sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30," # 2505600 is 29 days
sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7," #518400 is 6 days
sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
sql=sql+"from df_indoor order by entityid,clientmac,utoday"
df_id_stat=hc.sql(sql)
df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
# df_id_mm is the min/max dataframe, used to calculate first arrival and last arrival
df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
# the distinct dataframe is needed for the lag function to work
df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY]
df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon)
df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth")
#newly added part for indoors7 and indoors30 based on current date
df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0))
df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0))
df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac")))
df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac")))
df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
hc.sql("drop table if exists df_indoor_fin")
df_indoor_fin.write.saveAsTable("df_indoor_fin")
rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5])))
HiveContext.createDataFrame(hc,rows_flow).registerTempTable("df_flow")
# ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
sql="select entityid,clientmac,utoday,UFIRSTDAY,"
sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30," # 2505600 is 29 days
sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7," #518400 is 6 days
sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
sql=sql+"from df_flow order by entityid,clientmac,utoday"
df_fl_stat=hc.sql(sql)
df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
# df_fl_mm is the min/max dataframe, used to calculate first arrival and last arrival
df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
# the distinct dataframe is needed for the lag function to work
df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY]
df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon)
df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth")
hc.sql("drop table if exists df_flow_fin")
df_flow_fin.write.saveAsTable("df_flow_fin")
Example 2: SparkEngine
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
class SparkEngine(object):
def __init__(self, sc, debug=False):
self.export_path = os.environ['COOPERHEWITT_ROOT'] + "/export/"
self.sc = sc
# hive requires writable permissions: ~/ephemeral-hdfs/bin/hadoop fs -chmod 777 /tmp/hive
self.hive_cxt = HiveContext(sc)
self.sql_cxt = SQLContext(sc)
if debug:
print "{0}\n{1}\n{2}\n".format(sc.master, self.hive_cxt, self.sql_cxt)
print sc._conf.getAll()
# TBD: add a destructor to unpersist cached tables and free memory
### functionality to query and create tables
def _create_df_table(self, schema, frame, name):
if schema: df = self.hive_cxt.createDataFrame(frame, schema=schema)
else: df = self.hive_cxt.createDataFrame(frame)
df.printSchema()
df.registerTempTable(name)
self.hive_cxt.cacheTable(name)
return df
def _query_temporal_data(self):
# step 1. create main temporal table
# n_obs => the first join yields, for each pen entry, one row per existing location entry (dependent on the time period)
samples_temporal_tb = self.hive_cxt.sql("""
SELECT s.refers_to_object_id, created, visit_raw,
room_floor, room_id, room_name,
spot_id, spot_name, spot_description,
room_count_objects, room_count_spots, spot_count_objects,
abs(datediff(
from_utc_timestamp(from_unixtime(created, "yyyy-MM-dd"), 'US/Eastern'),
from_utc_timestamp(from_unixtime(visit_raw, "yyyy-MM-dd"), 'US/Eastern')
)) as delta
FROM samples s
JOIN temporal t
ON s.refers_to_object_id = t.refers_to_object_id
ORDER by s.refers_to_object_id, created, delta
""")
samples_temporal_tb.registerTempTable('samplestemporal')
self.hive_cxt.cacheTable('samplestemporal')
return samples_temporal_tb
def _minimize_query(self):
# From the temporal table, we need to reduce the multiple location rows down to the one matching the appropriate sample timestamp
tb_samples = self.hive_cxt.sql("""
SELECT *
FROM (
SELECT *,
MIN(delta) OVER ( PARTITION BY refers_to_object_id, created) AS min_delta,
row_number() OVER ( PARTITION BY refers_to_object_id, created) AS ranks
FROM samplestemporal st
ORDER BY refers_to_object_id
) query
where query.ranks = 1
""")
tb_samples = tb_samples.withColumn("meta_store", lit(1))
tb_samples.registerTempTable('minimizedsamples')
self.hive_cxt.cacheTable('minimizedsamples')
return tb_samples
def execute_query(self, (samples_schema, samples_frame, samples_name),
(temporal_schema, temporal_frame, temporal_name),
cols):
self.df_samples = self._create_df_table(samples_schema, samples_frame, samples_name)
self.df_temporal = self._create_df_table(temporal_schema, temporal_frame, temporal_name)
self.tb_meta = self._query_temporal_data()
self.tb_meta_min = self._minimize_query()
# combine with the original pen data (meta_store indicates whether we had object data to integrate)
self.df_samplesmeta = self.df_samples.join(self.tb_meta_min, ['refers_to_object_id', 'created'], "left_outer")
self.df_samplesmeta = self.df_samplesmeta.fillna({'meta_store': 0})
self.df_samplesmeta.printSchema()
# pickle the pandas dataframe; alternatively we could store it as JSON or in the Parquet columnar format
dropped_cols = ['delta', 'min_delta', 'ranks'] + cols
samplesmeta_pd = self.df_samplesmeta.toPandas()
samplesmeta_pd = samplesmeta_pd.drop(dropped_cols, axis=1)
samplesmeta_pd.to_pickle(self.export_path + "penmeta_spark.pkl")
Example 3: float
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
    return None
else:
    info = measureInfo[measureid]
    minScore = info[0]
    scoreRange = info[1]
    return float((score - minScore)/scoreRange)
total_quality_normal = total_quality.map(lambda r: (r[0], r[1], r[2], normalizeScore(r[3], r[2])))
schema = StructType([
StructField("providerid", StringType(), True),
StructField("state", StringType(), True),
StructField("measureid", StringType(), True),
StructField("normalizedscore", FloatType(), True)])
df_total_quality = sqlContext.createDataFrame(total_quality_normal, schema)
saveAsHiveTable(df_total_quality, "total_quality")
# Some hospitals have too few non-NA measures. To have a fair ranking, we want to set a minimum bar
# on the # of non-NA measures a hospital must have to participate in our evaluation.
# For each hospital, find out the # of non-NA measures it has
nonNAMeasureCount = dict(df_total_quality.map(lambda r: (r.providerid, r.normalizedscore)).
combineByKey( # Use combineByKey to count the # of non-NA Measure
lambda value: 0 if value is None else 1,
lambda x, value: x if value is None else x + 1,
lambda x, y: x + y).collect())
# Find the 25th percentile of the non-NA measure counts; this will be the minimum bar on the # of non-NA measures.
minMeasureCount = np.percentile(nonNAMeasureCount.values(), 25.)
Example 4: if
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
if progTag==cnt:
    folderLists=x
if progTag==cnt:
    folderLists=x
cnt=cnt+1
for x in folderLists:
    entityid=int(get_str_entityID_byFolder(x))
    df_id=df_mysql.filter(df_mysql.ID==entityid)
    inThresh =10 if (df_id.head(1)[0].IndoorSecondsThrehold==0) else df_id.head(1)[0].IndoorSecondsThrehold
    outThresh =300 if (df_id.head(1)[0].LeaveMinutesThrehold ==0) else df_id.head(1)[0].LeaveMinutesThrehold*60
    incFiles=fs.listStatus(sc._jvm.Path(x))
    for incFile in incFiles:
        if incFile.isFile():
            curTime=str(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time())))
            currrentFile=str(incFile.getPath())
            processData(sc,hc,fs,con,currrentFile,inThresh,outThresh,progTag)
            pd_rowLog = pandas.Series([operType,curTime,currrentFile],index=pd_column)
            pd_dataFrameLog=pd_dataFrameLog.append(pd_rowLog, ignore_index=True)
curDay=str(time.strftime("%Y-%m-%d",time.localtime(time.time())))
df_log=hc.createDataFrame(pd_dataFrameLog)
df_log.sort(df_log.operType,df_log.processDate).repartition(1).write.mode('overwrite').json("/data/log/incData"+str(progTag)+"/"+curDay)
sc.stop()
Example 5: SparkContext
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
from pyspark import SparkContext
sc = SparkContext()
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)
tbl_Readmissions = sqlContext.sql("Select Provider_ID, Measure_ID, Compared_to_National, Denominator, Score, Lower_Estimate, Higher_Estimate From tbl_Readmissions_RAW where Score <> 'Not Available' and Measure_ID = 'READM_30_HOSP_WIDE'").rdd
tbl_Readmissions.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_readmissions/")
sqlContext.createDataFrame(tbl_Readmissions).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_readmissions/")
exit()
Example 6: SparkContext
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
from pyspark import SparkContext
sc = SparkContext()
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)
tbl_Effective_Care = sqlContext.sql("Select Provider_ID, Measure_ID, Score From tbl_Effective_Care_RAW where Score <> 'Not Available'").rdd
tbl_Effective_Care.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_effective_care/")
sqlContext.createDataFrame(tbl_Effective_Care).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_effective_care/")
exit()
Example 7: main
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
def main():
args = parser_arguments()
start_date = args.start_date[0]
end_date = args.end_date[0]
sc = SparkContext()
hc = HiveContext(sc)
select = """
SELECT
*
FROM
cluster_metrics_prod_2.container_fact
where
date between '{0}' and '{1}'
""".format(start_date, end_date)
df = hc.sql(select)
header = {
"jobid" : "string",
"containerid" : "string",
"start" : "bigint",
"stop" : "bigint",
"duration" : "bigint",
"event" : "string",
"size" : "double",
"priority" : "int",
"hostname" : "string",
"system" : "string",
"date" : "string"
}
all_rows = df.flatMap(split_data)
schema_split_containers = hc.createDataFrame(all_rows)
schema_split_containers.registerTempTable("split_containers")
create_string = """
create table if not exists cluster_metrics_prod_2.container_fact_event_flattened
(
jobid string,
containerid string,
start bigint,
stop bigint,
duration bigint,
event string,
size double,
priority int,
hostname string
)
partitioned by
(
system string,
date string
)
stored as orc
"""
set_dyn = "set hive.exec.dynamic.partition=true"
set_nstat = "set hive.exec.dynamic.partition.mode=nonstrict"
load_string = """
insert overwrite table
cluster_metrics_prod_2.container_fact_event_flattened
partition
(system, date)
select
jobid,
containerid,
start,
stop,
duration,
event,
size,
priority,
hostname,
system,
date
from
split_containers
"""
print("Setting dynamic partition...")
hc.sql(set_dyn)
hc.sql(set_nstat)
print("Creating Table...")
hc.sql(create_string)
print("Loading data into table...")
hc.sql(load_string)
print("DONE")
Example 8: alternateNames
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
    predictPlayers.append(pp)
else:
    #print "p not found in mlb?", p
    newname = alternateNames(p.upper(), pids)
    if newname is not None:
        #print "got it: ", newname
        pp['lookup_name'] = newname
        pp['player_id'] = encodedPlayerIds[str(pids[newname])]
        predictPlayers.append(pp)
    else:
        print "REALLY NOT FOUND.", p.upper()
print "predictHitters=", predictHitters
phRDD = sc.parallelize(predictHitters)
phDF = sqlContext.createDataFrame(phRDD, samplingRatio=0.5)
phDF.registerTempTable("fd_hitters")
print "phDF=", phDF.take(2)
print "predictPitchers=", predictPitchers
ppRDD = sc.parallelize(predictPitchers)
ppDF = sqlContext.createDataFrame(ppRDD, samplingRatio=0.5)
ppDF.registerTempTable("fd_pitchers")
print "ppDF=", ppDF.take(22)
encodedHitterFeatures = sqlContext.parquetFile(rddDir + "/batting_features.enc.parquet")
encodedHitterFeatures.registerTempTable("bfe")
hfDF = sqlContext.sql("""select bfe.* from fd_hitters, bfe where
fd_hitters.player_id = bfe.player_id
and fd_hitters.game_date = bfe.game_date""")
Example 9: cleans
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
datalines = datalines.map(lambda x: cleans(x))
#1 imports
from pyspark.sql.types import *
#2 create metadata
fields = [StructField(field_name,StringType(),True) for field_name in firstline]
schema = StructType(fields)
#3 create a dataframe
schemaLoans = sqlContext.createDataFrame(datalines, schema)
#4 register it as a table called loans
schemaLoans.registerTempTable("loans")
#1 drop table, summarize and store in hive
sqlContext.sql("drop table if exists LoansByTitle")
sql = '''create table LoansByTitle stored as parquet as select title, count(1) as number from loans group by title order by number desc'''
sqlContext.sql(sql)
sqlContext.sql('drop table if exists raw')
sql = '''create table raw stored as parquet as select title, emp_title,grade,home_ownership,int_rate,recoveries,collection_recovery_fee,loan_amnt,term from loans'''
Example 10: HiveContext
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
logSet = sc.textFile("hdfs://hdfs.hadoop.yourdomain.com:9000/hdfs/aatlogstore/YYYY/MM/*").map(loadData)
aatlog = logSet.map(getPairKey)
#make vidsession_log & vidsession data
aatlogSet = aatlog.reduceByKey(lambda x, y: x)
vidlogSet = aatlogSet.sortByKey().zipWithIndex().map(lambda x: ((x[0][0][0], x[1]), x[0][1]))
vidlogSet.persist()
vidlogLag = vidlogSet.map(lambda x:((x[0][0], x[0][1]+1), x[1]))
vidLogRST = vidlogSet.leftOuterJoin(vidlogLag).map(mkVidLog)
vidSet = vidlogSet.map(lambda x: (x[0][0], (x[0][1], x[1])))
vidProcSet = vidSet.combineByKey(vidCreate, vidMerge, vidMerge2)
print "filter processing done "
#save RDD to HIVE Tables
hx = HiveContext(sc)
vidTable = hx.createDataFrame(vidProcSet.map(lambda x:x[1]))
vidLogTable = hx.createDataFrame(vidLogRST, samplingRatio=0.5)
print "Dataframe Done"
vidLogTable.saveAsTable('vidsession_log', mode='append')
vidTable.saveAsTable('vidsession', mode='append')
print "AATLOG Process Done"
except Exception, e:
    print "MAIN Error %s" % e
Example 11: map
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
cluster_center_overall_deltas = map(get_overall_delta_percentage_closure, centers)
# Convert the list to a list of tuples mapping cluster number to the performance percentage
converted_center_delta_list = DunaganListUtility.convert_list_to_value_and_index_tuple_list(cluster_center_overall_deltas)
# Sort the list of tuples by performance (in ascending order)
converted_center_delta_list.sort(lambda tuple_1, tuple_2: cmp(tuple_1[1], tuple_2[1]))
# Convert the list mapping cluster_number to performance percentage into a list of Row() object for insertion into a database table
# (ClusterId, Delta-Percentage) Row list construction
converted_center_delta_list_rows = map(lambda delta_tuple: Row(cluster_id=int(delta_tuple[0]), delta_percentage=float(delta_tuple[1])), converted_center_delta_list)
print "\n\n\n\nAbout to sqlContext.createDataFrame(converted_center_delta_list_rows)\n\n\n\n"
# Create a data frame from the list of Rows
schemaCenterDeltas = sqlContext.createDataFrame(converted_center_delta_list_rows)
print "\n\n\n\nAbout to schemaCenterDeltas.write.jdbc(url=mysql_url, table=cluster_total_delta_percentages)\n\n\n\n"
# Write the data frame to the database in table cluster_total_delta_percentages
schemaCenterDeltas.write.jdbc(url=mysql_url, table='cluster_total_delta_percentages', mode="overwrite")
# Produce a list which maps cluster numbers to symbols to produce an xref database table
# (ClusterId, Symbol) XRef Row List construction
cluster_id_symbol_xref_rows_list = []
for cluster_id, list_of_symbols in clusterGroupsDictionaryRdd.items():
for symbol in list_of_symbols:
print "cluster_id: " + str(cluster_id) + "\t\tsymbol: " + symbol
xrefRow = Row(cluster_id=int(cluster_id), symbol=str(symbol))
cluster_id_symbol_xref_rows_list.append(xrefRow)
Example 12: CheckValidScore
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
# removing all rows whose score variable is not numeric
# function to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
# creating an RDD by filtering out invalid scores
df_survey_clean_rdd = df_survey_raw.rdd.filter(lambda row: CheckValidScore(row.hcahps_base_score))
df_care_clean_rdd = df_care_raw.rdd.filter(lambda row: CheckValidScore(row.score))
# creating dataframe from the RDD
df_survey_clean = sqlCtx.createDataFrame(df_survey_clean_rdd)
df_care_clean = sqlCtx.createDataFrame(df_care_clean_rdd)
print 'Number of rows in survey table after cleaning {0}'.format(df_survey_clean.count())
print 'Number of rows in effective_care table after cleaning {0}'.format(df_care_clean.count())
# converting the data types for score column
df_survey = df_survey_clean.selectExpr('provider_id','(cast(hcahps_base_score as int) + cast(hcahps_consistency_score as int)) as survey_score')
# converting the data types for score column
df_care = df_care_clean.selectExpr('provider_id','measure_id','cast(score as int) as score')
# get the maximum score per measure_id and name the column as 'max_score'
df_care_max_scores = df_care.groupBy('measure_id').max().collect()
df_care_max_scores = sc.broadcast(df_care_max_scores)
Example 13: print
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
i = 0
for index, row in pd_df_reviews.iterrows():
    tot = 0
    totwords = 0
    print(row)
    if pd.notnull(row['_2']):
        cleantext = cleanText(row['_2'])
        totwords = len(cleantext)
        # return the 5 n-grams with the highest PMI
        unigrams = top5_words(cleantext)
        bigrams = top5_bigram_collocations(cleantext)
        trigrams = top5_trigram_collocations(cleantext)
        df.ix[i, 'BusinessID']= row['_1']
        df.ix[i, 'CleanText']= " ".join(cleantext).encode('utf-8')
        df.ix[i, 'TotWords']= totwords
        df.ix[i, 'unigrams']= unigrams
        df.ix[i, 'bigrams']= bigrams
        df.ix[i, 'trigrams']= trigrams
    i += 1
spark_df = sqlContext.createDataFrame(df, columns)
# Save it as a table
spark_df.registerTempTable("dfBusiness500000")
sqlContext.sql("drop table if exists result2_bybusiness")
sqlContext.sql("CREATE TABLE result2_bybusiness AS SELECT * FROM dfBusiness500000")
Example 14: CheckValidScore
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
print 'Number of rows in the table {0}'.format(df_raw.count())
# removing all rows whose score variable is not numeric
# function to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
# creating an RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))
# creating dataframe from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print 'Number of rows in table after cleaning {0}'.format(df_clean.count())
# converting the data types for score column
df_clean = df_clean.selectExpr('provider_id','measure_id','cast(score as int) as score')
# creating dataframe for hospitals table
df_hospital = sqlCtx.table("hospital")
# joining hospitals table to hospital_state
df_clean = df_clean.join(df_hospital, df_clean.provider_id == df_hospital.provider_id,'left_outer').select(df_clean.provider_id, df_clean.measure_id, df_hospital.state, df_clean.score)
# get the maximum score per measure_id and name the column as 'max_score'
df_max_scores = df_clean.groupBy('measure_id').max().collect()
Example 15: StringType
# Required module import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import createDataFrame [as alias]
encounters = parts.map(lambda p: (p[0], datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"), p[2], p[3].strip().replace(',','_')))
fields = [StructField("PATIENT_NUM", StringType(), True),
StructField("START_DATE", DateType(), True),
StructField("ENCOUNTER_NUM", StringType(), True),
StructField("ICD9S", StringType(), True)]
schema_encounters = StructType(fields)
# fields = [StructField("PATIENT_NUM", StringType(), True),
# StructField("ENCOUNTER_NUM", StringType(), True),
# StructField("START_DATE", StringType(), True),
# Seq(StructField("ICD9S", ArrayType(StringType(), True), True) ]
# schema_encounters = StructType(fields)
# Apply the schema to the RDD.
schemaEncounters = sqlContext.createDataFrame(encounters, schema_encounters)
schemaEncounters.printSchema()
schemaEncounters.registerTempTable("encounters")
# order data by patient, start date, _then_ encounter
encounteres_ordered = sqlContext.sql("select PATIENT_NUM, START_DATE, ENCOUNTER_NUM, ICD9S from encounters order by PATIENT_NUM, START_DATE, ENCOUNTER_NUM")
encounteres_ordered.registerTempTable("encounteres_ordered")
#sqlContext.sql("select collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").show(5)
#sqlContext.sql("select PATIENT_NUM, collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").show(20, truncate=False)
rdd = sqlContext.sql("select collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").rdd
def splitter(p):