This article collects typical usage examples of the Python method pyspark.SQLContext.createDataFrame. If you are wondering how SQLContext.createDataFrame is used in practice, how to call it, or what real-world examples look like, the curated code samples below may help. You can also explore further usage examples of the class this method belongs to, pyspark.SQLContext.
Below are 15 code examples of SQLContext.createDataFrame, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the site recommend better Python samples.
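Before the curated snippets, here is a minimal, self-contained sketch of the call. It assumes a PySpark 1.x installation (the SQLContext era shown throughout this page); the names used below are illustrative rather than taken from any of the examples.

from pyspark import SparkContext, SQLContext

sc = SparkContext(appName="createDataFrameDemo")
sql_context = SQLContext(sc)

# Build a DataFrame from a local list of tuples plus a list of column names.
people_df = sql_context.createDataFrame(
    [("alice", 34), ("bob", 45)],
    ["name", "age"])
people_df.show()

sc.stop()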
Example 1: hash_rating
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
def hash_rating(author_subreddit_rating_rdd, sc):
    sql_context = SQLContext(sc)
    author_sub_schema = StructType([
        StructField("author", StringType(), True),
        StructField("subreddit", StringType(), True),
        StructField("rating", LongType(), True)
    ])
    asr_df = sql_context.createDataFrame(author_subreddit_rating_rdd, author_sub_schema)
    author_rdd = author_subreddit_rating_rdd.map(lambda (a, s, r): a)
    aid_rdd = author_rdd.distinct().zipWithUniqueId().cache()
    author_id_schema = StructType([
        StructField("author", StringType(), True),
        StructField("author_id", LongType(), True)
    ])
    aid_df = sql_context.createDataFrame(aid_rdd, author_id_schema)
    aid_s_r_df = aid_df.join(asr_df, on='author').drop('author').cache()
    subreddit_rdd = author_subreddit_rating_rdd.map(lambda (a, s, r): s)
    sid_rdd = subreddit_rdd.distinct().zipWithUniqueId().cache()
    subreddit_id_schema = StructType([
        StructField("subreddit", StringType(), True),
        StructField("subreddit_id", LongType(), True)
    ])
    sid_df = sql_context.createDataFrame(sid_rdd, subreddit_id_schema)
    aid_sid_r_df = sid_df.join(aid_s_r_df, on='subreddit').drop('subreddit').cache()
    row_aid_sid_r_rdd = aid_sid_r_df.rdd
    aid_sid_r_rdd = row_aid_sid_r_rdd.map(lambda row: (row.author_id, row.subreddit_id, row.rating))
    return aid_rdd, sid_rdd, aid_sid_r_rdd
Example 2: _get_data
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
def _get_data(self):
    sql_context = SQLContext(self.sc)
    l = [
        (
            "I dont know why people think this is such a bad movie.",
            # sparse indices must lie in [0, size); {1, 2, 3} would be out of range for size 3
            Vectors.sparse(3, {0: 1.0, 1: 1.0, 2: 1.0})
        ),
    ]
    return sql_context.createDataFrame(l, ['text', 'features'])
Example 3: _get_train_data
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
def _get_train_data(self):
    sql_context = SQLContext(self.sc)
    l = [
        (1, Vectors.dense([1, 2, 3]), 1.0),
        (2, Vectors.dense([1, 2, 3]), 0.0),
        (3, Vectors.dense([1, 2, 3]), 1.0),
        (4, Vectors.dense([1, 2, 3]), 0.0),
    ]
    return sql_context.createDataFrame(l, ['id', 'features', 'label'])
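Examples 2 and 3 are fixtures from a test class that this page does not show. As a hedged, self-contained sketch of how such a frame is typically consumed by an estimator (the choice of LogisticRegression here is an assumption, not taken from the original test):

from pyspark import SparkContext, SQLContext
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.linalg import Vectors

sc = SparkContext(appName="fixture-demo")
sql_context = SQLContext(sc)

# Same shape as the _get_train_data fixture above.
train_df = sql_context.createDataFrame([
    (1, Vectors.dense([1, 2, 3]), 1.0),
    (2, Vectors.dense([1, 2, 3]), 0.0),
], ['id', 'features', 'label'])

lr = LogisticRegression(maxIter=5)  # assumed estimator, not from the page
model = lr.fit(train_df)
model.transform(train_df).select('id', 'prediction').show()

sc.stop()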
Example 4: print
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
mirror_dir = "data/mirror"
data_dir = "data/data-{0}".format(dataset_date)
out_dir = "data/bhl-{0}.parquet".format(dataset_date)

if os.path.isdir(out_dir):
    print("Output dir {0} exists".format(out_dir))
    exit()

get_ocr_udf = sql.udf(get_ocr, types.StringType())
fn = os.path.join(data_dir, "item.txt")

# Optional limit for testing; add it to the chain as the second step
# (see the sketch after this example):
# .sample(withReplacement=False, fraction=0.001) \
sqlContext.createDataFrame(t_gen(fn, type_data_item), schema_item()) \
    .withColumn("ocrtext", get_ocr_udf(sql.col("barcode"))) \
    .write.parquet(out_dir)

# Example run on Elk (16-thread single machine)
# real 84m21.818s
# user 198m57.612s
# sys  15m19.662s

# Example run on okapi (128-thread single machine)
# real 41m13.984s
# user 482m34.084s
# sys  278m12.404s
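As flagged in the comment above, the optional sampling limit can be spliced into the chain as the second step. A hedged sketch of what that looks like, reusing the same helper names (t_gen, type_data_item, schema_item, get_ocr_udf) that the snippet assumes but does not define here:

sqlContext.createDataFrame(t_gen(fn, type_data_item), schema_item()) \
    .sample(withReplacement=False, fraction=0.001) \
    .withColumn("ocrtext", get_ocr_udf(sql.col("barcode"))) \
    .write.parquet(out_dir)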
Example 5: main
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
def main():
    conf = SparkConf().setAppName("climate")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    climateSchema = StructType(
        [
            StructField("station", StringType(), False),
            StructField("date", IntegerType(), False),
            StructField("element", StringType(), False),
            StructField("value", IntegerType(), True),
            StructField("mflag", StringType(), True),
            StructField("qflag", StringType(), True),
            StructField("sflag", StringType(), True),
            StructField("obstime", StringType(), True),
        ]
    )
    info = sqlContext.read.format("com.databricks.spark.csv") \
        .options(header="false").schema(climateSchema).load(inputs)
    info.registerTempTable("info")
    stationinfo = sqlContext.sql("SELECT station, date, element, value, FLOOR(date/10000) as yy FROM info")
    stationinfo.registerTempTable("stationinfo")
    stationinfo.cache()
    prcpTable = sqlContext.sql("SELECT station, date, value as prcp, yy FROM stationinfo WHERE element='PRCP'")
    prcpTable.registerTempTable("prcpTable")
    prcpTable.cache()
    # prcpTable.show()

    # create a table holding the yearly average precipitation per station
    yearlyprcp = sqlContext.sql(
        "SELECT station, yy, ROUND(Avg(prcp),0) as avg_prcp FROM prcpTable GROUP BY station, yy"
    )
    yearlyprcp.registerTempTable("prcpMean")
    # yearlyprcp.show()

    # get information about stations from stations.txt
    def getdata(line):
        line = line.split(" ")
        values = [x.strip() for x in line]
        return values

    stations = sc.textFile(input2)
    stations = stations.map(getdata)
    stations = stations.map(lambda (a, b, c): Row(station=a, latitude=float(b), longitude=float(c))).cache()
    stationDF = sqlContext.createDataFrame(stations)
    stationDF.registerTempTable("StationTable")
    stationDF.cache()
    # param = sqlContext.sql("SELECT MAX(latitude) as max_lat, Min(latitude) as min_lat, MAX(longitude) as max_long, Min(longitude) as min_long FROM StationTable")
    # param.show()

    # Join to the station table to add latitude and longitude to each station's yearly average
    result = (
        stationDF.join(yearlyprcp)
        .where(stationDF.station == yearlyprcp.station)
        .select(yearlyprcp.avg_prcp, yearlyprcp.station, yearlyprcp.yy, stationDF.latitude, stationDF.longitude)
    )
    # save into a parquet file
    result.write.format("parquet").save(output)
Example 6: features_to_vec
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
def features_to_vec(length, entropy, alexa_grams, word_grams):
    high_entropy = 0.0
    high_length = 0.0
    if entropy > 3.5: high_entropy = 1.0
    if length > 30: high_length = 1.0
    return Vectors.dense(length, entropy, high_entropy, high_length, alexa_grams, word_grams)
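# For instance (worked out from the thresholds above, not taken from the source project):
#   features_to_vec(35.0, 3.9, 0.2, 0.0) -> DenseVector([35.0, 3.9, 1.0, 1.0, 0.2, 0.0])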
#dga_domains = sc.textFile("/user/cloudera/dga.txt")
#dga_domains = dga_domains.map(lambda x: (x, "dga", float(len(x)), entropy(x)))
#dga_domains_df = sqlctx.createDataFrame(dga_domains, schema).dropna().distinct().cache()

words = sc.textFile("/user/cloudera/words.txt")
words = words.map(lambda x: (x, "dict", float(len(x)), entropy(x)))
words_df = sqlctx.createDataFrame(words, schema).dropna().distinct().cache()

dga_domains = sc.textFile("/user/cloudera/c_domains_*")
dga_domains = dga_domains.map(lambda x: (x, "dga", float(len(x)), entropy(x)))
dga_domains_df = sqlctx.createDataFrame(dga_domains, schema).dropna().distinct().cache()

alexa_domains = sqlctx.read.format('com.databricks.spark.csv') \
    .options(header='false', inferschema='true').load('alexa_100k.csv') \
    .map(lambda x: (x[1], "legit", float(len(x[1])), entropy(x[1])))
alexa_domains_df = sqlctx.createDataFrame(alexa_domains, schema).dropna().distinct().cache()

alexa_domains_1M = sqlctx.read.format('com.databricks.spark.csv') \
    .options(header='false', inferschema='true').load('alexa_1M.csv') \
    .map(lambda x: (x[1], "legit", float(len(x[1])), entropy(x[1])))
alexa_domains_1M = sqlctx.createDataFrame(alexa_domains_1M, schema).distinct().cache()
Example 7: StructField
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
# uid,adid,guuid,createtime
fields = [
    StructField('uid', StringType(), True),
    StructField('adid', StringType(), True),
    StructField('guuid', StringType(), True),
    StructField('guuidctime', LongType(), True),
    StructField('url', StringType(), True),
    StructField('referer', StringType(), True),
    StructField('hosid', StringType(), True),
    StructField('gwid', StringType(), True),
    StructField('ua', StringType(), True),
    StructField('ip', StringType(), True),
    StructField('createtime', LongType(), True),
]
schema = StructType(fields)

# [(),()] ['','']
df_dest = sqlContext.createDataFrame(rdd, schema)
df_dest.registerTempTable("back_portal_loginlog")
# df_dest.rdd.foreach(my_print)

# save
df_dest.write.parquet(output)
sc.stop()
Example 8: str
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
nn_gridsearch.debug('-' * 40)
nn_gridsearch.debug('Execution time: %s' % str(datetime.now()))
# with open('~/.aws/credentials.json') as f:
#     CREDENTIALS = json.load(f)

sc = set_spark_context()
conn = S3Connection()
sqc = SQLContext(sc)
sm = SparkModel(sc, conn, rdd_path='rdd.pkl')

bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow)) \
    .sample(withReplacement=False, fraction=.5, seed=1)
df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw'])
train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1)

results = []
num_features = 5000
min_doc_freq = 20
layers = [[5000, 2056, 512, 128, 2], [5000, 1000, 128, 2], [5000, 100, 2], [5000, 5000, 2]]

for l in layers:
    remover = StopWordsRemover(inputCol="raw", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                          numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                outputCol="features", minDocFreq=min_doc_freq)
    indexer = StringIndexer(inputCol="string_label", outputCol="label")
Example 9: StructField
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
fields = [
    StructField('logintype', StringType(), True),
    StructField('logtype', StringType(), True),
    StructField('hosid', StringType(), True),
    StructField('suppid', StringType(), True),
    StructField('logtime', LongType(), True),
    StructField('usermac', StringType(), True)
]
schema = StructType(fields)

rdd1 = rdd.map(convert_logtype).filter(lambda tup: tup is not None)
# rdd1.foreach(printx)
# sc.stop()
ret_df = sqlContext.createDataFrame(rdd1, schema)
ret_df.registerTempTable("loginflowlog_overall")

_sql = "SELECT count(usermac) pv,count(distinct usermac) uv,logtype " \
       "from loginflowlog_overall " \
       "group by logtype"
rs_df = sqlContext.sql(_sql)

service = LoginflowlogMysqlService()
ret_overall_list = service.getRetOverall(rs_df.collect(), day)

_sql_delete = "delete from login_flow_global_count where date ='%s'" % day
_sql_insert = "insert into login_flow_global_count(date," \
              "prelogin_num,prelogin_pnum,login_num,login_pnum," \
              "login_click_num,login_click_pnum,forward_num,forward_pnum," \
              "preArrive_num,preArrive_pnum,arrive_num,arrive_pnum) " \
              "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
service.write_mysql(ret_overall_list, _sql_delete, _sql_insert)
Example 10: parsePoint
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
# Load and parse the data
# line format: (station, latitude, longitude,)
def parsePoint(line):
    return LabeledPoint(line[0], line[1:])

# read data from the station file
def getdata(line):
    line = line.split(' ')
    values = [x.strip() for x in line]
    return values

stations = sc.textFile(input)
stations = stations.map(getdata)
stations = stations.map(lambda (a, b, c): (float(hash(a)), int(year), float(b), float(c))).cache()
stationsDF = sqlContext.createDataFrame(stations)

# create the dataset to fit into the model
parseData = stations.map(parsePoint)

# load the model
sameModel = LinearRegressionModel.load(sc, myModelPath)

# run the model
stationidAndPreds = parseData.map(lambda p: (p.label, float(sameModel.predict(p.features))))
stationidAndPredsDF = sqlContext.createDataFrame(stationidAndPreds)

# the result holds a predicted value for each station (stationId) in the given year;
# join the stations DataFrame with stationidAndPreds to find the latitude and longitude of each station
result = stationsDF.join(stationidAndPredsDF) \
    .where(stationidAndPredsDF[0] == stationsDF[0]) \
    .select(stationidAndPredsDF[1], stationsDF[2], stationsDF[3])
Example 11: SparkContext
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
from pyspark import SparkContext, SQLContext
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PipelineExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Prepare training documents from a list of (id, text, label) tuples.
    training = sqlContext.createDataFrame([
        (0L, "a b c d e spark", 1.0),
        (1L, "b d", 0.0),
        (2L, "spark f g h", 1.0),
        (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled (id, text) tuples.
    test = sqlContext.createDataFrame([
Example 12: SparkConf
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
parser.add_argument('deaths')
parser.add_argument('output')
args = parser.parse_args()
conf = SparkConf().setAppName("correlate")
sc = SparkContext(conf=conf)
sql = SQLContext(sc)
births_raw = sql.read.load(args.births).rdd
deaths_raw = sql.read.load(args.deaths).rdd
births = births_raw.map(to_joinable_on_id)
deaths = deaths_raw.map(to_joinable_on_id)
both = births.fullOuterJoin(deaths)
unjoined_births = both.filter(get_unjoined_births)
unjoined_deaths = both.filter(get_unjoined_deaths)
correctly_joined = both.filter(remove_unjoined_all).map(to_joined_format)
# do a join with jaro-winkler
jaro_input_births = unjoined_births.map(to_jaro_matching_input)
jaro_input_deaths = unjoined_deaths.map(to_jaro_matching_input)
jaro_input_all = jaro_input_births.cartesian(jaro_input_deaths)
jaro_joined = jaro_input_all.filter(jaro_match).map(cart_to_joined_format)
to_save = sql.createDataFrame(correctly_joined)
to_save.write.save(args.output + '/joined', format="parquet")
to_save = sql.createDataFrame(jaro_joined)
to_save.write.save(args.output + '/jaro_joined', format="parquet")
Example 13: SparkContext
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
conf = (SparkConf()
        .setAppName("adhoscount")
        .set("spark.kryoserializer.buffer.mb", "256")
        .set("spark.sql.parquet.binaryAsString", "true")
        )
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

_adloadDF = sqlContext.read.parquet(adLoadFiles)
_adloadRdd = _adloadDF.rdd.map(lambda x: (x.guuid, x.hosid)).groupByKey().map(fetchOne)

fields = [
    StructField('guuid', StringType(), True),
    StructField('hosid', StringType(), True),
]
schema = StructType(fields)
schemaDest = sqlContext.createDataFrame(_adloadRdd, schema)
schemaDest.registerTempTable("ghid")

_adloadDF.registerAsTable("adload")
sqlContext.read.parquet(adPlayFiles).registerAsTable("adplay")
sqlContext.read.parquet(adClickFiles).registerAsTable("adclick")

'''
_adLoadDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823568766},
    {'uid': '2', 'adid': 'b', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823569766},
    {'uid': '3', 'adid': 'c', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823550766},
    {'uid': '4', 'adid': 'd', 'guuid': 'bb', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '133', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823268766},
]).registerAsTable("adload")
_adPlayDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'createtime': 1450823568766},
Example 14: main
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
def main(argv):
    # list of words to look for!
    GODWINS_WORDS = ['hitler', 'nazi']

    # setup inputs and outputs
    input_directory = argv[0]
    output_directory = argv[1]

    # spark specific setup
    conf = SparkConf().setAppName('godwin whaaa')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # read input
    text = sc.textFile(input_directory)
    text = text.repartition(200)

    # convert to magic json formatting
    loadedJson = text.map(lambda line: json.loads(line))

    # make the json skinnier by removing unwanted stuff
    fullRedditJson = loadedJson.map(lambda jObj: (jObj['body'], jObj['name'], jObj['parent_id'])).cache()

    # code from greg for regex to parse lines
    linere = re.compile(regex_from_words(GODWINS_WORDS))

    # now filter out stuff without GODWINS_WORDS: "body", "id", "subreddit", "parent_id"
    godwinJsonList = fullRedditJson.filter(lambda (body, name, parent_id): linere.match(body.lower()))

    # We don't need the comment body anymore...
    # We need to find the paths now...
    godwin_node_rdd = godwinJsonList.map(row_into_node).cache()
    full_node_rdd = fullRedditJson.map(row_into_node)

    # we also need a list of node names so we can later check if we already visited it.
    godwinNodes = godwin_node_rdd.map(lambda (name, parent_id): name)

    # Convert full data RDD into SQL Data Frame
    subredditSchema = StructType([
        StructField("name", StringType(), True),
        StructField("parent_id", StringType(), True)
    ])
    full_node_df = sqlContext.createDataFrame(full_node_rdd, subredditSchema)

    # Convert godwin rows RDD into SQL Data Frame
    godwinSchema = StructType([
        StructField("g_name", StringType(), True),
        StructField("g_parent_id", StringType(), True)
    ])
    godwin_node_df = sqlContext.createDataFrame(godwin_node_rdd, godwinSchema).cache()

    count_down = godwin_node_df.count()
    print 'There are', count_down, 'comments with a godwins word'

    depth = 0
    nodes_per_depth = {}
    visited_node_list_df = godwin_node_df.select(godwin_node_df.g_name)
    print 'visited_node_list_df'
    print str(visited_node_list_df.count())

    while count_down > 0 and depth < 100:
        depth += 1

        # Join to find the next layer of nodes
        joined_df = godwin_node_df.join(full_node_df,
                                        [godwin_node_df['g_parent_id'] == full_node_df['name']])

        # Drop the columns of the older node
        next_node_df = joined_df.select(
            joined_df['name'].alias('g_name'),
            joined_df['parent_id'].alias('g_parent_id')).cache()
        print 'next_node_df count: ' + str(next_node_df.count())

        # Select only the ones that have NOT been visited
        # TODO: is there a better way? (see the sketch after this example)
        leftt = next_node_df.join(visited_node_list_df, next_node_df.g_name == visited_node_list_df.g_name, 'left')
        next_node_df = leftt.select(next_node_df.g_name, next_node_df.g_parent_id, visited_node_list_df.g_name.alias('dup'))
        next_node_df = next_node_df.fillna({'dup': 'xxxxxx'})
        next_node_df = next_node_df.filter(next_node_df.dup == 'xxxxxx')
        next_node_df = next_node_df.drop(next_node_df.dup)

        # add the g_name to the list of visited nodes
        # TODO: make more efficient!
        visited_df = next_node_df.select(next_node_df.g_name)
        visited_node_list_df = visited_node_list_df.unionAll(visited_df)
        visited_node_list_df = visited_node_list_df.dropDuplicates()

        count_up = next_node_df.count()
        n_nodes = count_down - count_up
        print 'number of godwin nodes of height', depth, '=', n_nodes
        nodes_per_depth[depth] = n_nodes
        count_down = count_up
        godwin_node_df = next_node_df

    avg = compute_average_godwin(nodes_per_depth)
    print 'The average distance to the godwin words is', avg
    fp = open(output_directory + 'average.txt', 'w')
    fp.write(str(avg) + '\n')
# ......... part of the code is omitted here .........
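About the first TODO above ("is there a better way?"): the fillna('xxxxxx') sentinel emulates an anti-join. A hedged alternative that keeps the same column names, using a left outer join plus an isNull filter (still SQLContext-era API; this is a sketch, not code from the original project):

from pyspark.sql.functions import col

# Keep only next-layer nodes whose g_name has not been visited yet.
visited_renamed = visited_node_list_df.withColumnRenamed('g_name', 'visited_name')
next_node_df = (next_node_df
                .join(visited_renamed, next_node_df.g_name == visited_renamed.visited_name, 'left_outer')
                .filter(col('visited_name').isNull())
                .drop('visited_name'))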
Example 15: StructField
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import createDataFrame [as alias]
# (u'2015-48_6C25B958F2CC_175', u'2015120120')
# rdd1.foreach(my_print)

# (u'2015-50_7014A62FA5B0_0', [u'22', u'23'])
rdd1_2 = rdd1_1.groupByKey().mapValues(list).sortByKey().map(times_count_first)
# (u'2015-48_903C920CAE97_655', [u'15_1'])
# rdd1_2.foreach(my_print)

rdd2_1 = df.rdd.map(convert_kv_last)
rdd2_2 = rdd2_1.groupByKey().mapValues(list).sortByKey().map(times_count_last)

rdd3 = rdd1_2.join(rdd2_2).map(convert_rets).values().flatMap(list)
# (u'2015', u'48', u'A09347EC9FBB', u'189', u'13', u'1', u'14', u'1')
rdd3.foreach(my_print)
logger.info(rdd3.count())

fields = [
    StructField('year', StringType(), True),
    StructField('week', StringType(), True),
    StructField('mac', StringType(), True),
    StructField('hosid', StringType(), True),
    StructField('firstTime', StringType(), True),
    StructField('firstCount', LongType(), True),
    StructField('lastTime', StringType(), True),
    StructField('lastCount', LongType(), True)
]
schema = StructType(fields)
df1 = sqlContext.createDataFrame(rdd3, schema)
df1.coalesce(2).write.parquet(output, 'overwrite')
sc.stop()