This article collects typical usage examples of the Python method pyspark.sql.SQLContext.createDataFrame. If you are wondering what SQLContext.createDataFrame does, how to call it, or what real-world uses look like, the curated code samples below should help. You can also read more about the containing class, pyspark.sql.SQLContext.
The following shows 15 code examples of SQLContext.createDataFrame, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
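Before the examples, here is a minimal, self-contained sketch of the basic call they all build on; the application name, column names, and sample rows are illustrative placeholders rather than values taken from any of the examples:

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(conf=SparkConf().setAppName("createDataFrame-demo"))
sqlContext = SQLContext(sc)

# createDataFrame accepts a local list (or an RDD) of tuples/Rows plus either
# a list of column names or an explicit StructType schema.
rows = [("alice", 34), ("bob", 45)]
df = sqlContext.createDataFrame(rows, ["name", "age"])
df.show()

sc.stop()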
Example 1: main
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main(argv):
    Conf = (SparkConf().setAppName("recommendation"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)
    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
    rawDF = sqlContext.read.parquet(dirPath).persist(StorageLevel.MEMORY_AND_DISK_SER)
    # argv[1] is the dump of training data in hdfs
    # argv[2] is the user preferences
    # User hash lookup stored into Cassandra
    user_hash = rawDF.map(lambda (a, b, c): (a, hashFunction(a)))
    distinctUser = user_hash.distinct()
    userHashDF = sqlContext.createDataFrame(distinctUser, ["user", "hash"])
    userHashDF.write.format("org.apache.spark.sql.cassandra").options(table="userhash", keyspace=keyspace).save(mode="append")
    # Product hash lookup stored into Cassandra
    product_hash = rawDF.map(lambda (a, b, c): (b, hashFunction(b)))
    distinctProduct = product_hash.distinct()
    productHashDF = sqlContext.createDataFrame(distinctProduct, ["product", "hash"])
    productHashDF.write.format("org.apache.spark.sql.cassandra").options(table="producthash", keyspace=keyspace).save(mode="append")
    # Ratings for training
    # ALS requires a numeric hash of the string ids; hashFunction does that and the result
    # is stored as Rating objects for the algorithm to consume
    ratings = rawDF.map(lambda (a, b, c): Rating(hashFunction(a), hashFunction(b), float(c)))
    model = ALS.trainImplicit(ratings, 10, 10, alpha=0.01, seed=5)
    model.save(sc, "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model")
    sc.stop()
Example 2: _rdd_to_df
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def _rdd_to_df(rdd, schema):
    """convert rdd to dataframe using schema."""
    spark_context = rdd.context
    sql_context = SQLContext(spark_context)
    if schema is None:
        df = sql_context.createDataFrame(rdd)
    else:
        df = sql_context.createDataFrame(rdd, schema)
    return df
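A hypothetical call of the helper above, assuming an active SparkContext named sc; the rows and the StructType schema are made up for illustration:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

rdd = sc.parallelize([("alice", 34), ("bob", 45)])
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
df = _rdd_to_df(rdd, schema)  # pass schema=None to let Spark infer the column types
df.show()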
Example 3: split_data
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def split_data(rev2, sc):
    # Split train and test set.
    data = rev2.copy()
    train, test = train_test_split(data)
    X_test = test.copy()
    y_test = X_test.pop("rating")
    sql_context = SQLContext(sc)
    train_df = sql_context.createDataFrame(train).rdd
    X_test_df = sql_context.createDataFrame(X_test).rdd
    test_df = sql_context.createDataFrame(test).rdd
    return train_df, train, test, test_df, X_test_df, y_test
Example 4: main
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main():
    inputs = sys.argv[1]
    output = sys.argv[2]
    ntlk_path = sys.argv[3]
    conf = SparkConf().setAppName('TF-IDF Representation')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)
    '''sbaronia - get 3 fields from json file and filter those with empty review'''
    review = sqlContext.read.json(inputs).select('reviewText', 'overall', 'reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()
    '''sbaronia - get year and rating and zip them with index'''
    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()
    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()
    stop_words = stop_words_func(ntlk_path)
    '''sbaronia - rdd containing unique words from review'''
    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText, stop_words)).filter(lambda x: x[0] != 'null').cache()
    '''sbaronia - finding tf-idf and zipping it with index'''
    tfidf_rdd = rdd_zip(tf_idf_cal(clean_words_rdd).cache()).cache()
    tfidf_df = sqlContext.createDataFrame(tfidf_rdd, ['tfidf', 'index']).cache()
    '''sbaronia - making dataframe with only rating and tfidf'''
    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    tfyrrating_df = tfidf_df.join(year_rating_df, tfidf_df.index == year_rating_df.index, 'inner').drop(tfidf_df.index).cache()
    '''sbaronia - making training and testing rdd with <2014 and =2014 condition
    in a splitable format with :: '''
    train_rdd = tfyrrating_df.filter(tfyrrating_df.year < 2014) \
                             .select('rating', 'tfidf') \
                             .map(lambda line: (str(line.rating) + ' :: ' + str(line.tfidf))) \
                             .coalesce(1) \
                             .cache()
    test_rdd = tfyrrating_df.filter(tfyrrating_df.year == 2014) \
                            .select('rating', 'tfidf') \
                            .map(lambda line: (str(line.rating) + ' :: ' + str(line.tfidf))) \
                            .coalesce(1) \
                            .cache()
    '''sbaronia - save rdds to text'''
    train_rdd.saveAsTextFile(output + '/train-text')
    test_rdd.saveAsTextFile(output + '/test-text')
Example 5: main
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main():
    sc = SparkContext()
    sqlCtx = SQLContext(sc)
    config = configparser.ConfigParser()
    config.read('config.ini')
    # Path that contains all files for analysis
    path_analysis = config.get('DEFAULT', 'path_analysis')
    # Ligand database file
    ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
    # Path for the drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')
    # Adding Python source files
    sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "json_utils.py"))
    start_time = datetime.now()
    # **************** Loading the file that contains all scores
    score_file_name = os.path.join(path_analysis, get_file_name_sorted_energy())
    text_file = sc.textFile(score_file_name)
    # Splitting the score file by \t
    header = text_file.first()  # extract header
    rdd_vs_score_sorted_split = text_file.filter(lambda x: x != header)  # filter out header
    rdd_vs_score_sorted_split = rdd_vs_score_sorted_split.map(lambda line: line.split("\t"))
    rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row(energy=float(p[0]), pose=str(p[1]), ligand=get_ligand_from_receptor_ligand_model(p[1])))
    # Creating the Vina DataFrame based on the score file
    vina_table = sqlCtx.createDataFrame(rdd_vs_score_sorted)
    vina_table.registerTempTable("vina")
    # **************** Finish
    # **************** Loading the ligand database
    rdd_database = load_database(sc, ligand_database)
    # Creating DataFrame
    database_table = sqlCtx.createDataFrame(rdd_database)
    database_table.registerTempTable("database")
    # **************** Finish
    # Computing ligand efficiency
    ligand_efficiencyRDD = sqlCtx.sql("SELECT vina.pose, vina.energy as affinity, (vina.energy / database.heavyAtom) as lig_efficiency FROM database JOIN vina ON vina.ligand = database.ligand ORDER BY vina.energy")
    ligand_efficiencyRDD = ligand_efficiencyRDD.map(lambda p: (p.pose, p.affinity, p.lig_efficiency)).collect()
    # Saving the ligand efficiency file
    save_ligand_efficiency(path_analysis, ligand_efficiencyRDD)
    finish_time = datetime.now()
    save_ligand_efficiency_log(finish_time, start_time)
Example 6: RunRandomForest
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def RunRandomForest(tf, ctx):
    sqlContext = SQLContext(ctx)
    rdd = tf.map(parseForRandomForest)
    # The schema is a list of column names.
    schema = ['genre', 'track_id', 'features']
    # Apply the schema to the RDD.
    songDF = sqlContext.createDataFrame(rdd, schema)
    # Register the DataFrame as a table.
    songDF.registerTempTable("genclass")
    labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)
    trainingData, testData = songDF.randomSplit([0.8, 0.2])
    labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
    rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
    #rfc = SVMModel([.5, 10, 20], 5)
    #rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")
    pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)
    predictions.show()
    evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
    accuracy = evaluator.evaluate(predictions)
    print 'Accuracy of RandomForest = ', accuracy * 100
    print "Test Error = ", (1.0 - accuracy) * 100
Example 7: main
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main(n_part, hdfs_path):
    print "********************\n*"
    print "* Start main\n*"
    print "********************"
    conf = SparkConf().setAppName("Benchmark Spark SQL")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    rowsRDD = sc.textFile(hdfs_path).repartition(n_part).map(lambda x: recordToRows(x)).cache()
    df = sqlContext.createDataFrame(rowsRDD).cache()
    df.count()
    df.registerTempTable("msd_table")
    print "********************\n*"
    print "* Start queries\n*"
    print "********************"
    [ave_t1, std1, dt1, n1] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext)
    [ave_t2, std2, dt2, n2] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext, method=1)
    [ave_t3, std3, dt3, n3] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext)
    [ave_t4, std4, dt4, n4] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext, method=1)
    if n1 != n2:
        print "\t!!!!Error, counts disagree for the number of T.S. songs!"
    if n3 != n4:
        print "\t!!!!Error, counts disagree for the number of high paced songs!"
    print "********************\n*"
    print "* Results"
    print "\t".join(map(lambda x: str(x), [ave_t1, std1, dt1, ave_t2, std2, dt2, ave_t3, std3, dt3, ave_t4, std4, dt4]))
    print "********************"
Example 8: log_mapreducer
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def log_mapreducer(logfilename, pattern, filt="None"):
spcon=SparkContext()
if filt == "None":
input=open(logfilename,'r')
paralleldata=spcon.parallelize(input.readlines())
patternlines=paralleldata.filter(lambda patternline: pattern in patternline)
print "pattern lines",patternlines.collect()
matches=patternlines.map(mapFunction).reduceByKey(reduceFunction)
else:
input=spcon.textFile(logfilename)
matches=input.flatMap(lambda line:line.split()).filter(lambda line: filt in line).map(mapFunction).reduceByKey(reduceFunction)
matches_collected=matches.collect()
print "matches_collected:",matches_collected
if len(matches_collected) > 0:
sqlContext=SQLContext(spcon)
bytes_stream_schema=sqlContext.createDataFrame(matches_collected)
bytes_stream_schema.registerTempTable("USBWWAN_bytes_stream")
query_results=sqlContext.sql("SELECT * FROM USBWWAN_bytes_stream")
dict_query_results=dict(query_results.collect())
print "----------------------------------------------------------------------------------"
print "log_mapreducer(): pattern [",pattern,"] in [",logfilename,"] for filter [",filt,"]"
print "----------------------------------------------------------------------------------"
dict_matches=dict(matches_collected)
sorted_dict_matches = sorted(dict_matches.items(),key=operator.itemgetter(1), reverse=True)
print "pattern matching lines:",sorted_dict_matches
print "----------------------------------------------------------------------------------"
print "SparkSQL DataFrame query results:"
print "----------------------------------------------------------------------------------"
pprint.pprint(dict_query_results)
print "----------------------------------------------------------------------------------"
print "Cardinality of Stream Dataset:"
print "----------------------------------------------------------------------------------"
print len(dict_query_results)
spcon.stop()
return sorted_dict_matches
Example 9: mock_data
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def mock_data(self):
    """Mock data to imitate read from database."""
    sqlContext = SQLContext(self.sc)
    mock_data_rdd = self.sc.parallelize([("A", 1, 1), ("B", 1, 0), ("C", 0, 2), ("D", 2, 4), ("E", 3, 5)])
    schema = ["id", "x", "y"]
    mock_data_df = sqlContext.createDataFrame(mock_data_rdd, schema)
    return mock_data_df
Example 10: main
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main(sc):
    sql_context = SQLContext(sc)
    all_data = get_all_data()
    # Input data: Each row is a bag of words from a sentence or document.
    training_data = [(id_gen.next(), text.split(" ")) for text in all_data]
    documentdf = sql_context.createDataFrame(training_data, ["id", "text"])
    remover = StopWordsRemover(inputCol="text", outputCol="text_filtered")
    cleaned_document = remover.transform(documentdf)
    # Learn a mapping from words to Vectors.
    word2vec = Word2Vec(vectorSize=len(training_data),
                        inputCol="text_filtered",
                        outputCol="result")
    model = word2vec.fit(cleaned_document)
    matrix = column_similarities(model.transform(cleaned_document))
    # We use the size of the target data to keep only products of the target
    # data with the rest of the data, and to avoid products of the target data
    # with itself
    values = matrix.entries.filter(
        lambda x: x.j >= TARGET_DATA_SIZE and x.i < TARGET_DATA_SIZE).sortBy(
        keyfunc=lambda x: x.value, ascending=False).map(
        lambda x: x.j).distinct().take(100)
    training_data_index = dict(training_data)
    for position, item in enumerate(values):
        line = " ".join(training_data_index[int(item)])
        print('%d -> %s' % (position, line.encode('utf-8')))
Example 11: main
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main(dataFile, outputPath):
    conf = SparkConf().setAppName("S3 Example").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    raw_text = sc.textFile(dataFile).persist(StorageLevel.MEMORY_AND_DISK)
    csv_data = raw_text.map(lambda l: l.split(","))
    row_data = csv_data.map(lambda p: dataIO.dataStruc(p))
    interaction_df = sqlContext.createDataFrame(row_data)
    # features.save_hdfs_parquet(interaction_df, outputPath)
    dataIO.save_hdfs_parquet(interaction_df, outputPath)
    interaction_df.registerTempTable("interactions")
    tcp_interactions = sqlContext.sql("""
        SELECT duration, dst_bytes, protocol_type FROM interactions WHERE protocol_type = 'tcp' AND duration > 1000 AND dst_bytes = 0
        """)
    tcp_interactions.show()
    features.print_tcp_interactions(tcp_interactions)
    dataIO.print_from_dataio()
    features.print_from_feature()
    sc.stop()
Example 12: Spark_MapReduce_Parents
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def Spark_MapReduce_Parents(keyword, tokensofprevlevel, graphcache):
    #tokensofprevlevelkeyword=tokensofprevlevel
    #tokensofprevlevelkeyword.append(keyword)
    md5hashparents = hashlib.md5(keyword).hexdigest()
    #md5hashparents = keyword
    md5hashparents = md5hashparents + "$parents"
    picklef_keyword = open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt", "w")
    asfer_pickle_string_dump(keyword, picklef_keyword)
    picklef_keyword.close()
    cachevalue = graphcache.get(md5hashparents)
    if cachevalue:
        print "Spark_MapReduce_Parents(): hash = ", md5hashparents, "; returning from cache"
        return cachevalue
    else:
        #picklelock.acquire()
        spcon = SparkContext("local[2]", "Spark_MapReduce_Parents")
        #picklef_keyword=open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt","w")
        #asfer_pickle_string_dump(keyword,picklef_keyword)
        #picklef_keyword.close()
        paralleldata = spcon.parallelize(tokensofprevlevel).cache()
        #k=paralleldata.map(lambda keyword: mapFunction_Parents(keyword,tokensofprevlevel)).reduceByKey(reduceFunction_Parents)
        k = paralleldata.map(mapFunction_Parents).reduceByKey(reduceFunction_Parents)
        sqlContext = SQLContext(spcon)
        parents_schema = sqlContext.createDataFrame(k.collect())
        parents_schema.registerTempTable("Interview_RecursiveGlossOverlap_Parents")
        query_results = sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap_Parents")
        dict_query_results = dict(query_results.collect())
        #print "Spark_MapReduce_Parents() - SparkSQL DataFrame query results:"
        #picklelock.release()
        graphcache.set(md5hashparents, dict_query_results[1])
        spcon.stop()
        print "graphcache_mapreduce_parents updated:", graphcache
        return dict_query_results[1]
Author: shrinivaasanka, Project: asfer-github-code, Lines of code: 37, Source file: InterviewAlgorithmWithIntrinisicMerit_SparkMapReducer.py
Example 13: test_persistence
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up
    try:
        rmtree(path)
    except OSError:
        pass
Example 14: main
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main(sc):
    path = "events"
    #text_file = sc.textFile(path)
    sqlContext = SQLContext(sc)
    events = sqlContext.jsonFile(path)
    events = events.select(events["events.event"]).flatMap(lambda p: p.event)
    events = events.map(lambda p: Row(
        id=p.id,
        title=p.title,
        lat=p.latitude,
        long=p.longitude,
        postal_code=p.postal_code,
        start_time=datetime.strptime(p.start_time, "%Y-%m-%d %H:%M:%S"),
        stop_time=p.stop_time))
    events_df = sqlContext.createDataFrame(events)
    events_df.registerTempTable("events")
    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))
    e = sqlContext.sql("select title, str_date(start_time) as event_date, "
                       "to_hour(start_time) as hour, postal_code from events "
                       "where postal_code is not null and start_time is not null")
    # register the filtered result as a temp table so the grouped query below can read it
    e.registerTempTable("events_filtered")
    events_grouped = sqlContext.sql("select event_date, hour, postal_code, "
                                    "count(*) from events_filtered "
                                    "group by event_date,hour,postal_code order by postal_code,hour")
    grouped_csv = events_grouped.map(toCSV)
    grouped_csv.saveAsTextFile('events_cluster')
Example 15: main
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main():
    sc = SparkContext()
    sqlCtx = SQLContext(sc)
    config = configparser.ConfigParser()
    config.read('config.ini')
    # Path where the docking list file will be saved
    path_to_save = str(sys.argv[1])
    # Path for the drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')
    sc.addPyFile(os.path.join(path_spark_drugdesign, "database_crud.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "database_io.py"))
    # **************** Loading the ligand database
    ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
    rdd_database = load_database(sc, ligand_database)
    # Creating DataFrame
    database_table = sqlCtx.createDataFrame(rdd_database)
    database_table.registerTempTable("database")
    # **************** Finish
    # Creating input files for performing virtual screening
    creating_docking_list(path_to_save, config, sqlCtx)