

Python SQLContext.createDataFrame Method Code Examples

This article compiles typical usage examples of the pyspark.sql.SQLContext.createDataFrame method in Python. If you are unsure what SQLContext.createDataFrame does, how to use it, or need working examples, the curated code samples below should help. You can also browse further usage examples of pyspark.sql.SQLContext, the class this method belongs to.


The following presents 15 code examples of the SQLContext.createDataFrame method, ordered by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
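Before the collected examples, here is a minimal, self-contained sketch of the two most common calling patterns of createDataFrame against the Spark 1.x SQLContext API. The column names and sample data are illustrative assumptions, not taken from any of the projects below: the method accepts either a local list of tuples plus a list of column names, or an RDD plus an explicit StructType schema, and returns a DataFrame.

# Minimal sketch of SQLContext.createDataFrame (Spark 1.x API); data and names are illustrative.
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

sc = SparkContext(appName="createDataFrame-sketch")
sqlContext = SQLContext(sc)

# 1) From a local list of tuples, supplying only column names (types are inferred).
people = [("alice", 34), ("bob", 45)]
df_inferred = sqlContext.createDataFrame(people, ["name", "age"])

# 2) From an RDD with an explicit StructType schema.
rdd = sc.parallelize(people)
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
df_explicit = sqlContext.createDataFrame(rdd, schema)

df_explicit.show()
sc.stop()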

Example 1: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main(argv):

    Conf = (SparkConf().setAppName("recommendation"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)

    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
    rawDF = sqlContext.read.parquet(dirPath).persist(StorageLevel.MEMORY_AND_DISK_SER)
    # argv[1] is the dump of training data in hdfs
    # argv[2] is the user preferences

    # User Hash Lookup stored into cassandra
    user_hash = rawDF.map(lambda (a,b,c): (a,hashFunction(a)))
    distinctUser = user_hash.distinct()
    userHashDF = sqlContext.createDataFrame(distinctUser,["user","hash"])
    userHashDF.write.format("org.apache.spark.sql.cassandra").options(table ="userhash", keyspace =  keyspace).save(mode="append")
    

    # Product Hash Lookup stored into cassandra
    product_hash = rawDF.map(lambda (a,b,c): (b, hashFunction(b)))
    distinctProduct = product_hash.distinct()
    productHashDF = sqlContext.createDataFrame(distinctProduct,["product","hash"])
    productHashDF.write.format("org.apache.spark.sql.cassandra").options(table ="producthash", keyspace =  keyspace).save(mode="append")

    # Ratings for training
    # ALS requires a java hash of string. This function does that and stores it as Rating Object
    # for the algorithm to consume
    ratings = rawDF.map(lambda (a,b,c) : Rating(hashFunction(a),hashFunction(b),float(c)))

    
    model = ALS.trainImplicit(ratings,10,10,alpha=0.01,seed=5)
    model.save(sc, "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model")

    sc.stop()
Author: Swebask, Project: RedditR--Insight-Data-Engineering-Project, Lines: 36, Source: engine.py

Example 2: _rdd_to_df

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
 def _rdd_to_df(rdd, schema):
     """convert rdd to dataframe using schema."""
     spark_context = rdd.context
     sql_context = SQLContext(spark_context)
     if schema is None:
         df = sql_context.createDataFrame(rdd)
     else:
         df = sql_context.createDataFrame(rdd, schema)
     return df
Author: bigluster, Project: monasca-transform, Lines: 11, Source: transform_utils.py
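For context, a hypothetical call to this helper might look like the sketch below. The RDD contents, the column names, the pre-existing SparkContext sc, and calling _rdd_to_df as a plain function are all illustrative assumptions, not part of the monasca-transform source.

# Hypothetical usage of _rdd_to_df; data and schema are illustrative assumptions.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

rdd = sc.parallelize([("A", 1), ("B", 2)])
schema = StructType([
    StructField("label", StringType(), True),
    StructField("count", IntegerType(), True),
])
df_with_schema = _rdd_to_df(rdd, schema)   # explicit schema
df_inferred = _rdd_to_df(rdd, None)        # schema inferred; columns default to _1, _2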

Example 3: split_data

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def split_data(rev2, sc):
    # Split train and test set.
    data = rev2.copy()
    train, test = train_test_split(data)
    X_test = test.copy()
    y_test = X_test.pop("rating")

    sql_context = SQLContext(sc)
    train_df = sql_context.createDataFrame(train).rdd
    X_test_df = sql_context.createDataFrame(X_test).rdd
    test_df = sql_context.createDataFrame(test).rdd
    return train_df, train, test, test_df, X_test_df, y_test
Author: azstein, Project: new-food, Lines: 14, Source: recommender.py

Example 4: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main():
	inputs = sys.argv[1]
	output = sys.argv[2] 
	ntlk_path = sys.argv[3]

	conf = SparkConf().setAppName('TF-IDF Representation')
	sc = SparkContext(conf=conf)
	assert sc.version >= '1.5.1'

	sqlContext = SQLContext(sc)

	'''sbaronia - get 3 fields from json file and filter those with empty review'''
	review = sqlContext.read.json(inputs).select('reviewText','overall','reviewTime').cache()
	review_df = review.filter(review.reviewText != "").cache()

	'''sbaronia - get year and rating and zip them with index'''
	year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
	year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

	rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
	rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

	stop_words = stop_words_func(ntlk_path)

	'''sbaronia - rdd containing unique words from review'''
	clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText,stop_words)).filter(lambda x: x[0] != 'null').cache()

	'''sbaronia - finding tf-idf and zipping it with index'''
	tfidf_rdd = rdd_zip(tf_idf_cal(clean_words_rdd).cache()).cache()

	tfidf_df = sqlContext.createDataFrame(tfidf_rdd, ['tfidf', 'index']).cache()

	'''sbaronia - making dataframe with only rating and tfidf'''
	year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
	tfyrrating_df = tfidf_df.join(year_rating_df, tfidf_df.index == year_rating_df.index, 'inner').drop(tfidf_df.index).cache()
	
	'''sbaronia - making training and testing rdd with <2014 and =2014 condition
	in a splitable format with :: '''
	train_rdd = tfyrrating_df.filter(tfyrrating_df.year < 2014) \
	                        .select('rating','tfidf') \
	                        .map(lambda line: (str(line.rating) + ' :: ' + str(line.tfidf))) \
	                        .coalesce(1) \
	                        .cache()
	
	test_rdd = tfyrrating_df.filter(tfyrrating_df.year == 2014) \
	                       .select('rating','tfidf') \
	                       .map(lambda line: (str(line.rating) + ' :: ' + str(line.tfidf))) \
	                       .coalesce(1) \
	                       .cache()
	
	'''sbaronia - save rdds to text''' 
	train_rdd.saveAsTextFile(output + '/train-text')
	test_rdd.saveAsTextFile(output + '/test-text')
Author: gitofsid, Project: MyBigDataCode, Lines: 55, Source: tf_idf_amazon.py

Example 5: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main():

	sc = SparkContext()
	sqlCtx = SQLContext(sc)

	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path that contains all files for analysis
	path_analysis = config.get('DEFAULT', 'path_analysis')	
	#Ligand Database file
	ligand_database  = config.get('DEFAULT', 'ligand_database_path_file')
	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

	#Adding Python Source file
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py"))

	start_time = datetime.now()

#**************** Loading file that contains all scores
	score_file_name = os.path.join(path_analysis,get_file_name_sorted_energy())
	text_file = sc.textFile(score_file_name)

	#Splitting the score file by \t
	header = text_file.first() #extract header
	rdd_vs_score_sorted_split = text_file.filter(lambda x:x !=header)    #filter out header
	rdd_vs_score_sorted_split = rdd_vs_score_sorted_split.map(lambda line: line.split("\t"))
	rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row(energy=float(p[0]), pose=str(p[1]), ligand=get_ligand_from_receptor_ligand_model(p[1]) )) 
	#Creating Vina DataFrame based on score file
	vina_table = sqlCtx.createDataFrame(rdd_vs_score_sorted)	
	vina_table.registerTempTable("vina")	
#**************** Finish 

#**************** Loading Ligand Database
	rdd_database = load_database(sc, ligand_database)	
	#Creating Dataframe
	database_table = sqlCtx.createDataFrame(rdd_database)	
	database_table.registerTempTable("database")
#**************** Finish 
	
	#Computing ligand efficiency
	ligand_efficiencyRDD = sqlCtx.sql("SELECT vina.pose, vina.energy as affinity, (vina.energy / database.heavyAtom) as lig_efficiency FROM database JOIN  vina ON vina.ligand = database.ligand ORDER BY vina.energy") 
	ligand_efficiencyRDD = ligand_efficiencyRDD.map(lambda p: (p.pose, p.affinity, p.lig_efficiency) ).collect()

	#Saving ligand efficiency file
	save_ligand_efficiency(path_analysis, ligand_efficiencyRDD)

	finish_time = datetime.now()

	save_ligand_efficiency_log(finish_time, start_time)
Author: rodrigofaccioli, Project: drugdesign, Lines: 54, Source: ligand_efficiency.py

Example 6: RunRandomForest

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def RunRandomForest(tf, ctx):
	sqlContext = SQLContext(ctx)
	rdd = tf.map(parseForRandomForest)
	# The schema is given as a list of column names.
	schema = ['genre', 'track_id', 'features']
	# Apply the schema to the RDD.
	songDF = sqlContext.createDataFrame(rdd, schema)

	# Register the DataFrame as a table.
	songDF.registerTempTable("genclass")
	labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

	trainingData, testData = songDF.randomSplit([0.8, 0.2])

	labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

	rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
	#rfc = SVMModel([.5, 10, 20], 5)
	#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

	pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
	model = pipeline.fit(trainingData)

	predictions = model.transform(testData)
	predictions.show()

	evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
	accuracy = evaluator.evaluate(predictions)
	print 'Accuracy of RandomForest = ', accuracy * 100
	print "Test Error = ", (1.0 - accuracy) * 100
Author: Sunhick, Project: music-cognita, Lines: 32, Source: genre_classification.py

Example 7: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main(n_part, hdfs_path):
    print "********************\n*"
    print "* Start main\n*"
    print "********************"
    conf = SparkConf().setAppName("Benchmark Spark SQL")
    sc = SparkContext(conf = conf)
    sqlContext = SQLContext(sc)
    rowsRDD = sc.textFile(hdfs_path).repartition(n_part).map(lambda x: recordToRows(x)).cache()
    df = sqlContext.createDataFrame(rowsRDD).cache()
    df.count()
    df.registerTempTable("msd_table")
    print "********************\n*"
    print "* Start querres\n*"
    print "********************"
    [ave_t1, std1, dt1, n1] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext)
    [ave_t2, std2, dt2, n2] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext, method=1)
    [ave_t3, std3, dt3, n3] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext)
    [ave_t4, std4, dt4, n4] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext, method=1)
    if n1 != n2:
        print "\t!!!!Error, counts disagree for the number of T.S. songs!"
    if n3 != n4:
        print "\t!!!!Error, counts disagree for the number of high paced songs!"
    print "********************\n*"
    print "* Results"
    print "\t".join(map(lambda x: str(x), [ave_t1, std1, dt1, ave_t2, std2, dt2, ave_t3, std3, dt3, ave_t4, std4, dt4]))
    print "********************"
Author: drJAGartner, Project: benchmarking_scripts, Lines: 28, Source: benchmark_spark_sql.py

Example 8: log_mapreducer

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def log_mapreducer(logfilename, pattern, filt="None"):
        spcon=SparkContext()
	if filt == "None":
        	input=open(logfilename,'r')
        	paralleldata=spcon.parallelize(input.readlines())
        	patternlines=paralleldata.filter(lambda patternline: pattern in patternline)
		print "pattern lines",patternlines.collect()
        	matches=patternlines.map(mapFunction).reduceByKey(reduceFunction)
	else:
        	input=spcon.textFile(logfilename)
		matches=input.flatMap(lambda line:line.split()).filter(lambda line: filt in line).map(mapFunction).reduceByKey(reduceFunction)
        matches_collected=matches.collect()
	print "matches_collected:",matches_collected
	if len(matches_collected) > 0:
		sqlContext=SQLContext(spcon)
		bytes_stream_schema=sqlContext.createDataFrame(matches_collected)
		bytes_stream_schema.registerTempTable("USBWWAN_bytes_stream")
		query_results=sqlContext.sql("SELECT * FROM USBWWAN_bytes_stream")
		dict_query_results=dict(query_results.collect())
        	print "----------------------------------------------------------------------------------"
        	print "log_mapreducer(): pattern [",pattern,"] in [",logfilename,"] for filter [",filt,"]"
        	print "----------------------------------------------------------------------------------"
		dict_matches=dict(matches_collected)
		sorted_dict_matches = sorted(dict_matches.items(),key=operator.itemgetter(1), reverse=True)
        	print "pattern matching lines:",sorted_dict_matches 
        	print "----------------------------------------------------------------------------------"
		print "SparkSQL DataFrame query results:"
        	print "----------------------------------------------------------------------------------"
		pprint.pprint(dict_query_results)
        	print "----------------------------------------------------------------------------------"
		print "Cardinality of Stream Dataset:"
        	print "----------------------------------------------------------------------------------"
		print len(dict_query_results)
		spcon.stop()
        	return sorted_dict_matches 
Author: shrinivaasanka, Project: usb-md-github-code, Lines: 37, Source: Spark_USBWWANLogMapReduceParser.py

Example 9: mock_data

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
 def mock_data(self):
     """Mock data to imitate read from database."""
     sqlContext = SQLContext(self.sc)
     mock_data_rdd = self.sc.parallelize([("A", 1, 1), ("B", 1, 0), ("C", 0, 2), ("D", 2, 4), ("E", 3, 5) ])
     schema = ["id", "x", "y"]
     mock_data_df = sqlContext.createDataFrame(mock_data_rdd, schema)
     return mock_data_df
Author: Sandy4321, Project: spark-tdd-example, Lines: 9, Source: test_clustering.py
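A hypothetical test method built on this mock might look like the sketch below; the test name and assertions are illustrative assumptions (they presume the surrounding class is a unittest.TestCase that already holds self.sc), not code from spark-tdd-example.

 # Hypothetical test that consumes mock_data(); name and assertions are illustrative.
 def test_mock_data_shape(self):
     df = self.mock_data()
     self.assertEqual(df.count(), 5)
     self.assertEqual(df.columns, ["id", "x", "y"])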

Example 10: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main(sc):
    sql_context = SQLContext(sc)
    all_data = get_all_data()

    # Input data: Each row is a bag of words from a sentence or document.
    training_data = [(id_gen.next(), text.split(" ")) for text in all_data]
    documentdf = sql_context.createDataFrame(training_data, ["id", "text"])

    remover = StopWordsRemover(inputCol="text", outputCol="text_filtered")
    cleaned_document = remover.transform(documentdf)

    # Learn a mapping from words to Vectors.
    word2vec = Word2Vec(vectorSize=len(training_data),
                        inputCol="text_filtered",
                        outputCol="result")
    model = word2vec.fit(cleaned_document)
    matrix = column_similarities(model.transform(cleaned_document))

    # Use the size of the target data to keep only similarities between
    # target data and the remaining data, avoiding products of target
    # data with itself
    values = matrix.entries.filter(
        lambda x: x.j >= TARGET_DATA_SIZE and x.i < TARGET_DATA_SIZE).sortBy(
        keyfunc=lambda x: x.value, ascending=False).map(
        lambda x: x.j).distinct().take(100)

    training_data_index = dict(training_data)
    for position, item in enumerate(values):
        line = " ".join(training_data_index[int(item)])
        print('%d -> %s' % (position, line.encode('utf-8')))
Author: victorpoluceno, Project: socialbasebr-desafio, Lines: 32, Source: main.py

Example 11: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main(dataFile, outputPath):

    conf = SparkConf().setAppName("S3 Example").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    raw_text = sc.textFile(dataFile).persist(StorageLevel.MEMORY_AND_DISK)

    csv_data = raw_text.map(lambda l: l.split(","))
    row_data = csv_data.map(lambda p: dataIO.dataStruc(p))

    interaction_df = sqlContext.createDataFrame(row_data)

    # features.save_hdfs_parquet(interaction_df, outputPath)
    dataIO.save_hdfs_parquet(interaction_df, outputPath)

    interaction_df.registerTempTable("interactions")

    tcp_interactions = sqlContext.sql( """
        SELECT duration, dst_bytes, protocol_type FROM interactions WHERE protocol_type = 'tcp' AND duration > 1000 AND dst_bytes=0
    """)

    tcp_interactions.show()

    features.print_tcp_interactions(tcp_interactions)
    dataIO.print_from_dataio()
    features.print_from_feature()

    sc.stop()
Author: yuantuo, Project: pysparkexample, Lines: 31, Source: example.py

Example 12: Spark_MapReduce_Parents

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def Spark_MapReduce_Parents(keyword, tokensofprevlevel, graphcache):
	#tokensofprevlevelkeyword=tokensofprevlevel
	#tokensofprevlevelkeyword.append(keyword)
	md5hashparents = hashlib.md5(keyword).hexdigest()

	#md5hashparents = keyword
	md5hashparents = md5hashparents + "$parents"

	picklef_keyword=open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt","w")
	asfer_pickle_string_dump(keyword,picklef_keyword)
	picklef_keyword.close()
	cachevalue=graphcache.get(md5hashparents)
	if cachevalue:
		print "Spark_MapReduce_Parents(): hash = ", md5hashparents, "; returning from cache"
		return cachevalue 
	else:	
		#picklelock.acquire()
		spcon = SparkContext("local[2]","Spark_MapReduce_Parents")
		#picklef_keyword=open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt","w")
		#asfer_pickle_string_dump(keyword,picklef_keyword)
		#picklef_keyword.close()
		paralleldata = spcon.parallelize(tokensofprevlevel).cache()
		#k=paralleldata.map(lambda keyword: mapFunction_Parents(keyword,tokensofprevlevel)).reduceByKey(reduceFunction_Parents)
		k=paralleldata.map(mapFunction_Parents).reduceByKey(reduceFunction_Parents)
		sqlContext=SQLContext(spcon)
		parents_schema=sqlContext.createDataFrame(k.collect())
		parents_schema.registerTempTable("Interview_RecursiveGlossOverlap_Parents")
		query_results=sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap_Parents")
		dict_query_results=dict(query_results.collect())
		#print "Spark_MapReduce_Parents() - SparkSQL DataFrame query results:"
		#picklelock.release()
		graphcache.set(md5hashparents,dict_query_results[1])
		spcon.stop()
		print "graphcache_mapreduce_parents updated:", graphcache
		return dict_query_results[1]
Author: shrinivaasanka, Project: asfer-github-code, Lines: 37, Source: InterviewAlgorithmWithIntrinisicMerit_SparkMapReducer.py

Example 13: test_persistence

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
 def test_persistence(self):
     # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     lda_path = path + "/lda"
     dist_model_path = path + "/distLDAModel"
     local_model_path = path + "/localLDAModel"
     # Test LDA
     lda.save(lda_path)
     lda2 = LDA.load(lda_path)
     self._compare(lda, lda2)
     # Test DistributedLDAModel
     distributedModel.save(dist_model_path)
     distributedModel2 = DistributedLDAModel.load(dist_model_path)
     self._compare(distributedModel, distributedModel2)
     # Test LocalLDAModel
     localModel.save(local_model_path)
     localModel2 = LocalLDAModel.load(local_model_path)
     self._compare(localModel, localModel2)
     # Clean up
     try:
         rmtree(path)
     except OSError:
         pass
Author: bsangee, Project: spark, Lines: 37, Source: tests.py

Example 14: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main(sc):
    	path = "events"
    	#text_file = sc.textFile(path)
    	sqlContext = SQLContext(sc)
    	events = sqlContext.jsonFile(path)

	events = events.select(events["events.event"]).flatMap(lambda p: p.event)
	events = events.map(lambda p: Row(
		id=p.id,\
		title=p.title, \
		lat=p.latitude, \
		long=p.longitude, \
		postal_code=p.postal_code, \
		start_time=datetime.strptime(p.start_time, "%Y-%m-%d %H:%M:%S"), \
		stop_time=p.stop_time)) 	
	events_df = sqlContext.createDataFrame(events)
	
	events_df.registerTempTable("events")

	sqlContext.registerFunction("to_hour", lambda x: x.hour)
	sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))

	e = sqlContext.sql("select title, str_date(start_time) as event_date,
	to_hour(start_time) as hour, postal_code from events where postal_code is not null and start_time is not null")

	events_grouped = sqlContext.sql("select event_date, hour, postal_code, 
	count(*) from events_filtered group by event_date,hour,postal_code order by postal_code,hour")

	grouped_csv = events_grouped.map(toCSV)
	grouped_csv.saveAsTextFile('events_cluster')
Author: Narasimman, Project: Most-hapennning-places-NYC, Lines: 32, Source: parse_events.py

Example 15: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import createDataFrame [as alias]
def main():
	
	sc = SparkContext()
	sqlCtx = SQLContext(sc)
	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path where docking list file will be saved
	path_to_save = str(sys.argv[1])

	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

	sc.addPyFile(os.path.join(path_spark_drugdesign,"database_crud.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"database_io.py"))


#**************** Loading Ligand Database
	ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
	rdd_database = load_database(sc, ligand_database)	
	#Creating Dataframe
	database_table = sqlCtx.createDataFrame(rdd_database)	
	database_table.registerTempTable("database")
#**************** Finish 

	#Creating input files for performing virtual screening
	creating_docking_list(path_to_save, config, sqlCtx)
Author: rodrigofaccioli, Project: drugdesign, Lines: 29, Source: prepare_docking_list.py


Note: The pyspark.sql.SQLContext.createDataFrame examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by their respective developers, and the copyright of the source code belongs to the original authors; please refer to each project's license before distributing or using it. Do not reproduce without permission.