This article collects typical usage examples of the pyspark.sql.SQLContext.setConf method in Python. If you are wondering what SQLContext.setConf does and how to call it, the curated code examples below may help. You can also explore further usage examples of the containing class, pyspark.sql.SQLContext.
Six code examples of the SQLContext.setConf method are shown below, sorted by popularity by default.
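Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: build a SQLContext from a SparkContext and tune a Spark SQL property with setConf, passing both the key and the value as strings. This sketch is not taken from any of the examples; the master URL, application name, and the chosen property value are illustrative.

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local[*]", "setConf demo")  # illustrative master and app name
sqlContext = SQLContext(sc)

# setConf takes the Spark SQL property name and its value, both as strings.
sqlContext.setConf("spark.sql.shuffle.partitions", "10")

# getConf reads a property back, with a fallback value if it has not been set.
print(sqlContext.getConf("spark.sql.shuffle.partitions", "200"))

sc.stop()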
Example 1: main
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
import argparse
import json
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext


def main():
    # Configure Spark
    conf = SparkConf()
    conf.setAppName("Application name")  # Specify the application name
    conf.set("spark.jars", "file:/shared_data/spark_jars/hadoop-openstack-3.0.0-SNAPSHOT.jar")  # Don't modify
    sc = SparkContext(conf=conf)  # SparkContext used for all operations running on the cluster

    parser = argparse.ArgumentParser()
    parser.add_argument("backend", type=str)
    parser.add_argument("helperpath", type=str)
    parser.add_argument("shuffle_partitions", type=str)
    parser.add_argument("params", type=str)
    parser.add_argument("inputs", type=str)
    parser.add_argument("features", type=str, nargs='?')
    args = parser.parse_args()

    # Swift connection
    if args.backend == 'swift':
        hadoopConf = sc._jsc.hadoopConfiguration()
        hadoopConf.set("fs.swift.impl", "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem")
        hadoopConf.set("fs.swift.service.SparkTest.auth.url", os.environ['OS_AUTH_URL'] + "/tokens")
        hadoopConf.set("fs.swift.service.SparkTest.http.port", "8443")
        hadoopConf.set("fs.swift.service.SparkTest.auth.endpoint.prefix", "/")
        hadoopConf.set("fs.swift.service.SparkTest.region", os.environ['OS_REGION_NAME'])
        hadoopConf.set("fs.swift.service.SparkTest.public", "false")
        hadoopConf.set("fs.swift.service.SparkTest.tenant", os.environ['OS_TENANT_ID'])
        hadoopConf.set("fs.swift.service.SparkTest.username", os.environ['OS_USERNAME'])
        hadoopConf.set("fs.swift.service.SparkTest.password", os.environ['OS_PASSWORD'])

    helperpath = str(args.helperpath)  # Passed by default
    sc.addFile(helperpath + "/utils/helper.py")  # Ship custom helper modules to the executors
    shuffle_partitions = args.shuffle_partitions

    # Build dicts from the JSON arguments and pass them to your module implementation
    params = json.loads(args.params)
    inputs = json.loads(args.inputs)
    features = json.loads(args.features)  # Only used when you want to create a feature set

    sqlContext = SQLContext(sc)  # Create the SQLContext from the SparkContext to work with the default dataset format, Parquet
    sqlContext.setConf("spark.sql.shuffle.partitions", shuffle_partitions)  # Don't change, required for controlling parallelism

    # Pass the SparkContext and SQLContext along with the different parameters and inputs.
    module_implementation(sc, sqlContext, params=params, inputs=inputs, features=features)
Example 2: SQLContext
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
from pyspark.sql import SQLContext

# sc is assumed to be an existing SparkContext (e.g. the one provided by the PySpark shell)
sqlContext = SQLContext(sc)
# Interpret Parquet binary columns as strings (needed for files written by systems
# that do not distinguish binary from string, such as Impala, Hive, or older Spark SQL)
sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
df = sqlContext.read.parquet("hdfs://cluster019.test.local/events/testparquet/*.parq")
df.printSchema()
df.show()
df.select("receipttime").show()
Output of df.printSchema():
root
|-- agentaddress: string (nullable = true)
|-- agentdescriptorid: string (nullable = true)
|-- agentdnsdomain: string (nullable = true)
|-- agenthostname: string (nullable = true)
|-- agentid: string (nullable = true)
|-- agentmacaddress: string (nullable = true)
|-- agentntdomain: string (nullable = true)
|-- agentreceipttime: long (nullable = true)
|-- agenttimezone: string (nullable = true)
|-- agenttranslatedaddress: string (nullable = true)
|-- agenttranslatedzoneexternalid: string (nullable = true)
|-- agenttranslatedzonereferenceid: string (nullable = true)
|-- agenttranslatedzoneuri: string (nullable = true)
|-- agenttype: string (nullable = true)
|-- agentversion: string (nullable = true)
|-- agentzoneexternalid: string (nullable = true)
|-- agentzonereferenceid: string (nullable = true)
|-- agentzoneuri: string (nullable = true)
|-- applicationprotocol: string (nullable = true)
|-- assetcriticality: string (nullable = true)
|-- baseeventcount: long (nullable = true)
|-- baseeventids: string (nullable = true)
Example 3: udf
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
# (the excerpt starts mid-statement; the beginning of the expression that builds d2
#  from the raw wikistat columns is truncated in the original)
        mymonth + " as month",
        mydate + " as day",
        "upper(C0) as project",
        "C1 as url",
        "cast(C2 as int) as pageview",
    )
    .groupBy("month", "day", "project", "url")
    .agg({"pageview": "sum"})
    .withColumnRenamed("sum(pageview)", "pageviews")
)
d3 = d2.withColumn("url", udf(d2.url))
# wiki.count()
# Infer the schema, and register the DataFrame as a table.
# Save to MySQL
# Write to a Parquet file - if needed
d3.write.partitionBy("month", "day").parquet("/data/flash/spark/wikifull/t17", "append")

from datetime import date, timedelta  # needed for the daily loop below

mount = "/data/opt/wikistat/"
d = date(2015, 1, 1)
end_date = date(2016, 1, 1)
delta = timedelta(days=1)
sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")
while d < end_date:
    print(d.strftime("%Y-%m-%d"))
    filename = mount + "pagecounts-2015" + d.strftime("%m") + d.strftime("%d") + "-*.gz"
    print(filename)
    # load_day(filename, d.strftime("%Y-%m-%d"))
    load_day(filename, d.strftime("%d"), d.strftime("%m"))
    d += delta
Example 4: chunks
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from datetime import datetime

start = datetime.now()

# credit http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i+n]

# set up context
sc = SparkContext("local[*]", "Simple App")
#sc = SparkContext("spark://url:7077", "Simple App")
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", "5")

# issue movies query against Elasticsearch
conf = {"es.resource": "movies2/logs", "es.query": "?q=name:picture"}
movies = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",
                            "org.apache.hadoop.io.NullWritable",
                            "org.elasticsearch.hadoop.mr.LinkedMapWritable",
                            conf=conf)

# place results in a table
moviesRows = movies.map(lambda p: Row(id=int(p[1]['id']), name=p[1]['name']))
moviesRowsList = moviesRows.collect()
schemaMovies = sqlContext.createDataFrame(moviesRowsList)
schemaMovies.registerTempTable("movies")
sqlContext.cacheTable("movies")

# get ids in order to form the acted_in query
ids = []
Example 5: mad_based_outlier
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
import numpy as np
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext

'''
NOTE: the regular MAD test evaluates all points.
This modification compares the out-of-bag points with a pre-built univariate distribution.
'''
def mad_based_outlier(point, med, thresh=3.5):
    #med_abs_deviation = profile.mad
    med = float(med)
    med_abs_deviation = med  # med is the median of each training point's difference from their median
    diff = np.abs(point - med)
    if med_abs_deviation != 0:
        modified_z_score = 0.6745 * diff / med_abs_deviation
        # the point is an "outlier" if modified_z_score > thresh, otherwise "normal"
        return modified_z_score > thresh
    else:
        return False

if __name__ == "__main__":
    # path_profile, dirpath_out_of_bag_datapoints and demo_numerical_field are defined elsewhere in the original script
    sc = SparkContext("local", "Univariate anomaly test demo")
    ssc = StreamingContext(sc, 10)
    sqlc = SQLContext(sc)
    sqlc.setConf("spark.sql.shuffle.partitions", "10")
    profile_X = sqlc.parquetFile(path_profile)
    mad = profile_X.first().mad
    test = ssc.textFileStream(dirpath_out_of_bag_datapoints)
    test = test.map(lambda x: x.split('`')[int(demo_numerical_field)])
    anomalousX = test.filter(lambda x: mad_based_outlier(int(x), mad))
    anomalousX.pprint()
Example 6: saveLogByDate
# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
def saveLogByDate(self):
    sqlCtx = SQLContext(self.sContext)
    # Write Parquet output with snappy compression
    sqlCtx.setConf('spark.sql.parquet.compression.codec', 'snappy')
    print(self.path)
    self.parallelsave()