

Python SQLContext.setConf Method Code Examples

This article collects typical usage examples of the pyspark.sql.SQLContext.setConf method in Python. If you are wondering what SQLContext.setConf does, how to call it, or what real-world usage looks like, the curated examples below may help. You can also explore further examples of the containing class, pyspark.sql.SQLContext.


Six code examples of the SQLContext.setConf method are shown below, sorted by popularity by default.
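
Before the individual examples, a minimal, self-contained sketch of the basic pattern may be helpful: create a SQLContext from a SparkContext and call setConf with a configuration key and its value, both passed as strings. The application name "setConf demo" and the partition count "8" below are illustrative placeholders, not values taken from the examples.

from pyspark import SparkContext
from pyspark.sql import SQLContext

# Minimal sketch: build a SQLContext and adjust a Spark SQL setting via setConf.
sc = SparkContext("local[*]", "setConf demo")
sqlContext = SQLContext(sc)

# setConf takes the configuration key and its value as strings.
sqlContext.setConf("spark.sql.shuffle.partitions", "8")

# Read the value back with getConf to confirm it was applied.
print(sqlContext.getConf("spark.sql.shuffle.partitions"))

sc.stop()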

Example 1: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
import argparse
import json
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
def main():

    # Configure Spark
    conf = SparkConf()
    conf.setAppName("Application name")  # Specify the application name
    conf.set("spark.jars", "file:/shared_data/spark_jars/hadoop-openstack-3.0.0-SNAPSHOT.jar")  # Don't modify
    sc = SparkContext(conf=conf)  # Spark Context variable that will be used for all operations running on the cluster

    parser = argparse.ArgumentParser()
    parser.add_argument("backend", type=str)
    parser.add_argument("helperpath", type=str)
    parser.add_argument("shuffle_partitions", type=str)
    parser.add_argument("params", type=str)
    parser.add_argument("inputs", type=str)
    parser.add_argument("features", type=str, nargs='?')

    args = parser.parse_args()

    # Swift Connection
    if(args.backend == 'swift'):
        hadoopConf = sc._jsc.hadoopConfiguration()
        hadoopConf.set("fs.swift.impl", "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem")
        hadoopConf.set("fs.swift.service.SparkTest.auth.url", os.environ['OS_AUTH_URL'] + "/tokens")
        hadoopConf.set("fs.swift.service.SparkTest.http.port", "8443")
        hadoopConf.set("fs.swift.service.SparkTest.auth.endpoint.prefix", "/")
        hadoopConf.set("fs.swift.service.SparkTest.region", os.environ['OS_REGION_NAME'])
        hadoopConf.set("fs.swift.service.SparkTest.public", "false")
        hadoopConf.set("fs.swift.service.SparkTest.tenant", os.environ['OS_TENANT_ID'])
        hadoopConf.set("fs.swift.service.SparkTest.username", os.environ['OS_USERNAME'])
        hadoopConf.set("fs.swift.service.SparkTest.password", os.environ['OS_PASSWORD'])

    helperpath = str(args.helperpath)  # This is passed by default
    sc.addFile(helperpath + "/utils/helper.py")  # To import custom modules
    shuffle_partitions = args.shuffle_partitions

    # Create a dict and pass it in your_module_implementation
    params = json.loads(args.params)
    inputs = json.loads(args.inputs)
    features = json.loads(args.features)  # Only used when you want to create a feature set

    sqlContext = SQLContext(sc)  # Create a SQLContext from the SparkContext to work with our default dataset format, i.e. Parquet
    sqlContext.setConf("spark.sql.shuffle.partitions", shuffle_partitions)  # Don't change, required for controlling parallelism

    # Pass the sc (Spark Context) and sqlContext along with the different parameters and inputs.
    module_implementation(sc, sqlContext, params=params, inputs=inputs, features=features)
Developer: CSC-IT-Center-for-Science, Project: spark-analysis, Lines: 47, Source: moduletemplate.py

Example 2: SQLContext

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
# Note: sc is the SparkContext already provided by the PySpark shell/notebook session.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
df = sqlContext.read.parquet("hdfs://cluster019.test.local/events/testparquet/*.parq")
df.printSchema()
df.show()
df.select("receipttime").show()

# Output of df.printSchema() (truncated):
root
 |-- agentaddress: string (nullable = true)
 |-- agentdescriptorid: string (nullable = true)
 |-- agentdnsdomain: string (nullable = true)
 |-- agenthostname: string (nullable = true)
 |-- agentid: string (nullable = true)
 |-- agentmacaddress: string (nullable = true)
 |-- agentntdomain: string (nullable = true)
 |-- agentreceipttime: long (nullable = true)
 |-- agenttimezone: string (nullable = true)
 |-- agenttranslatedaddress: string (nullable = true)
 |-- agenttranslatedzoneexternalid: string (nullable = true)
 |-- agenttranslatedzonereferenceid: string (nullable = true)
 |-- agenttranslatedzoneuri: string (nullable = true)
 |-- agenttype: string (nullable = true)
 |-- agentversion: string (nullable = true)
 |-- agentzoneexternalid: string (nullable = true)
 |-- agentzonereferenceid: string (nullable = true)
 |-- agentzoneuri: string (nullable = true)
 |-- applicationprotocol: string (nullable = true)
 |-- assetcriticality: string (nullable = true)
 |-- baseeventcount: long (nullable = true)
 |-- baseeventids: string (nullable = true)
Developer: dereksdata, Project: pyspark-jupyter-cdh, Lines: 33, Source: example-parquet.py

Example 3: udf

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
# Note: the snippet starts mid-function; the indented lines below are the tail of the load_day(...) helper called further down.
            mymonth + " as month",
            mydate + " as day",
            "upper(C0) as project",
            "C1 as url",
            "cast(C2 as int) as pageview",
        )
        .groupBy("month", "day", "project", "url")
        .agg({"pageview": "sum"})
        .withColumnRenamed("sum(pageview)", "pageviews")
    )
    d3 = d2.withColumn("url", udf(d2.url))
    # wiki.count()
    # Infer the schema, and register the DataFrame as a table.
    # Save to MySQL
    # Write to parquet file - if needed
    d3.write.partitionBy("month", "day").parquet("/data/flash/spark/wikifull/t17", "append")


mount = "/data/opt/wikistat/"
d = date(2015, 1, 1)
end_date = date(2016, 1, 1)
delta = timedelta(days=1)
sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")
while d < end_date:
    print(d.strftime("%Y-%m-%d"))
    filename = mount + "pagecounts-2015" + d.strftime("%m") + d.strftime("%d") + "-*.gz"
    print(filename)
    # load_day(filename, d.strftime("%Y-%m-%d"))
    load_day(filename, d.strftime("%d"), d.strftime("%m"))
    d += delta
Developer: Percona-Lab, Project: wikistat-data, Lines: 32, Source: loaddata.py

Example 4: chunks

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from datetime import datetime

start = datetime.now()

# credit http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i+n]

# set up context
sc = SparkContext("local[*]", "Simple App")
#sc = SparkContext("spark://url:7077", "Simple App")
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", "5")

# issue movies query
conf = {"es.resource" : "movies2/logs", "es.query" : "?q=name:picture"}
movies = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",\
    "org.apache.hadoop.io.NullWritable", "org.elasticsearch.hadoop.mr.LinkedMapWritable", conf=conf)

# place results in table
moviesRows = movies.map(lambda p: Row(id=int(p[1]['id']), name=p[1]['name']))
moviesRowsList = moviesRows.collect()
schemaMovies = sqlContext.createDataFrame(moviesRowsList)
schemaMovies.registerTempTable("movies")
sqlContext.cacheTable("movies")

# get ids in order to form acted_in query
ids = []
Developer: ryancutter, Project: bigdata, Lines: 33, Source: sql_script.py

Example 5: mad_based_outlier

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
import numpy as np

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext

'''
NOTE: the regular MAD test evaluates all points.
This modification compares the out-of-bag points against a pre-built univariate distribution.
(Snippet is truncated: path_profile, demo_numerical_field and dirpath_out_of_bag_datapoints are defined elsewhere in the source file.)
'''

def mad_based_outlier(point, med, thresh=3.5):
        #med_abs_deviation = profile.mad
        med = float(med)
        med_abs_deviation = med # med is the median of each training point's difference from their median
        diff = np.abs(point - med)
        if med_abs_deviation != 0:
                modified_z_score = 0.6745 * diff / med_abs_deviation
                # Flag the point as an outlier when the modified z-score exceeds the threshold
                return modified_z_score > thresh
        else:
                return False

if __name__ == "__main__":
        sc = SparkContext("local", "Univariate anomaly test demo")
        ssc = StreamingContext(sc, 10)
        sqlc = SQLContext(sc)
        sqlc.setConf("spark.sql.shuffle.partitions", "10")

        profile_X = sqlc.parquetFile(path_profile)
        mad = profile_X.first().mad

        test = ssc.textFileStream(dirpath_out_of_bag_datapoints)
        test = test.map(lambda x: x.split('`')[int(demo_numerical_field)])

        anomalousX = test.filter(lambda x: mad_based_outlier(int(x),mad))
        anomalousX.pprint()
Developer: pingyan, Project: Anomaly-Detection, Lines: 32, Source: MADtest.py

Example 6: saveLogByDate

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import setConf [as alias]
def saveLogByDate(self):
    # Class method: self.sContext is the SparkContext; self.path and self.parallelsave()
    # are defined elsewhere in the class.
    sqlCtx = SQLContext(self.sContext)
    sqlCtx.setConf('spark.sql.parquet.compression.codec', 'snappy')
    print(self.path)
    self.parallelsave()
Developer: jleaniz, Project: bdsa, Lines: 7, Source: logfile.py


Note: The pyspark.sql.SQLContext.setConf examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors, who retain copyright of the source code; please consult the corresponding project's license before redistributing or using the code. Do not reproduce this article without permission.