

Python SQLContext.parquetFile Method Code Examples

This article collects typical usage examples of the pyspark.sql.SQLContext.parquetFile method in Python. If you are wondering what exactly SQLContext.parquetFile does, how to call it, or want to see how it is used in real code, the curated examples below should help. You can also explore further usage examples of pyspark.sql.SQLContext, the class this method belongs to.


Below are 15 code examples of SQLContext.parquetFile, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
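Before diving into the excerpts, here is a minimal, self-contained sketch of the call pattern they all share, written against the Spark 1.3-era API that SQLContext.parquetFile belongs to. The output path /tmp/people_demo.parquet and the toy rows are illustrative assumptions, not taken from any of the projects below.

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext(appName="ParquetFileDemo")
sqlContext = SQLContext(sc)

# Build a small DataFrame and write it out as Parquet (Spark 1.x writer API).
# The target directory must not already exist, or saveAsParquetFile will fail.
rows = sc.parallelize([Row(name="Alice", age=34), Row(name="Bob", age=29)])
peopleDF = sqlContext.createDataFrame(rows)
peopleDF.saveAsParquetFile("/tmp/people_demo.parquet")

# Read the Parquet data back with SQLContext.parquetFile and query it via SQL.
loaded = sqlContext.parquetFile("/tmp/people_demo.parquet")
loaded.registerTempTable("people")
adults = sqlContext.sql("SELECT name FROM people WHERE age >= 30")
print(adults.collect())

sc.stop()

On Spark 1.4 and later the same read is normally written as sqlContext.read.parquet(path); parquetFile still works there but is deprecated, which is why it mostly appears in older code bases like the ones excerpted below.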

Example 1: summarize

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
import os
import shutil
import sys
import tempfile

from pyspark import SparkContext
from pyspark.mllib.stat import Statistics
from pyspark.mllib.util import MLUtils
from pyspark.sql import SQLContext


def summarize(dataset):
    print "schema: %s" % dataset.schema().json()
    labels = dataset.map(lambda r: r.label)
    print "label average: %f" % labels.mean()
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print "features average: %r" % summary.mean()

if __name__ == "__main__":
    if len(sys.argv) > 2:
        print >> sys.stderr, "Usage: dataset_example.py <libsvm file>"
        exit(-1)
    sc = SparkContext(appName="DatasetExample")
    sqlContext = SQLContext(sc)
    if len(sys.argv) == 2:
        input = sys.argv[1]
    else:
        input = "data/mllib/sample_libsvm_data.txt"
    points = MLUtils.loadLibSVMFile(sc, input)
    dataset0 = sqlContext.inferSchema(points).setName("dataset0").cache()
    summarize(dataset0)
    tempdir = tempfile.NamedTemporaryFile(delete=False).name
    os.unlink(tempdir)
    print "Save dataset as a Parquet file to %s." % tempdir
    dataset0.saveAsParquetFile(tempdir)
    print "Load it back and summarize it again."
    dataset1 = sqlContext.parquetFile(tempdir).setName("dataset1").cache()
    summarize(dataset1)
    shutil.rmtree(tempdir)
Author: ArafathC, Project: spark, Lines of code: 31, Source: dataset_example.py

Example 2: SparkConf

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
    datasets = args.datasets.split(",")

    conf = SparkConf().setAppName(jobNm)
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Read in stop word list early to get notified of issues early in process
    bc_lStopWords = fspLib.load_stopwords(sc, stopWordsPath, sCustStop)

    t0 = time.time()

    records = None
    for file in datasets:
        print "reading file: ", file
        if records is None:
            records = sqlContext.parquetFile(file)
        else:
            newRec = sqlContext.parquetFile(file)
            records = records.unionAll(newRec)

    if inputPartitions > 0:
        records = records.repartition(inputPartitions)

    # Find the word document frequency for the corpus
    # this is used for an idf score used in feature vector formation
    t1 = time.time()
    goodRecords = records.map(lambda x: fspLib.uniqueWords(x.text, bUseStopFilter, bc_lStopWords))
    goodRecords = goodRecords.filter(lambda x: len(x) > 0).cache()
    nGoodTweets = goodRecords.count()

    t2 = time.time()
Author: theseusyang, Project: GEQE, Lines of code: 33, Source: precomputeIDF.py

Example 3: SQLContext

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
    # Deploy the SQL Module of Spark
    sqlCtx = SQLContext(sc)

    print('Spark SQL and Parquet Testing Script\nTest Script to Build and Test how Spark SQL and Parquet Files can be used\nCreate Date: 17/1/2015\n')

    # Check if Data is Pre-Saved as a Parquet File - People Age
    if not os.path.isdir('/home/dan/Desktop/People_Age.parquet'):
        # Print Statement
        print('Reading in and creating a Parquet File - People Age')
        # Initiate Command
        table1()
    else:
        # Print Statement
        print('Reading in Pre-made Parquet File')
        # Read in the Parquet file created above.
        parquetFile = sqlCtx.parquetFile(
            "/home/dan/Desktop/People_Age.parquet")
        # Parquet files can also be registered as tables and then used in SQL
        # statements.
        parquetFile.registerTempTable("People_Age")

    # Check if Data is Pre-Saved as a Parquet File - People Details
    if not os.path.isdir('/home/dan/Desktop/People_Details.parquet'):
        # Print Statement
        print('Reading in and creating a Parquet File - People Details')
        # Initiate Command
        table2()
    else:
        # Print Statement
        print('Reading in Pre-made Parquet File')
        # Read in the Parquet file created above.
        parquetFile = sqlCtx.parquetFile(
Author: AkiraKane, Project: CityUniversity2014, Lines of code: 34, Source: Spark_SQL.py

Example 4: len

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
# Finds the names of people who like pandas from a parquet file
# consisting of name & favouriteAnimal.
# For input you can use the result of MakeParquetFile
from pyspark import SparkContext
from pyspark.sql import SQLContext
import json
import sys

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print "Error usage: QueryParquetFile [sparkmaster] [parquetfile]"
        sys.exit(-1)
    master = sys.argv[1]
    parquetFile = sys.argv[2]
    sc = SparkContext(master, "QueryParquetFile")
    sqlCtx = SQLContext(sc)
    # Load some data in from a Parquet file of name & favouriteAnimal
    rows = sqlCtx.parquetFile(parquetFile)
    names = rows.map(lambda row: row.name)
    print "Everyone"
    print names.collect()
    # Find the panda lovers
    tbl = rows.registerAsTable("people")
    pandaFriends = sqlCtx.sql('SELECT name FROM people WHERE favouriteAnimal = "panda"')
    print "Panda Friends"
    print pandaFriends.map(lambda row: row.name).collect()
Author: mrt, Project: learning-spark, Lines of code: 28, Source: QueryParquetFile.py

Example 5: SparkConf

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]

    #Declare Spark Context
    conf = SparkConf().setAppName(jobNm)
    conf.set('spark.driver.maxResultSize','0')
    sc = SparkContext(conf = conf)
    sqlContext = SQLContext(sc)


    #Create polygon list and broadcast variable based on it
    lPolygon = shapeReader.readInShapeJson(shapeFile)
    bc_lTargetPolygons = sc.broadcast(lPolygon)

    #Read in data, coalesce to limit the number of jobs and avoid shuffling issues later in the job

    records = sqlContext.parquetFile(inputFile) if 0 == nDataType else csvToDataFrame(sc,sqlContext,inputFile,nDataType)
    if inputPartitions != -1:
        records = records.repartition(inputPartitions)
    records.cache()
    records.registerTempTable('records')
    sqlContext.registerFunction("inRegionOfInterest", lambda lat,lon: fspLib.inROI(lat,lon,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat,lon,dt: fspLib.inEOI(lat,lon,dt,bc_lTargetPolygons),returnType=BooleanType())
    data = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon) AND inEventOfInterest(records.lat,records.lon,records.dt)")


    #Split data into 2 DDSs depending on being in our out of region of interest
    rows = data.collect()
    if not os.path.isdir('previewTrainingFiles'): os.mkdir('previewTrainingFiles')
    fOut = codecs.open('previewTrainingFiles/'+jobNm, encoding="utf-8",mode="wb")
    for row in rows:
        try:
Author: theseusyang, Project: GEQE, Lines of code: 32, Source: viewTrainingData.py

Example 6: count

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
for row in ip_access[:10]:
    print row

step1 = schema_dicts.map(lambda row: (row.ip, 1))
step1.take(5)

step2 = step1.reduceByKey(lambda a,b: a+b)
step2.take(5)

step3 = step2.map(lambda r: (r[1], r[0]))
step3.take(5)

step4 = step3.sortByKey(ascending=False)
step4.take(5)

step5 = step4.map(lambda r: (r[1], r[0]))
step5.take(10)

pq_path = '/home/anant/projects/spark-examples/data/log.pq'
log_schema.saveAsParquetFile(pq_path)

parquetFile = sqlContext.parquetFile(pq_path)
parquetFile.registerTempTable('parquetTable')

ip_access = sqlContext.sql("SELECT ip, count(*) as counts FROM parquetTable GROUP BY ip ORDER BY counts DESC LIMIT 10").collect()
for row in ip_access:
    print row



Author: anantasty, Project: spark-examples, Lines of code: 29, Source: log_analysis.py

Example 7: SQLContext

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
# load from Parquet files

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

bike_df = sqlContext.parquetFile("bike.parquet").cache()
Author: Arouna, Project: intro_spark, Lines of code: 8, Source: 18.parquet.py

Example 8: SparkConf

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row


if __name__ == '__main__':
    #sparkConf = SparkConf().setMaster("local").setAppName("Total Spent by Customer - Sorted")
    sparkConf = SparkConf().setAppName("Slicing and Dicing Parquet File")
    sc = SparkContext(conf=sparkConf)
    sqlContext = SQLContext(sc)

    # NOTE: parquetFile() expects at least one Parquet path argument; the original
    # snippet omits it, so a path must be supplied before this will run.
    input = sqlContext.parquetFile()
    print 'type of input var : ', type(input)
Author: dbiswa4, Project: spark-projects, Lines of code: 14, Source: parquet_file.py

Example 9: PSparkContext

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
class PSparkContext():

    """This is a thin wrapper around SparkContext from PySpark which makes it
    easy to load data into L{PRDD}s."""

    def __init__(self, spark_context, sql_ctx=None):
        """Initialize a PSparkContext with the associacted spark context,
        and spark sql context if provided.
        :param spark_context: Initialized and configured spark context.
        :param sql_ctx: Initialized and configured SQL context, if relevant.
        :return: Correctly initialized SparklingPandasContext.
        """
        self.spark_ctx = spark_context
        if sql_ctx:
            self.sql_ctx = sql_ctx
        else:
            from pyspark.sql import SQLContext
            self.sql_ctx = SQLContext(self.spark_ctx)
        # Register our magical functions
        register_sql_extensions(self.sql_ctx)

    @classmethod
    def simple(cls, *args, **kwargs):
        """Takes the same arguments as SparkContext and constructs a
        PSparkContext"""
        return PSparkContext(SparkContext(*args, **kwargs))

    def read_csv(self, name, use_whole_file=False, names=None, skiprows=0,
                 *args, **kwargs):
        """Read a CSV file in and parse it into Pandas DataFrames.
        If no names is provided we use the first row for the names.
        header=0 is the default unless names is provided in which case
        header=None is the default.
        skiprows indicates how many rows of input to skip. This will
        only be applied to the first partition of the data (so if
        #skiprows > #row in first partition this will not work). Generally
        this shouldn't be an issue for small values of skiprows.
        No other values of header are supported.
        All additional parameters are passed to the read_csv function.
        TODO: Use spark-csv package if the request could be fulfilled by it.
        """
        def csv_file(partition_number, files):
            # pylint: disable=unexpected-keyword-arg
            file_count = 0
            for _, contents in files:
                # Only skip lines on the first file
                if partition_number == 0 and file_count == 0 and _skiprows > 0:
                    yield pandas.read_csv(
                        sio(contents), *args,
                        header=None,
                        names=mynames,
                        skiprows=_skiprows,
                        **kwargs)
                else:
                    file_count += 1
                    yield pandas.read_csv(
                        sio(contents), *args,
                        header=None,
                        names=mynames,
                        **kwargs)

        def csv_rows(partition_number, rows):
            # pylint: disable=unexpected-keyword-arg
            in_str = "\n".join(rows)
            if partition_number == 0:
                return iter([
                    pandas.read_csv(
                        sio(in_str), *args, header=None,
                        names=mynames,
                        skiprows=_skiprows,
                        **kwargs)])
            else:
                # could use .iterrows instead?
                return iter([pandas.read_csv(sio(in_str), *args, header=None,
                                             names=mynames, **kwargs)])

        # If we need to peek at the first partition and determine the column
        # names
        mynames = None
        _skiprows = skiprows
        if names:
            mynames = names
        else:
            # In the future we could avoid this expensive call.
            first_line = self.spark_ctx.textFile(name).first()
            frame = pandas.read_csv(sio(first_line), **kwargs)
            # pylint sees frame as a tuple despite it being a Dataframe
            mynames = list(frame.columns)
            _skiprows += 1

        # Do the actual load
        if use_whole_file:
            return self.from_pandas_rdd(
                self.spark_ctx.wholeTextFiles(name)
                .mapPartitionsWithIndex(csv_file))
        else:
            return self.from_pandas_rdd(
                self.spark_ctx.textFile(name).mapPartitionsWithIndex(csv_rows))

    def parquetFile(self, *paths):
#......... part of the code omitted here .........
Author: asaf-erlich, Project: sparklingpandas, Lines of code: 103, Source: pcontext.py

Example 10: SQLContext

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
'''
A SQLContext wraps the SparkContext,
and adds functions for working with structured data.
'''
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)


'''
Now we can load a set of data in that is stored in the Parquet format.
Parquet is a self-describing columnar format. Since it is self-describing,
Spark SQL will automatically be able to infer all of the column names and their datatypes. 
'''
wikiData = sqlCtx.parquetFile("file:///root/data/wiki_parquet")

'''
The result of loading in a parquet file is a SchemaRDD.
A SchemaRDD has all of the functions of a normal RDD.
For example, lets figure out how many records are in the data set.
'''
wikiData.count()

'''
In addition to standard RDD operations, SchemaRDDs also have
extra information about the names and types of the columns in the dataset.
This extra schema information makes it possible to run SQL queries against
the data after you have registered it as a table.
'''
wikiData.registerAsTable("wikiData")
# to describe the schema
wikiData.printSchema()
Author: cineca-scai, Project: course-exercises, Lines of code: 33, Source: 03_spark_sql.py

Example 11: mad_based_outlier

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
NOTE: the regular MAD test evaluates all points.
My modification compares the out-of-bag points with a pre-built univariate distribution
'''

def mad_based_outlier(point, med, thresh=3.5):
        #med_abs_deviation = profile.mad
        med = float(med)
        med_abs_deviation = med # med is the median of each training point's difference from their median
        diff = np.abs(point - med)
        if med_abs_deviation !=0:
                modified_z_score = 0.6745 * diff / med_abs_deviation
                # a point counts as an "outlier" if modified_z_score > thresh, otherwise "normal"
                return modified_z_score > thresh
        else:
                return False

if __name__ == "__main__":
        sc = SparkContext("local", "Univariate anomoly test demo")
        ssc = StreamingContext(sc, 10)
        sqlc = SQLContext(sc)
        sqlc.setConf("spark.sql.shuffle.partition", "10")

        profile_X = sqlc.parquetFile(path_profile)
        mad = profile_X.first().mad

        test = ssc.textFileStream(dirpath_out_of_bag_datapoints)
        test = test.map(lambda x: x.split('`')[int(demo_numerical_field)])

        anomalousX = test.filter(lambda x: mad_based_outlier(int(x),mad))
        anomalousX.pprint()
Author: pingyan, Project: Anomaly-Detection, Lines of code: 32, Source: MADtest.py

Example 12: saveParquetJson

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
			]
	},
	{
		"id": "0003",
		"type": "donut",
		"name": "Old Fashioned",
		"ppu": 0.55,
		"batters":
			{
				"batter":
					[
						{ "id": "1001", "type": "Regular" },
						{ "id": "1002", "type": "Chocolate" }
					]
			},
		"topping":
			[
				{ "id": "5001", "type": "None" },
				{ "id": "5002", "type": "Glazed" },
				{ "id": "5003", "type": "Chocolate" },
				{ "id": "5004", "type": "Maple" }
			]
	}
]'''
saveParquetJson(data,filename)
parquetFile = sqlContext.parquetFile(outfile)
parquetFile.registerTempTable("parquetFile")
out = sqlContext.sql('select batters from parquetFile where name = "Cake"')
print out.collect()

Author: Radhika-Goel, Project: PracticeCode, Lines of code: 31, Source: createParquet.py

Example 13: SQLContext

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
from pyspark.mllib.clustering import KMeans
from pyspark.mllib.feature import Word2Vec
from pyspark.sql import SQLContext
import numpy
from math import sqrt

sqlctx = SQLContext(sc)
bashlogsDF = sqlctx.parquetFile('/user/cloudera/bashlog')
commandsDF = bashlogsDF.select(bashlogsDF.command)

# RDD of list of words in each command
# Review: each command should be considered a "word" instead of each command + arg being an individual word
commandsRDD = commandsDF.rdd.map(lambda row: row.command.split("\n"))

# Convert commands in commandsRDD to vectors.
w2v = Word2Vec()
model = w2v.setVectorSize(2).fit(commandsRDD)

commandsListRDD = commandsDF.rdd.flatMap(lambda row: row.command.split("\n"))
commandsList = sc.parallelize(commandsListRDD.take(10000)).collect()
vectorsList = []

for command in commandsList:
    try:
        vectorsList.append(numpy.array(model.transform(command)))
    except ValueError:
        pass

kmdata = sc.parallelize(vectorsList, 1024)

k = int(sqrt(len(vectorsList) / 2))
Author: jleaniz, Project: bdsa, Lines of code: 33, Source: bash_kmeans.py

Example 14:

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
#  8. Using Spark SQL

# Spark DataFrames also allow you to use Spark SQL to query Petabytes of data. Spark comes with a SQL-like query
# language which can be used to query Distributed DataFrames. A key advantage of using Spark SQL is that the Catalyst optimizer
# (https://databricks.com/blog/2015/04/13/deep-dive-into-spark-sqls-catalyst-optimizer.html) under the hood transforms
# your SQL query so that it runs as efficiently as possible.

# Spark SQL can leverage the same functionality as the DataFrame API provides. In fact, it provides more functionality via SQL capabilities and HQL capabilities that are available to Spark SQL environment. 

# Due to time constraints, I will explain the different functions available in the Spark SQL environment using examples that combine multiple functions. This has two benefits:
# + Covering many functions that are possible via Spark SQL
# + Giving an understanding of how to pipe multiple functions together

# Read the reviews parquet file
reviewsDF = sqc.parquetFile('Data/Outputs/reviews_filtered.parquet')

# Register the DataFrames to be used in sql
reviewsDF.registerAsTable("reviews")
productDF.registerAsTable("products")

print 'There are {0} reviews about {1} products'.format(reviewsDF.count(),productDF.count())


# NOW LET'S RUN A SQL QUERY

sql_query = """SELECT reviews.asin, overall, reviewText, price
            FROM reviews JOIN products ON  reviews.asin=products.asin
            WHERE price > 50.00
"""
Author: FuzzyDuck79, Project: spark_tutorial, Lines of code: 31, Source: spark_tutorial.py

Example 15: udf

# Required module import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import parquetFile [as alias]
dateNum = udf(dateStrToCol, IntegerType())

f = open('twitter_schema.pkl')
schema_read = pickle.load(f)
f.close()
#load twitter data as dataframe, and register it as a table
#df = SqlContext.jsonFile("~/Downloads/ntweets_2015-03-31.22.4.txt")
#df = sqlContext.jsonFile("Downloads/twitter_raw")
date_list = ['2014-0'+i.__str__() for i in range(3,10)] + ['2014-' + i.__str__() for i in range(10,13)] + ['2015-0'+i.__str__() for i in range(1,4)]
for d in date_list:
    df = sqlContext.jsonFile("s3n://cs341-data/twitter_test_500/*"+d+"*", schema = schema_read)
    print "loaded"
    #df.cache()
    #df.registerTempTable("tweets")

    #filter fields and tweets:
    #Only keep body, actor.favoritesCount, actor.followersCount, actor.friendsCount, actor.verified,
    #favoritesCount, retweetCount, verb actor.followersCount AS actor.followersCount, \
        #actor.friendsCount AS actor.friendsCount, actor.verified AS actor.verified,
    #sqlContext.sql("SELECT body, actor.favoritesCount AS favoritesCount,  favoritesCount, retweetCount, verb FROM tweets ").save("Downloads/processed.json", 'json')
    df.filter("twitter_lang = 'en' ").filter(slen(df.body) > 10).select(df.body, df.favoritesCount, df.retweetCount, df.verb, 
        df["actor.favoritesCount"].alias('actor.favoritesCount'), df["actor.followersCount"].alias('actor.followersCount'), 
        df['actor.friendsCount'].alias('actor.friendsCount'), df['actor.verified'].alias('actor.verified'), dateNum(df['postedTime']).alias('date'), 
        df['twitter_entities.user_mentions.screen_name'].alias('mentions_screen'), df['twitter_entities.user_mentions.name'].alias('mentions_name')).save("s3n://cs341-data/processed_tweets_test_11/"+d, "parquet")

    #df['twitter_entities.user_mentions.screen_name'].alias('mentions_screen'), df['twitter_entities.user_mentions.name'].alias('mentions_name')
    #df2 = sqlContext.jsonFile("Downloads/processed_tweets_test")
    #df2.printSchema()
df2 = sqlContext.parquetFile("s3n://cs341-data/processed_tweets_test_8")
df2.printSchema()
Author: cs341-predict-app-rank, Project: Predict_App_Rank, Lines of code: 32, Source: tweetsPreProcess.py


Note: The pyspark.sql.SQLContext.parquetFile method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. For distribution and use, please refer to the license of the corresponding project; do not reproduce without permission.