

Python SQLContext.jsonFile Method Code Examples

This article collects typical usage examples of the Python method pyspark.sql.SQLContext.jsonFile. If you have been wondering what SQLContext.jsonFile does, how to call it, or what real code that uses it looks like, the hand-picked examples below should help. You can also explore further usage examples for the enclosing class, pyspark.sql.SQLContext.


The following shows 15 code examples of the SQLContext.jsonFile method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
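
Before turning to the examples, here is a minimal sketch of how SQLContext.jsonFile is typically called (the application name, path and table name are placeholders rather than values from any example). The method reads a text file containing one JSON object per line and, as the version checks in several examples below show, it was superseded by sqlContext.read.json from Spark 1.4 onwards.

# Minimal usage sketch; the path and names are placeholders.
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName="jsonFileDemo")
sqlContext = SQLContext(sc)

# One JSON object per line; the schema is inferred by sampling the records.
df = sqlContext.jsonFile("hdfs:///path/to/records.json")
df.printSchema()
df.registerTempTable("records")
sqlContext.sql("SELECT COUNT(*) AS n FROM records").show()

# From Spark 1.4 onwards the preferred equivalent is:
# df = sqlContext.read.json("hdfs:///path/to/records.json")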

Example 1: get_recommendations

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
class RecommendationEngine:
    """A travel recommendation engine
    """
    def get_recommendations(self, user_id):
        """Recommends travel for user
        """
        # Leftover sanity-check RDD; not used by the query below.
        data = (1, 2, 3, 4, 5)
        even_rdd = self.sc.parallelize(data)
        #ratings = even_rdd.collect()
        # Join contacts to offers on matching continent, desire and transport fields.
        reco = self.sqlContext.sql("SELECT c.contact_id, o.prod_id FROM contacts c, offres o WHERE o.continent_offre = c.continent AND o.envie_offre = c.envie AND o.moyen_offre = c.moyen").collect()
        return reco

    def __init__(self, sc):
        """Init the recommendation engine given a Spark context and a dataset path
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.sc = sc
        self.sqlContext = SQLContext(sc)

        path_contacts = "data_v3/contacts/attempt_contactV3_perfect_match.json"
        df_contacts = self.sqlContext.jsonFile(path_contacts)

        df_contacts.registerTempTable("contacts")

        path_offres = "data_v3/offres/attempt_productV3_perfect_match.json"
        df_offres = self.sqlContext.jsonFile(path_offres)
        df_offres.registerTempTable("offres")
Developer: nicolasclaudon, Project: travel, Lines: 29, Source: engine.py

Example 2: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
def main():
    log = logging.getLogger(prog)
    log.setLevel(logging.INFO)
    # bit hackish and hard to keep aligned with docstring changes, not using this
    # usage = '\r\b\r\b\r' + __doc__ + "usage: %prog -j file.json -p directory.parquet"
    # parser = OptionParser(usage=usage, version='%prog ' + __version__)
    parser = OptionParser(version='%prog ' + __version__)
    parser.add_option('-j', '--json', dest='jsonFile', help='JSON input file/dir', metavar='<file/dir>')
    parser.add_option('-p', '--parquetDir', dest='parquetDir', help='Parquet output dir', metavar='<dir>')

    (options, args) = parser.parse_args()

    jsonFile   = options.jsonFile
    parquetDir = options.parquetDir

    if args or not jsonFile or not parquetDir:
        usage(parser)

    conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    spark_version = sc.version
    log.info('Spark version detected as %s' % spark_version)
    if not isVersionLax(spark_version):
        die("Spark version couldn't be determined. " + support_msg('pytools'))
    if isMinVersion(spark_version, 1.4):
        json = sqlContext.read.json(jsonFile)
        json.write.parquet(parquetDir)
    else:
        log.warn('running legacy code for Spark <= 1.3')
        json = sqlContext.jsonFile(jsonFile)
        json.saveAsParquetFile(parquetDir)
Developer: zhumzhu, Project: pytools, Lines: 34, Source: spark-json-to-parquet.py

Example 3: run

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
    def run(self):
        jsonFile   = self.options.jsonFile
        parquetDir = self.options.parquetDir

        if not jsonFile:
            self.usage('--json not defined')
        if not parquetDir:
            self.usage('--parquetDir not defined')
        if self.args:
            self.usage()

        conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
        sc = SparkContext(conf=conf)
        sqlContext = SQLContext(sc)
        spark_version = sc.version
        log.info('Spark version detected as %s' % spark_version)
        if not isVersionLax(spark_version):
            die("Spark version couldn't be determined. " + support_msg('pytools'))
        if isMinVersion(spark_version, 1.4):
            json = sqlContext.read.json(jsonFile)
            json.write.parquet(parquetDir)
        else:
            log.warn('running legacy code for Spark <= 1.3')
            json = sqlContext.jsonFile(jsonFile)
            json.saveAsParquetFile(parquetDir)
Developer: gggordon, Project: pytools, Lines: 27, Source: spark-json-to-parquet.py

Example 4: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
    def main(self, sc, *args):
        from pyspark.sql.types import BooleanType, StringType
        from pyspark.sql.types import FloatType, StructField, StructType
        from pyspark.sql import SQLContext

        fields = []
        for field in header_avro["fields"] + self.extra_fields:
            if field["type"] == "float":
                field_type = FloatType()
            elif field["type"] == "bool":
                field_type = BooleanType()
            else:
                field_type = StringType()
            fields.append(StructField(field["name"], field_type))
        schema = StructType(fields)

        sqlContext = SQLContext(sc)
        logger.info("Reading %s from %s" % (self.test_name, self.input().path))
        df = sqlContext.jsonFile(self.input().path, schema)
        df.registerTempTable("reports")

        entries = df.filter("({test_names}) AND"
                            " record_type = 'entry'".format(
                                test_names=' OR '.join([
                                    "test_name = '{test_name}'".format(
                                        test_name=tn)
                                    for tn in self.test_names])))
        interestings = self.find_interesting(entries)

        out_file = self.output().open('w')
        for interesting in interestings.toJSON().collect():
            out_file.write(interesting)
            out_file.write("\n")
        out_file.close()
Developer: TylerJFisher, Project: ooni-pipeline, Lines: 36, Source: spark_apps.py

Example 5: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
def main(sc):
    # Imports needed by this snippet; toCSV is defined elsewhere in the
    # original source file.
    from datetime import datetime
    from pyspark.sql import Row, SQLContext

    path = "events"
    #text_file = sc.textFile(path)
    sqlContext = SQLContext(sc)
    events = sqlContext.jsonFile(path)

    # Flatten the nested "events.event" array into one record per event.
    events = events.select(events["events.event"]).flatMap(lambda p: p.event)
    events = events.map(lambda p: Row(
        id=p.id,
        title=p.title,
        lat=p.latitude,
        long=p.longitude,
        postal_code=p.postal_code,
        start_time=datetime.strptime(p.start_time, "%Y-%m-%d %H:%M:%S"),
        stop_time=p.stop_time))
    events_df = sqlContext.createDataFrame(events)

    events_df.registerTempTable("events")

    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))

    e = sqlContext.sql("select title, str_date(start_time) as event_date, "
                       "to_hour(start_time) as hour, postal_code from events "
                       "where postal_code is not null and start_time is not null")
    # Register the filtered result so the aggregation below can query it.
    e.registerTempTable("events_filtered")

    events_grouped = sqlContext.sql("select event_date, hour, postal_code, count(*) "
                                    "from events_filtered "
                                    "group by event_date, hour, postal_code "
                                    "order by postal_code, hour")

    grouped_csv = events_grouped.map(toCSV)
    grouped_csv.saveAsTextFile('events_cluster')
Developer: Narasimman, Project: Most-hapennning-places-NYC, Lines: 32, Source: parse_events.py

Example 6: selectJson

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
def selectJson(sc, columns, filePath):
    sqlContext = SQLContext(sc)
    if columns[0] == '*':
        df = sqlContext.jsonFile(filePath)

        # displays the content of the DataFrame to stdout
        df.show()

    else:
        df = sqlContext.load(filePath, "json")

        df.select(columns).show()
Developer: SophyXu, Project: PySpark-Framework, Lines: 14, Source: framework.py
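
A hypothetical invocation of the helper above might look like the following; the SparkContext setup, column names and file path are illustrative assumptions rather than part of the original snippet.

from pyspark import SparkContext

sc = SparkContext(appName="selectJsonDemo")
selectJson(sc, ['*'], "hdfs:///data/sample.json")            # show every column
selectJson(sc, ['name', 'age'], "hdfs:///data/sample.json")  # show selected columns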

Example 7: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
def main(sc):
  sqlContext = SQLContext(sc)
  df = sqlContext.jsonFile(DATA_PATH)
  #add the filter file
  sc.addFile(FILTER_TERMS_FILE_PATH)
  filter_terms = sc.textFile(SparkFiles.get("freebase-symptoms-just-terms.txt"))
  global filter_terms_set_bc
  filter_terms_set_bc = sc.broadcast(Set(filter_terms.collect()))
  # Register the DataFrame as a table.
  df.registerTempTable("tweet")
  results = sqlContext.sql("SELECT id,user.id,user.lang,created_at, coordinates,text FROM tweet where user.lang='en'")
  #filter tweets to find health related tweets
  filter_health_tweets = results.rdd.filter(healthFilter)
  filter_health_tweets.mapPartitions(writeRecords).saveAsTextFile("output/")
Developer: LeotisBuchanan, Project: stream-data-analysis-realtime, Lines: 16, Source: filterandConvertToCSV.py

Example 8: main

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
def main():
    conf = SparkConf().setAppName("pyspark test")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)


    config = ConfigParser.ConfigParser()
    config.read('configuration.cfg')
    mongodb_connection = config.get('BatchProperties', 'URLMongoDB')

    #######################################################
    # USING THE PYMONGO LIBRARY
    #######################################################
    client = MongoClient()
    db = client.test

    cursor = db.tabla1.find()

    for document in cursor:
        print(document)


    #######################################################
    # USING THE pymongo_spark LIBRARY
    #######################################################
    # Read a MongoDB collection (db: test; collection: tabla1)
    rdd = sc.mongoRDD(mongodb_connection + 'test.tabla1')

    # Save the RDD we just read back to MongoDB (db: test; collection: tabla2)
    rdd.saveToMongoDB(mongodb_connection + 'test.tabla2')

    # Get the project root directory
    BASE_DIR = os.path.dirname(os.path.dirname(__file__))
    # BASE_DIR = /Users/akash/PycharmProjects/masterbigdata

    # Read a sample file
    file = os.path.join(BASE_DIR + '/datasets/batch/air', 'ficheroSalidaAire.txt')

    rddfFile = sqlContext.jsonFile(file)

    # Store the file contents in MongoDB
    rddfFile.saveToMongoDB(mongodb_connection + 'test.tabla3')
Developer: akashdaswani, Project: masterbigdata, Lines: 44, Source: TestMongoDB.py

Example 9: run

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
    def run(self):
        json_file = self.get_opt('json')
        parquet_dir = self.get_opt('parquet_dir')
        # let Spark fail if csv/parquet aren't available
        # can't check paths exist as want to remain generically portable
        # to HDFS, local filesystm or any other uri scheme Spark supports
        log.info("Json Source: %s" % json_file)
        log.info("Parquet Destination: %s" % parquet_dir)

        conf = SparkConf().setAppName('HS PySpark JSON => Parquet')
        sc = SparkContext(conf=conf) # pylint: disable=invalid-name
        sqlContext = SQLContext(sc)  # pylint: disable=invalid-name
        spark_version = sc.version
        log.info('Spark version detected as %s' % spark_version)
        if not isVersionLax(spark_version):
            die("Spark version couldn't be determined. " + support_msg('pytools'))
        if isMinVersion(spark_version, 1.4):
            df = sqlContext.read.json(json_file) # pylint: disable=invalid-name
            df.write.parquet(parquet_dir)
        else:
            log.warn('running legacy code for Spark <= 1.3')
            df = sqlContext.jsonFile(json_file) # pylint: disable=invalid-name
            df.saveAsParquetFile(parquet_dir)
Developer: JGalego, Project: pytools, Lines: 25, Source: spark_json_to_parquet.py

Example 10: get_language_correlation

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
def get_language_correlation():
    """
        calculates the correlation between github languages
    """
    #Create Spark Context
    sc = SparkContext(appName="LanguageCorrelations")

    #Create SQL Context
    sqlCtx = SQLContext(sc)

    #Create a schemaRDD from json datasets stored in HDFS
    pushes = sqlCtx.jsonFile('git_14_15/git_results')

    #Register the schemaRDD as a Table
    pushes.registerTempTable('pushes')

    #filter the data to get the pushes for the languages from LANG
    filtered = sqlCtx.sql('select * from pushes where repository_language in ' + str(tuple(LANG)))

    #perform map transformation to get the rdd in the format (actor, {lang : pushes})
    f_pair = filtered.map(lambda s: (s.actor, {s.repository_language:s.pushes}))

    #group the RDD by actor to get records of the form (actor, [{lang1 : pushes}, {lang2 : pushes}, ...])
    f_group = f_pair.groupByKey()

    #merge the per-language dictionaries into a single ordered dict per actor
    f_merged = f_group.map(lambda s: merge_lang_dict(s[1]))

    #create an RDD of vectors from the push counts, as required by the correlation algorithm
    vectors = f_merged.map(lambda s: Vectors.dense(map(float, s.values())))
    
    #call the correlation function
    matrix = Statistics.corr(vectors)
    print matrix
    plot_graph(matrix)
    sc.stop()
Developer: rackerlabs, Project: cloudbigdata-extras, Lines: 38, Source: github_lang_correlations.py

Example 11: SparkConf

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
master_public_dns = "ec2-52-34-253-146.us-west-2.compute.amazonaws.com"
worker_public_dns = ['52.34.253.146', '52.27.28.14', '52.35.88.14', '52.32.240.173', '52.88.31.138']
conf = SparkConf().setAppName("hawkeye")
sc = SparkContext(conf=conf)
##sc = SparkContext("spark://" + master_ip + ":7077", "hawkeye")

from cassandra.cluster import Cluster
if USE_OLD_CLUSTER == 1:
	cluster = Cluster(worker_public_dns)
else:
	cluster = Cluster(worker_public_dns)
	
session = cluster.connect(CASSANDRA_KEYSPACE)

sqlsc = SQLContext(sc)
hemsgs = sqlsc.jsonFile("hdfs://" + master_public_dns + ":9000/camus/topics/" + KAFKA_TOPIC + "/*/*/*/*/*/*")
#hemsgs.count(); hemsgs.take(1)

# Row({
# 	"tsIn": 1454556923889,
# 	"tsOut": 1454556983787,
# 	"packetID": "PACKET19083",
# 	"monitorGroup": [
# 		{"type": "I",	"subgroup": "TASKID",	"id": "TASKID492", 	"power": "1"},
# 		{"type": "T",	"subgroup": "TASKTYPE",	"id": "TASKTYPE69",	"power": "2"},
# 		{"type": "I",	"subgroup": "SWID",		"id": "SWID6",		"power": "3"},
# 		{"type": "T",	"subgroup": "SWTYPE",	"id": "mysql",		"power": "4"},
# 		{"type": "I",	"subgroup": "APPID",	"id": "hawkeye",	"power": "5"},
# 		{"type": "T",	"subgroup": "APPTYPE",	"id": "APP",		"power": "6"}
# 	]
# })
Developer: shabss, Project: hawkeye, Lines: 33, Source: batch.py
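
To illustrate how the message layout documented in the commented Row above could be queried, here is a short sketch that is not part of the original batch.py; it only touches the tsIn, tsOut and packetID fields listed in that comment.

# Sketch: register the loaded messages and compute a per-packet latency.
hemsgs.registerTempTable("hemsgs")
latency = sqlsc.sql("SELECT packetID, tsOut - tsIn AS latency_ms FROM hemsgs")
latency.show()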

Example 12: eval

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
model = pickle.load(open('data/classifier.pkl', 'r'))
model_b = sc.broadcast(model)
fashion.map(lambda x: eval(x)['reviewText']).map(lambda x: (x, model_b.value.predict([x])[0])).first()

################################### Spark DataFrame API and Spark SQL ###################################

#  Part 5 : Loading data to spark
# We start by loading the files to spark
# First, load them as text file to validate
review_filepaths = 'Data/Reviews/*'
textRDD = sc.textFile(review_filepaths)
print 'number of reviews : {0}'.format(textRDD.count())
print 'sample row : \n{0}'.format(textRDD.first())

# You can let spark infer the schema of your DataFrame 
inferredDF = sqc.jsonFile(review_filepaths)
inferredDF.first()

# Or you can programmatically tell spark how the schema looks like
# Define Schema
REVIEWS_SCHEMA_DEF = StructType([
        StructField('reviewerID', StringType(), True),
        StructField('asin', StringType(), True),
        StructField('reviewerName', StringType(), True),
        StructField('helpful', ArrayType(
                IntegerType(), True), 
            True),
        StructField('reviewText', StringType(), True),
        StructField('reviewTime', StringType(), True),
        StructField('overall', DoubleType(), True)
    ])
Developer: FuzzyDuck79, Project: spark_tutorial, Lines: 33, Source: spark_tutorial.py
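
The snippet above ends right after REVIEWS_SCHEMA_DEF is defined. As a sketch of the likely next step (the variable name explicitDF is an assumption), the explicit schema can be passed to jsonFile so that Spark skips schema inference:

# Assumed continuation: supply the schema explicitly so Spark does not
# have to sample the JSON records to infer field types.
explicitDF = sqc.jsonFile(review_filepaths, schema=REVIEWS_SCHEMA_DEF)
explicitDF.printSchema()
explicitDF.first()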

Example 13: PSparkContext

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]

#......... part of the code is omitted here .........
            mynames = names
        else:
            # In the future we could avoid this expensive call.
            first_line = self.spark_ctx.textFile(file_path).first()
            frame = pandas.read_csv(sio(first_line), **kwargs)
            # pylint sees frame as a tuple despite it being a DataFrame
            mynames = list(frame.columns)
            _skiprows += 1

        # Do the actual load
        if use_whole_file:
            return self.from_pandas_rdd(
                self.spark_ctx.wholeTextFiles(file_path)
                .mapPartitionsWithIndex(csv_file))
        else:
            return self.from_pandas_rdd(
                self.spark_ctx.textFile(file_path)
                    .mapPartitionsWithIndex(csv_rows))

    def parquetFile(self, *paths):
        """Loads a Parquet file, returning the result as a L{DataFrame}.

        Parameters
        ----------
        paths: string, variable length
             The path(s) of the parquet files to load. Should be Hadoop style
             paths (e.g. hdfs://..., file://... etc.).
        Returns
        -------
        A L{DataFrame} of the contents of the parquet files.
        """
        return self.from_spark_rdd(self.sql_ctx.parquetFile(paths))

    def jsonFile(self, path, schema=None, sampling_ratio=1.0):
        """Loads a text file storing one JSON object per line as a
        L{DataFrame}.
        Parameters
        ----------
        path: string
             The path of the json files to load. Should be Hadoop style
             paths (e.g. hdfs://..., file://... etc.).
        schema: StructType, optional
             If you know the schema of your input data you can specify it. The
             schema is specified using Spark SQL's schema format. If not
             specified will sample the json records to determine the schema.
             Spark SQL's schema format is documented (somewhat) in the
             "Programmatically Specifying the Schema" of the Spark SQL
             programming guide at: http://bit.ly/sparkSQLprogrammingGuide
        sampling_ratio: float, default=1.0
             Fraction of the records to sample when inferring the schema.
             Defaults to all records for safety, but you may be able to set a
             lower ratio if the same fields are present across records or
             your input is of sufficient size.
        Returns
        -------
        A L{DataFrame} of the contents of the json files.
        """
        schema_rdd = self.sql_ctx.jsonFile(path, schema, sampling_ratio)
        return self.from_spark_rdd(schema_rdd)

    def from_pd_data_frame(self, local_df):
        """Make a Sparkling Pandas dataframe from a local Pandas DataFrame.
        The intend use is for testing or joining distributed data with local
        data.
        The types are re-infered, so they may not match.
        Parameters
Developer: jhlch, Project: sparklingpandas, Lines: 70, Source: pcontext.py
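
As a quick illustration of the wrapper documented above, a call might look like the following; psc stands for an already-constructed PSparkContext instance (how it is built lies outside this snippet) and the path is a placeholder.

# Hypothetical call against an existing PSparkContext instance named psc.
events = psc.jsonFile("hdfs:///data/events.json", sampling_ratio=0.5)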

Example 14: show

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.feature import HashingTF
from numpy import array
from pyspark.mllib.clustering import KMeans
import sys

def show(x):
    print x


if len(sys.argv) != 2:
    print >> sys.stderr, "Usage: handsOn4.py <filename>"
    exit(-1)
sc = SparkContext()
sqlContext = SQLContext(sc)
tweets = sqlContext.jsonFile(sys.argv[1])
tweets.foreach(show)
tweets.registerTempTable("pt")
words = sqlContext.sql("select hashtags from pt where hashtags is not null")
words.foreach(show)


wordsArray = words.map(lambda x: array(x[0]))
hashingTF = HashingTF()
tf = hashingTF.transform(wordsArray)
tf.foreach(show)
show("Executing Kmeans")
clusters = KMeans.train(tf, 2, 1, 1)
results = wordsArray.map(lambda x: array([x, clusters.predict(hashingTF.transform(x))]))
results.foreach(show)
Developer: mgparada, Project: sparkmeetup, Lines: 32, Source: SparkKMeansExample.py

Example 15: enumerate

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import jsonFile [as alias]
# 			fx_yprime = self.all_data[y][i] #self.get_feats(self.h_tuples[i][0], t)

# 			dot_vector = numpy.dot(numpy.array(fx_yprime), self.param)
# 			numerator = math.exp(dot_vector) 
# 			prob = numerator / denominator
			
# 			for j,val in enumerate(all_sum):
# 				all_sum[j] += prob * fx_yprime[j]


if __name__ == '__main__':
	features = [f1,f2,f3,f4,f5,f6,f7,f8,f9,f10, f11, f12, f13,f14]
	# features = [f1,f2,f3]

	input_data,tags = create_input_dataset()
	distributed_input_data = sqlContext.jsonFile('data.json')
	gradient_preprocess1(input_data,list(set(tags)))
	# print distributed_input_data.show()
	
	all_tags = sc.broadcast(list(set(tags)))
	no_of_features = sc.broadcast(len(features))
	size = sc.broadcast(len(input_data))
	param = [0 for i in range(len(features))]
	# gradient1_new(param)
	# param = [1 for i in range(len(features))]
	# gradient1_new(param)
	dt1 = datetime.datetime.now()
	print 'before training: ', dt1
	params = mymin(cost1, param, method = 'L-BFGS-B', jac = gradient1_new, options = {'maxiter':100}) #, jac = self.gradient) # , options = {'maxiter':100}
	print params.x
	print params
Developer: karthikradhakrishnan96, Project: ccbd, Lines: 33, Source: memm_pyspark.py


Note: The pyspark.sql.SQLContext.jsonFile examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by many developers, and copyright in the source code remains with the original authors. For redistribution and use, please follow the corresponding project's license; do not reproduce without permission.